Added image proxying, refactored filter class

Images were previously fetched directly from Google search results, which
was a potential privacy hazard. All image sources are now rewritten to pass
through shoogle's routing first, which then fetches the raw image data and
passes it through to the user.
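
Concretely, the Filter rewrites every <img> source to point at a local route,
and that route fetches the bytes server-side, so the browser never contacts
the upstream host directly. A condensed sketch of the flow (the full changes
are in the diff below):

    # filter: rewrite each image source to go through the app
    for img in soup.find_all('img'):
        if img['src'].startswith('//'):
            img['src'] = 'https:' + img['src']
        img['src'] = '/tmp?image_url=' + img['src']

    # routes: /tmp fetches the raw bytes and serves them back to the client
    file_data = g.user_request.send(base_url=request.args.get('image_url'), return_bytes=True)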

The Filter class was refactored to split the primary clean method into
smaller, more manageable submethods.
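
After the split, clean() reduces to a short sequence of named passes
(skeleton condensed from the diff below):

    def clean(self, soup):
        def remove_ads(): ...
        def sync_images(): ...
        def update_styling(): ...
        def update_links(): ...

        remove_ads()
        sync_images()
        update_styling()
        update_links()

        return soup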
main
Ben Busby 2020-04-27 20:21:36 -06:00
parent b0e6167733
commit 4180aedd87
3 changed files with 106 additions and 67 deletions

app/filter.py

@@ -30,9 +30,11 @@ class Filter:
         return page
 
     def clean(self, soup):
-        # Remove all ads
-        main_divs = soup.find('div', {'id': 'main'})
-        if main_divs is not None:
-            result_divs = main_divs.findAll('div', recursive=False)
+        def remove_ads():
+            main_divs = soup.find('div', {'id': 'main'})
+            if main_divs is None:
+                return
+
+            result_divs = main_divs.findAll('div', recursive=False)
 
             # Only ads/sponsored content use classes in the list of result divs
@@ -40,78 +42,92 @@ class Filter:
             for div in ad_divs:
                 div.decompose()
 
-        # Remove unnecessary button(s)
-        for button in soup.find_all('button'):
-            button.decompose()
-
-        # Remove svg logos
-        for svg in soup.find_all('svg'):
-            svg.decompose()
-
-        # Update logo
-        logo = soup.find('a', {'class': 'l'})
-        if logo and self.mobile:
-            logo['style'] = 'display:flex; justify-content:center; align-items:center; color:#685e79; font-size:18px;'
-
-        # Fix search bar length on mobile
-        try:
-            search_bar = soup.find('header').find('form').find('div')
-            search_bar['style'] = 'width: 100%;'
-        except AttributeError:
-            pass
-
-        # Replace hrefs with only the intended destination (no "utm" type tags)
-        for a in soup.find_all('a', href=True):
-            href = a['href']
-            if '/advanced_search' in href:
-                a.decompose()
-                continue
-
-            if 'url?q=' in href:
-                # Strip unneeded arguments
-                result_link = urlparse.urlparse(href)
-                result_link = parse_qs(result_link.query)['q'][0]
-
-                parsed_link = urlparse.urlparse(result_link)
-                link_args = parse_qs(parsed_link.query)
-                safe_args = {}
-
-                for arg in link_args.keys():
-                    if arg in SKIP_ARGS:
-                        continue
-
-                    safe_args[arg] = link_args[arg]
-
-                # Remove original link query and replace with filtered args
-                result_link = result_link.replace(parsed_link.query, '')
-                if len(safe_args) > 1:
-                    result_link = result_link + urlparse.urlencode(safe_args)
-                else:
-                    result_link = result_link.replace('?', '')
-
-                a['href'] = result_link
-
-                # Add no-js option
-                if self.nojs:
-                    nojs_link = soup.new_tag('a')
-                    nojs_link['href'] = '/window?location=' + result_link
-                    nojs_link['style'] = 'display:block;width:100%;'
-                    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
-                    a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
-                    a.append(nojs_link)
-
-        # Set up dark mode if active
-        if self.dark:
-            soup.find('html')['style'] = 'scrollbar-color: #333 #111;'
-            for input_element in soup.findAll('input'):
-                input_element['style'] = 'color:#fff;'
-
-        # Ensure no extra scripts passed through
-        try:
-            for script in soup('script'):
-                script.decompose()
-            soup.find('div', id='sfooter').decompose()
-        except Exception:
-            pass
+        def sync_images():
+            for img in soup.find_all('img'):
+                if img['src'].startswith('//'):
+                    img['src'] = 'https:' + img['src']
+
+                img['src'] = '/tmp?image_url=' + img['src']
+
+        def update_styling():
+            # Remove unnecessary button(s)
+            for button in soup.find_all('button'):
+                button.decompose()
+
+            # Remove svg logos
+            for svg in soup.find_all('svg'):
+                svg.decompose()
+
+            # Update logo
+            logo = soup.find('a', {'class': 'l'})
+            if logo and self.mobile:
+                logo['style'] = 'display:flex; justify-content:center; align-items:center; color:#685e79; ' \
+                                'font-size:18px; '
+
+            # Fix search bar length on mobile
+            try:
+                search_bar = soup.find('header').find('form').find('div')
+                search_bar['style'] = 'width: 100%;'
+            except AttributeError:
+                pass
+
+            # Set up dark mode if active
+            if self.dark:
+                soup.find('html')['style'] = 'scrollbar-color: #333 #111;'
+                for input_element in soup.findAll('input'):
+                    input_element['style'] = 'color:#fff;'
+
+        def update_links():
+            # Replace hrefs with only the intended destination (no "utm" type tags)
+            for a in soup.find_all('a', href=True):
+                href = a['href']
+                if '/advanced_search' in href:
+                    a.decompose()
+                    continue
+
+                if 'url?q=' in href:
+                    # Strip unneeded arguments
+                    result_link = urlparse.urlparse(href)
+                    result_link = parse_qs(result_link.query)['q'][0]
+
+                    parsed_link = urlparse.urlparse(result_link)
+                    link_args = parse_qs(parsed_link.query)
+                    safe_args = {}
+
+                    for arg in link_args.keys():
+                        if arg in SKIP_ARGS:
+                            continue
+
+                        safe_args[arg] = link_args[arg]
+
+                    # Remove original link query and replace with filtered args
+                    result_link = result_link.replace(parsed_link.query, '')
+                    if len(safe_args) > 1:
+                        result_link = result_link + urlparse.urlencode(safe_args)
+                    else:
+                        result_link = result_link.replace('?', '')
+
+                    a['href'] = result_link
+
+                    # Add no-js option
+                    if self.nojs:
+                        nojs_link = soup.new_tag('a')
+                        nojs_link['href'] = '/window?location=' + result_link
+                        nojs_link['style'] = 'display:block;width:100%;'
+                        nojs_link.string = 'NoJS Link: ' + nojs_link['href']
+                        a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
+                        a.append(nojs_link)
+
+            # Ensure no extra scripts passed through
+            try:
+                for script in soup('script'):
+                    script.decompose()
+                soup.find('div', id='sfooter').decompose()
+            except Exception:
+                pass
+
+        remove_ads()
+        sync_images()
+        update_styling()
+        update_links()
+
         return soup

app/request.py

@@ -1,5 +1,4 @@
 from app import rhyme
-from app.filter import Filter
 from io import BytesIO
 import pycurl
 import urllib.parse as urlparse
@@ -60,7 +59,7 @@ class Request:
     def __getitem__(self, name):
         return getattr(self, name)
 
-    def send(self, base_url=SEARCH_URL, query=''):
+    def send(self, base_url=SEARCH_URL, query='', return_bytes=False):
         response_header = []
         b_obj = BytesIO()
@@ -73,4 +72,7 @@ class Request:
         crl.perform()
         crl.close()
 
-        return b_obj.getvalue().decode('utf-8', 'ignore')
+        if return_bytes:
+            return b_obj.getvalue()
+        else:
+            return b_obj.getvalue().decode('utf-8', 'ignore')
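
The new return_bytes flag lets callers ask for the raw response body (needed
for binary payloads such as proxied images) instead of a decoded string. A
minimal usage sketch with placeholder arguments (some_query and image_url are
illustrative, not names from the codebase):

    # default behavior: decoded UTF-8 string, as before
    results_html = g.user_request.send(query=some_query)

    # image proxy path: raw bytes, no decoding
    image_bytes = g.user_request.send(base_url=image_url, return_bytes=True)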

app/routes.py

@@ -2,7 +2,8 @@ from app import app
 from app.filter import Filter
 from app.request import Request, gen_query
 from bs4 import BeautifulSoup
-from flask import g, make_response, request, redirect, render_template
+from flask import g, make_response, request, redirect, render_template, send_file
+import io
 import json
 import os
 import urllib.parse as urlparse
@@ -18,6 +19,11 @@ def before_request_func():
     g.user_request = Request(request.headers.get('User-Agent'))
 
 
+# @app.after_request
+# def after_request(response):
+#     return response
+
+
 @app.route('/', methods=['GET'])
 def index():
     bg = '#000' if 'dark' in user_config and user_config['dark'] else '#fff'
@@ -87,6 +93,21 @@ def imgres():
     return redirect(request.args.get('imgurl'))
 
 
+@app.route('/tmp')
+def tmp():
+    file_data = g.user_request.send(base_url=request.args.get('image_url'), return_bytes=True)
+    tmp_mem = io.BytesIO()
+    tmp_mem.write(file_data)
+    tmp_mem.seek(0)
+    return send_file(
+        tmp_mem,
+        as_attachment=True,
+        attachment_filename='tmp.png',
+        mimetype='image/png'
+    )
+
+
 @app.route('/window')
 def window():
     get_body = g.user_request.send(base_url=request.args.get('location'))