From b2133edaa326a7b14a49a0506d2f76d1424efe8f Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Thu, 11 Jun 2020 13:38:51 -0600 Subject: [PATCH] Session refactoring and improved filter (#86) * Project refactor (#85) * Major refactor of requests and session management - Switches from pycurl to requests library - Allows for less janky decoding, especially with non-latin character sets - Adds session level management of user configs - Allows for each session to set its own config -- users with blocked cookies fall back to the "default" profile (same usage as before) - Updates key gen/regen to more aggressively swap out keys after each request * Added ability to save/load configs by name - New PUT method for config allows changing config with specified name - New methods in js controller to handle loading/saving of configs * Result formatting and removal of unused elements - Fixed question section formatting from results page (added appropriate padding and made questions styled as italic) - Removed user agent display from main config settings * Minor change to save config button label (now "Save As...") * Fixed issue with "de-pickling" of flask session Having a gitignore-everything ("*") file within a flask session folder seems to cause a weird bug where the state of the app becomes unusable from continuously trying to prune files listed in the gitignore (and it can't prune '*'). 
* Switched to pickling saved configs * Updated ad/sponsored content filter and conf naming Configs are now named with a .conf extension to allow for easier manual cleanup/modification of named config files Sponsored content now removed by basic string matching of span content * Version bump to 0.2.0 * Fixed request.send return style * Moved custom conf files to their own directory * Refactored whoogle session mgmt Now allows a fallback "default" session to be used if a user's browser is blocking cookies * Reworked pytest client fixture to support new session mgmt * Added better multilingual support, updated filter Results page now includes method for switching to "All Languages" from whichever language is specified as the primary in the config (see #74). Also removes the non-Whoogle links from the page footer, leaving only the page navigation controls Added support for the date range filter on the results page, though I'd still recommend using the ":past " query instead. * Removed no-cache enforcement, minor styling/formatting improvements * Improving ad filtering for non-English languages * Added footer to results page --- .gitignore | 4 + app/__init__.py | 23 ++++- app/filter.py | 181 ++++++++++++++++++++-------------- app/request.py | 71 ++++++------- app/routes.py | 181 ++++++++++++++++++++-------------- app/static/js/autocomplete.js | 2 +- app/static/js/controller.js | 35 +++++++ app/templates/display.html | 10 +- app/templates/header.html | 4 +- app/templates/index.html | 20 ++-- app/utils/__init__.py | 0 app/utils/misc.py | 29 ++++++ app/utils/routing_utils.py | 72 ++++++++++++++ requirements.txt | 3 +- setup.py | 2 +- test/conftest.py | 9 +- test/test_misc.py | 33 +++++++ test/test_results.py | 6 +- test/test_routes.py | 6 +- 19 files changed, 476 insertions(+), 215 deletions(-) create mode 100644 app/utils/__init__.py create mode 100644 app/utils/misc.py create mode 100644 app/utils/routing_utils.py create mode 100644 test/test_misc.py diff --git 
a/.gitignore b/.gitignore index 20747c7..bbffdb4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,8 +3,12 @@ venv/ __pycache__/ *.pyc *.pem +*.conf config.json test/static +flask_session/ +app/static/config +app/static/custom_config # pip stuff build/ diff --git a/app/__init__.py b/app/__init__.py index 4b78a8d..22e436d 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,12 +1,27 @@ -from cryptography.fernet import Fernet +from app.utils.misc import generate_user_keys from flask import Flask +from flask_session import Session import os app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static') -app.secret_key = Fernet.generate_key() -app.config['VERSION_NUMBER'] = '0.1.4' +app.user_elements = {} +app.default_key_set = generate_user_keys() +app.no_cookie_ips = [] +app.config['SECRET_KEY'] = os.urandom(32) +app.config['SESSION_TYPE'] = 'filesystem' +app.config['VERSION_NUMBER'] = '0.2.0' app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__))) app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static')) -app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER']) + '/config.json' +app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', os.path.join(app.config['STATIC_FOLDER'], 'config')) +app.config['DEFAULT_CONFIG'] = os.path.join(app.config['CONFIG_PATH'], 'config.json') +app.config['SESSION_FILE_DIR'] = os.path.join(app.config['CONFIG_PATH'], 'session') + +if not os.path.exists(app.config['CONFIG_PATH']): + os.makedirs(app.config['CONFIG_PATH']) + +if not os.path.exists(app.config['SESSION_FILE_DIR']): + os.makedirs(app.config['SESSION_FILE_DIR']) + +Session(app) from app import routes diff --git a/app/filter.py b/app/filter.py index 8c25fe4..1cc9f87 100644 --- a/app/filter.py +++ b/app/filter.py @@ -1,5 +1,7 @@ from app.request import VALID_PARAMS +from app.utils.misc import BLACKLIST from bs4 import BeautifulSoup +from 
bs4.element import ResultSet from cryptography.fernet import Fernet import re import urllib.parse as urlparse @@ -17,14 +19,9 @@  def get_first_link(soup): # Replace hrefs with only the intended destination (no "utm" type tags) for a in soup.find_all('a', href=True): - href = a['href'].replace('https://www.google.com', '') - - result_link = urlparse.urlparse(href) - query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' - # Return the first search result URL - if 'url?q=' in href: - return filter_link_args(href) + if 'url?q=' in a['href']: + return filter_link_args(a['href']) def filter_link_args(query_link): @@ -51,8 +48,12 @@ def filter_link_args(query_link): return query_link +def has_ad_content(element: str): + return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element + + class Filter: - def __init__(self, mobile=False, config=None, secret_key=''): + def __init__(self, user_keys: dict, mobile=False, config=None): if config is None: config = {} @@ -61,11 +62,17 @@ class Filter: self.nojs = config['nojs'] if 'nojs' in config else False self.new_tab = config['new_tab'] if 'new_tab' in config else False self.mobile = mobile - self.secret_key = secret_key + self.user_keys = user_keys + self.main_divs = ResultSet('') + self._elements = 0 def __getitem__(self, name): return getattr(self, name) + @property + def elements(self): + return self._elements + def reskin(self, page): # Aesthetic only re-skinning page = page.replace('>G<', '>Wh<') @@ -76,11 +83,31 @@ class Filter: return page + def encrypt_path(self, msg, is_element=False): + # Encrypts path to avoid plaintext results in logs + if is_element: + # Element paths are tracked differently in order for the element key to be regenerated + # once all elements have been loaded + enc_path = Fernet(self.user_keys['element_key']).encrypt(msg.encode()).decode() + self._elements += 1 + return enc_path + + return Fernet(self.user_keys['text_key']).encrypt(msg.encode()).decode() + 
def clean(self, soup): - self.remove_ads(soup) - self.update_image_paths(soup) + self.main_divs = soup.find('div', {'id': 'main'}) + self.remove_ads() + self.fix_question_section() self.update_styling(soup) - self.update_links(soup) + + for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]: + self.update_element_src(img, 'image/png') + + for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]: + self.update_element_src(audio, 'audio/mpeg') + + for link in soup.find_all('a', href=True): + self.update_link(link) input_form = soup.find('form') if input_form is not None: @@ -90,14 +117,11 @@ class Filter: for script in soup('script'): script.decompose() - # Remove google's language/time config - st_card = soup.find('div', id='st-card') - if st_card: - st_card.decompose() - - footer = soup.find('div', id='sfooter') + # Update default footer and header + footer = soup.find('footer') if footer: - footer.decompose() + # Remove divs that have multiple links beyond just page navigation + [_.decompose() for _ in footer.find_all('div', recursive=False) if len(_.find_all('a', href=True)) > 2] header = soup.find('header') if header: @@ -105,35 +129,42 @@ class Filter: return soup - def remove_ads(self, soup): - main_divs = soup.find('div', {'id': 'main'}) - if main_divs is None: + def remove_ads(self): + if not self.main_divs: return - result_divs = main_divs.find_all('div', recursive=False) - for div in [_ for _ in result_divs]: - has_ad = len([_ for _ in div.find_all('span', recursive=True) if 'ad' == _.text.lower()]) + for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]: + has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text)]) _ = div.decompose() if has_ad else None - def update_image_paths(self, soup): - for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]: - img_src = img['src'] - if img_src.startswith('//'): - img_src = 'https:' + img_src - elif img_src.startswith(LOGO_URL): - # 
Re-brand with Whoogle logo - img['src'] = '/static/img/logo.png' - img['style'] = 'height:40px;width:162px' - continue - elif img_src.startswith(GOOG_IMG): - img['src'] = BLANK_B64 - continue + def fix_question_section(self): + if not self.main_divs: + return - enc_src = Fernet(self.secret_key).encrypt(img_src.encode()) - img['src'] = '/tmp?image_url=' + enc_src.decode() - # TODO: Non-mobile image results link to website instead of image - # if not self.mobile: - # img.append(BeautifulSoup(FULL_RES_IMG.format(img_src), 'html.parser')) + question_divs = [_ for _ in self.main_divs.find_all('div', recursive=False) if len(_.find_all('h2')) > 0] + for question_div in question_divs: + questions = [_ for _ in question_div.find_all('div', recursive=True) if _.text.endswith('?')] + for question in questions: + question['style'] = 'padding: 10px; font-style: italic;' + + def update_element_src(self, element, mime): + element_src = element['src'] + if element_src.startswith('//'): + element_src = 'https:' + element_src + elif element_src.startswith(LOGO_URL): + # Re-brand with Whoogle logo + element['src'] = '/static/img/logo.png' + element['style'] = 'height:40px;width:162px' + return + elif element_src.startswith(GOOG_IMG): + element['src'] = BLANK_B64 + return + + element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \ + '&type=' + urlparse.quote(mime) + # TODO: Non-mobile image results link to website instead of image + # if not self.mobile: + # img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser')) def update_styling(self, soup): # Remove unnecessary button(s) @@ -169,45 +200,43 @@ class Filter: for href_element in soup.findAll('a'): href_element['style'] = 'color: white' if href_element['href'].startswith('/search') else '' - def update_links(self, soup): - # Replace hrefs with only the intended destination (no "utm" type tags) - for a in soup.find_all('a', href=True): - href = 
a['href'].replace('https://www.google.com', '') - if '/advanced_search' in href: - a.decompose() - continue - elif self.new_tab: - a['target'] = '_blank' + def update_link(self, link): + # Replace href with only the intended destination (no "utm" type tags) + href = link['href'].replace('https://www.google.com', '') + if '/advanced_search' in href: + link.decompose() + return + elif self.new_tab: + link['target'] = '_blank' - result_link = urlparse.urlparse(href) - query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' + result_link = urlparse.urlparse(href) + query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' - if query_link.startswith('/'): - a['href'] = 'https://google.com' + query_link - elif '/search?q=' in href: - enc_result = Fernet(self.secret_key).encrypt(query_link.encode()) - new_search = '/search?q=' + enc_result.decode() + if query_link.startswith('/'): + link['href'] = 'https://google.com' + query_link + elif '/search?q=' in href: + new_search = '/search?q=' + self.encrypt_path(query_link) - query_params = parse_qs(urlparse.urlparse(href).query) - for param in VALID_PARAMS: - param_val = query_params[param][0] if param in query_params else '' - new_search += '&' + param + '=' + param_val - a['href'] = new_search - elif 'url?q=' in href: - # Strip unneeded arguments - a['href'] = filter_link_args(query_link) + query_params = parse_qs(urlparse.urlparse(href).query) + for param in VALID_PARAMS: + param_val = query_params[param][0] if param in query_params else '' + new_search += '&' + param + '=' + param_val + link['href'] = new_search + elif 'url?q=' in href: + # Strip unneeded arguments + link['href'] = filter_link_args(query_link) - # Add no-js option - if self.nojs: - gen_nojs(soup, a['href'], a) - else: - a['href'] = href + # Add no-js option + if self.nojs: + gen_nojs(link) + else: + link['href'] = href -def gen_nojs(soup, link, sibling): - nojs_link = soup.new_tag('a') - nojs_link['href'] = 
'/window?location=' + link +def gen_nojs(sibling): + nojs_link = BeautifulSoup().new_tag('a') + nojs_link['href'] = '/window?location=' + sibling['href'] nojs_link['style'] = 'display:block;width:100%;' nojs_link.string = 'NoJS Link: ' + nojs_link['href'] sibling.append(BeautifulSoup('


', 'html.parser')) - sibling.append(nojs_link) \ No newline at end of file + sibling.append(nojs_link) diff --git a/app/request.py b/app/request.py index 7ecd887..fe7d3fb 100644 --- a/app/request.py +++ b/app/request.py @@ -1,7 +1,7 @@ -from io import BytesIO from lxml import etree -import pycurl import random +import requests +from requests import Response import urllib.parse as urlparse # Core Google search URLs @@ -12,27 +12,38 @@ MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0' DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' # Valid query params -VALID_PARAMS = ['tbs', 'tbm', 'start', 'near'] +VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source'] -def gen_user_agent(normal_ua, is_mobile): +def gen_user_agent(is_mobile): mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla' firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox' linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux' if is_mobile: return MOBILE_UA.format(mozilla, firefox) - else: - return DESKTOP_UA.format(mozilla, linux, firefox) + + return DESKTOP_UA.format(mozilla, linux, firefox) def gen_query(query, args, config, near_city=None): param_dict = {key: '' for key in VALID_PARAMS} + # Use :past(hour/day/week/month/year) if available # example search "new restaurants :past month" - if ':past' in query: + sub_lang = '' + if ':past' in query and 'tbs' not in args: time_range = str.strip(query.split(':past', 1)[-1]) - param_dict['tbs'] = '&tbs=qdr:' + str.lower(time_range[0]) + param_dict['tbs'] = '&tbs=' + ('qdr:' + str.lower(time_range[0])) + elif 'tbs' in args: + result_tbs = args.get('tbs') + param_dict['tbs'] = '&tbs=' + result_tbs + + # Occasionally the 'tbs' param provided by google also contains a field for 'lr', but formatted + # strangely. This is a (admittedly not very elegant) solution for this. 
+ # Ex/ &tbs=qdr:h,lr:lang_1pl --> the lr param needs to be extracted and have the "1" digit removed in this case + sub_lang = [_ for _ in result_tbs.split(',') if 'lr:' in _] + sub_lang = sub_lang[0][sub_lang[0].find('lr:') + 3:len(sub_lang[0])] if len(sub_lang) > 0 else '' # Ensure search query is parsable query = urlparse.quote(query) @@ -49,13 +60,20 @@ def gen_query(query, args, config, near_city=None): if near_city: param_dict['near'] = '&near=' + urlparse.quote(near_city) - # Set language for results (lr) and interface (hl) - param_dict['lr'] = '&lr=' + config.lang + '&hl=' + config.lang.replace('lang_', '') + # Set language for results (lr) if source isn't set, otherwise use the result + # language param provided by google (but with the strange digit(s) removed) + if 'source' in args: + param_dict['source'] = '&source=' + args.get('source') + param_dict['lr'] = ('&lr=' + ''.join([_ for _ in sub_lang if not _.isdigit()])) if sub_lang else '' + else: + param_dict['lr'] = '&lr=' + config.lang + param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else '' + param_dict['hl'] = '&hl=' + config.lang.replace('lang_', '') param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off') for val in param_dict.values(): - if not val or val is None: + if not val: continue query += val @@ -66,20 +84,14 @@ class Request: def __init__(self, normal_ua, language='lang_en'): self.language = language self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua - self.modified_user_agent = gen_user_agent(normal_ua, self.mobile) + self.modified_user_agent = gen_user_agent(self.mobile) def __getitem__(self, name): return getattr(self, name) - def get_decode_value(self): - if 'lang_zh' in self.language: - return 'gb2312' - else: - return 'unicode-escape' - def autocomplete(self, query): ac_query = dict(hl=self.language, q=query) - response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)) + response = self.send(base_url=AUTOCOMPLETE_URL, 
query=urlparse.urlencode(ac_query)).text if response: dom = etree.fromstring(response) @@ -87,20 +99,9 @@ class Request: return [] - def send(self, base_url=SEARCH_URL, query='', return_bytes=False): - response_header = [] + def send(self, base_url=SEARCH_URL, query='') -> Response: + headers = { + 'User-Agent': self.modified_user_agent + } - b_obj = BytesIO() - crl = pycurl.Curl() - crl.setopt(crl.URL, base_url + query) - crl.setopt(crl.USERAGENT, self.modified_user_agent) - crl.setopt(crl.WRITEDATA, b_obj) - crl.setopt(crl.HEADERFUNCTION, response_header.append) - crl.setopt(pycurl.FOLLOWLOCATION, 1) - crl.perform() - crl.close() - - if return_bytes: - return b_obj.getvalue() - else: - return b_obj.getvalue().decode(self.get_decode_value(), 'ignore') + return requests.get(base_url + query, headers=headers) diff --git a/app/routes.py b/app/routes.py index 3f50082..ed288c0 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,19 +1,22 @@ -from app import app -from app.filter import Filter, get_first_link -from app.models.config import Config -from app.request import Request, gen_query import argparse import base64 -from bs4 import BeautifulSoup -from cryptography.fernet import Fernet, InvalidToken -from flask import g, jsonify, make_response, request, redirect, render_template, send_file -from functools import wraps import io import json import os -from pycurl import error as pycurl_error +import pickle import urllib.parse as urlparse +import uuid +from functools import wraps + import waitress +from flask import jsonify, make_response, request, redirect, render_template, send_file, session +from requests import exceptions + +from app import app +from app.models.config import Config +from app.request import Request +from app.utils.misc import valid_user_session +from app.utils.routing_utils import * def auth_required(f): @@ -34,17 +37,30 @@ def auth_required(f): @app.before_request def before_request_func(): - # Always redirect to https if HTTPS_ONLY is set 
(otherwise default to false) + g.request_params = request.args if request.method == 'GET' else request.form + g.cookies_disabled = False + + # Generate session values for user if unavailable + if not valid_user_session(session): + session['config'] = json.load(open(app.config['DEFAULT_CONFIG'])) \ + if os.path.exists(app.config['DEFAULT_CONFIG']) else {'url': request.url_root} + session['uuid'] = str(uuid.uuid4()) + session['fernet_keys'] = generate_user_keys(True) + + # Flag cookies as possibly disabled in order to prevent against + # unnecessary session directory expansion + g.cookies_disabled = True + + if session['uuid'] not in app.user_elements: + app.user_elements.update({session['uuid']: 0}) + + # Always redirect to https if HTTPS_ONLY is set (otherwise default to False) https_only = os.getenv('HTTPS_ONLY', False) - config_path = app.config['CONFIG_PATH'] if https_only and request.url.startswith('http://'): - https_url = request.url.replace('http://', 'https://', 1) - code = 308 - return redirect(https_url, code=code) + return redirect(request.url.replace('http://', 'https://', 1), code=308) - json_config = json.load(open(config_path)) if os.path.exists(config_path) else {'url': request.url_root} - g.user_config = Config(**json_config) + g.user_config = Config(**session['config']) if not g.user_config.url: g.user_config.url = request.url_root.replace('http://', 'https://') if https_only else request.url_root @@ -53,6 +69,27 @@ def before_request_func(): g.app_location = g.user_config.url +@app.after_request +def after_request_func(response): + if app.user_elements[session['uuid']] <= 0 and '/element' in request.url: + # Regenerate element key if all elements have been served to user + session['fernet_keys']['element_key'] = '' if not g.cookies_disabled else app.default_key_set['element_key'] + app.user_elements[session['uuid']] = 0 + + # Check if address consistently has cookies blocked, in which case start removing session + # files after creation. 
+ # Note: This is primarily done to prevent overpopulation of session directories, since browsers that + # block cookies will still trigger Flask's session creation routine with every request. + if g.cookies_disabled and request.remote_addr not in app.no_cookie_ips: + app.no_cookie_ips.append(request.remote_addr) + elif g.cookies_disabled and request.remote_addr in app.no_cookie_ips: + session_list = list(session.keys()) + for key in session_list: + session.pop(key) + + return response + + @app.errorhandler(404) def unknown_page(e): return redirect(g.app_location) @@ -61,15 +98,14 @@ def unknown_page(e): @app.route('/', methods=['GET']) @auth_required def index(): + # Reset keys + session['fernet_keys'] = generate_user_keys(g.cookies_disabled) + return render_template('index.html', - dark_mode=g.user_config.dark, - ua=g.user_request.modified_user_agent, languages=Config.LANGUAGES, countries=Config.COUNTRIES, - current_lang=g.user_config.lang, - current_ctry=g.user_config.ctry, - version_number=app.config['VERSION_NUMBER'], - request_type='get' if g.user_config.get_only else 'post') + config=g.user_config, + version_number=app.config['VERSION_NUMBER']) @app.route('/opensearch.xml', methods=['GET']) @@ -89,8 +125,7 @@ def opensearch(): @app.route('/autocomplete', methods=['GET', 'POST']) def autocomplete(): - request_params = request.args if request.method == 'GET' else request.form - q = request_params.get('q') + q = g.request_params.get('q') if not q and not request.data: return jsonify({'?': []}) @@ -103,68 +138,65 @@ def autocomplete(): @app.route('/search', methods=['GET', 'POST']) @auth_required def search(): - request_params = request.args if request.method == 'GET' else request.form - q = request_params.get('q') + # Reset element counter + app.user_elements[session['uuid']] = 0 - if q is None or len(q) == 0: + search_util = RoutingUtils(request, g.user_config, session, cookies_disabled=g.cookies_disabled) + query = search_util.new_search_query() + + # 
Redirect to home if invalid/blank search + if not query: return redirect('/') - else: - # Attempt to decrypt if this is an internal link - try: - q = Fernet(app.secret_key).decrypt(q.encode()).decode() - except InvalidToken: - pass - feeling_lucky = q.startswith('! ') + # Generate response and number of external elements from the page + response, elements = search_util.generate_response() + if search_util.feeling_lucky: + return redirect(response, code=303) - if feeling_lucky: # Well do you, punk? - q = q[2:] - - user_agent = request.headers.get('User-Agent') - mobile = 'Android' in user_agent or 'iPhone' in user_agent - - content_filter = Filter(mobile, g.user_config, secret_key=app.secret_key) - full_query = gen_query(q, request_params, g.user_config, content_filter.near) - get_body = g.user_request.send(query=full_query) - dirty_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser') - - if feeling_lucky: - return redirect(get_first_link(dirty_soup), 303) # Using 303 so the browser performs a GET request for the URL - else: - formatted_results = content_filter.clean(dirty_soup) - - # Set search type to be used in the header template to allow for repeated searches - # in the same category - search_type = request_params.get('tbm') if 'tbm' in request_params else '' + # Keep count of external elements to fetch before element key can be regenerated + app.user_elements[session['uuid']] = elements return render_template( 'display.html', - query=urlparse.unquote(q), - search_type=search_type, + query=urlparse.unquote(query), + search_type=search_util.search_type, dark_mode=g.user_config.dark, - response=formatted_results, + response=response, + version_number=app.config['VERSION_NUMBER'], search_header=render_template( 'header.html', dark_mode=g.user_config.dark, - q=urlparse.unquote(q), - search_type=search_type, - mobile=g.user_request.mobile) if 'isch' not in search_type else '') + query=urlparse.unquote(query), + search_type=search_util.search_type, + 
mobile=g.user_request.mobile) if 'isch' not in search_util.search_type else '') -@app.route('/config', methods=['GET', 'POST']) +@app.route('/config', methods=['GET', 'POST', 'PUT']) @auth_required def config(): if request.method == 'GET': return json.dumps(g.user_config.__dict__) + elif request.method == 'PUT': + if 'name' in request.args: + config_pkl = os.path.join(app.config['CONFIG_PATH'], request.args.get('name')) + session['config'] = pickle.load(open(config_pkl, 'rb')) if os.path.exists(config_pkl) else session['config'] + return json.dumps(session['config']) + else: + return json.dumps({}) else: config_data = request.form.to_dict() if 'url' not in config_data or not config_data['url']: config_data['url'] = g.user_config.url - with open(app.config['CONFIG_PATH'], 'w') as config_file: - config_file.write(json.dumps(config_data, indent=4)) - config_file.close() + # Save config by name to allow a user to easily load later + if 'name' in request.args: + pickle.dump(config_data, open(os.path.join(app.config['CONFIG_PATH'], request.args.get('name')), 'wb')) + # Overwrite default config if user has cookies disabled + if g.cookies_disabled: + open(app.config['DEFAULT_CONFIG'], 'w').write(json.dumps(config_data, indent=4)) + + session['config'] = config_data return redirect(config_data['url']) @@ -187,25 +219,22 @@ def imgres(): return redirect(request.args.get('imgurl')) -@app.route('/tmp') +@app.route('/element') @auth_required -def tmp(): - cipher_suite = Fernet(app.secret_key) - img_url = cipher_suite.decrypt(request.args.get('image_url').encode()).decode() +def element(): + cipher_suite = Fernet(session['fernet_keys']['element_key']) + src_url = cipher_suite.decrypt(request.args.get('url').encode()).decode() + src_type = request.args.get('type') try: - file_data = g.user_request.send(base_url=img_url, return_bytes=True) + file_data = g.user_request.send(base_url=src_url).content + app.user_elements[session['uuid']] -= 1 tmp_mem = io.BytesIO() 
tmp_mem.write(file_data) tmp_mem.seek(0) - return send_file( - tmp_mem, - as_attachment=True, - attachment_filename='tmp.png', - mimetype='image/png' - ) - except pycurl_error: + return send_file(tmp_mem, mimetype=src_type) + except exceptions.RequestException: pass empty_gif = base64.b64decode('R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==') @@ -215,7 +244,7 @@ def tmp(): @app.route('/window') @auth_required def window(): - get_body = g.user_request.send(base_url=request.args.get('location')) + get_body = g.user_request.send(base_url=request.args.get('location')).text get_body = get_body.replace('src="/', 'src="' + request.args.get('location') + '"') get_body = get_body.replace('href="/', 'href="' + request.args.get('location') + '"') diff --git a/app/static/js/autocomplete.js b/app/static/js/autocomplete.js index 316f8c4..84e9b23 100644 --- a/app/static/js/autocomplete.js +++ b/app/static/js/autocomplete.js @@ -4,7 +4,7 @@ const handleUserInput = searchBar => { xhrRequest.setRequestHeader("Content-type", "application/x-www-form-urlencoded"); xhrRequest.onload = function() { if (xhrRequest.readyState === 4 && xhrRequest.status !== 200) { - alert("Error fetching autocomplete results"); + // Do nothing if failed to fetch autocomplete results return; } diff --git a/app/static/js/controller.js b/app/static/js/controller.js index 4817195..95d917b 100644 --- a/app/static/js/controller.js +++ b/app/static/js/controller.js @@ -71,6 +71,41 @@ const setupConfigLayout = () => { fillConfigValues(); }; +const loadConfig = event => { + event.preventDefault(); + let config = prompt("Enter name of config:"); + if (!config) { + alert("Must specify a name for the config to load"); + return; + } + + let xhrPUT = new XMLHttpRequest(); + xhrPUT.open("PUT", "/config?name=" + config + ".conf"); + xhrPUT.onload = function() { + if (xhrPUT.readyState === 4 && xhrPUT.status !== 200) { + alert("Error loading Whoogle config"); + return; + } + + location.reload(true); + }; + + 
xhrPUT.send(); +}; + +const saveConfig = event => { + event.preventDefault(); + let config = prompt("Enter name for this config:"); + if (!config) { + alert("Must specify a name for the config to save"); + return; + } + + let configForm = document.getElementById("config-form"); + configForm.action = '/config?name=' + config + ".conf"; + configForm.submit(); +}; + document.addEventListener("DOMContentLoaded", function() { setTimeout(function() { document.getElementById("main").style.display = "block"; diff --git a/app/templates/display.html b/app/templates/display.html index 94ebed2..bd18838 100644 --- a/app/templates/display.html +++ b/app/templates/display.html @@ -11,7 +11,13 @@ {{ query }} - Whoogle Search - {{ search_header|safe }} - {{ response|safe }} + {{ search_header|safe }} + {{ response|safe }} + diff --git a/app/templates/header.html b/app/templates/header.html index 5356ec2..5573b99 100644 --- a/app/templates/header.html +++ b/app/templates/header.html @@ -15,7 +15,7 @@ style="background-color: {{ '#000' if dark_mode else '#fff' }}; color: {{ '#685e79' if dark_mode else '#000' }}; border: {{ '1px solid #685e79' if dark_mode else '' }}" - spellcheck="false" type="text" value="{{ q }}"> + spellcheck="false" type="text" value="{{ query }}">
@@ -37,7 +37,7 @@
diff --git a/app/templates/index.html b/app/templates/index.html index 9279031..7d32b9f 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -21,14 +21,14 @@ - + Whoogle Search - +
-
+
@@ -40,17 +40,13 @@
- -
- - User Agent: {{ ua }} -
+
{% for lang in languages %}
- +   +   +
diff --git a/app/utils/__init__.py b/app/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/utils/misc.py b/app/utils/misc.py
new file mode 100644
index 0000000..b87941d
--- /dev/null
+++ b/app/utils/misc.py
@@ -0,0 +1,29 @@
+from cryptography.fernet import Fernet
+from flask import current_app as app
+
+REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys']
+BLACKLIST = [
+    'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고',
+    'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam',
+    'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés'
+]
+
+
+def generate_user_keys(cookies_disabled=False) -> dict:
+    if cookies_disabled:
+        return app.default_key_set
+
+    # Generate/regenerate unique key per user
+    return {
+        'element_key': Fernet.generate_key(),
+        'text_key': Fernet.generate_key()
+    }
+
+
+def valid_user_session(session):
+    # Generate secret key for user if unavailable
+    for value in REQUIRED_SESSION_VALUES:
+        if value not in session:
+            return False
+
+    return True
diff --git a/app/utils/routing_utils.py b/app/utils/routing_utils.py
new file mode 100644
index 0000000..40f8a90
--- /dev/null
+++ b/app/utils/routing_utils.py
@@ -0,0 +1,72 @@
+from app.filter import Filter, get_first_link
+from app.utils.misc import generate_user_keys
+from app.request import gen_query
+from bs4 import BeautifulSoup
+from cryptography.fernet import Fernet, InvalidToken
+from flask import g
+from typing import Any, Tuple
+
+
+class RoutingUtils:
+    def __init__(self, request, config, session, cookies_disabled=False):
+        self.request_params = request.args if request.method == 'GET' else request.form
+        self.user_agent = request.headers.get('User-Agent')
+        self.feeling_lucky = False
+        self.config = config
+        self.session = session
+        self.query = ''
+        self.cookies_disabled = cookies_disabled
+        self.search_type = self.request_params.get('tbm') if 'tbm' in self.request_params else ''
+
+    def __getitem__(self, name):
+        return getattr(self, name)
+
+    def __setitem__(self, name, value):
+        return setattr(self, name, value)
+
+    def __delitem__(self, name):
+        return delattr(self, name)
+
+    def __contains__(self, name):
+        return hasattr(self, name)
+
+    def new_search_query(self) -> str:
+        # Generate a new element key each time a new search is performed
+        self.session['fernet_keys']['element_key'] = generate_user_keys(
+            cookies_disabled=self.cookies_disabled)['element_key']
+
+        q = self.request_params.get('q')
+
+        if q is None or len(q) == 0:
+            return ''
+        else:
+            # Attempt to decrypt if this is an internal link
+            try:
+                q = Fernet(self.session['fernet_keys']['text_key']).decrypt(q.encode()).decode()
+            except InvalidToken:
+                pass
+
+        # Reset text key
+        self.session['fernet_keys']['text_key'] = generate_user_keys(
+            cookies_disabled=self.cookies_disabled)['text_key']
+
+        # Format depending on whether or not the query is a "feeling lucky" query
+        self.feeling_lucky = q.startswith('! ')
+        self.query = q[2:] if self.feeling_lucky else q
+        return self.query
+
+    def generate_response(self) -> Tuple[Any, int]:
+        mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent
+
+        content_filter = Filter(self.session['fernet_keys'], mobile=mobile, config=self.config)
+        full_query = gen_query(self.query, self.request_params, self.config, content_filter.near)
+        get_body = g.user_request.send(query=full_query).text
+
+        # Produce cleanable html soup from response
+        html_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser')
+
+        if self.feeling_lucky:
+            return get_first_link(html_soup), 1
+        else:
+            formatted_results = content_filter.clean(html_soup)
+            return formatted_results, content_filter.elements
diff --git a/requirements.txt b/requirements.txt
index 030780c..702d8ba 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,15 +4,16 @@ cffi==1.13.2
 Click==7.0
 cryptography==2.8
 Flask==1.1.1
+Flask-Session==0.3.2
 itsdangerous==1.1.0
 Jinja2==2.10.3
 lxml==4.5.1
 MarkupSafe==1.1.1
 pycparser==2.19
-pycurl==7.43.0.4
 pyOpenSSL==19.1.0
 pytest==5.4.1
 python-dateutil==2.8.1
+requests==2.23.0
 six==1.14.0
 soupsieve==1.9.5
 Werkzeug==0.16.0
diff --git a/setup.py b/setup.py
index 3428459..08652bc 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@ setuptools.setup(
     author='Ben Busby',
     author_email='benbusby@protonmail.com',
     name='whoogle-search',
-    version='0.1.4',
+    version='0.2.0',
     include_package_data=True,
     install_requires=requirements,
     description='Self-hosted, ad-free, privacy-respecting Google metasearch engine',
diff --git a/test/conftest.py b/test/conftest.py
index 3d2aa33..63aec3e 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -1,8 +1,13 @@
 from app import app
+from app.utils.misc import generate_user_keys
 import pytest
 
 
 @pytest.fixture
 def client():
-    client = app.test_client()
-    yield client
+    with app.test_client() as client:
+        with client.session_transaction() as session:
+            session['uuid'] = 'test'
+            session['fernet_keys'] = generate_user_keys()
+            session['config'] = {}
+        yield client
diff --git a/test/test_misc.py b/test/test_misc.py
new file mode 100644
index 0000000..8eb1d78
--- /dev/null
+++ b/test/test_misc.py
@@ -0,0 +1,33 @@
+from app.utils.misc import generate_user_keys, valid_user_session
+
+
+def test_generate_user_keys():
+    keys = generate_user_keys()
+    assert 'text_key' in keys
+    assert 'element_key' in keys
+    assert keys['text_key'] not in keys['element_key']
+
+
+def test_valid_session(client):
+    assert not valid_user_session({'fernet_keys': '', 'config': {}})
+    with client.session_transaction() as session:
+        assert valid_user_session(session)
+
+
+def test_request_key_generation(client):
+    rv = client.get('/')
+    cookie = rv.headers['Set-Cookie']
+
+    rv = client.get('/search?q=test+1', headers={'Cookie': cookie})
+    assert rv._status_code == 200
+
+    with client.session_transaction() as session:
+        assert valid_user_session(session)
+        text_key = session['fernet_keys']['text_key']
+
+    rv = client.get('/search?q=test+2', headers={'Cookie': cookie})
+    assert rv._status_code == 200
+
+    with client.session_transaction() as session:
+        assert valid_user_session(session)
+        assert text_key not in session['fernet_keys']['text_key']
diff --git a/test/test_results.py b/test/test_results.py
index 7f500c8..a943de6 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -1,13 +1,13 @@
 from bs4 import BeautifulSoup
-from cryptography.fernet import Fernet
 from app.filter import Filter
+from app.utils.misc import generate_user_keys
 from datetime import datetime
 from dateutil.parser import *
 
 
 def get_search_results(data):
-    secret_key = Fernet.generate_key()
-    soup = Filter(secret_key=secret_key).clean(BeautifulSoup(data, 'html.parser'))
+    secret_key = generate_user_keys()
+    soup = Filter(user_keys=secret_key).clean(BeautifulSoup(data, 'html.parser'))
 
     main_divs = soup.find('div', {'id': 'main'})
     assert len(main_divs) > 1
diff --git a/test/test_routes.py b/test/test_routes.py
index 91e17be..56c9909 100644
--- a/test/test_routes.py
+++ b/test/test_routes.py
@@ -1,10 +1,13 @@
+from app.models.config import Config
 import json
 import random
 
 demo_config = {
     'near': random.choice(['Seattle', 'New York', 'San Francisco']),
     'dark_mode': str(random.getrandbits(1)),
-    'nojs': str(random.getrandbits(1))
+    'nojs': str(random.getrandbits(1)),
+    'lang': random.choice(Config.LANGUAGES)['value'],
+    'ctry': random.choice(Config.COUNTRIES)['value']
 }
 
 
@@ -17,6 +20,7 @@ def test_search(client):
     rv = client.get('/search?q=test')
     assert rv._status_code == 200
+
+
 def test_feeling_lucky(client):
     rv = client.get('/search?q=!%20test')
     assert rv._status_code == 303