Session refactoring and improved filter (#86)

* Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config -- users with blocked cookies fall back to the "default" profile (same usage as before)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to save config button label (now "Save As...")

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style

* Moved custom conf files to their own directory

* Refactored whoogle session mgmt

Now allows a fallback "default" session to be used if a user's browser
is blocking cookies

* Reworked pytest client fixture to support new session mgmt

* Added better multilingual support, updated filter

Results page now includes method for switching to "All Languages" from
whichever language is specified as the primary in the config (see #74).

Also removes the non-Whoogle links from the page footer, leaving only
the page navigation controls

Added support for the date range filter on the results page, though I'd
still recommend using the ":past <unit>" query instead.

* Removed no-cache enforcement, minor styling/formatting improvements

* Improving ad filtering for non-English languages

* Added footer to results page
main
Ben Busby 2020-06-11 13:38:51 -06:00 committed by GitHub
parent d859e46a6c
commit b2133edaa3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 476 additions and 215 deletions

4
.gitignore vendored
View File

@ -3,8 +3,12 @@ venv/
__pycache__/ __pycache__/
*.pyc *.pyc
*.pem *.pem
*.conf
config.json config.json
test/static test/static
flask_session/
app/static/config
app/static/custom_config
# pip stuff # pip stuff
build/ build/

View File

@ -1,12 +1,27 @@
from cryptography.fernet import Fernet from app.utils.misc import generate_user_keys
from flask import Flask from flask import Flask
from flask_session import Session
import os import os
app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static') app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static')
app.secret_key = Fernet.generate_key() app.user_elements = {}
app.config['VERSION_NUMBER'] = '0.1.4' app.default_key_set = generate_user_keys()
app.no_cookie_ips = []
app.config['SECRET_KEY'] = os.urandom(32)
app.config['SESSION_TYPE'] = 'filesystem'
app.config['VERSION_NUMBER'] = '0.2.0'
app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__))) app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__)))
app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static')) app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static'))
app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER']) + '/config.json' app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', os.path.join(app.config['STATIC_FOLDER'], 'config'))
app.config['DEFAULT_CONFIG'] = os.path.join(app.config['CONFIG_PATH'], 'config.json')
app.config['SESSION_FILE_DIR'] = os.path.join(app.config['CONFIG_PATH'], 'session')
if not os.path.exists(app.config['CONFIG_PATH']):
os.makedirs(app.config['CONFIG_PATH'])
if not os.path.exists(app.config['SESSION_FILE_DIR']):
os.makedirs(app.config['SESSION_FILE_DIR'])
Session(app)
from app import routes from app import routes

View File

@ -1,5 +1,7 @@
from app.request import VALID_PARAMS from app.request import VALID_PARAMS
from app.utils.misc import BLACKLIST
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import ResultSet
from cryptography.fernet import Fernet from cryptography.fernet import Fernet
import re import re
import urllib.parse as urlparse import urllib.parse as urlparse
@ -17,14 +19,9 @@ data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42m
def get_first_link(soup): def get_first_link(soup):
# Replace hrefs with only the intended destination (no "utm" type tags) # Replace hrefs with only the intended destination (no "utm" type tags)
for a in soup.find_all('a', href=True): for a in soup.find_all('a', href=True):
href = a['href'].replace('https://www.google.com', '')
result_link = urlparse.urlparse(href)
query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''
# Return the first search result URL # Return the first search result URL
if 'url?q=' in href: if 'url?q=' in a['href']:
return filter_link_args(href) return filter_link_args(a['href'])
def filter_link_args(query_link): def filter_link_args(query_link):
@ -51,8 +48,12 @@ def filter_link_args(query_link):
return query_link return query_link
def has_ad_content(element: str):
return element.upper() in (value.upper() for value in BLACKLIST) or '' in element
class Filter: class Filter:
def __init__(self, mobile=False, config=None, secret_key=''): def __init__(self, user_keys: dict, mobile=False, config=None):
if config is None: if config is None:
config = {} config = {}
@ -61,11 +62,17 @@ class Filter:
self.nojs = config['nojs'] if 'nojs' in config else False self.nojs = config['nojs'] if 'nojs' in config else False
self.new_tab = config['new_tab'] if 'new_tab' in config else False self.new_tab = config['new_tab'] if 'new_tab' in config else False
self.mobile = mobile self.mobile = mobile
self.secret_key = secret_key self.user_keys = user_keys
self.main_divs = ResultSet('')
self._elements = 0
def __getitem__(self, name): def __getitem__(self, name):
return getattr(self, name) return getattr(self, name)
@property
def elements(self):
return self._elements
def reskin(self, page): def reskin(self, page):
# Aesthetic only re-skinning # Aesthetic only re-skinning
page = page.replace('>G<', '>Wh<') page = page.replace('>G<', '>Wh<')
@ -76,11 +83,31 @@ class Filter:
return page return page
def encrypt_path(self, msg, is_element=False):
# Encrypts path to avoid plaintext results in logs
if is_element:
# Element paths are tracked differently in order for the element key to be regenerated
# once all elements have been loaded
enc_path = Fernet(self.user_keys['element_key']).encrypt(msg.encode()).decode()
self._elements += 1
return enc_path
return Fernet(self.user_keys['text_key']).encrypt(msg.encode()).decode()
def clean(self, soup): def clean(self, soup):
self.remove_ads(soup) self.main_divs = soup.find('div', {'id': 'main'})
self.update_image_paths(soup) self.remove_ads()
self.fix_question_section()
self.update_styling(soup) self.update_styling(soup)
self.update_links(soup)
for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
self.update_element_src(img, 'image/png')
for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
self.update_element_src(audio, 'audio/mpeg')
for link in soup.find_all('a', href=True):
self.update_link(link)
input_form = soup.find('form') input_form = soup.find('form')
if input_form is not None: if input_form is not None:
@ -90,14 +117,11 @@ class Filter:
for script in soup('script'): for script in soup('script'):
script.decompose() script.decompose()
# Remove google's language/time config # Update default footer and header
st_card = soup.find('div', id='st-card') footer = soup.find('footer')
if st_card:
st_card.decompose()
footer = soup.find('div', id='sfooter')
if footer: if footer:
footer.decompose() # Remove divs that have multiple links beyond just page navigation
[_.decompose() for _ in footer.find_all('div', recursive=False) if len(_.find_all('a', href=True)) > 2]
header = soup.find('header') header = soup.find('header')
if header: if header:
@ -105,35 +129,42 @@ class Filter:
return soup return soup
def remove_ads(self, soup): def remove_ads(self):
main_divs = soup.find('div', {'id': 'main'}) if not self.main_divs:
if main_divs is None:
return return
result_divs = main_divs.find_all('div', recursive=False)
for div in [_ for _ in result_divs]: for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
has_ad = len([_ for _ in div.find_all('span', recursive=True) if 'ad' == _.text.lower()]) has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text)])
_ = div.decompose() if has_ad else None _ = div.decompose() if has_ad else None
def update_image_paths(self, soup): def fix_question_section(self):
for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]: if not self.main_divs:
img_src = img['src'] return
if img_src.startswith('//'):
img_src = 'https:' + img_src
elif img_src.startswith(LOGO_URL):
# Re-brand with Whoogle logo
img['src'] = '/static/img/logo.png'
img['style'] = 'height:40px;width:162px'
continue
elif img_src.startswith(GOOG_IMG):
img['src'] = BLANK_B64
continue
enc_src = Fernet(self.secret_key).encrypt(img_src.encode()) question_divs = [_ for _ in self.main_divs.find_all('div', recursive=False) if len(_.find_all('h2')) > 0]
img['src'] = '/tmp?image_url=' + enc_src.decode() for question_div in question_divs:
# TODO: Non-mobile image results link to website instead of image questions = [_ for _ in question_div.find_all('div', recursive=True) if _.text.endswith('?')]
# if not self.mobile: for question in questions:
# img.append(BeautifulSoup(FULL_RES_IMG.format(img_src), 'html.parser')) question['style'] = 'padding: 10px; font-style: italic;'
def update_element_src(self, element, mime):
element_src = element['src']
if element_src.startswith('//'):
element_src = 'https:' + element_src
elif element_src.startswith(LOGO_URL):
# Re-brand with Whoogle logo
element['src'] = '/static/img/logo.png'
element['style'] = 'height:40px;width:162px'
return
elif element_src.startswith(GOOG_IMG):
element['src'] = BLANK_B64
return
element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \
'&type=' + urlparse.quote(mime)
# TODO: Non-mobile image results link to website instead of image
# if not self.mobile:
# img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser'))
def update_styling(self, soup): def update_styling(self, soup):
# Remove unnecessary button(s) # Remove unnecessary button(s)
@ -169,44 +200,42 @@ class Filter:
for href_element in soup.findAll('a'): for href_element in soup.findAll('a'):
href_element['style'] = 'color: white' if href_element['href'].startswith('/search') else '' href_element['style'] = 'color: white' if href_element['href'].startswith('/search') else ''
def update_links(self, soup): def update_link(self, link):
# Replace hrefs with only the intended destination (no "utm" type tags) # Replace href with only the intended destination (no "utm" type tags)
for a in soup.find_all('a', href=True): href = link['href'].replace('https://www.google.com', '')
href = a['href'].replace('https://www.google.com', '') if '/advanced_search' in href:
if '/advanced_search' in href: link.decompose()
a.decompose() return
continue elif self.new_tab:
elif self.new_tab: link['target'] = '_blank'
a['target'] = '_blank'
result_link = urlparse.urlparse(href) result_link = urlparse.urlparse(href)
query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''
if query_link.startswith('/'): if query_link.startswith('/'):
a['href'] = 'https://google.com' + query_link link['href'] = 'https://google.com' + query_link
elif '/search?q=' in href: elif '/search?q=' in href:
enc_result = Fernet(self.secret_key).encrypt(query_link.encode()) new_search = '/search?q=' + self.encrypt_path(query_link)
new_search = '/search?q=' + enc_result.decode()
query_params = parse_qs(urlparse.urlparse(href).query) query_params = parse_qs(urlparse.urlparse(href).query)
for param in VALID_PARAMS: for param in VALID_PARAMS:
param_val = query_params[param][0] if param in query_params else '' param_val = query_params[param][0] if param in query_params else ''
new_search += '&' + param + '=' + param_val new_search += '&' + param + '=' + param_val
a['href'] = new_search link['href'] = new_search
elif 'url?q=' in href: elif 'url?q=' in href:
# Strip unneeded arguments # Strip unneeded arguments
a['href'] = filter_link_args(query_link) link['href'] = filter_link_args(query_link)
# Add no-js option # Add no-js option
if self.nojs: if self.nojs:
gen_nojs(soup, a['href'], a) gen_nojs(link)
else: else:
a['href'] = href link['href'] = href
def gen_nojs(soup, link, sibling): def gen_nojs(sibling):
nojs_link = soup.new_tag('a') nojs_link = BeautifulSoup().new_tag('a')
nojs_link['href'] = '/window?location=' + link nojs_link['href'] = '/window?location=' + sibling['href']
nojs_link['style'] = 'display:block;width:100%;' nojs_link['style'] = 'display:block;width:100%;'
nojs_link.string = 'NoJS Link: ' + nojs_link['href'] nojs_link.string = 'NoJS Link: ' + nojs_link['href']
sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser')) sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser'))

View File

@ -1,7 +1,7 @@
from io import BytesIO
from lxml import etree from lxml import etree
import pycurl
import random import random
import requests
from requests import Response
import urllib.parse as urlparse import urllib.parse as urlparse
# Core Google search URLs # Core Google search URLs
@ -12,27 +12,38 @@ MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'
# Valid query params # Valid query params
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near'] VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source']
def gen_user_agent(normal_ua, is_mobile): def gen_user_agent(is_mobile):
mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla' mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla'
firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox' firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox'
linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux' linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux'
if is_mobile: if is_mobile:
return MOBILE_UA.format(mozilla, firefox) return MOBILE_UA.format(mozilla, firefox)
else:
return DESKTOP_UA.format(mozilla, linux, firefox) return DESKTOP_UA.format(mozilla, linux, firefox)
def gen_query(query, args, config, near_city=None): def gen_query(query, args, config, near_city=None):
param_dict = {key: '' for key in VALID_PARAMS} param_dict = {key: '' for key in VALID_PARAMS}
# Use :past(hour/day/week/month/year) if available # Use :past(hour/day/week/month/year) if available
# example search "new restaurants :past month" # example search "new restaurants :past month"
if ':past' in query: sub_lang = ''
if ':past' in query and 'tbs' not in args:
time_range = str.strip(query.split(':past', 1)[-1]) time_range = str.strip(query.split(':past', 1)[-1])
param_dict['tbs'] = '&tbs=qdr:' + str.lower(time_range[0]) param_dict['tbs'] = '&tbs=' + ('qdr:' + str.lower(time_range[0]))
elif 'tbs' in args:
result_tbs = args.get('tbs')
param_dict['tbs'] = '&tbs=' + result_tbs
# Occasionally the 'tbs' param provided by google also contains a field for 'lr', but formatted
# strangely. This is a (admittedly not very elegant) solution for this.
# Ex/ &tbs=qdr:h,lr:lang_1pl --> the lr param needs to be extracted and have the "1" digit removed in this case
sub_lang = [_ for _ in result_tbs.split(',') if 'lr:' in _]
sub_lang = sub_lang[0][sub_lang[0].find('lr:') + 3:len(sub_lang[0])] if len(sub_lang) > 0 else ''
# Ensure search query is parsable # Ensure search query is parsable
query = urlparse.quote(query) query = urlparse.quote(query)
@ -49,13 +60,20 @@ def gen_query(query, args, config, near_city=None):
if near_city: if near_city:
param_dict['near'] = '&near=' + urlparse.quote(near_city) param_dict['near'] = '&near=' + urlparse.quote(near_city)
# Set language for results (lr) and interface (hl) # Set language for results (lr) if source isn't set, otherwise use the result
param_dict['lr'] = '&lr=' + config.lang + '&hl=' + config.lang.replace('lang_', '') # language param provided by google (but with the strange digit(s) removed)
if 'source' in args:
param_dict['source'] = '&source=' + args.get('source')
param_dict['lr'] = ('&lr=' + ''.join([_ for _ in sub_lang if not _.isdigit()])) if sub_lang else ''
else:
param_dict['lr'] = '&lr=' + config.lang
param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else '' param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else ''
param_dict['hl'] = '&hl=' + config.lang.replace('lang_', '')
param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off') param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off')
for val in param_dict.values(): for val in param_dict.values():
if not val or val is None: if not val:
continue continue
query += val query += val
@ -66,20 +84,14 @@ class Request:
def __init__(self, normal_ua, language='lang_en'): def __init__(self, normal_ua, language='lang_en'):
self.language = language self.language = language
self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua
self.modified_user_agent = gen_user_agent(normal_ua, self.mobile) self.modified_user_agent = gen_user_agent(self.mobile)
def __getitem__(self, name): def __getitem__(self, name):
return getattr(self, name) return getattr(self, name)
def get_decode_value(self):
if 'lang_zh' in self.language:
return 'gb2312'
else:
return 'unicode-escape'
def autocomplete(self, query): def autocomplete(self, query):
ac_query = dict(hl=self.language, q=query) ac_query = dict(hl=self.language, q=query)
response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)) response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)).text
if response: if response:
dom = etree.fromstring(response) dom = etree.fromstring(response)
@ -87,20 +99,9 @@ class Request:
return [] return []
def send(self, base_url=SEARCH_URL, query='', return_bytes=False): def send(self, base_url=SEARCH_URL, query='') -> Response:
response_header = [] headers = {
'User-Agent': self.modified_user_agent
}
b_obj = BytesIO() return requests.get(base_url + query, headers=headers)
crl = pycurl.Curl()
crl.setopt(crl.URL, base_url + query)
crl.setopt(crl.USERAGENT, self.modified_user_agent)
crl.setopt(crl.WRITEDATA, b_obj)
crl.setopt(crl.HEADERFUNCTION, response_header.append)
crl.setopt(pycurl.FOLLOWLOCATION, 1)
crl.perform()
crl.close()
if return_bytes:
return b_obj.getvalue()
else:
return b_obj.getvalue().decode(self.get_decode_value(), 'ignore')

View File

@ -1,19 +1,22 @@
from app import app
from app.filter import Filter, get_first_link
from app.models.config import Config
from app.request import Request, gen_query
import argparse import argparse
import base64 import base64
from bs4 import BeautifulSoup
from cryptography.fernet import Fernet, InvalidToken
from flask import g, jsonify, make_response, request, redirect, render_template, send_file
from functools import wraps
import io import io
import json import json
import os import os
from pycurl import error as pycurl_error import pickle
import urllib.parse as urlparse import urllib.parse as urlparse
import uuid
from functools import wraps
import waitress import waitress
from flask import jsonify, make_response, request, redirect, render_template, send_file, session
from requests import exceptions
from app import app
from app.models.config import Config
from app.request import Request
from app.utils.misc import valid_user_session
from app.utils.routing_utils import *
def auth_required(f): def auth_required(f):
@ -34,17 +37,30 @@ def auth_required(f):
@app.before_request @app.before_request
def before_request_func(): def before_request_func():
# Always redirect to https if HTTPS_ONLY is set (otherwise default to false) g.request_params = request.args if request.method == 'GET' else request.form
g.cookies_disabled = False
# Generate session values for user if unavailable
if not valid_user_session(session):
session['config'] = json.load(open(app.config['DEFAULT_CONFIG'])) \
if os.path.exists(app.config['DEFAULT_CONFIG']) else {'url': request.url_root}
session['uuid'] = str(uuid.uuid4())
session['fernet_keys'] = generate_user_keys(True)
# Flag cookies as possibly disabled in order to prevent against
# unnecessary session directory expansion
g.cookies_disabled = True
if session['uuid'] not in app.user_elements:
app.user_elements.update({session['uuid']: 0})
# Always redirect to https if HTTPS_ONLY is set (otherwise default to False)
https_only = os.getenv('HTTPS_ONLY', False) https_only = os.getenv('HTTPS_ONLY', False)
config_path = app.config['CONFIG_PATH']
if https_only and request.url.startswith('http://'): if https_only and request.url.startswith('http://'):
https_url = request.url.replace('http://', 'https://', 1) return redirect(request.url.replace('http://', 'https://', 1), code=308)
code = 308
return redirect(https_url, code=code)
json_config = json.load(open(config_path)) if os.path.exists(config_path) else {'url': request.url_root} g.user_config = Config(**session['config'])
g.user_config = Config(**json_config)
if not g.user_config.url: if not g.user_config.url:
g.user_config.url = request.url_root.replace('http://', 'https://') if https_only else request.url_root g.user_config.url = request.url_root.replace('http://', 'https://') if https_only else request.url_root
@ -53,6 +69,27 @@ def before_request_func():
g.app_location = g.user_config.url g.app_location = g.user_config.url
@app.after_request
def after_request_func(response):
if app.user_elements[session['uuid']] <= 0 and '/element' in request.url:
# Regenerate element key if all elements have been served to user
session['fernet_keys']['element_key'] = '' if not g.cookies_disabled else app.default_key_set['element_key']
app.user_elements[session['uuid']] = 0
# Check if address consistently has cookies blocked, in which case start removing session
# files after creation.
# Note: This is primarily done to prevent overpopulation of session directories, since browsers that
# block cookies will still trigger Flask's session creation routine with every request.
if g.cookies_disabled and request.remote_addr not in app.no_cookie_ips:
app.no_cookie_ips.append(request.remote_addr)
elif g.cookies_disabled and request.remote_addr in app.no_cookie_ips:
session_list = list(session.keys())
for key in session_list:
session.pop(key)
return response
@app.errorhandler(404) @app.errorhandler(404)
def unknown_page(e): def unknown_page(e):
return redirect(g.app_location) return redirect(g.app_location)
@ -61,15 +98,14 @@ def unknown_page(e):
@app.route('/', methods=['GET']) @app.route('/', methods=['GET'])
@auth_required @auth_required
def index(): def index():
# Reset keys
session['fernet_keys'] = generate_user_keys(g.cookies_disabled)
return render_template('index.html', return render_template('index.html',
dark_mode=g.user_config.dark,
ua=g.user_request.modified_user_agent,
languages=Config.LANGUAGES, languages=Config.LANGUAGES,
countries=Config.COUNTRIES, countries=Config.COUNTRIES,
current_lang=g.user_config.lang, config=g.user_config,
current_ctry=g.user_config.ctry, version_number=app.config['VERSION_NUMBER'])
version_number=app.config['VERSION_NUMBER'],
request_type='get' if g.user_config.get_only else 'post')
@app.route('/opensearch.xml', methods=['GET']) @app.route('/opensearch.xml', methods=['GET'])
@ -89,8 +125,7 @@ def opensearch():
@app.route('/autocomplete', methods=['GET', 'POST']) @app.route('/autocomplete', methods=['GET', 'POST'])
def autocomplete(): def autocomplete():
request_params = request.args if request.method == 'GET' else request.form q = g.request_params.get('q')
q = request_params.get('q')
if not q and not request.data: if not q and not request.data:
return jsonify({'?': []}) return jsonify({'?': []})
@ -103,68 +138,65 @@ def autocomplete():
@app.route('/search', methods=['GET', 'POST']) @app.route('/search', methods=['GET', 'POST'])
@auth_required @auth_required
def search(): def search():
request_params = request.args if request.method == 'GET' else request.form # Reset element counter
q = request_params.get('q') app.user_elements[session['uuid']] = 0
if q is None or len(q) == 0: search_util = RoutingUtils(request, g.user_config, session, cookies_disabled=g.cookies_disabled)
query = search_util.new_search_query()
# Redirect to home if invalid/blank search
if not query:
return redirect('/') return redirect('/')
else:
# Attempt to decrypt if this is an internal link
try:
q = Fernet(app.secret_key).decrypt(q.encode()).decode()
except InvalidToken:
pass
feeling_lucky = q.startswith('! ') # Generate response and number of external elements from the page
response, elements = search_util.generate_response()
if search_util.feeling_lucky:
return redirect(response, code=303)
if feeling_lucky: # Well do you, punk? # Keep count of external elements to fetch before element key can be regenerated
q = q[2:] app.user_elements[session['uuid']] = elements
user_agent = request.headers.get('User-Agent')
mobile = 'Android' in user_agent or 'iPhone' in user_agent
content_filter = Filter(mobile, g.user_config, secret_key=app.secret_key)
full_query = gen_query(q, request_params, g.user_config, content_filter.near)
get_body = g.user_request.send(query=full_query)
dirty_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser')
if feeling_lucky:
return redirect(get_first_link(dirty_soup), 303) # Using 303 so the browser performs a GET request for the URL
else:
formatted_results = content_filter.clean(dirty_soup)
# Set search type to be used in the header template to allow for repeated searches
# in the same category
search_type = request_params.get('tbm') if 'tbm' in request_params else ''
return render_template( return render_template(
'display.html', 'display.html',
query=urlparse.unquote(q), query=urlparse.unquote(query),
search_type=search_type, search_type=search_util.search_type,
dark_mode=g.user_config.dark, dark_mode=g.user_config.dark,
response=formatted_results, response=response,
version_number=app.config['VERSION_NUMBER'],
search_header=render_template( search_header=render_template(
'header.html', 'header.html',
dark_mode=g.user_config.dark, dark_mode=g.user_config.dark,
q=urlparse.unquote(q), query=urlparse.unquote(query),
search_type=search_type, search_type=search_util.search_type,
mobile=g.user_request.mobile) if 'isch' not in search_type else '') mobile=g.user_request.mobile) if 'isch' not in search_util.search_type else '')
@app.route('/config', methods=['GET', 'POST']) @app.route('/config', methods=['GET', 'POST', 'PUT'])
@auth_required @auth_required
def config(): def config():
if request.method == 'GET': if request.method == 'GET':
return json.dumps(g.user_config.__dict__) return json.dumps(g.user_config.__dict__)
elif request.method == 'PUT':
if 'name' in request.args:
config_pkl = os.path.join(app.config['CONFIG_PATH'], request.args.get('name'))
session['config'] = pickle.load(open(config_pkl, 'rb')) if os.path.exists(config_pkl) else session['config']
return json.dumps(session['config'])
else:
return json.dumps({})
else: else:
config_data = request.form.to_dict() config_data = request.form.to_dict()
if 'url' not in config_data or not config_data['url']: if 'url' not in config_data or not config_data['url']:
config_data['url'] = g.user_config.url config_data['url'] = g.user_config.url
with open(app.config['CONFIG_PATH'], 'w') as config_file: # Save config by name to allow a user to easily load later
config_file.write(json.dumps(config_data, indent=4)) if 'name' in request.args:
config_file.close() pickle.dump(config_data, open(os.path.join(app.config['CONFIG_PATH'], request.args.get('name')), 'wb'))
# Overwrite default config if user has cookies disabled
if g.cookies_disabled:
open(app.config['DEFAULT_CONFIG'], 'w').write(json.dumps(config_data, indent=4))
session['config'] = config_data
return redirect(config_data['url']) return redirect(config_data['url'])
@ -187,25 +219,22 @@ def imgres():
return redirect(request.args.get('imgurl')) return redirect(request.args.get('imgurl'))
@app.route('/tmp') @app.route('/element')
@auth_required @auth_required
def tmp(): def element():
cipher_suite = Fernet(app.secret_key) cipher_suite = Fernet(session['fernet_keys']['element_key'])
img_url = cipher_suite.decrypt(request.args.get('image_url').encode()).decode() src_url = cipher_suite.decrypt(request.args.get('url').encode()).decode()
src_type = request.args.get('type')
try: try:
file_data = g.user_request.send(base_url=img_url, return_bytes=True) file_data = g.user_request.send(base_url=src_url).content
app.user_elements[session['uuid']] -= 1
tmp_mem = io.BytesIO() tmp_mem = io.BytesIO()
tmp_mem.write(file_data) tmp_mem.write(file_data)
tmp_mem.seek(0) tmp_mem.seek(0)
return send_file( return send_file(tmp_mem, mimetype=src_type)
tmp_mem, except exceptions.RequestException:
as_attachment=True,
attachment_filename='tmp.png',
mimetype='image/png'
)
except pycurl_error:
pass pass
empty_gif = base64.b64decode('R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==') empty_gif = base64.b64decode('R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==')
@ -215,7 +244,7 @@ def tmp():
@app.route('/window') @app.route('/window')
@auth_required @auth_required
def window(): def window():
get_body = g.user_request.send(base_url=request.args.get('location')) get_body = g.user_request.send(base_url=request.args.get('location')).text
get_body = get_body.replace('src="/', 'src="' + request.args.get('location') + '"') get_body = get_body.replace('src="/', 'src="' + request.args.get('location') + '"')
get_body = get_body.replace('href="/', 'href="' + request.args.get('location') + '"') get_body = get_body.replace('href="/', 'href="' + request.args.get('location') + '"')

View File

@ -4,7 +4,7 @@ const handleUserInput = searchBar => {
xhrRequest.setRequestHeader("Content-type", "application/x-www-form-urlencoded"); xhrRequest.setRequestHeader("Content-type", "application/x-www-form-urlencoded");
xhrRequest.onload = function() { xhrRequest.onload = function() {
if (xhrRequest.readyState === 4 && xhrRequest.status !== 200) { if (xhrRequest.readyState === 4 && xhrRequest.status !== 200) {
alert("Error fetching autocomplete results"); // Do nothing if failed to fetch autocomplete results
return; return;
} }

View File

@ -71,6 +71,41 @@ const setupConfigLayout = () => {
fillConfigValues(); fillConfigValues();
}; };
const loadConfig = event => {
event.preventDefault();
let config = prompt("Enter name of config:");
if (!config) {
alert("Must specify a name for the config to load");
return;
}
let xhrPUT = new XMLHttpRequest();
xhrPUT.open("PUT", "/config?name=" + config + ".conf");
xhrPUT.onload = function() {
if (xhrPUT.readyState === 4 && xhrPUT.status !== 200) {
alert("Error loading Whoogle config");
return;
}
location.reload(true);
};
xhrPUT.send();
};
const saveConfig = event => {
event.preventDefault();
let config = prompt("Enter name for this config:");
if (!config) {
alert("Must specify a name for the config to save");
return;
}
let configForm = document.getElementById("config-form");
configForm.action = '/config?name=' + config + ".conf";
configForm.submit();
};
document.addEventListener("DOMContentLoaded", function() { document.addEventListener("DOMContentLoaded", function() {
setTimeout(function() { setTimeout(function() {
document.getElementById("main").style.display = "block"; document.getElementById("main").style.display = "block";

View File

@ -11,7 +11,13 @@
<title>{{ query }} - Whoogle Search</title> <title>{{ query }} - Whoogle Search</title>
</head> </head>
<body> <body>
{{ search_header|safe }} {{ search_header|safe }}
{{ response|safe }} {{ response|safe }}
</body> </body>
<footer>
<p style="color: {{ '#fff' if dark_mode else '#000' }};">
Whoogle Search v{{ version_number }} ||
<a style="color: #685e79" href="https://github.com/benbusby/whoogle-search">View on GitHub</a>
</p>
</footer>
</html> </html>

View File

@ -15,7 +15,7 @@
style="background-color: {{ '#000' if dark_mode else '#fff' }}; style="background-color: {{ '#000' if dark_mode else '#fff' }};
color: {{ '#685e79' if dark_mode else '#000' }}; color: {{ '#685e79' if dark_mode else '#000' }};
border: {{ '1px solid #685e79' if dark_mode else '' }}" border: {{ '1px solid #685e79' if dark_mode else '' }}"
spellcheck="false" type="text" value="{{ q }}"> spellcheck="false" type="text" value="{{ query }}">
<input name="tbm" value="{{ search_type }}" style="display: none"> <input name="tbm" value="{{ search_type }}" style="display: none">
<div class="sc"></div> <div class="sc"></div>
</div> </div>
@ -37,7 +37,7 @@
<div class="autocomplete" style="width: 100%; flex: 1"> <div class="autocomplete" style="width: 100%; flex: 1">
<div style="width: 100%; display: flex"> <div style="width: 100%; display: flex">
<input id="search-bar" autocapitalize="none" autocomplete="off" class="noHIxc" name="q" <input id="search-bar" autocapitalize="none" autocomplete="off" class="noHIxc" name="q"
spellcheck="false" type="text" value="{{ q }}" spellcheck="false" type="text" value="{{ query }}"
style="background-color: {{ '#000' if dark_mode else '#fff' }}; style="background-color: {{ '#000' if dark_mode else '#fff' }};
color: {{ '#685e79' if dark_mode else '#000' }}; color: {{ '#685e79' if dark_mode else '#000' }};
border: {{ '1px solid #685e79' if dark_mode else '' }}"> border: {{ '1px solid #685e79' if dark_mode else '' }}">

View File

@ -21,14 +21,14 @@
<script type="text/javascript" src="/static/js/controller.js"></script> <script type="text/javascript" src="/static/js/controller.js"></script>
<link rel="search" href="/opensearch.xml" type="application/opensearchdescription+xml" title="Whoogle Search"> <link rel="search" href="/opensearch.xml" type="application/opensearchdescription+xml" title="Whoogle Search">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="/static/css/{{ 'search-dark' if dark_mode else 'search' }}.css"> <link rel="stylesheet" href="/static/css/{{ 'search-dark' if config.dark else 'search' }}.css">
<link rel="stylesheet" href="/static/css/main.css"> <link rel="stylesheet" href="/static/css/main.css">
<title>Whoogle Search</title> <title>Whoogle Search</title>
</head> </head>
<body id="main" style="display: none; background-color: {{ '#000' if dark_mode else '#fff' }}"> <body id="main" style="display: none; background-color: {{ '#000' if config.dark else '#fff' }}">
<div class="search-container"> <div class="search-container">
<img class="logo" src="/static/img/logo.png"> <img class="logo" src="/static/img/logo.png">
<form id="search-form" action="/search" method="{{ request_type }}"> <form id="search-form" action="/search" method="{{ 'get' if config.get_only else 'post' }}">
<div class="search-fields"> <div class="search-fields">
<div class="autocomplete"> <div class="autocomplete">
<input type="text" name="q" id="search-bar" autofocus="autofocus"> <input type="text" name="q" id="search-bar" autofocus="autofocus">
@ -40,17 +40,13 @@
<button id="config-collapsible" class="collapsible">Configuration</button> <button id="config-collapsible" class="collapsible">Configuration</button>
<div class="content"> <div class="content">
<div class="config-fields"> <div class="config-fields">
<form action="/config" method="post"> <form id="config-form" action="/config" method="post">
<div class="config-div">
<!-- TODO: Add option to regenerate user agent? -->
<span class="ua-span">User Agent: {{ ua }}</span>
</div>
<div class="config-div"> <div class="config-div">
<label for="config-ctry">Country: </label> <label for="config-ctry">Country: </label>
<select name="ctry" id="config-ctry"> <select name="ctry" id="config-ctry">
{% for ctry in countries %} {% for ctry in countries %}
<option value="{{ ctry.value }}" <option value="{{ ctry.value }}"
{% if ctry.value in current_ctry %} {% if ctry.value in config.ctry %}
selected selected
{% endif %}> {% endif %}>
{{ ctry.name }} {{ ctry.name }}
@ -63,7 +59,7 @@
<select name="lang" id="config-lang"> <select name="lang" id="config-lang">
{% for lang in languages %} {% for lang in languages %}
<option value="{{ lang.value }}" <option value="{{ lang.value }}"
{% if lang.value in current_lang %} {% if lang.value in config.lang %}
selected selected
{% endif %}> {% endif %}>
{{ lang.name }} {{ lang.name }}
@ -100,7 +96,9 @@
<input type="text" name="url" id="config-url" value=""> <input type="text" name="url" id="config-url" value="">
</div> </div>
<div class="config-div"> <div class="config-div">
<input type="submit" id="config-submit" value="Save"> <input type="submit" id="config-load" onclick="loadConfig(event)" value="Load">&nbsp;
<input type="submit" id="config-submit" value="Apply">&nbsp;
<input type="submit" id="config-submit" onclick="saveConfig(event)" value="Save As...">
</div> </div>
</form> </form>
</div> </div>

0
app/utils/__init__.py Normal file
View File

29
app/utils/misc.py Normal file
View File

@ -0,0 +1,29 @@
from cryptography.fernet import Fernet
from flask import current_app as app
REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys']
BLACKLIST = [
'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고',
'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam',
'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés'
]
def generate_user_keys(cookies_disabled=False) -> dict:
if cookies_disabled:
return app.default_key_set
# Generate/regenerate unique key per user
return {
'element_key': Fernet.generate_key(),
'text_key': Fernet.generate_key()
}
def valid_user_session(session):
# Generate secret key for user if unavailable
for value in REQUIRED_SESSION_VALUES:
if value not in session:
return False
return True

View File

@ -0,0 +1,72 @@
from app.filter import Filter, get_first_link
from app.utils.misc import generate_user_keys
from app.request import gen_query
from bs4 import BeautifulSoup
from cryptography.fernet import Fernet, InvalidToken
from flask import g
from typing import Any, Tuple
class RoutingUtils:
def __init__(self, request, config, session, cookies_disabled=False):
self.request_params = request.args if request.method == 'GET' else request.form
self.user_agent = request.headers.get('User-Agent')
self.feeling_lucky = False
self.config = config
self.session = session
self.query = ''
self.cookies_disabled = cookies_disabled
self.search_type = self.request_params.get('tbm') if 'tbm' in self.request_params else ''
def __getitem__(self, name):
return getattr(self, name)
def __setitem__(self, name, value):
return setattr(self, name, value)
def __delitem__(self, name):
return delattr(self, name)
def __contains__(self, name):
return hasattr(self, name)
def new_search_query(self) -> str:
# Generate a new element key each time a new search is performed
self.session['fernet_keys']['element_key'] = generate_user_keys(
cookies_disabled=self.cookies_disabled)['element_key']
q = self.request_params.get('q')
if q is None or len(q) == 0:
return ''
else:
# Attempt to decrypt if this is an internal link
try:
q = Fernet(self.session['fernet_keys']['text_key']).decrypt(q.encode()).decode()
except InvalidToken:
pass
# Reset text key
self.session['fernet_keys']['text_key'] = generate_user_keys(
cookies_disabled=self.cookies_disabled)['text_key']
# Format depending on whether or not the query is a "feeling lucky" query
self.feeling_lucky = q.startswith('! ')
self.query = q[2:] if self.feeling_lucky else q
return self.query
def generate_response(self) -> Tuple[Any, int]:
mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent
content_filter = Filter(self.session['fernet_keys'], mobile=mobile, config=self.config)
full_query = gen_query(self.query, self.request_params, self.config, content_filter.near)
get_body = g.user_request.send(query=full_query).text
# Produce cleanable html soup from response
html_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser')
if self.feeling_lucky:
return get_first_link(html_soup), 1
else:
formatted_results = content_filter.clean(html_soup)
return formatted_results, content_filter.elements

View File

@ -4,15 +4,16 @@ cffi==1.13.2
Click==7.0 Click==7.0
cryptography==2.8 cryptography==2.8
Flask==1.1.1 Flask==1.1.1
Flask-Session==0.3.2
itsdangerous==1.1.0 itsdangerous==1.1.0
Jinja2==2.10.3 Jinja2==2.10.3
lxml==4.5.1 lxml==4.5.1
MarkupSafe==1.1.1 MarkupSafe==1.1.1
pycparser==2.19 pycparser==2.19
pycurl==7.43.0.4
pyOpenSSL==19.1.0 pyOpenSSL==19.1.0
pytest==5.4.1 pytest==5.4.1
python-dateutil==2.8.1 python-dateutil==2.8.1
requests==2.23.0
six==1.14.0 six==1.14.0
soupsieve==1.9.5 soupsieve==1.9.5
Werkzeug==0.16.0 Werkzeug==0.16.0

View File

@ -8,7 +8,7 @@ setuptools.setup(
author='Ben Busby', author='Ben Busby',
author_email='benbusby@protonmail.com', author_email='benbusby@protonmail.com',
name='whoogle-search', name='whoogle-search',
version='0.1.4', version='0.2.0',
include_package_data=True, include_package_data=True,
install_requires=requirements, install_requires=requirements,
description='Self-hosted, ad-free, privacy-respecting Google metasearch engine', description='Self-hosted, ad-free, privacy-respecting Google metasearch engine',

View File

@ -1,8 +1,13 @@
from app import app from app import app
from app.utils.misc import generate_user_keys
import pytest import pytest
@pytest.fixture @pytest.fixture
def client(): def client():
client = app.test_client() with app.test_client() as client:
yield client with client.session_transaction() as session:
session['uuid'] = 'test'
session['fernet_keys'] = generate_user_keys()
session['config'] = {}
yield client

33
test/test_misc.py Normal file
View File

@ -0,0 +1,33 @@
from app.utils.misc import generate_user_keys, valid_user_session
def test_generate_user_keys():
keys = generate_user_keys()
assert 'text_key' in keys
assert 'element_key' in keys
assert keys['text_key'] not in keys['element_key']
def test_valid_session(client):
assert not valid_user_session({'fernet_keys': '', 'config': {}})
with client.session_transaction() as session:
assert valid_user_session(session)
def test_request_key_generation(client):
rv = client.get('/')
cookie = rv.headers['Set-Cookie']
rv = client.get('/search?q=test+1', headers={'Cookie': cookie})
assert rv._status_code == 200
with client.session_transaction() as session:
assert valid_user_session(session)
text_key = session['fernet_keys']['text_key']
rv = client.get('/search?q=test+2', headers={'Cookie': cookie})
assert rv._status_code == 200
with client.session_transaction() as session:
assert valid_user_session(session)
assert text_key not in session['fernet_keys']['text_key']

View File

@ -1,13 +1,13 @@
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from cryptography.fernet import Fernet
from app.filter import Filter from app.filter import Filter
from app.utils.misc import generate_user_keys
from datetime import datetime from datetime import datetime
from dateutil.parser import * from dateutil.parser import *
def get_search_results(data): def get_search_results(data):
secret_key = Fernet.generate_key() secret_key = generate_user_keys()
soup = Filter(secret_key=secret_key).clean(BeautifulSoup(data, 'html.parser')) soup = Filter(user_keys=secret_key).clean(BeautifulSoup(data, 'html.parser'))
main_divs = soup.find('div', {'id': 'main'}) main_divs = soup.find('div', {'id': 'main'})
assert len(main_divs) > 1 assert len(main_divs) > 1

View File

@ -1,10 +1,13 @@
from app.models.config import Config
import json import json
import random import random
demo_config = { demo_config = {
'near': random.choice(['Seattle', 'New York', 'San Francisco']), 'near': random.choice(['Seattle', 'New York', 'San Francisco']),
'dark_mode': str(random.getrandbits(1)), 'dark_mode': str(random.getrandbits(1)),
'nojs': str(random.getrandbits(1)) 'nojs': str(random.getrandbits(1)),
'lang': random.choice(Config.LANGUAGES)['value'],
'ctry': random.choice(Config.COUNTRIES)['value']
} }
@ -17,6 +20,7 @@ def test_search(client):
rv = client.get('/search?q=test') rv = client.get('/search?q=test')
assert rv._status_code == 200 assert rv._status_code == 200
def test_feeling_lucky(client): def test_feeling_lucky(client):
rv = client.get('/search?q=!%20test') rv = client.get('/search?q=!%20test')
assert rv._status_code == 303 assert rv._status_code == 303