Session refactoring and improved filter (#86)

* Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config -- users with blocked cookies fall back to the "default" profile (same usage as before)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to save config button label (now "Save As...")

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style

* Moved custom conf files to their own directory

* Refactored whoogle session mgmt

Now allows a fallback "default" session to be used if a user's browser
is blocking cookies

* Reworked pytest client fixture to support new session mgmt

* Added better multilingual support, updated filter

Results page now includes method for switching to "All Languages" from
whichever language is specified as the primary in the config (see #74).

Also removes the non-Whoogle links from the page footer, leaving only
the page navigation controls

Added support for the date range filter on the results page, though I'd
still recommend using the ":past <unit>" query instead.

* Removed no-cache enforcement, minor styling/formatting improvements

* Improving ad filtering for non-English languages

* Added footer to results page
main
Ben Busby 2020-06-11 13:38:51 -06:00 committed by GitHub
parent d859e46a6c
commit b2133edaa3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 476 additions and 215 deletions

4
.gitignore vendored
View File

@ -3,8 +3,12 @@ venv/
__pycache__/
*.pyc
*.pem
*.conf
config.json
test/static
flask_session/
app/static/config
app/static/custom_config
# pip stuff
build/

View File

@ -1,12 +1,27 @@
from cryptography.fernet import Fernet
from app.utils.misc import generate_user_keys
from flask import Flask
from flask_session import Session
import os
app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static')
app.secret_key = Fernet.generate_key()
app.config['VERSION_NUMBER'] = '0.1.4'
app.user_elements = {}
app.default_key_set = generate_user_keys()
app.no_cookie_ips = []
app.config['SECRET_KEY'] = os.urandom(32)
app.config['SESSION_TYPE'] = 'filesystem'
app.config['VERSION_NUMBER'] = '0.2.0'
app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__)))
app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static'))
app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', app.config['STATIC_FOLDER']) + '/config.json'
app.config['CONFIG_PATH'] = os.getenv('CONFIG_VOLUME', os.path.join(app.config['STATIC_FOLDER'], 'config'))
app.config['DEFAULT_CONFIG'] = os.path.join(app.config['CONFIG_PATH'], 'config.json')
app.config['SESSION_FILE_DIR'] = os.path.join(app.config['CONFIG_PATH'], 'session')
if not os.path.exists(app.config['CONFIG_PATH']):
os.makedirs(app.config['CONFIG_PATH'])
if not os.path.exists(app.config['SESSION_FILE_DIR']):
os.makedirs(app.config['SESSION_FILE_DIR'])
Session(app)
from app import routes

View File

@ -1,5 +1,7 @@
from app.request import VALID_PARAMS
from app.utils.misc import BLACKLIST
from bs4 import BeautifulSoup
from bs4.element import ResultSet
from cryptography.fernet import Fernet
import re
import urllib.parse as urlparse
@ -17,14 +19,9 @@ 
def get_first_link(soup):
# Replace hrefs with only the intended destination (no "utm" type tags)
for a in soup.find_all('a', href=True):
href = a['href'].replace('https://www.google.com', '')
result_link = urlparse.urlparse(href)
query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''
# Return the first search result URL
if 'url?q=' in href:
return filter_link_args(href)
if 'url?q=' in a['href']:
return filter_link_args(a['href'])
def filter_link_args(query_link):
@ -51,8 +48,12 @@ def filter_link_args(query_link):
return query_link
def has_ad_content(element: str):
return element.upper() in (value.upper() for value in BLACKLIST) or '' in element
class Filter:
def __init__(self, mobile=False, config=None, secret_key=''):
def __init__(self, user_keys: dict, mobile=False, config=None):
if config is None:
config = {}
@ -61,11 +62,17 @@ class Filter:
self.nojs = config['nojs'] if 'nojs' in config else False
self.new_tab = config['new_tab'] if 'new_tab' in config else False
self.mobile = mobile
self.secret_key = secret_key
self.user_keys = user_keys
self.main_divs = ResultSet('')
self._elements = 0
def __getitem__(self, name):
return getattr(self, name)
@property
def elements(self):
return self._elements
def reskin(self, page):
# Aesthetic only re-skinning
page = page.replace('>G<', '>Wh<')
@ -76,11 +83,31 @@ class Filter:
return page
def encrypt_path(self, msg, is_element=False):
# Encrypts path to avoid plaintext results in logs
if is_element:
# Element paths are tracked differently in order for the element key to be regenerated
# once all elements have been loaded
enc_path = Fernet(self.user_keys['element_key']).encrypt(msg.encode()).decode()
self._elements += 1
return enc_path
return Fernet(self.user_keys['text_key']).encrypt(msg.encode()).decode()
def clean(self, soup):
self.remove_ads(soup)
self.update_image_paths(soup)
self.main_divs = soup.find('div', {'id': 'main'})
self.remove_ads()
self.fix_question_section()
self.update_styling(soup)
self.update_links(soup)
for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
self.update_element_src(img, 'image/png')
for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
self.update_element_src(audio, 'audio/mpeg')
for link in soup.find_all('a', href=True):
self.update_link(link)
input_form = soup.find('form')
if input_form is not None:
@ -90,14 +117,11 @@ class Filter:
for script in soup('script'):
script.decompose()
# Remove google's language/time config
st_card = soup.find('div', id='st-card')
if st_card:
st_card.decompose()
footer = soup.find('div', id='sfooter')
# Update default footer and header
footer = soup.find('footer')
if footer:
footer.decompose()
# Remove divs that have multiple links beyond just page navigation
[_.decompose() for _ in footer.find_all('div', recursive=False) if len(_.find_all('a', href=True)) > 2]
header = soup.find('header')
if header:
@ -105,35 +129,42 @@ class Filter:
return soup
def remove_ads(self, soup):
main_divs = soup.find('div', {'id': 'main'})
if main_divs is None:
def remove_ads(self):
if not self.main_divs:
return
result_divs = main_divs.find_all('div', recursive=False)
for div in [_ for _ in result_divs]:
has_ad = len([_ for _ in div.find_all('span', recursive=True) if 'ad' == _.text.lower()])
for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text)])
_ = div.decompose() if has_ad else None
def update_image_paths(self, soup):
for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
img_src = img['src']
if img_src.startswith('//'):
img_src = 'https:' + img_src
elif img_src.startswith(LOGO_URL):
# Re-brand with Whoogle logo
img['src'] = '/static/img/logo.png'
img['style'] = 'height:40px;width:162px'
continue
elif img_src.startswith(GOOG_IMG):
img['src'] = BLANK_B64
continue
def fix_question_section(self):
if not self.main_divs:
return
enc_src = Fernet(self.secret_key).encrypt(img_src.encode())
img['src'] = '/tmp?image_url=' + enc_src.decode()
# TODO: Non-mobile image results link to website instead of image
# if not self.mobile:
# img.append(BeautifulSoup(FULL_RES_IMG.format(img_src), 'html.parser'))
question_divs = [_ for _ in self.main_divs.find_all('div', recursive=False) if len(_.find_all('h2')) > 0]
for question_div in question_divs:
questions = [_ for _ in question_div.find_all('div', recursive=True) if _.text.endswith('?')]
for question in questions:
question['style'] = 'padding: 10px; font-style: italic;'
def update_element_src(self, element, mime):
element_src = element['src']
if element_src.startswith('//'):
element_src = 'https:' + element_src
elif element_src.startswith(LOGO_URL):
# Re-brand with Whoogle logo
element['src'] = '/static/img/logo.png'
element['style'] = 'height:40px;width:162px'
return
elif element_src.startswith(GOOG_IMG):
element['src'] = BLANK_B64
return
element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \
'&type=' + urlparse.quote(mime)
# TODO: Non-mobile image results link to website instead of image
# if not self.mobile:
# img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser'))
def update_styling(self, soup):
# Remove unnecessary button(s)
@ -169,45 +200,43 @@ class Filter:
for href_element in soup.findAll('a'):
href_element['style'] = 'color: white' if href_element['href'].startswith('/search') else ''
def update_links(self, soup):
# Replace hrefs with only the intended destination (no "utm" type tags)
for a in soup.find_all('a', href=True):
href = a['href'].replace('https://www.google.com', '')
if '/advanced_search' in href:
a.decompose()
continue
elif self.new_tab:
a['target'] = '_blank'
def update_link(self, link):
# Replace href with only the intended destination (no "utm" type tags)
href = link['href'].replace('https://www.google.com', '')
if '/advanced_search' in href:
link.decompose()
return
elif self.new_tab:
link['target'] = '_blank'
result_link = urlparse.urlparse(href)
query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''
result_link = urlparse.urlparse(href)
query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''
if query_link.startswith('/'):
a['href'] = 'https://google.com' + query_link
elif '/search?q=' in href:
enc_result = Fernet(self.secret_key).encrypt(query_link.encode())
new_search = '/search?q=' + enc_result.decode()
if query_link.startswith('/'):
link['href'] = 'https://google.com' + query_link
elif '/search?q=' in href:
new_search = '/search?q=' + self.encrypt_path(query_link)
query_params = parse_qs(urlparse.urlparse(href).query)
for param in VALID_PARAMS:
param_val = query_params[param][0] if param in query_params else ''
new_search += '&' + param + '=' + param_val
a['href'] = new_search
elif 'url?q=' in href:
# Strip unneeded arguments
a['href'] = filter_link_args(query_link)
query_params = parse_qs(urlparse.urlparse(href).query)
for param in VALID_PARAMS:
param_val = query_params[param][0] if param in query_params else ''
new_search += '&' + param + '=' + param_val
link['href'] = new_search
elif 'url?q=' in href:
# Strip unneeded arguments
link['href'] = filter_link_args(query_link)
# Add no-js option
if self.nojs:
gen_nojs(soup, a['href'], a)
else:
a['href'] = href
# Add no-js option
if self.nojs:
gen_nojs(link)
else:
link['href'] = href
def gen_nojs(soup, link, sibling):
nojs_link = soup.new_tag('a')
nojs_link['href'] = '/window?location=' + link
def gen_nojs(sibling):
nojs_link = BeautifulSoup().new_tag('a')
nojs_link['href'] = '/window?location=' + sibling['href']
nojs_link['style'] = 'display:block;width:100%;'
nojs_link.string = 'NoJS Link: ' + nojs_link['href']
sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
sibling.append(nojs_link)
sibling.append(nojs_link)

View File

@ -1,7 +1,7 @@
from io import BytesIO
from lxml import etree
import pycurl
import random
import requests
from requests import Response
import urllib.parse as urlparse
# Core Google search URLs
@ -12,27 +12,38 @@ MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'
# Valid query params
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near']
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source']
def gen_user_agent(normal_ua, is_mobile):
def gen_user_agent(is_mobile):
mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla'
firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox'
linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux'
if is_mobile:
return MOBILE_UA.format(mozilla, firefox)
else:
return DESKTOP_UA.format(mozilla, linux, firefox)
return DESKTOP_UA.format(mozilla, linux, firefox)
def gen_query(query, args, config, near_city=None):
param_dict = {key: '' for key in VALID_PARAMS}
# Use :past(hour/day/week/month/year) if available
# example search "new restaurants :past month"
if ':past' in query:
sub_lang = ''
if ':past' in query and 'tbs' not in args:
time_range = str.strip(query.split(':past', 1)[-1])
param_dict['tbs'] = '&tbs=qdr:' + str.lower(time_range[0])
param_dict['tbs'] = '&tbs=' + ('qdr:' + str.lower(time_range[0]))
elif 'tbs' in args:
result_tbs = args.get('tbs')
param_dict['tbs'] = '&tbs=' + result_tbs
# Occasionally the 'tbs' param provided by google also contains a field for 'lr', but formatted
# strangely. This is a (admittedly not very elegant) solution for this.
# Ex/ &tbs=qdr:h,lr:lang_1pl --> the lr param needs to be extracted and have the "1" digit removed in this case
sub_lang = [_ for _ in result_tbs.split(',') if 'lr:' in _]
sub_lang = sub_lang[0][sub_lang[0].find('lr:') + 3:len(sub_lang[0])] if len(sub_lang) > 0 else ''
# Ensure search query is parsable
query = urlparse.quote(query)
@ -49,13 +60,20 @@ def gen_query(query, args, config, near_city=None):
if near_city:
param_dict['near'] = '&near=' + urlparse.quote(near_city)
# Set language for results (lr) and interface (hl)
param_dict['lr'] = '&lr=' + config.lang + '&hl=' + config.lang.replace('lang_', '')
# Set language for results (lr) if source isn't set, otherwise use the result
# language param provided by google (but with the strange digit(s) removed)
if 'source' in args:
param_dict['source'] = '&source=' + args.get('source')
param_dict['lr'] = ('&lr=' + ''.join([_ for _ in sub_lang if not _.isdigit()])) if sub_lang else ''
else:
param_dict['lr'] = '&lr=' + config.lang
param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else ''
param_dict['hl'] = '&hl=' + config.lang.replace('lang_', '')
param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off')
for val in param_dict.values():
if not val or val is None:
if not val:
continue
query += val
@ -66,20 +84,14 @@ class Request:
def __init__(self, normal_ua, language='lang_en'):
self.language = language
self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua
self.modified_user_agent = gen_user_agent(normal_ua, self.mobile)
self.modified_user_agent = gen_user_agent(self.mobile)
def __getitem__(self, name):
return getattr(self, name)
def get_decode_value(self):
if 'lang_zh' in self.language:
return 'gb2312'
else:
return 'unicode-escape'
def autocomplete(self, query):
ac_query = dict(hl=self.language, q=query)
response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query))
response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)).text
if response:
dom = etree.fromstring(response)
@ -87,20 +99,9 @@ class Request:
return []
def send(self, base_url=SEARCH_URL, query='', return_bytes=False):
response_header = []
def send(self, base_url=SEARCH_URL, query='') -> Response:
headers = {
'User-Agent': self.modified_user_agent
}
b_obj = BytesIO()
crl = pycurl.Curl()
crl.setopt(crl.URL, base_url + query)
crl.setopt(crl.USERAGENT, self.modified_user_agent)
crl.setopt(crl.WRITEDATA, b_obj)
crl.setopt(crl.HEADERFUNCTION, response_header.append)
crl.setopt(pycurl.FOLLOWLOCATION, 1)
crl.perform()
crl.close()
if return_bytes:
return b_obj.getvalue()
else:
return b_obj.getvalue().decode(self.get_decode_value(), 'ignore')
return requests.get(base_url + query, headers=headers)

View File

@ -1,19 +1,22 @@
from app import app
from app.filter import Filter, get_first_link
from app.models.config import Config
from app.request import Request, gen_query
import argparse
import base64
from bs4 import BeautifulSoup
from cryptography.fernet import Fernet, InvalidToken
from flask import g, jsonify, make_response, request, redirect, render_template, send_file
from functools import wraps
import io
import json
import os
from pycurl import error as pycurl_error
import pickle
import urllib.parse as urlparse
import uuid
from functools import wraps
import waitress
from flask import jsonify, make_response, request, redirect, render_template, send_file, session
from requests import exceptions
from app import app
from app.models.config import Config
from app.request import Request
from app.utils.misc import valid_user_session
from app.utils.routing_utils import *
def auth_required(f):
@ -34,17 +37,30 @@ def auth_required(f):
@app.before_request
def before_request_func():
# Always redirect to https if HTTPS_ONLY is set (otherwise default to false)
g.request_params = request.args if request.method == 'GET' else request.form
g.cookies_disabled = False
# Generate session values for user if unavailable
if not valid_user_session(session):
session['config'] = json.load(open(app.config['DEFAULT_CONFIG'])) \
if os.path.exists(app.config['DEFAULT_CONFIG']) else {'url': request.url_root}
session['uuid'] = str(uuid.uuid4())
session['fernet_keys'] = generate_user_keys(True)
# Flag cookies as possibly disabled in order to prevent against
# unnecessary session directory expansion
g.cookies_disabled = True
if session['uuid'] not in app.user_elements:
app.user_elements.update({session['uuid']: 0})
# Always redirect to https if HTTPS_ONLY is set (otherwise default to False)
https_only = os.getenv('HTTPS_ONLY', False)
config_path = app.config['CONFIG_PATH']
if https_only and request.url.startswith('http://'):
https_url = request.url.replace('http://', 'https://', 1)
code = 308
return redirect(https_url, code=code)
return redirect(request.url.replace('http://', 'https://', 1), code=308)
json_config = json.load(open(config_path)) if os.path.exists(config_path) else {'url': request.url_root}
g.user_config = Config(**json_config)
g.user_config = Config(**session['config'])
if not g.user_config.url:
g.user_config.url = request.url_root.replace('http://', 'https://') if https_only else request.url_root
@ -53,6 +69,27 @@ def before_request_func():
g.app_location = g.user_config.url
@app.after_request
def after_request_func(response):
if app.user_elements[session['uuid']] <= 0 and '/element' in request.url:
# Regenerate element key if all elements have been served to user
session['fernet_keys']['element_key'] = '' if not g.cookies_disabled else app.default_key_set['element_key']
app.user_elements[session['uuid']] = 0
# Check if address consistently has cookies blocked, in which case start removing session
# files after creation.
# Note: This is primarily done to prevent overpopulation of session directories, since browsers that
# block cookies will still trigger Flask's session creation routine with every request.
if g.cookies_disabled and request.remote_addr not in app.no_cookie_ips:
app.no_cookie_ips.append(request.remote_addr)
elif g.cookies_disabled and request.remote_addr in app.no_cookie_ips:
session_list = list(session.keys())
for key in session_list:
session.pop(key)
return response
@app.errorhandler(404)
def unknown_page(e):
return redirect(g.app_location)
@ -61,15 +98,14 @@ def unknown_page(e):
@app.route('/', methods=['GET'])
@auth_required
def index():
# Reset keys
session['fernet_keys'] = generate_user_keys(g.cookies_disabled)
return render_template('index.html',
dark_mode=g.user_config.dark,
ua=g.user_request.modified_user_agent,
languages=Config.LANGUAGES,
countries=Config.COUNTRIES,
current_lang=g.user_config.lang,
current_ctry=g.user_config.ctry,
version_number=app.config['VERSION_NUMBER'],
request_type='get' if g.user_config.get_only else 'post')
config=g.user_config,
version_number=app.config['VERSION_NUMBER'])
@app.route('/opensearch.xml', methods=['GET'])
@ -89,8 +125,7 @@ def opensearch():
@app.route('/autocomplete', methods=['GET', 'POST'])
def autocomplete():
request_params = request.args if request.method == 'GET' else request.form
q = request_params.get('q')
q = g.request_params.get('q')
if not q and not request.data:
return jsonify({'?': []})
@ -103,68 +138,65 @@ def autocomplete():
@app.route('/search', methods=['GET', 'POST'])
@auth_required
def search():
request_params = request.args if request.method == 'GET' else request.form
q = request_params.get('q')
# Reset element counter
app.user_elements[session['uuid']] = 0
if q is None or len(q) == 0:
search_util = RoutingUtils(request, g.user_config, session, cookies_disabled=g.cookies_disabled)
query = search_util.new_search_query()
# Redirect to home if invalid/blank search
if not query:
return redirect('/')
else:
# Attempt to decrypt if this is an internal link
try:
q = Fernet(app.secret_key).decrypt(q.encode()).decode()
except InvalidToken:
pass
feeling_lucky = q.startswith('! ')
# Generate response and number of external elements from the page
response, elements = search_util.generate_response()
if search_util.feeling_lucky:
return redirect(response, code=303)
if feeling_lucky: # Well do you, punk?
q = q[2:]
user_agent = request.headers.get('User-Agent')
mobile = 'Android' in user_agent or 'iPhone' in user_agent
content_filter = Filter(mobile, g.user_config, secret_key=app.secret_key)
full_query = gen_query(q, request_params, g.user_config, content_filter.near)
get_body = g.user_request.send(query=full_query)
dirty_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser')
if feeling_lucky:
return redirect(get_first_link(dirty_soup), 303) # Using 303 so the browser performs a GET request for the URL
else:
formatted_results = content_filter.clean(dirty_soup)
# Set search type to be used in the header template to allow for repeated searches
# in the same category
search_type = request_params.get('tbm') if 'tbm' in request_params else ''
# Keep count of external elements to fetch before element key can be regenerated
app.user_elements[session['uuid']] = elements
return render_template(
'display.html',
query=urlparse.unquote(q),
search_type=search_type,
query=urlparse.unquote(query),
search_type=search_util.search_type,
dark_mode=g.user_config.dark,
response=formatted_results,
response=response,
version_number=app.config['VERSION_NUMBER'],
search_header=render_template(
'header.html',
dark_mode=g.user_config.dark,
q=urlparse.unquote(q),
search_type=search_type,
mobile=g.user_request.mobile) if 'isch' not in search_type else '')
query=urlparse.unquote(query),
search_type=search_util.search_type,
mobile=g.user_request.mobile) if 'isch' not in search_util.search_type else '')
@app.route('/config', methods=['GET', 'POST'])
@app.route('/config', methods=['GET', 'POST', 'PUT'])
@auth_required
def config():
if request.method == 'GET':
return json.dumps(g.user_config.__dict__)
elif request.method == 'PUT':
if 'name' in request.args:
config_pkl = os.path.join(app.config['CONFIG_PATH'], request.args.get('name'))
session['config'] = pickle.load(open(config_pkl, 'rb')) if os.path.exists(config_pkl) else session['config']
return json.dumps(session['config'])
else:
return json.dumps({})
else:
config_data = request.form.to_dict()
if 'url' not in config_data or not config_data['url']:
config_data['url'] = g.user_config.url
with open(app.config['CONFIG_PATH'], 'w') as config_file:
config_file.write(json.dumps(config_data, indent=4))
config_file.close()
# Save config by name to allow a user to easily load later
if 'name' in request.args:
pickle.dump(config_data, open(os.path.join(app.config['CONFIG_PATH'], request.args.get('name')), 'wb'))
# Overwrite default config if user has cookies disabled
if g.cookies_disabled:
open(app.config['DEFAULT_CONFIG'], 'w').write(json.dumps(config_data, indent=4))
session['config'] = config_data
return redirect(config_data['url'])
@ -187,25 +219,22 @@ def imgres():
return redirect(request.args.get('imgurl'))
@app.route('/tmp')
@app.route('/element')
@auth_required
def tmp():
cipher_suite = Fernet(app.secret_key)
img_url = cipher_suite.decrypt(request.args.get('image_url').encode()).decode()
def element():
cipher_suite = Fernet(session['fernet_keys']['element_key'])
src_url = cipher_suite.decrypt(request.args.get('url').encode()).decode()
src_type = request.args.get('type')
try:
file_data = g.user_request.send(base_url=img_url, return_bytes=True)
file_data = g.user_request.send(base_url=src_url).content
app.user_elements[session['uuid']] -= 1
tmp_mem = io.BytesIO()
tmp_mem.write(file_data)
tmp_mem.seek(0)
return send_file(
tmp_mem,
as_attachment=True,
attachment_filename='tmp.png',
mimetype='image/png'
)
except pycurl_error:
return send_file(tmp_mem, mimetype=src_type)
except exceptions.RequestException:
pass
empty_gif = base64.b64decode('R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==')
@ -215,7 +244,7 @@ def tmp():
@app.route('/window')
@auth_required
def window():
get_body = g.user_request.send(base_url=request.args.get('location'))
get_body = g.user_request.send(base_url=request.args.get('location')).text
get_body = get_body.replace('src="/', 'src="' + request.args.get('location') + '"')
get_body = get_body.replace('href="/', 'href="' + request.args.get('location') + '"')

View File

@ -4,7 +4,7 @@ const handleUserInput = searchBar => {
xhrRequest.setRequestHeader("Content-type", "application/x-www-form-urlencoded");
xhrRequest.onload = function() {
if (xhrRequest.readyState === 4 && xhrRequest.status !== 200) {
alert("Error fetching autocomplete results");
// Do nothing if failed to fetch autocomplete results
return;
}

View File

@ -71,6 +71,41 @@ const setupConfigLayout = () => {
fillConfigValues();
};
const loadConfig = event => {
event.preventDefault();
let config = prompt("Enter name of config:");
if (!config) {
alert("Must specify a name for the config to load");
return;
}
let xhrPUT = new XMLHttpRequest();
xhrPUT.open("PUT", "/config?name=" + config + ".conf");
xhrPUT.onload = function() {
if (xhrPUT.readyState === 4 && xhrPUT.status !== 200) {
alert("Error loading Whoogle config");
return;
}
location.reload(true);
};
xhrPUT.send();
};
const saveConfig = event => {
event.preventDefault();
let config = prompt("Enter name for this config:");
if (!config) {
alert("Must specify a name for the config to save");
return;
}
let configForm = document.getElementById("config-form");
configForm.action = '/config?name=' + config + ".conf";
configForm.submit();
};
document.addEventListener("DOMContentLoaded", function() {
setTimeout(function() {
document.getElementById("main").style.display = "block";

View File

@ -11,7 +11,13 @@
<title>{{ query }} - Whoogle Search</title>
</head>
<body>
{{ search_header|safe }}
{{ response|safe }}
{{ search_header|safe }}
{{ response|safe }}
</body>
<footer>
<p style="color: {{ '#fff' if dark_mode else '#000' }};">
Whoogle Search v{{ version_number }} ||
<a style="color: #685e79" href="https://github.com/benbusby/whoogle-search">View on GitHub</a>
</p>
</footer>
</html>

View File

@ -15,7 +15,7 @@
style="background-color: {{ '#000' if dark_mode else '#fff' }};
color: {{ '#685e79' if dark_mode else '#000' }};
border: {{ '1px solid #685e79' if dark_mode else '' }}"
spellcheck="false" type="text" value="{{ q }}">
spellcheck="false" type="text" value="{{ query }}">
<input name="tbm" value="{{ search_type }}" style="display: none">
<div class="sc"></div>
</div>
@ -37,7 +37,7 @@
<div class="autocomplete" style="width: 100%; flex: 1">
<div style="width: 100%; display: flex">
<input id="search-bar" autocapitalize="none" autocomplete="off" class="noHIxc" name="q"
spellcheck="false" type="text" value="{{ q }}"
spellcheck="false" type="text" value="{{ query }}"
style="background-color: {{ '#000' if dark_mode else '#fff' }};
color: {{ '#685e79' if dark_mode else '#000' }};
border: {{ '1px solid #685e79' if dark_mode else '' }}">

View File

@ -21,14 +21,14 @@
<script type="text/javascript" src="/static/js/controller.js"></script>
<link rel="search" href="/opensearch.xml" type="application/opensearchdescription+xml" title="Whoogle Search">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="/static/css/{{ 'search-dark' if dark_mode else 'search' }}.css">
<link rel="stylesheet" href="/static/css/{{ 'search-dark' if config.dark else 'search' }}.css">
<link rel="stylesheet" href="/static/css/main.css">
<title>Whoogle Search</title>
</head>
<body id="main" style="display: none; background-color: {{ '#000' if dark_mode else '#fff' }}">
<body id="main" style="display: none; background-color: {{ '#000' if config.dark else '#fff' }}">
<div class="search-container">
<img class="logo" src="/static/img/logo.png">
<form id="search-form" action="/search" method="{{ request_type }}">
<form id="search-form" action="/search" method="{{ 'get' if config.get_only else 'post' }}">
<div class="search-fields">
<div class="autocomplete">
<input type="text" name="q" id="search-bar" autofocus="autofocus">
@ -40,17 +40,13 @@
<button id="config-collapsible" class="collapsible">Configuration</button>
<div class="content">
<div class="config-fields">
<form action="/config" method="post">
<div class="config-div">
<!-- TODO: Add option to regenerate user agent? -->
<span class="ua-span">User Agent: {{ ua }}</span>
</div>
<form id="config-form" action="/config" method="post">
<div class="config-div">
<label for="config-ctry">Country: </label>
<select name="ctry" id="config-ctry">
{% for ctry in countries %}
<option value="{{ ctry.value }}"
{% if ctry.value in current_ctry %}
{% if ctry.value in config.ctry %}
selected
{% endif %}>
{{ ctry.name }}
@ -63,7 +59,7 @@
<select name="lang" id="config-lang">
{% for lang in languages %}
<option value="{{ lang.value }}"
{% if lang.value in current_lang %}
{% if lang.value in config.lang %}
selected
{% endif %}>
{{ lang.name }}
@ -100,7 +96,9 @@
<input type="text" name="url" id="config-url" value="">
</div>
<div class="config-div">
<input type="submit" id="config-submit" value="Save">
<input type="submit" id="config-load" onclick="loadConfig(event)" value="Load">&nbsp;
<input type="submit" id="config-submit" value="Apply">&nbsp;
<input type="submit" id="config-submit" onclick="saveConfig(event)" value="Save As...">
</div>
</form>
</div>

0
app/utils/__init__.py Normal file
View File

29
app/utils/misc.py Normal file
View File

@ -0,0 +1,29 @@
from cryptography.fernet import Fernet
from flask import current_app as app
REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys']
BLACKLIST = [
'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고',
'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam',
'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés'
]
def generate_user_keys(cookies_disabled=False) -> dict:
if cookies_disabled:
return app.default_key_set
# Generate/regenerate unique key per user
return {
'element_key': Fernet.generate_key(),
'text_key': Fernet.generate_key()
}
def valid_user_session(session):
# Generate secret key for user if unavailable
for value in REQUIRED_SESSION_VALUES:
if value not in session:
return False
return True

View File

@ -0,0 +1,72 @@
from app.filter import Filter, get_first_link
from app.utils.misc import generate_user_keys
from app.request import gen_query
from bs4 import BeautifulSoup
from cryptography.fernet import Fernet, InvalidToken
from flask import g
from typing import Any, Tuple
class RoutingUtils:
def __init__(self, request, config, session, cookies_disabled=False):
self.request_params = request.args if request.method == 'GET' else request.form
self.user_agent = request.headers.get('User-Agent')
self.feeling_lucky = False
self.config = config
self.session = session
self.query = ''
self.cookies_disabled = cookies_disabled
self.search_type = self.request_params.get('tbm') if 'tbm' in self.request_params else ''
def __getitem__(self, name):
return getattr(self, name)
def __setitem__(self, name, value):
return setattr(self, name, value)
def __delitem__(self, name):
return delattr(self, name)
def __contains__(self, name):
return hasattr(self, name)
def new_search_query(self) -> str:
# Generate a new element key each time a new search is performed
self.session['fernet_keys']['element_key'] = generate_user_keys(
cookies_disabled=self.cookies_disabled)['element_key']
q = self.request_params.get('q')
if q is None or len(q) == 0:
return ''
else:
# Attempt to decrypt if this is an internal link
try:
q = Fernet(self.session['fernet_keys']['text_key']).decrypt(q.encode()).decode()
except InvalidToken:
pass
# Reset text key
self.session['fernet_keys']['text_key'] = generate_user_keys(
cookies_disabled=self.cookies_disabled)['text_key']
# Format depending on whether or not the query is a "feeling lucky" query
self.feeling_lucky = q.startswith('! ')
self.query = q[2:] if self.feeling_lucky else q
return self.query
def generate_response(self) -> Tuple[Any, int]:
mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent
content_filter = Filter(self.session['fernet_keys'], mobile=mobile, config=self.config)
full_query = gen_query(self.query, self.request_params, self.config, content_filter.near)
get_body = g.user_request.send(query=full_query).text
# Produce cleanable html soup from response
html_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser')
if self.feeling_lucky:
return get_first_link(html_soup), 1
else:
formatted_results = content_filter.clean(html_soup)
return formatted_results, content_filter.elements

View File

@ -4,15 +4,16 @@ cffi==1.13.2
Click==7.0
cryptography==2.8
Flask==1.1.1
Flask-Session==0.3.2
itsdangerous==1.1.0
Jinja2==2.10.3
lxml==4.5.1
MarkupSafe==1.1.1
pycparser==2.19
pycurl==7.43.0.4
pyOpenSSL==19.1.0
pytest==5.4.1
python-dateutil==2.8.1
requests==2.23.0
six==1.14.0
soupsieve==1.9.5
Werkzeug==0.16.0

View File

@ -8,7 +8,7 @@ setuptools.setup(
author='Ben Busby',
author_email='benbusby@protonmail.com',
name='whoogle-search',
version='0.1.4',
version='0.2.0',
include_package_data=True,
install_requires=requirements,
description='Self-hosted, ad-free, privacy-respecting Google metasearch engine',

View File

@ -1,8 +1,13 @@
from app import app
from app.utils.misc import generate_user_keys
import pytest
@pytest.fixture
def client():
client = app.test_client()
yield client
with app.test_client() as client:
with client.session_transaction() as session:
session['uuid'] = 'test'
session['fernet_keys'] = generate_user_keys()
session['config'] = {}
yield client

33
test/test_misc.py Normal file
View File

@ -0,0 +1,33 @@
from app.utils.misc import generate_user_keys, valid_user_session
def test_generate_user_keys():
keys = generate_user_keys()
assert 'text_key' in keys
assert 'element_key' in keys
assert keys['text_key'] not in keys['element_key']
def test_valid_session(client):
assert not valid_user_session({'fernet_keys': '', 'config': {}})
with client.session_transaction() as session:
assert valid_user_session(session)
def test_request_key_generation(client):
rv = client.get('/')
cookie = rv.headers['Set-Cookie']
rv = client.get('/search?q=test+1', headers={'Cookie': cookie})
assert rv._status_code == 200
with client.session_transaction() as session:
assert valid_user_session(session)
text_key = session['fernet_keys']['text_key']
rv = client.get('/search?q=test+2', headers={'Cookie': cookie})
assert rv._status_code == 200
with client.session_transaction() as session:
assert valid_user_session(session)
assert text_key not in session['fernet_keys']['text_key']

View File

@ -1,13 +1,13 @@
from bs4 import BeautifulSoup
from cryptography.fernet import Fernet
from app.filter import Filter
from app.utils.misc import generate_user_keys
from datetime import datetime
from dateutil.parser import *
def get_search_results(data):
secret_key = Fernet.generate_key()
soup = Filter(secret_key=secret_key).clean(BeautifulSoup(data, 'html.parser'))
secret_key = generate_user_keys()
soup = Filter(user_keys=secret_key).clean(BeautifulSoup(data, 'html.parser'))
main_divs = soup.find('div', {'id': 'main'})
assert len(main_divs) > 1

View File

@ -1,10 +1,13 @@
from app.models.config import Config
import json
import random
demo_config = {
'near': random.choice(['Seattle', 'New York', 'San Francisco']),
'dark_mode': str(random.getrandbits(1)),
'nojs': str(random.getrandbits(1))
'nojs': str(random.getrandbits(1)),
'lang': random.choice(Config.LANGUAGES)['value'],
'ctry': random.choice(Config.COUNTRIES)['value']
}
@ -17,6 +20,7 @@ def test_search(client):
rv = client.get('/search?q=test')
assert rv._status_code == 200
def test_feeling_lucky(client):
rv = client.get('/search?q=!%20test')
assert rv._status_code == 303