Improve naming of *_utils files, update fn/class doc

The app/utils/*_utils weren't named very well, and all have been updated
to have more accurate names.

Function and class documentation for the utils has been updated as well,
as part of the effort to improve overall documentation for the project.
main
Ben Busby 2021-03-08 12:22:04 -05:00 committed by Ben Busby
parent 1d612c68a4
commit f8dfc78539
13 changed files with 232 additions and 100 deletions

View File

@ -1,6 +1,6 @@
from app.request import send_tor_signal
from app.utils.session_utils import generate_user_keys
from app.utils.gen_ddg_bangs import gen_bangs_json
from app.utils.session import generate_user_keys
from app.utils.bangs import gen_bangs_json
from flask import Flask
from flask_session import Session
import json

View File

@ -1,5 +1,5 @@
from app.request import VALID_PARAMS
from app.utils.filter_utils import *
from app.utils.results import *
from bs4.element import ResultSet
from cryptography.fernet import Fernet
import re
@ -208,7 +208,7 @@ class Filter:
# Add no-js option
if self.nojs:
gen_nojs(link)
append_nojs(link)
else:
link['href'] = href

View File

@ -23,8 +23,8 @@ class TorError(Exception):
"""Exception raised for errors in Tor requests.
Attributes:
message -- a message describing the error that occurred
disable -- optionally disables Tor in the user config (note:
message: a message describing the error that occurred
disable: optionally disables Tor in the user config (note:
this should only happen if the connection has been dropped
altogether).
"""
@ -133,9 +133,9 @@ class Request:
search suggestions, and loading of external content (images, audio, etc).
Attributes:
normal_ua -- the user's current user agent
root_path -- the root path of the whoogle instance
config -- the user's current whoogle configuration
normal_ua: the user's current user agent
root_path: the root path of the whoogle instance
config: the user's current whoogle configuration
"""
def __init__(self, normal_ua, root_path, config: Config):

View File

@ -16,8 +16,9 @@ from requests import exceptions
from app import app
from app.models.config import Config
from app.request import Request, TorError
from app.utils.session_utils import valid_user_session
from app.utils.routing_utils import *
from app.utils.bangs import resolve_bang
from app.utils.session import valid_user_session
from app.utils.search import *
# Load DDG bang json files only on init
bang_json = json.load(open(app.config['BANG_FILE']))
@ -199,13 +200,13 @@ def search():
# Update user config if specified in search args
g.user_config = g.user_config.from_params(g.request_params)
search_util = RoutingUtils(request, g.user_config, session,
cookies_disabled=g.cookies_disabled)
search_util = Search(request, g.user_config, session,
cookies_disabled=g.cookies_disabled)
query = search_util.new_search_query()
resolved_bangs = search_util.bang_operator(bang_json)
if resolved_bangs != '':
return redirect(resolved_bangs)
bang = resolve_bang(query=query, bangs_dict=bang_json)
if bang != '':
return redirect(bang)
# Redirect to home if invalid/blank search
if not query:

61
app/utils/bangs.py Normal file
View File

@ -0,0 +1,61 @@
import json
import requests
DDG_BANGS = 'https://duckduckgo.com/bang.v255.js'
def gen_bangs_json(bangs_file: str) -> None:
    """Generates a json file from the DDG bangs list

    Every row of the downloaded list is stored under the key ``'!' + t``
    with two values: ``url`` (the row's ``u`` value with the ``{{{s}}}``
    query placeholder rewritten to ``{}``) and ``suggestion`` (the bang
    command followed by the row's ``s`` value in parentheses).

    Args:
        bangs_file: The str path to the new DDG bangs json file

    Returns:
        None

    Raises:
        SystemExit: If the request for the bangs list returns an HTTP error
    """
    try:
        # Request full list from DDG
        r = requests.get(DDG_BANGS)
        r.raise_for_status()
    except requests.exceptions.HTTPError as err:
        raise SystemExit(err)

    # Convert to json
    data = json.loads(r.text)

    # Set up a json object (with better formatting) for all available bangs
    bangs_data = {}
    for row in data:
        bang_command = '!' + row['t']
        bangs_data[bang_command] = {
            'url': row['u'].replace('{{{s}}}', '{}'),
            'suggestion': bang_command + ' (' + row['s'] + ')'
        }

    # Write through a context manager so the file handle is always flushed
    # and closed, rather than being left for the garbage collector
    with open(bangs_file, 'w') as bangs_out:
        json.dump(bangs_data, bangs_out)
def resolve_bang(query: str, bangs_dict: dict) -> str:
    """Transforms a user's query to a bang search, if an operator is found

    Args:
        query: The search query
        bangs_dict: The dict of available bang operators, each mapping to a
                    dict whose 'url' value is a search URL containing a '{}'
                    placeholder for the query
                    (i.e. "!w": {"url": "https://en.wikipedia.org...?search={}"})

    Returns:
        str: A formatted redirect for a bang search, or an empty str if there
             wasn't a match or didn't contain a bang operator
    """
    split_query = query.split(' ')
    # Set membership keeps each check O(1) while scanning the bangs dict
    query_terms = set(split_query)

    for operator in bangs_dict:
        if operator not in query_terms:
            continue

        # Drop only whole-word occurrences of the operator; a plain
        # str.replace would also cut the operator text out of other words
        # (e.g. "!w" inside "!wiki")
        search_terms = ' '.join(
            term for term in split_query if term != operator).strip()

        # Substitute the '{}' placeholder literally instead of calling
        # str.format, which raises on URLs containing any other curly braces
        return bangs_dict[operator]['url'].replace('{}', search_terms)

    return ''

View File

@ -1,26 +0,0 @@
import json
import requests
def gen_bangs_json(bangs_file):
    """Generates a json file from the DDG bangs list

    Every row of the downloaded list is stored under the key ``'!' + t``
    with two values: ``url`` (the row's ``u`` value with the ``{{{s}}}``
    query placeholder rewritten to ``{}``) and ``suggestion`` (the bang
    command followed by the row's ``s`` value in parentheses).

    Args:
        bangs_file: The path of the json file to write

    Raises:
        SystemExit: If the request for the bangs list returns an HTTP error
    """
    # Request list
    try:
        r = requests.get('https://duckduckgo.com/bang.v255.js')
        r.raise_for_status()
    except requests.exceptions.HTTPError as err:
        raise SystemExit(err)

    # Convert to json
    data = json.loads(r.text)

    # Set up a json object (with better formatting) for all available bangs
    bangs_data = {}
    for row in data:
        bang_command = '!' + row['t']
        bangs_data[bang_command] = {
            'url': row['u'].replace('{{{s}}}', '{}'),
            'suggestion': bang_command + ' (' + row['s'] + ')'
        }

    # Write through a context manager so the file handle is always flushed
    # and closed, rather than being left for the garbage collector
    with open(bangs_file, 'w') as bangs_out:
        json.dump(bangs_data, bangs_out)

View File

@ -28,12 +28,30 @@ SITE_ALTS = {
}
def has_ad_content(element: str):
def has_ad_content(element: str) -> bool:
"""Inspects an HTML element for ad related content
Args:
element: The HTML element to inspect
Returns:
bool: True/False for the element containing an ad
"""
return element.upper() in (value.upper() for value in BLACKLIST) \
or '' in element
def get_first_link(soup):
def get_first_link(soup: BeautifulSoup) -> str:
"""Retrieves the first result link from the query response
Args:
soup: The BeautifulSoup response body
Returns:
str: A str link to the first result
"""
# Replace hrefs with only the intended destination (no "utm" type tags)
for a in soup.find_all('a', href=True):
# Return the first search result URL
@ -41,7 +59,16 @@ def get_first_link(soup):
return filter_link_args(a['href'])
def get_site_alt(link: str):
def get_site_alt(link: str) -> str:
"""Returns an alternative to a particular site, if one is configured
Args:
link: A string result URL to check against the SITE_ALTS map
Returns:
str: An updated (or ignored) result link
"""
for site_key in SITE_ALTS.keys():
if site_key not in link:
continue
@ -55,13 +82,22 @@ def get_site_alt(link: str):
return link
def filter_link_args(query_link):
parsed_link = urlparse.urlparse(query_link)
def filter_link_args(link: str) -> str:
"""Filters out unnecessary URL args from a result link
Args:
link: The string result link to check for extraneous URL params
Returns:
str: An updated (or ignored) result link
"""
parsed_link = urlparse.urlparse(link)
link_args = parse_qs(parsed_link.query)
safe_args = {}
if len(link_args) == 0 and len(parsed_link) > 0:
return query_link
return link
for arg in link_args.keys():
if arg in SKIP_ARGS:
@ -70,19 +106,28 @@ def filter_link_args(query_link):
safe_args[arg] = link_args[arg]
# Remove original link query and replace with filtered args
query_link = query_link.replace(parsed_link.query, '')
link = link.replace(parsed_link.query, '')
if len(safe_args) > 0:
query_link = query_link + urlparse.urlencode(safe_args, doseq=True)
link = link + urlparse.urlencode(safe_args, doseq=True)
else:
query_link = query_link.replace('?', '')
link = link.replace('?', '')
return query_link
return link
def gen_nojs(sibling):
def append_nojs(result: BeautifulSoup) -> None:
"""Appends a no-Javascript alternative for a search result
Args:
result: The search result to append a no-JS link to
Returns:
None
"""
nojs_link = BeautifulSoup(features='html.parser').new_tag('a')
nojs_link['href'] = '/window?location=' + sibling['href']
nojs_link['href'] = '/window?location=' + result['href']
nojs_link['style'] = 'display:block;width:100%;'
nojs_link.string = 'NoJS Link: ' + nojs_link['href']
sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
sibling.append(nojs_link)
result.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
result.append(nojs_link)

View File

@ -1,5 +1,5 @@
from app.filter import Filter, get_first_link
from app.utils.session_utils import generate_user_keys
from app.utils.session import generate_user_keys
from app.request import gen_query
from bs4 import BeautifulSoup as bsoup
from cryptography.fernet import Fernet, InvalidToken
@ -11,6 +11,18 @@ TOR_BANNER = '<hr><h1 style="text-align: center">You are using Tor</h1><hr>'
def needs_https(url: str) -> bool:
"""Checks if the current instance needs to be upgraded to HTTPS
Note that all Heroku instances are available by default over HTTPS, but
do not automatically set up a redirect when visited over HTTP.
Args:
url: The instance url
Returns:
bool: True/False representing the need to upgrade
"""
https_only = os.getenv('HTTPS_ONLY', False)
is_heroku = url.endswith('.herokuapp.com')
is_http = url.startswith('http://')
@ -18,7 +30,15 @@ def needs_https(url: str) -> bool:
return (is_heroku and is_http) or (https_only and is_http)
class RoutingUtils:
class Search:
"""Search query preprocessor - used before submitting the query or
redirecting to another site
Attributes:
request: the incoming flask request
config: the current user config settings
session: the flask user session
"""
def __init__(self, request, config, session, cookies_disabled=False):
method = request.method
self.request_params = request.args if method == 'GET' else request.form
@ -31,19 +51,28 @@ class RoutingUtils:
self.search_type = self.request_params.get(
'tbm') if 'tbm' in self.request_params else ''
def __getitem__(self, name):
def __getitem__(self, name) -> Any:
return getattr(self, name)
def __setitem__(self, name, value):
def __setitem__(self, name, value) -> None:
return setattr(self, name, value)
def __delitem__(self, name):
def __delitem__(self, name) -> None:
return delattr(self, name)
def __contains__(self, name):
def __contains__(self, name) -> bool:
return hasattr(self, name)
def new_search_query(self) -> str:
"""Parses a plaintext query into a valid string for submission
Also decrypts the query string, if encrypted (in the case of
paginated results).
Returns:
str: A valid query string
"""
# Generate a new element key each time a new search is performed
self.session['fernet_keys']['element_key'] = generate_user_keys(
cookies_disabled=self.cookies_disabled)['element_key']
@ -70,17 +99,18 @@ class RoutingUtils:
self.query = q[2:] if self.feeling_lucky else q
return self.query
def bang_operator(self, bangs_dict: dict) -> str:
split_query = self.query.split(' ')
for operator in bangs_dict.keys():
if operator not in split_query:
continue
return bangs_dict[operator]['url'].format(
self.query.replace(operator, '').strip())
return ''
def generate_response(self) -> Tuple[Any, int]:
"""Generates a response for the user's query
Returns:
Tuple[Any, int]: A tuple in the format (response, # of elements)
For example, in the case of a "feeling lucky"
search, the response is a result URL, with no
encrypted elements to account for. Otherwise, the
response is a BeautifulSoup response body, with
N encrypted elements to track before key regen.
"""
mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent
content_filter = Filter(
@ -102,7 +132,7 @@ class RoutingUtils:
if g.user_request.tor_valid else bsoup('', 'html.parser'))
if self.feeling_lucky:
return get_first_link(html_soup), 1
return get_first_link(html_soup), 0
else:
formatted_results = content_filter.clean(html_soup)

45
app/utils/session.py Normal file
View File

@ -0,0 +1,45 @@
from cryptography.fernet import Fernet
from flask import current_app as app
REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys']
def generate_user_keys(cookies_disabled=False) -> dict:
    """Builds the Fernet key set to use for a user

    Args:
        cookies_disabled: When truthy, no new keys are generated and the
                          app's default key set (app.default_key_set) is
                          returned instead.

    Returns:
        dict: Either the app's default key set, or a dict holding freshly
              generated 'element_key' and 'text_key' values
    """
    if not cookies_disabled:
        # Generate/regenerate a unique pair of keys for this user
        return {
            key_name: Fernet.generate_key()
            for key_name in ('element_key', 'text_key')
        }

    return app.default_key_set
def valid_user_session(session: dict) -> bool:
    """Checks that a user session holds every required value

    Args:
        session: The current Flask user session

    Returns:
        bool: True if every entry of REQUIRED_SESSION_VALUES is present in
              the session, otherwise False
    """
    # all() stops at the first missing value, same as an early return
    return all(key in session for key in REQUIRED_SESSION_VALUES)

View File

@ -1,24 +0,0 @@
from cryptography.fernet import Fernet
from flask import current_app as app
REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys']
def generate_user_keys(cookies_disabled=False) -> dict:
    """Builds the Fernet key set to use for a user

    With cookies_disabled set, the app's default key set
    (app.default_key_set) is returned; otherwise a dict with newly generated
    'element_key' and 'text_key' values is returned.
    """
    if not cookies_disabled:
        # Generate/regenerate a unique pair of keys for this user
        return {
            key_name: Fernet.generate_key()
            for key_name in ('element_key', 'text_key')
        }

    return app.default_key_set
def valid_user_session(session):
    """Returns True only if every entry of REQUIRED_SESSION_VALUES is
    present in the given session, otherwise False
    """
    # all() stops at the first missing value, same as an early return
    return all(key in session for key in REQUIRED_SESSION_VALUES)

View File

@ -1,5 +1,5 @@
from app import app
from app.utils.session_utils import generate_user_keys
from app.utils.session import generate_user_keys
import pytest
import random

View File

@ -1,4 +1,4 @@
from app.utils.session_utils import generate_user_keys, valid_user_session
from app.utils.session import generate_user_keys, valid_user_session
def test_generate_user_keys():

View File

@ -1,6 +1,6 @@
from bs4 import BeautifulSoup
from app.filter import Filter
from app.utils.session_utils import generate_user_keys
from app.utils.session import generate_user_keys
from datetime import datetime
from dateutil.parser import *