Improve static typing throughout repo

Eventually this should be part of a separate mypy CI build, but for now
it's just a general guideline. Future commits and PRs should be
validated for static typing wherever possible.

For reference, the testing commands used for this commit were:

mypy --ignore-missing-imports --pretty --disallow-untyped-calls app/
mypy --ignore-missing-imports --pretty --disallow-untyped-calls test/
Branch: main
Author: Ben Busby, 2021-03-24 15:13:52 -04:00 (committed by Ben Busby)
Commit: 8ad8e66d37 (parent: 892b646a4e)
5 changed files with 86 additions and 42 deletions
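A note on the flags: --disallow-untyped-calls makes it an error for an
annotated function to call an unannotated one, which is what pushes type
hints outward through a codebase. A minimal illustration (hypothetical
functions, not from this repo):

    def scale(x):  # unannotated, so calls to it are "untyped calls"
        return x * 2

    def scale_twice(x: int) -> int:
        # mypy: error: Call to untyped function "scale" in typed context
        return scale(scale(x))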

app/filter.py

@@ -1,6 +1,7 @@
 from app.request import VALID_PARAMS
 from app.utils.results import *
-from bs4.element import ResultSet
+from bs4 import BeautifulSoup
+from bs4.element import ResultSet, Tag
 from cryptography.fernet import Fernet
 import re
 import urllib.parse as urlparse
@@ -8,7 +9,7 @@ from urllib.parse import parse_qs
 
 
 class Filter:
-    def __init__(self, user_keys: dict, mobile=False, config=None):
+    def __init__(self, user_keys: dict, mobile=False, config=None) -> None:
         if config is None:
             config = {}
@@ -29,7 +30,7 @@ class Filter:
     def elements(self):
         return self._elements
 
-    def reskin(self, page):
+    def reskin(self, page: str) -> str:
         # Aesthetic only re-skinning
         if self.dark:
             page = page.replace(
@@ -39,22 +40,22 @@ class Filter:
         return page
 
-    def encrypt_path(self, msg, is_element=False):
+    def encrypt_path(self, path, is_element=False) -> str:
         # Encrypts path to avoid plaintext results in logs
         if is_element:
             # Element paths are encrypted separately from text, to allow key
             # regeneration once all items have been served to the user
             enc_path = Fernet(
                 self.user_keys['element_key']
-            ).encrypt(msg.encode()).decode()
+            ).encrypt(path.encode()).decode()
             self._elements += 1
             return enc_path
 
         return Fernet(
             self.user_keys['text_key']
-        ).encrypt(msg.encode()).decode()
+        ).encrypt(path.encode()).decode()
 
-    def clean(self, soup):
+    def clean(self, soup) -> BeautifulSoup:
         self.main_divs = soup.find('div', {'id': 'main'})
         self.remove_ads()
         self.fix_question_section()
@@ -90,7 +91,12 @@ class Filter:
         return soup
 
-    def remove_ads(self):
+    def remove_ads(self) -> None:
+        """Removes ads found in the list of search result divs
+
+        Returns:
+            None (The soup object is modified directly)
+        """
         if not self.main_divs:
             return
@@ -99,7 +105,16 @@ class Filter:
                        if has_ad_content(_.text)]
             _ = div.decompose() if len(div_ads) else None
 
-    def fix_question_section(self):
+    def fix_question_section(self) -> None:
+        """Collapses the "People Also Asked" section into a "details" element
+
+        These sections are typically the only sections in the results page that
+        are structured as <div><h2>Title</h2><div>...</div></div>, so they are
+        extracted by checking all result divs for h2 children.
+
+        Returns:
+            None (The soup object is modified directly)
+        """
         if not self.main_divs:
             return
@@ -126,7 +141,14 @@ class Filter:
         for question in questions:
             question['style'] = 'padding: 10px; font-style: italic;'
 
-    def update_element_src(self, element, mime):
+    def update_element_src(self, element: Tag, mime: str) -> None:
+        """Encrypts the original src of an element and rewrites the element src
+        to use the "/element?src=" pass-through.
+
+        Returns:
+            None (The soup element is modified directly)
+        """
         src = element['src']
 
         if src.startswith('//'):
@@ -145,7 +167,8 @@ class Filter:
             src,
             is_element=True) + '&type=' + urlparse.quote(mime)
 
-    def update_styling(self, soup):
+    def update_styling(self, soup) -> None:
+        """"""
         # Remove unnecessary button(s)
         for button in soup.find_all('button'):
             button.decompose()
@@ -168,7 +191,17 @@ class Filter:
         except AttributeError:
             pass
 
-    def update_link(self, link):
+    def update_link(self, link: Tag) -> None:
+        """Update internal link paths with encrypted path, otherwise remove
+        unnecessary redirects and/or marketing params from the url
+
+        Args:
+            link: A bs4 Tag element to inspect and update
+
+        Returns:
+            None (the tag is updated directly)
+        """
         # Replace href with only the intended destination (no "utm" type tags)
         href = link['href'].replace('https://www.google.com', '')
         if 'advanced_search' in href or 'tbm=shop' in href:
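A side note on the __init__ signature above (not changed by this commit):
mobile and config are left unannotated, so mypy treats them as Any. A
stricter sketch would spell the types out, with Optional for the None
default:

    from typing import Optional

    class Filter:
        def __init__(self, user_keys: dict, mobile: bool = False,
                     config: Optional[dict] = None) -> None:
            # fall back to an empty config when none is provided
            self.config = config if config is not None else {}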

app/request.py

@@ -29,10 +29,10 @@ class TorError(Exception):
     altogether).
     """
 
-    def __init__(self, message, disable=False):
+    def __init__(self, message, disable=False) -> None:
         self.message = message
         self.disable = disable
-        super().__init__(self.message)
+        super().__init__(message)
 
 
 def send_tor_signal(signal: Signal) -> bool:
@@ -64,7 +64,7 @@ def gen_query(query, args, config, near_city=None) -> str:
     # Use :past(hour/day/week/month/year) if available
     # example search "new restaurants :past month"
-    sub_lang = ''
+    lang = ''
     if ':past' in query and 'tbs' not in args:
         time_range = str.strip(query.split(':past', 1)[-1])
         param_dict['tbs'] = '&tbs=' + ('qdr:' + str.lower(time_range[0]))
@@ -79,9 +79,10 @@
         # Example:
         #   &tbs=qdr:h,lr:lang_1pl
         # -- the lr param needs to be extracted and remove the leading '1'
-        sub_lang = [_ for _ in result_tbs.split(',') if 'lr:' in _]
-        sub_lang = sub_lang[0][sub_lang[0].find('lr:') +
-                               3:len(sub_lang[0])] if len(sub_lang) > 0 else ''
+        result_params = [_ for _ in result_tbs.split(',') if 'lr:' in _]
+        if len(result_params) > 0:
+            result_param = result_params[0]
+            lang = result_param[result_param.find('lr:') + 3:len(result_param)]
 
     # Ensure search query is parsable
     query = urlparse.quote(query)
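The sub_lang -> lang / result_params split above also sidesteps a typing
pitfall: mypy infers a variable's type from its first assignment, so
reusing one name for both a str and a list is rejected. A minimal
reproduction (hypothetical values):

    sub_lang = ''               # inferred as str
    sub_lang = ['lr:lang_1pl']  # mypy: Incompatible types in assignment
                                # (expression has type "List[str]",
                                # variable has type "str")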
@@ -103,8 +104,8 @@
     if 'source' in args:
         param_dict['source'] = '&source=' + args.get('source')
         param_dict['lr'] = ('&lr=' + ''.join(
-            [_ for _ in sub_lang if not _.isdigit()]
-        )) if sub_lang else ''
+            [_ for _ in lang if not _.isdigit()]
+        )) if lang else ''
     else:
         param_dict['lr'] = (
             '&lr=' + config.lang_search
@@ -150,12 +151,12 @@ class Request:
         # Set up proxy, if previously configured
         if os.environ.get('WHOOGLE_PROXY_LOC'):
             auth_str = ''
-            if os.environ.get('WHOOGLE_PROXY_USER'):
-                auth_str = os.environ.get('WHOOGLE_PROXY_USER') + \
-                    ':' + os.environ.get('WHOOGLE_PROXY_PASS')
+            if os.environ.get('WHOOGLE_PROXY_USER', ''):
+                auth_str = os.environ.get('WHOOGLE_PROXY_USER', '') + \
+                    ':' + os.environ.get('WHOOGLE_PROXY_PASS', '')
             self.proxies = {
-                'http': os.environ.get('WHOOGLE_PROXY_TYPE') + '://' +
-                auth_str + '@' + os.environ.get('WHOOGLE_PROXY_LOC'),
+                'http': os.environ.get('WHOOGLE_PROXY_TYPE', '') + '://' +
+                auth_str + '@' + os.environ.get('WHOOGLE_PROXY_LOC', ''),
             }
             self.proxies['https'] = self.proxies['http'].replace('http',
                                                                  'https')
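The added , '' defaults are what satisfy mypy in this block:
os.environ.get(key) without a default is typed as Optional[str], so
concatenating the result with + is rejected because the value may be None.
A minimal sketch:

    import os

    proxy_type = os.environ.get('WHOOGLE_PROXY_TYPE')      # Optional[str]
    # url = proxy_type + '://'   # rejected: proxy_type may be None

    proxy_type = os.environ.get('WHOOGLE_PROXY_TYPE', '')  # str
    url = proxy_type + '://'     # accepted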

app/routes.py

@@ -347,7 +347,7 @@ def window():
     return render_template('display.html', response=results)
 
 
-def run_app():
+def run_app() -> None:
     parser = argparse.ArgumentParser(
         description='Whoogle Search console runner')
     parser.add_argument(

app/utils/results.py

@@ -57,6 +57,7 @@ def get_first_link(soup: BeautifulSoup) -> str:
         # Return the first search result URL
         if 'url?q=' in a['href']:
             return filter_link_args(a['href'])
+    return ''
 
 
 def get_site_alt(link: str) -> str:
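The new return '' is what makes the annotation hold: get_first_link is
declared -> str, and without it the no-match path would fall off the end
and implicitly return None, which mypy reports as a missing return
statement for a non-Optional return type. A minimal reproduction
(hypothetical function):

    def first_result(hrefs: list) -> str:
        for href in hrefs:
            if 'url?q=' in href:
                return href
        return ''  # without this line: error: Missing return statement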

app/utils/search.py

@@ -24,15 +24,24 @@ def needs_https(url: str) -> bool:
         bool: True/False representing the need to upgrade
 
     """
-    https_only = os.getenv('HTTPS_ONLY', False)
+    https_only = bool(os.getenv('HTTPS_ONLY', 0))
     is_heroku = url.endswith('.herokuapp.com')
     is_http = url.startswith('http://')
 
     return (is_heroku and is_http) or (https_only and is_http)
 
 
-def has_captcha(site_contents: str) -> bool:
-    return CAPTCHA in site_contents
+def has_captcha(results: str) -> bool:
+    """Checks to see if the search results are blocked by a captcha
+
+    Args:
+        results: The search page html as a string
+
+    Returns:
+        bool: True/False indicating if a captcha element was found
+
+    """
+    return CAPTCHA in results
 
 
 class Search:
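One caveat with the new bool(os.getenv('HTTPS_ONLY', 0)) line:
environment variables are strings, and bool() of any non-empty string is
True, so HTTPS_ONLY=0 would still evaluate as enabled. A stricter parse
(a sketch, not part of this commit) might be:

    import os

    # only explicit truthy spellings enable HTTPS-only mode
    https_only = os.getenv('HTTPS_ONLY', '').lower() in ('1', 'true', 'yes')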
@@ -118,23 +127,23 @@ class Search:
         """
         mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent
 
-        content_filter = Filter(
-            self.session['fernet_keys'],
-            mobile=mobile,
-            config=self.config)
-        full_query = gen_query(
-            self.query,
-            self.request_params,
-            self.config,
-            content_filter.near)
+        content_filter = Filter(self.session['fernet_keys'],
+                                mobile=mobile,
+                                config=self.config)
+        full_query = gen_query(self.query,
+                               self.request_params,
+                               self.config,
+                               content_filter.near)
         get_body = g.user_request.send(query=full_query)
 
         # Produce cleanable html soup from response
         html_soup = bsoup(content_filter.reskin(get_body.text), 'html.parser')
-        html_soup.insert(
-            0,
-            bsoup(TOR_BANNER, 'html.parser')
-            if g.user_request.tor_valid else bsoup('', 'html.parser'))
+
+        # Indicate whether or not a Tor connection is active
+        tor_banner = bsoup('', 'html.parser')
+        if g.user_request.tor_valid:
+            tor_banner = bsoup(TOR_BANNER, 'html.parser')
+        html_soup.insert(0, tor_banner)
 
         if self.feeling_lucky:
             return get_first_link(html_soup), 0