whoogle-search/app/utils/misc.py

import base64
from bs4 import BeautifulSoup as bsoup
from cryptography.fernet import Fernet
from flask import Request
import hashlib
import io
import os
import re
from requests import exceptions, get
from urllib.parse import urlparse

# Base url of DuckDuckGo's favicon service; fetch_favicon() appends
# "/<domain>.ico" to it
# NOTE(review): this is plain http -- confirm whether https should be used
ddg_favicon_site = 'http://icons.duckduckgo.com/ip2'

# Raw bytes of a GIF image, decoded once at import time (judging by the
# name, a blank image -- TODO confirm where it is served)
empty_gif = base64.b64decode(
    'R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==')

# Raw bytes of the PNG image that fetch_favicon() returns when no favicon
# could be retrieved
placeholder_img = base64.b64decode(
    'iVBORw0KGgoAAAANSUhEUgAAABkAAAAZCAYAAADE6YVjAAABF0lEQVRIS8XWPw9EMBQA8Eok' \
    'JBKrMFqMBt//GzAYLTZ/VomExPDu6uLiaPteqVynBn0/75W2Vp7nEIYhe6p1XcespmmAd7Is' \
    'M+4URcGiKPogvMMvmIS2eN9MOMKbKWgf54SYgI4vKkTuQKJKSJErkKzUSkQHUs0lilAg7GMh' \
    'ISoIA/hYMiKCKIA2soeowCWEMkfHtUmrXLcyGYYBfN9HF8djiaglWzNZlgVs21YisoAUaEXG' \
    'cQTP86QIFgi7vyLzPIPjOEIEC7ANQv/4aZrAdd0TUtc1i+MYnSsMWjPp+x6CIPgJVlUVS5KE' \
    'DKig/+wnVzM4pnzaGeHd+ENlWbI0TbVLJBtw2uMfP63wc9d2kDCWxi5Q27bsBerSJ9afJbeL' \
    'AAAAAElFTkSuQmCC'
)


def fetch_favicon(url: str) -> bytes:
    """Fetches a favicon using DuckDuckGo's favicon retriever

    Args:
        url: The url to fetch the favicon from
    Returns:
        bytes - the favicon bytes, or a placeholder image if one
        was not returned (non-200 status, empty body, or the request
        itself failed or timed out)
    """
    domain = urlparse(url).netloc

    try:
        # Bound the wait so an unresponsive icon host cannot stall the caller
        response = get(f'{ddg_favicon_site}/{domain}.ico', timeout=10)
    except exceptions.RequestException:
        # A favicon is cosmetic: fall back rather than propagate the error
        return placeholder_img

    if response.status_code == 200 and response.content:
        return response.content

    return placeholder_img


def gen_file_hash(path: str, static_file: str) -> str:
    """Builds a file name that embeds a short hash of the file's contents

    Args:
        path: The directory that contains the file
        static_file: The name of the file, relative to path
    Returns:
        str - static_file with the first 8 hex characters of the MD5
        digest of its contents inserted before the extension
        (i.e. "<name>.<hash><ext>")
    """
    # The context manager closes the handle as soon as the read is done
    with open(os.path.join(path, static_file), 'rb') as f:
        file_contents = f.read()

    file_hash = hashlib.md5(file_contents).hexdigest()[:8]
    name, ext = os.path.splitext(static_file)

    return f'{name}.{file_hash}{ext}'


def read_config_bool(var: str) -> bool:
    """Reads an environment variable as a boolean flag

    Args:
        var: The name of the environment variable
    Returns:
        bool - True if the value, compared case-insensitively, is one of
        'true', 't', '1', 'yes' or 'y'; False for anything else,
        including when the variable is not set
    """
    truthy_values = ('true', 't', '1', 'yes', 'y')
    raw_value = os.getenv(var, '0')
    return raw_value.lower() in truthy_values


def get_client_ip(r: Request) -> str:
    """Determines the client address for a request

    Args:
        r: The incoming request
    Returns:
        str - the HTTP_X_FORWARDED_FOR value from the request environ
        if one is present, otherwise the REMOTE_ADDR value
    """
    forwarded_for = r.environ.get('HTTP_X_FORWARDED_FOR')
    if forwarded_for is not None:
        return forwarded_for

    return r.environ['REMOTE_ADDR']


def get_request_url(url: str) -> str:
    """Upgrades a request url to https when HTTPS_ONLY is enabled

    Args:
        url: The url of the incoming request
    Returns:
        str - the url with a leading "http://" replaced by "https://"
        if the HTTPS_ONLY environment variable is set to a true value
        ('true', 't', '1', 'yes' or 'y', in any letter case), otherwise
        the url unchanged
    """
    # Accept the same true values as read_config_bool, so that
    # HTTPS_ONLY=0 (or "false") leaves the upgrade switched off
    https_only = os.getenv('HTTPS_ONLY', '0').lower() in (
        'true', 't', '1', 'yes', 'y')

    # Only rewrite the scheme itself, never an "http://" that appears
    # further along in the url (e.g. inside a query string)
    if https_only and url.startswith('http://'):
        return 'https://' + url[len('http://'):]

    return url


def get_proxy_host_url(r: Request, default: str, root=False) -> str:
    """Rebuilds the externally visible url from X-Forwarded-* headers

    Args:
        r: The incoming request
        default: The url to return if the request has no
                 X-Forwarded-Host header
        root: If True, the request's full path is left out of the result
    Returns:
        str - "<scheme>://<forwarded host><prefix><full path>", where
        scheme comes from X-Forwarded-Proto (default "https") and prefix
        from the WHOOGLE_URL_PREFIX environment variable, or default if
        there is no X-Forwarded-Host header
    """
    http_host = r.headers.get('X-Forwarded-Host')
    if not http_host:
        return default

    scheme = r.headers.get('X-Forwarded-Proto', 'https')

    full_path = '' if root else r.full_path
    # A non-empty path must be joined to the host with exactly one slash:
    # add one only when it is missing, so "/search" never becomes "//search"
    if full_path and not full_path.startswith('/'):
        full_path = f'/{full_path}'

    prefix = os.environ.get('WHOOGLE_URL_PREFIX', '')
    if prefix:
        # Strip everything but letters and digits before using the prefix
        # as a path segment
        prefix = f'/{re.sub("[^0-9a-zA-Z]+", "", prefix)}'

    return f'{scheme}://{http_host}{prefix}{full_path}'


def check_for_update(version_url: str, current: str) -> int:
    """Checks whether a newer version of Whoogle has been released

    Args:
        version_url: The url of the page that lists the latest release
        current: The version string of the running instance
    Returns:
        The latest version as an int made of its concatenated digits
        (e.g. "0.8.4" -> 84) if it is newer than current, otherwise an
        empty string. Any failure to fetch or parse the latest version
        also yields an empty string.
    """
    try:
        # Bound the wait so a stalled connection cannot block the caller
        update = bsoup(get(version_url, timeout=10).text, 'html.parser')
        # Drop the first character of the release name (e.g. "v0.8.4")
        latest = update.select_one('[class="Link--primary"]').string[1:]

        # Compare component by component; comparing the concatenated
        # digits would rank 0.9.10 (910) above 0.10.0 (100)
        current_parts = tuple(int(n) for n in re.findall(r'\d+', current))
        latest_parts = tuple(int(n) for n in re.findall(r'\d+', latest))

        if not current_parts or not latest_parts:
            # One of the versions could not be read: report no update
            has_update = ''
        elif current_parts >= latest_parts:
            has_update = ''
        else:
            has_update = int(''.join(filter(str.isdigit, latest)))
    except (exceptions.RequestException, AttributeError,
            TypeError, ValueError):
        # Ignore failures, assume current version is up to date
        has_update = ''

    return has_update


def get_abs_url(url, page_url):
    """Creates a valid absolute URL using a partial or relative URL

    Args:
        url: The partial, relative or absolute url found on a page
        page_url: The url of the page that the url was found on
    Returns:
        str - the absolute url:
        "//host/path" is prefixed with "https:",
        "/path" is joined to the scheme and host of page_url,
        "./path" is appended to page_url,
        and anything else is returned unchanged
    """
    if url.startswith('//'):
        return f'https:{url}'
    elif url.startswith('/'):
        page = urlparse(page_url)
        if not page.netloc:
            # No host to anchor the path to, nothing can be added
            return url
        # Include the scheme: a bare "host/path" is not an absolute url
        return f'{page.scheme or "https"}://{page.netloc}{url}'
    elif url.startswith('./'):
        return f'{page_url}{url[2:]}'
    return url


def list_to_dict(lst: list) -> dict:
    """Converts a flat list of alternating keys and values into a dict

    All spaces are removed from both the keys and the values.

    Args:
        lst: The list to convert, laid out as [key, value, key, value, ...]
    Returns:
        dict - the key/value pairs, or an empty dict if the list has
        fewer than two items. A trailing key without a value is ignored.
    """
    if len(lst) < 2:
        return {}
    # zip stops at the shorter slice, so an unpaired trailing key is
    # dropped instead of raising an IndexError
    return {key.replace(' ', ''): value.replace(' ', '')
            for key, value in zip(lst[::2], lst[1::2])}


def encrypt_string(key: bytes, string: str) -> str:
    """Encrypts a string with Fernet

    Args:
        key: The Fernet key to encrypt with
        string: The text to encrypt
    Returns:
        str - the encrypted token, decoded to a string
    """
    token = Fernet(key).encrypt(string.encode())
    return token.decode()


def decrypt_string(key: bytes, string: str) -> str:
    """Decrypts a Fernet token, such as one returned by encrypt_string

    Args:
        key: The Fernet key that the token was encrypted with
        string: The encrypted token
    Returns:
        str - the decrypted text
    """
    # Use the key that was passed in; this module imports no flask "g"
    # object to read a session key from
    cipher_suite = Fernet(key)
    return cipher_suite.decrypt(string.encode()).decode()
Display audio controls, refactor site icon placement Audio controls are now always shown by default (mostly found in searches that contain word pronunciation guides). Site icons were moved to the left side of the results. 2023-10-12 00:41:48 +03:00			`import base64`
Check for updates using 24 hour time delta Rather than only checking for an available update on app init, the check for updates now performs the check once every 24 hours on the first request sent after that period. This also now catches the requests.exceptions.ConnectionError that is thrown if the app is initialized without an active internet connection. Fixes #649 2022-02-14 22:19:02 +03:00			`from bs4 import BeautifulSoup as bsoup`
Redirect POST search -> enc GET request This should fix the annoyance with browsers like Firefox not caching POST request responses. By redirecting a POST search to be a GET request instead (with an encrypted query string), the page can be cached and successfully navigated back to after visiting a result. 2023-10-17 01:28:36 +03:00			`from cryptography.fernet import Fernet`
Expand 'my ip' to work for proxied requests Adds a check for the HTTP_X_FORWARDED_FOR header, and uses the value from the request if found. 2021-10-29 06:31:24 +03:00			`from flask import Request`
Use cache busting for css/js files On app init, short hashes are generated from file checksums to use for cache busting. These hashes are added into the full file name and used to symlink to the actual file contents. These symlinks are loaded in the jinja templates for each page, and can tell the browser to load a new file if the hash changes. This is only in place for css and js files, but can be extended in the future for other file types if needed. 2021-07-01 02:00:01 +03:00			`import hashlib`
Fetch fallback site icons from DDG DDG provides favicons using the url format icons.duckduckgo.com/ip2/{site}.ico This can be used to fetch favicons in the event that the default "/favicon.ico" path does not work. 2023-10-12 02:26:12 +03:00			`import io`
Use cache busting for css/js files On app init, short hashes are generated from file checksums to use for cache busting. These hashes are added into the full file name and used to symlink to the actual file contents. These symlinks are loaded in the jinja templates for each page, and can tell the browser to load a new file if the hash changes. This is only in place for css and js files, but can be extended in the future for other file types if needed. 2021-07-01 02:00:01 +03:00			`import os`
Include url prefix for reverse proxied instances The url prefix was not included when reconstructing the root url using X-Forwarded-* headers, causing some elements to fail to load properly. Fixes #937 2023-01-30 22:13:46 +03:00			`import re`
Check for updates using 24 hour time delta Rather than only checking for an available update on app init, the check for updates now performs the check once every 24 hours on the first request sent after that period. This also now catches the requests.exceptions.ConnectionError that is thrown if the app is initialized without an active internet connection. Fixes #649 2022-02-14 22:19:02 +03:00			`from requests import exceptions, get`
Support proxying results through Whoogle (aka "anonymous view") (#682) * Expand `/window` endpoint to behave like a proxy The `/window` endpoint was previously used as a type of proxy, but only for removing Javascript from the result page. This expands the existing functionality to allow users to proxy search result pages (with or without Javascript) through their Whoogle instance. * Implement filtering of remote content from css * Condense NoJS feature into Anonymous View Enabling NoJS now removes Javascript from the Anonymous View, rather than creating a separate option. * Exclude 'data:' urls from filter, add translations The 'data:' url must be allowed in results to view certain elements on the page, such as stars for review based results. Add translations for the remaining languages. * Add cssutils to requirements 2022-04-13 20:29:07 +03:00			`from urllib.parse import urlparse`
Use cache busting for css/js files On app init, short hashes are generated from file checksums to use for cache busting. These hashes are added into the full file name and used to symlink to the actual file contents. These symlinks are loaded in the jinja templates for each page, and can tell the browser to load a new file if the hash changes. This is only in place for css and js files, but can be extended in the future for other file types if needed. 2021-07-01 02:00:01 +03:00
Fetch fallback site icons from DDG DDG provides favicons using the url format icons.duckduckgo.com/ip2/{site}.ico This can be used to fetch favicons in the event that the default "/favicon.ico" path does not work. 2023-10-12 02:26:12 +03:00			`ddg_favicon_site = 'http://icons.duckduckgo.com/ip2'`

Display audio controls, refactor site icon placement Audio controls are now always shown by default (mostly found in searches that contain word pronunciation guides). Site icons were moved to the left side of the results. 2023-10-12 00:41:48 +03:00			`empty_gif = base64.b64decode(`
			`'R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==')`

			`placeholder_img = base64.b64decode(`
			`'iVBORw0KGgoAAAANSUhEUgAAABkAAAAZCAYAAADE6YVjAAABF0lEQVRIS8XWPw9EMBQA8Eok' \`
			`'JBKrMFqMBt//GzAYLTZ/VomExPDu6uLiaPteqVynBn0/75W2Vp7nEIYhe6p1XcespmmAd7Is' \`
			`'M+4URcGiKPogvMMvmIS2eN9MOMKbKWgf54SYgI4vKkTuQKJKSJErkKzUSkQHUs0lilAg7GMh' \`
			`'ISoIA/hYMiKCKIA2soeowCWEMkfHtUmrXLcyGYYBfN9HF8djiaglWzNZlgVs21YisoAUaEXG' \`
			`'cQTP86QIFgi7vyLzPIPjOEIEC7ANQv/4aZrAdd0TUtc1i+MYnSsMWjPp+x6CIPgJVlUVS5KE' \`
			`'DKig/+wnVzM4pnzaGeHd+ENlWbI0TbVLJBtw2uMfP63wc9d2kDCWxi5Q27bsBerSJ9afJbeL' \`
			`'AAAAAElFTkSuQmCC'`
			`)`

Use cache busting for css/js files On app init, short hashes are generated from file checksums to use for cache busting. These hashes are added into the full file name and used to symlink to the actual file contents. These symlinks are loaded in the jinja templates for each page, and can tell the browser to load a new file if the hash changes. This is only in place for css and js files, but can be extended in the future for other file types if needed. 2021-07-01 02:00:01 +03:00
Fetch fallback site icons from DDG DDG provides favicons using the url format icons.duckduckgo.com/ip2/{site}.ico This can be used to fetch favicons in the event that the default "/favicon.ico" path does not work. 2023-10-12 02:26:12 +03:00			`def fetch_favicon(url: str) -> bytes:`
			`"""Fetches a favicon using DuckDuckGo's favicon retriever`

			`Args:`
			`url: The url to fetch the favicon from`
			`Returns:`
			`bytes - the favicon bytes, or a placeholder image if one`
			`was not returned`
			`"""`
			`domain = urlparse(url).netloc`

			`response = get(f'{ddg_favicon_site}/{domain}.ico')`

			`if response.status_code == 200 and len(response.content) > 0:`
			`tmp_mem = io.BytesIO()`
			`tmp_mem.write(response.content)`
			`tmp_mem.seek(0)`

			`return tmp_mem.read()`
			`else:`
			`return placeholder_img`


Use cache busting for css/js files On app init, short hashes are generated from file checksums to use for cache busting. These hashes are added into the full file name and used to symlink to the actual file contents. These symlinks are loaded in the jinja templates for each page, and can tell the browser to load a new file if the hash changes. This is only in place for css and js files, but can be extended in the future for other file types if needed. 2021-07-01 02:00:01 +03:00			`def gen_file_hash(path: str, static_file: str) -> str:`
			`file_contents = open(os.path.join(path, static_file), 'rb').read()`
			`file_hash = hashlib.md5(file_contents).hexdigest()[:8]`
			`filename_split = os.path.splitext(static_file)`

			`return filename_split[0] + '.' + file_hash + filename_split[-1]`
Disable autocomplete via WHOOGLE_AUTOCOMPLETE var Setting WHOOGLE_AUTOCOMPLETE to 0 now disables the autocomplete/search suggestion feature. Closes #462 2021-10-15 03:58:13 +03:00

			`def read_config_bool(var: str) -> bool:`
			`val = os.getenv(var, '0')`
Allow different `true` values for config vars (#841) * Fixes read_config_bool to allow several true params * add upper case comment 2022-09-07 21:54:43 +03:00			`# user can specify one of the following values as 'true' inputs (all`
			`# variants with upper case letters will also work):`
			`# ('true', 't', '1', 'yes', 'y')`
			`val = val.lower() in ('true', 't', '1', 'yes', 'y')`
			`return val`
Expand 'my ip' to work for proxied requests Adds a check for the HTTP_X_FORWARDED_FOR header, and uses the value from the request if found. 2021-10-29 06:31:24 +03:00

			`def get_client_ip(r: Request) -> str:`
			`if r.environ.get('HTTP_X_FORWARDED_FOR') is None:`
			`return r.environ['REMOTE_ADDR']`
			`else:`
			`return r.environ['HTTP_X_FORWARDED_FOR']`
Fix incorrect redirect protocol used by Flask Flask's `request.url` uses `http` as the protocol, which breaks instances that enforce `https`, since the session redirect relies on `request.url` for the follow-through URL. This introduces a new method for determining the correct URL to use for these redirects by automatically replacing the protocol with `https` if the `HTTPS_ONLY` env var is set for that instance. Fixes #538 Fixes #545 2021-11-22 09:21:04 +03:00

			`def get_request_url(url: str) -> str:`
			`if os.getenv('HTTPS_ONLY', False):`
			`return url.replace('http://', 'https://', 1)`

			`return url`
Check for updates using 24 hour time delta Rather than only checking for an available update on app init, the check for updates now performs the check once every 24 hours on the first request sent after that period. This also now catches the requests.exceptions.ConnectionError that is thrown if the app is initialized without an active internet connection. Fixes #649 2022-02-14 22:19:02 +03:00

Include full path when determining proxy host url Session validation includes a method for determining the proxy host url, but previously did not include the path for the initial request. This caused a situation where users with a new session would not be able to complete their first search, since the session validation follow-through url did not include the actual path for their search query. The method now includes a flag for only extracting the root url, which is needed for creating full urls in the content filter. Fixes #708 2022-08-02 19:55:45 +03:00			`def get_proxy_host_url(r: Request, default: str, root=False) -> str:`
			`scheme = r.headers.get('X-Forwarded-Proto', 'https')`
Use X-Forwarded-Host as url_root when present (#799) If Whoogle is accessed on a non-standard port _and_ proxied, this port is lost to the application and `element['src']`s are incorrectly formed (omitting port). HTTP x-Forwarded-Host will contain this front port number in a typical Nginx reverse proxy configuration. 2022-07-05 19:01:47 +03:00			`http_host = r.headers.get('X-Forwarded-Host')`
Include url prefix for reverse proxied instances The url prefix was not included when reconstructing the root url using X-Forwarded-* headers, causing some elements to fail to load properly. Fixes #937 2023-01-30 22:13:46 +03:00
			`full_path = r.full_path if not root else ''`
			`if full_path.startswith('/'):`
			`full_path = f'/{full_path}'`

Use X-Forwarded-Host as url_root when present (#799) If Whoogle is accessed on a non-standard port _and_ proxied, this port is lost to the application and `element['src']`s are incorrectly formed (omitting port). HTTP x-Forwarded-Host will contain this front port number in a typical Nginx reverse proxy configuration. 2022-07-05 19:01:47 +03:00			`if http_host:`
Include url prefix for reverse proxied instances The url prefix was not included when reconstructing the root url using X-Forwarded-* headers, causing some elements to fail to load properly. Fixes #937 2023-01-30 22:13:46 +03:00			`prefix = os.environ.get('WHOOGLE_URL_PREFIX', '')`
			`if prefix:`
			`prefix = f'/{re.sub("[^0-9a-zA-Z]+", "", prefix)}'`
			`return f'{scheme}://{http_host}{prefix}{full_path}'`
Use X-Forwarded-Host as url_root when present (#799) If Whoogle is accessed on a non-standard port _and_ proxied, this port is lost to the application and `element['src']`s are incorrectly formed (omitting port). HTTP x-Forwarded-Host will contain this front port number in a typical Nginx reverse proxy configuration. 2022-07-05 19:01:47 +03:00
			`return default`


Check for updates using 24 hour time delta Rather than only checking for an available update on app init, the check for updates now performs the check once every 24 hours on the first request sent after that period. This also now catches the requests.exceptions.ConnectionError that is thrown if the app is initialized without an active internet connection. Fixes #649 2022-02-14 22:19:02 +03:00			`def check_for_update(version_url: str, current: str) -> int:`
			`# Check for the latest version of Whoogle`
			`try:`
			`update = bsoup(get(version_url).text, 'html.parser')`
			`latest = update.select_one('[class="Link--primary"]').string[1:]`
			`current = int(''.join(filter(str.isdigit, current)))`
			`latest = int(''.join(filter(str.isdigit, latest)))`
			`has_update = '' if current >= latest else latest`
			`except (exceptions.ConnectionError, AttributeError):`
			`# Ignore failures, assume current version is up to date`
			`has_update = ''`

			`return has_update`
Support proxying results through Whoogle (aka "anonymous view") (#682) * Expand `/window` endpoint to behave like a proxy The `/window` endpoint was previously used as a type of proxy, but only for removing Javascript from the result page. This expands the existing functionality to allow users to proxy search result pages (with or without Javascript) through their Whoogle instance. * Implement filtering of remote content from css * Condense NoJS feature into Anonymous View Enabling NoJS now removes Javascript from the Anonymous View, rather than creating a separate option. * Exclude 'data:' urls from filter, add translations The 'data:' url must be allowed in results to view certain elements on the page, such as stars for review based results. Add translations for the remaining languages. * Add cssutils to requirements 2022-04-13 20:29:07 +03:00

			`def get_abs_url(url, page_url):`
			`# Creates a valid absolute URL using a partial or relative URL`
			`if url.startswith('//'):`
			`return f'https:{url}'`
			`elif url.startswith('/'):`
			`return f'{urlparse(page_url).netloc}{url}'`
			`elif url.startswith('./'):`
			`return f'{page_url}{url[2:]}'`
			`return url`
Allow defining custom redirects with `WHOOGLE_REDIRECTS` Redirects to alternative frontends can now be defined using the WHOOGLE_REDIRECTS environment variable. Usage is documented in the readme, but is basically defined as <parent>:<new>. Closes #988 2023-05-19 21:15:15 +03:00

			`def list_to_dict(lst: list) -> dict:`
			`if len(lst) < 2:`
			`return {}`
			`return {lst[i].replace(' ', ''): lst[i+1].replace(' ', '')`
			`for i in range(0, len(lst), 2)}`
Redirect POST search -> enc GET request This should fix the annoyance with browsers like Firefox not caching POST request responses. By redirecting a POST search to be a GET request instead (with an encrypted query string), the page can be cached and successfully navigated back to after visiting a result. 2023-10-17 01:28:36 +03:00

			`def encrypt_string(key: bytes, string: str) -> str:`
			`cipher_suite = Fernet(key)`
			`return cipher_suite.encrypt(string.encode()).decode()`


			`def decrypt_string(key: bytes, string: str) -> str:`
			`cipher_suite = Fernet(g.session_key)`
			`return cipher_suite.decrypt(string.encode()).decode()`