# whoogle-search/app/utils/misc.py

from bs4 import BeautifulSoup as bsoup
from flask import Request
import hashlib
import os
import re
from requests import exceptions, get
from urllib.parse import urlparse


def gen_file_hash(path: str, static_file: str) -> str:
    """Build a file name that embeds a short hash of the file's contents.

    Reads ``static_file`` from the ``path`` directory and inserts the first
    8 hex characters of the MD5 digest of its contents between the file's
    base name and its extension, e.g. ``main.css`` -> ``main.1a2b3c4d.css``.

    Args:
        path: Directory containing the file.
        static_file: File name, relative to ``path``.

    Returns:
        ``static_file`` with ``.<hash>`` inserted before its extension.
    """
    # Context manager so the handle is closed as soon as the read finishes
    # instead of whenever the garbage collector gets to it
    with open(os.path.join(path, static_file), 'rb') as f:
        file_hash = hashlib.md5(f.read()).hexdigest()[:8]

    name, ext = os.path.splitext(static_file)
    return f'{name}.{file_hash}{ext}'


def read_config_bool(var: str) -> bool:
    """Interpret the environment variable ``var`` as a boolean flag.

    The value is lower-cased and checked against the accepted "true"
    spellings ('true', 't', '1', 'yes', 'y'), so upper/mixed case variants
    work as well. An unset variable is read as '0' and therefore, like any
    other unlisted value, yields False.
    """
    truthy_values = ('true', 't', '1', 'yes', 'y')
    raw_value = os.getenv(var, '0')
    return raw_value.lower() in truthy_values


def get_client_ip(r: Request) -> str:
    """Return the client address recorded in the request's ``environ``.

    The ``HTTP_X_FORWARDED_FOR`` entry is returned unmodified whenever it is
    present (not None); otherwise the ``REMOTE_ADDR`` entry is returned.
    """
    forwarded_for = r.environ.get('HTTP_X_FORWARDED_FOR')
    if forwarded_for is not None:
        return forwarded_for
    return r.environ['REMOTE_ADDR']


def get_request_url(url: str) -> str:
    """Switch a URL's scheme to HTTPS when ``HTTPS_ONLY`` is set.

    Args:
        url: The URL to (possibly) upgrade.

    Returns:
        ``url`` with a leading ``http://`` replaced by ``https://`` if the
        ``HTTPS_ONLY`` environment variable is set to a non-empty value;
        otherwise ``url`` unchanged.
    """
    # NOTE: any non-empty HTTPS_ONLY value enables this, including '0' or
    # 'false' -- the variable is checked for presence, not parsed as a bool.
    https_only = os.getenv('HTTPS_ONLY', False)

    # Only rewrite the scheme itself. A plain str.replace() would also hit
    # the first 'http://' embedded later in the URL (e.g. inside a query
    # string) when the URL already starts with 'https://'.
    if https_only and url.startswith('http://'):
        return 'https://' + url[len('http://'):]

    return url


def get_proxy_host_url(r: Request, default: str, root=False) -> str:
    """Rebuild the request URL from ``X-Forwarded-*`` headers.

    Args:
        r: The incoming request; its ``headers`` and ``full_path`` are read.
        default: Value returned when no ``X-Forwarded-Host`` header is set.
        root: If True, omit the request path and return only
            ``scheme://host[/prefix]``.

    Returns:
        ``{scheme}://{host}{prefix}{path}`` where the scheme comes from
        ``X-Forwarded-Proto`` (``https`` if absent), the host from
        ``X-Forwarded-Host`` and the prefix from the ``WHOOGLE_URL_PREFIX``
        environment variable (reduced to its alphanumeric characters), or
        ``default`` if ``X-Forwarded-Host`` is missing or empty.
    """
    scheme = r.headers.get('X-Forwarded-Proto', 'https')
    http_host = r.headers.get('X-Forwarded-Host')
    if not http_host:
        return default

    full_path = '' if root else r.full_path
    # Make sure a non-empty path is separated from the host/prefix by
    # exactly one slash. The check used to be inverted, which doubled the
    # slash for paths that already had one ('host//search?q=...').
    if full_path and not full_path.startswith('/'):
        full_path = f'/{full_path}'

    prefix = os.environ.get('WHOOGLE_URL_PREFIX', '')
    if prefix:
        prefix = f'/{re.sub("[^0-9a-zA-Z]+", "", prefix)}'

    return f'{scheme}://{http_host}{prefix}{full_path}'


def check_for_update(version_url: str, current: str) -> int:
    """Check whether a release newer than ``current`` is published.

    Fetches ``version_url``, reads the version text from the first element
    matching ``[class="Link--primary"]`` (dropping its first character) and
    compares it with ``current``.

    Args:
        version_url: URL of the page listing the latest release.
        current: Version string of the running instance.

    Returns:
        The digits of the latest version joined into a single int when it is
        newer than ``current``; otherwise the empty string ``''`` (note that
        this does not match the ``int`` annotation, which is kept as-is for
        callers). Any failure to fetch or parse a version is treated as "no
        update available".
    """
    def version_parts(version: str) -> tuple:
        # Compare components numerically: joining all digits into one
        # number misorders versions whose components differ in length
        # (e.g. 0.8.10 -> 810 would rank above 0.9.0 -> 90).
        return tuple(int(part) for part in re.findall(r'\d+', version))

    try:
        # The timeout keeps an unresponsive host from blocking the caller
        # indefinitely
        update = bsoup(get(version_url, timeout=10).text, 'html.parser')
        latest = update.select_one('[class="Link--primary"]').string[1:]

        current_parts = version_parts(current)
        latest_parts = version_parts(latest)
        if not current_parts or not latest_parts:
            # Nothing comparable was found in one of the version strings
            return ''
        if current_parts >= latest_parts:
            return ''

        return int(''.join(filter(str.isdigit, latest)))
    except (exceptions.RequestException, AttributeError, TypeError,
            ValueError):
        # Ignore failures (network errors, missing element, unparsable
        # text) and assume the current version is up to date
        return ''


def get_abs_url(url, page_url):
    """Create an absolute URL from a partial or relative URL.

    Args:
        url: The URL found on the page. Handled forms are protocol-relative
            (``//host/path``), root-relative (``/path``) and
            current-directory-relative (``./path``).
        page_url: URL of the page that ``url`` was found on.

    Returns:
        - ``//...``: ``url`` prefixed with ``https:``.
        - ``/...``: ``url`` appended to the scheme and host of ``page_url``
          (``https`` when ``page_url`` has a host but no scheme); unchanged
          when no host can be parsed from ``page_url``.
        - ``./...``: ``page_url`` followed by ``url`` without its ``./``.
        - anything else: ``url`` unchanged.
    """
    if url.startswith('//'):
        return f'https:{url}'

    if url.startswith('/'):
        page = urlparse(page_url)
        if not page.netloc:
            # No host to resolve against; leave the URL as it was
            return url
        # Include the scheme -- host + path alone is not an absolute URL
        # and cannot be fetched as one
        scheme = page.scheme or 'https'
        return f'{scheme}://{page.netloc}{url}'

    if url.startswith('./'):
        return f'{page_url}{url[2:]}'

    return url


def list_to_dict(lst: list) -> dict:
    """Convert a flat ``[key, value, key, value, ...]`` list into a dict.

    All spaces are removed from both keys and values. A list with fewer
    than two items yields an empty dict, and a trailing item without a
    matching value is ignored.

    Args:
        lst: Flat list of alternating key and value strings.

    Returns:
        A dict mapping each space-stripped key to its space-stripped value.
    """
    # zip() stops at the shorter slice, so an odd-length list simply drops
    # its last unpaired key instead of raising IndexError
    keys = lst[::2]
    values = lst[1::2]
    return {key.replace(' ', ''): value.replace(' ', '')
            for key, value in zip(keys, values)}
Check for updates using 24 hour time delta Rather than only checking for an available update on app init, the check for updates now performs the check once every 24 hours on the first request sent after that period. This also now catches the requests.exceptions.ConnectionError that is thrown if the app is initialized without an active internet connection. Fixes #649 2022-02-14 22:19:02 +03:00			`from bs4 import BeautifulSoup as bsoup`
Expand 'my ip' to work for proxied requests Adds a check for the HTTP_X_FORWARDED_FOR header, and uses the value from the request if found. 2021-10-29 06:31:24 +03:00			`from flask import Request`
Use cache busting for css/js files On app init, short hashes are generated from file checksums to use for cache busting. These hashes are added into the full file name and used to symlink to the actual file contents. These symlinks are loaded in the jinja templates for each page, and can tell the browser to load a new file if the hash changes. This is only in place for css and js files, but can be extended in the future for other file types if needed. 2021-07-01 02:00:01 +03:00			`import hashlib`
			`import os`
Include url prefix for reverse proxied instances The url prefix was not included when reconstructing the root url using X-Forwarded-* headers, causing some elements to fail to load properly. Fixes #937 2023-01-30 22:13:46 +03:00			`import re`
Check for updates using 24 hour time delta Rather than only checking for an available update on app init, the check for updates now performs the check once every 24 hours on the first request sent after that period. This also now catches the requests.exceptions.ConnectionError that is thrown if the app is initialized without an active internet connection. Fixes #649 2022-02-14 22:19:02 +03:00			`from requests import exceptions, get`
Support proxying results through Whoogle (aka "anonymous view") (#682) * Expand `/window` endpoint to behave like a proxy The `/window` endpoint was previously used as a type of proxy, but only for removing Javascript from the result page. This expands the existing functionality to allow users to proxy search result pages (with or without Javascript) through their Whoogle instance. * Implement filtering of remote content from css * Condense NoJS feature into Anonymous View Enabling NoJS now removes Javascript from the Anonymous View, rather than creating a separate option. * Exclude 'data:' urls from filter, add translations The 'data:' url must be allowed in results to view certain elements on the page, such as stars for review based results. Add translations for the remaining languages. * Add cssutils to requirements 2022-04-13 20:29:07 +03:00			`from urllib.parse import urlparse`
Use cache busting for css/js files On app init, short hashes are generated from file checksums to use for cache busting. These hashes are added into the full file name and used to symlink to the actual file contents. These symlinks are loaded in the jinja templates for each page, and can tell the browser to load a new file if the hash changes. This is only in place for css and js files, but can be extended in the future for other file types if needed. 2021-07-01 02:00:01 +03:00

			`def gen_file_hash(path: str, static_file: str) -> str:`
			`file_contents = open(os.path.join(path, static_file), 'rb').read()`
			`file_hash = hashlib.md5(file_contents).hexdigest()[:8]`
			`filename_split = os.path.splitext(static_file)`

			`return filename_split[0] + '.' + file_hash + filename_split[-1]`
Disable autocomplete via WHOOGLE_AUTOCOMPLETE var Setting WHOOGLE_AUTOCOMPLETE to 0 now disables the autocomplete/search suggestion feature. Closes #462 2021-10-15 03:58:13 +03:00

			`def read_config_bool(var: str) -> bool:`
			`val = os.getenv(var, '0')`
Allow different `true` values for config vars (#841) * Fixes read_config_bool to allow several true params * add upper case comment 2022-09-07 21:54:43 +03:00			`# user can specify one of the following values as 'true' inputs (all`
			`# variants with upper case letters will also work):`
			`# ('true', 't', '1', 'yes', 'y')`
			`val = val.lower() in ('true', 't', '1', 'yes', 'y')`
			`return val`
Expand 'my ip' to work for proxied requests Adds a check for the HTTP_X_FORWARDED_FOR header, and uses the value from the request if found. 2021-10-29 06:31:24 +03:00

			`def get_client_ip(r: Request) -> str:`
			`if r.environ.get('HTTP_X_FORWARDED_FOR') is None:`
			`return r.environ['REMOTE_ADDR']`
			`else:`
			`return r.environ['HTTP_X_FORWARDED_FOR']`
Fix incorrect redirect protocol used by Flask Flask's `request.url` uses `http` as the protocol, which breaks instances that enforce `https`, since the session redirect relies on `request.url` for the follow-through URL. This introduces a new method for determining the correct URL to use for these redirects by automatically replacing the protocol with `https` if the `HTTPS_ONLY` env var is set for that instance. Fixes #538 Fixes #545 2021-11-22 09:21:04 +03:00

			`def get_request_url(url: str) -> str:`
			`if os.getenv('HTTPS_ONLY', False):`
			`return url.replace('http://', 'https://', 1)`

			`return url`
Check for updates using 24 hour time delta Rather than only checking for an available update on app init, the check for updates now performs the check once every 24 hours on the first request sent after that period. This also now catches the requests.exceptions.ConnectionError that is thrown if the app is initialized without an active internet connection. Fixes #649 2022-02-14 22:19:02 +03:00

Include full path when determining proxy host url Session validation includes a method for determining the proxy host url, but previously did not include the path for the initial request. This caused a situation where users with a new session would not be able to complete their first search, since the session validation follow-through url did not include the actual path for their search query. The method now includes a flag for only extracting the root url, which is needed for creating full urls in the content filter. Fixes #708 2022-08-02 19:55:45 +03:00			`def get_proxy_host_url(r: Request, default: str, root=False) -> str:`
			`scheme = r.headers.get('X-Forwarded-Proto', 'https')`
Use X-Forwarded-Host as url_root when present (#799) If Whoogle is accessed on a non-standard port _and_ proxied, this port is lost to the application and `element['src']`s are incorrectly formed (omitting port). HTTP x-Forwarded-Host will contain this front port number in a typical Nginx reverse proxy configuration. 2022-07-05 19:01:47 +03:00			`http_host = r.headers.get('X-Forwarded-Host')`
Include url prefix for reverse proxied instances The url prefix was not included when reconstructing the root url using X-Forwarded-* headers, causing some elements to fail to load properly. Fixes #937 2023-01-30 22:13:46 +03:00
			`full_path = r.full_path if not root else ''`
			`if full_path.startswith('/'):`
			`full_path = f'/{full_path}'`

Use X-Forwarded-Host as url_root when present (#799) If Whoogle is accessed on a non-standard port _and_ proxied, this port is lost to the application and `element['src']`s are incorrectly formed (omitting port). HTTP x-Forwarded-Host will contain this front port number in a typical Nginx reverse proxy configuration. 2022-07-05 19:01:47 +03:00			`if http_host:`
Include url prefix for reverse proxied instances The url prefix was not included when reconstructing the root url using X-Forwarded-* headers, causing some elements to fail to load properly. Fixes #937 2023-01-30 22:13:46 +03:00			`prefix = os.environ.get('WHOOGLE_URL_PREFIX', '')`
			`if prefix:`
			`prefix = f'/{re.sub("[^0-9a-zA-Z]+", "", prefix)}'`
			`return f'{scheme}://{http_host}{prefix}{full_path}'`
Use X-Forwarded-Host as url_root when present (#799) If Whoogle is accessed on a non-standard port _and_ proxied, this port is lost to the application and `element['src']`s are incorrectly formed (omitting port). HTTP x-Forwarded-Host will contain this front port number in a typical Nginx reverse proxy configuration. 2022-07-05 19:01:47 +03:00
			`return default`


Check for updates using 24 hour time delta Rather than only checking for an available update on app init, the check for updates now performs the check once every 24 hours on the first request sent after that period. This also now catches the requests.exceptions.ConnectionError that is thrown if the app is initialized without an active internet connection. Fixes #649 2022-02-14 22:19:02 +03:00			`def check_for_update(version_url: str, current: str) -> int:`
			`# Check for the latest version of Whoogle`
			`try:`
			`update = bsoup(get(version_url).text, 'html.parser')`
			`latest = update.select_one('[class="Link--primary"]').string[1:]`
			`current = int(''.join(filter(str.isdigit, current)))`
			`latest = int(''.join(filter(str.isdigit, latest)))`
			`has_update = '' if current >= latest else latest`
			`except (exceptions.ConnectionError, AttributeError):`
			`# Ignore failures, assume current version is up to date`
			`has_update = ''`

			`return has_update`
Support proxying results through Whoogle (aka "anonymous view") (#682) * Expand `/window` endpoint to behave like a proxy The `/window` endpoint was previously used as a type of proxy, but only for removing Javascript from the result page. This expands the existing functionality to allow users to proxy search result pages (with or without Javascript) through their Whoogle instance. * Implement filtering of remote content from css * Condense NoJS feature into Anonymous View Enabling NoJS now removes Javascript from the Anonymous View, rather than creating a separate option. * Exclude 'data:' urls from filter, add translations The 'data:' url must be allowed in results to view certain elements on the page, such as stars for review based results. Add translations for the remaining languages. * Add cssutils to requirements 2022-04-13 20:29:07 +03:00

			`def get_abs_url(url, page_url):`
			`# Creates a valid absolute URL using a partial or relative URL`
			`if url.startswith('//'):`
			`return f'https:{url}'`
			`elif url.startswith('/'):`
			`return f'{urlparse(page_url).netloc}{url}'`
			`elif url.startswith('./'):`
			`return f'{page_url}{url[2:]}'`
			`return url`
Allow defining custom redirects with `WHOOGLE_REDIRECTS` Redirects to alternative frontends can now be defined using the WHOOGLE_REDIRECTS environment variable. Usage is documented in the readme, but is basically defined as <parent>:<new>. Closes #988 2023-05-19 21:15:15 +03:00

			`def list_to_dict(lst: list) -> dict:`
			`if len(lst) < 2:`
			`return {}`
			`return {lst[i].replace(' ', ''): lst[i+1].replace(' ', '')`
			`for i in range(0, len(lst), 2)}`