From 975ece8cd00d08add819d6a98fbe9291b3597f8d Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Sun, 26 Jul 2020 11:53:59 -0600 Subject: [PATCH] Privacy respecting alternatives in results view (#106) Full implementation of social media alt redirects (twitter/youtube/instagram -> nitter/invidious/bibliogram) depending on configuration. Verbatim search and option to ignore search autocorrect are now supported as well. Also cleaned up the javascript side of whoogle config so that it now uses arrays of available fields for parsing config values instead of manually assigning each one to a variable. This doesn't include support for Google Maps -> Open Street Maps, that seems a bit more involved than the social media redirects were, so it should likely be a separate effort. --- app/__init__.py | 2 +- app/filter.py | 68 +++++---------------- app/models/config.py | 1 + app/request.py | 6 +- app/routes.py | 2 +- app/static/css/main.css | 11 +++- app/static/js/controller.js | 35 +++++------ app/templates/index.html | 6 ++ app/utils/filter_utils.py | 79 +++++++++++++++++++++++++ app/utils/routing_utils.py | 2 +- app/utils/{misc.py => session_utils.py} | 5 -- test/conftest.py | 2 +- test/test_misc.py | 2 +- test/test_results.py | 2 +- 14 files changed, 138 insertions(+), 85 deletions(-) create mode 100644 app/utils/filter_utils.py rename app/utils/{misc.py => session_utils.py} (62%) diff --git a/app/__init__.py b/app/__init__.py index 22e436d..f21d4b4 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,4 +1,4 @@ -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys from flask import Flask from flask_session import Session import os diff --git a/app/filter.py b/app/filter.py index 1cc9f87..41a5cef 100644 --- a/app/filter.py +++ b/app/filter.py @@ -1,56 +1,11 @@ from app.request import VALID_PARAMS -from app.utils.misc import BLACKLIST -from bs4 import BeautifulSoup +from app.utils.filter_utils import * from bs4.element import ResultSet from cryptography.fernet import Fernet import re import urllib.parse as urlparse from urllib.parse import parse_qs -SKIP_ARGS = ['ref_src', 'utm'] -FULL_RES_IMG = '
Full Image' -GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo' -LOGO_URL = GOOG_IMG + '_desk' -BLANK_B64 = ''' -data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkwAIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC -''' - - -def get_first_link(soup): - # Replace hrefs with only the intended destination (no "utm" type tags) - for a in soup.find_all('a', href=True): - # Return the first search result URL - if 'url?q=' in a['href']: - return filter_link_args(a['href']) - - -def filter_link_args(query_link): - parsed_link = urlparse.urlparse(query_link) - link_args = parse_qs(parsed_link.query) - safe_args = {} - - if len(link_args) == 0 and len(parsed_link) > 0: - return query_link - - for arg in link_args.keys(): - if arg in SKIP_ARGS: - continue - - safe_args[arg] = link_args[arg] - - # Remove original link query and replace with filtered args - query_link = query_link.replace(parsed_link.query, '') - if len(safe_args) > 0: - query_link = query_link + urlparse.urlencode(safe_args, doseq=True) - else: - query_link = query_link.replace('?', '') - - return query_link - - -def has_ad_content(element: str): - return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element - class Filter: def __init__(self, user_keys: dict, mobile=False, config=None): @@ -61,6 +16,7 @@ class Filter: self.dark = config['dark'] if 'dark' in config else False self.nojs = config['nojs'] if 'nojs' in config else False self.new_tab = config['new_tab'] if 'new_tab' in config else False + self.alt_redirect = config['alts'] if 'alts' in config else False self.mobile = mobile self.user_keys = user_keys self.main_divs = ResultSet('') @@ -213,8 +169,12 @@ class Filter: query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else '' if query_link.startswith('/'): + # Internal google links (i.e. mail, maps, etc) should still be forwarded to Google link['href'] = 'https://google.com' + query_link elif '/search?q=' in href: + # "li:1" implies the query should be interpreted verbatim, so we wrap it in double quotes + if 'li:1' in href: + query_link = '"' + query_link + '"' new_search = '/search?q=' + self.encrypt_path(query_link) query_params = parse_qs(urlparse.urlparse(href).query) @@ -232,11 +192,13 @@ class Filter: else: link['href'] = href + # Replace link location if "alts" config is enabled + if self.alt_redirect: + # Search and replace all link descriptions with alternative location + link['href'] = get_site_alt(link['href']) + link_desc = link.find_all(text=re.compile('|'.join(SITE_ALTS.keys()))) + if len(link_desc) == 0: + return -def gen_nojs(sibling): - nojs_link = BeautifulSoup().new_tag('a') - nojs_link['href'] = '/window?location=' + sibling['href'] - nojs_link['style'] = 'display:block;width:100%;' - nojs_link.string = 'NoJS Link: ' + nojs_link['href'] - sibling.append(BeautifulSoup('


', 'html.parser')) - sibling.append(nojs_link) + # Replace link destination + link_desc[0].replace_with(get_site_alt(link_desc[0])) diff --git a/app/models/config.py b/app/models/config.py index 45b1b65..d261cd3 100644 --- a/app/models/config.py +++ b/app/models/config.py @@ -306,6 +306,7 @@ class Config: self.dark = False self.nojs = False self.near = '' + self.alts = False self.new_tab = False self.get_only = False diff --git a/app/request.py b/app/request.py index 192eedc..4abb9b3 100644 --- a/app/request.py +++ b/app/request.py @@ -12,7 +12,7 @@ MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0' DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' # Valid query params -VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source'] +VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source', 'nfpr'] def gen_user_agent(is_mobile): @@ -68,6 +68,10 @@ def gen_query(query, args, config, near_city=None): else: param_dict['lr'] = ('&lr=' + config.lang_search) if config.lang_search else '' + # Set autocorrected search ignore + if 'nfpr' in args: + param_dict['nfpr'] = '&nfpr=' + args.get('nfpr') + param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else '' param_dict['hl'] = ('&hl=' + config.lang_interface.replace('lang_', '')) if config.lang_interface else '' param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off') diff --git a/app/routes.py b/app/routes.py index 7f1869c..fd6278d 100644 --- a/app/routes.py +++ b/app/routes.py @@ -15,7 +15,7 @@ from requests import exceptions from app import app from app.models.config import Config from app.request import Request -from app.utils.misc import valid_user_session +from app.utils.session_utils import valid_user_session from app.utils.routing_utils import * diff --git a/app/static/css/main.css b/app/static/css/main.css index ef4b557..34458f6 100644 --- a/app/static/css/main.css +++ b/app/static/css/main.css @@ -34,10 +34,10 @@ body { color: #685e79; border-radius: 10px 10px 0 0; max-width: 600px; - background: rgba(0,0,0,0); + background: rgba(0, 0, 0, 0); } -#search-bar:focus{ +#search-bar:focus { color: #685e79; } @@ -68,7 +68,7 @@ button::-moz-focus-inner { .collapsible { outline: 0; - background-color: rgba(0,0,0,0); + background-color: rgba(0, 0, 0, 0); color: #685e79; cursor: pointer; padding: 18px; @@ -129,3 +129,8 @@ footer { width: 100%; z-index: -1; } + +.info-text { + font-style: italic; + font-size: 12px; +} \ No newline at end of file diff --git a/app/static/js/controller.js b/app/static/js/controller.js index 95d917b..1035ff9 100644 --- a/app/static/js/controller.js +++ b/app/static/js/controller.js @@ -1,3 +1,13 @@ +// Whoogle configurations that use boolean values and checkboxes +CONFIG_BOOLS = [ + "nojs", "dark", "safe", "alts", "new_tab", "get_only" +]; + +// Whoogle configurations that use string values and input fields +CONFIG_STRS = [ + "near", "url" +]; + const setupSearchLayout = () => { // Setup search field const searchBar = document.getElementById("search-bar"); @@ -18,15 +28,6 @@ const setupSearchLayout = () => { }; const fillConfigValues = () => { - // Establish all config value elements - const near = document.getElementById("config-near"); - const noJS = document.getElementById("config-nojs"); - const dark = document.getElementById("config-dark"); - const safe = document.getElementById("config-safe"); - const url = document.getElementById("config-url"); - const newTab = document.getElementById("config-new-tab"); - const getOnly = document.getElementById("config-get-only"); - // Request existing config info let xhrGET = new XMLHttpRequest(); xhrGET.open("GET", "/config"); @@ -39,15 +40,15 @@ const fillConfigValues = () => { // Allow for updating/saving config values let configSettings = JSON.parse(xhrGET.responseText); - near.value = configSettings["near"] ? configSettings["near"] : ""; - noJS.checked = !!configSettings["nojs"]; - dark.checked = !!configSettings["dark"]; - safe.checked = !!configSettings["safe"]; - getOnly.checked = !!configSettings["get_only"]; - newTab.checked = !!configSettings["new_tab"]; + CONFIG_STRS.forEach(function(item) { + let configElement = document.getElementById("config-" + item.replace("_", "-")); + configElement.value = configSettings[item] ? configSettings[item] : ""; + }); - // Addresses the issue of incorrect URL being used behind reverse proxy - url.value = configSettings["url"] ? configSettings["url"] : ""; + CONFIG_BOOLS.forEach(function(item) { + let configElement = document.getElementById("config-" + item.replace("_", "-")); + configElement.checked = !!configSettings[item]; + }); }; xhrGET.send(); diff --git a/app/templates/index.html b/app/templates/index.html index a541413..dd89e32 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -96,6 +96,12 @@ +
+ + +
— Replaces Twitter/YouTube/Instagram links + with Nitter/Invidious/Bibliogram links.
+
diff --git a/app/utils/filter_utils.py b/app/utils/filter_utils.py new file mode 100644 index 0000000..ed05d76 --- /dev/null +++ b/app/utils/filter_utils.py @@ -0,0 +1,79 @@ +from bs4 import BeautifulSoup +import urllib.parse as urlparse +from urllib.parse import parse_qs + +SKIP_ARGS = ['ref_src', 'utm'] +FULL_RES_IMG = '
Full Image' +GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo' +LOGO_URL = GOOG_IMG + '_desk' +BLANK_B64 = ''' +data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkwAIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC +''' + +BLACKLIST = [ + 'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고', + 'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam', + 'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés' +] + +SITE_ALTS = { + 'twitter.com': 'nitter.net', + 'youtube.com': 'invidio.us', + 'instagram.com': 'bibliogram.art/u' +} + + +def has_ad_content(element: str): + return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element + + +def get_first_link(soup): + # Replace hrefs with only the intended destination (no "utm" type tags) + for a in soup.find_all('a', href=True): + # Return the first search result URL + if 'url?q=' in a['href']: + return filter_link_args(a['href']) + + +def get_site_alt(link: str): + for site_key in SITE_ALTS.keys(): + if site_key not in link: + continue + + link = link.replace(site_key, SITE_ALTS[site_key]) + break + + return link + + +def filter_link_args(query_link): + parsed_link = urlparse.urlparse(query_link) + link_args = parse_qs(parsed_link.query) + safe_args = {} + + if len(link_args) == 0 and len(parsed_link) > 0: + return query_link + + for arg in link_args.keys(): + if arg in SKIP_ARGS: + continue + + safe_args[arg] = link_args[arg] + + # Remove original link query and replace with filtered args + query_link = query_link.replace(parsed_link.query, '') + if len(safe_args) > 0: + query_link = query_link + urlparse.urlencode(safe_args, doseq=True) + else: + query_link = query_link.replace('?', '') + + return query_link + + +def gen_nojs(sibling): + nojs_link = BeautifulSoup().new_tag('a') + nojs_link['href'] = '/window?location=' + sibling['href'] + nojs_link['style'] = 'display:block;width:100%;' + nojs_link.string = 'NoJS Link: ' + nojs_link['href'] + sibling.append(BeautifulSoup('


', 'html.parser')) + sibling.append(nojs_link) \ No newline at end of file diff --git a/app/utils/routing_utils.py b/app/utils/routing_utils.py index 40f8a90..2a649b4 100644 --- a/app/utils/routing_utils.py +++ b/app/utils/routing_utils.py @@ -1,5 +1,5 @@ from app.filter import Filter, get_first_link -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys from app.request import gen_query from bs4 import BeautifulSoup from cryptography.fernet import Fernet, InvalidToken diff --git a/app/utils/misc.py b/app/utils/session_utils.py similarity index 62% rename from app/utils/misc.py rename to app/utils/session_utils.py index b87941d..f959abe 100644 --- a/app/utils/misc.py +++ b/app/utils/session_utils.py @@ -2,11 +2,6 @@ from cryptography.fernet import Fernet from flask import current_app as app REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys'] -BLACKLIST = [ - 'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고', - 'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam', - 'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés' -] def generate_user_keys(cookies_disabled=False) -> dict: diff --git a/test/conftest.py b/test/conftest.py index 63aec3e..7a15f00 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,5 +1,5 @@ from app import app -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys import pytest diff --git a/test/test_misc.py b/test/test_misc.py index 8eb1d78..92fcadb 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -1,4 +1,4 @@ -from app.utils.misc import generate_user_keys, valid_user_session +from app.utils.session_utils import generate_user_keys, valid_user_session def test_generate_user_keys(): diff --git a/test/test_results.py b/test/test_results.py index 463a355..a7aa771 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup from app.filter import Filter -from app.utils.misc import generate_user_keys +from app.utils.session_utils import generate_user_keys from datetime import datetime from dateutil.parser import *