Privacy-respecting alternatives in results view (#106)

Full implementation of social media alt redirects (twitter/youtube/instagram -> nitter/invidious/bibliogram), enabled via a new config setting.
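
Under the hood, the redirect is a plain substring swap over a domain-to-alternative map (the new get_site_alt helper in app/utils/filter_utils.py, shown in full below). A minimal sketch of that approach, with an illustrative input URL:

    SITE_ALTS = {
        'twitter.com': 'nitter.net',
        'youtube.com': 'invidio.us',
        'instagram.com': 'bibliogram.art/u'
    }

    def get_site_alt(link: str):
        # Swap the first matching site for its privacy-respecting alternative
        for site_key in SITE_ALTS.keys():
            if site_key not in link:
                continue
            link = link.replace(site_key, SITE_ALTS[site_key])
            break
        return link

    get_site_alt('https://twitter.com/whoogle')  # -> 'https://nitter.net/whoogle'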

Verbatim search and an option to ignore search autocorrect are now supported as well.
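
Both are handled at the query level: verbatim mode keys off Google's "li:1" tbs flag and wraps the query in double quotes, while autocorrect is skipped by forwarding the nfpr param (now whitelisted in VALID_PARAMS). A rough sketch of the two code paths, with illustrative helper names:

    def quote_if_verbatim(query_link: str, href: str) -> str:
        # "li:1" in the href implies the query should be interpreted verbatim
        if 'li:1' in href:
            query_link = '"' + query_link + '"'
        return query_link

    def set_nfpr(param_dict: dict, args: dict) -> None:
        # Forward nfpr (e.g. nfpr=1) so Google won't substitute an autocorrected query
        if 'nfpr' in args:
            param_dict['nfpr'] = '&nfpr=' + args.get('nfpr')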

Also cleaned up the JavaScript side of the Whoogle config so that it now uses arrays of the available config fields when parsing config values, instead of manually assigning each one to its own variable.

This doesn't include support for Google Maps -> OpenStreetMap, which seems a bit more involved than the social media redirects were, so it should likely be a separate effort.
Ben Busby 2020-07-26 11:53:59 -06:00 committed by GitHub
parent 3d7456f37b
commit 975ece8cd0
14 changed files with 138 additions and 85 deletions

View File

@@ -1,4 +1,4 @@
from app.utils.misc import generate_user_keys
from app.utils.session_utils import generate_user_keys
from flask import Flask
from flask_session import Session
import os

View File

@@ -1,56 +1,11 @@
from app.request import VALID_PARAMS
from app.utils.misc import BLACKLIST
from bs4 import BeautifulSoup
from app.utils.filter_utils import *
from bs4.element import ResultSet
from cryptography.fernet import Fernet
import re
import urllib.parse as urlparse
from urllib.parse import parse_qs
SKIP_ARGS = ['ref_src', 'utm']
FULL_RES_IMG = '<br/><a href="{}">Full Image</a>'
GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
LOGO_URL = GOOG_IMG + '_desk'
BLANK_B64 = '''
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkwAIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC
'''
def get_first_link(soup):
    # Replace hrefs with only the intended destination (no "utm" type tags)
    for a in soup.find_all('a', href=True):
        # Return the first search result URL
        if 'url?q=' in a['href']:
            return filter_link_args(a['href'])

def filter_link_args(query_link):
    parsed_link = urlparse.urlparse(query_link)
    link_args = parse_qs(parsed_link.query)
    safe_args = {}

    if len(link_args) == 0 and len(parsed_link) > 0:
        return query_link

    for arg in link_args.keys():
        if arg in SKIP_ARGS:
            continue

        safe_args[arg] = link_args[arg]

    # Remove original link query and replace with filtered args
    query_link = query_link.replace(parsed_link.query, '')
    if len(safe_args) > 0:
        query_link = query_link + urlparse.urlencode(safe_args, doseq=True)
    else:
        query_link = query_link.replace('?', '')

    return query_link

def has_ad_content(element: str):
    return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element

class Filter:
    def __init__(self, user_keys: dict, mobile=False, config=None):
@@ -61,6 +16,7 @@ class Filter:
        self.dark = config['dark'] if 'dark' in config else False
        self.nojs = config['nojs'] if 'nojs' in config else False
        self.new_tab = config['new_tab'] if 'new_tab' in config else False
        self.alt_redirect = config['alts'] if 'alts' in config else False
        self.mobile = mobile
        self.user_keys = user_keys
        self.main_divs = ResultSet('')
@@ -213,8 +169,12 @@ class Filter:
        query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''

        if query_link.startswith('/'):
            # Internal Google links (e.g. mail, maps) should still be forwarded to Google
            link['href'] = 'https://google.com' + query_link
        elif '/search?q=' in href:
            # "li:1" implies the query should be interpreted verbatim, so we wrap it in double quotes
            if 'li:1' in href:
                query_link = '"' + query_link + '"'

            new_search = '/search?q=' + self.encrypt_path(query_link)

            query_params = parse_qs(urlparse.urlparse(href).query)
@@ -232,11 +192,13 @@
        else:
            link['href'] = href

        # Replace link location if "alts" config is enabled
        if self.alt_redirect:
            # Search and replace all link descriptions with alternative location
            link['href'] = get_site_alt(link['href'])
            link_desc = link.find_all(text=re.compile('|'.join(SITE_ALTS.keys())))
            if len(link_desc) == 0:
                return

def gen_nojs(sibling):
    nojs_link = BeautifulSoup().new_tag('a')
    nojs_link['href'] = '/window?location=' + sibling['href']
    nojs_link['style'] = 'display:block;width:100%;'
    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
    sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
    sibling.append(nojs_link)

            # Replace link destination
            link_desc[0].replace_with(get_site_alt(link_desc[0]))

View File

@@ -306,6 +306,7 @@ class Config:
        self.dark = False
        self.nojs = False
        self.near = ''
        self.alts = False
        self.new_tab = False
        self.get_only = False

View File

@@ -12,7 +12,7 @@ MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'

# Valid query params
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source']
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source', 'nfpr']

def gen_user_agent(is_mobile):
@@ -68,6 +68,10 @@ def gen_query(query, args, config, near_city=None):
    else:
        param_dict['lr'] = ('&lr=' + config.lang_search) if config.lang_search else ''

    # Set autocorrected search ignore
    if 'nfpr' in args:
        param_dict['nfpr'] = '&nfpr=' + args.get('nfpr')

    param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else ''
    param_dict['hl'] = ('&hl=' + config.lang_interface.replace('lang_', '')) if config.lang_interface else ''
    param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off')

View File

@@ -15,7 +15,7 @@ from requests import exceptions
from app import app
from app.models.config import Config
from app.request import Request
from app.utils.misc import valid_user_session
from app.utils.session_utils import valid_user_session
from app.utils.routing_utils import *

View File

@@ -34,10 +34,10 @@ body {
    color: #685e79;
    border-radius: 10px 10px 0 0;
    max-width: 600px;
    background: rgba(0,0,0,0);
    background: rgba(0, 0, 0, 0);
}

#search-bar:focus{
#search-bar:focus {
    color: #685e79;
}
@@ -68,7 +68,7 @@ button::-moz-focus-inner {
.collapsible {
    outline: 0;
    background-color: rgba(0,0,0,0);
    background-color: rgba(0, 0, 0, 0);
    color: #685e79;
    cursor: pointer;
    padding: 18px;
@@ -129,3 +129,8 @@ footer {
    width: 100%;
    z-index: -1;
}

.info-text {
    font-style: italic;
    font-size: 12px;
}

View File

@@ -1,3 +1,13 @@
// Whoogle configurations that use boolean values and checkboxes
CONFIG_BOOLS = [
    "nojs", "dark", "safe", "alts", "new_tab", "get_only"
];

// Whoogle configurations that use string values and input fields
CONFIG_STRS = [
    "near", "url"
];

const setupSearchLayout = () => {
    // Setup search field
    const searchBar = document.getElementById("search-bar");
@@ -18,15 +28,6 @@ const setupSearchLayout = () => {
};

const fillConfigValues = () => {
    // Establish all config value elements
    const near = document.getElementById("config-near");
    const noJS = document.getElementById("config-nojs");
    const dark = document.getElementById("config-dark");
    const safe = document.getElementById("config-safe");
    const url = document.getElementById("config-url");
    const newTab = document.getElementById("config-new-tab");
    const getOnly = document.getElementById("config-get-only");

    // Request existing config info
    let xhrGET = new XMLHttpRequest();
    xhrGET.open("GET", "/config");
@@ -39,15 +40,15 @@ const fillConfigValues = () => {
        // Allow for updating/saving config values
        let configSettings = JSON.parse(xhrGET.responseText);

        near.value = configSettings["near"] ? configSettings["near"] : "";
        noJS.checked = !!configSettings["nojs"];
        dark.checked = !!configSettings["dark"];
        safe.checked = !!configSettings["safe"];
        getOnly.checked = !!configSettings["get_only"];
        newTab.checked = !!configSettings["new_tab"];
        CONFIG_STRS.forEach(function(item) {
            let configElement = document.getElementById("config-" + item.replace("_", "-"));
            configElement.value = configSettings[item] ? configSettings[item] : "";
        });

        // Addresses the issue of incorrect URL being used behind reverse proxy
        url.value = configSettings["url"] ? configSettings["url"] : "";
        CONFIG_BOOLS.forEach(function(item) {
            let configElement = document.getElementById("config-" + item.replace("_", "-"));
            configElement.checked = !!configSettings[item];
        });
    };

    xhrGET.send();

View File

@@ -96,6 +96,12 @@
            <label for="config-safe">Safe Search: </label>
            <input type="checkbox" name="safe" id="config-safe">
        </div>
        <div class="config-div">
            <label class="tooltip" for="config-alts">Replace Social Media Links: </label>
            <input type="checkbox" name="alts" id="config-alts">
            <div><span class="info-text"> — Replaces Twitter/YouTube/Instagram links
                with Nitter/Invidious/Bibliogram links.</span></div>
        </div>
        <div class="config-div">
            <label for="config-new-tab">Open Links in New Tab: </label>
            <input type="checkbox" name="new_tab" id="config-new-tab">

app/utils/filter_utils.py (new file, +79)
View File

@@ -0,0 +1,79 @@
from bs4 import BeautifulSoup
import urllib.parse as urlparse
from urllib.parse import parse_qs
SKIP_ARGS = ['ref_src', 'utm']
FULL_RES_IMG = '<br/><a href="{}">Full Image</a>'
GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
LOGO_URL = GOOG_IMG + '_desk'
BLANK_B64 = '''
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkwAIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC
'''
BLACKLIST = [
    'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고',
    'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam',
    'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés'
]

SITE_ALTS = {
    'twitter.com': 'nitter.net',
    'youtube.com': 'invidio.us',
    'instagram.com': 'bibliogram.art/u'
}

def has_ad_content(element: str):
    return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element

def get_first_link(soup):
    # Replace hrefs with only the intended destination (no "utm" type tags)
    for a in soup.find_all('a', href=True):
        # Return the first search result URL
        if 'url?q=' in a['href']:
            return filter_link_args(a['href'])

def get_site_alt(link: str):
    for site_key in SITE_ALTS.keys():
        if site_key not in link:
            continue

        link = link.replace(site_key, SITE_ALTS[site_key])
        break

    return link

def filter_link_args(query_link):
    parsed_link = urlparse.urlparse(query_link)
    link_args = parse_qs(parsed_link.query)
    safe_args = {}

    if len(link_args) == 0 and len(parsed_link) > 0:
        return query_link

    for arg in link_args.keys():
        if arg in SKIP_ARGS:
            continue

        safe_args[arg] = link_args[arg]

    # Remove original link query and replace with filtered args
    query_link = query_link.replace(parsed_link.query, '')
    if len(safe_args) > 0:
        query_link = query_link + urlparse.urlencode(safe_args, doseq=True)
    else:
        query_link = query_link.replace('?', '')

    return query_link

def gen_nojs(sibling):
    nojs_link = BeautifulSoup().new_tag('a')
    nojs_link['href'] = '/window?location=' + sibling['href']
    nojs_link['style'] = 'display:block;width:100%;'
    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
    sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
    sibling.append(nojs_link)

View File

@@ -1,5 +1,5 @@
from app.filter import Filter, get_first_link
from app.utils.misc import generate_user_keys
from app.utils.session_utils import generate_user_keys
from app.request import gen_query
from bs4 import BeautifulSoup
from cryptography.fernet import Fernet, InvalidToken

View File

@@ -2,11 +2,6 @@ from cryptography.fernet import Fernet
from flask import current_app as app

REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys']

BLACKLIST = [
    'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고',
    'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam',
    'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés'
]
def generate_user_keys(cookies_disabled=False) -> dict:

View File

@@ -1,5 +1,5 @@
from app import app
from app.utils.misc import generate_user_keys
from app.utils.session_utils import generate_user_keys
import pytest

View File

@@ -1,4 +1,4 @@
from app.utils.misc import generate_user_keys, valid_user_session
from app.utils.session_utils import generate_user_keys, valid_user_session
def test_generate_user_keys():

View File

@@ -1,6 +1,6 @@
from bs4 import BeautifulSoup
from app.filter import Filter
from app.utils.misc import generate_user_keys
from app.utils.session_utils import generate_user_keys
from datetime import datetime
from dateutil.parser import *