Privacy-respecting alternatives in results view (#106)

Full implementation of social media alt redirects (twitter/youtube/instagram -> nitter/invidious/bibliogram), enabled via a new config setting.
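
Under the hood, the redirect is a plain substring swap over a domain-to-alternative map (the new get_site_alt helper in app/utils/filter_utils.py, shown in full below). A minimal sketch of that approach, with an illustrative input URL:

    SITE_ALTS = {
        'twitter.com': 'nitter.net',
        'youtube.com': 'invidio.us',
        'instagram.com': 'bibliogram.art/u'
    }

    def get_site_alt(link: str):
        # Swap the first matching site for its privacy-respecting alternative
        for site_key in SITE_ALTS.keys():
            if site_key not in link:
                continue
            link = link.replace(site_key, SITE_ALTS[site_key])
            break
        return link

    get_site_alt('https://twitter.com/whoogle')  # -> 'https://nitter.net/whoogle'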

Verbatim search and an option to ignore search autocorrect are now supported as well.
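
Both are handled at the query level: verbatim mode keys off Google's "li:1" tbs flag and wraps the query in double quotes, while autocorrect is skipped by forwarding the nfpr param (now whitelisted in VALID_PARAMS). A rough sketch of the two code paths, with illustrative helper names:

    def quote_if_verbatim(query_link: str, href: str) -> str:
        # "li:1" in the href implies the query should be interpreted verbatim
        if 'li:1' in href:
            query_link = '"' + query_link + '"'
        return query_link

    def set_nfpr(param_dict: dict, args: dict) -> None:
        # Forward nfpr (e.g. nfpr=1) so Google won't substitute an autocorrected query
        if 'nfpr' in args:
            param_dict['nfpr'] = '&nfpr=' + args.get('nfpr')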

Also cleaned up the JavaScript side of the Whoogle config so that it now uses arrays of the available config fields when parsing config values, instead of manually assigning each one to its own variable.

This doesn't include support for Google Maps -> OpenStreetMap, which seems a bit more involved than the social media redirects were, so it should likely be a separate effort.
Ben Busby 2020-07-26 11:53:59 -06:00 committed by GitHub
parent 3d7456f37b
commit 975ece8cd0
14 changed files with 138 additions and 85 deletions

View File

@@ -1,4 +1,4 @@
from app.utils.misc import generate_user_keys
from app.utils.session_utils import generate_user_keys
from flask import Flask
from flask_session import Session
import os

View File

@@ -1,56 +1,11 @@
from app.request import VALID_PARAMS
from app.utils.misc import BLACKLIST
from bs4 import BeautifulSoup
from app.utils.filter_utils import *
from bs4.element import ResultSet
from cryptography.fernet import Fernet
import re
import urllib.parse as urlparse
from urllib.parse import parse_qs
SKIP_ARGS = ['ref_src', 'utm']
FULL_RES_IMG = '<br/><a href="{}">Full Image</a>'
GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
LOGO_URL = GOOG_IMG + '_desk'
BLANK_B64 = '''
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkwAIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC
'''
def get_first_link(soup):
    # Replace hrefs with only the intended destination (no "utm" type tags)
    for a in soup.find_all('a', href=True):
        # Return the first search result URL
        if 'url?q=' in a['href']:
            return filter_link_args(a['href'])

def filter_link_args(query_link):
    parsed_link = urlparse.urlparse(query_link)
    link_args = parse_qs(parsed_link.query)
    safe_args = {}

    if len(link_args) == 0 and len(parsed_link) > 0:
        return query_link

    for arg in link_args.keys():
        if arg in SKIP_ARGS:
            continue

        safe_args[arg] = link_args[arg]

    # Remove original link query and replace with filtered args
    query_link = query_link.replace(parsed_link.query, '')
    if len(safe_args) > 0:
        query_link = query_link + urlparse.urlencode(safe_args, doseq=True)
    else:
        query_link = query_link.replace('?', '')

    return query_link

def has_ad_content(element: str):
    return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element

class Filter:
    def __init__(self, user_keys: dict, mobile=False, config=None):
@@ -61,6 +16,7 @@ class Filter:
        self.dark = config['dark'] if 'dark' in config else False
        self.nojs = config['nojs'] if 'nojs' in config else False
        self.new_tab = config['new_tab'] if 'new_tab' in config else False
        self.alt_redirect = config['alts'] if 'alts' in config else False
        self.mobile = mobile
        self.user_keys = user_keys
        self.main_divs = ResultSet('')
@@ -213,8 +169,12 @@ class Filter:
        query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''

        if query_link.startswith('/'):
            # Internal Google links (e.g. mail, maps) should still be forwarded to Google
            link['href'] = 'https://google.com' + query_link
        elif '/search?q=' in href:
            # "li:1" implies the query should be interpreted verbatim, so we wrap it in double quotes
            if 'li:1' in href:
                query_link = '"' + query_link + '"'

            new_search = '/search?q=' + self.encrypt_path(query_link)

            query_params = parse_qs(urlparse.urlparse(href).query)
@@ -232,11 +192,13 @@
        else:
            link['href'] = href

        # Replace link location if "alts" config is enabled
        if self.alt_redirect:
            # Search and replace all link descriptions with alternative location
            link['href'] = get_site_alt(link['href'])
            link_desc = link.find_all(text=re.compile('|'.join(SITE_ALTS.keys())))
            if len(link_desc) == 0:
                return

def gen_nojs(sibling):
    nojs_link = BeautifulSoup().new_tag('a')
    nojs_link['href'] = '/window?location=' + sibling['href']
    nojs_link['style'] = 'display:block;width:100%;'
    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
    sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
    sibling.append(nojs_link)

            # Replace link destination
            link_desc[0].replace_with(get_site_alt(link_desc[0]))

View File

@@ -306,6 +306,7 @@ class Config:
        self.dark = False
        self.nojs = False
        self.near = ''
        self.alts = False
        self.new_tab = False
        self.get_only = False

View File

@@ -12,7 +12,7 @@ MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'

# Valid query params
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source']
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source', 'nfpr']

def gen_user_agent(is_mobile):
@@ -68,6 +68,10 @@ def gen_query(query, args, config, near_city=None):
    else:
        param_dict['lr'] = ('&lr=' + config.lang_search) if config.lang_search else ''

    # Set autocorrected search ignore
    if 'nfpr' in args:
        param_dict['nfpr'] = '&nfpr=' + args.get('nfpr')

    param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else ''
    param_dict['hl'] = ('&hl=' + config.lang_interface.replace('lang_', '')) if config.lang_interface else ''
    param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off')

View File

@@ -15,7 +15,7 @@ from requests import exceptions
from app import app
from app.models.config import Config
from app.request import Request
from app.utils.misc import valid_user_session
from app.utils.session_utils import valid_user_session
from app.utils.routing_utils import *

View File

@@ -34,10 +34,10 @@ body {
    color: #685e79;
    border-radius: 10px 10px 0 0;
    max-width: 600px;
    background: rgba(0,0,0,0);
    background: rgba(0, 0, 0, 0);
}

#search-bar:focus{
#search-bar:focus {
    color: #685e79;
}
@@ -68,7 +68,7 @@ button::-moz-focus-inner {
.collapsible {
    outline: 0;
    background-color: rgba(0,0,0,0);
    background-color: rgba(0, 0, 0, 0);
    color: #685e79;
    cursor: pointer;
    padding: 18px;
@@ -129,3 +129,8 @@ footer {
    width: 100%;
    z-index: -1;
}

.info-text {
    font-style: italic;
    font-size: 12px;
}

View File

@@ -1,3 +1,13 @@
// Whoogle configurations that use boolean values and checkboxes
CONFIG_BOOLS = [
    "nojs", "dark", "safe", "alts", "new_tab", "get_only"
];

// Whoogle configurations that use string values and input fields
CONFIG_STRS = [
    "near", "url"
];

const setupSearchLayout = () => {
    // Setup search field
    const searchBar = document.getElementById("search-bar");
@@ -18,15 +28,6 @@ const setupSearchLayout = () => {
};

const fillConfigValues = () => {
    // Establish all config value elements
    const near = document.getElementById("config-near");
    const noJS = document.getElementById("config-nojs");
    const dark = document.getElementById("config-dark");
    const safe = document.getElementById("config-safe");
    const url = document.getElementById("config-url");
    const newTab = document.getElementById("config-new-tab");
    const getOnly = document.getElementById("config-get-only");

    // Request existing config info
    let xhrGET = new XMLHttpRequest();
    xhrGET.open("GET", "/config");
@@ -39,15 +40,15 @@ const fillConfigValues = () => {
        // Allow for updating/saving config values
        let configSettings = JSON.parse(xhrGET.responseText);

        near.value = configSettings["near"] ? configSettings["near"] : "";
        noJS.checked = !!configSettings["nojs"];
        dark.checked = !!configSettings["dark"];
        safe.checked = !!configSettings["safe"];
        getOnly.checked = !!configSettings["get_only"];
        newTab.checked = !!configSettings["new_tab"];
        CONFIG_STRS.forEach(function(item) {
            let configElement = document.getElementById("config-" + item.replace("_", "-"));
            configElement.value = configSettings[item] ? configSettings[item] : "";
        });

        // Addresses the issue of incorrect URL being used behind reverse proxy
        url.value = configSettings["url"] ? configSettings["url"] : "";
        CONFIG_BOOLS.forEach(function(item) {
            let configElement = document.getElementById("config-" + item.replace("_", "-"));
            configElement.checked = !!configSettings[item];
        });
    };

    xhrGET.send();

View File

@@ -96,6 +96,12 @@
            <label for="config-safe">Safe Search: </label>
            <input type="checkbox" name="safe" id="config-safe">
        </div>
        <div class="config-div">
            <label class="tooltip" for="config-alts">Replace Social Media Links: </label>
            <input type="checkbox" name="alts" id="config-alts">
            <div><span class="info-text"> — Replaces Twitter/YouTube/Instagram links
                with Nitter/Invidious/Bibliogram links.</span></div>
        </div>
        <div class="config-div">
            <label for="config-new-tab">Open Links in New Tab: </label>
            <input type="checkbox" name="new_tab" id="config-new-tab">

app/utils/filter_utils.py (new file, +79)
View File

@@ -0,0 +1,79 @@
from bs4 import BeautifulSoup
import urllib.parse as urlparse
from urllib.parse import parse_qs
SKIP_ARGS = ['ref_src', 'utm']
FULL_RES_IMG = '<br/><a href="{}">Full Image</a>'
GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
LOGO_URL = GOOG_IMG + '_desk'
BLANK_B64 = '''
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkwAIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC
'''
BLACKLIST = [
    'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고',
    'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam',
    'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés'
]

SITE_ALTS = {
    'twitter.com': 'nitter.net',
    'youtube.com': 'invidio.us',
    'instagram.com': 'bibliogram.art/u'
}

def has_ad_content(element: str):
    return element.upper() in (value.upper() for value in BLACKLIST) or 'ⓘ' in element

def get_first_link(soup):
    # Replace hrefs with only the intended destination (no "utm" type tags)
    for a in soup.find_all('a', href=True):
        # Return the first search result URL
        if 'url?q=' in a['href']:
            return filter_link_args(a['href'])

def get_site_alt(link: str):
    for site_key in SITE_ALTS.keys():
        if site_key not in link:
            continue

        link = link.replace(site_key, SITE_ALTS[site_key])
        break

    return link

def filter_link_args(query_link):
    parsed_link = urlparse.urlparse(query_link)
    link_args = parse_qs(parsed_link.query)
    safe_args = {}

    if len(link_args) == 0 and len(parsed_link) > 0:
        return query_link

    for arg in link_args.keys():
        if arg in SKIP_ARGS:
            continue

        safe_args[arg] = link_args[arg]

    # Remove original link query and replace with filtered args
    query_link = query_link.replace(parsed_link.query, '')
    if len(safe_args) > 0:
        query_link = query_link + urlparse.urlencode(safe_args, doseq=True)
    else:
        query_link = query_link.replace('?', '')

    return query_link

def gen_nojs(sibling):
    nojs_link = BeautifulSoup().new_tag('a')
    nojs_link['href'] = '/window?location=' + sibling['href']
    nojs_link['style'] = 'display:block;width:100%;'
    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
    sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
    sibling.append(nojs_link)

View File

@@ -1,5 +1,5 @@
from app.filter import Filter, get_first_link
from app.utils.misc import generate_user_keys
from app.utils.session_utils import generate_user_keys
from app.request import gen_query
from bs4 import BeautifulSoup
from cryptography.fernet import Fernet, InvalidToken

View File

@@ -2,11 +2,6 @@ from cryptography.fernet import Fernet
from flask import current_app as app

REQUIRED_SESSION_VALUES = ['uuid', 'config', 'fernet_keys']

BLACKLIST = [
    'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고',
    'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam',
    'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés'
]
def generate_user_keys(cookies_disabled=False) -> dict:

View File

@@ -1,5 +1,5 @@
from app import app
from app.utils.misc import generate_user_keys
from app.utils.session_utils import generate_user_keys
import pytest

View File

@@ -1,4 +1,4 @@
from app.utils.misc import generate_user_keys, valid_user_session
from app.utils.session_utils import generate_user_keys, valid_user_session
def test_generate_user_keys():

View File

@@ -1,6 +1,6 @@
from bs4 import BeautifulSoup
from app.filter import Filter
from app.utils.misc import generate_user_keys
from app.utils.session_utils import generate_user_keys
from datetime import datetime
from dateutil.parser import *