From fb600d6fc8db2e638f5bf632898b882f82f10049 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Mon, 16 May 2022 09:53:48 -0600 Subject: [PATCH] Improve G page distinction between footer and results Pages in the Whoogle footer that by default route to Google pages were previously being removed, but this caused results that also routed to similar pages to no longer be accessible. This was due to the removal of the '/url' endpoint that Google uses for each result. To fix this, the result link is now parsed so that the domain of the result can be checked against the disallowed G page list. Since results are delivered in a "/url?q=" format -- even for pages to Google's own products -- and the footer links are formatted as ".google.com", footer links are removed and result links are parsed correctly. Fixes #747 --- app/filter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/app/filter.py b/app/filter.py index 7412275..6151034 100644 --- a/app/filter.py +++ b/app/filter.py @@ -410,8 +410,10 @@ class Filter: None (the tag is updated directly) """ + link_netloc = urlparse.urlparse(link['href']).netloc + # Remove any elements that direct to unsupported Google pages - if any(url in link['href'] for url in unsupported_g_pages): + if any(url in link_netloc for url in unsupported_g_pages): # FIXME: The "Shopping" tab requires further filtering (see #136) # Temporarily removing all links to that tab for now. parent = link.parent @@ -431,6 +433,8 @@ class Filter: # Internal google links (i.e. mail, maps, etc) should still # be forwarded to Google link['href'] = 'https://google.com' + q + elif link['href'].startswith('/url'): + link['href'] = q elif q.startswith('https://accounts.google.com'): # Remove Sign-in link link.decompose()