Refactor site alt link replacement

Replacing result links and text when site alts are enabled is now part
of its own function, and handles replacement of link location and link
description separately.

Fixes #880
main
Ben Busby 2022-12-05 13:28:29 -07:00
parent 0310f0f542
commit fd85f1573a
No known key found for this signature in database
GPG Key ID: B9B7231E01D924A1
2 changed files with 77 additions and 43 deletions

View File

@ -119,6 +119,7 @@ class Filter:
page_url='',
query='',
mobile=False) -> None:
self.soup = None
self.config = config
self.mobile = mobile
self.user_key = user_key
@ -149,46 +150,50 @@ class Filter:
return Fernet(self.user_key).encrypt(path.encode()).decode()
def clean(self, soup) -> BeautifulSoup:
self.main_divs = soup.find('div', {'id': 'main'})
self.soup = soup
self.main_divs = self.soup.find('div', {'id': 'main'})
self.remove_ads()
self.remove_block_titles()
self.remove_block_url()
self.collapse_sections()
self.update_css(soup)
self.update_styling(soup)
self.remove_block_tabs(soup)
self.update_css()
self.update_styling()
self.remove_block_tabs()
for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
for img in [_ for _ in self.soup.find_all('img') if 'src' in _.attrs]:
self.update_element_src(img, 'image/png')
for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
for audio in [_ for _ in self.soup.find_all('audio') if 'src' in _.attrs]:
self.update_element_src(audio, 'audio/mpeg')
for link in soup.find_all('a', href=True):
for link in self.soup.find_all('a', href=True):
self.update_link(link)
input_form = soup.find('form')
if self.config.alts:
self.site_alt_swap()
input_form = self.soup.find('form')
if input_form is not None:
input_form['method'] = 'GET' if self.config.get_only else 'POST'
# Use a relative URI for submissions
input_form['action'] = 'search'
# Ensure no extra scripts passed through
for script in soup('script'):
for script in self.soup('script'):
script.decompose()
# Update default footer and header
footer = soup.find('footer')
footer = self.soup.find('footer')
if footer:
# Remove divs that have multiple links beyond just page navigation
[_.decompose() for _ in footer.find_all('div', recursive=False)
if len(_.find_all('a', href=True)) > 3]
header = soup.find('header')
header = self.soup.find('header')
if header:
header.decompose()
self.remove_site_blocks(soup)
return soup
self.remove_site_blocks(self.soup)
return self.soup
def remove_site_blocks(self, soup) -> None:
if not self.config.block or not soup.body:
@ -233,7 +238,7 @@ class Filter:
if block_url.search(_.attrs['href']) is not None]
_ = div.decompose() if len(block_divs) else None
def remove_block_tabs(self, soup) -> None:
def remove_block_tabs(self) -> None:
if self.main_divs:
for div in self.main_divs.find_all(
'div',
@ -242,7 +247,7 @@ class Filter:
_ = div.decompose()
else:
# when in images tab
for div in soup.find_all(
for div in self.soup.find_all(
'div',
attrs={'class': f'{GClasses.images_tbm_tab}'}
):
@ -369,7 +374,7 @@ class Filter:
) + '&type=' + urlparse.quote(mime)
)
def update_css(self, soup) -> None:
def update_css(self) -> None:
"""Updates URLs used in inline styles to be proxied by Whoogle
using the /element endpoint.
@ -378,7 +383,7 @@ class Filter:
"""
# Filter all <style> tags
for style in soup.find_all('style'):
for style in self.soup.find_all('style'):
style.string = clean_css(style.string, self.page_url)
# TODO: Convert remote stylesheets to style tags and proxy all
@ -386,20 +391,20 @@ class Filter:
# for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
# print(link)
def update_styling(self, soup) -> None:
def update_styling(self) -> None:
# Update CSS classes for result divs
soup = GClasses.replace_css_classes(soup)
soup = GClasses.replace_css_classes(self.soup)
# Remove unnecessary button(s)
for button in soup.find_all('button'):
for button in self.soup.find_all('button'):
button.decompose()
# Remove svg logos
for svg in soup.find_all('svg'):
for svg in self.soup.find_all('svg'):
svg.decompose()
# Update logo
logo = soup.find('a', {'class': 'l'})
logo = self.soup.find('a', {'class': 'l'})
if logo and self.mobile:
logo['style'] = ('display:flex; justify-content:center; '
'align-items:center; color:#685e79; '
@ -407,14 +412,15 @@ class Filter:
# Fix search bar length on mobile
try:
search_bar = soup.find('header').find('form').find('div')
search_bar = self.soup.find('header').find('form').find('div')
search_bar['style'] = 'width: 100%;'
except AttributeError:
pass
# Fix body max width on images tab
style = soup.find('style')
div = soup.find('div', attrs={'class': f'{GClasses.images_tbm_tab}'})
style = self.soup.find('style')
div = self.soup.find('div', attrs={
'class': f'{GClasses.images_tbm_tab}'})
if style and div and not self.mobile:
css = style.string
css_html_tag = (
@ -444,7 +450,6 @@ class Filter:
"""
parsed_link = urlparse.urlparse(link['href'])
link_netloc = ''
if '/url?q=' in link['href']:
link_netloc = extract_q(parsed_link.query, link['href'])
else:
@ -543,25 +548,54 @@ class Filter:
):
link["target"] = "_blank"
# Replace link location if "alts" config is enabled
if self.config.alts:
# Search and replace all link descriptions
# with alternative location
link['href'] = get_site_alt(link['href'])
link_desc = link.find_all(
text=re.compile('|'.join(SITE_ALTS.keys())))
if len(link_desc) == 0:
return
def site_alt_swap(self) -> None:
"""Replaces link locations and page elements if "alts" config
is enabled
"""
for site, alt in SITE_ALTS.items():
for div in self.soup.find_all('div', text=re.compile(site)):
# Use the number of words in the div string to determine if the
# string is a result description (shouldn't replace domains used
# in desc text).
# Also ignore medium.com replacements since these are handled
# specifically in the link description replacement, and medium
# results are never given their own "card" result where this
# replacement would make sense.
if site == 'medium.com' or len(div.string.split(' ')) > 1:
continue
# Replace link description
link_desc = link_desc[0]
for site, alt in SITE_ALTS.items():
div.string = div.string.replace(site, alt)
for link in self.soup.find_all('a', href=True):
# Search and replace all link descriptions
# with alternative location
link['href'] = get_site_alt(link['href'])
link_desc = link.find_all(
text=re.compile('|'.join(SITE_ALTS.keys())))
if len(link_desc) == 0:
continue
# Replace link description
link_desc = link_desc[0]
if site not in link_desc or not alt:
continue
new_desc = BeautifulSoup(features='html.parser').new_tag('div')
new_desc.string = str(link_desc).replace(site, alt)
link_str = str(link_desc)
# Medium links should be handled differently, since 'medium.com'
# is a common substring of domain names, but shouldn't be
# replaced (i.e. 'philomedium.com' should stay as it is).
if 'medium.com' in link_str:
if link_str.startswith('medium.com') or '.medium.com' in link_str:
new_desc.string = link_str.replace(
'medium.com', 'farside.link/scribe')
else:
new_desc.string = link_str
else:
new_desc.string = link_str.replace(site, alt)
link_desc.replace_with(new_desc)
break
def view_image(self, soup) -> BeautifulSoup:
"""Replaces the soup with a new one that handles mobile results and

View File

@ -34,8 +34,7 @@ SITE_ALTS = {
'instagram.com': os.getenv('WHOOGLE_ALT_IG', 'farside.link/bibliogram/u'),
'reddit.com': os.getenv('WHOOGLE_ALT_RD', 'farside.link/libreddit'),
**dict.fromkeys([
'.medium.com',
'//medium.com',
'medium.com',
'levelup.gitconnected.com'
], os.getenv('WHOOGLE_ALT_MD', 'farside.link/scribe')),
'imgur.com': os.getenv('WHOOGLE_ALT_IMG', 'farside.link/rimgo'),
@ -144,7 +143,8 @@ def get_site_alt(link: str) -> str:
hostcomp = f'{parsed_link.scheme}://{hostname}'
for site_key in SITE_ALTS.keys():
if not hostname or site_key not in hostname or not SITE_ALTS[site_key]:
site_alt = f'{parsed_link.scheme}://{site_key}'
if not hostname or site_alt not in hostcomp or not SITE_ALTS[site_key]:
continue
# Wikipedia -> Wikiless replacements require the subdomain (if it's