Refactor site alt link replacement

Replacement of result links and text when site alts are enabled is now
handled by its own function, which replaces the link location and the
link description separately.

Fixes #880
Ben Busby 2022-12-05 13:28:29 -07:00
parent 0310f0f542
commit fd85f1573a
2 changed files with 77 additions and 43 deletions
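
The change is easier to review with the commit message's two-step swap spelled out as a toy example: rewrite the link location through the alt mapping, then rewrite the visible link description separately. This is a minimal sketch with a trimmed one-entry mapping and a hypothetical swap_alts helper, not the code in this diff:

from bs4 import BeautifulSoup

# Trimmed mapping for illustration; the full SITE_ALTS dict is defined in the
# second file of this diff.
SITE_ALTS = {'reddit.com': 'farside.link/libreddit'}

def swap_alts(html: str) -> str:
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.find_all('a', href=True):
        for site, alt in SITE_ALTS.items():
            # Step 1: rewrite the link location (href)
            link['href'] = link['href'].replace(site, alt)
            # Step 2: rewrite the visible link description, handled separately
            if link.string and site in link.string:
                link.string = link.string.replace(site, alt)
    return str(soup)

print(swap_alts('<a href="https://reddit.com/r/privacy">reddit.com/r/privacy</a>'))
# <a href="https://farside.link/libreddit/r/privacy">farside.link/libreddit/r/privacy</a>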

View File

@@ -119,6 +119,7 @@ class Filter:
             page_url='',
             query='',
             mobile=False) -> None:
+        self.soup = None
         self.config = config
         self.mobile = mobile
         self.user_key = user_key
@@ -149,46 +150,50 @@ class Filter:
         return Fernet(self.user_key).encrypt(path.encode()).decode()
 
     def clean(self, soup) -> BeautifulSoup:
-        self.main_divs = soup.find('div', {'id': 'main'})
+        self.soup = soup
+        self.main_divs = self.soup.find('div', {'id': 'main'})
         self.remove_ads()
         self.remove_block_titles()
         self.remove_block_url()
         self.collapse_sections()
-        self.update_css(soup)
-        self.update_styling(soup)
-        self.remove_block_tabs(soup)
+        self.update_css()
+        self.update_styling()
+        self.remove_block_tabs()
 
-        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
+        for img in [_ for _ in self.soup.find_all('img') if 'src' in _.attrs]:
             self.update_element_src(img, 'image/png')
 
-        for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
+        for audio in [_ for _ in self.soup.find_all('audio') if 'src' in _.attrs]:
             self.update_element_src(audio, 'audio/mpeg')
 
-        for link in soup.find_all('a', href=True):
+        for link in self.soup.find_all('a', href=True):
            self.update_link(link)
 
-        input_form = soup.find('form')
+        if self.config.alts:
+            self.site_alt_swap()
+
+        input_form = self.soup.find('form')
         if input_form is not None:
             input_form['method'] = 'GET' if self.config.get_only else 'POST'
             # Use a relative URI for submissions
             input_form['action'] = 'search'
 
         # Ensure no extra scripts passed through
-        for script in soup('script'):
+        for script in self.soup('script'):
             script.decompose()
 
         # Update default footer and header
-        footer = soup.find('footer')
+        footer = self.soup.find('footer')
         if footer:
             # Remove divs that have multiple links beyond just page navigation
             [_.decompose() for _ in footer.find_all('div', recursive=False)
                 if len(_.find_all('a', href=True)) > 3]
 
-        header = soup.find('header')
+        header = self.soup.find('header')
         if header:
             header.decompose()
 
-        self.remove_site_blocks(soup)
-        return soup
+        self.remove_site_blocks(self.soup)
+        return self.soup
 
     def remove_site_blocks(self, soup) -> None:
         if not self.config.block or not soup.body:
@@ -233,7 +238,7 @@ class Filter:
                           if block_url.search(_.attrs['href']) is not None]
             _ = div.decompose() if len(block_divs) else None
 
-    def remove_block_tabs(self, soup) -> None:
+    def remove_block_tabs(self) -> None:
         if self.main_divs:
             for div in self.main_divs.find_all(
                 'div',
@@ -242,7 +247,7 @@ class Filter:
                 _ = div.decompose()
         else:
             # when in images tab
-            for div in soup.find_all(
+            for div in self.soup.find_all(
                 'div',
                 attrs={'class': f'{GClasses.images_tbm_tab}'}
             ):
@@ -369,7 +374,7 @@ class Filter:
             ) + '&type=' + urlparse.quote(mime)
         )
 
-    def update_css(self, soup) -> None:
+    def update_css(self) -> None:
         """Updates URLs used in inline styles to be proxied by Whoogle
         using the /element endpoint.
 
@@ -378,7 +383,7 @@ class Filter:
         """
         # Filter all <style> tags
-        for style in soup.find_all('style'):
+        for style in self.soup.find_all('style'):
             style.string = clean_css(style.string, self.page_url)
 
         # TODO: Convert remote stylesheets to style tags and proxy all
@@ -386,20 +391,20 @@ class Filter:
        # for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
        #     print(link)
 
-    def update_styling(self, soup) -> None:
+    def update_styling(self) -> None:
         # Update CSS classes for result divs
-        soup = GClasses.replace_css_classes(soup)
+        soup = GClasses.replace_css_classes(self.soup)
 
         # Remove unnecessary button(s)
-        for button in soup.find_all('button'):
+        for button in self.soup.find_all('button'):
             button.decompose()
 
         # Remove svg logos
-        for svg in soup.find_all('svg'):
+        for svg in self.soup.find_all('svg'):
             svg.decompose()
 
         # Update logo
-        logo = soup.find('a', {'class': 'l'})
+        logo = self.soup.find('a', {'class': 'l'})
         if logo and self.mobile:
             logo['style'] = ('display:flex; justify-content:center; '
                              'align-items:center; color:#685e79; '
@@ -407,14 +412,15 @@ class Filter:
         # Fix search bar length on mobile
         try:
-            search_bar = soup.find('header').find('form').find('div')
+            search_bar = self.soup.find('header').find('form').find('div')
             search_bar['style'] = 'width: 100%;'
         except AttributeError:
             pass
 
         # Fix body max width on images tab
-        style = soup.find('style')
-        div = soup.find('div', attrs={'class': f'{GClasses.images_tbm_tab}'})
+        style = self.soup.find('style')
+        div = self.soup.find('div', attrs={
+            'class': f'{GClasses.images_tbm_tab}'})
         if style and div and not self.mobile:
             css = style.string
             css_html_tag = (
@@ -444,7 +450,6 @@ class Filter:
         """
         parsed_link = urlparse.urlparse(link['href'])
-        link_netloc = ''
 
         if '/url?q=' in link['href']:
             link_netloc = extract_q(parsed_link.query, link['href'])
         else:
@@ -543,25 +548,54 @@ class Filter:
         ):
             link["target"] = "_blank"
 
-        # Replace link location if "alts" config is enabled
-        if self.config.alts:
+    def site_alt_swap(self) -> None:
+        """Replaces link locations and page elements if "alts" config
+        is enabled
+        """
+        for site, alt in SITE_ALTS.items():
+            for div in self.soup.find_all('div', text=re.compile(site)):
+                # Use the number of words in the div string to determine if the
+                # string is a result description (shouldn't replace domains used
+                # in desc text).
+                # Also ignore medium.com replacements since these are handled
+                # specifically in the link description replacement, and medium
+                # results are never given their own "card" result where this
+                # replacement would make sense.
+                if site == 'medium.com' or len(div.string.split(' ')) > 1:
+                    continue
+                div.string = div.string.replace(site, alt)
+
+        for link in self.soup.find_all('a', href=True):
             # Search and replace all link descriptions
             # with alternative location
             link['href'] = get_site_alt(link['href'])
             link_desc = link.find_all(
                 text=re.compile('|'.join(SITE_ALTS.keys())))
             if len(link_desc) == 0:
-                return
+                continue
 
             # Replace link description
             link_desc = link_desc[0]
             for site, alt in SITE_ALTS.items():
                 if site not in link_desc or not alt:
                     continue
                 new_desc = BeautifulSoup(features='html.parser').new_tag('div')
-                new_desc.string = str(link_desc).replace(site, alt)
+                link_str = str(link_desc)
+
+                # Medium links should be handled differently, since 'medium.com'
+                # is a common substring of domain names, but shouldn't be
+                # replaced (i.e. 'philomedium.com' should stay as it is).
+                if 'medium.com' in link_str:
+                    if link_str.startswith('medium.com') or '.medium.com' in link_str:
+                        new_desc.string = link_str.replace(
+                            'medium.com', 'farside.link/scribe')
+                    else:
+                        new_desc.string = link_str
+                else:
+                    new_desc.string = link_str.replace(site, alt)
+
                 link_desc.replace_with(new_desc)
                 break
 
     def view_image(self, soup) -> BeautifulSoup:
         """Replaces the soup with a new one that handles mobile results and

View File

@@ -34,8 +34,7 @@ SITE_ALTS = {
     'instagram.com': os.getenv('WHOOGLE_ALT_IG', 'farside.link/bibliogram/u'),
     'reddit.com': os.getenv('WHOOGLE_ALT_RD', 'farside.link/libreddit'),
     **dict.fromkeys([
-        '.medium.com',
-        '//medium.com',
+        'medium.com',
         'levelup.gitconnected.com'
     ], os.getenv('WHOOGLE_ALT_MD', 'farside.link/scribe')),
     'imgur.com': os.getenv('WHOOGLE_ALT_IMG', 'farside.link/rimgo'),
@@ -144,7 +143,8 @@ def get_site_alt(link: str) -> str:
     hostcomp = f'{parsed_link.scheme}://{hostname}'
 
     for site_key in SITE_ALTS.keys():
-        if not hostname or site_key not in hostname or not SITE_ALTS[site_key]:
+        site_alt = f'{parsed_link.scheme}://{site_key}'
+        if not hostname or site_alt not in hostcomp or not SITE_ALTS[site_key]:
             continue
 
         # Wikipedia -> Wikiless replacements require the subdomain (if it's