Refactor site alt link replacement
Replacing result links and text when site alts are enabled is now part of its own function, and handles replacement of link location and link description separately. Fixes #880main
parent
0310f0f542
commit
fd85f1573a
114
app/filter.py
114
app/filter.py
|
@ -119,6 +119,7 @@ class Filter:
|
||||||
page_url='',
|
page_url='',
|
||||||
query='',
|
query='',
|
||||||
mobile=False) -> None:
|
mobile=False) -> None:
|
||||||
|
self.soup = None
|
||||||
self.config = config
|
self.config = config
|
||||||
self.mobile = mobile
|
self.mobile = mobile
|
||||||
self.user_key = user_key
|
self.user_key = user_key
|
||||||
|
@ -149,46 +150,50 @@ class Filter:
|
||||||
return Fernet(self.user_key).encrypt(path.encode()).decode()
|
return Fernet(self.user_key).encrypt(path.encode()).decode()
|
||||||
|
|
||||||
def clean(self, soup) -> BeautifulSoup:
|
def clean(self, soup) -> BeautifulSoup:
|
||||||
self.main_divs = soup.find('div', {'id': 'main'})
|
self.soup = soup
|
||||||
|
self.main_divs = self.soup.find('div', {'id': 'main'})
|
||||||
self.remove_ads()
|
self.remove_ads()
|
||||||
self.remove_block_titles()
|
self.remove_block_titles()
|
||||||
self.remove_block_url()
|
self.remove_block_url()
|
||||||
self.collapse_sections()
|
self.collapse_sections()
|
||||||
self.update_css(soup)
|
self.update_css()
|
||||||
self.update_styling(soup)
|
self.update_styling()
|
||||||
self.remove_block_tabs(soup)
|
self.remove_block_tabs()
|
||||||
|
|
||||||
for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
|
for img in [_ for _ in self.soup.find_all('img') if 'src' in _.attrs]:
|
||||||
self.update_element_src(img, 'image/png')
|
self.update_element_src(img, 'image/png')
|
||||||
|
|
||||||
for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
|
for audio in [_ for _ in self.soup.find_all('audio') if 'src' in _.attrs]:
|
||||||
self.update_element_src(audio, 'audio/mpeg')
|
self.update_element_src(audio, 'audio/mpeg')
|
||||||
|
|
||||||
for link in soup.find_all('a', href=True):
|
for link in self.soup.find_all('a', href=True):
|
||||||
self.update_link(link)
|
self.update_link(link)
|
||||||
|
|
||||||
input_form = soup.find('form')
|
if self.config.alts:
|
||||||
|
self.site_alt_swap()
|
||||||
|
|
||||||
|
input_form = self.soup.find('form')
|
||||||
if input_form is not None:
|
if input_form is not None:
|
||||||
input_form['method'] = 'GET' if self.config.get_only else 'POST'
|
input_form['method'] = 'GET' if self.config.get_only else 'POST'
|
||||||
# Use a relative URI for submissions
|
# Use a relative URI for submissions
|
||||||
input_form['action'] = 'search'
|
input_form['action'] = 'search'
|
||||||
|
|
||||||
# Ensure no extra scripts passed through
|
# Ensure no extra scripts passed through
|
||||||
for script in soup('script'):
|
for script in self.soup('script'):
|
||||||
script.decompose()
|
script.decompose()
|
||||||
|
|
||||||
# Update default footer and header
|
# Update default footer and header
|
||||||
footer = soup.find('footer')
|
footer = self.soup.find('footer')
|
||||||
if footer:
|
if footer:
|
||||||
# Remove divs that have multiple links beyond just page navigation
|
# Remove divs that have multiple links beyond just page navigation
|
||||||
[_.decompose() for _ in footer.find_all('div', recursive=False)
|
[_.decompose() for _ in footer.find_all('div', recursive=False)
|
||||||
if len(_.find_all('a', href=True)) > 3]
|
if len(_.find_all('a', href=True)) > 3]
|
||||||
|
|
||||||
header = soup.find('header')
|
header = self.soup.find('header')
|
||||||
if header:
|
if header:
|
||||||
header.decompose()
|
header.decompose()
|
||||||
self.remove_site_blocks(soup)
|
self.remove_site_blocks(self.soup)
|
||||||
return soup
|
return self.soup
|
||||||
|
|
||||||
def remove_site_blocks(self, soup) -> None:
|
def remove_site_blocks(self, soup) -> None:
|
||||||
if not self.config.block or not soup.body:
|
if not self.config.block or not soup.body:
|
||||||
|
@ -233,7 +238,7 @@ class Filter:
|
||||||
if block_url.search(_.attrs['href']) is not None]
|
if block_url.search(_.attrs['href']) is not None]
|
||||||
_ = div.decompose() if len(block_divs) else None
|
_ = div.decompose() if len(block_divs) else None
|
||||||
|
|
||||||
def remove_block_tabs(self, soup) -> None:
|
def remove_block_tabs(self) -> None:
|
||||||
if self.main_divs:
|
if self.main_divs:
|
||||||
for div in self.main_divs.find_all(
|
for div in self.main_divs.find_all(
|
||||||
'div',
|
'div',
|
||||||
|
@ -242,7 +247,7 @@ class Filter:
|
||||||
_ = div.decompose()
|
_ = div.decompose()
|
||||||
else:
|
else:
|
||||||
# when in images tab
|
# when in images tab
|
||||||
for div in soup.find_all(
|
for div in self.soup.find_all(
|
||||||
'div',
|
'div',
|
||||||
attrs={'class': f'{GClasses.images_tbm_tab}'}
|
attrs={'class': f'{GClasses.images_tbm_tab}'}
|
||||||
):
|
):
|
||||||
|
@ -369,7 +374,7 @@ class Filter:
|
||||||
) + '&type=' + urlparse.quote(mime)
|
) + '&type=' + urlparse.quote(mime)
|
||||||
)
|
)
|
||||||
|
|
||||||
def update_css(self, soup) -> None:
|
def update_css(self) -> None:
|
||||||
"""Updates URLs used in inline styles to be proxied by Whoogle
|
"""Updates URLs used in inline styles to be proxied by Whoogle
|
||||||
using the /element endpoint.
|
using the /element endpoint.
|
||||||
|
|
||||||
|
@ -378,7 +383,7 @@ class Filter:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# Filter all <style> tags
|
# Filter all <style> tags
|
||||||
for style in soup.find_all('style'):
|
for style in self.soup.find_all('style'):
|
||||||
style.string = clean_css(style.string, self.page_url)
|
style.string = clean_css(style.string, self.page_url)
|
||||||
|
|
||||||
# TODO: Convert remote stylesheets to style tags and proxy all
|
# TODO: Convert remote stylesheets to style tags and proxy all
|
||||||
|
@ -386,20 +391,20 @@ class Filter:
|
||||||
# for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
|
# for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
|
||||||
# print(link)
|
# print(link)
|
||||||
|
|
||||||
def update_styling(self, soup) -> None:
|
def update_styling(self) -> None:
|
||||||
# Update CSS classes for result divs
|
# Update CSS classes for result divs
|
||||||
soup = GClasses.replace_css_classes(soup)
|
soup = GClasses.replace_css_classes(self.soup)
|
||||||
|
|
||||||
# Remove unnecessary button(s)
|
# Remove unnecessary button(s)
|
||||||
for button in soup.find_all('button'):
|
for button in self.soup.find_all('button'):
|
||||||
button.decompose()
|
button.decompose()
|
||||||
|
|
||||||
# Remove svg logos
|
# Remove svg logos
|
||||||
for svg in soup.find_all('svg'):
|
for svg in self.soup.find_all('svg'):
|
||||||
svg.decompose()
|
svg.decompose()
|
||||||
|
|
||||||
# Update logo
|
# Update logo
|
||||||
logo = soup.find('a', {'class': 'l'})
|
logo = self.soup.find('a', {'class': 'l'})
|
||||||
if logo and self.mobile:
|
if logo and self.mobile:
|
||||||
logo['style'] = ('display:flex; justify-content:center; '
|
logo['style'] = ('display:flex; justify-content:center; '
|
||||||
'align-items:center; color:#685e79; '
|
'align-items:center; color:#685e79; '
|
||||||
|
@ -407,14 +412,15 @@ class Filter:
|
||||||
|
|
||||||
# Fix search bar length on mobile
|
# Fix search bar length on mobile
|
||||||
try:
|
try:
|
||||||
search_bar = soup.find('header').find('form').find('div')
|
search_bar = self.soup.find('header').find('form').find('div')
|
||||||
search_bar['style'] = 'width: 100%;'
|
search_bar['style'] = 'width: 100%;'
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Fix body max width on images tab
|
# Fix body max width on images tab
|
||||||
style = soup.find('style')
|
style = self.soup.find('style')
|
||||||
div = soup.find('div', attrs={'class': f'{GClasses.images_tbm_tab}'})
|
div = self.soup.find('div', attrs={
|
||||||
|
'class': f'{GClasses.images_tbm_tab}'})
|
||||||
if style and div and not self.mobile:
|
if style and div and not self.mobile:
|
||||||
css = style.string
|
css = style.string
|
||||||
css_html_tag = (
|
css_html_tag = (
|
||||||
|
@ -444,7 +450,6 @@ class Filter:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
parsed_link = urlparse.urlparse(link['href'])
|
parsed_link = urlparse.urlparse(link['href'])
|
||||||
link_netloc = ''
|
|
||||||
if '/url?q=' in link['href']:
|
if '/url?q=' in link['href']:
|
||||||
link_netloc = extract_q(parsed_link.query, link['href'])
|
link_netloc = extract_q(parsed_link.query, link['href'])
|
||||||
else:
|
else:
|
||||||
|
@ -543,25 +548,54 @@ class Filter:
|
||||||
):
|
):
|
||||||
link["target"] = "_blank"
|
link["target"] = "_blank"
|
||||||
|
|
||||||
# Replace link location if "alts" config is enabled
|
def site_alt_swap(self) -> None:
|
||||||
if self.config.alts:
|
"""Replaces link locations and page elements if "alts" config
|
||||||
# Search and replace all link descriptions
|
is enabled
|
||||||
# with alternative location
|
"""
|
||||||
link['href'] = get_site_alt(link['href'])
|
for site, alt in SITE_ALTS.items():
|
||||||
link_desc = link.find_all(
|
for div in self.soup.find_all('div', text=re.compile(site)):
|
||||||
text=re.compile('|'.join(SITE_ALTS.keys())))
|
# Use the number of words in the div string to determine if the
|
||||||
if len(link_desc) == 0:
|
# string is a result description (shouldn't replace domains used
|
||||||
return
|
# in desc text).
|
||||||
|
# Also ignore medium.com replacements since these are handled
|
||||||
|
# specifically in the link description replacement, and medium
|
||||||
|
# results are never given their own "card" result where this
|
||||||
|
# replacement would make sense.
|
||||||
|
if site == 'medium.com' or len(div.string.split(' ')) > 1:
|
||||||
|
continue
|
||||||
|
|
||||||
# Replace link description
|
div.string = div.string.replace(site, alt)
|
||||||
link_desc = link_desc[0]
|
|
||||||
for site, alt in SITE_ALTS.items():
|
for link in self.soup.find_all('a', href=True):
|
||||||
|
# Search and replace all link descriptions
|
||||||
|
# with alternative location
|
||||||
|
link['href'] = get_site_alt(link['href'])
|
||||||
|
link_desc = link.find_all(
|
||||||
|
text=re.compile('|'.join(SITE_ALTS.keys())))
|
||||||
|
if len(link_desc) == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Replace link description
|
||||||
|
link_desc = link_desc[0]
|
||||||
if site not in link_desc or not alt:
|
if site not in link_desc or not alt:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
new_desc = BeautifulSoup(features='html.parser').new_tag('div')
|
new_desc = BeautifulSoup(features='html.parser').new_tag('div')
|
||||||
new_desc.string = str(link_desc).replace(site, alt)
|
link_str = str(link_desc)
|
||||||
|
|
||||||
|
# Medium links should be handled differently, since 'medium.com'
|
||||||
|
# is a common substring of domain names, but shouldn't be
|
||||||
|
# replaced (i.e. 'philomedium.com' should stay as it is).
|
||||||
|
if 'medium.com' in link_str:
|
||||||
|
if link_str.startswith('medium.com') or '.medium.com' in link_str:
|
||||||
|
new_desc.string = link_str.replace(
|
||||||
|
'medium.com', 'farside.link/scribe')
|
||||||
|
else:
|
||||||
|
new_desc.string = link_str
|
||||||
|
else:
|
||||||
|
new_desc.string = link_str.replace(site, alt)
|
||||||
|
|
||||||
link_desc.replace_with(new_desc)
|
link_desc.replace_with(new_desc)
|
||||||
break
|
|
||||||
|
|
||||||
def view_image(self, soup) -> BeautifulSoup:
|
def view_image(self, soup) -> BeautifulSoup:
|
||||||
"""Replaces the soup with a new one that handles mobile results and
|
"""Replaces the soup with a new one that handles mobile results and
|
||||||
|
|
|
@ -34,8 +34,7 @@ SITE_ALTS = {
|
||||||
'instagram.com': os.getenv('WHOOGLE_ALT_IG', 'farside.link/bibliogram/u'),
|
'instagram.com': os.getenv('WHOOGLE_ALT_IG', 'farside.link/bibliogram/u'),
|
||||||
'reddit.com': os.getenv('WHOOGLE_ALT_RD', 'farside.link/libreddit'),
|
'reddit.com': os.getenv('WHOOGLE_ALT_RD', 'farside.link/libreddit'),
|
||||||
**dict.fromkeys([
|
**dict.fromkeys([
|
||||||
'.medium.com',
|
'medium.com',
|
||||||
'//medium.com',
|
|
||||||
'levelup.gitconnected.com'
|
'levelup.gitconnected.com'
|
||||||
], os.getenv('WHOOGLE_ALT_MD', 'farside.link/scribe')),
|
], os.getenv('WHOOGLE_ALT_MD', 'farside.link/scribe')),
|
||||||
'imgur.com': os.getenv('WHOOGLE_ALT_IMG', 'farside.link/rimgo'),
|
'imgur.com': os.getenv('WHOOGLE_ALT_IMG', 'farside.link/rimgo'),
|
||||||
|
@ -144,7 +143,8 @@ def get_site_alt(link: str) -> str:
|
||||||
hostcomp = f'{parsed_link.scheme}://{hostname}'
|
hostcomp = f'{parsed_link.scheme}://{hostname}'
|
||||||
|
|
||||||
for site_key in SITE_ALTS.keys():
|
for site_key in SITE_ALTS.keys():
|
||||||
if not hostname or site_key not in hostname or not SITE_ALTS[site_key]:
|
site_alt = f'{parsed_link.scheme}://{site_key}'
|
||||||
|
if not hostname or site_alt not in hostcomp or not SITE_ALTS[site_key]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Wikipedia -> Wikiless replacements require the subdomain (if it's
|
# Wikipedia -> Wikiless replacements require the subdomain (if it's
|
||||||
|
|
Loading…
Reference in New Issue