Refactor site alt link replacement

Replacing result links and text when site alts are enabled is now part of its own function, and handles replacement of link location and link description separately. Fixes #880
2022-12-05 13:28:29 -07:00 · 2022-12-05 13:28:29 -07:00 · fd85f1573a
parent 0310f0f542
commit fd85f1573a
2 changed files with 77 additions and 43 deletions
--- a/app/filter.py
+++ b/app/filter.py
@ -119,6 +119,7 @@ class Filter:
            page_url='',
            query='',
            mobile=False) -> None:
+        self.soup = None
        self.config = config
        self.mobile = mobile
        self.user_key = user_key
@ -149,46 +150,50 @@ class Filter:
        return Fernet(self.user_key).encrypt(path.encode()).decode()

    def clean(self, soup) -> BeautifulSoup:
-        self.main_divs = soup.find('div', {'id': 'main'})
+        self.soup = soup
+        self.main_divs = self.soup.find('div', {'id': 'main'})
        self.remove_ads()
        self.remove_block_titles()
        self.remove_block_url()
        self.collapse_sections()
-        self.update_css(soup)
-        self.update_styling(soup)
-        self.remove_block_tabs(soup)
+        self.update_css()
+        self.update_styling()
+        self.remove_block_tabs()

-        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
+        for img in [_ for _ in self.soup.find_all('img') if 'src' in _.attrs]:
            self.update_element_src(img, 'image/png')

-        for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
+        for audio in [_ for _ in self.soup.find_all('audio') if 'src' in _.attrs]:
            self.update_element_src(audio, 'audio/mpeg')

-        for link in soup.find_all('a', href=True):
+        for link in self.soup.find_all('a', href=True):
            self.update_link(link)

-        input_form = soup.find('form')
+        if self.config.alts:
+            self.site_alt_swap()
+
+        input_form = self.soup.find('form')
        if input_form is not None:
            input_form['method'] = 'GET' if self.config.get_only else 'POST'
            # Use a relative URI for submissions
            input_form['action'] = 'search'

        # Ensure no extra scripts passed through
-        for script in soup('script'):
+        for script in self.soup('script'):
            script.decompose()

        # Update default footer and header
-        footer = soup.find('footer')
+        footer = self.soup.find('footer')
        if footer:
            # Remove divs that have multiple links beyond just page navigation
            [_.decompose() for _ in footer.find_all('div', recursive=False)
             if len(_.find_all('a', href=True)) > 3]

-        header = soup.find('header')
+        header = self.soup.find('header')
        if header:
            header.decompose()
-        self.remove_site_blocks(soup)
-        return soup
+        self.remove_site_blocks(self.soup)
+        return self.soup

    def remove_site_blocks(self, soup) -> None:
        if not self.config.block or not soup.body:
@ -233,7 +238,7 @@ class Filter:
                          if block_url.search(_.attrs['href']) is not None]
            _ = div.decompose() if len(block_divs) else None

-    def remove_block_tabs(self, soup) -> None:
+    def remove_block_tabs(self) -> None:
        if self.main_divs:
            for div in self.main_divs.find_all(
                'div',
@ -242,7 +247,7 @@ class Filter:
                _ = div.decompose()
        else:
            # when in images tab
-            for div in soup.find_all(
+            for div in self.soup.find_all(
                'div',
                attrs={'class': f'{GClasses.images_tbm_tab}'}
            ):
@ -369,7 +374,7 @@ class Filter:
            ) + '&type=' + urlparse.quote(mime)
        )

-    def update_css(self, soup) -> None:
+    def update_css(self) -> None:
        """Updates URLs used in inline styles to be proxied by Whoogle
        using the /element endpoint.

@ -378,7 +383,7 @@ class Filter:

        """
        # Filter all <style> tags
-        for style in soup.find_all('style'):
+        for style in self.soup.find_all('style'):
            style.string = clean_css(style.string, self.page_url)

        # TODO: Convert remote stylesheets to style tags and proxy all
@ -386,20 +391,20 @@ class Filter:
        # for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
            # print(link)

-    def update_styling(self, soup) -> None:
+    def update_styling(self) -> None:
        # Update CSS classes for result divs
-        soup = GClasses.replace_css_classes(soup)
+        soup = GClasses.replace_css_classes(self.soup)

        # Remove unnecessary button(s)
-        for button in soup.find_all('button'):
+        for button in self.soup.find_all('button'):
            button.decompose()

        # Remove svg logos
-        for svg in soup.find_all('svg'):
+        for svg in self.soup.find_all('svg'):
            svg.decompose()

        # Update logo
-        logo = soup.find('a', {'class': 'l'})
+        logo = self.soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = ('display:flex; justify-content:center; '
                             'align-items:center; color:#685e79; '
@ -407,14 +412,15 @@ class Filter:

        # Fix search bar length on mobile
        try:
-            search_bar = soup.find('header').find('form').find('div')
+            search_bar = self.soup.find('header').find('form').find('div')
            search_bar['style'] = 'width: 100%;'
        except AttributeError:
            pass

        # Fix body max width on images tab
-        style = soup.find('style')
-        div = soup.find('div', attrs={'class': f'{GClasses.images_tbm_tab}'})
+        style = self.soup.find('style')
+        div = self.soup.find('div', attrs={
+            'class': f'{GClasses.images_tbm_tab}'})
        if style and div and not self.mobile:
            css = style.string
            css_html_tag = (
@ -444,7 +450,6 @@ class Filter:

        """
        parsed_link = urlparse.urlparse(link['href'])
-        link_netloc = ''
        if '/url?q=' in link['href']:
            link_netloc = extract_q(parsed_link.query, link['href'])
        else:
@ -543,25 +548,54 @@ class Filter:
        ):
            link["target"] = "_blank"

-        # Replace link location if "alts" config is enabled
-        if self.config.alts:
-            # Search and replace all link descriptions
-            # with alternative location
-            link['href'] = get_site_alt(link['href'])
-            link_desc = link.find_all(
-                text=re.compile('|'.join(SITE_ALTS.keys())))
-            if len(link_desc) == 0:
-                return
+    def site_alt_swap(self) -> None:
+        """Replaces link locations and page elements if "alts" config
+        is enabled
+        """
+        for site, alt in SITE_ALTS.items():
+            for div in self.soup.find_all('div', text=re.compile(site)):
+                # Use the number of words in the div string to determine if the
+                # string is a result description (shouldn't replace domains used
+                # in desc text).
+                # Also ignore medium.com replacements since these are handled
+                # specifically in the link description replacement, and medium
+                # results are never given their own "card" result where this
+                # replacement would make sense.
+                if site == 'medium.com' or len(div.string.split(' ')) > 1:
+                    continue

-            # Replace link description
-            link_desc = link_desc[0]
-            for site, alt in SITE_ALTS.items():
+                div.string = div.string.replace(site, alt)
+
+            for link in self.soup.find_all('a', href=True):
+                # Search and replace all link descriptions
+                # with alternative location
+                link['href'] = get_site_alt(link['href'])
+                link_desc = link.find_all(
+                    text=re.compile('|'.join(SITE_ALTS.keys())))
+                if len(link_desc) == 0:
+                    continue
+
+                # Replace link description
+                link_desc = link_desc[0]
                if site not in link_desc or not alt:
                    continue
+
                new_desc = BeautifulSoup(features='html.parser').new_tag('div')
-                new_desc.string = str(link_desc).replace(site, alt)
+                link_str = str(link_desc)
+
+                # Medium links should be handled differently, since 'medium.com'
+                # is a common substring of domain names, but shouldn't be
+                # replaced (i.e. 'philomedium.com' should stay as it is).
+                if 'medium.com' in link_str:
+                    if link_str.startswith('medium.com') or '.medium.com' in link_str:
+                        new_desc.string = link_str.replace(
+                            'medium.com', 'farside.link/scribe')
+                    else:
+                        new_desc.string = link_str
+                else:
+                    new_desc.string = link_str.replace(site, alt)
+
                link_desc.replace_with(new_desc)
-                break

    def view_image(self, soup) -> BeautifulSoup:
        """Replaces the soup with a new one that handles mobile results and
--- a/app/utils/results.py
+++ b/app/utils/results.py
@ -34,8 +34,7 @@ SITE_ALTS = {
    'instagram.com': os.getenv('WHOOGLE_ALT_IG', 'farside.link/bibliogram/u'),
    'reddit.com': os.getenv('WHOOGLE_ALT_RD', 'farside.link/libreddit'),
    **dict.fromkeys([
-        '.medium.com',
-        '//medium.com',
+        'medium.com',
        'levelup.gitconnected.com'
    ], os.getenv('WHOOGLE_ALT_MD', 'farside.link/scribe')),
    'imgur.com': os.getenv('WHOOGLE_ALT_IMG', 'farside.link/rimgo'),
@ -144,7 +143,8 @@ def get_site_alt(link: str) -> str:
    hostcomp = f'{parsed_link.scheme}://{hostname}'

    for site_key in SITE_ALTS.keys():
-        if not hostname or site_key not in hostname or not SITE_ALTS[site_key]:
+        site_alt = f'{parsed_link.scheme}://{site_key}'
+        if not hostname or site_alt not in hostcomp or not SITE_ALTS[site_key]:
            continue

        # Wikipedia -> Wikiless replacements require the subdomain (if it's