Collapse long result sections into details/summary elements

Sections such as "People also ask" and "Related searches" typically take up a lot of room on the results page and don't always contain the most useful information. This change checks each result element for 7 or more child divs (the new RESULT_CHILD_LIMIT), extracts the section title from the first child that contains text, and wraps the section in a "details" element (with the title as its "summary") that the user can expand or collapse. Note that this functionality existed previously (albeit in a less robust form), but Google changed how it returns search results (switching from <h2> elements for section headers to <span> or <div> elements), so the approach to collapsing these sections needed to be updated.
2021-06-23 18:59:57 -04:00 · 2021-06-23 18:59:57 -04:00 · afd01820bb
parent d894bd347d
commit afd01820bb
1 changed files with 41 additions and 24 deletions
--- a/app/filter.py
+++ b/app/filter.py
@ -38,6 +38,10 @@ def clean_query(query: str) -> str:
 class Filter:
    # Limit used for determining if a result is a "regular" result or a list
    # type result (such as "people also asked", "related searches", etc)
    RESULT_CHILD_LIMIT = 7
    def __init__(self, user_key: str, mobile=False, config=None) -> None:
        if config is None:
            config = {}
@ -83,7 +87,7 @@ class Filter:
    def clean(self, soup) -> BeautifulSoup:
        self.main_divs = soup.find('div', {'id': 'main'})
        self.remove_ads()
-        self.fix_question_section()
+        self.collapse_sections()
        self.update_styling(soup)
        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
@ -130,41 +134,54 @@ class Filter:
                       if has_ad_content(_.text)]
            _ = div.decompose() if len(div_ads) else None
-    def fix_question_section(self) -> None:
+    def collapse_sections(self) -> None:
-        """Collapses the "People Also Asked" section into a "details" element
+        """Collapses long result sections ("people also asked", "related
         searches", etc) into "details" elements
        These sections are typically the only sections in the results page that
-        are structured as <div><h2>Title</h2><div>...</div></div>, so they are
+        have more than ~5 child divs within a primary result div.
        extracted by checking all result divs for h2 children.
        Returns:
            None (The soup object is modified directly)
        """
        def pull_child_divs(result_div: BeautifulSoup):
            try:
                return result_div.findChildren(
                    'div', recursive=False
                )[0].findChildren(
                    'div', recursive=False)
            except IndexError:
                return []
        if not self.main_divs:
            return
-        question_divs = [_ for _ in self.main_divs.find_all(
+        # Loop through results and check for the number of child divs in each
-            'div', recursive=False
+        for result in self.main_divs:
-        ) if len(_.find_all('h2')) > 0]
+            result_children = pull_child_divs(result)
            if len(result_children) < self.RESULT_CHILD_LIMIT:
                continue
-        if len(question_divs) == 0:
+            # Find and decompose the first element with an inner HTML text val.
-            return
+            # This typically extracts the title of the section (i.e. "Related
            # Searches", "People also ask", etc)
            label = 'Collapsed Results'
            for elem in result_children:
                if elem.text:
                    label = elem.text
                    elem.decompose()
                    break
-        # Wrap section in details element to allow collapse/expand
+            # Create the new details element to wrap around the result's
-        details = BeautifulSoup(features='html.parser').new_tag('details')
+            # immediate parent
-        summary = BeautifulSoup(features='html.parser').new_tag('summary')
+            parent = result_children[0].parent
-        summary.string = question_divs[0].find('h2').text
+            details = BeautifulSoup(features='html.parser').new_tag('details')
-        question_divs[0].find('h2').decompose()
+            summary = BeautifulSoup(features='html.parser').new_tag('summary')
-        details.append(summary)
+            summary.string = label
-        question_divs[0].wrap(details)
+            details.append(summary)
-        for question_div in question_divs:
+            if parent:
-            questions = [_ for _ in question_div.find_all(
+                parent.wrap(details)
                'div', recursive=True
            ) if _.text.endswith('?')]
            for question in questions:
                question['style'] = 'padding: 10px; font-style: italic;'
    def update_element_src(self, element: Tag, mime: str) -> None:
        """Encrypts the original src of an element and rewrites the element src