Collapse long result sections into details/summary elements

Sections such as "People also ask" and "Related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with at least
7 child divs, extracts the section title, and wraps the section in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.
main
Ben Busby 2021-06-23 18:59:57 -04:00
parent d894bd347d
commit afd01820bb
No known key found for this signature in database
GPG Key ID: 3B08611DF6E62ED2
1 changed file with 41 additions and 24 deletions

View File

@ -38,6 +38,10 @@ def clean_query(query: str) -> str:
class Filter:
# Limit used for determining if a result is a "regular" result or a list
# type result (such as "people also asked", "related searches", etc)
RESULT_CHILD_LIMIT = 7
def __init__(self, user_key: str, mobile=False, config=None) -> None:
if config is None:
config = {}
@ -83,7 +87,7 @@ class Filter:
def clean(self, soup) -> BeautifulSoup:
self.main_divs = soup.find('div', {'id': 'main'})
self.remove_ads()
self.fix_question_section()
self.collapse_sections()
self.update_styling(soup)
for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
@ -130,41 +134,54 @@ class Filter:
if has_ad_content(_.text)]
_ = div.decompose() if len(div_ads) else None
def fix_question_section(self) -> None:
"""Collapses the "People Also Asked" section into a "details" element
def collapse_sections(self) -> None:
"""Collapses long result sections ("people also asked", "related
searches", etc) into "details" elements
These sections are typically the only sections in the results page that
are structured as <div><h2>Title</h2><div>...</div></div>, so they are
extracted by checking all result divs for h2 children.
have at least RESULT_CHILD_LIMIT (7) child divs within a primary result div.
Returns:
None (The soup object is modified directly)
"""
def pull_child_divs(result_div: BeautifulSoup):
try:
return result_div.findChildren(
'div', recursive=False
)[0].findChildren(
'div', recursive=False)
except IndexError:
return []
if not self.main_divs:
return
question_divs = [_ for _ in self.main_divs.find_all(
'div', recursive=False
) if len(_.find_all('h2')) > 0]
# Loop through results and check for the number of child divs in each
for result in self.main_divs:
result_children = pull_child_divs(result)
if len(result_children) < self.RESULT_CHILD_LIMIT:
continue
if len(question_divs) == 0:
return
# Find and decompose the first element with an inner HTML text value.
# This typically extracts the title of the section (e.g. "Related
# Searches", "People also ask", etc)
label = 'Collapsed Results'
for elem in result_children:
if elem.text:
label = elem.text
elem.decompose()
break
# Wrap section in details element to allow collapse/expand
details = BeautifulSoup(features='html.parser').new_tag('details')
summary = BeautifulSoup(features='html.parser').new_tag('summary')
summary.string = question_divs[0].find('h2').text
question_divs[0].find('h2').decompose()
details.append(summary)
question_divs[0].wrap(details)
# Create the new details element to wrap around the result's
# immediate parent
parent = result_children[0].parent
details = BeautifulSoup(features='html.parser').new_tag('details')
summary = BeautifulSoup(features='html.parser').new_tag('summary')
summary.string = label
details.append(summary)
for question_div in question_divs:
questions = [_ for _ in question_div.find_all(
'div', recursive=True
) if _.text.endswith('?')]
for question in questions:
question['style'] = 'padding: 10px; font-style: italic;'
if parent:
parent.wrap(details)
def update_element_src(self, element: Tag, mime: str) -> None:
"""Encrypts the original src of an element and rewrites the element src