Collapse long result sections into details/summary elements
Sections such as "People also asked" and "Related searches" typically take up a lot of room on the results page, and don't always have the most useful information. This checks for result elements with 7 or more child divs, extracts the section title, and wraps all elements in a "details" element that can be expanded/collapsed by the user. Note that this functionality existed previously (albeit in a less robust form), but because Google changed how it returns search results (switching from <h2> elements for section headers to <span> or <div> elements), the approach to collapsing these sections needed to be updated.
main
parent
d894bd347d
commit
afd01820bb
|
@ -38,6 +38,10 @@ def clean_query(query: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
class Filter:
|
class Filter:
|
||||||
|
# Limit used for determining if a result is a "regular" result or a list
|
||||||
|
# type result (such as "people also asked", "related searches", etc)
|
||||||
|
RESULT_CHILD_LIMIT = 7
|
||||||
|
|
||||||
def __init__(self, user_key: str, mobile=False, config=None) -> None:
|
def __init__(self, user_key: str, mobile=False, config=None) -> None:
|
||||||
if config is None:
|
if config is None:
|
||||||
config = {}
|
config = {}
|
||||||
|
@ -83,7 +87,7 @@ class Filter:
|
||||||
def clean(self, soup) -> BeautifulSoup:
|
def clean(self, soup) -> BeautifulSoup:
|
||||||
self.main_divs = soup.find('div', {'id': 'main'})
|
self.main_divs = soup.find('div', {'id': 'main'})
|
||||||
self.remove_ads()
|
self.remove_ads()
|
||||||
self.fix_question_section()
|
self.collapse_sections()
|
||||||
self.update_styling(soup)
|
self.update_styling(soup)
|
||||||
|
|
||||||
for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
|
for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
|
||||||
|
@ -130,41 +134,54 @@ class Filter:
|
||||||
if has_ad_content(_.text)]
|
if has_ad_content(_.text)]
|
||||||
_ = div.decompose() if len(div_ads) else None
|
_ = div.decompose() if len(div_ads) else None
|
||||||
|
|
||||||
def fix_question_section(self) -> None:
|
def collapse_sections(self) -> None:
|
||||||
"""Collapses the "People Also Asked" section into a "details" element
|
"""Collapses long result sections ("people also asked", "related
|
||||||
|
searches", etc) into "details" elements
|
||||||
|
|
||||||
These sections are typically the only sections in the results page that
|
These sections are typically the only sections in the results page that
|
||||||
are structured as <div><h2>Title</h2><div>...</div></div>, so they are
|
have at least RESULT_CHILD_LIMIT child divs within a primary result div.
|
||||||
extracted by checking all result divs for h2 children.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
None (The soup object is modified directly)
|
None (The soup object is modified directly)
|
||||||
"""
|
"""
|
||||||
|
def pull_child_divs(result_div: BeautifulSoup):
|
||||||
|
try:
|
||||||
|
return result_div.findChildren(
|
||||||
|
'div', recursive=False
|
||||||
|
)[0].findChildren(
|
||||||
|
'div', recursive=False)
|
||||||
|
except IndexError:
|
||||||
|
return []
|
||||||
|
|
||||||
if not self.main_divs:
|
if not self.main_divs:
|
||||||
return
|
return
|
||||||
|
|
||||||
question_divs = [_ for _ in self.main_divs.find_all(
|
# Loop through results and check for the number of child divs in each
|
||||||
'div', recursive=False
|
for result in self.main_divs:
|
||||||
) if len(_.find_all('h2')) > 0]
|
result_children = pull_child_divs(result)
|
||||||
|
if len(result_children) < self.RESULT_CHILD_LIMIT:
|
||||||
|
continue
|
||||||
|
|
||||||
if len(question_divs) == 0:
|
# Find and decompose the first element with an inner HTML text val.
|
||||||
return
|
# This typically extracts the title of the section (i.e. "Related
|
||||||
|
# Searches", "People also ask", etc)
|
||||||
|
label = 'Collapsed Results'
|
||||||
|
for elem in result_children:
|
||||||
|
if elem.text:
|
||||||
|
label = elem.text
|
||||||
|
elem.decompose()
|
||||||
|
break
|
||||||
|
|
||||||
# Wrap section in details element to allow collapse/expand
|
# Create the new details element to wrap around the result's
|
||||||
details = BeautifulSoup(features='html.parser').new_tag('details')
|
# immediate parent
|
||||||
summary = BeautifulSoup(features='html.parser').new_tag('summary')
|
parent = result_children[0].parent
|
||||||
summary.string = question_divs[0].find('h2').text
|
details = BeautifulSoup(features='html.parser').new_tag('details')
|
||||||
question_divs[0].find('h2').decompose()
|
summary = BeautifulSoup(features='html.parser').new_tag('summary')
|
||||||
details.append(summary)
|
summary.string = label
|
||||||
question_divs[0].wrap(details)
|
details.append(summary)
|
||||||
|
|
||||||
for question_div in question_divs:
|
if parent:
|
||||||
questions = [_ for _ in question_div.find_all(
|
parent.wrap(details)
|
||||||
'div', recursive=True
|
|
||||||
) if _.text.endswith('?')]
|
|
||||||
|
|
||||||
for question in questions:
|
|
||||||
question['style'] = 'padding: 10px; font-style: italic;'
|
|
||||||
|
|
||||||
def update_element_src(self, element: Tag, mime: str) -> None:
|
def update_element_src(self, element: Tag, mime: str) -> None:
|
||||||
"""Encrypts the original src of an element and rewrites the element src
|
"""Encrypts the original src of an element and rewrites the element src
|
||||||
|
|
Loading…
Reference in New Issue