From fd85f1573a96b38c5c1917e66aef4a3908aecba2 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Mon, 5 Dec 2022 13:28:29 -0700 Subject: [PATCH] Refactor site alt link replacement Replacing result links and text when site alts are enabled is now part of its own function, and handles replacement of link location and link description separately. Fixes #880 --- app/filter.py | 114 ++++++++++++++++++++++++++++--------------- app/utils/results.py | 6 +-- 2 files changed, 77 insertions(+), 43 deletions(-) diff --git a/app/filter.py b/app/filter.py index cd07318..92e5a9d 100644 --- a/app/filter.py +++ b/app/filter.py @@ -119,6 +119,7 @@ class Filter: page_url='', query='', mobile=False) -> None: + self.soup = None self.config = config self.mobile = mobile self.user_key = user_key @@ -149,46 +150,50 @@ class Filter: return Fernet(self.user_key).encrypt(path.encode()).decode() def clean(self, soup) -> BeautifulSoup: - self.main_divs = soup.find('div', {'id': 'main'}) + self.soup = soup + self.main_divs = self.soup.find('div', {'id': 'main'}) self.remove_ads() self.remove_block_titles() self.remove_block_url() self.collapse_sections() - self.update_css(soup) - self.update_styling(soup) - self.remove_block_tabs(soup) + self.update_css() + self.update_styling() + self.remove_block_tabs() - for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]: + for img in [_ for _ in self.soup.find_all('img') if 'src' in _.attrs]: self.update_element_src(img, 'image/png') - for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]: + for audio in [_ for _ in self.soup.find_all('audio') if 'src' in _.attrs]: self.update_element_src(audio, 'audio/mpeg') - for link in soup.find_all('a', href=True): + for link in self.soup.find_all('a', href=True): self.update_link(link) - input_form = soup.find('form') + if self.config.alts: + self.site_alt_swap() + + input_form = self.soup.find('form') if input_form is not None: input_form['method'] = 'GET' if self.config.get_only else 'POST' # Use a relative URI for submissions input_form['action'] = 'search' # Ensure no extra scripts passed through - for script in soup('script'): + for script in self.soup('script'): script.decompose() # Update default footer and header - footer = soup.find('footer') + footer = self.soup.find('footer') if footer: # Remove divs that have multiple links beyond just page navigation [_.decompose() for _ in footer.find_all('div', recursive=False) if len(_.find_all('a', href=True)) > 3] - header = soup.find('header') + header = self.soup.find('header') if header: header.decompose() - self.remove_site_blocks(soup) - return soup + self.remove_site_blocks(self.soup) + return self.soup def remove_site_blocks(self, soup) -> None: if not self.config.block or not soup.body: @@ -233,7 +238,7 @@ class Filter: if block_url.search(_.attrs['href']) is not None] _ = div.decompose() if len(block_divs) else None - def remove_block_tabs(self, soup) -> None: + def remove_block_tabs(self) -> None: if self.main_divs: for div in self.main_divs.find_all( 'div', @@ -242,7 +247,7 @@ class Filter: _ = div.decompose() else: # when in images tab - for div in soup.find_all( + for div in self.soup.find_all( 'div', attrs={'class': f'{GClasses.images_tbm_tab}'} ): @@ -369,7 +374,7 @@ class Filter: ) + '&type=' + urlparse.quote(mime) ) - def update_css(self, soup) -> None: + def update_css(self) -> None: """Updates URLs used in inline styles to be proxied by Whoogle using the /element endpoint. @@ -378,7 +383,7 @@ class Filter: """ # Filter all