From 4324fcd8f8ea274b5e913840fd97c04465feb82a Mon Sep 17 00:00:00 2001 From: Ben Busby <33362396+benbusby@users.noreply.github.com> Date: Sun, 7 Jun 2020 14:06:49 -0600 Subject: [PATCH] Added better multilingual support, updated filter Results page now includes method for switching to "All Languages" from whichever language is specified as the primary in the config (see #74). Also removes the non-Whoogle links from the page footer, leaving only the page navigation controls. Added support for the date range filter on the results page, though I'd still recommend using the ":past (hour/day/week/month/year)" query instead. --- app/filter.py | 19 ++++++++----------- app/request.py | 30 ++++++++++++++++++++++++------ app/routes.py | 1 + 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/app/filter.py b/app/filter.py index be9809b..9944441 100644 --- a/app/filter.py +++ b/app/filter.py @@ -116,14 +116,11 @@ class Filter: for script in soup('script'): script.decompose() - # Remove google's language/time config - st_card = soup.find('div', id='st-card') - if st_card: - st_card.decompose() - - footer = soup.find('div', id='sfooter') + # Update default footer and header + footer = soup.find('footer') if footer: - footer.decompose() + # Remove divs that have multiple links beyond just page navigation + [_.decompose() for _ in footer.find_all('div', recursive=False) if len(_.find_all('a', href=True)) > 2] header = soup.find('header') if header: @@ -144,12 +141,12 @@ class Filter: return question_divs = [_ for _ in self.main_divs.find_all('div', recursive=False) if len(_.find_all('h2')) > 0] - for x in question_divs: - questions = [_ for _ in x.find_all('div', recursive=True) if _.text.endswith('?')] + for question_div in question_divs: + questions = [_ for _ in question_div.find_all('div', recursive=True) if _.text.endswith('?')] for question in questions: question['style'] = 'padding: 10px; font-style: italic;' - def update_element_src(self, element, mimetype): + def 
update_element_src(self, element, mime): element_src = element['src'] if element_src.startswith('//'): element_src = 'https:' + element_src @@ -163,7 +160,7 @@ class Filter: return element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \ - '&type=' + urlparse.quote(mimetype) + '&type=' + urlparse.quote(mime) # TODO: Non-mobile image results link to website instead of image # if not self.mobile: # img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser')) diff --git a/app/request.py b/app/request.py index 38b47b0..398ab71 100644 --- a/app/request.py +++ b/app/request.py @@ -12,7 +12,7 @@ MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0' DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' # Valid query params -VALID_PARAMS = ['tbs', 'tbm', 'start', 'near'] +VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source'] def gen_user_agent(is_mobile): @@ -28,11 +28,22 @@ def gen_user_agent(is_mobile): def gen_query(query, args, config, near_city=None): param_dict = {key: '' for key in VALID_PARAMS} + # Use :past(hour/day/week/month/year) if available # example search "new restaurants :past month" - if ':past' in query: + sub_lang = '' + if ':past' in query and 'tbs' not in args: time_range = str.strip(query.split(':past', 1)[-1]) - param_dict['tbs'] = '&tbs=qdr:' + str.lower(time_range[0]) + param_dict['tbs'] = '&tbs=' + ('qdr:' + str.lower(time_range[0])) + elif 'tbs' in args: + result_tbs = args.get('tbs') + param_dict['tbs'] = '&tbs=' + result_tbs + + # Occasionally the 'tbs' param provided by google also contains a field for 'lr', but formatted + # strangely. This is a (admittedly not very elegant) solution for this. 
+ # Ex/ &tbs=qdr:h,lr:lang_1pl --> the lr param needs to be extracted and have the "1" digit removed in this case + sub_lang = [_ for _ in result_tbs.split(',') if 'lr:' in _] + sub_lang = sub_lang[0][sub_lang[0].find('lr:') + 3:len(sub_lang[0])] if len(sub_lang) > 0 else '' # Ensure search query is parsable query = urlparse.quote(query) @@ -49,13 +60,20 @@ def gen_query(query, args, config, near_city=None): if near_city: param_dict['near'] = '&near=' + urlparse.quote(near_city) - # Set language for results (lr) and interface (hl) - param_dict['lr'] = '&lr=' + config.lang + '&hl=' + config.lang.replace('lang_', '') + # Set language for results (lr) if source isn't set, otherwise use the result + # language param provided by google (but with the strange digit(s) removed) + if 'source' in args: + param_dict['source'] = '&source=' + args.get('source') + param_dict['lr'] = ('&lr=' + ''.join([_ for _ in sub_lang if not _.isdigit()])) if sub_lang else '' + else: + param_dict['lr'] = '&lr=' + config.lang + param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else '' + param_dict['hl'] = '&hl=' + config.lang.replace('lang_', '') param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off') for val in param_dict.values(): - if not val or val is None: + if not val: continue query += val diff --git a/app/routes.py b/app/routes.py index 99e05a6..0139594 100644 --- a/app/routes.py +++ b/app/routes.py @@ -87,6 +87,7 @@ def after_request_func(response): for key in session_list: session.pop(key) + response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate" return response