Added better multilingual support, updated filter
The results page now includes a way to switch to "All Languages" from whichever language is set as the primary in the config (see #74). Also removes the non-Whoogle links from the page footer, leaving only the page navigation controls. Added support for the date range filter on the results page, though I'd still recommend using the ":past <unit>" query instead.
parent 6ec65f8754
commit 4324fcd8f8
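For the ":past <unit>" shortcut mentioned above, a minimal sketch of the unit-to-param mapping used by the gen_query changes below (hour/day/week/month/year each reduce to their first letter); variable names here are illustrative only:

# Sketch: how a ":past <unit>" query maps to Google's date-range (tbs=qdr) param
query = 'new restaurants :past month'

if ':past' in query:
    time_range = query.split(':past', 1)[-1].strip()   # 'month'
    tbs = '&tbs=' + ('qdr:' + time_range[0].lower())    # '&tbs=qdr:m'
    print(tbs)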
@@ -116,14 +116,11 @@ class Filter:
         for script in soup('script'):
             script.decompose()

-        # Remove google's language/time config
-        st_card = soup.find('div', id='st-card')
-        if st_card:
-            st_card.decompose()
-
-        footer = soup.find('div', id='sfooter')
+        # Update default footer and header
+        footer = soup.find('footer')
         if footer:
-            footer.decompose()
+            # Remove divs that have multiple links beyond just page navigation
+            [_.decompose() for _ in footer.find_all('div', recursive=False) if len(_.find_all('a', href=True)) > 2]

         header = soup.find('header')
         if header:
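A self-contained sketch of the new footer handling, assuming BeautifulSoup with html.parser and made-up markup; only direct-child divs with more than two links are dropped, which preserves the page navigation controls:

from bs4 import BeautifulSoup

# Hypothetical markup: one nav div (2 links) and one div of extra links (3 links)
html = '''
<footer>
  <div><a href="/p1">Prev</a><a href="/p2">Next</a></div>
  <div><a href="/a">Help</a><a href="/b">Privacy</a><a href="/c">Terms</a></div>
</footer>
'''
soup = BeautifulSoup(html, 'html.parser')

footer = soup.find('footer')
if footer:
    # Same rule as the diff above: drop direct-child divs with more than two links
    [_.decompose() for _ in footer.find_all('div', recursive=False)
     if len(_.find_all('a', href=True)) > 2]

print(soup.footer)  # only the two-link navigation div remains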
@@ -144,12 +141,12 @@ class Filter:
             return

         question_divs = [_ for _ in self.main_divs.find_all('div', recursive=False) if len(_.find_all('h2')) > 0]
-        for x in question_divs:
-            questions = [_ for _ in x.find_all('div', recursive=True) if _.text.endswith('?')]
+        for question_div in question_divs:
+            questions = [_ for _ in question_div.find_all('div', recursive=True) if _.text.endswith('?')]
             for question in questions:
                 question['style'] = 'padding: 10px; font-style: italic;'

-    def update_element_src(self, element, mimetype):
+    def update_element_src(self, element, mime):
         element_src = element['src']
         if element_src.startswith('//'):
             element_src = 'https:' + element_src
@@ -163,7 +160,7 @@ class Filter:
             return

         element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \
-            '&type=' + urlparse.quote(mimetype)
+            '&type=' + urlparse.quote(mime)
         # TODO: Non-mobile image results link to website instead of image
         # if not self.mobile:
             # img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser'))

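For context on update_element_src above, a rough standalone sketch of the rewrite it performs; encrypt_path is stubbed with plain URL-quoting here purely to show the resulting URL shape (the real method is part of Filter):

from urllib.parse import quote

def rewrite_src(element_src, mime, encrypt_path=lambda url, is_element=False: quote(url, safe='')):
    # Give protocol-relative sources ('//example.com/img.png') an explicit scheme
    if element_src.startswith('//'):
        element_src = 'https:' + element_src
    # Route the element through the local /element proxy, tagging its mime type
    return '/element?url=' + encrypt_path(element_src, is_element=True) + '&type=' + quote(mime)

print(rewrite_src('//example.com/logo.png', 'image/png'))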
@@ -12,7 +12,7 @@ MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
 DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'

 # Valid query params
-VALID_PARAMS = ['tbs', 'tbm', 'start', 'near']
+VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source']


 def gen_user_agent(is_mobile):
@@ -28,11 +28,22 @@ def gen_user_agent(is_mobile):

 def gen_query(query, args, config, near_city=None):
     param_dict = {key: '' for key in VALID_PARAMS}

     # Use :past(hour/day/week/month/year) if available
     # example search "new restaurants :past month"
-    if ':past' in query:
+    sub_lang = ''
+    if ':past' in query and 'tbs' not in args:
         time_range = str.strip(query.split(':past', 1)[-1])
-        param_dict['tbs'] = '&tbs=qdr:' + str.lower(time_range[0])
+        param_dict['tbs'] = '&tbs=' + ('qdr:' + str.lower(time_range[0]))
+    elif 'tbs' in args:
+        result_tbs = args.get('tbs')
+        param_dict['tbs'] = '&tbs=' + result_tbs
+
+        # Occasionally the 'tbs' param provided by google also contains a field for 'lr', but formatted
+        # strangely. This is a (admittedly not very elegant) solution for this.
+        # Ex/ &tbs=qdr:h,lr:lang_1pl --> the lr param needs to be extracted and have the "1" digit removed in this case
+        sub_lang = [_ for _ in result_tbs.split(',') if 'lr:' in _]
+        sub_lang = sub_lang[0][sub_lang[0].find('lr:') + 3:len(sub_lang[0])] if len(sub_lang) > 0 else ''

     # Ensure search query is parsable
     query = urlparse.quote(query)
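The comment block above describes pulling the stray 'lr' field out of a combined tbs value; as a worked example using the comment's own sample, the same split-and-slice yields:

result_tbs = 'qdr:h,lr:lang_1pl'   # sample value from the comment above

sub_lang = [_ for _ in result_tbs.split(',') if 'lr:' in _]          # ['lr:lang_1pl']
sub_lang = sub_lang[0][sub_lang[0].find('lr:') + 3:] if len(sub_lang) > 0 else ''

print(sub_lang)  # 'lang_1pl' -- the digit is stripped later, when 'lr' is set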
@@ -49,13 +60,20 @@ def gen_query(query, args, config, near_city=None):
     if near_city:
         param_dict['near'] = '&near=' + urlparse.quote(near_city)

-    # Set language for results (lr) and interface (hl)
-    param_dict['lr'] = '&lr=' + config.lang + '&hl=' + config.lang.replace('lang_', '')
+    # Set language for results (lr) if source isn't set, otherwise use the result
+    # language param provided by google (but with the strange digit(s) removed)
+    if 'source' in args:
+        param_dict['source'] = '&source=' + args.get('source')
+        param_dict['lr'] = ('&lr=' + ''.join([_ for _ in sub_lang if not _.isdigit()])) if sub_lang else ''
+    else:
+        param_dict['lr'] = '&lr=' + config.lang
+
     param_dict['cr'] = ('&cr=' + config.ctry) if config.ctry else ''
+    param_dict['hl'] = '&hl=' + config.lang.replace('lang_', '')
     param_dict['safe'] = '&safe=' + ('active' if config.safe else 'off')

     for val in param_dict.values():
-        if not val or val is None:
+        if not val:
             continue
         query += val

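And the digit stripping that follows when 'source' is present, shown as a short worked example of the comprehension used in the new 'lr' assignment:

sub_lang = 'lang_1pl'   # value carried over from the tbs parsing above

# Drop the stray digit(s) Google injects: 'lang_1pl' -> 'lang_pl'
lr = ('&lr=' + ''.join([_ for _ in sub_lang if not _.isdigit()])) if sub_lang else ''

print(lr)  # '&lr=lang_pl'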
@@ -87,6 +87,7 @@ def after_request_func(response):
     for key in session_list:
         session.pop(key)

+    response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
     return response


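The last hunk sets a Cache-Control header in the after-request hook; a minimal sketch of that pattern, assuming a Flask app (which the session/response usage suggests):

from flask import Flask

app = Flask(__name__)

@app.after_request
def after_request_func(response):
    # Ask browsers and intermediaries not to cache result pages
    response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
    return response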