whoogle-search/app/filter.py

import cssutils
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag
from cryptography.fernet import Fernet
from flask import render_template
import urllib.parse as urlparse
from urllib.parse import parse_qs
import re

from app.models.g_classes import GClasses
from app.request import VALID_PARAMS, MAPS_URL
from app.utils.misc import get_abs_url, read_config_bool
from app.utils.results import (
    BLANK_B64, GOOG_IMG, GOOG_STATIC, G_M_LOGO_URL, LOGO_URL, SITE_ALTS,
    has_ad_content, filter_link_args, append_anon_view, get_site_alt,
)
from app.models.endpoint import Endpoint
from app.models.config import Config


# Query-string parameters that may carry the location of a maps link.
# build_map_url checks them in this order and uses the first one present.
MAPS_ARGS = ['q', 'daddr']

# Section titles looked for (as ">title</span") in a result's child divs by
# Filter.collapse_sections; matching results are removed when the
# WHOOGLE_MINIMAL config flag is enabled.
minimal_mode_sections = ['Top stories', 'Images', 'People also ask']
# Substrings that mark a link target as an unsupported Google page.
# Filter.update_link tests each one with "in" against the link's netloc (or
# against the extracted 'q' value for "/url?q=" links).
unsupported_g_pages = [
    'support.google.com',
    'accounts.google.com',
    'policies.google.com',
    'google.com/preferences',
    'google.com/intl',
    'advanced_search',
    'tbm=shop'
]


def extract_q(q_str: str, href: str) -> str:
    """Extracts the 'q' element from a result link. This is typically
    either the link to a result's website, or a string.

    Args:
        q_str: The query string portion of the result link to parse
        href: The full url to check for standalone 'q' elements first,
              rather than parsing the whole query string and then checking.

    Returns:
        str: The 'q' element of the link, or an empty string if the link has
             no standalone 'q' argument or its value is blank
    """
    # Only a standalone 'q' argument counts; args such as 'dq' must not match
    if '&q=' not in href and '?q=' not in href:
        return ''

    # parse_qs drops blank values, and the marker may only occur outside the
    # query string (e.g. in a fragment), so 'q' can still be absent here.
    return parse_qs(q_str).get('q', [''])[0]


def build_map_url(href: str) -> str:
    """Tries to extract known args that explain the location in the url. If a
    location is found, returns the default url with it. Otherwise, returns the
    url unchanged.

    Args:
        href: The full url to check.

    Returns:
        str: The maps url rebuilt around the first known location argument
             found (see MAPS_ARGS), or the url unchanged.
    """
    # Parse only the query component. Feeding the whole url to parse_qs folds
    # the scheme/host/path into the first key (e.g. "https://host/maps?q"),
    # which hides a location argument that comes first in the query string.
    query_args = parse_qs(urlparse.urlparse(href).query)

    # iterate through the known parameters and try to build the url
    for param in MAPS_ARGS:
        if param in query_args:
            # parse_qs returns decoded values, so encode the location again
            # before placing it back into a url
            location = urlparse.quote_plus(query_args[param][0])
            return MAPS_URL + '?q=' + location

    # query could not be extracted, returning unchanged url
    return href


def clean_query(query: str) -> str:
    """Drops the blocked site list from a query, if it has one.

    Args:
        query: The query string

    Returns:
        str: The part of the query that precedes the first "-site:" filter,
             or the whole query when it contains no such filter
    """
    # partition() yields the full string as the head when the marker is absent
    kept, _, _ = query.partition('-site:')
    return kept


def clean_css(css: str, page_url: str) -> str:
    """Removes all remote URLs from a CSS string.

    Args:
        css: The CSS string
        page_url: The url of the page the CSS belongs to; passed to
                  get_abs_url together with each url found in the CSS

    Returns:
        str: The filtered CSS, with URLs proxied through Whoogle
    """
    sheet = cssutils.parseString(css)

    # Map every distinct url to its proxied form. The same url may be
    # reported more than once, so repeats are collapsed here.
    proxied = {}
    for url in cssutils.getUrls(sheet):
        if not url or url in proxied:
            # An empty url would match between every character of the CSS
            continue
        abs_url = get_abs_url(url, page_url)
        if abs_url.startswith('data:'):
            continue
        proxied[url] = f'{Endpoint.element}?type=image/png&url={abs_url}'

    if not proxied:
        return css

    # Substitute in a single pass: consecutive str.replace calls would
    # rewrite urls again inside text that has already been proxied (repeated
    # urls, or one url contained in another). Longest alternatives go first
    # so that a url is never shadowed by a shorter one it starts with.
    pattern = re.compile('|'.join(
        re.escape(url) for url in sorted(proxied, key=len, reverse=True)))
    return pattern.sub(lambda match: proxied[match.group(0)], css)


class Filter:
    # Limit used for determining if a result is a "regular" result or a list
    # type result (such as "people also asked", "related searches", etc)
    RESULT_CHILD_LIMIT = 7

    def __init__(
            self,
            user_key: str,
            config: Config,
            root_url='',
            page_url='',
            query='',
            mobile=False) -> None:
        """Sets up a filter for a single results page.

        Args:
            user_key: The key handed to Fernet when encrypting paths
            config: The user config consulted by the filtering methods
            root_url: Prefix for rewritten element urls; a single trailing
                      slash is dropped
            page_url: The url of the page being filtered, used when
                      rewriting urls found in CSS
            query: The search query, compared against link targets in
                   update_link
            mobile: Whether mobile-specific styling should be applied
        """
        # Normalise the root first, dropping one trailing slash if present
        if root_url.endswith('/'):
            root_url = root_url[:-1]
        self.root_url = root_url

        # Values supplied by the caller
        self.user_key = user_key
        self.config = config
        self.page_url = page_url
        self.query = query
        self.mobile = mobile

        # Internal state
        self.main_divs = ResultSet('')
        self._elements = 0
        self._av = set()

    def __getitem__(self, name):
        """Allows subscript access to attributes, so that ``obj[name]``
        returns the same value as ``getattr(obj, name)``.
        """
        return getattr(self, name)

    @property
    def elements(self) -> int:
        """The number of paths encrypted so far by encrypt_path with
        ``is_element=True``.
        """
        return self._elements

    def encrypt_path(self, path, is_element=False) -> str:
        """Encrypts a path with the user key, to avoid plaintext results in
        logs.

        Args:
            path: The text to encrypt
            is_element: Whether the path belongs to a page element. Element
                        paths are counted separately from text, to allow key
                        regeneration once all items have been served to the
                        user

        Returns:
            str: The encrypted path
        """
        token = Fernet(self.user_key).encrypt(path.encode()).decode()

        if is_element:
            # Only counted once the encryption has succeeded
            self._elements += 1

        return token

    def clean(self, soup) -> BeautifulSoup:
        """Runs all filtering steps over a results page.

        Args:
            soup: The parsed results page

        Returns:
            BeautifulSoup: The same soup object, modified in place
        """
        self.main_divs = soup.find('div', {'id': 'main'})

        # Result removal/collapsing first, then css and styling fixes
        self.remove_ads()
        self.remove_block_titles()
        self.remove_block_url()
        self.collapse_sections()
        self.update_css(soup)
        self.update_styling(soup)
        self.remove_block_tabs(soup)

        # Rewrite the source of every image, then of every audio element
        for tag_name, mime in (('img', 'image/png'), ('audio', 'audio/mpeg')):
            with_src = [tag for tag in soup.find_all(tag_name)
                        if 'src' in tag.attrs]
            for media in with_src:
                self.update_element_src(media, mime)

        for anchor in soup.find_all('a', href=True):
            self.update_link(anchor)

        search_form = soup.find('form')
        if search_form is not None:
            if self.config.get_only:
                search_form['method'] = 'GET'
            else:
                search_form['method'] = 'POST'
            # Use a relative URI for submissions
            search_form['action'] = 'search'

        # Ensure no extra scripts passed through
        for script in soup('script'):
            script.decompose()

        # Update default footer and header
        footer = soup.find('footer')
        if footer:
            # Remove divs that have multiple links beyond just page navigation
            for section in footer.find_all('div', recursive=False):
                if len(section.find_all('a', href=True)) > 3:
                    section.decompose()

        header = soup.find('header')
        if header:
            header.decompose()

        self.remove_site_blocks(soup)
        return soup

    def remove_site_blocks(self, soup) -> None:
        """Strips the "-site:..." block list from any text in the page body
        that contains it.

        Args:
            soup: The parsed results page

        Returns:
            None (The soup object is modified directly)
        """
        if not self.config.block or not soup.body:
            return

        search_string = ' '.join(
            '-site:' + site for site in self.config.block.split(','))

        # The block list is user-provided text, not a pattern: escape it so
        # that characters such as "." or "+" are matched literally, exactly
        # like the str.replace below.
        selected = soup.body.findAll(
            text=re.compile(re.escape(search_string)))

        for result in selected:
            result.string.replace_with(
                result.string.replace(search_string, ''))

    def remove_ads(self) -> None:
        """Drops every div in the main results that contains ad content.

        A div counts as an ad when has_ad_content accepts the text of at
        least one span inside it.

        Returns:
            None (The soup object is modified directly)
        """
        if not self.main_divs:
            return

        # Snapshot the divs before any of them is removed from the tree
        candidates = list(self.main_divs.find_all('div', recursive=True))
        for candidate in candidates:
            ad_spans = [span
                        for span in candidate.find_all('span', recursive=True)
                        if has_ad_content(span.text)]
            if ad_spans:
                candidate.decompose()

    def remove_block_titles(self) -> None:
        """Removes every div in the main results that contains an <h3> whose
        text matches the configured ``block_title`` regular expression.

        Returns:
            None (The soup object is modified directly)
        """
        if not self.main_divs or not self.config.block_title:
            return

        # The pattern lives on the config object; Filter itself has no
        # block_title attribute.
        block_title = re.compile(self.config.block_title)

        for div in list(self.main_divs.find_all('div', recursive=True)):
            titles = div.find_all('h3', recursive=True)
            if any(block_title.search(h3.text) is not None for h3 in titles):
                div.decompose()

    def remove_block_url(self) -> None:
        """Removes every div in the main results that contains a link whose
        href matches the configured ``block_url`` regular expression.

        Returns:
            None (The soup object is modified directly)
        """
        if not self.main_divs or not self.config.block_url:
            return

        # The pattern lives on the config object; Filter itself has no
        # block_url attribute.
        block_url = re.compile(self.config.block_url)

        for div in list(self.main_divs.find_all('div', recursive=True)):
            # Anchors without an href have nothing to match and are skipped
            hrefs = [a.attrs.get('href')
                     for a in div.find_all('a', recursive=True)]
            if any(href is not None and block_url.search(href) is not None
                   for href in hrefs):
                div.decompose()

    def remove_block_tabs(self, soup) -> None:
        """Removes the tab divs from the results page.

        When a main results div was found, the tabs are looked up inside it
        by the main tab class; otherwise the whole page is searched for the
        images tab class.

        Args:
            soup: The parsed results page

        Returns:
            None (The soup object is modified directly)
        """
        if self.main_divs:
            scope, tab_class = self.main_divs, GClasses.main_tbm_tab
        else:
            # when in images tab
            scope, tab_class = soup, GClasses.images_tbm_tab

        for tab in scope.find_all('div', attrs={'class': f'{tab_class}'}):
            tab.decompose()

    def collapse_sections(self) -> None:
        """Collapses long result sections ("people also asked", "related
         searches", etc) into "details" elements

        These sections are typically the only sections in the results page that
        have more than ~5 child divs within a primary result div.

        When the WHOOGLE_MINIMAL config flag is enabled, results holding one
        of the minimal_mode_sections titles or a "Twitter ›" entry are removed
        outright, and long sections are removed instead of collapsed.

        Returns:
            None (The soup object is modified directly)
        """
        minimal_mode = read_config_bool('WHOOGLE_MINIMAL')

        def pull_child_divs(result_div: BeautifulSoup):
            # The divs nested directly inside the first direct child div
            try:
                return result_div.findChildren(
                    'div', recursive=False
                )[0].findChildren(
                    'div', recursive=False)
            except IndexError:
                return []

        if not self.main_divs:
            return

        # Loop through results and check for the number of child divs in each
        for result in self.main_divs.find_all():
            result_children = pull_child_divs(result)

            if minimal_mode:
                child_markup = [str(child) for child in result_children]
                hidden = any(
                    f">{title}</span" in markup
                    for markup in child_markup
                    for title in minimal_mode_sections
                ) or any('Twitter ›' in markup for markup in child_markup)
                if hidden:
                    # Skip the rest of the loop body as well: a removed
                    # result must not be collapsed afterwards
                    result.decompose()
                    continue

            if len(result_children) < self.RESULT_CHILD_LIMIT:
                continue

            # Find and decompose the first element with an inner HTML text val.
            # This typically extracts the title of the section (i.e. "Related
            # Searches", "People also ask", etc)
            # If there is more than one child tag with text, parenthesize
            # the rest except the first
            label = 'Collapsed Results'
            subtitle = None
            for elem in result_children:
                if elem.text:
                    content = list(elem.strings)
                    label = content[0]
                    if len(content) > 1:
                        subtitle = ' (' + ''.join(content[1:]) + ')'
                    elem.decompose()
                    break

            # Find the first parent of the result's children, which the new
            # details element gets wrapped around
            parent = None
            for child in result_children:
                parent = child.parent
                if parent:
                    break

            details = BeautifulSoup(features='html.parser').new_tag('details')
            summary = BeautifulSoup(features='html.parser').new_tag('summary')
            summary.string = label

            if subtitle:
                # Set as a string instead of parsing it as HTML, so that
                # result text containing markup characters stays text
                subtitle_tag = BeautifulSoup(
                    features='html.parser').new_tag('span')
                subtitle_tag.string = subtitle
                summary.append(subtitle_tag)

            details.append(summary)

            if parent and not minimal_mode:
                parent.wrap(details)
            elif parent and minimal_mode:
                # Remove parent element from document if "minimal mode" is
                # enabled
                parent.decompose()

    def update_element_src(self, element: Tag, mime: str, attr='src') -> None:
        """Points an element's source at the "/element" pass-through, with
        the original source encrypted.

        Data URIs are left alone, Google logos are swapped for Whoogle ones
        and other Google image/static sources are blanked out.

        Args:
            element: The tag to update
            mime: The mime type added to the rewritten url
            attr: The attribute holding the source

        Returns:
            None (The soup element is modified directly)

        """
        # Only the first space-separated entry of the attribute is used
        src = element[attr].split(' ')[0]

        if src.startswith('data:'):
            return
        if src.startswith('//'):
            src = 'https:' + src

        if src.startswith(LOGO_URL):
            # Re-brand with Whoogle logo
            logo = BeautifulSoup(
                render_template('logo.html'),
                features='html.parser')
            element.replace_with(logo)
            return

        if src.startswith(G_M_LOGO_URL):
            # Re-brand with single-letter Whoogle logo
            element['src'] = 'static/img/favicon/apple-icon.png'
            element.parent['href'] = 'home'
            return

        if src.startswith(GOOG_IMG) or GOOG_STATIC in src:
            element['src'] = BLANK_B64
            return

        encrypted_src = self.encrypt_path(src, is_element=True)
        element[attr] = (
            f'{self.root_url}/{Endpoint.element}?url={encrypted_src}'
            f'&type={urlparse.quote(mime)}'
        )

    def update_css(self, soup) -> None:
        """Updates URLs used in inline styles to be proxied by Whoogle
        using the /element endpoint.

        Args:
            soup: The parsed results page

        Returns:
            None (The soup element is modified directly)

        """
        # Filter all <style> tags
        for style in soup.find_all('style'):
            # .string is None for a tag that is empty or has more than one
            # child node; there is no single CSS string to rewrite then
            if style.string is None:
                continue
            style.string = clean_css(style.string, self.page_url)

        # TODO: Convert remote stylesheets to style tags and proxy all
        # remote requests
        # for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
            # print(link)

    def update_styling(self, soup) -> None:
        """Applies Whoogle's styling fixes to a results page: replaces
        result CSS classes, removes buttons and svg logos, and adjusts the
        logo, search bar and (on the images tab) body styling.

        Args:
            soup: The parsed results page

        Returns:
            None (The soup object is modified directly)
        """
        # Update CSS classes for result divs
        soup = GClasses.replace_css_classes(soup)

        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = ('display:flex; justify-content:center; '
                             'align-items:center; color:#685e79; '
                             'font-size:18px; ')

        # Fix search bar length on mobile
        try:
            search_bar = soup.find('header').find('form').find('div')
            search_bar['style'] = 'width: 100%;'
        except (AttributeError, TypeError):
            # AttributeError: the page has no header or no form in it.
            # TypeError: the form has no div (item assignment on None).
            pass

        # Fix body max width on images tab
        style = soup.find('style')
        div = soup.find('div', attrs={'class': f'{GClasses.images_tbm_tab}'})
        if style and div and not self.mobile:
            # .string is None for an empty tag; don't format "None" into
            # the stylesheet
            css = style.string or ''
            css_html_tag = (
                'html{'
                'font-family: Roboto, Helvetica Neue, Arial, sans-serif;'
                'font-size: 14px;'
                'line-height: 20px;'
                'text-size-adjust: 100%;'
                'word-wrap: break-word;'
                '}'
            )
            css = f"{css_html_tag}{css}"
            css = re.sub('body{(.*?)}',
                         'body{padding:0 8px;margin:0 auto;max-width:736px;}',
                         css)
            style.string = css

    def update_link(self, link: Tag) -> None:
        """Update internal link paths with encrypted path, otherwise remove
        unnecessary redirects and/or marketing params from the url

        Links to unsupported Google pages (see unsupported_g_pages) are
        handled first and end the method. For all remaining links the href
        is rewritten depending on its form, the "new tab" target is applied
        if configured, and site alternatives are substituted if configured.

        Args:
            link: A bs4 Tag element to inspect and update

        Returns:
            None (the tag is updated directly)

        """
        parsed_link = urlparse.urlparse(link['href'])
        if '/url?q=' in link['href']:
            link_netloc = extract_q(parsed_link.query, link['href'])
        else:
            link_netloc = parsed_link.netloc

        # Remove any elements that direct to unsupported Google pages
        if any(url in link_netloc for url in unsupported_g_pages):
            # FIXME: The "Shopping" tab requires further filtering (see #136)
            # Temporarily removing all links to that tab for now.

            # Replaces the /url google unsupported link to the direct url
            link['href'] = link_netloc
            parent = link.parent

            if 'google.com/preferences?hl=' in link_netloc:
                # Handle case where a search is performed in a different
                # language than what is configured. This usually returns a
                # div with the same classes as normal search results, but with
                # a link to configure language preferences through Google.
                # Since we want all language config done through Whoogle, we
                # can safely decompose this element.
                while parent:
                    p_cls = parent.attrs.get('class') or []
                    if f'{GClasses.result_class_a}' in p_cls:
                        parent.decompose()
                        break
                    parent = parent.parent
            else:
                # Remove cases where google links appear in the footer
                while parent:
                    p_cls = parent.attrs.get('class') or []
                    in_footer = (parent.name == 'footer'
                                 or f'{GClasses.footer}' in p_cls)
                    if in_footer:
                        # Stop at the first footer ancestor; the link only
                        # needs to be (and should only be) decomposed once
                        link.decompose()
                        break
                    parent = parent.parent
            return

        # Replace href with only the intended destination (no "utm" type tags)
        href = link['href'].replace('https://www.google.com', '')
        result_link = urlparse.urlparse(href)
        q = extract_q(result_link.query, href)

        if q.startswith('/') and q not in self.query and 'spell=1' not in href:
            # Internal google links (i.e. mail, maps, etc) should still
            # be forwarded to Google
            link['href'] = 'https://google.com' + q
        elif q.startswith('https://accounts.google.com'):
            # Remove Sign-in link
            link.decompose()
            return
        elif '/search?q=' in href:
            # "li:1" implies the query should be interpreted verbatim,
            # which is accomplished by wrapping the query in double quotes
            if 'li:1' in href:
                q = '"' + q + '"'
            new_search = 'search?q=' + self.encrypt_path(q)

            # Carry over the supported search parameters only
            query_params = parse_qs(urlparse.urlparse(href).query)
            for param in VALID_PARAMS:
                if param not in query_params:
                    continue
                param_val = query_params[param][0]
                new_search += '&' + param + '=' + param_val
            link['href'] = new_search
        elif 'url?q=' in href:
            # Strip unneeded arguments
            link['href'] = filter_link_args(q)

            # Add alternate viewing options for results,
            # if the result doesn't already have an AV link
            netloc = urlparse.urlparse(link['href']).netloc
            if self.config.anon_view and netloc not in self._av:
                self._av.add(netloc)
                append_anon_view(link, self.config)

        else:
            if href.startswith(MAPS_URL):
                # Maps links don't work if a site filter is applied
                link['href'] = build_map_url(link['href'])
            elif (href.startswith('/?') or href.startswith('/search?') or
                  href.startswith('/imgres?')):
                # make sure that tags can be clicked as relative URLs
                link['href'] = href[1:]
            elif href.startswith('/intl/'):
                # do nothing, keep original URL for ToS
                pass
            elif href.startswith('/preferences'):
                # there is no config specific URL, remove this
                link.decompose()
                return
            else:
                link['href'] = href

        if self.config.new_tab and (
            link["href"].startswith("http")
            or link["href"].startswith("imgres?")
        ):
            link["target"] = "_blank"

        # Replace link location if "alts" config is enabled
        if self.config.alts:
            # Search and replace all link descriptions
            # with alternative location
            link['href'] = get_site_alt(link['href'])
            link_desc = link.find_all(
                text=re.compile('|'.join(SITE_ALTS.keys())))
            if len(link_desc) == 0:
                return

            # Replace link description (only the first matching text node,
            # and only the first site found in it that has an alternative)
            link_desc = link_desc[0]
            for site, alt in SITE_ALTS.items():
                if site not in link_desc or not alt:
                    continue
                new_desc = BeautifulSoup(features='html.parser').new_tag('div')
                new_desc.string = str(link_desc).replace(site, alt)
                link_desc.replace_with(new_desc)
                break

    def view_image(self, soup) -> BeautifulSoup:
        """Replaces the soup with a new one that handles mobile results and
        adds the link of the image full res to the results.

        Result items without a link, without exactly one "&imgrefurl="
        separator in the link, or without a thumbnail image are skipped.

        Args:
            soup: A BeautifulSoup object containing the image mobile results.

        Returns:
            BeautifulSoup: The new BeautifulSoup object

        Raises:
            IndexError: If the page has no next-page table or no results
                div (looked up by class name).
        """

        # get some tags that are unchanged between mobile and pc versions
        cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
        next_pages = soup.find_all('table', attrs={'class': "uZgmoc"})[0]

        results = []
        # find results div
        results_div = soup.find_all('div', attrs={'class': "nQvrDb"})[0]
        # find all the results
        results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})

        for item in results_all:
            anchor = item.find('a')
            if anchor is None or not anchor.get('href'):
                # Nothing to link to; skip instead of failing the whole page
                continue

            urls = anchor['href'].split('&imgrefurl=')

            # Skip urls that are not two-element lists
            if len(urls) != 2:
                continue

            thumbnail = anchor.find('img')
            if thumbnail is None or 'src' not in thumbnail.attrs:
                continue

            img_url = urlparse.unquote(urls[0].replace(
                f'/{Endpoint.imgres}?imgurl=', ''))

            # Strip out only the necessary part of the web page link.
            # str.split always returns at least one element, so this cannot
            # raise IndexError.
            web_page = urlparse.unquote(urls[1].split('&')[0])

            img_tbn = urlparse.unquote(thumbnail['src'])

            results.append({
                'domain': urlparse.urlparse(web_page).netloc,
                'img_url': img_url,
                'web_page': web_page,
                'img_tbn': img_tbn
            })

        soup = BeautifulSoup(render_template('imageresults.html',
                                             length=len(results),
                                             results=results,
                                             view_label="View Image"),
                             features='html.parser')

        # replace correction suggested by google object if exists
        if len(cor_suggested):
            soup.find_all(
                'table',
                attrs={'class': "By0U9"}
            )[0].replace_with(cor_suggested[0])
        # replace next page object at the bottom of the page
        soup.find_all('table',
                      attrs={'class': "uZgmoc"})[0].replace_with(next_pages)
        return soup
-												Counter latest result page style changes

Google updated their styling of the result page, which broke some
components of Whoogle's result page styling (namely the result div
backgrounds for dark mode).

The GClasses class has been updated to keep track of what class names
have been updated to, and roll them back to a value that works for
Whoogle. A function was added that loops through new class names and
replaces them with their older counterparts.

											
										
										
											2022-06-10 01:30:55 +03:00
+								import cssutils
-												Improve static typing throughout repo

Eventually this should be part of a separate mypy ci build, but right
now it's just a general guideline. Future commits and PRs should be
validated for static typing wherever possible.

For reference, the testing commands used for this commit were:

mypy --ignore-missing-imports --pretty --disallow-untyped-calls app/
mypy --ignore-missing-imports --pretty --disallow-untyped-calls test/

											
										
										
											2021-03-24 22:13:52 +03:00
+								from bs4 import BeautifulSoup
 								from bs4.element import ResultSet, Tag
-												Added POST search, encrypted query strings, refactoring

The implementation of POST search support comes with a few benefits. The
most apparent is the avoidance of search queries appearing in web server
logs -- instead of the prior GET approach (i.e.
/search?q=my+search+query), using POST requests with the query stored in
the request body creates logs that simply appear as "/search".

Since a lot of relative links are generated in the results page, I came
up with a way to generate a unique key at run time that is used to
encrypt any query strings before sending to the user. This benefits both
regular text queries as well as fetching of image links and means that
web logs will only show an encrypted string where a link or query
string might slip through.

Unfortunately, GET search requests still need to be supported, as it
doesn't seem that Firefox (on iOS) supports loading search engines by
their opensearch.xml file, but instead relies on manual entry of a
search query string. Once this is updated, I'll probably remove GET
request search support.

											
										
										
											2020-04-29 03:19:34 +03:00
+								from cryptography.fernet import Fernet
-												Expand custom css variables and functionality

Squashed commit of the following:

commit 37e22d2945b077a94d9997d064f4355ff8819bae
Author: Ben Busby <benbusby@protonmail.com>
Date:   Mon Apr 5 10:27:05 2021 -0400

    Pass user config to logo template

commit 2406fee05c3e221112fbe802fbf2ecca1df99127
Author: Ben Busby <benbusby@protonmail.com>
Date:   Mon Apr 5 10:24:54 2021 -0400

    Fix incorrect contrast text in dark theme

commit 91dd677e22c2e99819123154e03e9f519f95a9bd
Author: Ben Busby <benbusby@protonmail.com>
Date:   Fri Apr 2 17:21:38 2021 -0400

    Remove inline onclicks, fix svg sizing

commit 91bbf9c0fae36febd6a6a0d8e6a560babe8622d5
Merge: 72637df b1227bd
Author: Ben Busby <benbusby@protonmail.com>
Date:   Fri Apr 2 15:35:37 2021 -0400

    Merge remote-tracking branch 'origin/develop' into custom-css-tweaks

commit 72637df213f4b9e83e4b58fe76973de02f63ec8e
Author: Ben Busby <benbusby@protonmail.com>
Date:   Fri Apr 2 11:38:38 2021 -0400

    Use svg logo w/ custom styling on results pages

commit 666a7ceac4a6e4d3fe1975dcee91e6094b66149e
Author: Ben Busby <benbusby@protonmail.com>
Date:   Fri Apr 2 11:10:37 2021 -0400

    Split whoogle-accent into whoogle-element-bg and whoogle-logo

    See discussion on #247

											
										
										
											2021-04-05 17:37:39 +03:00
+								from flask import render_template
-												Remove wildcard imports (#791)


											
										
										
											2022-06-24 19:51:15 +03:00
+								import urllib.parse as urlparse
 								from urllib.parse import parse_qs
 								import re
-												Counter latest result page style changes

Google updated their styling of the result page, which broke some
components of Whoogle's result page styling (namely the result div
backgrounds for dark mode).

The GClasses class has been updated to keep track of what class names
have been updated to, and roll them back to a value that works for
Whoogle. A function was added that loops through new class names and
replaces them with their older counterparts.

											
										
										
											2022-06-10 01:30:55 +03:00
 								from app.models.g_classes import GClasses
 								from app.request import VALID_PARAMS, MAPS_URL
 								from app.utils.misc import get_abs_url, read_config_bool
-												Remove wildcard imports (#791)


											
										
										
											2022-06-24 19:51:15 +03:00
+								from app.utils.results import (
 								    BLANK_B64, GOOG_IMG, GOOG_STATIC, G_M_LOGO_URL, LOGO_URL, SITE_ALTS,
 								    has_ad_content, filter_link_args, append_anon_view, get_site_alt,
 								)
 								from app.models.endpoint import Endpoint
 								from app.models.config import Config
-												Refactored routes, added filter class for returned results, added dockerignore

											
										
										
											2020-04-10 23:52:27 +03:00
-												Fixes handling of maps (#792)

* fixes map url, e.g. when no q parameter is given

* move maps_args from results to filter where it is used
											
										
										
											2022-06-27 21:33:08 +03:00
+								MAPS_ARGS = ['q', 'daddr']
-												Update minimal mode for new Google formatting (#637)

Google's latest formatting changes broke the modifications made when enabling
`WHOOGLE_MINIMAL`. This updates the result filtering to work with the new
changes.

Fixes #634
											
										
										
											2022-02-02 22:57:05 +03:00
+								minimal_mode_sections = ['Top stories', 'Images', 'People also ask']
-												Use consistent header for all result types (#535)

Introduces a header for switching between result types (i.e. "All", "News",
etc) that is consistent between the different result types. Previously, image
results had a tab header that was formatted in a drastically different manner,
which was jarring when switching from a different result page to the Images
page.

Created a G class enum to reference class names returned in search
results. As noted in the class doc, this should only be used/updated as
a last resort, as class names change frequently. For some instances,
such as replacing the tbm tab, it's a lot easier to just replace by
header name than attempting to replace it based on how the element is
structured.

Also updated a few styles to revert the latest styling changes being
applied by Google.

Co-authored-by: jacr13 <ramos.joao@protonmail.com>
Co-authored-by: Ben Busby <contact@benbusby.com>
											
										
										
											2022-02-07 20:47:25 +03:00
+								unsupported_g_pages = [
 								    'support.google.com',
 								    'accounts.google.com',
 								    'policies.google.com',
 								    'google.com/preferences',
 								    'google.com/intl',
 								    'advanced_search',
 								    'tbm=shop'
 								]
-												Update minimal mode for new Google formatting (#637)

Google's latest formatting changes broke the modifications made when enabling
`WHOOGLE_MINIMAL`. This updates the result filtering to work with the new
changes.

Fixes #634
											
										
										
											2022-02-02 22:57:05 +03:00
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
-												Hotfix: extract only 'q' element from query string

Occasionally the search results will contain links with arguments such
as 'dq', which was being erroneously used in attempts to extract the 'q'
element from query strings. This enforces that only links with '?q=' or
'&q=' (elements with a standalone 'q' arg) will have the element
extracted.

I also refactored the naming of this element once extracted to be just
'q'. Although this seems counterintuitive, it makes a little more sense
since this element is the one we're extracting. It's a vague url arg
name, but it is what it is.

Bump version to 0.5.2 for hotfix release

											
										
										
											2021-05-29 19:21:20 +03:00
def extract_q(q_str: str, href: str) -> str:
    """Extracts the 'q' element from a result link. This is typically
    either the link to a result's website, or a string.

    Args:
        q_str: The query string of the result link to parse
        href: The full url to check for standalone 'q' elements first,
              rather than parsing the whole query string and then checking.

    Returns:
        str: The 'q' element of the link, or an empty string
    """
    # Only a standalone 'q' arg counts. Args that merely end in 'q' (such
    # as 'dq') must not be mistaken for it.
    if '&q=' not in href and '?q=' not in href:
        return ''

    # parse_qs drops args with blank values, so 'q' can still be missing
    # at this point (e.g. "?q=&start=10"). Fall back to the documented
    # empty string instead of raising a KeyError.
    return parse_qs(q_str).get('q', [''])[0]
-												Fixes handling of maps (#792)

* fixes map url, e.g. when no q parameter is given

* move maps_args from results to filter where it is used
											
										
										
											2022-06-27 21:33:08 +03:00
def build_map_url(href: str) -> str:
    """Tries to extract known args that explain the location in the url. If a
    location is found, returns the default url with it. Otherwise, returns the
    url unchanged.

    Args:
        href: The full url to check.

    Returns:
        str: The parsed url, or the url unchanged.
    """
    # parse_qs expects a bare query string. Given a full url it folds
    # everything before the first '=' into the first key (for example
    # 'https://maps.google.com/maps?q'), so a leading location arg would
    # never be matched. Parse only the query component when there is one,
    # and fall back to the raw string for a bare query string.
    query = urlparse.urlsplit(href).query or href
    parsed_args = parse_qs(query)

    # iterate through the known parameters and try build the url
    for param in MAPS_ARGS:
        if param in parsed_args:
            return MAPS_URL + "?q=" + parsed_args[param][0]

    # query could not be extracted returning unchanged url
    return href
-												Add fallback interface/search lang + cleanup

Since the interface language defaults to IP geolocation by google, the
default language is now set to English. Still not sure if this is the
best solution, but at least temporarily should clear up some confusion
for users with instances deployed in countries outside of their own.

Also performed some minor cleanup:
  - Updated name of strip_blocked_sites to clean_query
  - Added clean_query to list of jinja template functions
  - Ensured site block list doesn't contain duplicate filters

											
										
										
											2021-06-04 18:09:30 +03:00
def clean_query(query: str) -> str:
    """Drops the blocked site list from a query, when one is present.

    Everything from the first "-site:" filter onwards is cut off; a query
    without any such filter is returned as is.

    Args:
        query: The query string

    Returns:
        str: The query string without any "-site:..." filters
    """
    filter_start = query.find('-site:')
    if filter_start < 0:
        # No site filters were appended to this query
        return query

    return query[:filter_start]
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								def clean_css(css: str, page_url: str) -> str:
 								    """Removes all remote URLs from a CSS string.
 								    Args:
 								        css: The CSS string
 								    Returns:
 								        str: The filtered CSS, with URLs proxied through Whoogle
 								    """
 								    sheet = cssutils.parseString(css)
 								    urls = cssutils.getUrls(sheet)
 								    for url in urls:
 								        abs_url = get_abs_url(url, page_url)
 								        if abs_url.startswith('data:'):
 								            continue
 								        css = css.replace(
 								            url,
-												Remove "/" before endpoints & tags (#734)

Removes the leading slash before imgres and other endpoints

Fix #733
											
										
										
											2022-04-27 23:25:14 +03:00
+								            f'{Endpoint.element}?type=image/png&url={abs_url}'
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								        )
 								    return css
-												Added testing and ci build, refactored filter class, refactored project structure

											
										
										
											2020-04-16 02:41:53 +03:00
+								class Filter:
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								    # Limit used for determining if a result is a "regular" result or a list
 								    # type result (such as "people also asked", "related searches", etc)
 								    RESULT_CHILD_LIMIT = 7
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								    def __init__(
 								            self,
 								            user_key: str,
 								            config: Config,
 								            root_url='',
 								            page_url='',
-												Ensure searches with a leading slash are treated as queries

A user reported a bug where searches with a leading slash (in this case:
"/e/OS apps") were interpreted as a Google-specific link when clicking
the next page of results.

This was due to the behavior that Google's search results exhibit, where
internal links for pages like support.google.com are delivered with
params like "?q=/support" rather than a direct link. This fixes that
scenario by checking the "q" param value against the user's original
query to ensure they don't match before assuming that the result is
intended as a redirect.

Fixes #776

											
										
										
											2022-06-03 23:03:57 +03:00
+								            query='',
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								            mobile=False) -> None:
-												Fix incorrect request type for image searches

Previously had hardcoded POST requests for all requests that didn't use
the header template (which currently is only the image tab).

Also refactored how the Filter class works. It now requires a valid
Config model to be provided, which is then set up as a class var that
the filtering functions can use as needed, rather than setting specific
values from the config as individual values (which was confusing and
sloppy).

Fixes #561

											
										
										
											2021-12-07 07:39:50 +03:00
+								        self.config = config
-												Minor refactor of filter class, updated tests, fixed html/css, added ua to config

											
										
										
											2020-04-16 19:01:02 +03:00
+								        self.mobile = mobile
-												Switch to single Fernet key per session

This moves away from the previous (messy) approach of using two separate
keys for decrypting text and element URLs separately and regenerating
them for new searches. The current implementation of sessions is not very
reliable, which led to keys being regenerated too soon, which would
break page navigation. Until that can be addressed, the single
key per session approach should work a lot better.

Fixes #250

Fixes #90

											
										
										
											2021-04-01 07:23:30 +03:00
+								        self.user_key = user_key
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								        self.page_url = page_url
-												Ensure searches with a leading slash are treated as queries

A user reported a bug where searches with a leading slash (in this case:
"/e/OS apps") were interpreted as a Google-specific link when clicking
the next page of results.

This was due to the behavior that Google's search results exhibit, where
internal links for pages like support.google.com are delivered with
params like "?q=/support" rather than a direct link. This fixes that
scenario by checking the "q" param value against the user's original
query to ensure they don't match before assuming that the result is
intended as a redirect.

Fixes #776

											
										
										
											2022-06-03 23:03:57 +03:00
+								        self.query = query
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								        self.main_divs = ResultSet('')
 								        self._elements = 0
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								        self._av = set()
-												Added testing and ci build, refactored filter class, refactored project structure

											
										
										
											2020-04-16 02:41:53 +03:00
-												Strip trailing slash on root url in filter

If a trailing slash is defined here, it causes the Whoogle instance to
redirect these element requests back to the home page, causing unwanted
behavior.

											
										
										
											2022-04-20 23:55:19 +03:00
+								        self.root_url = root_url[:-1] if root_url.endswith('/') else root_url
-												Refactoring of user requests and routing

Curl requests and user agent related functionality was moved to its own
request class.

Routes was refactored to only include strictly routing related
functionality.

Filter class was cleaned up (had routing/request related logic in here,
which didn't make sense)

											
										
										
											2020-04-24 05:59:43 +03:00
+								    def __getitem__(self, name):
 								        return getattr(self, name)
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								    @property
 								    def elements(self):
 								        return self._elements
-												Improve static typing throughout repo

Eventually this should be part of a separate mypy ci build, but right
now it's just a general guideline. Future commits and PRs should be
validated for static typing wherever possible.

For reference, the testing commands used for this commit were:

mypy --ignore-missing-imports --pretty --disallow-untyped-calls app/
mypy --ignore-missing-imports --pretty --disallow-untyped-calls test/

											
										
										
											2021-03-24 22:13:52 +03:00
+								    def encrypt_path(self, path, is_element=False) -> str:
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								        # Encrypts path to avoid plaintext results in logs
 								        if is_element:
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
+								            # Element paths are encrypted separately from text, to allow key
 								            # regeneration once all items have been served to the user
-												Switch to single Fernet key per session

This moves away from the previous (messy) approach of using two separate
keys for decrypting text and element URLs separately and regenerating
them for new searches. The current implementation of sessions is not very
reliable, which led to keys being regenerated too soon, which would
break page navigation. Until that can be addressed, the single
key per session approach should work a lot better.

Fixes #250

Fixes #90

											
										
										
											2021-04-01 07:23:30 +03:00
+								            enc_path = Fernet(self.user_key).encrypt(path.encode()).decode()
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								            self._elements += 1
 								            return enc_path
-												Switch to single Fernet key per session

This moves away from the previous (messy) approach of using two separate
keys for decrypting text and element URLs separately and regenerating
them for new searches. The current implementation of sessions is not very
reliable, which led to keys being regenerated too soon, which would
break page navigation. Until that can be addressed, the single
key per session approach should work a lot better.

Fixes #250

Fixes #90

											
										
										
											2021-04-01 07:23:30 +03:00
+								        return Fernet(self.user_key).encrypt(path.encode()).decode()
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
-												Improve static typing throughout repo

Eventually this should be part of a separate mypy ci build, but right
now it's just a general guideline. Future commits and PRs should be
validated for static typing wherever possible.

For reference, the testing commands used for this commit were:

mypy --ignore-missing-imports --pretty --disallow-untyped-calls app/
mypy --ignore-missing-imports --pretty --disallow-untyped-calls test/

											
										
										
											2021-03-24 22:13:52 +03:00
+								    def clean(self, soup) -> BeautifulSoup:
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								        self.main_divs = soup.find('div', {'id': 'main'})
 								        self.remove_ads()
-												Block by result title or url using regex (#473)

Allows blocking search results using a regex filter for either
result title or result url
											
										
										
											2021-10-21 05:01:04 +03:00
+								        self.remove_block_titles()
 								        self.remove_block_url()
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								        self.collapse_sections()
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								        self.update_css(soup)
-												Added POST search, encrypted query strings, refactoring

The implementation of POST search support comes with a few benefits. The
most apparent is the avoidance of search queries appearing in web server
logs -- instead of the prior GET approach (i.e.
/search?q=my+search+query), using POST requests with the query stored in
the request body creates logs that simply appear as "/search".

Since a lot of relative links are generated in the results page, I came
up with a way to generate a unique key at run time that is used to
encrypt any query strings before sending to the user. This benefits both
regular text queries as well as fetching of image links and means that
web logs will only show an encrypted string where a link or query
string might slip through.

Unfortunately, GET search requests still need to be supported, as it
doesn't seem that Firefox (on iOS) supports loading search engines by
their opensearch.xml file, but instead relies on manual entry of a
search query string. Once this is updated, I'll probably remove GET
request search support.

											
										
										
											2020-04-29 03:19:34 +03:00
+								        self.update_styling(soup)
-												Use consistent header for all result types (#535)

Introduces a header for switching between result types (i.e. "All", "News",
etc) that is consistent between the different result types. Previously, image
results had a tab header that was formatted in a drastically different manner,
which was jarring when switching from a different result page to the Images
page.

Created a G class enum to reference class names returned in search
results. As noted in the class doc, this should only be used/updated as
a last resort, as class names change frequently. For some instances,
such as replacing the tbm tab, it's a lot easier to just replace by
header name than attempting to replace it based on how the element is
structured.

Also updated a few styles to revert the latest styling changes being
applied by Google.

Co-authored-by: jacr13 <ramos.joao@protonmail.com>
Co-authored-by: Ben Busby <contact@benbusby.com>
											
										
										
											2022-02-07 20:47:25 +03:00
+								        self.remove_block_tabs(soup)
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
 								        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
 								            self.update_element_src(img, 'image/png')
 								        for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
 								            self.update_element_src(audio, 'audio/mpeg')
 								        for link in soup.find_all('a', href=True):
 								            self.update_link(link)
-												Added POST search, encrypted query strings, refactoring

The implementation of POST search support comes with a few benefits. The
most apparent is the avoidance of search queries appearing in web server
logs -- instead of the prior GET approach (i.e.
/search?q=my+search+query), using POST requests with the query stored in
the request body creates logs that simply appear as "/search".

Since a lot of relative links are generated in the results page, I came
up with a way to generate a unique key at run time that is used to
encrypt any query strings before sending to the user. This benefits both
regular text queries as well as fetching of image links and means that
web logs will only show an encrypted string where a link or query
string might slip through.

Unfortunately, GET search requests still need to be supported, as it
doesn't seem that Firefox (on iOS) supports loading search engines by
their opensearch.xml file, but instead relies on manual entry of a
search query string. Once this is updated, I'll probably remove GET
request search support.

											
										
										
											2020-04-29 03:19:34 +03:00
 								        input_form = soup.find('form')
-												Updated tests, fixed a few bugs

Added opensearch routes test and individual tests for searching via GET
and POST separately.

Fixed incorrect assignment in gen_query.

											
										
										
											2020-04-29 03:59:33 +03:00
+								        if input_form is not None:
-												Fix incorrect request type for image searches

Previously had hardcoded POST requests for all requests that didn't use
the header template (which currently is only the image tab).

Also refactored how the Filter class works. It now requires a valid
Config model to be provided, which is then set up as a class var that
the filtering functions can use as needed, rather than setting specific
values from the config as individual values (which was confusing and
sloppy).

Fixes #561

											
										
										
											2021-12-07 07:39:50 +03:00
+								            input_form['method'] = 'GET' if self.config.get_only else 'POST'
-												Add support for relative search results (#715)

* Relativization of search results

* Fix JavaScript error when opening images

* Replace single-letter logo and remove sign-in link

* Add `WHOOGLE_URL_PREFIX` env var to support relative path redirection

The `WHOOGLE_URL_PREFIX` var can now be set to fix internal app
redirects, such as the `/session` redirect performed on the first visit
to the Whoogle home page.

Co-authored-by: Ben Busby <contact@benbusby.com>
											
										
										
											2022-04-19 00:27:45 +03:00
+								            # Use a relative URI for submissions
 								            input_form['action'] = 'search'
-												Added POST search, encrypted query strings, refactoring

The implementation of POST search support comes with a few benefits. The
most apparent is the avoidance of search queries appearing in web server
logs -- instead of the prior GET approach (i.e.
/search?q=my+search+query), using POST requests with the query stored in
the request body creates logs that simply appear as "/search".

Since a lot of relative links are generated in the results page, I came
up with a way to generate a unique key at run time that is used to
encrypt any query strings before sending to the user. This benefits both
regular text queries as well as fetching of image links and means that
web logs will only show an encrypted string where a link or query
string might slip through.

Unfortunately, GET search requests still need to be supported, as it
doesn't seem that Firefox (on iOS) supports loading search engines by
their opensearch.xml file, but instead relies on manual entry of a
search query string. Once this is updated, I'll probably remove GET
request search support.

											
										
										
											2020-04-29 03:19:34 +03:00
-												Cleaned up filter class, updated js config tool

											
										
										
											2020-04-29 18:46:18 +03:00
+								        # Ensure no extra scripts passed through
 								        for script in soup('script'):
 								            script.decompose()
-												Added better multilingual support, updated filter

Results page now includes method for switching to "All Languages" from
whichever language is specified as the primary in the config (see #74).

Also removes the non-Whoogle links from the page footer, leaving only
the page navigation controls

Added support for the date range filter on the results page, though I'd
still recommend using the ":past <unit>" query instead.

											
										
										
											2020-06-07 23:06:49 +03:00
+								        # Update default footer and header
 								        footer = soup.find('footer')
-												Feature: autocomplete/search suggestions (#72)

Basic autocomplete/search suggestion functionality added

* Adds new GET and POST routes for '/autocomplete' that accept a string query and return an array of suggestions

* Adds new autoscript.js file for handling queries on the main page and results view

* Updated requests class to include autocomplete method

* Updated opensearch template to handle search suggestions

* Added header template to allow for autocomplete on results view

* Updated readme to mention autocomplete feature
											
										
										
											2020-05-24 23:03:11 +03:00
+								        if footer:
-												Added better multilingual support, updated filter

Results page now includes method for switching to "All Languages" from
whichever language is specified as the primary in the config (see #74).

Also removes the non-Whoogle links from the page footer, leaving only
the page navigation controls

Added support for the date range filter on the results page, though I'd
still recommend using the ":past <unit>" query instead.

											
										
										
											2020-06-07 23:06:49 +03:00
+								            # Remove divs that have multiple links beyond just page navigation
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
+								            [_.decompose() for _ in footer.find_all('div', recursive=False)
 								             if len(_.find_all('a', href=True)) > 3]
-												Cleaned up filter class, updated js config tool

											
										
										
											2020-04-29 18:46:18 +03:00
-												Feature: autocomplete/search suggestions (#72)

Basic autocomplete/search suggestion functionality added

* Adds new GET and POST routes for '/autocomplete' that accept a string query and return an array of suggestions

* Adds new autoscript.js file for handling queries on the main page and results view

* Updated requests class to include autocomplete method

* Updated opensearch template to handle search suggestions

* Added header template to allow for autocomplete on results view

* Updated readme to mention autocomplete feature
											
										
										
											2020-05-24 23:03:11 +03:00
+								        header = soup.find('header')
 								        if header:
 								            header.decompose()
-												Clean "Show more results" of all site blocks (#646)


											
										
										
											2022-02-08 20:57:00 +03:00
+								        self.remove_site_blocks(soup)
-												Added testing and ci build, refactored filter class, refactored project structure

											
										
										
											2020-04-16 02:41:53 +03:00
+								        return soup
-												Added POST search, encrypted query strings, refactoring

The implementation of POST search support comes with a few benefits. The
most apparent is the avoidance of search queries appearing in web server
logs -- instead of the prior GET approach (i.e.
/search?q=my+search+query), using POST requests with the query stored in
the request body creates logs that simply appear as "/search".

Since a lot of relative links are generated in the results page, I came
up with a way to generate a unique key at run time that is used to
encrypt any query strings before sending to the user. This benefits both
regular text queries as well as fetching of image links and means that
web logs will only show an encrypted string where a link or query
string might slip through.

Unfortunately, GET search requests still need to be supported, as it
doesn't seem that Firefox (on iOS) supports loading search engines by
their opensearch.xml file, but instead relies on manual entry of a
search query string. Once this is updated, I'll probably remove GET
request search support.

											
										
										
											2020-04-29 03:19:34 +03:00
-												Clean "Show more results" of all site blocks (#646)


											
										
										
											2022-02-08 20:57:00 +03:00
+								    def remove_site_blocks(self, soup) -> None:
-												Check for soup body in `remove_site_blocks` (#651)

Fixes error with `remove_site_blocks` in the Images tab
											
										
										
											2022-02-12 00:42:11 +03:00
+								        if not self.config.block or not soup.body:
-												Clean "Show more results" of all site blocks (#646)


											
										
										
											2022-02-08 20:57:00 +03:00
+								            return
 								        search_string = ' '.join(['-site:' +
 								                                 _ for _ in self.config.block.split(',')])
 								        selected = soup.body.findAll(text=re.compile(search_string))
 								        for result in selected:
 								            result.string.replace_with(result.string.replace(
 								                                       search_string, ''))
-												Improve static typing throughout repo

Eventually this should be part of a separate mypy ci build, but right
now it's just a general guideline. Future commits and PRs should be
validated for static typing wherever possible.

For reference, the testing commands used for this commit were:

mypy --ignore-missing-imports --pretty --disallow-untyped-calls app/
mypy --ignore-missing-imports --pretty --disallow-untyped-calls test/

											
										
										
											2021-03-24 22:13:52 +03:00
+								    def remove_ads(self) -> None:
 								        """Removes ads found in the list of search result divs
 								        Returns:
 								            None (The soup object is modified directly)
 								        """
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								        if not self.main_divs:
-												Fixed filter params, updated search button text

											
										
										
											2020-04-29 19:03:34 +03:00
+								            return
-												Added POST search, encrypted query strings, refactoring

The implementation of POST search support comes with a few benefits. The
most apparent is the avoidance of search queries appearing in web server
logs -- instead of the prior GET approach (i.e.
/search?q=my+search+query), using POST requests with the query stored in
the request body creates logs that simply appear as "/search".

Since a lot of relative links are generated in the results page, I came
up with a way to generate a unique key at run time that is used to
encrypt any query strings before sending to the user. This benefits both
regular text queries as well as fetching of image links and means that
web logs will only show an encrypted string where a link or query
string might slip through.

Unfortunately, GET search requests still need to be supported, as it
doesn't seem that Firefox (on iOS) supports loading search engines by
their opensearch.xml file, but instead relies on manual entry of a
search query string. Once this is updated, I'll probably remove GET
request search support.

											
										
										
											2020-04-29 03:19:34 +03:00
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
+								            div_ads = [_ for _ in div.find_all('span', recursive=True)
 								                       if has_ad_content(_.text)]
 								            _ = div.decompose() if len(div_ads) else None
-												Added POST search, encrypted query strings, refactoring

The implementation of POST search support comes with a few benefits. The
most apparent is the avoidance of search queries appearing in web server
logs -- instead of the prior GET approach (i.e.
/search?q=my+search+query), using POST requests with the query stored in
the request body creates logs that simply appear as "/search".

Since a lot of relative links are generated in the results page, I came
up with a way to generate a unique key at run time that is used to
encrypt any query strings before sending to the user. This benefits both
regular text queries as well as fetching of image links and means that
web logs will only show an encrypted string where a link or query
string might slip through.

Unfortunately, GET search requests still need to be supported, as it
doesn't seem that Firefox (on iOS) supports loading search engines by
their opensearch.xml file, but instead relies on manual entry of a
search query string. Once this is updated, I'll probably remove GET
request search support.

											
										
										
											2020-04-29 03:19:34 +03:00
-												Block by result title or url using regex (#473)

Allows blocking search results using a regex filter for either
result title or result url
											
										
										
											2021-10-21 05:01:04 +03:00
+								    def remove_block_titles(self) -> None:
-												Fix incorrect request type for image searches

Previously had hardcoded POST requests for all requests that didn't use
the header template (which currently is only the image tab).

Also refactored how the Filter class works. It now requires a valid
Config model to be provided, which is then set up as a class var that
the filtering functions can use as needed, rather than setting specific
values from the config as individual values (which was confusing and
sloppy).

Fixes #561

											
										
										
											2021-12-07 07:39:50 +03:00
+								        if not self.main_divs or not self.config.block_title:
-												Block by result title or url using regex (#473)

Allows blocking search results using a regex filter for either
result title or result url
											
										
										
											2021-10-21 05:01:04 +03:00
+								            return
 								        block_title = re.compile(self.block_title)
 								        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
 								            block_divs = [_ for _ in div.find_all('h3', recursive=True)
 								                          if block_title.search(_.text) is not None]
 								            _ = div.decompose() if len(block_divs) else None
 								    def remove_block_url(self) -> None:
-												Fix incorrect request type for image searches

Previously had hardcoded POST requests for all requests that didn't use
the header template (which currently is only the image tab).

Also refactored how the Filter class works. It now requires a valid
Config model to be provided, which is then set up as a class var that
the filtering functions can use as needed, rather than setting specific
values from the config as individual values (which was confusing and
sloppy).

Fixes #561

											
										
										
											2021-12-07 07:39:50 +03:00
+								        if not self.main_divs or not self.config.block_url:
-												Block by result title or url using regex (#473)

Allows blocking search results using a regex filter for either
result title or result url
											
										
										
											2021-10-21 05:01:04 +03:00
+								            return
 								        block_url = re.compile(self.block_url)
 								        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
 								            block_divs = [_ for _ in div.find_all('a', recursive=True)
 								                          if block_url.search(_.attrs['href']) is not None]
 								            _ = div.decompose() if len(block_divs) else None
-												Use consistent header for all result types (#535)

Introduces a header for switching between result types (i.e. "All", "News",
etc) that is consistent between the different result types. Previously, image
results had a tab header that was formatted in a drastically different manner,
which was jarring when switching from a different result page to the Images
page.

Created a G class enum to reference class names returned in search
results. As noted in the class doc, this should only be used/updated as
a last resort, as class names change frequently. For some instances,
such as replacing the tbm tab, it's a lot easier to just replace by
header name than attempting to replace it based on how the element is
structured.

Also updated a few styles to revert the latest styling changes being
applied by Google.

Co-authored-by: jacr13 <ramos.joao@protonmail.com>
Co-authored-by: Ben Busby <contact@benbusby.com>
											
										
										
											2022-02-07 20:47:25 +03:00
+								    def remove_block_tabs(self, soup) -> None:
 								        if self.main_divs:
 								            for div in self.main_divs.find_all(
 								                'div',
 								                attrs={'class': f'{GClasses.main_tbm_tab}'}
 								            ):
 								                _ = div.decompose()
 								        else:
 								            # when in images tab
 								            for div in soup.find_all(
 								                'div',
 								                attrs={'class': f'{GClasses.images_tbm_tab}'}
 								            ):
 								                _ = div.decompose()
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								    def collapse_sections(self) -> None:
 								        """Collapses long result sections ("people also asked", "related
 								         searches", etc) into "details" elements
-												Improve static typing throughout repo

Eventually this should be part of a separate mypy ci build, but right
now it's just a general guideline. Future commits and PRs should be
validated for static typing wherever possible.

For reference, the testing commands used for this commit were:

mypy --ignore-missing-imports --pretty --disallow-untyped-calls app/
mypy --ignore-missing-imports --pretty --disallow-untyped-calls test/

											
										
										
											2021-03-24 22:13:52 +03:00
 								        These sections are typically the only sections in the results page that
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								        have more than ~5 child divs within a primary result div.
-												Improve static typing throughout repo

Eventually this should be part of a separate mypy ci build, but right
now it's just a general guideline. Future commits and PRs should be
validated for static typing wherever possible.

For reference, the testing commands used for this commit were:

mypy --ignore-missing-imports --pretty --disallow-untyped-calls app/
mypy --ignore-missing-imports --pretty --disallow-untyped-calls test/

											
										
										
											2021-03-24 22:13:52 +03:00
 								        Returns:
 								            None (The soup object is modified directly)
 								        """
-												Add WHOOGLE_MINIMAL to docs, tweak min mode logic

Activating minimal mode should also remove all collapsed sections, if
any are found.

WHOOGLE_MINIMAL now documented in readme and app.json (for heroku).

											
										
										
											2021-10-26 19:38:20 +03:00
+								        minimal_mode = read_config_bool('WHOOGLE_MINIMAL')
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								        def pull_child_divs(result_div: BeautifulSoup):
 								            try:
 								                return result_div.findChildren(
 								                    'div', recursive=False
 								                )[0].findChildren(
 								                    'div', recursive=False)
 								            except IndexError:
 								                return []
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								        if not self.main_divs:
-												Add ability to collapse "people also ask"

This adds a step in the filter process to wrap the "people also ask"
section in a <details> element, which automatically collapses the
contents of the section. Clicking/tapping the details element expands
the view as normal.

See #113

											
										
										
											2020-12-15 19:09:48 +03:00
+								            return
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								        # Loop through results and check for the number of child divs in each
-												Fix `collapse_sections` for `MINIMAL_MODE` (#654)


											
										
										
											2022-02-12 00:44:08 +03:00
+								        for result in self.main_divs.find_all():
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								            result_children = pull_child_divs(result)
-												Add WHOOGLE_MINIMAL to docs, tweak min mode logic

Activating minimal mode should also remove all collapsed sections, if
any are found.

WHOOGLE_MINIMAL now documented in readme and app.json (for heroku).

											
										
										
											2021-10-26 19:38:20 +03:00
+								            if minimal_mode:
-												Update minimal mode for new Google formatting (#637)

Google's latest formatting changes broke the modifications made when enabling
`WHOOGLE_MINIMAL`. This updates the result filtering to work with the new
changes.

Fixes #634
											
										
										
											2022-02-02 22:57:05 +03:00
+								                if any(f">{x}</span" in str(s) for s in result_children
 								                   for x in minimal_mode_sections):
 								                    result.decompose()
 								                    continue
-												Fix `collapse_sections` for `MINIMAL_MODE` (#654)


											
										
										
											2022-02-12 00:44:08 +03:00
+								                for s in result_children:
 								                    if ('Twitter ›' in str(s)):
 								                        result.decompose()
 								                        continue
-												Update minimal mode for new Google formatting (#637)

Google's latest formatting changes broke the modifications made when enabling
`WHOOGLE_MINIMAL`. This updates the result filtering to work with the new
changes.

Fixes #634
											
										
										
											2022-02-02 22:57:05 +03:00
+								                if len(result_children) < self.RESULT_CHILD_LIMIT:
-												Add a "minimal mode" for condensing results (#485)

If WHOOGLE_MINIMAL is set, all non-link results are
removed from the view.
											
										
										
											2021-10-26 19:35:12 +03:00
+								                    continue
 								            else:
 								                if len(result_children) < self.RESULT_CHILD_LIMIT:
 								                    continue
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
 								            # Find and decompose the first element with an inner HTML text val.
 								            # This typically extracts the title of the section (i.e. "Related
 								            # Searches", "People also ask", etc)
-												Improve formatting of collapsible infobox (#612)


											
										
										
											2022-01-18 23:47:35 +03:00
+								            # If the element holds more than one text string, wrap
 								            # everything after the first in parentheses
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								            label = 'Collapsed Results'
-												Improve formatting of collapsible infobox (#612)


											
										
										
											2022-01-18 23:47:35 +03:00
+								            subtitle = None
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								            for elem in result_children:
 								                if elem.text:
-												Improve formatting of collapsible infobox (#612)


											
										
										
											2022-01-18 23:47:35 +03:00
+								                    content = list(elem.strings)
 								                    label = content[0]
 								                    if len(content) > 1:
 								                        subtitle = '<span> (' + \
 								                            ''.join(content[1:]) + ')</span>'
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								                    elem.decompose()
 								                    break
 								            # Create the new details element to wrap around the result's
-												Find valid parent element when collapsing result content

Previously if a result element marked for collapsing didn't have a valid
"parent" element, the collapsing was skipped altogether. This loops
through child elements until a valid parent is found (or if one isn't
found, the element will not be collapsed).

											
										
										
											2021-07-04 22:20:19 +03:00
+								            # first parent
 								            parent = None
 								            idx = 0
 								            while not parent and idx < len(result_children):
 								                parent = result_children[idx].parent
 								                idx += 1
-												Add WHOOGLE_MINIMAL to docs, tweak min mode logic

Activating minimal mode should also remove all collapsed sections, if
any are found.

WHOOGLE_MINIMAL now documented in readme and app.json (for heroku).

											
										
										
											2021-10-26 19:38:20 +03:00
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								            details = BeautifulSoup(features='html.parser').new_tag('details')
 								            summary = BeautifulSoup(features='html.parser').new_tag('summary')
 								            summary.string = label
-												Improve formatting of collapsible infobox (#612)


											
										
										
											2022-01-18 23:47:35 +03:00
 								            if subtitle:
 								                soup = BeautifulSoup(subtitle, 'html.parser')
 								                summary.append(soup)
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								            details.append(summary)
-												Add WHOOGLE_MINIMAL to docs, tweak min mode logic

Activating minimal mode should also remove all collapsed sections, if
any are found.

WHOOGLE_MINIMAL now documented in readme and app.json (for heroku).

											
										
										
											2021-10-26 19:38:20 +03:00
+								            if parent and not minimal_mode:
-												Collapse long result sections into details/summary elements

Sections such as "People also asked" and "related searches" typically
take up a lot of room on the results page, and don't always have the
most useful information. This checks for result elements with more than
7 child divs, extracts the section title, and wraps all elements in a
"details" element that can be expanded/collapsed by the user.

Note that this functionality existed previously (albeit not implemented
as well), but due to changes in how Google returns searches (switching
from using <h2> elements for section headers to <span> or <div>
elements), the approach to collapsing these sections needed to be
updated.

											
										
										
											2021-06-24 01:59:57 +03:00
+								                parent.wrap(details)
-												Add WHOOGLE_MINIMAL to docs, tweak min mode logic

Activating minimal mode should also remove all collapsed sections, if
any are found.

WHOOGLE_MINIMAL now documented in readme and app.json (for heroku).

											
										
										
											2021-10-26 19:38:20 +03:00
+								            elif parent and minimal_mode:
 								                # Remove parent element from document if "minimal mode" is
 								                # enabled
 								                parent.decompose()
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								    def update_element_src(self, element: Tag, mime: str, attr='src') -> None:
-												Improve static typing throughout repo

Eventually this should be part of a separate mypy ci build, but right
now it's just a general guideline. Future commits and PRs should be
validated for static typing wherever possible.

For reference, the testing commands used for this commit were:

mypy --ignore-missing-imports --pretty --disallow-untyped-calls app/
mypy --ignore-missing-imports --pretty --disallow-untyped-calls test/

											
										
										
											2021-03-24 22:13:52 +03:00
+								        """Encrypts the original src of an element and rewrites the element src
 								        to use the "/element?src=" pass-through.
 								        Returns:
 								            None (The soup element is modified directly)
 								        """
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								        src = element[attr].split(' ')[0]
-												Ensure G logo doesn't appear in mobile img results

Adds a separate check to remove all images sourced from www.gstatic.com,
which is where the mobile logo in particular is coming from.

											
										
										
											2021-02-20 23:04:32 +03:00
 								        if src.startswith('//'):
 								            src = 'https:' + src
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								        elif src.startswith('data:'):
 								            return
-												Ensure G logo doesn't appear in mobile img results

Adds a separate check to remove all images sourced from www.gstatic.com,
which is where the mobile logo in particular is coming from.

											
										
										
											2021-02-20 23:04:32 +03:00
 								        if src.startswith(LOGO_URL):
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								            # Re-brand with Whoogle logo
-												Expand custom css theming support

Also adds new default dark theme designed by @gripped.

											
										
										
											2021-04-09 18:00:02 +03:00
+								            element.replace_with(BeautifulSoup(
-												Fix incorrect request type for image searches

Previously had hardcoded POST requests for all requests that didn't use
the header template (which currently is only the image tab).

Also refactored how the Filter class works. It now requires a valid
Config model to be provided, which is then set up as a class var that
the filtering functions can use as needed, rather than setting specific
values from the config as individual values (which was confusing and
sloppy).

Fixes #561

											
										
										
											2021-12-07 07:39:50 +03:00
+								                render_template('logo.html'),
-												Expand custom css theming support

Also adds new default dark theme designed by @gripped.

											
										
										
											2021-04-09 18:00:02 +03:00
+								                features='html.parser'))
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								            return
-												Add support for relative search results (#715)

* Relativization of search results

* Fix JavaScript error when opening images

* Replace single-letter logo and remove sign-in link

* Add `WHOOGLE_URL_PREFIX` env var to support relative path redirection

The `WHOOGLE_URL_PREFIX` var can now be set to fix internal app
redirects, such as the `/session` redirect performed on the first visit
to the Whoogle home page.

Co-authored-by: Ben Busby <contact@benbusby.com>
											
										
										
											2022-04-19 00:27:45 +03:00
+								        elif src.startswith(G_M_LOGO_URL):
 								            # Re-brand with single-letter Whoogle logo
 								            element['src'] = 'static/img/favicon/apple-icon.png'
 								            element.parent['href'] = 'home'
 								            return
-												Ensure G logo doesn't appear in mobile img results

Adds a separate check to remove all images sourced from www.gstatic.com,
which is where the mobile logo in particular is coming from.

											
										
										
											2021-02-20 23:04:32 +03:00
+								        elif src.startswith(GOOG_IMG) or GOOG_STATIC in src:
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								            element['src'] = BLANK_B64
 								            return
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								        element[attr] = f'{self.root_url}/{Endpoint.element}?url=' + (
 								            self.encrypt_path(
 								                src,
 								                is_element=True
 								            ) + '&type=' + urlparse.quote(mime)
 								        )
 								    def update_css(self, soup) -> None:
 								        """Rewrites the contents of every inline <style> tag in the soup.

 								        The text of each tag is handed to clean_css along with the current
 								        page url, and replaced with the result so that urls used in inline
 								        styles are proxied by Whoogle using the /element endpoint.

 								        Args:
 								            soup: The parsed document whose <style> tags are updated

 								        Returns:
 								            None (The soup element is modified directly)
 								        """
 								        style_tags = soup.find_all('style')
 								        for tag in style_tags:
 								            original_css = tag.string
 								            tag.string = clean_css(original_css, self.page_url)
 								        # TODO: Convert remote stylesheets (<link rel="stylesheet">) to
 								        # style tags and proxy all remote requests
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
-												Improve static typing throughout repo

Eventually this should be part of a separate mypy ci build, but right
now it's just a general guideline. Future commits and PRs should be
validated for static typing wherever possible.

For reference, the testing commands used for this commit were:

mypy --ignore-missing-imports --pretty --disallow-untyped-calls app/
mypy --ignore-missing-imports --pretty --disallow-untyped-calls test/

											
										
										
											2021-03-24 22:13:52 +03:00
+								    def update_styling(self, soup) -> None:
-												Counter latest result page style changes

Google updated their styling of the result page, which broke some
components of Whoogle's result page styling (namely the result div
backgrounds for dark mode).

The GClasses class has been updated to keep track of what class names
have been updated to, and roll them back to a value that works for
Whoogle. A function was added that loops through new class names and
replaces them with their older counterparts.

											
										
										
											2022-06-10 01:30:55 +03:00
+								        # Update CSS classes for result divs
 								        soup = GClasses.replace_css_classes(soup)
-												Added POST search, encrypted query strings, refactoring

The implementation of POST search support comes with a few benefits. The
most apparent is the avoidance of search queries appearing in web server
logs -- instead of the prior GET approach (i.e.
/search?q=my+search+query), using POST requests with the query stored in
the request body creates logs that simply appear as "/search".

Since a lot of relative links are generated in the results page, I came
up with a way to generate a unique key at run time that is used to
encrypt any query strings before sending to the user. This benefits both
regular text queries as well as fetching of image links and means that
web logs will only show an encrypted string where a link or query
string might slip through.

Unfortunately, GET search requests still need to be supported, as it
doesn't seem that Firefox (on iOS) supports loading search engines by
their opensearch.xml file, but instead relies on manual entry of a
search query string. Once this is updated, I'll probably remove GET
request search support.

											
										
										
											2020-04-29 03:19:34 +03:00
+								        # Remove unnecessary button(s)
 								        for button in soup.find_all('button'):
 								            button.decompose()
 								        # Remove svg logos
 								        for svg in soup.find_all('svg'):
 								            svg.decompose()
 								        # Update logo
 								        logo = soup.find('a', {'class': 'l'})
 								        if logo and self.mobile:
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
+								            logo['style'] = ('display:flex; justify-content:center; '
 								                             'align-items:center; color:#685e79; '
 								                             'font-size:18px; ')
-												Added POST search, encrypted query strings, refactoring

The implementation of POST search support comes with a few benefits. The
most apparent is the avoidance of search queries appearing in web server
logs -- instead of the prior GET approach (i.e.
/search?q=my+search+query), using POST requests with the query stored in
the request body creates logs that simply appear as "/search".

Since a lot of relative links are generated in the results page, I came
up with a way to generate a unique key at run time that is used to
encrypt any query strings before sending to the user. This benefits both
regular text queries as well as fetching of image links and means that
web logs will only show an encrypted string where a link or query
string might slip through.

Unfortunately, GET search requests still need to be supported, as it
doesn't seem that Firefox (on iOS) supports loading search engines by
their opensearch.xml file, but instead relies on manual entry of a
search query string. Once this is updated, I'll probably remove GET
request search support.

											
										
										
											2020-04-29 03:19:34 +03:00
 								        # Fix search bar length on mobile
 								        try:
 								            search_bar = soup.find('header').find('form').find('div')
 								            search_bar['style'] = 'width: 100%;'
 								        except AttributeError:
 								            pass
-												Use consistent header for all result types (#535)

Introduces a header for switching between result types (i.e. "All", "News",
etc) that is consistent between the different result types. Previously, image
results had a tab header that was formatted in a drastically different manner,
which was jarring when switching from a different result page to the Images
page.

Created a G class enum to reference class names returned in search
results. As noted in the class doc, this should only be used/updated as
a last resort, as class names change frequently. For some instances,
such as replacing the tbm tab, it's a lot easier to just replace by
header name than attempting to replace it based on how the element is
structured.

Also updated a few styles to revert the latest styling changes being
applied by Google.

Co-authored-by: jacr13 <ramos.joao@protonmail.com>
Co-authored-by: Ben Busby <contact@benbusby.com>
											
										
										
											2022-02-07 20:47:25 +03:00
+								        # Fix body max width on images tab
 								        style = soup.find('style')
 								        div = soup.find('div', attrs={'class': f'{GClasses.images_tbm_tab}'})
 								        if style and div and not self.mobile:
 								            css = style.string
 								            css_html_tag = (
 								                'html{'
 								                'font-family: Roboto, Helvetica Neue, Arial, sans-serif;'
 								                'font-size: 14px;'
 								                'line-height: 20px;'
 								                'text-size-adjust: 100%;'
 								                'word-wrap: break-word;'
 								                '}'
 								            )
 								            css = f"{css_html_tag}{css}"
 								            css = re.sub('body{(.*?)}',
 								                         'body{padding:0 8px;margin:0 auto;max-width:736px;}',
 								                         css)
 								            style.string = css
-												Improve static typing throughout repo

Eventually this should be part of a separate mypy ci build, but right
now it's just a general guideline. Future commits and PRs should be
validated for static typing wherever possible.

For reference, the testing commands used for this commit were:

mypy --ignore-missing-imports --pretty --disallow-untyped-calls app/
mypy --ignore-missing-imports --pretty --disallow-untyped-calls test/

											
										
										
											2021-03-24 22:13:52 +03:00
+								    def update_link(self, link: Tag) -> None:
 								        """Update internal link paths with encrypted path, otherwise remove
 								        unnecessary redirects and/or marketing params from the url
 								        Args:
 								            link: A bs4 Tag element to inspect and update
 								        Returns:
 								            None (the tag is updated directly)
 								        """
-												Remove google prefs link for mismatched language queries

Queries performed in a different language than what is configured
contain a result div that prompts the user to configure their language
preferences using google's preferences page.

Since we want all language configuration to occur on Whoogle only, we
can safely remove this result div.

Fixes #444
Fixes #386

											
										
										
											2022-08-01 22:46:06 +03:00
+								        parsed_link = urlparse.urlparse(link['href'])
 								        link_netloc = ''
 								        if '/url?q=' in link['href']:
 								            link_netloc = extract_q(parsed_link.query, link['href'])
 								        else:
 								            link_netloc = parsed_link.netloc
-												Improve G page distinction between footer and results

Pages in the Whoogle footer that by default route to Google pages were
previously being removed, but this caused results that also routed to similar
pages to no longer be accessible. This was due to the removal of the
'/url' endpoint that Google uses for each result.

To fix this, the result link is now parsed so that the domain of the
result can be checked against the disallowed G page list. Since results
are delivered in a "/url?q=<domain>" format -- even for pages to
Google's own products -- and the footer links are formatted as
"<product>.google.com", footer links are removed and result links are
parsed correctly.

Fixes #747

											
										
										
											2022-05-16 18:53:48 +03:00
-												Use consistent header for all result types (#535)

Introduces a header for switching between result types (i.e. "All", "News",
etc) that is consistent between the different result types. Previously, image
results had a tab header that was formatted in a drastically different manner,
which was jarring when switching from a different result page to the Images
page.

Created a G class enum to reference class names returned in search
results. As noted in the class doc, this should only be used/updated as
a last resort, as class names change frequently. For some instances,
such as replacing the tbm tab, it's a lot easier to just replace by
header name than attempting to replace it based on how the element is
structured.

Also updated a few styles to revert the latest styling changes being
applied by Google.

Co-authored-by: jacr13 <ramos.joao@protonmail.com>
Co-authored-by: Ben Busby <contact@benbusby.com>
											
										
										
											2022-02-07 20:47:25 +03:00
+								        # Remove any elements that direct to unsupported Google pages
-												Improve G page distinction between footer and results

Pages in the Whoogle footer that by default route to Google pages were
previously being removed, but this caused results that also routed to similar
pages to no longer be accessible. This was due to the removal of the
'/url' endpoint that Google uses for each result.

To fix this, the result link is now parsed so that the domain of the
result can be checked against the disallowed G page list. Since results
are delivered in a "/url?q=<domain>" format -- even for pages to
Google's own products -- and the footer links are formatted as
"<product>.google.com", footer links are removed and result links are
parsed correctly.

Fixes #747

											
										
										
											2022-05-16 18:53:48 +03:00
+								        if any(url in link_netloc for url in unsupported_g_pages):
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
+								            # FIXME: The "Shopping" tab requires further filtering (see #136)
-												Fix improper header styling, remove shopping tab links

The header template was using Google's classes for the "Whoogle" logo,
which meant keeping up with their list of colors used in the logo. The
template was updated to only ever use the Whoogle logo color.
Accordingly, the logo specific styling in filter.py was removed, since
it is no longer needed.

Also removes all links to the shopping tab, as it seems that the
majority of the links to items are Google specific links (usually
google.com/aclk links without any discernible param for determining the
true location for the link). The shopping page should be addressed
separately with unique filtering/formatting. Further tracking of this
task will be followed in #136.

											
										
										
											2020-10-25 20:52:30 +03:00
+								            # Temporarily removing all links to that tab for now.
-												Fix bad internal redirection for google links (#850)


											
										
										
											2022-09-20 20:10:27 +03:00
 								            # Replaces the /url google unsupported link to the direct url
 								            link['href'] = link_netloc
-												Only remove G links in footer

Links that were directed at G domains were previously removed
universally, when really they only needed to be removed from the footer
to reduce possible confusion caused by mixed Whoogle and G links.

Fixes #656

											
										
										
											2022-03-01 22:48:33 +03:00
+								            parent = link.parent
-												Fix bad internal redirection for google links (#850)


											
										
										
											2022-09-20 20:10:27 +03:00
-												Remove google prefs link for mismatched language queries

Queries performed in a different language than what is configured
contain a result div that prompts the user to configure their language
preferences using google's preferences page.

Since we want all language configuration to occur on Whoogle only, we
can safely remove this result div.

Fixes #444
Fixes #386

											
										
										
											2022-08-01 22:46:06 +03:00
+								            if 'google.com/preferences?hl=' in link_netloc:
 								                # Handle case where a search is performed in a different
 								                # language than what is configured. This usually returns a
 								                # div with the same classes as normal search results, but with
 								                # a link to configure language preferences through Google.
 								                # Since we want all language config done through Whoogle, we
 								                # can safely decompose this element.
 								                while parent:
 								                    p_cls = parent.attrs.get('class') or []
 								                    if f'{GClasses.result_class_a}' in p_cls:
 								                        parent.decompose()
 								                        break
 								                    parent = parent.parent
 								            else:
 								                # Remove cases where google links appear in the footer
 								                while parent:
 								                    p_cls = parent.attrs.get('class') or []
 								                    if parent.name == 'footer' or f'{GClasses.footer}' in p_cls:
 								                        link.decompose()
 								                    parent = parent.parent
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								            return
-												Use consistent header for all result types (#535)

Introduces a header for switching between result types (i.e. "All", "News",
etc) that is consistent between the different result types. Previously, image
results had a tab header that was formatted in a drastically different manner,
which was jarring when switching from a different result page to the Images
page.

Created a G class enum to reference class names returned in search
results. As noted in the class doc, this should only be used/updated as
a last resort, as class names change frequently. For some instances,
such as replacing the tbm tab, it's a lot easier to just replace by
header name than attempting to replace it based on how the element is
structured.

Also updated a few styles to revert the latest styling changes being
applied by Google.

Co-authored-by: jacr13 <ramos.joao@protonmail.com>
Co-authored-by: Ben Busby <contact@benbusby.com>
											
										
										
											2022-02-07 20:47:25 +03:00
+								        # Replace href with only the intended destination (no "utm" type tags)
 								        href = link['href'].replace('https://www.google.com', '')
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								        result_link = urlparse.urlparse(href)
-												Hotfix: extract only 'q' element from query string

Occasionally the search results will contain links with arguments such
as 'dq', which was being erroneously used in attempts to extract the 'q'
element from query strings. This enforces that only links with '?q=' or
'&q=' (elements with a standalone 'q' arg) will have the element
extracted.

I also refactored the naming of this element once extracted to be just
'q'. Although this seems counterintuitive, it makes a little more sense
since this element is the one we're extracting. It's a vague url arg
name, but it is what it is.

Bump version to 0.5.2 for hotfix release

											
										
										
											2021-05-29 19:21:20 +03:00
+								        q = extract_q(result_link.query, href)
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
-												Fix redirect for misspelled queries starting with `/`

Fixes #818

											
										
										
											2022-08-01 21:12:55 +03:00
+								        if q.startswith('/') and q not in self.query and 'spell=1' not in href:
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
+								            # Internal google links (i.e. mail, maps, etc) should still
 								            # be forwarded to Google
-												Hotfix: extract only 'q' element from query string

Occasionally the search results will contain links with arguments such
as 'dq', which was being erroneously used in attempts to extract the 'q'
element from query strings. This enforces that only links with '?q=' or
'&q=' (elements with a standalone 'q' arg) will have the element
extracted.

I also refactored the naming of this element once extracted to be just
'q'. Although this seems counterintuitive, it makes a little more sense
since this element is the one we're extracting. It's a vague url arg
name, but it is what it is.

Bump version to 0.5.2 for hotfix release

											
										
										
											2021-05-29 19:21:20 +03:00
+								            link['href'] = 'https://google.com' + q
-												Add support for relative search results (#715)

* Relativization of search results

* Fix JavaScript error when opening images

* Replace single-letter logo and remove sign-in link

* Add `WHOOGLE_URL_PREFIX` env var to support relative path redirection

The `WHOOGLE_URL_PREFIX` var can now be set to fix internal app
redirects, such as the `/session` redirect performed on the first visit
to the Whoogle home page.

Co-authored-by: Ben Busby <contact@benbusby.com>
											
										
										
											2022-04-19 00:27:45 +03:00
+								        elif q.startswith('https://accounts.google.com'):
 								            # Remove Sign-in link
 								            link.decompose()
 								            return
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								        elif '/search?q=' in href:
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
+								            # "li:1" implies the query should be interpreted verbatim,
 								            # which is accomplished by wrapping the query in double quotes
-												Privacy respecting alternatives in results view (#106)

Full implementation of social media alt redirects (twitter/youtube/instagram -> nitter/invidious/bibliogram) depending on configuration.

Verbatim search and option to ignore search autocorrect are now supported as well.

Also cleaned up the javascript side of whoogle config so that it now
uses arrays of available fields for parsing config values instead of manually assigning each
one to a variable.

This doesn't include support for Google Maps -> Open Street Maps, that
seems a bit more involved than the social media redirects were, so it
should likely be a separate effort.
											
										
										
											2020-07-26 20:53:59 +03:00
+								            if 'li:1' in href:
-												Hotfix: extract only 'q' element from query string

Occasionally the search results will contain links with arguments such
as 'dq', which was being erroneously used in attempts to extract the 'q'
element from query strings. This enforces that only links with '?q=' or
'&q=' (elements with a standalone 'q' arg) will have the element
extracted.

I also refactored the naming of this element once extracted to be just
'q'. Although this seems counterintuitive, it makes a little more sense
since this element is the one we're extracting. It's a vague url arg
name, but it is what it is.

Bump version to 0.5.2 for hotfix release

											
										
										
											2021-05-29 19:21:20 +03:00
+								                q = '"' + q + '"'
 								            new_search = 'search?q=' + self.encrypt_path(q)
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
 								            query_params = parse_qs(urlparse.urlparse(href).query)
 								            for param in VALID_PARAMS:
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
+								                if param not in query_params:
 								                    continue
 								                param_val = query_params[param][0]
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								                new_search += '&' + param + '=' + param_val
 								            link['href'] = new_search
 								        elif 'url?q=' in href:
 								            # Strip unneeded arguments
-												Hotfix: extract only 'q' element from query string

Occasionally the search results will contain links with arguments such
as 'dq', which was being erroneously used in attempts to extract the 'q'
element from query strings. This enforces that only links with '?q=' or
'&q=' (elements with a standalone 'q' arg) will have the element
extracted.

I also refactored the naming of this element once extracted to be just
'q'. Although this seems counterintuitive, it makes a little more sense
since this element is the one we're extracting. It's a vague url arg
name, but it is what it is.

Bump version to 0.5.2 for hotfix release

											
										
										
											2021-05-29 19:21:20 +03:00
+								            link['href'] = filter_link_args(q)
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
-												Support proxying results through Whoogle (aka "anonymous view") (#682)

* Expand `/window` endpoint to behave like a proxy

The `/window` endpoint was previously used as a type of proxy, but only
for removing Javascript from the result page. This expands the existing
functionality to allow users to proxy search result pages (with or without
Javascript) through their Whoogle instance.

* Implement filtering of remote content from css

* Condense NoJS feature into Anonymous View

Enabling NoJS now removes Javascript from the Anonymous View, rather
than creating a separate option.

* Exclude 'data:' urls from filter, add translations

The 'data:' url must be allowed in results to view certain elements on
the page, such as stars for review based results.

Add translations for the remaining languages.

* Add cssutils to requirements
											
										
										
											2022-04-13 20:29:07 +03:00
+								            # Add alternate viewing options for results,
 								            # if the result doesn't already have an AV link
 								            netloc = urlparse.urlparse(link['href']).netloc
 								            if self.config.anon_view and netloc not in self._av:
 								                self._av.add(netloc)
 								                append_anon_view(link, self.config)
-												Only open external links in a new tab (#380)


											
										
										
											2021-08-24 18:06:41 +03:00
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
+								        else:
-												Hotfix: remove site filter for maps links

The new site filter breaks links to Maps results, so filter.py needed
to be updated to handle these links as a unique case. A new method was
introduced to easily remove any "-site:..." filters from the query,
which is now also used to format queries in the header template rather
than manually removing the blocked site list within the template itself.

Bumps version to 0.5.1 for releasing the bugfix

Fixes #329

											
										
										
											2021-05-27 19:01:57 +03:00
+								            if href.startswith(MAPS_URL):
 								                # Maps links don't work if a site filter is applied
-												Fixes handling of maps (#792)

* fixes map url, e.g. when no q parameter is given

* move maps_args from results to filter where it is used
											
										
										
											2022-06-27 21:33:08 +03:00
+								                link['href'] = build_map_url(link['href'])
-												Remove "/" before endpoints & tags (#734)

Removes the leading slash before imgres and other endpoints

Fix #733
											
										
										
											2022-04-27 23:25:14 +03:00
+								            elif (href.startswith('/?') or href.startswith('/search?') or
 								                  href.startswith('/imgres?')):
-												Add support for relative search results (#715)

* Relativization of search results

* Fix JavaScript error when opening images

* Replace single-letter logo and remove sign-in link

* Add `WHOOGLE_URL_PREFIX` env var to support relative path redirection

The `WHOOGLE_URL_PREFIX` var can now be set to fix internal app
redirects, such as the `/session` redirect performed on the first visit
to the Whoogle home page.

Co-authored-by: Ben Busby <contact@benbusby.com>
											
										
										
											2022-04-19 00:27:45 +03:00
+								                # make sure that tags can be clicked as relative URLs
 								                link['href'] = href[1:]
 								            elif href.startswith('/intl/'):
 								                # do nothing, keep original URL for ToS
 								                pass
 								            elif href.startswith('/preferences'):
 								                # there is no config specific URL, remove this
 								                link.decompose()
 								                return
-												Hotfix: remove site filter for maps links

The new site filter breaks links to Maps results, so filter.py needed
to be updated to handle these links as a unique case. A new method was
introduced to easily remove any "-site:..." filters from the query,
which is now also used to format queries in the header template rather
than manually removing the blocked site list within the template itself.

Bumps version to 0.5.1 for releasing the bugfix

Fixes #329

											
										
										
											2021-05-27 19:01:57 +03:00
+								            else:
 								                link['href'] = href
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
-												Fix image links not being opened in new tab (#790)

The majority of image links and links that are not handled by whoogle are not
opening in new tabs; this allows links that are not related to the application
to open in new tabs.
											
										
										
											2022-06-24 19:50:14 +03:00
+								        if self.config.new_tab and (
 								            link["href"].startswith("http")
 								            or link["href"].startswith("imgres?")
 								        ):
 								            link["target"] = "_blank"
-												Privacy respecting alternatives in results view (#106)

Full implementation of social media alt redirects (twitter/youtube/instagram -> nitter/invidious/bibliogram) depending on configuration.

Verbatim search and option to ignore search autocorrect are now supported as well.

Also cleaned up the javascript side of whoogle config so that it now
uses arrays of available fields for parsing config values instead of manually assigning each
one to a variable.

This doesn't include support for Google Maps -> Open Street Maps, that
seems a bit more involved than the social media redirects were, so it
should likely be a separate effort.
											
										
										
											2020-07-26 20:53:59 +03:00
+								        # Replace link location if "alts" config is enabled
-												Fix incorrect request type for image searches

Previously had hardcoded POST requests for all requests that didn't use
the header template (which currently is only the image tab).

Also refactored how the Filter class works. It now requires a valid
Config model to be provided, which is then set up as a class var that
the filtering functions can use as needed, rather than setting specific
values from the config as individual values (which was confusing and
sloppy).

Fixes #561

											
										
										
											2021-12-07 07:39:50 +03:00
+								        if self.config.alts:
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
+								            # Search and replace all link descriptions
 								            # with alternative location
-												Privacy respecting alternatives in results view (#106)

Full implementation of social media alt redirects (twitter/youtube/instagram -> nitter/invidious/bibliogram) depending on configuration.

Verbatim search and option to ignore search autocorrect are now supported as well.

Also cleaned up the javascript side of whoogle config so that it now
uses arrays of available fields for parsing config values instead of manually assigning each
one to a variable.

This doesn't include support for Google Maps -> Open Street Maps, that
seems a bit more involved than the social media redirects were, so it
should likely be a separate effort.
											
										
										
											2020-07-26 20:53:59 +03:00
+								            link['href'] = get_site_alt(link['href'])
-												PEP-8: Fix formatting issues, add CI workflow (#161)

Enforces PEP-8 formatting for all python code

Adds a github action build for checking pep8 formatting using pycodestyle
											
										
										
											2020-12-18 00:06:47 +03:00
+								            link_desc = link.find_all(
 								                text=re.compile('|'.join(SITE_ALTS.keys())))
-												Privacy respecting alternatives in results view (#106)

Full implementation of social media alt redirects (twitter/youtube/instagram -> nitter/invidious/bibliogram) depending on configuration.

Verbatim search and option to ignore search autocorrect are now supported as well.

Also cleaned up the javascript side of whoogle config so that it now
uses arrays of available fields for parsing config values instead of manually assigning each
one to a variable.

This doesn't include support for Google Maps -> Open Street Maps, that
seems a bit more involved than the social media redirects were, so it
should likely be a separate effort.
											
										
										
											2020-07-26 20:53:59 +03:00
+								            if len(link_desc) == 0:
 								                return
-												Project refactor (#85)

* Major refactor of requests and session management

- Switches from pycurl to requests library
  - Allows for less janky decoding, especially with non-latin character
  sets
- Adds session level management of user configs
  - Allows for each session to set its own config (people are probably
  going to complain about this, though not sure if it'll be the same
  number of people who are upset that their friends/family have to share
  their config)
- Updates key gen/regen to more aggressively swap out keys after each
request

* Added ability to save/load configs by name

- New PUT method for config allows changing config with specified name
- New methods in js controller to handle loading/saving of configs

* Result formatting and removal of unused elements

- Fixed question section formatting from results page (added appropriate
padding and made questions styled as italic)
- Removed user agent display from main config settings

* Minor change to button label

* Fixed issue with "de-pickling" of flask session

Having a gitignore-everything ("*") file within a flask session folder seems to cause a
weird bug where the state of the app becomes unusable from continuously
trying to prune files listed in the gitignore (and it can't prune '*').

* Switched to pickling saved configs

* Updated ad/sponsored content filter and conf naming

Configs are now named with a .conf extension to allow for easier manual
cleanup/modification of named config files

Sponsored content now removed by basic string matching of span content

* Version bump to 0.2.0

* Fixed request.send return style
											
										
										
											2020-06-02 21:54:47 +03:00
-												Update domains used for scribe.rip replacements

The levelup.gitconnected.com site is a Medium site that can also be
replaced with scribe.rip whenever privacy respecting site alternatives
are enabled in the config.

Also modified how link descriptions are updated when that config is
enabled (before it was missing replacements on quite a few
descriptions).

											
										
										
											2021-10-24 08:23:37 +03:00
+								            # Replace link description
 								            link_desc = link_desc[0]
 								            for site, alt in SITE_ALTS.items():
-												Ignore blank alts if site alt config is enabled

If the alt for a particular service is blank, the original source is
used instead.

Example:
1. Site alts enabled in config
2. User wants wikipedia links, not wikiless
3. WHOOGLE_ALT_WIKI set to ""
4. All available alt links redirected to farside, except wikipedia

Fixes #704

											
										
										
											2022-03-30 23:46:33 +03:00
+								                if site not in link_desc or not alt:
-												Update domains used for scribe.rip replacements

The levelup.gitconnected.com site is a Medium site that can also be
replaced with scribe.rip whenever privacy respecting site alternatives
are enabled in the config.

Also modified how link descriptions are updated when that config is
enabled (before it was missing replacements on quite a few
descriptions).

											
										
										
											2021-10-24 08:23:37 +03:00
+								                    continue
 								                new_desc = BeautifulSoup(features='html.parser').new_tag('div')
 								                new_desc.string = str(link_desc).replace(site, alt)
 								                link_desc.replace_with(new_desc)
 								                break
-												Add "view image" functionality (#268)

* add view image option

* prevent whoogle links from opening in a new tab.

* remove view image template on mobile requests

* change loop values to be more robust to the number of images

* Update app/templates/imageresults.html

* fix "Basically the .cvifge class needs width: 100%; in order to expand the search input to fit the form width."

* Update app/templates/imageresults.html

* remove hardcoded string from template

* Add view image config var to app.json

* Add view image config var to whoogle.env

Co-authored-by: jacr13 <ramos.joao@protonmail.com>
Co-authored-by: Ben Busby <benbusby@protonmail.com>

											
										
										
											2021-04-16 17:16:14 +03:00
 								    def view_image(self, soup) -> BeautifulSoup:
 								        """Replaces the soup with a new one that handles mobile results and
 								        adds the link of the image full res to the results.

 								        Each result found in ``soup`` is reduced to its full-size image
 								        url, source web page, source domain and thumbnail, and the list is
 								        rendered with the ``imageresults.html`` template. The suggested
 								        correction table and the next page table of the original page are
 								        then copied into the rendered page when both the original element
 								        and its placeholder in the template exist.

 								        Results that lack a link, a two-part ``&imgrefurl=`` href or a
 								        thumbnail are skipped, and a page without a results container or
 								        pagination table yields a page without them rather than raising.

 								        Args:
 								            soup: A BeautifulSoup object containing the image mobile results.

 								        Returns:
 								            BeautifulSoup: The new BeautifulSoup object
 								        """
 								        # get some tags that are unchanged between mobile and pc versions;
 								        # find() returns None instead of raising when a tag is missing
 								        cor_suggested = soup.find('table', attrs={'class': "By0U9"})
 								        next_pages = soup.find('table', attrs={'class': "uZgmoc"})

 								        results = []
 								        # find results div, tolerating a page that has none
 								        results_div = soup.find('div', attrs={'class': "nQvrDb"})
 								        results_all = []
 								        if results_div is not None:
 								            # find all the results
 								            results_all = results_div.find_all(
 								                'div', attrs={'class': "lIMUZd"})

 								        for item in results_all:
 								            anchor = item.find('a')
 								            if anchor is None:
 								                continue

 								            urls = anchor.get('href', '').split('&imgrefurl=')
 								            # Skip urls that are not two-element lists
 								            if len(urls) != 2:
 								                continue

 								            thumbnail = anchor.find('img')
 								            if thumbnail is None or not thumbnail.has_attr('src'):
 								                continue

 								            img_url = urlparse.unquote(urls[0].replace(
 								                f'/{Endpoint.imgres}?imgurl=', ''))
 								            # Strip out only the necessary part of the web page link.
 								            # str.split always returns at least one element, so this
 								            # cannot raise IndexError.
 								            web_page = urlparse.unquote(urls[1].split('&')[0])
 								            img_tbn = urlparse.unquote(thumbnail['src'])

 								            results.append({
 								                'domain': urlparse.urlparse(web_page).netloc,
 								                'img_url': img_url,
 								                'web_page': web_page,
 								                'img_tbn': img_tbn
 								            })

 								        soup = BeautifulSoup(render_template('imageresults.html',
 								                                             length=len(results),
 								                                             results=results,
 								                                             view_label="View Image"),
 								                             features='html.parser')

 								        # replace correction suggested by google object if exists
 								        if cor_suggested is not None:
 								            placeholder = soup.find('table', attrs={'class': "By0U9"})
 								            if placeholder is not None:
 								                placeholder.replace_with(cor_suggested)

 								        # replace next page object at the bottom of the page
 								        if next_pages is not None:
 								            placeholder = soup.find('table', attrs={'class': "uZgmoc"})
 								            if placeholder is not None:
 								                placeholder.replace_with(next_pages)

 								        return soup