# whoogle-search/app/filter.py

from bs4 import BeautifulSoup
import re
import urllib.parse as urlparse
from urllib.parse import parse_qs

# Query-string argument names treated as tracking noise: Filter.clean() leaves
# them out of the result links it rewrites.
SKIP_ARGS = ['ref_src', 'utm']


class Filter:
    """Post-process a fetched search-results page before it is served.

    ``reskin`` works on the raw HTML string (cosmetic colour/logo swaps) and
    ``clean`` works on a parsed BeautifulSoup tree (ads, images, styling,
    links and scripts).

    Attributes read from ``config`` (all optional):
        near: stored as-is; defaults to ``None``.
        dark: enables the dark colour scheme; defaults to ``False``.
        nojs: adds a "NoJS Link" after every result link; defaults to ``False``.
    """

    # Hex colours (without '#') that reskin() replaces with the accent colour.
    # Compiled once here instead of on every reskin() call.
    _BRAND_COLORS = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)

    def __init__(self, mobile=False, config=None):
        """Store the display options.

        Args:
            mobile: whether the page is rendered for a mobile client.
            config: mapping with optional ``near``/``dark``/``nojs`` keys;
                ``None`` is treated as an empty mapping.
        """
        if config is None:
            config = {}

        self.near = config.get('near')
        self.dark = config.get('dark', False)
        self.nojs = config.get('nojs', False)
        self.mobile = mobile

    def __getitem__(self, name):
        """Allow ``filter['dark']`` style access to attributes.

        Raises:
            AttributeError: if no attribute called ``name`` exists.
        """
        return getattr(self, name)

    def reskin(self, page):
        """Return ``page`` (an HTML string) with purely aesthetic replacements.

        The single-letter logo text is swapped, the brand colours are replaced
        by one accent colour and, in dark mode, a few light colours are
        inverted. These are plain text substitutions over the whole page.
        """
        page = page.replace('>G<', '>Sh<')
        page = self._BRAND_COLORS.sub('685e79', page)
        if self.dark:
            page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')

        return page

    def clean(self, soup):
        """Strip ads, proxy images, restyle and sanitise links in ``soup``.

        Args:
            soup: a parsed BeautifulSoup document; it is modified in place.

        Returns:
            The same ``soup`` object, for convenience.
        """
        self._remove_ads(soup)
        self._sync_images(soup)
        self._update_styling(soup)
        self._update_links(soup)
        return soup

    @staticmethod
    def _remove_ads(soup):
        """Remove direct children of ``div#main`` that carry a ``class``."""
        main_div = soup.find('div', {'id': 'main'})
        if main_div is None:
            return

        # Only ads/sponsored content use classes in the list of result divs
        for div in main_div.find_all('div', recursive=False):
            if 'class' in div.attrs:
                div.decompose()

    @staticmethod
    def _sync_images(soup):
        """Point every image at the local ``/tmp`` image endpoint."""
        # src=True skips <img> tags without a src instead of raising KeyError.
        for img in soup.find_all('img', src=True):
            src = img['src']
            if src.startswith('//'):
                # Protocol-relative URL: make it absolute first.
                src = 'https:' + src

            img['src'] = '/tmp?image_url=' + src

    def _update_styling(self, soup):
        """Drop buttons/svg logos and apply mobile and dark-mode styling."""
        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos (searched after the buttons are gone, so svgs that
        # lived inside a removed button are not visited again)
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = 'display:flex; justify-content:center; align-items:center; color:#685e79; ' \
                            'font-size:18px; '

        # Fix search bar length on mobile. Each step may be missing, so check
        # explicitly rather than relying on an exception being raised.
        header = soup.find('header')
        form = header.find('form') if header is not None else None
        search_bar = form.find('div') if form is not None else None
        if search_bar is not None:
            search_bar['style'] = 'width: 100%;'

        # Set up dark mode if active
        if self.dark:
            html = soup.find('html')
            if html is not None:
                html['style'] = 'scrollbar-color: #333 #111;'
            for input_element in soup.find_all('input'):
                input_element['style'] = 'color:#fff;'

    def _update_links(self, soup):
        """Rewrite redirect links to their real targets and remove scripts."""
        # Replace hrefs with only the intended destination (no "utm" type tags)
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/advanced_search' in href:
                a.decompose()
                continue

            if 'url?q=' not in href:
                continue

            # The real destination is carried in the redirect's "q" argument;
            # leave the link untouched when that argument is blank/missing.
            targets = parse_qs(urlparse.urlparse(href).query).get('q')
            if not targets:
                continue

            result_link = self._strip_tracking_args(targets[0])
            a['href'] = result_link

            # Add no-js option
            if self.nojs:
                nojs_link = soup.new_tag('a')
                # NOTE(review): result_link is appended without percent-encoding,
                # so a target containing '&' may be cut short by the receiving
                # endpoint -- confirm how /window parses "location".
                nojs_link['href'] = '/window?location=' + result_link
                nojs_link['style'] = 'display:block;width:100%;'
                nojs_link.string = 'NoJS Link: ' + nojs_link['href']
                a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
                a.append(nojs_link)

        # Ensure no extra scripts passed through
        for script in soup.find_all('script'):
            script.decompose()

        footer = soup.find('div', id='sfooter')
        if footer is not None:
            footer.decompose()

    @staticmethod
    def _strip_tracking_args(link, skip_args=None):
        """Return ``link`` without tracking arguments in its query string.

        An argument is dropped when its name equals an entry of ``skip_args``
        or starts with that entry followed by ``_`` (so ``utm`` covers
        ``utm_source``, ``utm_medium``, ...). When nothing is dropped the link
        is returned unchanged; otherwise it is rebuilt with the remaining
        arguments in their original order, keeping path and fragment.

        Args:
            link: the destination URL.
            skip_args: names/prefixes to drop; defaults to the module-level
                ``SKIP_ARGS``.
        """
        if skip_args is None:
            skip_args = SKIP_ARGS

        parsed = urlparse.urlparse(link)
        pairs = urlparse.parse_qsl(parsed.query, keep_blank_values=True)
        kept = [
            (name, value) for name, value in pairs
            if not any(name == skip or name.startswith(skip + '_') for skip in skip_args)
        ]

        if len(kept) == len(pairs):
            # Nothing to remove: avoid re-encoding a link that is already fine.
            return link

        # An empty query makes geturl() omit the '?' altogether.
        return parsed._replace(query=urlparse.urlencode(kept)).geturl()