whoogle-search/app/filter.py
Ben Busby 4180aedd87 Added image proxying, refactored filter class
Images were previously directly fetched from google search results,
which was a potential privacy hazard. All image sources are now modified
to be passed through shoogle's routing first, which will then fetch raw
image data and pass it through to the user.

Filter class was refactored to split the primary clean method into
smaller, more manageable submethods.
2020-04-27 20:21:36 -06:00

134 lines
4.7 KiB
Python

from bs4 import BeautifulSoup
import re
import urllib.parse as urlparse
from urllib.parse import parse_qs
# Query-string argument names treated as tracking noise; Filter consults this
# list when rewriting result links so these arguments are not passed on.
SKIP_ARGS = ['ref_src', 'utm']
class Filter:
    """Post-process a search results page before it is served to the user.

    ``reskin`` works on the raw page text, ``clean`` on a parsed
    BeautifulSoup tree. Behaviour is driven by the optional ``config``
    mapping (keys ``near``, ``dark`` and ``nojs``) and the ``mobile`` flag.
    """

    # Brand colours swapped out by ``reskin``; compiled once for all calls.
    _BRAND_COLORS = re.compile(
        '4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)

    def __init__(self, mobile=False, config=None):
        """Store the display options.

        Args:
            mobile: Whether the page is being rendered for a mobile client.
            config: Optional mapping; missing keys fall back to
                ``near=None``, ``dark=False`` and ``nojs=False``.
        """
        if config is None:
            config = {}

        self.near = config.get('near')
        self.dark = config.get('dark', False)
        self.nojs = config.get('nojs', False)
        self.mobile = mobile

    def __getitem__(self, name):
        """Allow dict-style access to attributes (``filter['dark']``)."""
        return getattr(self, name)

    def reskin(self, page):
        """Return ``page`` (a string) with purely aesthetic substitutions.

        Replaces the ``G`` logo text and the brand colours, and swaps the
        light palette for a dark one when dark mode is enabled.
        """
        page = page.replace('>G<', '>Sh<')
        page = self._BRAND_COLORS.sub('685e79', page)

        if self.dark:
            page = (page.replace('fff', '000')
                        .replace('202124', 'ddd')
                        .replace('1967D2', '3b85ea'))

        return page

    @staticmethod
    def _filter_link_args(link, skip_args=None):
        """Return ``link`` with tracking query arguments removed.

        An argument is dropped when its name equals an entry of
        ``skip_args`` or starts with that entry followed by ``_`` (so
        ``utm`` also covers ``utm_source``, ``utm_medium``, ...). Links
        that carry no such argument are returned untouched so that their
        original encoding is preserved.

        Args:
            link: Destination URL taken from a result link.
            skip_args: Names to strip; defaults to the module's SKIP_ARGS.
        """
        if skip_args is None:
            skip_args = SKIP_ARGS

        parsed = urlparse.urlparse(link)
        # keep_blank_values: "?flag=" must survive the round trip.
        link_args = parse_qs(parsed.query, keep_blank_values=True)

        safe_args = {
            arg: values for arg, values in link_args.items()
            if not any(arg == skip or arg.startswith(skip + '_')
                       for skip in skip_args)
        }
        if len(safe_args) == len(link_args):
            return link

        # parse_qs yields lists of values, hence doseq=True. Rebuilding from
        # the parsed parts only touches the query component of the URL.
        query = urlparse.urlencode(safe_args, doseq=True)
        return parsed._replace(query=query).geturl()

    def _remove_ads(self, soup):
        """Drop ad/sponsored blocks from the main results container."""
        main_div = soup.find('div', {'id': 'main'})
        if main_div is None:
            return

        # Only ads/sponsored content use classes in the list of result divs
        ad_divs = [div for div in main_div.find_all('div', recursive=False)
                   if 'class' in div.attrs]
        for div in ad_divs:
            div.decompose()

    def _proxy_images(self, soup):
        """Point every image at the local ``/tmp?image_url=`` route."""
        # src=True skips <img> tags that have no src attribute at all.
        for img in soup.find_all('img', src=True):
            src = img['src']
            if src.startswith('//'):
                src = 'https:' + src
            img['src'] = '/tmp?image_url=' + src

    def _update_styling(self, soup):
        """Remove buttons/svg logos and apply mobile and dark-mode styles."""
        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo is not None and self.mobile:
            logo['style'] = ('display:flex; justify-content:center; '
                             'align-items:center; color:#685e79; '
                             'font-size:18px; ')

        # Fix search bar length on mobile; any link of the
        # header > form > div chain may be absent.
        header = soup.find('header')
        form = header.find('form') if header is not None else None
        search_bar = form.find('div') if form is not None else None
        if search_bar is not None:
            search_bar['style'] = 'width: 100%;'

        # Set up dark mode if active
        if self.dark:
            html = soup.find('html')
            if html is not None:
                html['style'] = 'scrollbar-color: #333 #111;'
            for input_element in soup.find_all('input'):
                input_element['style'] = 'color:#fff;'

    def _update_links(self, soup):
        """Rewrite redirect links to their real, tracking-free destination."""
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/advanced_search' in href:
                a.decompose()
                continue

            if 'url?q=' not in href:
                continue

            # parse_qs omits empty values, so "q" can be missing even
            # though the href contains "url?q=".
            targets = parse_qs(urlparse.urlparse(href).query).get('q')
            if not targets:
                continue

            result_link = self._filter_link_args(targets[0])
            a['href'] = result_link

            # Add no-js option
            if self.nojs:
                nojs_link = soup.new_tag('a')
                nojs_link['href'] = '/window?location=' + result_link
                nojs_link['style'] = 'display:block;width:100%;'
                nojs_link.string = 'NoJS Link: ' + nojs_link['href']
                a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
                a.append(nojs_link)

    def clean(self, soup):
        """Sanitise a parsed results page in place and return it.

        Removes scripts and the ``sfooter`` div, strips ad blocks, routes
        images through the local proxy path, adjusts styling and rewrites
        result links.

        Args:
            soup: BeautifulSoup tree of the results page (mutated).

        Returns:
            The same ``soup`` object.
        """
        # Ensure no extra scripts passed through
        for script in soup('script'):
            script.decompose()

        footer = soup.find('div', id='sfooter')
        if footer is not None:
            footer.decompose()

        self._remove_ads(soup)
        self._proxy_images(soup)
        self._update_styling(soup)
        self._update_links(soup)
        return soup