Implement filtering of remote content from css

This commit is contained in:
Ben Busby 2022-03-30 17:18:48 -06:00
parent 2abfbc310f
commit e7b70dd34d
No known key found for this signature in database
GPG Key ID: B9B7231E01D924A1
4 changed files with 76 additions and 7 deletions

View File

@ -2,11 +2,12 @@ from app.models.config import Config
from app.models.endpoint import Endpoint from app.models.endpoint import Endpoint
from app.models.g_classes import GClasses from app.models.g_classes import GClasses
from app.request import VALID_PARAMS, MAPS_URL from app.request import VALID_PARAMS, MAPS_URL
from app.utils.misc import read_config_bool from app.utils.misc import get_abs_url, read_config_bool
from app.utils.results import * from app.utils.results import *
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag from bs4.element import ResultSet, Tag
from cryptography.fernet import Fernet from cryptography.fernet import Fernet
import cssutils
from flask import render_template from flask import render_template
import re import re
import urllib.parse as urlparse import urllib.parse as urlparse
@ -53,15 +54,45 @@ def clean_query(query: str) -> str:
return query[:query.find('-site:')] if '-site:' in query else query return query[:query.find('-site:')] if '-site:' in query else query
def clean_css(css: str, page_url: str) -> str:
    """Rewrites every URL referenced by a CSS string to go through Whoogle.

    Each URL that cssutils finds in the stylesheet is resolved against
    the page it came from and replaced with a link to the /element
    endpoint, so the browser never contacts the remote host directly.

    Args:
        css: The CSS string
        page_url: The URL of the page the CSS was found on, used to
                  resolve partial/relative URLs

    Returns:
        str: The filtered CSS, with URLs proxied through Whoogle

    """
    if not css:
        return ''

    sheet = cssutils.parseString(css)

    # De-duplicate and drop empty values (an empty alternative in the
    # pattern below would match at every position). Longest first so a URL
    # that is a prefix of another one cannot shadow it in the alternation.
    urls = sorted(
        {url for url in cssutils.getUrls(sheet) if url},
        key=len,
        reverse=True)
    if not urls:
        return css

    def proxy_url(match) -> str:
        abs_url = get_abs_url(match.group(0), page_url)
        # Percent-encode so '&', '?' and '#' inside the target URL cannot
        # terminate or pollute the `url` query parameter
        encoded_url = urlparse.quote(abs_url, safe='')
        return f'/element?type=image/png&url={encoded_url}'

    # Substitute in a single pass: chaining str.replace() calls would
    # rewrite text that an earlier replacement had already proxied, since
    # the proxied link still contains the original URL.
    url_pattern = re.compile('|'.join(re.escape(url) for url in urls))
    return url_pattern.sub(proxy_url, css)
class Filter: class Filter:
# Limit used for determining if a result is a "regular" result or a list # Limit used for determining if a result is a "regular" result or a list
# type result (such as "people also asked", "related searches", etc) # type result (such as "people also asked", "related searches", etc)
RESULT_CHILD_LIMIT = 7 RESULT_CHILD_LIMIT = 7
def __init__(self, user_key: str, config: Config, mobile=False) -> None: def __init__(
self,
user_key: str,
config: Config,
root_url: str,
page_url='',
mobile=False) -> None:
self.config = config self.config = config
self.mobile = mobile self.mobile = mobile
self.user_key = user_key self.user_key = user_key
self.root_url = root_url
self.page_url = page_url
self.main_divs = ResultSet('') self.main_divs = ResultSet('')
self._elements = 0 self._elements = 0
@ -89,6 +120,7 @@ class Filter:
self.remove_block_titles() self.remove_block_titles()
self.remove_block_url() self.remove_block_url()
self.collapse_sections() self.collapse_sections()
self.update_css(soup)
self.update_styling(soup) self.update_styling(soup)
self.remove_block_tabs(soup) self.remove_block_tabs(soup)
@ -289,9 +321,28 @@ class Filter:
element['src'] = BLANK_B64 element['src'] = BLANK_B64
return return
element[attr] = f'{Endpoint.element}?url=' + self.encrypt_path( element[attr] = f'{self.root_url}/{Endpoint.element}?url=' + (
self.encrypt_path(
src, src,
is_element=True) + '&type=' + urlparse.quote(mime) is_element=True
) + '&type=' + urlparse.quote(mime)
)
def update_css(self, soup) -> None:
    """Updates URLs used in inline styles to be proxied by Whoogle
    using the /element endpoint.

    Args:
        soup: The parsed page whose <style> tags should be filtered

    Returns:
        None (The soup element is modified directly)

    """
    # Filter all <style> tags
    for style in soup.find_all('style'):
        # BeautifulSoup reports .string as None when the tag is empty or
        # has more than one child; there is nothing to rewrite then, and
        # passing None on to clean_css would fail.
        if not style.string:
            continue

        style.string = clean_css(style.string, self.page_url)

    # TODO: Convert remote stylesheets (<link rel="stylesheet">) to style
    # tags. Not implemented yet; those links are currently left untouched.
def update_styling(self, soup) -> None: def update_styling(self, soup) -> None:
# Remove unnecessary button(s) # Remove unnecessary button(s)

View File

@ -489,7 +489,10 @@ def window():
cipher_suite = Fernet(g.session_key) cipher_suite = Fernet(g.session_key)
target_url = cipher_suite.decrypt(target_url.encode()).decode() target_url = cipher_suite.decrypt(target_url.encode()).decode()
content_filter = Filter(g.session_key, config=g.user_config) content_filter = Filter(
g.session_key,
root_url=request.url_root,
config=g.user_config)
target = urlparse.urlparse(target_url) target = urlparse.urlparse(target_url)
host_url = f'{target.scheme}://{target.netloc}' host_url = f'{target.scheme}://{target.netloc}'
@ -514,10 +517,11 @@ def window():
content_filter.update_element_src(script, 'application/javascript') content_filter.update_element_src(script, 'application/javascript')
# Replace all possible image attributes # Replace all possible image attributes
img_sources = ['src', 'data-src', 'data-srcset', 'srcset']
for img in results.find_all('img'): for img in results.find_all('img'):
_ = [ _ = [
content_filter.update_element_src(img, 'image/png', attr=_) content_filter.update_element_src(img, 'image/png', attr=_)
for _ in ['src', 'data-src', 'data-srcset', 'srcset'] if img.has_attr(_) for _ in img_sources if img.has_attr(_)
] ]
# Replace all stylesheet sources # Replace all stylesheet sources

View File

@ -3,6 +3,7 @@ from flask import Request
import hashlib import hashlib
import os import os
from requests import exceptions, get from requests import exceptions, get
from urllib.parse import urlparse
def gen_file_hash(path: str, static_file: str) -> str: def gen_file_hash(path: str, static_file: str) -> str:
@ -47,3 +48,14 @@ def check_for_update(version_url: str, current: str) -> int:
has_update = '' has_update = ''
return has_update return has_update
def get_abs_url(url, page_url):
    """Creates a valid absolute URL using a partial or relative URL.

    Args:
        url: The URL to resolve. May be protocol-relative ('//host/x'),
             root-relative ('/x'), explicitly relative ('./x'), or
             anything else (returned unchanged).
        page_url: The URL of the page that `url` was found on

    Returns:
        str: The absolute form of `url`. If `page_url` has no host to
             resolve against, root-relative URLs are returned unchanged
             and './' URLs are appended to `page_url` as-is.

    """
    from urllib.parse import urljoin

    if url.startswith('//'):
        # Protocol-relative: only the scheme is missing
        return f'https:{url}'

    page = urlparse(page_url)

    if url.startswith('/'):
        if not page.netloc:
            return url
        # Include the scheme; a bare "host/path" is not an absolute URL.
        # Default to https when the page URL itself carries no scheme.
        return f'{page.scheme or "https"}://{page.netloc}{url}'

    if url.startswith('./'):
        if not page.netloc:
            return f'{page_url}{url[2:]}'
        # Resolve against the page's directory, so a page URL ending in a
        # file name ("/dir/page.html") doesn't get the path glued onto it
        return urljoin(page_url, url)

    return url

View File

@ -56,6 +56,7 @@ class Search:
""" """
def __init__(self, request, config, session_key, cookies_disabled=False): def __init__(self, request, config, session_key, cookies_disabled=False):
method = request.method method = request.method
self.request = request
self.request_params = request.args if method == 'GET' else request.form self.request_params = request.args if method == 'GET' else request.form
self.user_agent = request.headers.get('User-Agent') self.user_agent = request.headers.get('User-Agent')
self.feeling_lucky = False self.feeling_lucky = False
@ -115,6 +116,7 @@ class Search:
mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent
content_filter = Filter(self.session_key, content_filter = Filter(self.session_key,
root_url=self.request.url_root,
mobile=mobile, mobile=mobile,
config=self.config) config=self.config)
full_query = gen_query(self.query, full_query = gen_query(self.query,