Implement filtering of remote content from css

Ben Busby 2022-03-30 17:18:48 -06:00
parent 2abfbc310f
commit e7b70dd34d
No known key found for this signature in database
GPG Key ID: B9B7231E01D924A1
4 changed files with 76 additions and 7 deletions

View File

@@ -2,11 +2,12 @@ from app.models.config import Config
 from app.models.endpoint import Endpoint
 from app.models.g_classes import GClasses
 from app.request import VALID_PARAMS, MAPS_URL
-from app.utils.misc import read_config_bool
+from app.utils.misc import get_abs_url, read_config_bool
 from app.utils.results import *
 from bs4 import BeautifulSoup
 from bs4.element import ResultSet, Tag
 from cryptography.fernet import Fernet
+import cssutils
 from flask import render_template
 import re
 import urllib.parse as urlparse
@@ -53,15 +54,45 @@ def clean_query(query: str) -> str:
     return query[:query.find('-site:')] if '-site:' in query else query


+def clean_css(css: str, page_url: str) -> str:
+    """Removes all remote URLs from a CSS string.
+
+    Args:
+        css: The CSS string
+
+    Returns:
+        str: The filtered CSS, with URLs proxied through Whoogle
+
+    """
+    sheet = cssutils.parseString(css)
+    urls = cssutils.getUrls(sheet)
+
+    for url in urls:
+        abs_url = get_abs_url(url, page_url)
+        css = css.replace(
+            url,
+            f'/element?type=image/png&url={abs_url}'
+        )
+
+    return css
+
+
 class Filter:
     # Limit used for determining if a result is a "regular" result or a list
     # type result (such as "people also asked", "related searches", etc)
     RESULT_CHILD_LIMIT = 7

-    def __init__(self, user_key: str, config: Config, mobile=False) -> None:
+    def __init__(
+            self,
+            user_key: str,
+            config: Config,
+            root_url: str,
+            page_url='',
+            mobile=False) -> None:
         self.config = config
         self.mobile = mobile
         self.user_key = user_key
+        self.root_url = root_url
+        self.page_url = page_url
         self.main_divs = ResultSet('')
         self._elements = 0
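clean_css leans on two cssutils helpers: parseString builds a stylesheet object from the raw CSS, and getUrls yields every url(...) value found in it, each of which is then swapped for a proxied /element URL. A minimal standalone sketch of that behavior, not taken from the commit (the example stylesheet is made up, and the inline '//' handling stands in for get_abs_url):

    import cssutils

    css = 'body { background: url(//www.gstatic.com/bg.png) }'
    sheet = cssutils.parseString(css)

    # getUrls yields the raw URL values, e.g. '//www.gstatic.com/bg.png'
    for url in cssutils.getUrls(sheet):
        # stand-in for get_abs_url: resolve protocol-relative URLs to https
        abs_url = 'https:' + url if url.startswith('//') else url
        css = css.replace(url, f'/element?type=image/png&url={abs_url}')

    # css is now:
    # body { background: url(/element?type=image/png&url=https://www.gstatic.com/bg.png) }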
@@ -89,6 +120,7 @@ class Filter:
         self.remove_block_titles()
         self.remove_block_url()
         self.collapse_sections()
+        self.update_css(soup)
         self.update_styling(soup)

         self.remove_block_tabs(soup)
@@ -289,9 +321,28 @@ class Filter:
             element['src'] = BLANK_B64
             return

-        element[attr] = f'{Endpoint.element}?url=' + self.encrypt_path(
-            src,
-            is_element=True) + '&type=' + urlparse.quote(mime)
+        element[attr] = f'{self.root_url}/{Endpoint.element}?url=' + (
+            self.encrypt_path(
+                src,
+                is_element=True
+            ) + '&type=' + urlparse.quote(mime)
+        )
+
+    def update_css(self, soup) -> None:
+        """Updates URLs used in inline styles to be proxied by Whoogle
+        using the /element endpoint.
+
+        Returns:
+            None (The soup element is modified directly)
+
+        """
+        # Filter all <style> tags
+        for style in soup.find_all('style'):
+            style.string = clean_css(style.string, self.page_url)
+
+        # Convert remote stylesheets to style tags
+        for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
+            print(link)

     def update_styling(self, soup) -> None:
         # Remove unnecessary button(s)
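The second loop in update_css is still a stub: it only prints the matching <link rel="stylesheet"> tags. Purely as a sketch of where the "convert remote stylesheets to style tags" comment seems to be heading (not code from this commit; urljoin stands in for get_abs_url, and the fetch details are assumptions):

    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup

    from app.filter import clean_css  # module-level helper added in this commit

    def inline_remote_stylesheets(soup: BeautifulSoup, page_url: str) -> None:
        # Download each remote stylesheet, filter it, and inline it as <style>
        for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
            href = link.get('href')
            if not href:
                continue
            css = requests.get(urljoin(page_url, href), timeout=5).text
            style = soup.new_tag('style')
            style.string = clean_css(css, page_url)
            link.replace_with(style)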

View File

@@ -489,7 +489,10 @@ def window():
     cipher_suite = Fernet(g.session_key)
     target_url = cipher_suite.decrypt(target_url.encode()).decode()

-    content_filter = Filter(g.session_key, config=g.user_config)
+    content_filter = Filter(
+        g.session_key,
+        root_url=request.url_root,
+        config=g.user_config)
     target = urlparse.urlparse(target_url)
     host_url = f'{target.scheme}://{target.netloc}'
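For context, request.url_root is Flask's absolute base URL for the current request (scheme, host, and a trailing slash), and it is the value update_element_src now prefixes to the element endpoint. The shape of a rewritten src attribute, using placeholder values for everything generated at runtime:

    # Placeholders only; the real query string comes from Filter.encrypt_path
    root_url = 'http://localhost:5000/'   # e.g. request.url_root on a local instance
    encrypted_src = '<fernet-token>'      # encrypted original image/script URL
    mime = 'image/png'

    element_src = f'{root_url}/element?url={encrypted_src}&type={mime}'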
@@ -514,10 +517,11 @@ def window():
         content_filter.update_element_src(script, 'application/javascript')

     # Replace all possible image attributes
+    img_sources = ['src', 'data-src', 'data-srcset', 'srcset']
     for img in results.find_all('img'):
         _ = [
             content_filter.update_element_src(img, 'image/png', attr=_)
-            for _ in ['src', 'data-src', 'data-srcset', 'srcset'] if img.has_attr(_)
+            for _ in img_sources if img.has_attr(_)
         ]

     # Replace all stylesheet sources

View File

@@ -3,6 +3,7 @@ from flask import Request
 import hashlib
 import os
 from requests import exceptions, get
+from urllib.parse import urlparse


 def gen_file_hash(path: str, static_file: str) -> str:
@@ -47,3 +48,14 @@ def check_for_update(version_url: str, current: str) -> int:
         has_update = ''

     return has_update
+
+
+def get_abs_url(url, page_url):
+    # Creates a valid absolute URL using a partial or relative URL
+    if url.startswith('//'):
+        return f'https:{url}'
+    elif url.startswith('/'):
+        return f'{urlparse(page_url).netloc}{url}'
+    elif url.startswith('./'):
+        return f'{page_url}{url[2:]}'
+    return url
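The three branches of get_abs_url cover protocol-relative, host-absolute, and page-relative references; anything else is returned untouched. Illustrative calls against a made-up page URL (not from the commit):

    from app.utils.misc import get_abs_url

    page = 'https://example.com/page/'

    get_abs_url('//cdn.example.com/a.png', page)   # -> 'https://cdn.example.com/a.png'
    get_abs_url('/static/b.css', page)             # -> 'example.com/static/b.css' (netloc only, no scheme)
    get_abs_url('./c.woff2', page)                 # -> 'https://example.com/page/c.woff2'
    get_abs_url('https://example.com/d.js', page)  # -> returned unchanged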

View File

@@ -56,6 +56,7 @@ class Search:
     """
     def __init__(self, request, config, session_key, cookies_disabled=False):
         method = request.method
+        self.request = request
         self.request_params = request.args if method == 'GET' else request.form
         self.user_agent = request.headers.get('User-Agent')
         self.feeling_lucky = False
@@ -115,6 +116,7 @@ class Search:
         mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent

         content_filter = Filter(self.session_key,
+                                root_url=self.request.url_root,
                                 mobile=mobile,
                                 config=self.config)
         full_query = gen_query(self.query,