Implement filtering of remote content from css

Ben Busby 2022-03-30 17:18:48 -06:00
parent 2abfbc310f
commit e7b70dd34d
No known key found for this signature in database
GPG Key ID: B9B7231E01D924A1
4 changed files with 76 additions and 7 deletions

View File

@@ -2,11 +2,12 @@ from app.models.config import Config
 from app.models.endpoint import Endpoint
 from app.models.g_classes import GClasses
 from app.request import VALID_PARAMS, MAPS_URL
-from app.utils.misc import read_config_bool
+from app.utils.misc import get_abs_url, read_config_bool
 from app.utils.results import *
 from bs4 import BeautifulSoup
 from bs4.element import ResultSet, Tag
 from cryptography.fernet import Fernet
+import cssutils
 from flask import render_template
 import re
 import urllib.parse as urlparse
@@ -53,15 +54,45 @@ def clean_query(query: str) -> str:
     return query[:query.find('-site:')] if '-site:' in query else query


+def clean_css(css: str, page_url: str) -> str:
+    """Removes all remote URLs from a CSS string.
+
+    Args:
+        css: The CSS string
+
+    Returns:
+        str: The filtered CSS, with URLs proxied through Whoogle
+
+    """
+    sheet = cssutils.parseString(css)
+    urls = cssutils.getUrls(sheet)
+
+    for url in urls:
+        abs_url = get_abs_url(url, page_url)
+        css = css.replace(
+            url,
+            f'/element?type=image/png&url={abs_url}'
+        )
+
+    return css
+
+
 class Filter:
     # Limit used for determining if a result is a "regular" result or a list
     # type result (such as "people also asked", "related searches", etc)
     RESULT_CHILD_LIMIT = 7

-    def __init__(self, user_key: str, config: Config, mobile=False) -> None:
+    def __init__(
+            self,
+            user_key: str,
+            config: Config,
+            root_url: str,
+            page_url='',
+            mobile=False) -> None:
         self.config = config
         self.mobile = mobile
         self.user_key = user_key
+        self.root_url = root_url
+        self.page_url = page_url
         self.main_divs = ResultSet('')
         self._elements = 0
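clean_css leans on two cssutils helpers: parseString builds a stylesheet object from the raw CSS, and getUrls yields every url(...) value found in it, each of which is then swapped for a proxied /element URL. A minimal standalone sketch of that behavior, not taken from the commit (the example stylesheet is made up, and the inline '//' handling stands in for get_abs_url):

    import cssutils

    css = 'body { background: url(//www.gstatic.com/bg.png) }'
    sheet = cssutils.parseString(css)

    # getUrls yields the raw URL values, e.g. '//www.gstatic.com/bg.png'
    for url in cssutils.getUrls(sheet):
        # stand-in for get_abs_url: resolve protocol-relative URLs to https
        abs_url = 'https:' + url if url.startswith('//') else url
        css = css.replace(url, f'/element?type=image/png&url={abs_url}')

    # css is now:
    # body { background: url(/element?type=image/png&url=https://www.gstatic.com/bg.png) }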
@@ -89,6 +120,7 @@ class Filter:
         self.remove_block_titles()
         self.remove_block_url()
         self.collapse_sections()
+        self.update_css(soup)
         self.update_styling(soup)

         self.remove_block_tabs(soup)
@@ -289,9 +321,28 @@ class Filter:
             element['src'] = BLANK_B64
             return

-        element[attr] = f'{Endpoint.element}?url=' + self.encrypt_path(
-            src,
-            is_element=True) + '&type=' + urlparse.quote(mime)
+        element[attr] = f'{self.root_url}/{Endpoint.element}?url=' + (
+            self.encrypt_path(
+                src,
+                is_element=True
+            ) + '&type=' + urlparse.quote(mime)
+        )
+
+    def update_css(self, soup) -> None:
+        """Updates URLs used in inline styles to be proxied by Whoogle
+        using the /element endpoint.
+
+        Returns:
+            None (The soup element is modified directly)
+
+        """
+        # Filter all <style> tags
+        for style in soup.find_all('style'):
+            style.string = clean_css(style.string, self.page_url)
+
+        # Convert remote stylesheets to style tags
+        for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
+            print(link)

     def update_styling(self, soup) -> None:
         # Remove unnecessary button(s)
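The second loop in update_css is still a stub: it only prints the matching <link rel="stylesheet"> tags. Purely as a sketch of where the "convert remote stylesheets to style tags" comment seems to be heading (not code from this commit; urljoin stands in for get_abs_url, and the fetch details are assumptions):

    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup

    from app.filter import clean_css  # module-level helper added in this commit

    def inline_remote_stylesheets(soup: BeautifulSoup, page_url: str) -> None:
        # Download each remote stylesheet, filter it, and inline it as <style>
        for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
            href = link.get('href')
            if not href:
                continue
            css = requests.get(urljoin(page_url, href), timeout=5).text
            style = soup.new_tag('style')
            style.string = clean_css(css, page_url)
            link.replace_with(style)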

View File

@@ -489,7 +489,10 @@ def window():
     cipher_suite = Fernet(g.session_key)
     target_url = cipher_suite.decrypt(target_url.encode()).decode()

-    content_filter = Filter(g.session_key, config=g.user_config)
+    content_filter = Filter(
+        g.session_key,
+        root_url=request.url_root,
+        config=g.user_config)
     target = urlparse.urlparse(target_url)
     host_url = f'{target.scheme}://{target.netloc}'
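For context, request.url_root is Flask's absolute base URL for the current request (scheme, host, and a trailing slash), and it is the value update_element_src now prefixes to the element endpoint. The shape of a rewritten src attribute, using placeholder values for everything generated at runtime:

    # Placeholders only; the real query string comes from Filter.encrypt_path
    root_url = 'http://localhost:5000/'   # e.g. request.url_root on a local instance
    encrypted_src = '<fernet-token>'      # encrypted original image/script URL
    mime = 'image/png'

    element_src = f'{root_url}/element?url={encrypted_src}&type={mime}'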
@@ -514,10 +517,11 @@ def window():
         content_filter.update_element_src(script, 'application/javascript')

     # Replace all possible image attributes
+    img_sources = ['src', 'data-src', 'data-srcset', 'srcset']
     for img in results.find_all('img'):
         _ = [
             content_filter.update_element_src(img, 'image/png', attr=_)
-            for _ in ['src', 'data-src', 'data-srcset', 'srcset'] if img.has_attr(_)
+            for _ in img_sources if img.has_attr(_)
         ]

     # Replace all stylesheet sources

View File

@@ -3,6 +3,7 @@ from flask import Request
 import hashlib
 import os
 from requests import exceptions, get
+from urllib.parse import urlparse


 def gen_file_hash(path: str, static_file: str) -> str:
@@ -47,3 +48,14 @@ def check_for_update(version_url: str, current: str) -> int:
         has_update = ''

     return has_update
+
+
+def get_abs_url(url, page_url):
+    # Creates a valid absolute URL using a partial or relative URL
+    if url.startswith('//'):
+        return f'https:{url}'
+    elif url.startswith('/'):
+        return f'{urlparse(page_url).netloc}{url}'
+    elif url.startswith('./'):
+        return f'{page_url}{url[2:]}'
+    return url
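The three branches of get_abs_url cover protocol-relative, host-absolute, and page-relative references; anything else is returned untouched. Illustrative calls against a made-up page URL (not from the commit):

    from app.utils.misc import get_abs_url

    page = 'https://example.com/page/'

    get_abs_url('//cdn.example.com/a.png', page)   # -> 'https://cdn.example.com/a.png'
    get_abs_url('/static/b.css', page)             # -> 'example.com/static/b.css' (netloc only, no scheme)
    get_abs_url('./c.woff2', page)                 # -> 'https://example.com/page/c.woff2'
    get_abs_url('https://example.com/d.js', page)  # -> returned unchanged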

View File

@@ -56,6 +56,7 @@ class Search:
     """
     def __init__(self, request, config, session_key, cookies_disabled=False):
         method = request.method
+        self.request = request
         self.request_params = request.args if method == 'GET' else request.form
         self.user_agent = request.headers.get('User-Agent')
         self.feeling_lucky = False
@@ -115,6 +116,7 @@ class Search:
         mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent

         content_filter = Filter(self.session_key,
+                                root_url=self.request.url_root,
                                 mobile=mobile,
                                 config=self.config)
         full_query = gen_query(self.query,