Added testing and CI build, refactored Filter class, refactored project structure

Ben Busby 2020-04-15 17:41:53 -06:00
parent 2600f494b7
commit b5b6e64177
15 changed files with 269 additions and 128 deletions

.gitignore (vendored)

@@ -5,3 +5,4 @@ __pycache__/
 *.pem
 *.xml
 config.json
+test/static

.travis.yml (new file)

@@ -0,0 +1,6 @@
+language: python
+python: 3.6
+install:
+  - pip install -r config/requirements.txt
+script:
+  - ./run test

Dockerfile

@@ -3,6 +3,6 @@ FROM python:3
 WORKDIR /usr/src/app

 COPY . .

-RUN pip install --no-cache-dir -r requirements.txt
-CMD ["./run.sh"]
+RUN pip install --no-cache-dir -r config/requirements.txt
+CMD ["./run"]

README.md

@@ -29,14 +29,26 @@ heroku open
 Now you're done! This series of commands can take a while, but after you've run it once, you shouldn't have to run it again. The final command, `heroku open`, will launch a tab in your web browser, where you can test out Shoogle and even [set it as your primary search engine](https://github.com/benbusby/shoogle#set-shoogle-as-your-primary-search-engine).

 #### B) Using your own server, or alternative container deployment
 There are other methods for deploying docker containers that are well outlined in [this article](https://rollout.io/blog/the-shortlist-of-docker-hosting/), but there are too many to describe setup for each here. Generally it should be about the same amount of effort as the Heroku deployment.

 Depending on your preferences, you can also deploy the app yourself on your own infrastructure. This route would require a few extra steps:
 - A server (I personally recommend [Digital Ocean](https://www.digitalocean.com/pricing/) or [Linode](https://www.linode.com/pricing/), their cheapest tiers will work fine)
 - Your own URL (I suppose this is optional, but recommended)
 - SSL certificates (free through [Let's Encrypt](https://letsencrypt.org/getting-started/))
 - A bit more experience or willingness to work through issues

+## Setup (Local Only)
+If you want to test the app out on your own machine first, you can build it with the following instructions:
+
+```bash
+git clone https://github.com/benbusby/shoogle.git
+cd shoogle
+python3 -m venv venv
+source venv/bin/activate
+pip install -r config/requirements.txt
+./run
+```
+
 ## Usage
 Same as most search engines, with the exception of filtering by time range.
@@ -44,7 +56,7 @@ To filter by a range of time, append ":past <time>" to the end of your search, w
 ## Extra Steps
 ### Set Shoogle as your primary search engine
-1. From the main shoogle folder, run `python opensearch.py "<your app url>"`
+1. From the main shoogle folder, run `python config/opensearch.py "<your app url>"`
 2. Rebuild and release your updated app
     - `heroku container:push web` and then `heroku container:release web`
 3. Update browser settings
@@ -69,4 +81,4 @@ Part of the deal with Heroku's free tier is that you're allocated 550 hours/mont
 A good solution for this is to set up a simple cronjob on any device at your home that is consistently powered on and connected to the internet (in my case, a PiHole worked perfectly). All the device needs to do is fetch app content on a consistent basis to keep the app alive in whatever ~17 hour window you want it on (17 hrs * 31 days = 527, meaning you'd still have 23 leftover hours each month if you searched outside of your target window).

 For instance: `*/20 7-23 * * * curl https://<your heroku app name>.herokuapp.com > /home/<username>/shoogle-refresh` will fetch the home page of the app every 20 minutes between 7am and midnight, allowing for downtime from midnight to 7am. And again, this wouldn't be a hard limit - you'd still have plenty of remaining hours of uptime each month in case you were searching after this window has closed.
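
As a concrete illustration of the `:past` syntax described above, here is a minimal sketch that mirrors the `gen_query` logic in `app/filter.py` below (not text from the README itself):

```python
# A ':past <time>' suffix maps to Google's tbs URL parameter,
# keyed on the first letter of the requested range.
q = 'coffee shops :past week'
time_range = str.strip(q.split(':past', 1)[-1])  # -> 'week'
tbs = '&tbs=qdr:' + str.lower(time_range[0])     # -> '&tbs=qdr:w'
```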

app/filter.py

@@ -3,109 +3,118 @@ import re
 import urllib.parse as urlparse
 from urllib.parse import parse_qs

-AD_CLASS = 'ZINbbc'
-SPONS_CLASS = 'D1fz0e'
+class Filter:
+    def __init__(self, mobile=False, config=None):
+        self.mobile = False
+        self.dark_mode = False
+        self.nojs = False
+        self.near_city = None
+
+        if config is None:
+            config = {}
+
+        self.near_city = config['near'] if 'near' in config else None
+        self.dark_mode = config['dark_mode'] if 'dark_mode' in config else False
+        self.nojs = config['nojs'] if 'nojs' in config else False
+        self.mobile = mobile

-def reskin(page, dark_mode=False):
-    # Aesthetic only re-skinning
-    page = page.replace('>G<', '>Sh<')
-    pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)
-    page = pattern.sub('685e79', page)
-    if dark_mode:
-        page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')
-
-    return page
+    def reskin(self, page):
+        # Aesthetic only re-skinning
+        page = page.replace('>G<', '>Sh<')
+        pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)
+        page = pattern.sub('685e79', page)
+        if self.dark_mode:
+            page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')
+
+        return page

-def gen_query(q, args, near_city=None):
-    # Use :past(hour/day/week/month/year) if available
-    # example search "new restaurants :past month"
-    tbs = ''
-    # if 'tbs' in request.args:
-    #     tbs = '&tbs=' + request.args.get('tbs')
-    #     q = q.replace(q.split(':past', 1)[-1], '').replace(':past', '')
-    if ':past' in q:
-        time_range = str.strip(q.split(':past', 1)[-1])
-        tbs = '&tbs=qdr:' + str.lower(time_range[0])
-
-    # Ensure search query is parsable
-    q = urlparse.quote(q)
-
-    # Pass along type of results (news, images, books, etc)
-    tbm = ''
-    if 'tbm' in args:
-        tbm = '&tbm=' + args.get('tbm')
-
-    # Get results page start value (10 per page, ie page 2 start val = 20)
-    start = ''
-    if 'start' in args:
-        start = '&start=' + args.get('start')
-
-    # Grab city from config, if available
-    near = ''
-    if near_city:
-        near = '&near=' + urlparse.quote(near_city)
-
-    return q + tbs + tbm + start + near
+    def gen_query(self, q, args):
+        # Use :past(hour/day/week/month/year) if available
+        # example search "new restaurants :past month"
+        tbs = ''
+        if ':past' in q:
+            time_range = str.strip(q.split(':past', 1)[-1])
+            tbs = '&tbs=qdr:' + str.lower(time_range[0])
+
+        # Ensure search query is parsable
+        q = urlparse.quote(q)
+
+        # Pass along type of results (news, images, books, etc)
+        tbm = ''
+        if 'tbm' in args:
+            tbm = '&tbm=' + args.get('tbm')
+
+        # Get results page start value (10 per page, ie page 2 start val = 20)
+        start = ''
+        if 'start' in args:
+            start = '&start=' + args.get('start')
+
+        # Grab city from config, if available
+        near = ''
+        if self.near_city:
+            near = '&near=' + urlparse.quote(self.near_city)
+
+        return q + tbs + tbm + start + near

-def cook(soup, user_agent, nojs=False, dark_mode=False):
-    # Remove all ads (TODO: Ad specific div classes probably change over time, look into a more generic method)
-    main_divs = soup.find('div', {'id': 'main'})
-    if main_divs is not None:
-        ad_divs = main_divs.findAll('div', {'class': AD_CLASS}, recursive=False)
-        sponsored_divs = main_divs.findAll('div', {'class': SPONS_CLASS}, recursive=False)
-        for div in ad_divs + sponsored_divs:
-            div.decompose()
+    def clean(self, soup):
+        # Remove all ads
+        main_divs = soup.find('div', {'id': 'main'})
+        if main_divs is not None:
+            result_divs = main_divs.findAll('div', recursive=False)
+
+            # Only ads/sponsored content use classes in the list of result divs
+            ad_divs = [ad_div for ad_div in result_divs if 'class' in ad_div.attrs]
+            for div in ad_divs:
+                div.decompose()

-    # Remove unnecessary button(s)
-    for button in soup.find_all('button'):
-        button.decompose()
+        # Remove unnecessary button(s)
+        for button in soup.find_all('button'):
+            button.decompose()

-    # Remove svg logos
-    for svg in soup.find_all('svg'):
-        svg.decompose()
+        # Remove svg logos
+        for svg in soup.find_all('svg'):
+            svg.decompose()

-    # Update logo
-    logo = soup.find('a', {'class': 'l'})
-    if logo is not None and ('Android' in user_agent or 'iPhone' in user_agent):
-        logo.insert(0, 'Shoogle')
-        logo['style'] = 'display: flex;justify-content: center;align-items: center;color: #685e79;font-size: 18px;'
+        # Update logo
+        logo = soup.find('a', {'class': 'l'})
+        if logo is not None and self.mobile:
+            logo.insert(0, 'Shoogle')
+            logo['style'] = 'display: flex;justify-content: center;align-items: center;color: #685e79;font-size: 18px;'

-    # Replace hrefs with only the intended destination (no "utm" type tags)
-    for a in soup.find_all('a', href=True):
-        href = a['href']
-        if '/advanced_search' in href:
-            a.decompose()
-            continue
+        # Replace hrefs with only the intended destination (no "utm" type tags)
+        for a in soup.find_all('a', href=True):
+            href = a['href']
+            if '/advanced_search' in href:
+                a.decompose()
+                continue

-        if 'url?q=' in href:
-            # Strip unneeded arguments
-            href = urlparse.urlparse(href)
-            href = parse_qs(href.query)['q'][0]
+            if 'url?q=' in href:
+                # Strip unneeded arguments
+                href = urlparse.urlparse(href)
+                href = parse_qs(href.query)['q'][0]

-            # Add no-js option
-            if nojs:
-                nojs_link = soup.new_tag('a')
-                nojs_link['href'] = '/window?location=' + href
-                nojs_link['style'] = 'display:block;width:100%;'
-                nojs_link.string = 'NoJS Link: ' + nojs_link['href']
-                a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
-                a.append(nojs_link)
+                # Add no-js option
+                if self.nojs:
+                    nojs_link = soup.new_tag('a')
+                    nojs_link['href'] = '/window?location=' + href
+                    nojs_link['style'] = 'display:block;width:100%;'
+                    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
+                    a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
+                    a.append(nojs_link)

-    # Set up dark mode if active
-    if dark_mode:
-        soup.find('html')['style'] = 'scrollbar-color: #333 #111;'
-        for input_element in soup.findAll('input'):
-            input_element['style'] = 'color:#fff;'
+        # Set up dark mode if active
+        if self.dark_mode:
+            soup.find('html')['style'] = 'scrollbar-color: #333 #111;'
+            for input_element in soup.findAll('input'):
+                input_element['style'] = 'color:#fff;'

-    # Ensure no extra scripts passed through
-    try:
-        for script in soup('script'):
-            script.decompose()
-        soup.find('div', id='sfooter').decompose()
-    except Exception:
-        pass
+        # Ensure no extra scripts passed through
+        try:
+            for script in soup('script'):
+                script.decompose()
+            soup.find('div', id='sfooter').decompose()
+        except Exception:
+            pass

-    return soup
+        return soup
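
Taken together, the refactor replaces the module-level `reskin`/`gen_query`/`cook` functions with a single stateful object. A minimal sketch of driving the new class, mirroring the updated `app/routes.py` below (the config values here are made-up examples, not code from the commit):

```python
from bs4 import BeautifulSoup
from app.filter import Filter

# Example config dict; keys match those read in Filter.__init__
user_config = {'near': 'Seattle', 'dark_mode': False, 'nojs': True}
content_filter = Filter(mobile=False, config=user_config)

# ':past month' becomes '&tbs=qdr:m'; the configured city becomes '&near=Seattle'
query = content_filter.gen_query('new restaurants :past month', {})

# reskin() rewrites branding/colors on the raw HTML string;
# clean() strips ads, buttons, svgs, and scripts from the parsed tree
page = content_filter.reskin('<html><div id="main"></div></html>')
soup = content_filter.clean(BeautifulSoup(page, 'html.parser'))
```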

app/routes.py

@@ -1,4 +1,5 @@
-from app import app, rhyme, filter
+from app import app, rhyme
+from app.filter import Filter
 from bs4 import BeautifulSoup
 from flask import request, redirect, render_template
 from io import BytesIO
@@ -7,8 +8,8 @@ import os
 import pycurl
 import urllib.parse as urlparse

-APP_ROOT = os.path.dirname(os.path.abspath(__file__))
-STATIC_FOLDER = os.path.join(APP_ROOT, 'static')
+app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__)))
+app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static'))

 # Get Mozilla Firefox rhyme (important) and form a new user agent
 mozilla = rhyme.get_rhyme('Mo') + 'zilla'
@@ -20,7 +21,7 @@ DESKTOP_UA = mozilla + '/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/2010010
 # Base search url
 SEARCH_URL = 'https://www.google.com/search?gbv=1&q='

-user_config = json.load(open(STATIC_FOLDER + '/config.json'))
+user_config = json.load(open(app.config['STATIC_FOLDER'] + '/config.json'))


 def get_ua(user_agent):
@@ -55,29 +56,31 @@ def search():
     if q is None or len(q) <= 0:
         return render_template('error.html')

-    full_query = filter.gen_query(q, request.args)
     user_agent = request.headers.get('User-Agent')
-    dark_mode = 'dark' in user_config and user_config['dark']
-    nojs = 'nojs' in user_config and user_config['nojs']
-    get_body = filter.reskin(send_request(
-        SEARCH_URL + full_query, get_ua(user_agent)), dark_mode=dark_mode)
-    soup = filter.cook(BeautifulSoup(get_body, 'html.parser'), user_agent, nojs=nojs, dark_mode=dark_mode)
+    mobile = 'Android' in user_agent or 'iPhone' in user_agent
+    content_filter = Filter(mobile, user_config)
+    full_query = content_filter.gen_query(q, request.args)
+    get_body = send_request(SEARCH_URL + full_query, get_ua(user_agent))
+    get_body = content_filter.reskin(get_body)
+    soup = content_filter.clean(BeautifulSoup(get_body, 'html.parser'))

     return render_template('display.html', query=urlparse.unquote(q), response=soup)


-@app.route('/config', methods=['POST'])
+@app.route('/config', methods=['GET', 'POST'])
 def config():
     global user_config
-    with open(STATIC_FOLDER + '/config.json', 'w') as config_file:
-        config_file.write(json.dumps(json.loads(request.data), indent=4))
-        config_file.close()
-        user_config = json.loads(request.data)
-    return 'New config: ' + str(request.data)
+    if request.method == 'GET':
+        return json.dumps(user_config)
+    else:
+        with open(app.config['STATIC_FOLDER'] + '/config.json', 'w') as config_file:
+            config_file.write(json.dumps(json.loads(request.data), indent=4))
+            config_file.close()
+            user_config = json.loads(request.data)
+        return 'New config: ' + str(request.data)


 @app.route('/url', methods=['GET'])
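
With `GET` added to `/config`, the endpoint now round-trips its JSON config. A small sketch of that behavior using Flask's test client, the same approach the new tests below use (the config value is a made-up example):

```python
import json
from app import app

client = app.test_client()

# POST writes static/config.json and updates the cached user_config
client.post('/config', data=json.dumps({'near': 'Denver'}))

# GET returns the cached config as JSON
assert json.loads(client.get('/config').data) == {'near': 'Denver'}
```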

config/opensearch.py

@@ -1,7 +1,9 @@
+import os
 import sys

-template_path = './app/static/opensearch.template'
-opensearch_path = './app/static/opensearch.xml'
+script_path = os.path.dirname(os.path.realpath(__file__))
+template_path = script_path + '/../app/static/opensearch.template'
+opensearch_path = script_path + '/../app/static/opensearch.xml'
 replace_tag = 'SHOOGLE_URL'

 if len(sys.argv) != 2:

config/requirements.txt

@@ -11,6 +11,7 @@ Phyme==0.0.9
 pycparser==2.19
 pycurl==7.43.0.4
 pyOpenSSL==19.1.0
+pytest==5.4.1
 six==1.14.0
 soupsieve==1.9.5
 Werkzeug==0.16.0

run (new executable file)

@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Usage:
+#   ./run       # Runs the full web app
+#   ./run test  # Runs the testing suite
+
+SCRIPT=`realpath $0`
+SCRIPT_DIR=`dirname $SCRIPT`
+
+# Set default port if unavailable
+if [[ -z "${PORT}" ]]; then
+  PORT=5000
+fi
+
+# Set directory to serve static content from
+[[ ! -z $1 ]] && SUBDIR="$1" || SUBDIR="app"
+export APP_ROOT=$SCRIPT_DIR/$SUBDIR
+export STATIC_FOLDER=$APP_ROOT/static
+mkdir -p $STATIC_FOLDER
+
+# Create default config json if it doesn't exist
+if [[ ! -f $STATIC_FOLDER/config.json ]]; then
+  echo "{}" > $STATIC_FOLDER/config.json
+fi
+
+pkill flask
+
+# Check for regular vs test run
+if [[ $SUBDIR == "test" ]]; then
+  pytest -sv
+else
+  flask run --host="0.0.0.0" --port=$PORT
+fi

run.sh (deleted)

@@ -1,17 +0,0 @@
-#!/bin/bash
-
-SCRIPT=`realpath $0`
-SCRIPT_DIR=`dirname $SCRIPT`
-
-if [[ -z "${PORT}" ]]; then
-  PORT=5000
-fi
-
-# Create config json if it doesn't exist
-if [[ ! -f $SCRIPT_DIR/app/static/config.json ]]; then
-  echo "{}" > $SCRIPT_DIR/app/static/config.json
-fi
-
-pkill flask
-
-flask run --host="0.0.0.0" --port=$PORT

(deleted file)

@@ -1 +0,0 @@
-from app import app

test/__init__.py (new, empty file)

test/conftest.py (new file)

@@ -0,0 +1,8 @@
+from app import app
+import pytest
+
+
+@pytest.fixture
+def client():
+    client = app.test_client()
+    yield client

test/test_results.py (new file)

@@ -0,0 +1,54 @@
+from bs4 import BeautifulSoup
+from app.filter import Filter
+import json
+from datetime import datetime
+from dateutil.parser import *
+from test.conftest import client
+
+
+def get_search_results(data):
+    soup = Filter().clean(BeautifulSoup(data, 'html.parser'))
+
+    main_divs = soup.find('div', {'id': 'main'})
+    assert len(main_divs) > 1
+
+    result_divs = []
+    for div in main_divs:
+        # Result divs should only have 1 inner div
+        if len(list(div.children)) != 1 or not div.findChild() or 'div' not in div.findChild().name:
+            continue
+
+        result_divs.append(div)
+
+    return result_divs
+
+
+def test_search_results(client):
+    rv = client.get('/search?q=test')
+    assert rv._status_code == 200
+
+    assert len(get_search_results(rv.data)) == 10
+
+
+def test_recent_results(client):
+    times = {
+        'pastyear': 365,
+        'pastmonth': 31,
+        'pastweek': 7
+    }
+
+    for time, num_days in times.items():
+        rv = client.get('/search?q=test%20%3A' + time)
+        result_divs = get_search_results(rv.data)
+
+        current_date = datetime.now()
+        for div in result_divs:
+            date_span = div.find('span').decode_contents()
+            if not date_span or len(date_span) > 15:
+                continue
+
+            try:
+                date = parse(date_span)
+                assert (current_date - date).days < num_days
+            except ParserError:
+                assert ' ago' in date_span

test/test_routes.py (new file)

@@ -0,0 +1,30 @@
+import json
+from test.conftest import client
+
+demo_config = {
+    'near': 'Seattle',
+    'dark_mode': 0,
+    'nojs': 0
+}
+
+
+def test_main(client):
+    rv = client.get('/')
+    assert rv._status_code == 200
+
+
+def test_search(client):
+    rv = client.get('/search?q=test')
+    assert rv._status_code == 200
+
+
+def test_config(client):
+    rv = client.post('/config', data=json.dumps(demo_config))
+    assert rv._status_code == 200
+
+    rv = client.get('/config')
+    assert rv._status_code == 200
+
+    config = json.loads(rv.data)
+    for key in demo_config.keys():
+        assert config[key] == demo_config[key]