Added testing and CI build, refactored filter class, refactored project structure

parent 2600f494b7
commit b5b6e64177
.gitignore (vendored): 1 change

@@ -5,3 +5,4 @@ __pycache__/
*.pem
*.xml
config.json
test/static
.travis.yml (new file): 6 lines

@@ -0,0 +1,6 @@
language: python
python: 3.6
install:
  - pip install -r config/requirements.txt
script:
  - ./run test
Dockerfile

@@ -3,6 +3,6 @@ FROM python:3
WORKDIR /usr/src/app
COPY . .

RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir -r config/requirements.txt

CMD ["./run.sh"]
CMD ["./run"]
README.md: 20 changes

@@ -29,14 +29,26 @@ heroku open
Now you're done! This series of commands can take a while, but once you run it once, you shouldn't have to run it again. The final command, `heroku open` will launch a tab in your web browser, where you can test out Shoogle and even [set it as your primary search engine](https://github.com/benbusby/shoogle#set-shoogle-as-your-primary-search-engine).

#### B) Using your own server, or alternative container deployment
There are other methods for deploying docker containers that are well outlined in [this article](https://rollout.io/blog/the-shortlist-of-docker-hosting/), but there are too many to describe set up for each here. Generally it should be about the same amount of effort as the Heroku deployment.
There are other methods for deploying docker containers that are well outlined in [this article](https://rollout.io/blog/the-shortlist-of-docker-hosting/), but there are too many to describe set up for each here. Generally it should be about the same amount of effort as the Heroku deployment.

Depending on your preferences, you can also deploy the app yourself on your own infrastructure. This route would require a few extra steps:
- A server (I personally recommend [Digital Ocean](https://www.digitalocean.com/pricing/) or [Linode](https://www.linode.com/pricing/), their cheapest tiers will work fine)
- Your own URL (I suppose this is optional, but recommended)
- SSL certificates (free through [Let's Encrypt](https://letsencrypt.org/getting-started/))
- A bit more experience or willingness to work through issues


## Setup (Local Only)
If you want to test the app out on your own machine first, you can build it with the following instructions:

```bash
git clone https://github.com/benbusby/shoogle.git
cd shoogle
python3 -m venv venv
source venv/bin/activate
pip install -r config/requirements.txt
./run
```

## Usage
Same as most search engines, with the exception of filtering by time range.

@@ -44,7 +56,7 @@ To filter by a range of time, append ":past <time>" to the end of your search, w

## Extra Steps
### Set Shoogle as your primary search engine
1. From the main shoogle folder, run `python opensearch.py "<your app url>"`
1. From the main shoogle folder, run `python config/opensearch.py "<your app url>"`
2. Rebuild and release your updated app
   - `heroku container:push web` and then `heroku container:release web`
3. Update browser settings

@@ -69,4 +81,4 @@ Part of the deal with Heroku's free tier is that you're allocated 550 hours/mont

A good solution for this is to set up a simple cronjob on any device at your home that is consistently powered on and connected to the internet (in my case, a PiHole worked perfectly). All the device needs to do is fetch app content on a consistent basis to keep the app alive in whatever ~17 hour window you want it on (17 hrs * 31 days = 527, meaning you'd still have 23 leftover hours each month if you searched outside of your target window).

For instance: `*/20 7-23 * * * curl https://<your heroku app name>.herokuapp.com > /home/<username>/shoogle-refresh` will fetch the home page of the app every 20 minutes between 7am and midnight, allowing for downtime from midnight to 7am. And again, this wouldn't be a hard limit - you'd still have plenty of remaining hours of uptime each month in case you were searching after this window has closed.
For instance: `*/20 7-23 * * * curl https://<your heroku app name>.herokuapp.com > /home/<username>/shoogle-refresh` will fetch the home page of the app every 20 minutes between 7am and midnight, allowing for downtime from midnight to 7am. And again, this wouldn't be a hard limit - you'd still have plenty of remaining hours of uptime each month in case you were searching after this window has closed.
app/filter.py: 179 changes

@@ -3,109 +3,118 @@ import re
import urllib.parse as urlparse
from urllib.parse import parse_qs

AD_CLASS = 'ZINbbc'
SPONS_CLASS = 'D1fz0e'

class Filter:
    def __init__(self, mobile=False, config=None):
        self.mobile = False
        self.dark_mode = False
        self.nojs = False
        self.near_city = None

def reskin(page, dark_mode=False):
    # Aesthetic only re-skinning
    page = page.replace('>G<', '>Sh<')
    pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)
    page = pattern.sub('685e79', page)
    if dark_mode:
        page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')
        if config is None:
            config = {}

    return page
        near_city = config['near'] if 'near' in config else None
        dark_mode = config['dark_mode'] if 'dark_mode' in config else False
        nojs = config['nojs'] if 'nojs' in config else False
        mobile = mobile

    def reskin(self, page):
        # Aesthetic only re-skinning
        page = page.replace('>G<', '>Sh<')
        pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)
        page = pattern.sub('685e79', page)
        if self.dark_mode:
            page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')

def gen_query(q, args, near_city=None):
    # Use :past(hour/day/week/month/year) if available
    # example search "new restaurants :past month"
    tbs = ''
    # if 'tbs' in request.args:
    #     tbs = '&tbs=' + request.args.get('tbs')
    #     q = q.replace(q.split(':past', 1)[-1], '').replace(':past', '')
    if ':past' in q:
        time_range = str.strip(q.split(':past', 1)[-1])
        tbs = '&tbs=qdr:' + str.lower(time_range[0])
        return page

    # Ensure search query is parsable
    q = urlparse.quote(q)
    def gen_query(self, q, args):
        # Use :past(hour/day/week/month/year) if available
        # example search "new restaurants :past month"
        tbs = ''
        if ':past' in q:
            time_range = str.strip(q.split(':past', 1)[-1])
            tbs = '&tbs=qdr:' + str.lower(time_range[0])

    # Pass along type of results (news, images, books, etc)
    tbm = ''
    if 'tbm' in args:
        tbm = '&tbm=' + args.get('tbm')
        # Ensure search query is parsable
        q = urlparse.quote(q)

    # Get results page start value (10 per page, ie page 2 start val = 20)
    start = ''
    if 'start' in args:
        start = '&start=' + args.get('start')
        # Pass along type of results (news, images, books, etc)
        tbm = ''
        if 'tbm' in args:
            tbm = '&tbm=' + args.get('tbm')

    # Grab city from config, if available
    near = ''
    if near_city:
        near = '&near=' + urlparse.quote(near_city)
        # Get results page start value (10 per page, ie page 2 start val = 20)
        start = ''
        if 'start' in args:
            start = '&start=' + args.get('start')

    return q + tbs + tbm + start + near
        # Grab city from config, if available
        near = ''
        if self.near_city:
            near = '&near=' + urlparse.quote(self.near_city)

        return q + tbs + tbm + start + near

def cook(soup, user_agent, nojs=False, dark_mode=False):
    # Remove all ads (TODO: Ad specific div classes probably change over time, look into a more generic method)
    main_divs = soup.find('div', {'id': 'main'})
    if main_divs is not None:
        ad_divs = main_divs.findAll('div', {'class': AD_CLASS}, recursive=False)
        sponsored_divs = main_divs.findAll('div', {'class': SPONS_CLASS}, recursive=False)
        for div in ad_divs + sponsored_divs:
            div.decompose()
    def clean(self, soup):
        # Remove all ads
        main_divs = soup.find('div', {'id': 'main'})
        if main_divs is not None:
            result_divs = main_divs.findAll('div', recursive=False)

    # Remove unnecessary button(s)
    for button in soup.find_all('button'):
        button.decompose()
            # Only ads/sponsored content use classes in the list of result divs
            ad_divs = [ad_div for ad_div in result_divs if 'class' in ad_div.attrs]
            for div in ad_divs:
                div.decompose()

    # Remove svg logos
    for svg in soup.find_all('svg'):
        svg.decompose()
        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

    # Update logo
    logo = soup.find('a', {'class': 'l'})
    if logo is not None and ('Android' in user_agent or 'iPhone' in user_agent):
        logo.insert(0, 'Shoogle')
        logo['style'] = 'display: flex;justify-content: center;align-items: center;color: #685e79;font-size: 18px;'
        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

    # Replace hrefs with only the intended destination (no "utm" type tags)
    for a in soup.find_all('a', href=True):
        href = a['href']
        if '/advanced_search' in href:
            a.decompose()
            continue
        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo is not None and self.mobile:
            logo.insert(0, 'Shoogle')
            logo['style'] = 'display: flex;justify-content: center;align-items: center;color: #685e79;font-size: 18px;'

        if 'url?q=' in href:
            # Strip unneeded arguments
            href = urlparse.urlparse(href)
            href = parse_qs(href.query)['q'][0]
        # Replace hrefs with only the intended destination (no "utm" type tags)
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/advanced_search' in href:
                a.decompose()
                continue

        # Add no-js option
        if nojs:
            nojs_link = soup.new_tag('a')
            nojs_link['href'] = '/window?location=' + href
            nojs_link['style'] = 'display:block;width:100%;'
            nojs_link.string = 'NoJS Link: ' + nojs_link['href']
            a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
            a.append(nojs_link)
            if 'url?q=' in href:
                # Strip unneeded arguments
                href = urlparse.urlparse(href)
                href = parse_qs(href.query)['q'][0]

    # Set up dark mode if active
    if dark_mode:
        soup.find('html')['style'] = 'scrollbar-color: #333 #111;'
        for input_element in soup.findAll('input'):
            input_element['style'] = 'color:#fff;'
            # Add no-js option
            if self.nojs:
                nojs_link = soup.new_tag('a')
                nojs_link['href'] = '/window?location=' + href
                nojs_link['style'] = 'display:block;width:100%;'
                nojs_link.string = 'NoJS Link: ' + nojs_link['href']
                a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
                a.append(nojs_link)

    # Ensure no extra scripts passed through
    try:
        for script in soup('script'):
            script.decompose()
        soup.find('div', id='sfooter').decompose()
    except Exception:
        pass
        # Set up dark mode if active
        if self.dark_mode:
            soup.find('html')['style'] = 'scrollbar-color: #333 #111;'
            for input_element in soup.findAll('input'):
                input_element['style'] = 'color:#fff;'

    return soup
        # Ensure no extra scripts passed through
        try:
            for script in soup('script'):
                script.decompose()
            soup.find('div', id='sfooter').decompose()
        except Exception:
            pass

        return soup
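For readers following the refactor, the sketch below shows the call order the new `Filter` class expects, mirroring how the updated `search()` route uses it in this commit (`gen_query`, then `reskin`, then `clean`). It is illustrative only; the config values and the `raw_html` string are made-up placeholders, not fixtures from this commit.

```python
# Minimal usage sketch of the refactored Filter class (illustrative only).
# The config dict and raw_html below are made-up placeholder data.
from bs4 import BeautifulSoup

from app.filter import Filter

user_config = {'near': 'Seattle', 'dark_mode': 0, 'nojs': 0}
content_filter = Filter(mobile=False, config=user_config)

# Build the outgoing query string; handles ':past <time>', tbm, start and near
full_query = content_filter.gen_query('new restaurants :past month', {})

# Re-skin the raw response body, then strip ads, buttons, svgs and scripts
raw_html = '<html><div id="main"></div></html>'  # placeholder for a fetched results page
page = content_filter.reskin(raw_html)
soup = content_filter.clean(BeautifulSoup(page, 'html.parser'))
```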
app/routes.py

@@ -1,4 +1,5 @@
from app import app, rhyme, filter
from app import app, rhyme
from app.filter import Filter
from bs4 import BeautifulSoup
from flask import request, redirect, render_template
from io import BytesIO

@@ -7,8 +8,8 @@ import os
import pycurl
import urllib.parse as urlparse

APP_ROOT = os.path.dirname(os.path.abspath(__file__))
STATIC_FOLDER = os.path.join(APP_ROOT, 'static')
app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__)))
app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static'))

# Get Mozilla Firefox rhyme (important) and form a new user agent
mozilla = rhyme.get_rhyme('Mo') + 'zilla'

@@ -20,7 +21,7 @@ DESKTOP_UA = mozilla + '/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/2010010
# Base search url
SEARCH_URL = 'https://www.google.com/search?gbv=1&q='

user_config = json.load(open(STATIC_FOLDER + '/config.json'))
user_config = json.load(open(app.config['STATIC_FOLDER'] + '/config.json'))


def get_ua(user_agent):

@@ -55,29 +56,31 @@ def search():
    if q is None or len(q) <= 0:
        return render_template('error.html')

    full_query = filter.gen_query(q, request.args)
    user_agent = request.headers.get('User-Agent')
    dark_mode = 'dark' in user_config and user_config['dark']
    nojs = 'nojs' in user_config and user_config['nojs']
    mobile = 'Android' in user_agent or 'iPhone' in user_agent

    get_body = filter.reskin(send_request(
        SEARCH_URL + full_query, get_ua(user_agent)), dark_mode=dark_mode)

    soup = filter.cook(BeautifulSoup(get_body, 'html.parser'), user_agent, nojs=nojs, dark_mode=dark_mode)
    content_filter = Filter(mobile, user_config)
    full_query = content_filter.gen_query(q, request.args)
    get_body = send_request(SEARCH_URL + full_query, get_ua(user_agent))
    get_body = content_filter.reskin(get_body)
    soup = content_filter.clean(BeautifulSoup(get_body, 'html.parser'))

    return render_template('display.html', query=urlparse.unquote(q), response=soup)


@app.route('/config', methods=['POST'])
@app.route('/config', methods=['GET', 'POST'])
def config():
    global user_config
    with open(STATIC_FOLDER + '/config.json', 'w') as config_file:
        config_file.write(json.dumps(json.loads(request.data), indent=4))
        config_file.close()
    if request.method == 'GET':
        return json.dumps(user_config)
    else:
        with open(app.config['STATIC_FOLDER'] + '/config.json', 'w') as config_file:
            config_file.write(json.dumps(json.loads(request.data), indent=4))
            config_file.close()

    user_config = json.loads(request.data)
        user_config = json.loads(request.data)

    return 'New config: ' + str(request.data)
        return 'New config: ' + str(request.data)


@app.route('/url', methods=['GET'])
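Since `/config` now answers GET as well as POST, here is a quick round-trip sketch using Flask's built-in test client, the same mechanism the new test suite relies on. The payload is an arbitrary example, not a required schema.

```python
# Sketch of the reworked /config route: POST writes config.json in the static
# folder and updates user_config; GET returns the current user_config as JSON.
# The payload here is an arbitrary example.
import json

from app import app

client = app.test_client()
client.post('/config', data=json.dumps({'near': 'Seattle', 'nojs': 0}))

saved = json.loads(client.get('/config').data)
assert saved['near'] == 'Seattle'
```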
config/opensearch.py

@@ -1,7 +1,9 @@
import os
import sys

template_path = './app/static/opensearch.template'
opensearch_path = './app/static/opensearch.xml'
script_path = os.path.dirname(os.path.realpath(__file__))
template_path = script_path + '/../app/static/opensearch.template'
opensearch_path = script_path + '/../app/static/opensearch.xml'
replace_tag = 'SHOOGLE_URL'

if len(sys.argv) != 2:
config/requirements.txt

@@ -11,6 +11,7 @@ Phyme==0.0.9
pycparser==2.19
pycurl==7.43.0.4
pyOpenSSL==19.1.0
pytest==5.4.1
six==1.14.0
soupsieve==1.9.5
Werkzeug==0.16.0
run (new executable file): 33 lines

@@ -0,0 +1,33 @@
#!/bin/bash
# Usage:
#   ./run       # Runs the full web app
#   ./run test  # Runs the testing suite

SCRIPT=`realpath $0`
SCRIPT_DIR=`dirname $SCRIPT`

# Set default port if unavailable
if [[ -z "${PORT}" ]]; then
  PORT=5000
fi

# Set directory to serve static content from
[[ ! -z $1 ]] && SUBDIR="$1" || SUBDIR="app"
export APP_ROOT=$SCRIPT_DIR/$SUBDIR
export STATIC_FOLDER=$APP_ROOT/static

mkdir -p $STATIC_FOLDER

# Create default config json if it doesn't exist
if [[ ! -f $STATIC_FOLDER/config.json ]]; then
  echo "{}" > $STATIC_FOLDER/config.json
fi

pkill flask

# Check for regular vs test run
if [[ $SUBDIR == "test" ]]; then
  pytest -sv
else
  flask run --host="0.0.0.0" --port=$PORT
fi
run.sh (deleted): 17 lines

@@ -1,17 +0,0 @@
#!/bin/bash

SCRIPT=`realpath $0`
SCRIPT_DIR=`dirname $SCRIPT`

if [[ -z "${PORT}" ]]; then
  PORT=5000
fi

# Create config json if it doesn't exist
if [[ ! -f $SCRIPT_DIR/app/static/config.json ]]; then
  echo "{}" > $SCRIPT_DIR/app/static/config.json
fi

pkill flask

flask run --host="0.0.0.0" --port=$PORT
test/__init__.py (new empty file)
test/conftest.py (new file): 8 lines

@@ -0,0 +1,8 @@
from app import app
import pytest


@pytest.fixture
def client():
    client = app.test_client()
    yield client
test/test_results.py (new file): 54 lines

@@ -0,0 +1,54 @@
from bs4 import BeautifulSoup
from app.filter import Filter
import json
from datetime import datetime
from dateutil.parser import *
from test.conftest import client


def get_search_results(data):
    soup = Filter().clean(BeautifulSoup(rv.data, 'html.parser'))

    main_divs = soup.find('div', {'id': 'main'})
    assert len(main_divs) > 1

    result_divs = []
    for div in main_divs:
        # Result divs should only have 1 inner div
        if len(list(div.children)) != 1 or not div.findChild() or 'div' not in div.findChild().name:
            continue

        result_divs.append(div)

    return result_divs


def test_search_results(client):
    rv = client.get('/search?q=test')
    assert rv._status_code == 200

    assert len(get_search_results(rv.data)) == 10


def test_recent_results(client):
    times = {
        'pastyear': 365,
        'pastmonth': 31,
        'pastweek': 7
    }

    for time, num_days in times.items():
        rv = client.get('/search?q=test%20%3A' + time)
        result_divs = get_search_results(rv.data)

        current_date = datetime.now()
        for div in result_divs:
            date_span = div.find('span').decode_contents()
            if not date_span or len(date_span) > 15:
                continue

            try:
                date = parse(date_span)
                assert (current_date - date).days < num_days
            except ParserError:
                assert ' ago' in date_span
test/test_routes.py (new file): 30 lines

@@ -0,0 +1,30 @@
import json
from test.conftest import client

demo_config = {
    'near': 'Seattle',
    'dark_mode': 0,
    'nojs': 0
}


def test_main(client):
    rv = client.get('/')
    assert rv._status_code == 200


def test_search(client):
    rv = client.get('/search?q=test')
    assert rv._status_code == 200


def test_config(client):
    rv = client.post('/config', data=json.dumps(demo_config))
    assert rv._status_code == 200

    rv = client.get('/config')
    assert rv._status_code == 200

    config = json.loads(rv.data)
    for key in demo_config.keys():
        assert config[key] == demo_config[key]