run lubimyczytac detail pages in threadpool

This commit is contained in:
collerek 2021-12-13 02:14:53 +01:00
parent d55626d445
commit 362fdc5716
3 changed files with 99 additions and 87 deletions

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web) # This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
# Copyright (C) 2021 OzzieIsaacs # Copyright (C) 2021 OzzieIsaacs
# #
@ -18,7 +17,8 @@
import datetime import datetime
import json import json
import re import re
from typing import Dict, Generator, List, Optional, Tuple, Union from multiprocessing.pool import ThreadPool
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import quote from urllib.parse import quote
import requests import requests
@ -114,13 +114,14 @@ class LubimyCzytac(Metadata):
lc_parser = LubimyCzytacParser(root=root, metadata=self) lc_parser = LubimyCzytacParser(root=root, metadata=self)
matches = lc_parser.parse_search_results() matches = lc_parser.parse_search_results()
if matches: if matches:
final_matches = [] with ThreadPool(processes=10) as pool:
for match in matches: final_matches = pool.starmap(
response = requests.get(match.get("url")) lc_parser.parse_single_book,
match = lc_parser.parse_single_book( [
match=match, response=response, generic_cover=generic_cover (match, generic_cover)
for match in matches
],
) )
final_matches.append(match)
return final_matches return final_matches
return matches return matches
@ -146,46 +147,6 @@ class LubimyCzytac(Metadata):
return "" return ""
return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}" return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"
@staticmethod
def get_title_tokens(
    title: str, strip_joiners: bool = True
) -> Generator[str, None, None]:
    """Yield the significant words of a book title, in order.

    Adapted from the calibre source code. The title is cleaned by a fixed
    sequence of case-insensitive regex substitutions, split on whitespace,
    and each piece is stripped of surrounding quotes before being yielded.

    Args:
        title: Raw title text.
        strip_joiners: When true, the words "a", "and", "the" and "&" are
            dropped (compared case-insensitively).

    Yields:
        Each non-empty cleaned token.
    """
    # Order matters: the bracket-based rules must run before the final rule
    # turns the brackets themselves into spaces.
    substitutions = (
        # Remove things like: (2010) (Omnibus) etc.
        (
            r"[({\[](\d{4}|omnibus|anthology|hardcover|"
            r"audiobook|audio\scd|paperback|turtleback|"
            r"mass\s*market|edition|ed\.)[\])}]",
            "",
        ),
        # Remove any bracketed text that mentions an edition. The dot in
        # "ed." is escaped so that ordinary words containing "ed" (for
        # example "(Red Cover)") are no longer deleted.
        (r"[({\[].*?(edition|ed\.).*?[\]})]", ""),
        # Remove commas used as separators in numbers
        (r"(\d+),(\d+)", r"\1\2"),
        # Remove hyphens only if they have whitespace before them
        (r"(\s-)", " "),
        # Replace other special chars with a space. The CJK brackets and
        # curly quotes live inside the character class; when they trailed
        # it as a literal suffix the rule could never match.
        (r"""[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]""", " "),
    )
    for pattern, replacement in substitutions:
        title = re.sub(pattern, replacement, title, flags=re.IGNORECASE)

    joiners = ("a", "and", "the", "&")
    for token in title.split():
        token = token.strip().strip('"').strip("'")
        if token and (not strip_joiners or token.lower() not in joiners):
            yield token
class LubimyCzytacParser: class LubimyCzytacParser:
PAGES_TEMPLATE = "<p id='strony'>Książka ma {0} stron(y).</p>" PAGES_TEMPLATE = "<p id='strony'>Książka ma {0} stron(y).</p>"
@ -232,8 +193,9 @@ class LubimyCzytacParser:
return matches return matches
def parse_single_book( def parse_single_book(
self, match: Dict, response, generic_cover: str self, match: Dict, generic_cover: str
) -> MetaRecord: ) -> MetaRecord:
response = requests.get(match.get("url"))
self.root = fromstring(response.text) self.root = fromstring(response.text)
match["series"], match["series_index"] = self._parse_series() match["series"], match["series_index"] = self._parse_series()
match["tags"] = self._parse_tags() match["tags"] = self._parse_tags()

View File

@ -16,25 +16,23 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import json
import importlib
import sys
import inspect
import datetime
import concurrent.futures import concurrent.futures
import importlib
import inspect
import json
import os
import sys
from flask import Blueprint, request, Response, url_for from flask import Blueprint, Response, request, url_for
from flask_login import current_user from flask_login import current_user
from flask_login import login_required from flask_login import login_required
from sqlalchemy.exc import InvalidRequestError, OperationalError
from sqlalchemy.orm.attributes import flag_modified from sqlalchemy.orm.attributes import flag_modified
from sqlalchemy.exc import OperationalError, InvalidRequestError
from . import constants, logger, ub
from cps.services.Metadata import Metadata from cps.services.Metadata import Metadata
from . import constants, logger, ub
meta = Blueprint("metadata", __name__)
meta = Blueprint('metadata', __name__)
log = logger.create() log = logger.create()
@ -42,7 +40,7 @@ new_list = list()
meta_dir = os.path.join(constants.BASE_DIR, "cps", "metadata_provider") meta_dir = os.path.join(constants.BASE_DIR, "cps", "metadata_provider")
modules = os.listdir(os.path.join(constants.BASE_DIR, "cps", "metadata_provider")) modules = os.listdir(os.path.join(constants.BASE_DIR, "cps", "metadata_provider"))
for f in modules: for f in modules:
if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith('__init__.py'): if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith("__init__.py"):
a = os.path.basename(f)[:-3] a = os.path.basename(f)[:-3]
try: try:
importlib.import_module("cps.metadata_provider." + a) importlib.import_module("cps.metadata_provider." + a)
@ -51,34 +49,46 @@ for f in modules:
log.error("Import error for metadata source: {}".format(a)) log.error("Import error for metadata source: {}".format(a))
pass pass
def list_classes(provider_list):
    """Instantiate every metadata provider class found in the given modules.

    Each entry of ``provider_list`` is a module name below
    ``cps.metadata_provider``; the module is looked up in ``sys.modules``,
    so it must already have been imported. Every class member that
    subclasses ``Metadata`` (other than a member named "Metadata" itself)
    is instantiated with no arguments.

    Returns:
        A list with one instance per discovered provider class.
    """
    instances = []
    for module_name in provider_list:
        module = sys.modules["cps.metadata_provider." + module_name]
        for member_name, member in inspect.getmembers(module):
            if not inspect.isclass(member):
                continue
            # Skip the base class re-exported by the provider module.
            if member_name == "Metadata":
                continue
            if issubclass(member, Metadata):
                instances.append(member())
    return instances
cl = list_classes(new_list) cl = list_classes(new_list)
@meta.route("/metadata/provider")
@login_required
def metadata_provider():
    """Return the known metadata providers as a JSON list.

    Each entry carries the provider's ``name`` and ``id`` plus ``active``
    and ``initial``, both taken from the current user's "metadata" view
    settings and defaulting to True when the user has no stored value.
    """
    user_settings = current_user.view_settings.get("metadata", {})
    providers = []
    for source in cl:
        enabled = user_settings.get(source.__id__, True)
        entry = {
            "name": source.__name__,
            "active": enabled,
            "initial": enabled,
            "id": source.__id__,
        }
        providers.append(entry)
    body = json.dumps(providers)
    return Response(body, mimetype="application/json")
@meta.route("/metadata/provider", methods=['POST'])
@meta.route("/metadata/provider/<prov_name>", methods=['POST']) @meta.route("/metadata/provider", methods=["POST"])
@meta.route("/metadata/provider/<prov_name>", methods=["POST"])
@login_required @login_required
def metadata_change_active_provider(prov_name): def metadata_change_active_provider(prov_name):
new_state = request.get_json() new_state = request.get_json()
active = current_user.view_settings.get('metadata', {}) active = current_user.view_settings.get("metadata", {})
active[new_state['id']] = new_state['value'] active[new_state["id"]] = new_state["value"]
current_user.view_settings['metadata'] = active current_user.view_settings["metadata"] = active
try: try:
try: try:
flag_modified(current_user, "view_settings") flag_modified(current_user, "view_settings")
@ -91,27 +101,26 @@ def metadata_change_active_provider(prov_name):
if "initial" in new_state and prov_name: if "initial" in new_state and prov_name:
for c in cl: for c in cl:
if c.__id__ == prov_name: if c.__id__ == prov_name:
data = c.search(new_state.get('query', "")) data = c.search(new_state.get("query", ""))
break break
return Response(json.dumps(data), mimetype='application/json') return Response(json.dumps(data), mimetype="application/json")
return "" return ""
@meta.route("/metadata/search", methods=["POST"])
@login_required
def metadata_search():
    """Query all enabled metadata providers and return the combined results.

    Reads the ``query`` form field. When it is non-empty, each provider the
    current user has not disabled is searched on a thread pool (at most
    five workers), and the results are concatenated in the order the
    searches finish. The response is a JSON list, empty when there is no
    query.
    """
    query = request.form.to_dict().get("query")
    results = []
    enabled = current_user.view_settings.get("metadata", {})
    if not query:
        return Response(json.dumps(results), mimetype="application/json")

    cover_url = url_for("static", filename="generic_cover.jpg")
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        pending = []
        for provider in cl:
            # Providers are on unless the user switched them off.
            if enabled.get(provider.__id__, True):
                pending.append(executor.submit(provider.search, query, cover_url))
        for finished in concurrent.futures.as_completed(pending):
            results.extend(finished.result())
    return Response(json.dumps(results), mimetype="application/json")

View File

@ -16,7 +16,8 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import abc import abc
from typing import Dict, List, Optional, TypedDict, Union import re
from typing import Dict, Generator, List, Optional, TypedDict, Union
class Metadata: class Metadata:
@ -30,9 +31,49 @@ class Metadata:
self.active = state self.active = state
@abc.abstractmethod @abc.abstractmethod
def search(self, query: str, generic_cover: str): def search(self, query: str, generic_cover: str = ""):
pass pass
@staticmethod
def get_title_tokens(
    title: str, strip_joiners: bool = True
) -> Generator[str, None, None]:
    """Yield the significant words of a book title, in order.

    Adapted from the calibre source code. The title is cleaned by a fixed
    sequence of case-insensitive regex substitutions, split on whitespace,
    and each piece is stripped of surrounding quotes before being yielded.

    Args:
        title: Raw title text.
        strip_joiners: When true, the words "a", "and", "the" and "&" are
            dropped (compared case-insensitively).

    Yields:
        Each non-empty cleaned token.
    """
    # Order matters: the bracket-based rules must run before the final rule
    # turns the brackets themselves into spaces.
    substitutions = (
        # Remove things like: (2010) (Omnibus) etc.
        (
            r"[({\[](\d{4}|omnibus|anthology|hardcover|"
            r"audiobook|audio\scd|paperback|turtleback|"
            r"mass\s*market|edition|ed\.)[\])}]",
            "",
        ),
        # Remove any bracketed text that mentions an edition. The dot in
        # "ed." is escaped so that ordinary words containing "ed" (for
        # example "(Red Cover)") are no longer deleted.
        (r"[({\[].*?(edition|ed\.).*?[\]})]", ""),
        # Remove commas used as separators in numbers
        (r"(\d+),(\d+)", r"\1\2"),
        # Remove hyphens only if they have whitespace before them
        (r"(\s-)", " "),
        # Replace other special chars with a space. The CJK brackets and
        # curly quotes live inside the character class; when they trailed
        # it as a literal suffix the rule could never match.
        (r"""[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]""", " "),
    )
    for pattern, replacement in substitutions:
        title = re.sub(pattern, replacement, title, flags=re.IGNORECASE)

    joiners = ("a", "and", "the", "&")
    for token in title.split():
        token = token.strip().strip('"').strip("'")
        if token and (not strip_joiners or token.lower() not in joiners):
            yield token
class MetaSourceInfo(TypedDict): class MetaSourceInfo(TypedDict):
id: str id: str