run lubimyczytac detail pages in threadpool
This commit is contained in:
parent
d55626d445
commit
362fdc5716
|
@ -1,5 +1,4 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
|
# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
|
||||||
# Copyright (C) 2021 OzzieIsaacs
|
# Copyright (C) 2021 OzzieIsaacs
|
||||||
#
|
#
|
||||||
|
@ -18,7 +17,8 @@
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from typing import Dict, Generator, List, Optional, Tuple, Union
|
from multiprocessing.pool import ThreadPool
|
||||||
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
@ -114,13 +114,14 @@ class LubimyCzytac(Metadata):
|
||||||
lc_parser = LubimyCzytacParser(root=root, metadata=self)
|
lc_parser = LubimyCzytacParser(root=root, metadata=self)
|
||||||
matches = lc_parser.parse_search_results()
|
matches = lc_parser.parse_search_results()
|
||||||
if matches:
|
if matches:
|
||||||
final_matches = []
|
with ThreadPool(processes=10) as pool:
|
||||||
for match in matches:
|
final_matches = pool.starmap(
|
||||||
response = requests.get(match.get("url"))
|
lc_parser.parse_single_book,
|
||||||
match = lc_parser.parse_single_book(
|
[
|
||||||
match=match, response=response, generic_cover=generic_cover
|
(match, generic_cover)
|
||||||
|
for match in matches
|
||||||
|
],
|
||||||
)
|
)
|
||||||
final_matches.append(match)
|
|
||||||
return final_matches
|
return final_matches
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
@ -146,46 +147,6 @@ class LubimyCzytac(Metadata):
|
||||||
return ""
|
return ""
|
||||||
return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"
|
return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"
|
||||||
|
|
||||||
@staticmethod
def get_title_tokens(
    title: str, strip_joiners: bool = True
) -> Generator[str, None, None]:
    """
    Yield the search-relevant words of a book title.

    Adapted from the calibre source code. Bracketed edition notes and
    punctuation are removed, the remainder is split on whitespace and each
    surviving word is yielded in its original order.

    :param title: raw title text
    :param strip_joiners: when true, skip the words "a", "and", "the" and
        "&" (compared case-insensitively)
    :return: generator over the remaining tokens
    """
    # Order matters: bracketed notes have to be removed before the last rule
    # turns the brackets themselves into spaces.
    substitutions = [
        (re.compile(pattern, re.IGNORECASE), replacement)
        for pattern, replacement in (
            # Remove things like: (2010) (Omnibus) etc.
            (
                r"[({\[](\d{4}|omnibus|anthology|hardcover|"
                r"audiobook|audio\scd|paperback|turtleback|"
                r"mass\s*market|edition|ed\.)[\])}]",
                "",
            ),
            # Remove any bracketed text that mentions "edition" or "ed.".
            # The dot is escaped so that ordinary words containing "ed"
            # (e.g. "(Red Book)") are left alone.
            (r"[({\[].*?(edition|ed\.).*?[\]})]", ""),
            # Remove commas used as separators in numbers
            (r"(\d+),(\d+)", r"\1\2"),
            # Remove hyphens only if they have whitespace before them
            (r"(\s-)", " "),
            # Replace other special chars with a space. The CJK brackets and
            # curly quotes sit inside the character class; outside it the
            # pattern would demand them as a literal suffix and never match.
            (r"""[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]""", " "),
        )
    ]

    for regex, replacement in substitutions:
        title = regex.sub(replacement, title)

    for token in title.split():
        token = token.strip().strip('"').strip("'")
        if not token:
            continue
        if strip_joiners and token.lower() in ("a", "and", "the", "&"):
            continue
        yield token
|
|
||||||
|
|
||||||
|
|
||||||
class LubimyCzytacParser:
|
class LubimyCzytacParser:
|
||||||
PAGES_TEMPLATE = "<p id='strony'>Książka ma {0} stron(y).</p>"
|
PAGES_TEMPLATE = "<p id='strony'>Książka ma {0} stron(y).</p>"
|
||||||
|
@ -232,8 +193,9 @@ class LubimyCzytacParser:
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
def parse_single_book(
|
def parse_single_book(
|
||||||
self, match: Dict, response, generic_cover: str
|
self, match: Dict, generic_cover: str
|
||||||
) -> MetaRecord:
|
) -> MetaRecord:
|
||||||
|
response = requests.get(match.get("url"))
|
||||||
self.root = fromstring(response.text)
|
self.root = fromstring(response.text)
|
||||||
match["series"], match["series_index"] = self._parse_series()
|
match["series"], match["series_index"] = self._parse_series()
|
||||||
match["tags"] = self._parse_tags()
|
match["tags"] = self._parse_tags()
|
||||||
|
|
|
@ -16,25 +16,23 @@
|
||||||
# You should have received a copy of the GNU General Public License
|
# You should have received a copy of the GNU General Public License
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import os
|
|
||||||
import json
|
|
||||||
import importlib
|
|
||||||
import sys
|
|
||||||
import inspect
|
|
||||||
import datetime
|
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
|
import importlib
|
||||||
|
import inspect
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
from flask import Blueprint, request, Response, url_for
|
from flask import Blueprint, Response, request, url_for
|
||||||
from flask_login import current_user
|
from flask_login import current_user
|
||||||
from flask_login import login_required
|
from flask_login import login_required
|
||||||
|
from sqlalchemy.exc import InvalidRequestError, OperationalError
|
||||||
from sqlalchemy.orm.attributes import flag_modified
|
from sqlalchemy.orm.attributes import flag_modified
|
||||||
from sqlalchemy.exc import OperationalError, InvalidRequestError
|
|
||||||
|
|
||||||
from . import constants, logger, ub
|
|
||||||
from cps.services.Metadata import Metadata
|
from cps.services.Metadata import Metadata
|
||||||
|
from . import constants, logger, ub
|
||||||
|
|
||||||
|
meta = Blueprint("metadata", __name__)
|
||||||
meta = Blueprint('metadata', __name__)
|
|
||||||
|
|
||||||
log = logger.create()
|
log = logger.create()
|
||||||
|
|
||||||
|
@ -42,7 +40,7 @@ new_list = list()
|
||||||
meta_dir = os.path.join(constants.BASE_DIR, "cps", "metadata_provider")
|
meta_dir = os.path.join(constants.BASE_DIR, "cps", "metadata_provider")
|
||||||
modules = os.listdir(os.path.join(constants.BASE_DIR, "cps", "metadata_provider"))
|
modules = os.listdir(os.path.join(constants.BASE_DIR, "cps", "metadata_provider"))
|
||||||
for f in modules:
|
for f in modules:
|
||||||
if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith('__init__.py'):
|
if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith("__init__.py"):
|
||||||
a = os.path.basename(f)[:-3]
|
a = os.path.basename(f)[:-3]
|
||||||
try:
|
try:
|
||||||
importlib.import_module("cps.metadata_provider." + a)
|
importlib.import_module("cps.metadata_provider." + a)
|
||||||
|
@ -51,34 +49,46 @@ for f in modules:
|
||||||
log.error("Import error for metadata source: {}".format(a))
|
log.error("Import error for metadata source: {}".format(a))
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def list_classes(provider_list):
    """
    Instantiate every metadata provider class found in the given modules.

    :param provider_list: module names without the "cps.metadata_provider."
        prefix; each module is looked up in sys.modules
    :return: list holding one new instance per Metadata subclass found
    """
    instances = []
    for module_name in provider_list:
        module = sys.modules["cps.metadata_provider." + module_name]
        for member_name, member in inspect.getmembers(module):
            if not inspect.isclass(member):
                continue
            # Members named "Metadata" are skipped (presumably the imported
            # base class itself), as is anything that does not derive from it.
            if member_name == "Metadata" or not issubclass(member, Metadata):
                continue
            instances.append(member())
    return instances
|
||||||
|
|
||||||
|
|
||||||
cl = list_classes(new_list)
|
cl = list_classes(new_list)
|
||||||
|
|
||||||
|
|
||||||
@meta.route("/metadata/provider")
@login_required
def metadata_provider():
    """
    Return the loaded metadata providers as a JSON list.

    Every entry carries the provider's name and id plus an "active" and an
    "initial" flag. Both flags come from the current user's "metadata" view
    settings; a provider without a stored setting counts as active.
    """
    user_settings = current_user.view_settings.get("metadata", {})

    def describe(source):
        # Build the JSON entry for a single provider instance.
        enabled = user_settings.get(source.__id__, True)
        return {
            "name": source.__name__,
            "active": enabled,
            "initial": enabled,
            "id": source.__id__,
        }

    payload = json.dumps([describe(source) for source in cl])
    return Response(payload, mimetype="application/json")
|
||||||
|
|
||||||
@meta.route("/metadata/provider", methods=['POST'])
|
|
||||||
@meta.route("/metadata/provider/<prov_name>", methods=['POST'])
|
@meta.route("/metadata/provider", methods=["POST"])
|
||||||
|
@meta.route("/metadata/provider/<prov_name>", methods=["POST"])
|
||||||
@login_required
|
@login_required
|
||||||
def metadata_change_active_provider(prov_name):
|
def metadata_change_active_provider(prov_name):
|
||||||
new_state = request.get_json()
|
new_state = request.get_json()
|
||||||
active = current_user.view_settings.get('metadata', {})
|
active = current_user.view_settings.get("metadata", {})
|
||||||
active[new_state['id']] = new_state['value']
|
active[new_state["id"]] = new_state["value"]
|
||||||
current_user.view_settings['metadata'] = active
|
current_user.view_settings["metadata"] = active
|
||||||
try:
|
try:
|
||||||
try:
|
try:
|
||||||
flag_modified(current_user, "view_settings")
|
flag_modified(current_user, "view_settings")
|
||||||
|
@ -91,27 +101,26 @@ def metadata_change_active_provider(prov_name):
|
||||||
if "initial" in new_state and prov_name:
|
if "initial" in new_state and prov_name:
|
||||||
for c in cl:
|
for c in cl:
|
||||||
if c.__id__ == prov_name:
|
if c.__id__ == prov_name:
|
||||||
data = c.search(new_state.get('query', ""))
|
data = c.search(new_state.get("query", ""))
|
||||||
break
|
break
|
||||||
return Response(json.dumps(data), mimetype='application/json')
|
return Response(json.dumps(data), mimetype="application/json")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
@meta.route("/metadata/search", methods=["POST"])
@login_required
def metadata_search():
    """
    Search all active metadata providers and return the merged hits as JSON.

    The search term is read from the "query" form field. Providers the
    current user has not deactivated are queried concurrently. A provider
    whose search raises is logged and skipped, so the results of the
    remaining providers are still returned. Without a query the response is
    an empty JSON list.
    """
    query = request.form.to_dict().get("query")
    results = []
    active = current_user.view_settings.get("metadata", {})
    if query:
        static_cover = url_for("static", filename="generic_cover.jpg")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            # Deliberately not called "meta": that name is the module-level
            # blueprint and should not be shadowed here.
            pending = {
                executor.submit(provider.search, query, static_cover): provider
                for provider in cl
                if active.get(provider.__id__, True)
            }
            for future in concurrent.futures.as_completed(pending):
                try:
                    # result() re-raises whatever the provider raised; a
                    # provider returning None is treated as "no hits".
                    results.extend(future.result() or [])
                except Exception as ex:
                    log.error(
                        "Metadata search failed for provider {}: {}".format(
                            pending[future].__id__, ex
                        )
                    )
    return Response(json.dumps(results), mimetype="application/json")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,8 @@
|
||||||
# You should have received a copy of the GNU General Public License
|
# You should have received a copy of the GNU General Public License
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
import abc
|
import abc
|
||||||
from typing import Dict, List, Optional, TypedDict, Union
|
import re
|
||||||
|
from typing import Dict, Generator, List, Optional, TypedDict, Union
|
||||||
|
|
||||||
|
|
||||||
class Metadata:
|
class Metadata:
|
||||||
|
@ -30,9 +31,49 @@ class Metadata:
|
||||||
self.active = state
|
self.active = state
|
||||||
|
|
||||||
@abc.abstractmethod
def search(self, query: str, generic_cover: str = ""):
    """
    Look up ``query`` with this metadata provider.

    The base implementation does nothing and returns None; provider
    classes are expected to supply their own version.

    :param query: search phrase
    :param generic_cover: fallback cover value; the search route passes the
        URL of the static "generic_cover.jpg" image here
    """
    return None
|
||||||
|
|
||||||
|
@staticmethod
def get_title_tokens(
    title: str, strip_joiners: bool = True
) -> Generator[str, None, None]:
    """
    Yield the search-relevant words of a book title.

    Adapted from the calibre source code. Bracketed edition notes and
    punctuation are removed, the remainder is split on whitespace and each
    surviving word is yielded in its original order.

    :param title: raw title text
    :param strip_joiners: when true, skip the words "a", "and", "the" and
        "&" (compared case-insensitively)
    :return: generator over the remaining tokens
    """
    # Order matters: bracketed notes have to be removed before the last rule
    # turns the brackets themselves into spaces.
    substitutions = [
        (re.compile(pattern, re.IGNORECASE), replacement)
        for pattern, replacement in (
            # Remove things like: (2010) (Omnibus) etc.
            (
                r"[({\[](\d{4}|omnibus|anthology|hardcover|"
                r"audiobook|audio\scd|paperback|turtleback|"
                r"mass\s*market|edition|ed\.)[\])}]",
                "",
            ),
            # Remove any bracketed text that mentions "edition" or "ed.".
            # The dot is escaped so that ordinary words containing "ed"
            # (e.g. "(Red Book)") are left alone.
            (r"[({\[].*?(edition|ed\.).*?[\]})]", ""),
            # Remove commas used as separators in numbers
            (r"(\d+),(\d+)", r"\1\2"),
            # Remove hyphens only if they have whitespace before them
            (r"(\s-)", " "),
            # Replace other special chars with a space. The CJK brackets and
            # curly quotes sit inside the character class; outside it the
            # pattern would demand them as a literal suffix and never match.
            (r"""[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]""", " "),
        )
    ]

    for regex, replacement in substitutions:
        title = regex.sub(replacement, title)

    for token in title.split():
        token = token.strip().strip('"').strip("'")
        if not token:
            continue
        if strip_joiners and token.lower() in ("a", "and", "the", "&"):
            continue
        yield token
|
||||||
|
|
||||||
|
|
||||||
class MetaSourceInfo(TypedDict):
|
class MetaSourceInfo(TypedDict):
|
||||||
id: str
|
id: str
|
||||||
|
|
Loading…
Reference in New Issue
Block a user