run lubimyczytac detail pages in threadpool

2021-12-13 02:14:53 +01:00 · 2021-12-13 02:14:53 +01:00 · 362fdc5716
commit 362fdc5716
parent d55626d445
3 changed files with 99 additions and 87 deletions
--- a/cps/metadata_provider/lubimyczytac.py
+++ b/cps/metadata_provider/lubimyczytac.py
@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-
 #  This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
 #    Copyright (C) 2021 OzzieIsaacs
 #
@ -18,7 +17,8 @@
 import datetime
 import json
 import re
-from typing import Dict, Generator, List, Optional, Tuple, Union
+from multiprocessing.pool import ThreadPool
+from typing import Dict, List, Optional, Tuple, Union
 from urllib.parse import quote

 import requests
@ -114,13 +114,14 @@ class LubimyCzytac(Metadata):
            lc_parser = LubimyCzytacParser(root=root, metadata=self)
            matches = lc_parser.parse_search_results()
            if matches:
-                final_matches = []
-                for match in matches:
-                    response = requests.get(match.get("url"))
-                    match = lc_parser.parse_single_book(
-                        match=match, response=response, generic_cover=generic_cover
+                with ThreadPool(processes=10) as pool:
+                    final_matches = pool.starmap(
+                        lc_parser.parse_single_book,
+                        [
+                            (match, generic_cover)
+                            for match in matches
+                        ],
                    )
-                    final_matches.append(match)
                return final_matches
            return matches

@ -146,46 +147,6 @@ class LubimyCzytac(Metadata):
            return ""
        return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"

-    @staticmethod
-    def get_title_tokens(
-        title: str, strip_joiners: bool = True
-    ) -> Generator[str, None, None]:
-        """
-        Taken from calibre source code
-        """
-        title_patterns = [
-            (re.compile(pat, re.IGNORECASE), repl)
-            for pat, repl in [
-                # Remove things like: (2010) (Omnibus) etc.
-                (
-                    r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|"
-                    r"audiobook|audio\scd|paperback|turtleback|"
-                    r"mass\s*market|edition|ed\.)[\])}]",
-                    "",
-                ),
-                # Remove any strings that contain the substring edition inside
-                # parentheses
-                (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""),
-                # Remove commas used a separators in numbers
-                (r"(\d+),(\d+)", r"\1\2"),
-                # Remove hyphens only if they have whitespace before them
-                (r"(\s-)", " "),
-                # Replace other special chars with a space
-                (r"""[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”""", " "),
-            ]
-        ]
-
-        for pat, repl in title_patterns:
-            title = pat.sub(repl, title)
-
-        tokens = title.split()
-        for token in tokens:
-            token = token.strip().strip('"').strip("'")
-            if token and (
-                not strip_joiners or token.lower() not in ("a", "and", "the", "&")
-            ):
-                yield token
-

 class LubimyCzytacParser:
    PAGES_TEMPLATE = "<p id='strony'>Książka ma {0} stron(y).</p>"
@ -232,8 +193,9 @@ class LubimyCzytacParser:
        return matches

    def parse_single_book(
-        self, match: Dict, response, generic_cover: str
+        self, match: Dict, generic_cover: str
    ) -> MetaRecord:
+        response = requests.get(match.get("url"))
        self.root = fromstring(response.text)
        match["series"], match["series_index"] = self._parse_series()
        match["tags"] = self._parse_tags()
--- a/cps/search_metadata.py
+++ b/cps/search_metadata.py
@ -16,25 +16,23 @@
 #  You should have received a copy of the GNU General Public License
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.

-import os
-import json
-import importlib
-import sys
-import inspect
-import datetime
 import concurrent.futures
+import importlib
+import inspect
+import json
+import os
+import sys

-from flask import Blueprint, request, Response, url_for
+from flask import Blueprint, Response, request, url_for
 from flask_login import current_user
 from flask_login import login_required
+from sqlalchemy.exc import InvalidRequestError, OperationalError
 from sqlalchemy.orm.attributes import flag_modified
-from sqlalchemy.exc import OperationalError, InvalidRequestError

-from . import constants, logger, ub
 from cps.services.Metadata import Metadata
+from . import constants, logger, ub

-
-meta = Blueprint('metadata', __name__)
+meta = Blueprint("metadata", __name__)

 log = logger.create()

@ -42,7 +40,7 @@ new_list = list()
 meta_dir = os.path.join(constants.BASE_DIR, "cps", "metadata_provider")
 modules = os.listdir(os.path.join(constants.BASE_DIR, "cps", "metadata_provider"))
 for f in modules:
-    if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith('__init__.py'):
+    if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith("__init__.py"):
        a = os.path.basename(f)[:-3]
        try:
            importlib.import_module("cps.metadata_provider." + a)
@ -51,34 +49,46 @@ for f in modules:
            log.error("Import error for metadata source: {}".format(a))
            pass

+
 def list_classes(provider_list):
    classes = list()
    for element in provider_list:
-        for name, obj in inspect.getmembers(sys.modules["cps.metadata_provider." + element]):
-            if inspect.isclass(obj) and name != "Metadata" and issubclass(obj, Metadata):
+        for name, obj in inspect.getmembers(
+            sys.modules["cps.metadata_provider." + element]
+        ):
+            if (
+                inspect.isclass(obj)
+                and name != "Metadata"
+                and issubclass(obj, Metadata)
+            ):
                classes.append(obj())
    return classes

+
 cl = list_classes(new_list)

+
@meta.route("/metadata/provider")
@login_required
 def metadata_provider():
-    active = current_user.view_settings.get('metadata', {})
+    active = current_user.view_settings.get("metadata", {})
    provider = list()
    for c in cl:
        ac = active.get(c.__id__, True)
-        provider.append({"name": c.__name__, "active": ac, "initial": ac, "id": c.__id__})
-    return Response(json.dumps(provider), mimetype='application/json')
+        provider.append(
+            {"name": c.__name__, "active": ac, "initial": ac, "id": c.__id__}
+        )
+    return Response(json.dumps(provider), mimetype="application/json")

-@meta.route("/metadata/provider", methods=['POST'])
-@meta.route("/metadata/provider/<prov_name>", methods=['POST'])
+
+@meta.route("/metadata/provider", methods=["POST"])
+@meta.route("/metadata/provider/<prov_name>", methods=["POST"])
@login_required
 def metadata_change_active_provider(prov_name):
    new_state = request.get_json()
-    active = current_user.view_settings.get('metadata', {})
-    active[new_state['id']] = new_state['value']
-    current_user.view_settings['metadata'] = active
+    active = current_user.view_settings.get("metadata", {})
+    active[new_state["id"]] = new_state["value"]
+    current_user.view_settings["metadata"] = active
    try:
        try:
            flag_modified(current_user, "view_settings")
@ -91,27 +101,26 @@ def metadata_change_active_provider(prov_name):
    if "initial" in new_state and prov_name:
        for c in cl:
            if c.__id__ == prov_name:
-                data = c.search(new_state.get('query', ""))
+                data = c.search(new_state.get("query", ""))
                break
-        return Response(json.dumps(data), mimetype='application/json')
+        return Response(json.dumps(data), mimetype="application/json")
    return ""

-@meta.route("/metadata/search", methods=['POST'])
+
+@meta.route("/metadata/search", methods=["POST"])
@login_required
 def metadata_search():
-    query = request.form.to_dict().get('query')
+    query = request.form.to_dict().get("query")
    data = list()
-    active = current_user.view_settings.get('metadata', {})
+    active = current_user.view_settings.get("metadata", {})
    if query:
-        static_cover = url_for('static', filename='generic_cover.jpg')
+        static_cover = url_for("static", filename="generic_cover.jpg")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
-            meta = {executor.submit(c.search, query, static_cover): c for c in cl if active.get(c.__id__, True)}
+            meta = {
+                executor.submit(c.search, query, static_cover): c
+                for c in cl
+                if active.get(c.__id__, True)
+            }
            for future in concurrent.futures.as_completed(meta):
                data.extend(future.result())
-    return Response(json.dumps(data), mimetype='application/json')
-
-
-
-
-
-
+    return Response(json.dumps(data), mimetype="application/json")
--- a/cps/services/Metadata.py
+++ b/cps/services/Metadata.py
@ -16,7 +16,8 @@
 #  You should have received a copy of the GNU General Public License
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.
 import abc
-from typing import Dict, List, Optional, TypedDict, Union
+import re
+from typing import Dict, Generator, List, Optional, TypedDict, Union


 class Metadata:
@ -30,9 +31,49 @@ class Metadata:
        self.active = state

    @abc.abstractmethod
-    def search(self, query: str, generic_cover: str):
+    def search(self, query: str, generic_cover: str = ""):
        pass

+    @staticmethod
+    def get_title_tokens(
+        title: str, strip_joiners: bool = True
+    ) -> Generator[str, None, None]:
+        """
+        Yield cleaned-up tokens of ``title`` (adapted from calibre source code).
+        """
+        title_patterns = [
+            (re.compile(pat, re.IGNORECASE), repl)
+            for pat, repl in [
+                # Remove things like: (2010) (Omnibus) etc.
+                (
+                    r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|"
+                    r"audiobook|audio\scd|paperback|turtleback|"
+                    r"mass\s*market|edition|ed\.)[\])}]",
+                    "",
+                ),
+                # Remove any strings that contain the substring edition inside
+                # parentheses
+                (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""),
+                # Remove commas used as separators in numbers
+                (r"(\d+),(\d+)", r"\1\2"),
+                # Remove hyphens only if they have whitespace before them
+                (r"(\s-)", " "),
+                # Replace other special chars (CJK quotes included) with a space
+                (r"""[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]""", " "),
+            ]
+        ]
+
+        for pat, repl in title_patterns:
+            title = pat.sub(repl, title)
+
+        tokens = title.split()
+        for token in tokens:
+            token = token.strip().strip('"').strip("'")
+            if token and (
+                not strip_joiners or token.lower() not in ("a", "and", "the", "&")
+            ):
+                yield token
+

 class MetaSourceInfo(TypedDict):
    id: str