refactor and cleaning
parent 920acaca99
commit d55626d445
@@ -26,7 +26,7 @@ class ComicVine(Metadata):
__name__ = "ComicVine"
__id__ = "comicvine"

def search(self, query, __):
def search(self, query, generic_cover=""):
val = list()
apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
if self.active:
@@ -52,7 +52,7 @@ class ComicVine(Metadata):
v['tags'] = ["Comics", seriesTitle]
v['rating'] = 0
v['series'] = seriesTitle
v['cover'] = r['image'].get('original_url')
v['cover'] = r['image'].get('original_url', generic_cover)
v['source'] = {
"id": self.__id__,
"description": "ComicVine Books",
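Illustrative note (not part of the commit): the providers now take a `generic_cover` argument in `search()` and fall back to it when the upstream record has no image. A minimal, self-contained sketch of that fallback pattern, using made-up record data:

```python
def pick_cover(record: dict, generic_cover: str = "") -> str:
    # Mirrors the change above: r['image'].get('original_url', generic_cover)
    image = record.get("image") or {}
    return image.get("original_url", generic_cover)


print(pick_cover({"image": {"original_url": "https://example.org/cover.jpg"}},
                 "/static/generic_cover.jpg"))   # -> the real cover URL
print(pick_cover({"image": {}}, "/static/generic_cover.jpg"))  # -> the generic cover
```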
@@ -17,19 +17,20 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Google Books api document: https://developers.google.com/books/docs/v1/using

import requests

from cps.services.Metadata import Metadata


class Google(Metadata):
__name__ = "Google"
__id__ = "google"
BASE_URL = "https://www.googleapis.com/books/v1/volumes?q="

def search(self, query, __):
def search(self, query, generic_cover=""):
if self.active:
val = list()
result = requests.get("https://www.googleapis.com/books/v1/volumes?q="+query.replace(" ","+"))
result = requests.get(Google.BASE_URL + query.replace(" ","+"))
for r in result.json()['items']:
v = dict()
v['id'] = r['id']
@@ -43,7 +44,8 @@ class Google(Metadata):
if r['volumeInfo'].get('imageLinks'):
v['cover'] = r['volumeInfo']['imageLinks']['thumbnail'].replace("http://", "https://")
else:
v['cover'] = "/../../../static/generic_cover.jpg"
# v['cover'] = "/../../../static/generic_cover.jpg"
v['cover'] = generic_cover
v['source'] = {
"id": self.__id__,
"description": "Google Books",
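Illustrative note (not part of the commit): hoisting the endpoint into `Google.BASE_URL` leaves the request unchanged. A quick standalone check of the query construction, with a sample query string:

```python
BASE_URL = "https://www.googleapis.com/books/v1/volumes?q="  # copied from the hunk above

query = "brave new world"
# The refactored call builds the same URL the hard-coded string did before.
assert BASE_URL + query.replace(" ", "+") == (
    "https://www.googleapis.com/books/v1/volumes?q=brave+new+world"
)
```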
@@ -15,47 +15,47 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import datetime
import json
import re
from typing import Dict, List
from typing import Dict, Generator, List, Optional, Tuple, Union
from urllib.parse import quote

import requests
from cps.services.Metadata import Metadata
from lxml.html import fromstring, tostring
from dateutil import parser
from html2text import HTML2Text
from lxml.html import HtmlElement, fromstring, tostring
from markdown2 import Markdown

from cps.services.Metadata import MetaRecord, Metadata

SYMBOLS_TO_TRANSLATE = (
"öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
"oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ",
)
SYMBOL_TRANSLATION_MAP = dict(
[(ord(a), ord(b)) for (a, b) in zip(*SYMBOLS_TO_TRANSLATE)]
)


def get_int_or_float(v):
number_as_float = float(v)
def get_int_or_float(value: str) -> Union[int, float]:
number_as_float = float(value)
number_as_int = int(number_as_float)
return number_as_int if number_as_float == number_as_int else number_as_float


def strip_accents(s):
if s is None:
return s
else:
symbols = (
"öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
"oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ",
)
tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)])
return s.translate(tr) # .lower()
def strip_accents(s: Optional[str]) -> Optional[str]:
return s.translate(SYMBOL_TRANSLATION_MAP) if s is not None else s


def sanitize_comments_html(html):
from markdown2 import Markdown

def sanitize_comments_html(html: str) -> str:
text = html2text(html)
md = Markdown()
html = md.convert(text)
return html


def html2text(html):
from html2text import HTML2Text
import re

def html2text(html: str) -> str:
# replace <u> tags with <span> as <u> becomes emphasis in html2text
if isinstance(html, bytes):
html = html.decode("utf-8")
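Illustrative note (not part of the commit): the refactor builds the translation table once at module level (`SYMBOL_TRANSLATION_MAP`) instead of rebuilding it on every `strip_accents()` call, and `get_int_or_float()` keeps whole volume numbers as ints. A standalone sanity check that copies the two new helpers verbatim from the hunk above:

```python
from typing import Optional, Union

# Copied verbatim from the hunk above so the check runs on its own.
SYMBOLS_TO_TRANSLATE = (
    "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
    "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ",
)
SYMBOL_TRANSLATION_MAP = dict(
    [(ord(a), ord(b)) for (a, b) in zip(*SYMBOLS_TO_TRANSLATE)]
)


def get_int_or_float(value: str) -> Union[int, float]:
    number_as_float = float(value)
    number_as_int = int(number_as_float)
    return number_as_int if number_as_float == number_as_int else number_as_float


def strip_accents(s: Optional[str]) -> Optional[str]:
    return s.translate(SYMBOL_TRANSLATION_MAP) if s is not None else s


assert get_int_or_float("3") == 3        # whole "tom" numbers stay ints
assert get_int_or_float("3.5") == 3.5    # half-volumes stay floats
assert strip_accents("żółw") == "zolw"   # Polish diacritics are flattened
```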
@@ -92,26 +92,36 @@ class LubimyCzytac(Metadata):
PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()"
LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()"
DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']"
SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]"
SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]/text()"

DETAILS = "//div[@id='book-details']"
PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania"
FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()"
FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()"
TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()"

RATING = "//meta[@property='books:rating:value']/@content"
COVER = "//meta[@property='og:image']/@content"
ISBN = "//meta[@property='books:isbn']/@content"
META_TITLE = "//meta[@property='og:description']/@content"

SUMMARY = "//script[@type='application/ld+json']//text()"

def search(self, query, __):
def search(self, query: str, generic_cover: str = "") -> Optional[List]:
if self.active:
result = requests.get(self._prepare_query(title=query))
root = fromstring(result.text)
matches = self._parse_search_results(root=root)
lc_parser = LubimyCzytacParser(root=root, metadata=self)
matches = lc_parser.parse_search_results()
if matches:
for ind, match in enumerate(matches):
matches[ind] = self._parse_single_book(match=match)
final_matches = []
for match in matches:
response = requests.get(match.get("url"))
match = lc_parser.parse_single_book(
match=match, response=response, generic_cover=generic_cover
)
final_matches.append(match)
return final_matches
return matches

def _prepare_query(self, title: str) -> str:
@@ -128,9 +138,7 @@ class LubimyCzytac(Metadata):
token for token in title.lower().split(" ") if len(token) > 1
]
else:
title_tokens = list(
self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)
)
title_tokens = list(self.get_title_tokens(title, strip_joiners=False))
if title_tokens:
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
query = query + "%20".join(tokens)
@@ -138,215 +146,21 @@ class LubimyCzytac(Metadata):
return ""
return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"

def _parse_search_results(self, root) -> List[Dict]:
matches = []
results = root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
for result in results:
title = result.xpath(
f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.TITLE_TEXT_PATH}"
)
book_url = result.xpath(
f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.URL_PATH}"
)
authors = result.xpath(
f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.AUTHORS_PATH}"
)

if not title or not book_url or not authors:
continue
title = title[0].strip()
book_url = LubimyCzytac.BASE_URL + book_url[0]
book_id = book_url.replace(f"{LubimyCzytac.BASE_URL}/ksiazka/", "").split(
"/"
)[0]
matches.append(
{"id": book_id, "title": title, "authors": authors, "url": book_url}
)
return matches

def _parse_single_book(self, match: Dict) -> Dict:
url = match.get("url")
result = requests.get(url)
root = fromstring(result.text)
match["series"], match["series_index"] = self._parse_series(root=root)
match["tags"] = self._parse_tags(root=root)
match["publisher"] = self._parse_publisher(root=root)
match["publishedDate"] = self._parse_from_summary(
root=root, attribute_name="datePublished"
)
match["rating"] = self._parse_rating(root=root)
match["description"] = self._parse_description(root=root)
match["cover"] = self._parse_cover(root=root)
match["source"] = {
"id": self.__id__,
"description": self.__name__,
"link": LubimyCzytac.BASE_URL,
}
match['languages'] = self._parse_languages(root=root)
match["identifiers"] = {
"isbn": self._parse_isbn(root=root),
"lubimyczytac": match["id"],
}
return match

def _parse_cover(self, root):
imgcol_node = root.xpath('//meta[@property="og:image"]/@content')
if imgcol_node:
img_url = imgcol_node[0]
return img_url

def _parse_publisher(self, root):
publisher = root.xpath(LubimyCzytac.PUBLISHER)
if publisher:
return publisher[0]
else:
return None

def _parse_languages(self, root):
lang = root.xpath(LubimyCzytac.LANGUAGES)
languages = list()
if lang:
lang = lang[0].strip()
if "polski" in lang:
languages.append("Polish")
if "angielski" in lang:
languages.append("English")
if not languages:
return ['Polish']
return languages

def _parse_series(self, root):
try:
series_node = root.xpath(LubimyCzytac.SERIES)
if series_node:
series_lst = root.xpath(f"{LubimyCzytac.SERIES}/text()")
if series_lst:
series_txt = series_lst
else:
series_txt = None
else:
return (None, None)

if series_txt:
ser_string = [series_txt[0].replace("\n", "").strip()]
ser_nazwa = ser_string
for ser in ser_string:
if "tom " in ser:
ser_info = ser.split(" (tom ", 1)
ser_nazwa = ser.split(" (tom ")[0]
break

if ser_info:
series_index_unicode = ser_info[1]
series_index_string = str(
series_index_unicode.replace(" ", "").replace(")", "")
)
# Sprawdzamy, czy cykl nie jest kompletem/pakietem tomów, np. 1-3
if "-" in series_index_string:
series_index_string_temp = series_index_string.split("-", 1)
series_index_string = series_index_string_temp[0]
if series_index_string.replace(".", "").isdigit() is True:
series_index = get_int_or_float(series_index_string)
else:
series_index = 0
else:
series_index = 0
series = ser_nazwa
return (series, series_index)
except:
return (None, None)

def _parse_tags(self, root):
tags = None
try:
tags_from_genre = root.xpath(LubimyCzytac.TAGS)
if tags_from_genre:
tags = tags_from_genre
tags = [w.replace(", itd.", " itd.") for w in tags]
return tags
else:
return None
except:
return tags

def _parse_from_summary(self, root, attribute_name: str) -> str:
data = json.loads(root.xpath(LubimyCzytac.SUMMARY)[0])
value = data.get(attribute_name)
return value.strip() if value is not None else value

def _parse_rating(self, root):
rating_node = root.xpath(LubimyCzytac.RATING)
if rating_node:
rating_value = round(float((rating_node[0]).replace(",", ".")) / 2)
return rating_value
return None

def _parse_date(self, root, xpath="first_publish"):
options = {
"first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
"first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
}
path = options.get(xpath)
from dateutil import parser

data = root.xpath(path)
if data:
first_pub_date = data[0].strip()
return parser.parse(first_pub_date)
return None

def _parse_isbn(self, root):
isbn_node = root.xpath('//meta[@property="books:isbn"]/@content')[0]
return isbn_node

def _parse_description(self, root):
comments = ""
description_node = root.xpath(LubimyCzytac.DESCRIPTION)
if description_node:
for zrodla in root.xpath('//p[@class="source"]'):
zrodla.getparent().remove(zrodla)
comments = tostring(description_node[0], method="html")
comments = sanitize_comments_html(comments)

else:
# try <meta>
description_node = root.xpath('//meta[@property="og:description"]/@content')
if description_node:
comments = description_node[0]
comments = sanitize_comments_html(comments)

pages = self._parse_from_summary(root=root, attribute_name="numberOfPages")
if pages:
comments += f'<p id="strony">Książka ma {pages} stron(y).</p>'

first_publish_date = self._parse_date(root=root)
if first_publish_date:
comments += f'<p id="pierwsze_wydanie">Data pierwszego wydania: {first_publish_date.strftime("%d.%m.%Y")}</p>'

first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl")
if first_publish_date_pl:
comments += f'<p id="pierwsze_wydanie_pl">Data pierwszego wydania w Polsce: {first_publish_date_pl.strftime("%d.%m.%Y")}</p>'

return comments

def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
@staticmethod
def get_title_tokens(
title: str, strip_joiners: bool = True
) -> Generator[str, None, None]:
"""
Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py.
Taken from calibre source code
"""
# strip sub-titles
if strip_subtitle:
subtitle = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)")
if len(subtitle.sub("", title)) > 1:
title = subtitle.sub("", title)

title_patterns = [
(re.compile(pat, re.IGNORECASE), repl)
for pat, repl in [
# Remove things like: (2010) (Omnibus) etc.
(
r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]",
r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|"
r"audiobook|audio\scd|paperback|turtleback|"
r"mass\s*market|edition|ed\.)[\])}]",
"",
),
# Remove any strings that contain the substring edition inside
@@ -371,3 +185,193 @@ class LubimyCzytac(Metadata):
not strip_joiners or token.lower() not in ("a", "and", "the", "&")
):
yield token


class LubimyCzytacParser:
PAGES_TEMPLATE = "<p id='strony'>Książka ma {0} stron(y).</p>"
PUBLISH_DATE_TEMPLATE = "<p id='pierwsze_wydanie'>Data pierwszego wydania: {0}</p>"
PUBLISH_DATE_PL_TEMPLATE = (
"<p id='pierwsze_wydanie'>Data pierwszego wydania w Polsce: {0}</p>"
)

def __init__(self, root: HtmlElement, metadata: Metadata) -> None:
self.root = root
self.metadata = metadata

def parse_search_results(self) -> List[Dict]:
matches = []
results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
for result in results:
title = self._parse_xpath_node(
root=result,
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.TITLE_TEXT_PATH}",
)

book_url = self._parse_xpath_node(
root=result,
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.URL_PATH}",
)
authors = self._parse_xpath_node(
root=result,
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
f"{LubimyCzytac.AUTHORS_PATH}",
take_first=False,
)
if not all([title, book_url, authors]):
continue
matches.append(
{
"id": book_url.replace(f"/ksiazka/", "").split("/")[0],
"title": title,
"authors": [strip_accents(author) for author in authors],
"url": LubimyCzytac.BASE_URL + book_url,
}
)
return matches

def parse_single_book(
self, match: Dict, response, generic_cover: str
) -> MetaRecord:
self.root = fromstring(response.text)
match["series"], match["series_index"] = self._parse_series()
match["tags"] = self._parse_tags()
match["publisher"] = self._parse_publisher()
match["publishedDate"] = self._parse_from_summary(
attribute_name="datePublished"
)
match["rating"] = self._parse_rating()
match["description"] = self._parse_description()
match["cover"] = self._parse_cover(generic_cover=generic_cover)
match["source"] = {
"id": self.metadata.__id__,
"description": self.metadata.__name__,
"link": LubimyCzytac.BASE_URL,
}
match["languages"] = self._parse_languages()
match["identifiers"] = {
"isbn": self._parse_isbn(),
"lubimyczytac": match["id"],
}
return match

def _parse_xpath_node(
self,
xpath: str,
root: HtmlElement = None,
take_first: bool = True,
strip_element: bool = True,
) -> Optional[Union[str, List[str]]]:
root = root if root is not None else self.root
node = root.xpath(xpath)
if not node:
return None
return (
(node[0].strip() if strip_element else node[0])
if take_first
else [x.strip() for x in node]
)

def _parse_cover(self, generic_cover) -> Optional[str]:
return (
self._parse_xpath_node(xpath=LubimyCzytac.COVER, take_first=True)
or generic_cover
)

def _parse_publisher(self) -> Optional[str]:
return self._parse_xpath_node(xpath=LubimyCzytac.PUBLISHER, take_first=True)

def _parse_languages(self) -> List[str]:
languages = list()
lang = self._parse_xpath_node(xpath=LubimyCzytac.LANGUAGES, take_first=True)
if lang:
if "polski" in lang:
languages.append("Polish")
if "angielski" in lang:
languages.append("English")
return languages

def _parse_series(self) -> Tuple[Optional[str], Optional[Union[float, int]]]:
series_index = 0
series = self._parse_xpath_node(xpath=LubimyCzytac.SERIES, take_first=True)
if series:
if "tom " in series:
series_name, series_info = series.split(" (tom ", 1)
series_info = series_info.replace(" ", "").replace(")", "")
# Check if book is not a bundle, i.e. chapter 1-3
if "-" in series_info:
series_info = series_info.split("-", 1)[0]
if series_info.replace(".", "").isdigit() is True:
series_index = get_int_or_float(series_info)
return series_name, series_index
return None, None

def _parse_tags(self) -> List[str]:
tags = self._parse_xpath_node(xpath=LubimyCzytac.TAGS, take_first=False)
return [
strip_accents(w.replace(", itd.", " itd."))
for w in tags
if isinstance(w, str)
]

def _parse_from_summary(self, attribute_name: str) -> Optional[str]:
value = None
summary_text = self._parse_xpath_node(xpath=LubimyCzytac.SUMMARY)
if summary_text:
data = json.loads(summary_text)
value = data.get(attribute_name)
return value.strip() if value is not None else value

def _parse_rating(self) -> Optional[str]:
rating = self._parse_xpath_node(xpath=LubimyCzytac.RATING)
return round(float(rating.replace(",", ".")) / 2) if rating else rating

def _parse_date(self, xpath="first_publish") -> Optional[datetime.datetime]:
options = {
"first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
"first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
}
date = self._parse_xpath_node(xpath=options.get(xpath))
return parser.parse(date) if date else None

def _parse_isbn(self) -> Optional[str]:
return self._parse_xpath_node(xpath=LubimyCzytac.ISBN)

def _parse_description(self) -> str:
description = ""
description_node = self._parse_xpath_node(
xpath=LubimyCzytac.DESCRIPTION, strip_element=False
)
if description_node is not None:
for source in self.root.xpath('//p[@class="source"]'):
source.getparent().remove(source)
description = tostring(description_node, method="html")
description = sanitize_comments_html(description)

else:
description_node = self._parse_xpath_node(xpath=LubimyCzytac.META_TITLE)
if description_node is not None:
description = description_node
description = sanitize_comments_html(description)

description = self._add_extra_info_to_description(description=description)
return description

def _add_extra_info_to_description(self, description: str) -> str:
pages = self._parse_from_summary(attribute_name="numberOfPages")
if pages:
description += LubimyCzytacParser.PAGES_TEMPLATE.format(pages)

first_publish_date = self._parse_date()
if first_publish_date:
description += LubimyCzytacParser.PUBLISH_DATE_TEMPLATE.format(
first_publish_date.strftime("%d.%m.%Y")
)

first_publish_date_pl = self._parse_date(xpath="first_publish_pl")
if first_publish_date_pl:
description += LubimyCzytacParser.PUBLISH_DATE_PL_TEMPLATE.format(
first_publish_date_pl.strftime("%d.%m.%Y")
)

return description
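Illustrative note (not part of the commit): the new parser funnels every lookup through `_parse_xpath_node()`, which returns either the first stripped match or the whole stripped list, so each `_parse_*` method collapses to a single call. A small standalone sketch of that idea against hypothetical markup (the real method also keeps the root on `self` and accepts `strip_element`):

```python
from lxml.html import fromstring

html = fromstring(
    "<div><a class='author'> A. Sapkowski </a>"
    "<a class='author'> J. Brzechwa </a></div>"
)


def parse_xpath_node(root, xpath, take_first=True):
    # Simplified stand-in for LubimyCzytacParser._parse_xpath_node
    node = root.xpath(xpath)
    if not node:
        return None
    return node[0].strip() if take_first else [x.strip() for x in node]


print(parse_xpath_node(html, "//a[@class='author']/text()"))                    # 'A. Sapkowski'
print(parse_xpath_node(html, "//a[@class='author']/text()", take_first=False))  # both names
print(parse_xpath_node(html, "//span/text()"))                                  # None
```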
@@ -15,13 +15,44 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import abc
from typing import Dict, List, Optional, TypedDict, Union


class Metadata():
class Metadata:
__name__ = "Generic"
__id__ = "generic"

def __init__(self):
self.active = True

def set_status(self, state):
self.active = state

@abc.abstractmethod
def search(self, query: str, generic_cover: str):
pass


class MetaSourceInfo(TypedDict):
id: str
description: str
link: str


class MetaRecord(TypedDict):
id: Union[str, int]
title: str
authors: List[str]
url: str
cover: str
series: Optional[str]
series_index: Optional[Union[int, float]]
tags: Optional[List[str]]
publisher: Optional[str]
publishedDate: Optional[str]
rating: Optional[int]
description: Optional[str]
source: MetaSourceInfo
languages: Optional[List[str]]
identifiers: Dict[str, Union[str, int]]
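Illustrative note (not part of the commit): `MetaSourceInfo` and `MetaRecord` spell out the dict shape every provider's `search(query, generic_cover)` is expected to return. A minimal provider sketch against that interface; it assumes it runs inside the calibre-web tree so the import from the diff resolves, and all field values below are made up:

```python
from cps.services.Metadata import MetaRecord, Metadata


class Dummy(Metadata):
    __name__ = "Dummy"
    __id__ = "dummy"

    def search(self, query: str, generic_cover: str = ""):
        if not self.active:
            return []
        record: MetaRecord = {
            "id": "1",
            "title": query.title(),
            "authors": ["Unknown"],
            "url": "",
            "cover": generic_cover,  # fall back to the caller-supplied cover
            "series": None,
            "series_index": None,
            "tags": None,
            "publisher": None,
            "publishedDate": None,
            "rating": None,
            "description": None,
            "source": {"id": self.__id__, "description": self.__name__, "link": ""},
            "languages": None,
            "identifiers": {"dummy": "1"},
        }
        return [record]


# Example call: Dummy().search("the hobbit", "/static/generic_cover.jpg")
```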
@@ -16,3 +16,4 @@ lxml>=3.8.0,<4.7.0
flask-wtf>=0.14.2,<1.1.0
markdown2==2.4.2
html2text==2020.1.16
python-dateutil==2.8.2