as becomes emphasis in html2text
if isinstance(html, bytes):
html = html.decode("utf-8")
@@ -92,26 +92,36 @@ class LubimyCzytac(Metadata):
PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()"
LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()"
DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']"
- SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]"
+ SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]/text()"
DETAILS = "//div[@id='book-details']"
PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania"
FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()"
FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()"
TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()"
+
RATING = "//meta[@property='books:rating:value']/@content"
COVER = "//meta[@property='og:image']/@content"
+ ISBN = "//meta[@property='books:isbn']/@content"
+ META_TITLE = "//meta[@property='og:description']/@content"
SUMMARY = "//script[@type='application/ld+json']//text()"
- def search(self, query, __):
+ def search(self, query: str, generic_cover: str = "") -> Optional[List]:
if self.active:
result = requests.get(self._prepare_query(title=query))
root = fromstring(result.text)
- matches = self._parse_search_results(root=root)
+ lc_parser = LubimyCzytacParser(root=root, metadata=self)
+ matches = lc_parser.parse_search_results()
if matches:
- for ind, match in enumerate(matches):
- matches[ind] = self._parse_single_book(match=match)
+ final_matches = []
+ for match in matches:
+ response = requests.get(match.get("url"))
+ match = lc_parser.parse_single_book(
+ match=match, response=response, generic_cover=generic_cover
+ )
+ final_matches.append(match)
+ return final_matches
return matches
def _prepare_query(self, title: str) -> str:
@@ -128,9 +138,7 @@ class LubimyCzytac(Metadata):
token for token in title.lower().split(" ") if len(token) > 1
]
else:
- title_tokens = list(
- self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)
- )
+ title_tokens = list(self.get_title_tokens(title, strip_joiners=False))
if title_tokens:
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
query = query + "%20".join(tokens)
@@ -138,215 +146,21 @@ class LubimyCzytac(Metadata):
return ""
return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"
- def _parse_search_results(self, root) -> List[Dict]:
- matches = []
- results = root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
- for result in results:
- title = result.xpath(
- f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
- f"{LubimyCzytac.TITLE_TEXT_PATH}"
- )
- book_url = result.xpath(
- f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.URL_PATH}"
- )
- authors = result.xpath(
- f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
- f"{LubimyCzytac.AUTHORS_PATH}"
- )
-
- if not title or not book_url or not authors:
- continue
- title = title[0].strip()
- book_url = LubimyCzytac.BASE_URL + book_url[0]
- book_id = book_url.replace(f"{LubimyCzytac.BASE_URL}/ksiazka/", "").split(
- "/"
- )[0]
- matches.append(
- {"id": book_id, "title": title, "authors": authors, "url": book_url}
- )
- return matches
-
- def _parse_single_book(self, match: Dict) -> Dict:
- url = match.get("url")
- result = requests.get(url)
- root = fromstring(result.text)
- match["series"], match["series_index"] = self._parse_series(root=root)
- match["tags"] = self._parse_tags(root=root)
- match["publisher"] = self._parse_publisher(root=root)
- match["publishedDate"] = self._parse_from_summary(
- root=root, attribute_name="datePublished"
- )
- match["rating"] = self._parse_rating(root=root)
- match["description"] = self._parse_description(root=root)
- match["cover"] = self._parse_cover(root=root)
- match["source"] = {
- "id": self.__id__,
- "description": self.__name__,
- "link": LubimyCzytac.BASE_URL,
- }
- match['languages'] = self._parse_languages(root=root)
- match["identifiers"] = {
- "isbn": self._parse_isbn(root=root),
- "lubimyczytac": match["id"],
- }
- return match
-
- def _parse_cover(self, root):
- imgcol_node = root.xpath('//meta[@property="og:image"]/@content')
- if imgcol_node:
- img_url = imgcol_node[0]
- return img_url
-
- def _parse_publisher(self, root):
- publisher = root.xpath(LubimyCzytac.PUBLISHER)
- if publisher:
- return publisher[0]
- else:
- return None
-
- def _parse_languages(self, root):
- lang = root.xpath(LubimyCzytac.LANGUAGES)
- languages = list()
- if lang:
- lang = lang[0].strip()
- if "polski" in lang:
- languages.append("Polish")
- if "angielski" in lang:
- languages.append("English")
- if not languages:
- return ['Polish']
- return languages
-
- def _parse_series(self, root):
- try:
- series_node = root.xpath(LubimyCzytac.SERIES)
- if series_node:
- series_lst = root.xpath(f"{LubimyCzytac.SERIES}/text()")
- if series_lst:
- series_txt = series_lst
- else:
- series_txt = None
- else:
- return (None, None)
-
- if series_txt:
- ser_string = [series_txt[0].replace("\n", "").strip()]
- ser_nazwa = ser_string
- for ser in ser_string:
- if "tom " in ser:
- ser_info = ser.split(" (tom ", 1)
- ser_nazwa = ser.split(" (tom ")[0]
- break
-
- if ser_info:
- series_index_unicode = ser_info[1]
- series_index_string = str(
- series_index_unicode.replace(" ", "").replace(")", "")
- )
- # Sprawdzamy, czy cykl nie jest kompletem/pakietem tomów, np. 1-3
- if "-" in series_index_string:
- series_index_string_temp = series_index_string.split("-", 1)
- series_index_string = series_index_string_temp[0]
- if series_index_string.replace(".", "").isdigit() is True:
- series_index = get_int_or_float(series_index_string)
- else:
- series_index = 0
- else:
- series_index = 0
- series = ser_nazwa
- return (series, series_index)
- except:
- return (None, None)
-
- def _parse_tags(self, root):
- tags = None
- try:
- tags_from_genre = root.xpath(LubimyCzytac.TAGS)
- if tags_from_genre:
- tags = tags_from_genre
- tags = [w.replace(", itd.", " itd.") for w in tags]
- return tags
- else:
- return None
- except:
- return tags
-
- def _parse_from_summary(self, root, attribute_name: str) -> str:
- data = json.loads(root.xpath(LubimyCzytac.SUMMARY)[0])
- value = data.get(attribute_name)
- return value.strip() if value is not None else value
-
- def _parse_rating(self, root):
- rating_node = root.xpath(LubimyCzytac.RATING)
- if rating_node:
- rating_value = round(float((rating_node[0]).replace(",", ".")) / 2)
- return rating_value
- return None
-
- def _parse_date(self, root, xpath="first_publish"):
- options = {
- "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
- "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
- }
- path = options.get(xpath)
- from dateutil import parser
-
- data = root.xpath(path)
- if data:
- first_pub_date = data[0].strip()
- return parser.parse(first_pub_date)
- return None
-
- def _parse_isbn(self, root):
- isbn_node = root.xpath('//meta[@property="books:isbn"]/@content')[0]
- return isbn_node
-
- def _parse_description(self, root):
- comments = ""
- description_node = root.xpath(LubimyCzytac.DESCRIPTION)
- if description_node:
- for zrodla in root.xpath('//p[@class="source"]'):
- zrodla.getparent().remove(zrodla)
- comments = tostring(description_node[0], method="html")
- comments = sanitize_comments_html(comments)
-
- else:
- # try
- description_node = root.xpath('//meta[@property="og:description"]/@content')
- if description_node:
- comments = description_node[0]
- comments = sanitize_comments_html(comments)
-
- pages = self._parse_from_summary(root=root, attribute_name="numberOfPages")
- if pages:
-            comments += f'<p>Książka ma {pages} stron(y).</p>'
-
- first_publish_date = self._parse_date(root=root)
- if first_publish_date:
-            comments += f'<p>Data pierwszego wydania: {first_publish_date.strftime("%d.%m.%Y")}</p>'
-
- first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl")
- if first_publish_date_pl:
-            comments += f'<p>Data pierwszego wydania w Polsce: {first_publish_date_pl.strftime("%d.%m.%Y")}</p>'
-
- return comments
-
- def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
+ @staticmethod
+ def get_title_tokens(
+ title: str, strip_joiners: bool = True
+ ) -> Generator[str, None, None]:
"""
- Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py.
+ Taken from calibre source code
"""
- # strip sub-titles
- if strip_subtitle:
- subtitle = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)")
- if len(subtitle.sub("", title)) > 1:
- title = subtitle.sub("", title)
-
title_patterns = [
(re.compile(pat, re.IGNORECASE), repl)
for pat, repl in [
# Remove things like: (2010) (Omnibus) etc.
(
- r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]",
+ r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|"
+ r"audiobook|audio\scd|paperback|turtleback|"
+ r"mass\s*market|edition|ed\.)[\])}]",
"",
),
# Remove any strings that contain the substring edition inside
@@ -371,3 +185,193 @@ class LubimyCzytac(Metadata):
not strip_joiners or token.lower() not in ("a", "and", "the", "&")
):
yield token
+
+
+class LubimyCzytacParser:
+    PAGES_TEMPLATE = "<p>Książka ma {0} stron(y).</p>"
+    PUBLISH_DATE_TEMPLATE = "<p>Data pierwszego wydania: {0}</p>"
+    PUBLISH_DATE_PL_TEMPLATE = (
+        "<p>Data pierwszego wydania w Polsce: {0}</p>"
+    )
+
+ def __init__(self, root: HtmlElement, metadata: Metadata) -> None:
+ self.root = root
+ self.metadata = metadata
+
+ def parse_search_results(self) -> List[Dict]:
+ matches = []
+ results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
+ for result in results:
+ title = self._parse_xpath_node(
+ root=result,
+ xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
+ f"{LubimyCzytac.TITLE_TEXT_PATH}",
+ )
+
+ book_url = self._parse_xpath_node(
+ root=result,
+ xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
+ f"{LubimyCzytac.URL_PATH}",
+ )
+ authors = self._parse_xpath_node(
+ root=result,
+ xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
+ f"{LubimyCzytac.AUTHORS_PATH}",
+ take_first=False,
+ )
+ if not all([title, book_url, authors]):
+ continue
+ matches.append(
+ {
+ "id": book_url.replace(f"/ksiazka/", "").split("/")[0],
+ "title": title,
+ "authors": [strip_accents(author) for author in authors],
+ "url": LubimyCzytac.BASE_URL + book_url,
+ }
+ )
+ return matches
+
+ def parse_single_book(
+ self, match: Dict, response, generic_cover: str
+ ) -> MetaRecord:
+ self.root = fromstring(response.text)
+ match["series"], match["series_index"] = self._parse_series()
+ match["tags"] = self._parse_tags()
+ match["publisher"] = self._parse_publisher()
+ match["publishedDate"] = self._parse_from_summary(
+ attribute_name="datePublished"
+ )
+ match["rating"] = self._parse_rating()
+ match["description"] = self._parse_description()
+ match["cover"] = self._parse_cover(generic_cover=generic_cover)
+ match["source"] = {
+ "id": self.metadata.__id__,
+ "description": self.metadata.__name__,
+ "link": LubimyCzytac.BASE_URL,
+ }
+ match["languages"] = self._parse_languages()
+ match["identifiers"] = {
+ "isbn": self._parse_isbn(),
+ "lubimyczytac": match["id"],
+ }
+ return match
+
+ def _parse_xpath_node(
+ self,
+ xpath: str,
+ root: HtmlElement = None,
+ take_first: bool = True,
+ strip_element: bool = True,
+ ) -> Optional[Union[str, List[str]]]:
+ root = root if root is not None else self.root
+ node = root.xpath(xpath)
+ if not node:
+ return None
+ return (
+ (node[0].strip() if strip_element else node[0])
+ if take_first
+ else [x.strip() for x in node]
+ )
+
+ def _parse_cover(self, generic_cover) -> Optional[str]:
+ return (
+ self._parse_xpath_node(xpath=LubimyCzytac.COVER, take_first=True)
+ or generic_cover
+ )
+
+ def _parse_publisher(self) -> Optional[str]:
+ return self._parse_xpath_node(xpath=LubimyCzytac.PUBLISHER, take_first=True)
+
+ def _parse_languages(self) -> List[str]:
+ languages = list()
+ lang = self._parse_xpath_node(xpath=LubimyCzytac.LANGUAGES, take_first=True)
+ if lang:
+ if "polski" in lang:
+ languages.append("Polish")
+ if "angielski" in lang:
+ languages.append("English")
+ return languages
+
+ def _parse_series(self) -> Tuple[Optional[str], Optional[Union[float, int]]]:
+ series_index = 0
+ series = self._parse_xpath_node(xpath=LubimyCzytac.SERIES, take_first=True)
+ if series:
+ if "tom " in series:
+ series_name, series_info = series.split(" (tom ", 1)
+ series_info = series_info.replace(" ", "").replace(")", "")
+ # Check if book is not a bundle, i.e. chapter 1-3
+ if "-" in series_info:
+ series_info = series_info.split("-", 1)[0]
+ if series_info.replace(".", "").isdigit() is True:
+ series_index = get_int_or_float(series_info)
+ return series_name, series_index
+ return None, None
+
+ def _parse_tags(self) -> List[str]:
+ tags = self._parse_xpath_node(xpath=LubimyCzytac.TAGS, take_first=False)
+ return [
+ strip_accents(w.replace(", itd.", " itd."))
+ for w in tags
+ if isinstance(w, str)
+ ]
+
+ def _parse_from_summary(self, attribute_name: str) -> Optional[str]:
+ value = None
+ summary_text = self._parse_xpath_node(xpath=LubimyCzytac.SUMMARY)
+ if summary_text:
+ data = json.loads(summary_text)
+ value = data.get(attribute_name)
+ return value.strip() if value is not None else value
+
+ def _parse_rating(self) -> Optional[str]:
+ rating = self._parse_xpath_node(xpath=LubimyCzytac.RATING)
+ return round(float(rating.replace(",", ".")) / 2) if rating else rating
+
+ def _parse_date(self, xpath="first_publish") -> Optional[datetime.datetime]:
+ options = {
+ "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE,
+ "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL,
+ }
+ date = self._parse_xpath_node(xpath=options.get(xpath))
+ return parser.parse(date) if date else None
+
+ def _parse_isbn(self) -> Optional[str]:
+ return self._parse_xpath_node(xpath=LubimyCzytac.ISBN)
+
+ def _parse_description(self) -> str:
+ description = ""
+ description_node = self._parse_xpath_node(
+ xpath=LubimyCzytac.DESCRIPTION, strip_element=False
+ )
+ if description_node is not None:
+ for source in self.root.xpath('//p[@class="source"]'):
+ source.getparent().remove(source)
+ description = tostring(description_node, method="html")
+ description = sanitize_comments_html(description)
+
+ else:
+ description_node = self._parse_xpath_node(xpath=LubimyCzytac.META_TITLE)
+ if description_node is not None:
+ description = description_node
+ description = sanitize_comments_html(description)
+ description = self._add_extra_info_to_description(description=description)
+ return description
+
+ def _add_extra_info_to_description(self, description: str) -> str:
+ pages = self._parse_from_summary(attribute_name="numberOfPages")
+ if pages:
+ description += LubimyCzytacParser.PAGES_TEMPLATE.format(pages)
+
+ first_publish_date = self._parse_date()
+ if first_publish_date:
+ description += LubimyCzytacParser.PUBLISH_DATE_TEMPLATE.format(
+ first_publish_date.strftime("%d.%m.%Y")
+ )
+
+ first_publish_date_pl = self._parse_date(xpath="first_publish_pl")
+ if first_publish_date_pl:
+ description += LubimyCzytacParser.PUBLISH_DATE_PL_TEMPLATE.format(
+ first_publish_date_pl.strftime("%d.%m.%Y")
+ )
+
+ return description
diff --git a/cps/services/Metadata.py b/cps/services/Metadata.py
index d6e4e7d5..17a9e38e 100644
--- a/cps/services/Metadata.py
+++ b/cps/services/Metadata.py
@@ -15,13 +15,44 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
+import abc
+from typing import Dict, List, Optional, TypedDict, Union
-class Metadata():
+class Metadata:
__name__ = "Generic"
+ __id__ = "generic"
def __init__(self):
self.active = True
def set_status(self, state):
self.active = state
+
+ @abc.abstractmethod
+ def search(self, query: str, generic_cover: str):
+ pass
+
+
+class MetaSourceInfo(TypedDict):
+ id: str
+ description: str
+ link: str
+
+
+class MetaRecord(TypedDict):
+ id: Union[str, int]
+ title: str
+ authors: List[str]
+ url: str
+ cover: str
+ series: Optional[str]
+ series_index: Optional[Union[int, float]]
+ tags: Optional[List[str]]
+ publisher: Optional[str]
+ publishedDate: Optional[str]
+ rating: Optional[int]
+ description: Optional[str]
+ source: MetaSourceInfo
+ languages: Optional[List[str]]
+ identifiers: Dict[str, Union[str, int]]
diff --git a/requirements.txt b/requirements.txt
index d1f58a8d..d09c2157 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,3 +16,4 @@ lxml>=3.8.0,<4.7.0
flask-wtf>=0.14.2,<1.1.0
markdown2==2.4.2
html2text==2020.1.16
+python-dateutil==2.8.2