Merge remote-tracking branch 'douban/metadata_provider/douban'

This commit is contained in:
Ozzie Isaacs 2022-12-25 10:35:35 +01:00
commit 5dc3385ae5

View File

@ -43,7 +43,8 @@ class Douban(Metadata):
__id__ = "douban" __id__ = "douban"
DESCRIPTION = "豆瓣" DESCRIPTION = "豆瓣"
META_URL = "https://book.douban.com/" META_URL = "https://book.douban.com/"
SEARCH_URL = "https://www.douban.com/j/search" SEARCH_JSON_URL = "https://www.douban.com/j/search"
SEARCH_URL = "https://www.douban.com/search"
ID_PATTERN = re.compile(r"sid: (?P<id>\d+),") ID_PATTERN = re.compile(r"sid: (?P<id>\d+),")
AUTHORS_PATTERN = re.compile(r"作者|译者") AUTHORS_PATTERN = re.compile(r"作者|译者")
@ -52,6 +53,7 @@ class Douban(Metadata):
PUBLISHED_DATE_PATTERN = re.compile(r"出版年") PUBLISHED_DATE_PATTERN = re.compile(r"出版年")
SERIES_PATTERN = re.compile(r"丛书") SERIES_PATTERN = re.compile(r"丛书")
IDENTIFIERS_PATTERN = re.compile(r"ISBN|统一书号") IDENTIFIERS_PATTERN = re.compile(r"ISBN|统一书号")
CRITERIA_PATTERN = re.compile("criteria = '(.+)'")
TITTLE_XPATH = "//span[@property='v:itemreviewed']" TITTLE_XPATH = "//span[@property='v:itemreviewed']"
COVER_XPATH = "//a[@class='nbg']" COVER_XPATH = "//a[@class='nbg']"
@ -66,53 +68,87 @@ class Douban(Metadata):
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56',
} }
def search(self,
           query: str,
           generic_cover: str = "",
           locale: str = "en") -> List[MetaRecord]:
    """Search Douban for books matching ``query``.

    The query is tokenised with ``get_title_tokens`` and the tokens are
    joined with ``+``. Candidate book ids are taken from the HTML search
    page, then every book page is fetched and parsed on a small thread
    pool.

    :param query: free-text search string.
    :param generic_cover: fallback cover passed through to
        ``_parse_single_book``.
    :param locale: accepted for interface compatibility; not used by this
        method.
    :return: the records that could be parsed, in completion order (not
        in search-result order). Empty when the provider is inactive or
        nothing was found.
    """
    if not self.active:
        return []

    log.debug(f"start searching {query} on douban")
    if title_tokens := list(
            self.get_title_tokens(query, strip_joiners=False)):
        query = "+".join(title_tokens)

    book_id_list = self._get_book_id_list_from_html(query)
    if not book_id_list:
        log.debug("No search results in Douban")
        return []

    records = []
    with futures.ThreadPoolExecutor(
            max_workers=5, thread_name_prefix='douban') as executor:
        pending = [
            executor.submit(self._parse_single_book, book_id,
                            generic_cover) for book_id in book_id_list
        ]
        for future in futures.as_completed(pending):
            try:
                record = future.result()
            except Exception as e:
                # result() re-raises whatever the worker raised; one
                # unparsable book page must not discard the other results.
                log.warning(e)
                continue
            if record:
                records.append(record)
    return records
def _get_book_id_list_from_html(self, query: str) -> List[str]:
    """Return book ids scraped from the Douban HTML search page.

    Requests ``SEARCH_URL`` with ``cat=1001`` and the query, selects the
    elements matched by ``COVER_XPATH`` and extracts the id from each
    element's ``onclick`` attribute using ``ID_PATTERN``.

    :param query: search string sent as the ``q`` parameter.
    :return: ids found among the first ten matched elements; an empty
        list when the request fails or nothing matches.
    """
    try:
        r = self.session.get(self.SEARCH_URL,
                             params={
                                 "cat": 1001,
                                 "q": query
                             })
        r.raise_for_status()
    except Exception as e:
        log.warning(e)
        return []

    html = etree.HTML(r.content.decode("utf8"))
    if html is None:
        # No root element could be built from the response body.
        return []

    book_ids = []
    for item in html.xpath(self.COVER_XPATH)[:10]:
        # .get() yields None when the attribute is absent; searching None
        # would raise TypeError, so fall back to an empty string.
        if matched := self.ID_PATTERN.search(item.get("onclick") or ""):
            book_ids.append(matched.group("id"))
    return book_ids
def _get_book_id_list_from_json(self, query: str) -> List[str]:
try:
r = self.session.get(self.SEARCH_JSON_URL,
params={
"cat": 1001,
"q": query
})
r.raise_for_status()
except Exception as e:
log.warning(e)
return []
results = r.json() results = r.json()
if results["total"] == 0: if results["total"] == 0:
return [] return []
book_id_list = [ return [
self.ID_PATTERN.search(item).group("id") self.ID_PATTERN.search(item).group("id")
for item in results["items"][:10] if self.ID_PATTERN.search(item) for item in results["items"][:10] if self.ID_PATTERN.search(item)
] ]
with futures.ThreadPoolExecutor(max_workers=5) as executor: def _parse_single_book(self,
id: str,
fut = [ generic_cover: str = "") -> Optional[MetaRecord]:
executor.submit(self._parse_single_book, book_id, generic_cover)
for book_id in book_id_list
]
val = [
future.result()
for future in futures.as_completed(fut) if future.result()
]
return val
def _parse_single_book(
self, id: str, generic_cover: str = ""
) -> Optional[MetaRecord]:
url = f"https://book.douban.com/subject/{id}/" url = f"https://book.douban.com/subject/{id}/"
log.debug(f"start parsing {url}")
try: try:
r = self.session.get(url) r = self.session.get(url)
@ -136,7 +172,8 @@ class Douban(Metadata):
html = etree.HTML(r.content.decode("utf8")) html = etree.HTML(r.content.decode("utf8"))
match.title = html.xpath(self.TITTLE_XPATH)[0].text match.title = html.xpath(self.TITTLE_XPATH)[0].text
match.cover = html.xpath(self.COVER_XPATH)[0].attrib["href"] or generic_cover match.cover = html.xpath(
self.COVER_XPATH)[0].attrib["href"] or generic_cover
try: try:
rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip()) rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
except Exception: except Exception:
@ -146,35 +183,39 @@ class Douban(Metadata):
tag_elements = html.xpath(self.TAGS_XPATH) tag_elements = html.xpath(self.TAGS_XPATH)
if len(tag_elements): if len(tag_elements):
match.tags = [tag_element.text for tag_element in tag_elements] match.tags = [tag_element.text for tag_element in tag_elements]
else:
match.tags = self._get_tags(html.text)
description_element = html.xpath(self.DESCRIPTION_XPATH) description_element = html.xpath(self.DESCRIPTION_XPATH)
if len(description_element): if len(description_element):
match.description = html2text(etree.tostring( match.description = html2text(
description_element[-1], encoding="utf8").decode("utf8")) etree.tostring(description_element[-1]).decode("utf8"))
info = html.xpath(self.INFO_XPATH) info = html.xpath(self.INFO_XPATH)
for element in info: for element in info:
text = element.text text = element.text
if self.AUTHORS_PATTERN.search(text): if self.AUTHORS_PATTERN.search(text):
next = element.getnext() next_element = element.getnext()
while next is not None and next.tag != "br": while next_element is not None and next_element.tag != "br":
match.authors.append(next.text) match.authors.append(next_element.text)
next = next.getnext() next_element = next_element.getnext()
elif self.PUBLISHER_PATTERN.search(text): elif self.PUBLISHER_PATTERN.search(text):
match.publisher = element.tail.strip() if publisher := element.tail.strip():
match.publisher = publisher
else:
match.publisher = element.getnext().text
elif self.SUBTITLE_PATTERN.search(text): elif self.SUBTITLE_PATTERN.search(text):
match.title = f'{match.title}:' + element.tail.strip() match.title = f'{match.title}:{element.tail.strip()}'
elif self.PUBLISHED_DATE_PATTERN.search(text): elif self.PUBLISHED_DATE_PATTERN.search(text):
match.publishedDate = self._clean_date(element.tail.strip()) match.publishedDate = self._clean_date(element.tail.strip())
elif self.SUBTITLE_PATTERN.search(text): elif self.SERIES_PATTERN.search(text):
match.series = element.getnext().text match.series = element.getnext().text
elif i_type := self.IDENTIFIERS_PATTERN.search(text): elif i_type := self.IDENTIFIERS_PATTERN.search(text):
match.identifiers[i_type.group()] = element.tail.strip() match.identifiers[i_type.group()] = element.tail.strip()
return match return match
def _clean_date(self, date: str) -> str: def _clean_date(self, date: str) -> str:
""" """
Clean up the date string to be in the format YYYY-MM-DD Clean up the date string to be in the format YYYY-MM-DD
@ -194,13 +235,24 @@ class Douban(Metadata):
if date[i].isdigit(): if date[i].isdigit():
digit.append(date[i]) digit.append(date[i])
elif digit: elif digit:
ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}") ls.append("".join(digit) if len(digit) ==
2 else f"0{digit[0]}")
digit = [] digit = []
if digit: if digit:
ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}") ls.append("".join(digit) if len(digit) ==
2 else f"0{digit[0]}")
moon = ls[0] moon = ls[0]
if len(ls) > 1: if len(ls) > 1:
day = ls[1] day = ls[1]
return f"{year}-{moon}-{day}" return f"{year}-{moon}-{day}"
def _get_tags(self, text: str) -> List[str]:
tags = []
if criteria := self.CRITERIA_PATTERN.search(text):
tags.extend(
item.replace('7:', '') for item in criteria.group().split('|')
if item.startswith('7:'))
return tags