Merge remote-tracking branch 'douban/metadata_provider/douban'
This commit is contained in:
commit
5dc3385ae5
|
@ -43,7 +43,8 @@ class Douban(Metadata):
|
||||||
__id__ = "douban"
|
__id__ = "douban"
|
||||||
DESCRIPTION = "豆瓣"
|
DESCRIPTION = "豆瓣"
|
||||||
META_URL = "https://book.douban.com/"
|
META_URL = "https://book.douban.com/"
|
||||||
SEARCH_URL = "https://www.douban.com/j/search"
|
SEARCH_JSON_URL = "https://www.douban.com/j/search"
|
||||||
|
SEARCH_URL = "https://www.douban.com/search"
|
||||||
|
|
||||||
ID_PATTERN = re.compile(r"sid: (?P<id>\d+),")
|
ID_PATTERN = re.compile(r"sid: (?P<id>\d+),")
|
||||||
AUTHORS_PATTERN = re.compile(r"作者|译者")
|
AUTHORS_PATTERN = re.compile(r"作者|译者")
|
||||||
|
@ -52,6 +53,7 @@ class Douban(Metadata):
|
||||||
PUBLISHED_DATE_PATTERN = re.compile(r"出版年")
|
PUBLISHED_DATE_PATTERN = re.compile(r"出版年")
|
||||||
SERIES_PATTERN = re.compile(r"丛书")
|
SERIES_PATTERN = re.compile(r"丛书")
|
||||||
IDENTIFIERS_PATTERN = re.compile(r"ISBN|统一书号")
|
IDENTIFIERS_PATTERN = re.compile(r"ISBN|统一书号")
|
||||||
|
CRITERIA_PATTERN = re.compile("criteria = '(.+)'")
|
||||||
|
|
||||||
TITTLE_XPATH = "//span[@property='v:itemreviewed']"
|
TITTLE_XPATH = "//span[@property='v:itemreviewed']"
|
||||||
COVER_XPATH = "//a[@class='nbg']"
|
COVER_XPATH = "//a[@class='nbg']"
|
||||||
|
@ -63,56 +65,90 @@ class Douban(Metadata):
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
session.headers = {
|
session.headers = {
|
||||||
'user-agent':
|
'user-agent':
|
||||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56',
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56',
|
||||||
}
|
}
|
||||||
|
|
||||||
def search(
|
def search(self,
|
||||||
self, query: str, generic_cover: str = "", locale: str = "en"
|
query: str,
|
||||||
) -> Optional[List[MetaRecord]]:
|
generic_cover: str = "",
|
||||||
|
locale: str = "en") -> List[MetaRecord]:
|
||||||
|
val = []
|
||||||
if self.active:
|
if self.active:
|
||||||
log.debug(f"starting search {query} on douban")
|
log.debug(f"start searching {query} on douban")
|
||||||
if title_tokens := list(
|
if title_tokens := list(
|
||||||
self.get_title_tokens(query, strip_joiners=False)
|
self.get_title_tokens(query, strip_joiners=False)):
|
||||||
):
|
|
||||||
query = "+".join(title_tokens)
|
query = "+".join(title_tokens)
|
||||||
|
|
||||||
try:
|
book_id_list = self._get_book_id_list_from_html(query)
|
||||||
r = self.session.get(
|
|
||||||
self.SEARCH_URL, params={"cat": 1001, "q": query}
|
|
||||||
)
|
|
||||||
r.raise_for_status()
|
|
||||||
|
|
||||||
except Exception as e:
|
if not book_id_list:
|
||||||
log.warning(e)
|
log.debug("No search results in Douban")
|
||||||
return None
|
|
||||||
|
|
||||||
results = r.json()
|
|
||||||
if results["total"] == 0:
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
book_id_list = [
|
with futures.ThreadPoolExecutor(
|
||||||
self.ID_PATTERN.search(item).group("id")
|
max_workers=5, thread_name_prefix='douban') as executor:
|
||||||
for item in results["items"][:10] if self.ID_PATTERN.search(item)
|
|
||||||
]
|
|
||||||
|
|
||||||
with futures.ThreadPoolExecutor(max_workers=5) as executor:
|
|
||||||
|
|
||||||
fut = [
|
fut = [
|
||||||
executor.submit(self._parse_single_book, book_id, generic_cover)
|
executor.submit(self._parse_single_book, book_id,
|
||||||
for book_id in book_id_list
|
generic_cover) for book_id in book_id_list
|
||||||
]
|
]
|
||||||
|
|
||||||
val = [
|
val = [
|
||||||
future.result()
|
future.result() for future in futures.as_completed(fut)
|
||||||
for future in futures.as_completed(fut) if future.result()
|
if future.result()
|
||||||
]
|
]
|
||||||
|
|
||||||
return val
|
return val
|
||||||
|
|
||||||
def _parse_single_book(
|
def _get_book_id_list_from_html(self, query: str) -> List[str]:
|
||||||
self, id: str, generic_cover: str = ""
|
try:
|
||||||
) -> Optional[MetaRecord]:
|
r = self.session.get(self.SEARCH_URL,
|
||||||
|
params={
|
||||||
|
"cat": 1001,
|
||||||
|
"q": query
|
||||||
|
})
|
||||||
|
r.raise_for_status()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
log.warning(e)
|
||||||
|
return []
|
||||||
|
|
||||||
|
html = etree.HTML(r.content.decode("utf8"))
|
||||||
|
result_list = html.xpath(self.COVER_XPATH)
|
||||||
|
|
||||||
|
return [
|
||||||
|
self.ID_PATTERN.search(item.get("onclick")).group("id")
|
||||||
|
for item in result_list[:10]
|
||||||
|
if self.ID_PATTERN.search(item.get("onclick"))
|
||||||
|
]
|
||||||
|
|
||||||
|
def _get_book_id_list_from_json(self, query: str) -> List[str]:
    """Return up to ten subject ids from Douban's JSON search endpoint.

    Sends ``query`` to ``SEARCH_JSON_URL`` (with ``cat=1001``) and pulls the
    id captured by ``ID_PATTERN`` out of each of the first ten result items.

    Args:
        query: Search keywords, sent as the ``q`` request parameter.

    Returns:
        The matched ids in result order. An empty list is returned when the
        request fails, the body is not JSON of the expected shape, or the
        search reports no results.
    """
    try:
        r = self.session.get(self.SEARCH_JSON_URL,
                             params={"cat": 1001, "q": query})
        r.raise_for_status()
        # Decode inside the try block: a non-JSON reply must degrade to
        # "no results" like any other request failure instead of raising.
        results = r.json()
    except Exception as e:
        log.warning(e)
        return []

    # A missing or zero "total" both mean there is nothing to extract.
    if not isinstance(results, dict) or not results.get("total"):
        return []

    # Only the first ten items are inspected; items without an id are skipped.
    return [
        found.group("id")
        for item in (results.get("items") or [])[:10]
        if isinstance(item, str) and (found := self.ID_PATTERN.search(item))
    ]
|
||||||
|
|
||||||
|
def _parse_single_book(self,
|
||||||
|
id: str,
|
||||||
|
generic_cover: str = "") -> Optional[MetaRecord]:
|
||||||
url = f"https://book.douban.com/subject/{id}/"
|
url = f"https://book.douban.com/subject/{id}/"
|
||||||
|
log.debug(f"start parsing {url}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
r = self.session.get(url)
|
r = self.session.get(url)
|
||||||
|
@ -136,7 +172,8 @@ class Douban(Metadata):
|
||||||
html = etree.HTML(r.content.decode("utf8"))
|
html = etree.HTML(r.content.decode("utf8"))
|
||||||
|
|
||||||
match.title = html.xpath(self.TITTLE_XPATH)[0].text
|
match.title = html.xpath(self.TITTLE_XPATH)[0].text
|
||||||
match.cover = html.xpath(self.COVER_XPATH)[0].attrib["href"] or generic_cover
|
match.cover = html.xpath(
|
||||||
|
self.COVER_XPATH)[0].attrib["href"] or generic_cover
|
||||||
try:
|
try:
|
||||||
rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
|
rating_num = float(html.xpath(self.RATING_XPATH)[0].text.strip())
|
||||||
except Exception:
|
except Exception:
|
||||||
|
@ -146,35 +183,39 @@ class Douban(Metadata):
|
||||||
tag_elements = html.xpath(self.TAGS_XPATH)
|
tag_elements = html.xpath(self.TAGS_XPATH)
|
||||||
if len(tag_elements):
|
if len(tag_elements):
|
||||||
match.tags = [tag_element.text for tag_element in tag_elements]
|
match.tags = [tag_element.text for tag_element in tag_elements]
|
||||||
|
else:
|
||||||
|
match.tags = self._get_tags(html.text)
|
||||||
|
|
||||||
description_element = html.xpath(self.DESCRIPTION_XPATH)
|
description_element = html.xpath(self.DESCRIPTION_XPATH)
|
||||||
if len(description_element):
|
if len(description_element):
|
||||||
match.description = html2text(etree.tostring(
|
match.description = html2text(
|
||||||
description_element[-1], encoding="utf8").decode("utf8"))
|
etree.tostring(description_element[-1]).decode("utf8"))
|
||||||
|
|
||||||
info = html.xpath(self.INFO_XPATH)
|
info = html.xpath(self.INFO_XPATH)
|
||||||
|
|
||||||
for element in info:
|
for element in info:
|
||||||
text = element.text
|
text = element.text
|
||||||
if self.AUTHORS_PATTERN.search(text):
|
if self.AUTHORS_PATTERN.search(text):
|
||||||
next = element.getnext()
|
next_element = element.getnext()
|
||||||
while next is not None and next.tag != "br":
|
while next_element is not None and next_element.tag != "br":
|
||||||
match.authors.append(next.text)
|
match.authors.append(next_element.text)
|
||||||
next = next.getnext()
|
next_element = next_element.getnext()
|
||||||
elif self.PUBLISHER_PATTERN.search(text):
|
elif self.PUBLISHER_PATTERN.search(text):
|
||||||
match.publisher = element.tail.strip()
|
if publisher := element.tail.strip():
|
||||||
|
match.publisher = publisher
|
||||||
|
else:
|
||||||
|
match.publisher = element.getnext().text
|
||||||
elif self.SUBTITLE_PATTERN.search(text):
|
elif self.SUBTITLE_PATTERN.search(text):
|
||||||
match.title = f'{match.title}:' + element.tail.strip()
|
match.title = f'{match.title}:{element.tail.strip()}'
|
||||||
elif self.PUBLISHED_DATE_PATTERN.search(text):
|
elif self.PUBLISHED_DATE_PATTERN.search(text):
|
||||||
match.publishedDate = self._clean_date(element.tail.strip())
|
match.publishedDate = self._clean_date(element.tail.strip())
|
||||||
elif self.SUBTITLE_PATTERN.search(text):
|
elif self.SERIES_PATTERN.search(text):
|
||||||
match.series = element.getnext().text
|
match.series = element.getnext().text
|
||||||
elif i_type := self.IDENTIFIERS_PATTERN.search(text):
|
elif i_type := self.IDENTIFIERS_PATTERN.search(text):
|
||||||
match.identifiers[i_type.group()] = element.tail.strip()
|
match.identifiers[i_type.group()] = element.tail.strip()
|
||||||
|
|
||||||
return match
|
return match
|
||||||
|
|
||||||
|
|
||||||
def _clean_date(self, date: str) -> str:
|
def _clean_date(self, date: str) -> str:
|
||||||
"""
|
"""
|
||||||
Clean up the date string to be in the format YYYY-MM-DD
|
Clean up the date string to be in the format YYYY-MM-DD
|
||||||
|
@ -194,13 +235,24 @@ class Douban(Metadata):
|
||||||
if date[i].isdigit():
|
if date[i].isdigit():
|
||||||
digit.append(date[i])
|
digit.append(date[i])
|
||||||
elif digit:
|
elif digit:
|
||||||
ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}")
|
ls.append("".join(digit) if len(digit) ==
|
||||||
|
2 else f"0{digit[0]}")
|
||||||
digit = []
|
digit = []
|
||||||
if digit:
|
if digit:
|
||||||
ls.append("".join(digit) if len(digit)==2 else f"0{digit[0]}")
|
ls.append("".join(digit) if len(digit) ==
|
||||||
|
2 else f"0{digit[0]}")
|
||||||
|
|
||||||
moon = ls[0]
|
moon = ls[0]
|
||||||
if len(ls)>1:
|
if len(ls) > 1:
|
||||||
day = ls[1]
|
day = ls[1]
|
||||||
|
|
||||||
return f"{year}-{moon}-{day}"
|
return f"{year}-{moon}-{day}"
|
||||||
|
|
||||||
|
def _get_tags(self, text: str) -> List[str]:
    """Extract tags from the ``criteria = '...'`` snippet in ``text``.

    The value captured by ``CRITERIA_PATTERN`` is split on ``|``; every entry
    that starts with ``7:`` becomes a tag, with that prefix removed.

    Args:
        text: Text to scan. ``None`` or an empty string yields no tags.

    Returns:
        The tags in the order they appear; empty when nothing matches.
    """
    if not text:
        return []

    criteria = self.CRITERIA_PATTERN.search(text)
    if criteria is None:
        return []

    prefix = '7:'
    # Use the captured value (group 1), not the whole match: the whole match
    # still carries the "criteria = '" lead-in and the closing quote, which
    # hides the first entry from the prefix test below.
    return [
        item[len(prefix):]
        for item in criteria.group(1).split('|')
        if item.startswith(prefix)
    ]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user