unify scholar
This commit is contained in:
parent
d64589914f
commit
51bf35c2e4
|
@ -17,49 +17,68 @@
|
|||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# ComicVine api document: https://comicvine.gamespot.com/api/documentation
|
||||
from typing import Dict, List, Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
import requests
|
||||
from cps.services.Metadata import Metadata
|
||||
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
|
||||
|
||||
|
||||
class ComicVine(Metadata):
|
||||
__name__ = "ComicVine"
|
||||
__id__ = "comicvine"
|
||||
DESCRIPTION = "ComicVine Books"
|
||||
META_URL = "https://comicvine.gamespot.com/"
|
||||
API_KEY = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
|
||||
BASE_URL = (
|
||||
f"https://comicvine.gamespot.com/api/search?api_key={API_KEY}"
|
||||
f"&resources=issue&query="
|
||||
)
|
||||
QUERY_PARAMS = "&sort=name:desc&format=json"
|
||||
HEADERS = {"User-Agent": "Not Evil Browser"}
|
||||
|
||||
def search(self, query, generic_cover=""):
|
||||
def search(
|
||||
self, query: str, generic_cover: str = "", locale: str = "en"
|
||||
) -> Optional[List[MetaRecord]]:
|
||||
val = list()
|
||||
apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6"
|
||||
if self.active:
|
||||
headers = {
|
||||
'User-Agent': 'Not Evil Browser'
|
||||
}
|
||||
|
||||
result = requests.get("https://comicvine.gamespot.com/api/search?api_key="
|
||||
+ apikey + "&resources=issue&query=" + query + "&sort=name:desc&format=json", headers=headers)
|
||||
for r in result.json()['results']:
|
||||
seriesTitle = r['volume'].get('name', "")
|
||||
if r.get('store_date'):
|
||||
dateFomers = r.get('store_date')
|
||||
else:
|
||||
dateFomers = r.get('date_added')
|
||||
v = dict()
|
||||
v['id'] = r['id']
|
||||
v['title'] = seriesTitle + " #" + r.get('issue_number', "0") + " - " + ( r.get('name', "") or "")
|
||||
v['authors'] = r.get('authors', [])
|
||||
v['description'] = r.get('description', "")
|
||||
v['publisher'] = ""
|
||||
v['publishedDate'] = dateFomers
|
||||
v['tags'] = ["Comics", seriesTitle]
|
||||
v['rating'] = 0
|
||||
v['series'] = seriesTitle
|
||||
v['cover'] = r['image'].get('original_url', generic_cover)
|
||||
v['source'] = {
|
||||
"id": self.__id__,
|
||||
"description": "ComicVine Books",
|
||||
"link": "https://comicvine.gamespot.com/"
|
||||
}
|
||||
v['url'] = r.get('site_detail_url', "")
|
||||
val.append(v)
|
||||
title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
|
||||
if title_tokens:
|
||||
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
|
||||
query = "%20".join(tokens)
|
||||
result = requests.get(
|
||||
f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}",
|
||||
headers=ComicVine.HEADERS,
|
||||
)
|
||||
for result in result.json()["results"]:
|
||||
match = self._parse_search_result(
|
||||
result=result, generic_cover=generic_cover, locale=locale
|
||||
)
|
||||
val.append(match)
|
||||
return val
|
||||
|
||||
|
||||
def _parse_search_result(
|
||||
self, result: Dict, generic_cover: str, locale: str
|
||||
) -> MetaRecord:
|
||||
series = result["volume"].get("name", "")
|
||||
series_index = result.get("issue_number", 0)
|
||||
issue_name = result.get("name", "")
|
||||
match = MetaRecord(
|
||||
id=result["id"],
|
||||
title=f"{series}#{series_index} - {issue_name}",
|
||||
authors=result.get("authors", []),
|
||||
url=result.get("site_detail_url", ""),
|
||||
source=MetaSourceInfo(
|
||||
id=self.__id__,
|
||||
description=ComicVine.DESCRIPTION,
|
||||
link=ComicVine.META_URL,
|
||||
),
|
||||
series=series,
|
||||
)
|
||||
match.cover = result["image"].get("original_url", generic_cover)
|
||||
match.description = result.get("description", "")
|
||||
match.publishedDate = result.get("store_date", result.get("date_added"))
|
||||
match.series_index = series_index
|
||||
match.tags = ["Comics", series]
|
||||
match.identifiers = {"comicvine": match.id}
|
||||
return match
|
||||
|
|
|
@ -23,7 +23,7 @@ from urllib.parse import quote
|
|||
import requests
|
||||
|
||||
from cps.isoLanguages import get_lang3, get_language_name
|
||||
from cps.services.Metadata import MetaRecord, Metadata
|
||||
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
|
||||
|
||||
|
||||
class Google(Metadata):
|
||||
|
@ -56,38 +56,37 @@ class Google(Metadata):
|
|||
def _parse_search_result(
|
||||
self, result: Dict, generic_cover: str, locale: str
|
||||
) -> MetaRecord:
|
||||
match = dict()
|
||||
match["id"] = result["id"]
|
||||
match["title"] = result["volumeInfo"]["title"]
|
||||
match["authors"] = result["volumeInfo"].get("authors", [])
|
||||
match["url"] = Google.BOOK_URL + result["id"]
|
||||
match["cover"] = self._parse_cover(result=result, generic_cover=generic_cover)
|
||||
match["description"] = result["volumeInfo"].get("description", "")
|
||||
match["languages"] = self._parse_languages(result=result, locale=locale)
|
||||
match["publisher"] = result["volumeInfo"].get("publisher", "")
|
||||
match["publishedDate"] = result["volumeInfo"].get("publishedDate", "")
|
||||
match["rating"] = result["volumeInfo"].get("averageRating", 0)
|
||||
match["series"], match["series_index"] = "", 1
|
||||
match["tags"] = result["volumeInfo"].get("categories", [])
|
||||
match = MetaRecord(
|
||||
id=result["id"],
|
||||
title=result["volumeInfo"]["title"],
|
||||
authors=result["volumeInfo"].get("authors", []),
|
||||
url=Google.BOOK_URL + result["id"],
|
||||
source=MetaSourceInfo(
|
||||
id=self.__id__,
|
||||
description=Google.DESCRIPTION,
|
||||
link=Google.META_URL,
|
||||
),
|
||||
)
|
||||
|
||||
match["source"] = {
|
||||
"id": self.__id__,
|
||||
"description": Google.DESCRIPTION,
|
||||
"link": Google.META_URL,
|
||||
}
|
||||
match.cover = self._parse_cover(result=result, generic_cover=generic_cover)
|
||||
match.description = result["volumeInfo"].get("description", "")
|
||||
match.languages = self._parse_languages(result=result, locale=locale)
|
||||
match.publisher = result["volumeInfo"].get("publisher", "")
|
||||
match.publishedDate = result["volumeInfo"].get("publishedDate", "")
|
||||
match.rating = result["volumeInfo"].get("averageRating", 0)
|
||||
match.series, match.series_index = "", 1
|
||||
match.tags = result["volumeInfo"].get("categories", [])
|
||||
|
||||
match["identifiers"] = {
|
||||
"google": match.get("id"),
|
||||
}
|
||||
match.identifiers = {"google": match.id}
|
||||
match = self._parse_isbn(result=result, match=match)
|
||||
return match
|
||||
|
||||
@staticmethod
|
||||
def _parse_isbn(result: Dict, match: Dict) -> Dict:
|
||||
def _parse_isbn(result: Dict, match: MetaRecord) -> MetaRecord:
|
||||
identifiers = result["volumeInfo"].get("industryIdentifiers", [])
|
||||
for identifier in identifiers:
|
||||
if identifier.get("type") == Google.ISBN_TYPE:
|
||||
match["identifiers"]["isbn"] = identifier.get("identifier")
|
||||
match.identifiers["isbn"] = identifier.get("identifier")
|
||||
break
|
||||
return match
|
||||
|
||||
|
@ -100,7 +99,7 @@ class Google(Metadata):
|
|||
|
||||
@staticmethod
|
||||
def _parse_languages(result: Dict, locale: str) -> List[str]:
|
||||
language_iso2 = result.get("language", "")
|
||||
language_iso2 = result["volumeInfo"].get("language", "")
|
||||
languages = (
|
||||
[get_language_name(locale, get_lang3(language_iso2))]
|
||||
if language_iso2
|
||||
|
|
|
@ -27,7 +27,7 @@ from html2text import HTML2Text
|
|||
from lxml.html import HtmlElement, fromstring, tostring
|
||||
from markdown2 import Markdown
|
||||
|
||||
from cps.services.Metadata import MetaRecord, Metadata
|
||||
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
|
||||
|
||||
SYMBOLS_TO_TRANSLATE = (
|
||||
"öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ",
|
||||
|
@ -158,61 +158,60 @@ class LubimyCzytacParser:
|
|||
self.root = root
|
||||
self.metadata = metadata
|
||||
|
||||
def parse_search_results(self) -> List[Dict]:
|
||||
def parse_search_results(self) -> List[MetaRecord]:
|
||||
matches = []
|
||||
results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH)
|
||||
for result in results:
|
||||
title = self._parse_xpath_node(
|
||||
root=result,
|
||||
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
|
||||
f"{LubimyCzytac.TITLE_TEXT_PATH}",
|
||||
f"{LubimyCzytac.TITLE_TEXT_PATH}",
|
||||
)
|
||||
|
||||
book_url = self._parse_xpath_node(
|
||||
root=result,
|
||||
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
|
||||
f"{LubimyCzytac.URL_PATH}",
|
||||
f"{LubimyCzytac.URL_PATH}",
|
||||
)
|
||||
authors = self._parse_xpath_node(
|
||||
root=result,
|
||||
xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}"
|
||||
f"{LubimyCzytac.AUTHORS_PATH}",
|
||||
f"{LubimyCzytac.AUTHORS_PATH}",
|
||||
take_first=False,
|
||||
)
|
||||
if not all([title, book_url, authors]):
|
||||
continue
|
||||
matches.append(
|
||||
{
|
||||
"id": book_url.replace(f"/ksiazka/", "").split("/")[0],
|
||||
"title": title,
|
||||
"authors": [strip_accents(author) for author in authors],
|
||||
"url": LubimyCzytac.BASE_URL + book_url,
|
||||
}
|
||||
MetaRecord(
|
||||
id=book_url.replace(f"/ksiazka/", "").split("/")[0],
|
||||
title=title,
|
||||
authors=[strip_accents(author) for author in authors],
|
||||
url=LubimyCzytac.BASE_URL + book_url,
|
||||
source=MetaSourceInfo(
|
||||
id=self.metadata.__id__,
|
||||
description=self.metadata.__name__,
|
||||
link=LubimyCzytac.BASE_URL,
|
||||
)
|
||||
)
|
||||
)
|
||||
return matches
|
||||
|
||||
def parse_single_book(self, match: Dict, generic_cover: str) -> MetaRecord:
|
||||
response = requests.get(match.get("url"))
|
||||
def parse_single_book(self, match: MetaRecord, generic_cover: str) -> MetaRecord:
|
||||
response = requests.get(match.url)
|
||||
self.root = fromstring(response.text)
|
||||
match["cover"] = self._parse_cover(generic_cover=generic_cover)
|
||||
match["description"] = self._parse_description()
|
||||
match["languages"] = self._parse_languages()
|
||||
match["publisher"] = self._parse_publisher()
|
||||
match["publishedDate"] = self._parse_from_summary(
|
||||
match.cover = self._parse_cover(generic_cover=generic_cover)
|
||||
match.description = self._parse_description()
|
||||
match.languages = self._parse_languages()
|
||||
match.publisher = self._parse_publisher()
|
||||
match.publishedDate = self._parse_from_summary(
|
||||
attribute_name="datePublished"
|
||||
)
|
||||
match["rating"] = self._parse_rating()
|
||||
match["series"], match["series_index"] = self._parse_series()
|
||||
match["tags"] = self._parse_tags()
|
||||
|
||||
match["source"] = {
|
||||
"id": self.metadata.__id__,
|
||||
"description": self.metadata.__name__,
|
||||
"link": LubimyCzytac.BASE_URL,
|
||||
}
|
||||
match["identifiers"] = {
|
||||
match.rating = self._parse_rating()
|
||||
match.series, match.series_index = self._parse_series()
|
||||
match.tags = self._parse_tags()
|
||||
match.identifiers = {
|
||||
"isbn": self._parse_isbn(),
|
||||
"lubimyczytac": match["id"],
|
||||
"lubimyczytac": match.id,
|
||||
}
|
||||
return match
|
||||
|
||||
|
|
|
@ -15,47 +15,53 @@
|
|||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
import itertools
|
||||
from typing import Dict, List, Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
from scholarly import scholarly
|
||||
|
||||
from cps.services.Metadata import Metadata
|
||||
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
|
||||
|
||||
|
||||
class scholar(Metadata):
|
||||
__name__ = "Google Scholar"
|
||||
__id__ = "googlescholar"
|
||||
META_URL = "https://scholar.google.com/"
|
||||
|
||||
def search(self, query, generic_cover=""):
|
||||
def search(
|
||||
self, query: str, generic_cover: str = "", locale: str = "en"
|
||||
) -> Optional[List[MetaRecord]]:
|
||||
val = list()
|
||||
if self.active:
|
||||
scholar_gen = scholarly.search_pubs(' '.join(query.split('+')))
|
||||
i = 0
|
||||
for publication in scholar_gen:
|
||||
v = dict()
|
||||
v['id'] = "1234" # publication['bib'].get('title')
|
||||
v['title'] = publication['bib'].get('title')
|
||||
v['authors'] = publication['bib'].get('author', [])
|
||||
v['description'] = publication['bib'].get('abstract', "")
|
||||
v['publisher'] = publication['bib'].get('venue', "")
|
||||
if publication['bib'].get('pub_year'):
|
||||
v['publishedDate'] = publication['bib'].get('pub_year')+"-01-01"
|
||||
else:
|
||||
v['publishedDate'] = ""
|
||||
v['tags'] = ""
|
||||
v['ratings'] = 0
|
||||
v['series'] = ""
|
||||
v['cover'] = generic_cover
|
||||
v['url'] = publication.get('pub_url') or publication.get('eprint_url') or "",
|
||||
v['source'] = {
|
||||
"id": self.__id__,
|
||||
"description": "Google Scholar",
|
||||
"link": "https://scholar.google.com/"
|
||||
}
|
||||
val.append(v)
|
||||
i += 1
|
||||
if (i >= 10):
|
||||
break
|
||||
title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
|
||||
if title_tokens:
|
||||
tokens = [quote(t.encode("utf-8")) for t in title_tokens]
|
||||
query = " ".join(tokens)
|
||||
scholar_gen = itertools.islice(scholarly.search_pubs(query), 10)
|
||||
for result in scholar_gen:
|
||||
match = self._parse_search_result(
|
||||
result=result, generic_cover=generic_cover, locale=locale
|
||||
)
|
||||
val.append(match)
|
||||
return val
|
||||
|
||||
def _parse_search_result(
|
||||
self, result: Dict, generic_cover: str, locale: str
|
||||
) -> MetaRecord:
|
||||
match = MetaRecord(
|
||||
id=result.get("pub_url", result.get("eprint_url", "")),
|
||||
title=result["bib"].get("title"),
|
||||
authors=result["bib"].get("author", []),
|
||||
url=result.get("pub_url", result.get("eprint_url", "")),
|
||||
source=MetaSourceInfo(
|
||||
id=self.__id__, description=self.__name__, link=scholar.META_URL
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
match.cover = result.get("image", {}).get("original_url", generic_cover)
|
||||
match.description = result["bib"].get("abstract", "")
|
||||
match.publisher = result["bib"].get("venue", "")
|
||||
match.publishedDate = result["bib"].get("pub_year") + "-01-01"
|
||||
match.identifiers = {"scholar": match.id}
|
||||
return match
|
||||
|
|
|
@ -22,6 +22,7 @@ import inspect
|
|||
import json
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import asdict
|
||||
|
||||
from flask import Blueprint, Response, request, url_for
|
||||
from flask_login import current_user
|
||||
|
@ -99,11 +100,13 @@ def metadata_change_active_provider(prov_name):
|
|||
log.error("Invalid request received: {}".format(request))
|
||||
return "Invalid request", 400
|
||||
if "initial" in new_state and prov_name:
|
||||
for c in cl:
|
||||
if c.__id__ == prov_name:
|
||||
data = c.search(new_state.get("query", ""))
|
||||
break
|
||||
return Response(json.dumps(data), mimetype="application/json")
|
||||
data = []
|
||||
provider = next((c for c in cl if c.__id__ == prov_name), None)
|
||||
if provider is not None:
|
||||
data = provider.search(new_state.get("query", ""))
|
||||
return Response(
|
||||
json.dumps([asdict(x) for x in data]), mimetype="application/json"
|
||||
)
|
||||
return ""
|
||||
|
||||
|
||||
|
@ -123,5 +126,5 @@ def metadata_search():
|
|||
if active.get(c.__id__, True)
|
||||
}
|
||||
for future in concurrent.futures.as_completed(meta):
|
||||
data.extend(future.result())
|
||||
data.extend([asdict(x) for x in future.result()])
|
||||
return Response(json.dumps(data), mimetype="application/json")
|
||||
|
|
|
@ -16,32 +16,38 @@
|
|||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
import abc
|
||||
import dataclasses
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, Generator, List, Optional, TypedDict, Union
|
||||
from typing import Dict, Generator, List, Optional, Union
|
||||
|
||||
from cps import constants
|
||||
|
||||
|
||||
class MetaSourceInfo(TypedDict):
|
||||
@dataclasses.dataclass
|
||||
class MetaSourceInfo:
|
||||
id: str
|
||||
description: str
|
||||
link: str
|
||||
|
||||
|
||||
class MetaRecord(TypedDict):
|
||||
@dataclasses.dataclass
|
||||
class MetaRecord:
|
||||
id: Union[str, int]
|
||||
title: str
|
||||
authors: List[str]
|
||||
url: str
|
||||
cover: str
|
||||
series: Optional[str]
|
||||
series_index: Optional[Union[int, float]]
|
||||
tags: Optional[List[str]]
|
||||
publisher: Optional[str]
|
||||
publishedDate: Optional[str]
|
||||
rating: Optional[int]
|
||||
description: Optional[str]
|
||||
source: MetaSourceInfo
|
||||
languages: Optional[List[str]]
|
||||
identifiers: Dict[str, Union[str, int]]
|
||||
cover: str = os.path.join(constants.STATIC_DIR, 'generic_cover.jpg')
|
||||
description: Optional[str] = ""
|
||||
series: Optional[str] = None
|
||||
series_index: Optional[Union[int, float]] = 0
|
||||
identifiers: Dict[str, Union[str, int]] = dataclasses.field(default_factory=dict)
|
||||
publisher: Optional[str] = None
|
||||
publishedDate: Optional[str] = None
|
||||
rating: Optional[int] = 0
|
||||
languages: Optional[List[str]] = dataclasses.field(default_factory=list)
|
||||
tags: Optional[List[str]] = dataclasses.field(default_factory=list)
|
||||
|
||||
|
||||
class Metadata:
|
||||
|
|
|
@ -32,6 +32,9 @@ SQLAlchemy-Utils>=0.33.5,<0.38.0
|
|||
# extracting metadata
|
||||
rarfile>=2.7
|
||||
scholarly>=1.2.0, <1.5
|
||||
markdown2==2.4.2
|
||||
html2text==2020.1.16
|
||||
python-dateutil==2.8.2
|
||||
|
||||
# other
|
||||
natsort>=2.2.0,<8.1.0
|
||||
|
|
|
@ -14,6 +14,3 @@ Wand>=0.4.4,<0.7.0
|
|||
unidecode>=0.04.19,<1.3.0
|
||||
lxml>=3.8.0,<4.7.0
|
||||
flask-wtf>=0.14.2,<1.1.0
|
||||
markdown2==2.4.2
|
||||
html2text==2020.1.16
|
||||
python-dateutil==2.8.2
|
||||
|
|
Loading…
Reference in New Issue
Block a user