From d64589914fdf69ce78111c2a5d29a967f7a881e3 Mon Sep 17 00:00:00 2001 From: collerek Date: Mon, 13 Dec 2021 15:14:19 +0100 Subject: [PATCH] add series, languages and isbn to google provider --- cps/metadata_provider/google.py | 104 +++++++++++++++++++------- cps/metadata_provider/lubimyczytac.py | 24 +++--- cps/search_metadata.py | 5 +- cps/services/Metadata.py | 52 ++++++------- 4 files changed, 119 insertions(+), 66 deletions(-) diff --git a/cps/metadata_provider/google.py b/cps/metadata_provider/google.py index 8be8ad74..1074fe3d 100644 --- a/cps/metadata_provider/google.py +++ b/cps/metadata_provider/google.py @@ -17,41 +17,93 @@ # along with this program. If not, see . # Google Books api document: https://developers.google.com/books/docs/v1/using +from typing import Dict, List, Optional +from urllib.parse import quote + import requests -from cps.services.Metadata import Metadata +from cps.isoLanguages import get_lang3, get_language_name +from cps.services.Metadata import MetaRecord, Metadata class Google(Metadata): __name__ = "Google" __id__ = "google" - BASE_URL = "https://www.googleapis.com/books/v1/volumes?q=" + DESCRIPTION = "Google Books" + META_URL = "https://books.google.com/" + BOOK_URL = "https://books.google.com/books?id=" + SEARCH_URL = "https://www.googleapis.com/books/v1/volumes?q=" + ISBN_TYPE = "ISBN_13" - def search(self, query, generic_cover=""): + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: if self.active: val = list() - result = requests.get(Google.BASE_URL + query.replace(" ","+")) - for r in result.json()['items']: - v = dict() - v['id'] = r['id'] - v['title'] = r['volumeInfo']['title'] - v['authors'] = r['volumeInfo'].get('authors', []) - v['description'] = r['volumeInfo'].get('description', "") - v['publisher'] = r['volumeInfo'].get('publisher', "") - v['publishedDate'] = r['volumeInfo'].get('publishedDate', "") - v['tags'] = r['volumeInfo'].get('categories', []) - 
v['rating'] = r['volumeInfo'].get('averageRating', 0) - if r['volumeInfo'].get('imageLinks'): - v['cover'] = r['volumeInfo']['imageLinks']['thumbnail'].replace("http://", "https://") - else: - # v['cover'] = "/../../../static/generic_cover.jpg" - v['cover'] = generic_cover - v['source'] = { - "id": self.__id__, - "description": "Google Books", - "link": "https://books.google.com/"} - v['url'] = "https://books.google.com/books?id=" + r['id'] - val.append(v) - return val + title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) + if title_tokens: + tokens = [quote(t.encode("utf-8")) for t in title_tokens] + query = "+".join(tokens) + results = requests.get(Google.SEARCH_URL + query) + for result in results.json().get("items", []):  # "items" is absent when nothing matches + val.append( + self._parse_search_result( + result=result, generic_cover=generic_cover, locale=locale + ) + ) + return val + def _parse_search_result( + self, result: Dict, generic_cover: str, locale: str + ) -> MetaRecord: + match = dict() + match["id"] = result["id"] + match["title"] = result["volumeInfo"]["title"] + match["authors"] = result["volumeInfo"].get("authors", []) + match["url"] = Google.BOOK_URL + result["id"] + match["cover"] = self._parse_cover(result=result, generic_cover=generic_cover) + match["description"] = result["volumeInfo"].get("description", "") + match["languages"] = self._parse_languages(result=result, locale=locale) + match["publisher"] = result["volumeInfo"].get("publisher", "") + match["publishedDate"] = result["volumeInfo"].get("publishedDate", "") + match["rating"] = result["volumeInfo"].get("averageRating", 0) + match["series"], match["series_index"] = "", 1 + match["tags"] = result["volumeInfo"].get("categories", []) + match["source"] = { + "id": self.__id__, + "description": Google.DESCRIPTION, + "link": Google.META_URL, + } + + match["identifiers"] = { + "google": match.get("id"), + } + match = self._parse_isbn(result=result, match=match) + return match + + @staticmethod + def _parse_isbn(result: 
Dict, match: Dict) -> Dict: + identifiers = result["volumeInfo"].get("industryIdentifiers", []) + for identifier in identifiers: + if identifier.get("type") == Google.ISBN_TYPE: + match["identifiers"]["isbn"] = identifier.get("identifier") + break + return match + + @staticmethod + def _parse_cover(result: Dict, generic_cover: str) -> str: + if result["volumeInfo"].get("imageLinks"): + cover_url = result["volumeInfo"]["imageLinks"]["thumbnail"] + return cover_url.replace("http://", "https://") + return generic_cover + + @staticmethod + def _parse_languages(result: Dict, locale: str) -> List[str]: + language_iso2 = result["volumeInfo"].get("language", "")  # language is a volumeInfo field, not top-level + languages = ( + [get_language_name(locale, get_lang3(language_iso2))] + if language_iso2 + else [] + ) + return languages diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py index 1d4e18e1..fd9ca4a7 100644 --- a/cps/metadata_provider/lubimyczytac.py +++ b/cps/metadata_provider/lubimyczytac.py @@ -107,7 +107,9 @@ class LubimyCzytac(Metadata): SUMMARY = "//script[@type='application/ld+json']//text()" - def search(self, query: str, generic_cover: str = "") -> Optional[List]: + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: if self.active: result = requests.get(self._prepare_query(title=query)) root = fromstring(result.text) @@ -117,10 +119,7 @@ class LubimyCzytac(Metadata): with ThreadPool(processes=10) as pool: final_matches = pool.starmap( lc_parser.parse_single_book, - [ - (match, generic_cover) - for match in matches - ], + [(match, generic_cover) for match in matches], ) return final_matches return matches @@ -192,26 +191,25 @@ class LubimyCzytacParser: ) return matches - def parse_single_book( - self, match: Dict, generic_cover: str - ) -> MetaRecord: + def parse_single_book(self, match: Dict, generic_cover: str) -> MetaRecord: response = requests.get(match.get("url")) self.root = fromstring(response.text) - 
match["series"], match["series_index"] = self._parse_series() - match["tags"] = self._parse_tags() + match["cover"] = self._parse_cover(generic_cover=generic_cover) + match["description"] = self._parse_description() + match["languages"] = self._parse_languages() match["publisher"] = self._parse_publisher() match["publishedDate"] = self._parse_from_summary( attribute_name="datePublished" ) match["rating"] = self._parse_rating() - match["description"] = self._parse_description() - match["cover"] = self._parse_cover(generic_cover=generic_cover) + match["series"], match["series_index"] = self._parse_series() + match["tags"] = self._parse_tags() + match["source"] = { "id": self.metadata.__id__, "description": self.metadata.__name__, "link": LubimyCzytac.BASE_URL, } - match["languages"] = self._parse_languages() match["identifiers"] = { "isbn": self._parse_isbn(), "lubimyczytac": match["id"], diff --git a/cps/search_metadata.py b/cps/search_metadata.py index 7d9b6e05..a128f9ac 100644 --- a/cps/search_metadata.py +++ b/cps/search_metadata.py @@ -30,7 +30,7 @@ from sqlalchemy.exc import InvalidRequestError, OperationalError from sqlalchemy.orm.attributes import flag_modified from cps.services.Metadata import Metadata -from . import constants, logger, ub +from . 
import constants, get_locale, logger, ub meta = Blueprint("metadata", __name__) @@ -113,11 +113,12 @@ def metadata_search(): query = request.form.to_dict().get("query") data = list() active = current_user.view_settings.get("metadata", {}) + locale = get_locale() if query: static_cover = url_for("static", filename="generic_cover.jpg") with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: meta = { - executor.submit(c.search, query, static_cover): c + executor.submit(c.search, query, static_cover, locale): c for c in cl if active.get(c.__id__, True) } diff --git a/cps/services/Metadata.py b/cps/services/Metadata.py index 1464411a..09fc70ce 100644 --- a/cps/services/Metadata.py +++ b/cps/services/Metadata.py @@ -20,6 +20,30 @@ import re from typing import Dict, Generator, List, Optional, TypedDict, Union +class MetaSourceInfo(TypedDict): + id: str + description: str + link: str + + +class MetaRecord(TypedDict): + id: Union[str, int] + title: str + authors: List[str] + url: str + cover: str + series: Optional[str] + series_index: Optional[Union[int, float]] + tags: Optional[List[str]] + publisher: Optional[str] + publishedDate: Optional[str] + rating: Optional[int] + description: Optional[str] + source: MetaSourceInfo + languages: Optional[List[str]] + identifiers: Dict[str, Union[str, int]] + + class Metadata: __name__ = "Generic" __id__ = "generic" @@ -31,7 +55,9 @@ class Metadata: self.active = state @abc.abstractmethod - def search(self, query: str, generic_cover: str = ""): + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: pass @staticmethod @@ -73,27 +99,3 @@ class Metadata: not strip_joiners or token.lower() not in ("a", "and", "the", "&") ): yield token - - -class MetaSourceInfo(TypedDict): - id: str - description: str - link: str - - -class MetaRecord(TypedDict): - id: Union[str, int] - title: str - authors: List[str] - url: str - cover: str - series: Optional[str] - series_index: 
Optional[Union[int, float]] - tags: Optional[List[str]] - publisher: Optional[str] - publishedDate: Optional[str] - rating: Optional[int] - description: Optional[str] - source: MetaSourceInfo - languages: Optional[List[str]] - identifiers: Dict[str, Union[str, int]]