add series, languages and isbn to google provider

2025-11-04 01:03:02 +00:00 · 2021-12-13 15:14:19 +01:00
parent 362fdc5716
commit d64589914f
4 changed files with 119 additions and 66 deletions
--- a/cps/metadata_provider/google.py
+++ b/cps/metadata_provider/google.py
@@ -17,41 +17,93 @@
 #  along with this program. If not, see <http://www.gnu.org/licenses/>.

 # Google Books api document: https://developers.google.com/books/docs/v1/using
+from typing import Dict, List, Optional
+from urllib.parse import quote
+
 import requests

-from cps.services.Metadata import Metadata
+from cps.isoLanguages import get_lang3, get_language_name
+from cps.services.Metadata import MetaRecord, Metadata


 class Google(Metadata):
    __name__ = "Google"
    __id__ = "google"
-    BASE_URL = "https://www.googleapis.com/books/v1/volumes?q="
+    DESCRIPTION = "Google Books"
+    META_URL = "https://books.google.com/"
+    BOOK_URL = "https://books.google.com/books?id="
+    SEARCH_URL = "https://www.googleapis.com/books/v1/volumes?q="
+    ISBN_TYPE = "ISBN_13"

-    def search(self, query, generic_cover=""):
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ) -> Optional[List[MetaRecord]]:
+        """Search Google Books for ``query`` and return every parsed match.
+
+        The query is split into title tokens, each token is URL-quoted and
+        the tokens are joined with ``+``; if tokenizing yields nothing the
+        query is sent as given.
+
+        :param query: free-text search string
+        :param generic_cover: cover URL used for volumes without image links
+        :param locale: locale forwarded to the language-name lookup
+        :return: a list with one record per item of the response (empty when
+            the response carries no items), or ``None`` when the provider is
+            not active
+        """
        if self.active:
            val = list()
-            result = requests.get(Google.BASE_URL + query.replace(" ","+"))
-            for r in result.json()['items']:
-                v = dict()
-                v['id'] = r['id']
-                v['title'] = r['volumeInfo']['title']
-                v['authors'] = r['volumeInfo'].get('authors', [])
-                v['description'] = r['volumeInfo'].get('description', "")
-                v['publisher'] = r['volumeInfo'].get('publisher', "")
-                v['publishedDate'] = r['volumeInfo'].get('publishedDate', "")
-                v['tags'] = r['volumeInfo'].get('categories', [])
-                v['rating'] = r['volumeInfo'].get('averageRating', 0)
-                if r['volumeInfo'].get('imageLinks'):
-                    v['cover'] = r['volumeInfo']['imageLinks']['thumbnail'].replace("http://", "https://")
-                else:
-                    # v['cover'] = "/../../../static/generic_cover.jpg"
-                    v['cover'] = generic_cover
-                v['source'] = {
-                    "id": self.__id__,
-                    "description": "Google Books",
-                    "link": "https://books.google.com/"}
-                v['url'] = "https://books.google.com/books?id=" + r['id']
-                val.append(v)
-            return val
+            title_tokens = list(self.get_title_tokens(query, strip_joiners=False))
+            if title_tokens:
+                tokens = [quote(t.encode("utf-8")) for t in title_tokens]
+                query = "+".join(tokens)
+            results = requests.get(Google.SEARCH_URL + query)
+            # The response may carry no "items" key (e.g. nothing matched);
+            # treat that as an empty result instead of raising KeyError.
+            for result in results.json().get("items", []):
+                val.append(
+                    self._parse_search_result(
+                        result=result, generic_cover=generic_cover, locale=locale
+                    )
+                )
+            # Return after the loop so that all items are collected, not
+            # just the first one.
+            return val

+    def _parse_search_result(
+        self, result: Dict, generic_cover: str, locale: str
+    ) -> MetaRecord:
+        """Turn one item of a Google Books search response into a record.
+
+        :param result: a single entry of the response's ``items`` list; must
+            contain ``id`` and ``volumeInfo`` (with at least ``title``)
+        :param generic_cover: cover URL used when the volume has no image links
+        :param locale: locale forwarded to the language-name lookup
+        :return: the populated record; ``series`` is always ``""`` and
+            ``series_index`` is always ``1`` for this provider
+        """
+        volume_id = result["id"]
+        info = result["volumeInfo"]
+        record = {
+            "id": volume_id,
+            "title": info["title"],
+            "authors": info.get("authors", []),
+            "url": Google.BOOK_URL + volume_id,
+            "cover": self._parse_cover(result=result, generic_cover=generic_cover),
+            "description": info.get("description", ""),
+            "languages": self._parse_languages(result=result, locale=locale),
+            "publisher": info.get("publisher", ""),
+            "publishedDate": info.get("publishedDate", ""),
+            "rating": info.get("averageRating", 0),
+            "series": "",
+            "series_index": 1,
+            "tags": info.get("categories", []),
+            "source": {
+                "id": self.__id__,
+                "description": Google.DESCRIPTION,
+                "link": Google.META_URL,
+            },
+            "identifiers": {"google": volume_id},
+        }
+        # _parse_isbn adds the "isbn" identifier in place and hands the
+        # same dict back.
+        return self._parse_isbn(result=result, match=record)
+
+    @staticmethod
+    def _parse_isbn(result: Dict, match: Dict) -> Dict:
+        """Store the volume's ISBN under ``match["identifiers"]["isbn"]``.
+
+        Only the first ``industryIdentifiers`` entry whose ``type`` equals
+        ``Google.ISBN_TYPE`` is used; when there is none, ``match`` is left
+        untouched.
+
+        :param result: a single entry of the search response's ``items`` list
+        :param match: record being built; must already hold an
+            ``identifiers`` dict, which is modified in place
+        :return: the same ``match`` object
+        """
+        isbn_entry = next(
+            (
+                entry
+                for entry in result["volumeInfo"].get("industryIdentifiers", [])
+                if entry.get("type") == Google.ISBN_TYPE
+            ),
+            None,
+        )
+        if isbn_entry is not None:
+            match["identifiers"]["isbn"] = isbn_entry.get("identifier")
+        return match
+
+    @staticmethod
+    def _parse_cover(result: Dict, generic_cover: str) -> str:
+        """Return the volume's thumbnail URL, forced to ``https``.
+
+        :param result: a single entry of the search response's ``items`` list
+        :param generic_cover: value returned when ``volumeInfo`` has no
+            (or empty) ``imageLinks``
+        :return: the ``thumbnail`` link with ``http://`` replaced by
+            ``https://``, or ``generic_cover``
+        """
+        image_links = result["volumeInfo"].get("imageLinks")
+        if not image_links:
+            return generic_cover
+        return image_links["thumbnail"].replace("http://", "https://")
+
+    @staticmethod
+    def _parse_languages(result: Dict, locale: str) -> List[str]:
+        """Return the volume's language as a list of display names.
+
+        :param result: a single entry of the search response's ``items`` list
+        :param locale: locale passed to ``get_language_name`` for the lookup
+        :return: a one-element list with the language name, or an empty list
+            when the volume declares no language
+        """
+        # The language code is a field of "volumeInfo", like every other
+        # attribute this provider reads; looking it up at the top level of
+        # the item never finds it and always produced an empty list.
+        language_iso2 = result.get("volumeInfo", {}).get("language", "")
+        if not language_iso2:
+            return []
+        # NOTE(review): get_lang3 presumably maps the two-letter code to the
+        # three-letter code get_language_name expects; confirm in
+        # cps.isoLanguages.
+        return [get_language_name(locale, get_lang3(language_iso2))]
--- a/cps/metadata_provider/lubimyczytac.py
+++ b/cps/metadata_provider/lubimyczytac.py
@@ -107,7 +107,9 @@ class LubimyCzytac(Metadata):

    SUMMARY = "//script[@type='application/ld+json']//text()"

-    def search(self, query: str, generic_cover: str = "") -> Optional[List]:
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ) -> Optional[List[MetaRecord]]:
        if self.active:
            result = requests.get(self._prepare_query(title=query))
            root = fromstring(result.text)
@@ -117,10 +119,7 @@ class LubimyCzytac(Metadata):
                with ThreadPool(processes=10) as pool:
                    final_matches = pool.starmap(
                        lc_parser.parse_single_book,
-                        [
-                            (match, generic_cover)
-                            for match in matches
-                        ],
+                        [(match, generic_cover) for match in matches],
                    )
                return final_matches
            return matches
@@ -192,26 +191,25 @@ class LubimyCzytacParser:
            )
        return matches

-    def parse_single_book(
-        self, match: Dict, generic_cover: str
-    ) -> MetaRecord:
+    def parse_single_book(self, match: Dict, generic_cover: str) -> MetaRecord:
        response = requests.get(match.get("url"))
        self.root = fromstring(response.text)
-        match["series"], match["series_index"] = self._parse_series()
-        match["tags"] = self._parse_tags()
+        match["cover"] = self._parse_cover(generic_cover=generic_cover)
+        match["description"] = self._parse_description()
+        match["languages"] = self._parse_languages()
        match["publisher"] = self._parse_publisher()
        match["publishedDate"] = self._parse_from_summary(
            attribute_name="datePublished"
        )
        match["rating"] = self._parse_rating()
-        match["description"] = self._parse_description()
-        match["cover"] = self._parse_cover(generic_cover=generic_cover)
+        match["series"], match["series_index"] = self._parse_series()
+        match["tags"] = self._parse_tags()
+
        match["source"] = {
            "id": self.metadata.__id__,
            "description": self.metadata.__name__,
            "link": LubimyCzytac.BASE_URL,
        }
-        match["languages"] = self._parse_languages()
        match["identifiers"] = {
            "isbn": self._parse_isbn(),
            "lubimyczytac": match["id"],
--- a/cps/search_metadata.py
+++ b/cps/search_metadata.py
@@ -30,7 +30,7 @@ from sqlalchemy.exc import InvalidRequestError, OperationalError
 from sqlalchemy.orm.attributes import flag_modified

 from cps.services.Metadata import Metadata
-from . import constants, logger, ub
+from . import constants, get_locale, logger, ub

 meta = Blueprint("metadata", __name__)

@@ -113,11 +113,12 @@ def metadata_search():
    query = request.form.to_dict().get("query")
    data = list()
    active = current_user.view_settings.get("metadata", {})
+    locale = get_locale()
    if query:
        static_cover = url_for("static", filename="generic_cover.jpg")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            meta = {
-                executor.submit(c.search, query, static_cover): c
+                executor.submit(c.search, query, static_cover, locale): c
                for c in cl
                if active.get(c.__id__, True)
            }
--- a/cps/services/Metadata.py
+++ b/cps/services/Metadata.py
@@ -20,6 +20,30 @@ import re
 from typing import Dict, Generator, List, Optional, TypedDict, Union


+class MetaSourceInfo(TypedDict):
+    """Describes the metadata provider that produced a record."""
+
+    # Provider identifier (providers fill this with their ``__id__``).
+    id: str
+    # Human-readable provider name.
+    description: str
+    # URL of the provider's website.
+    link: str
+
+
+class MetaRecord(TypedDict):
+    """A single search hit as returned by a provider's ``search`` method."""
+
+    # Identifier of the book at the provider.
+    id: Union[str, int]
+    title: str
+    authors: List[str]
+    # Link to the book at the provider.
+    url: str
+    # Cover image URL; providers fall back to the generic cover they were given.
+    cover: str
+    series: Optional[str]
+    series_index: Optional[Union[int, float]]
+    tags: Optional[List[str]]
+    publisher: Optional[str]
+    publishedDate: Optional[str]
+    rating: Optional[int]
+    description: Optional[str]
+    # Which provider the record came from.
+    source: MetaSourceInfo
+    languages: Optional[List[str]]
+    # Maps an identifier name (e.g. "isbn" or the provider id) to its value.
+    identifiers: Dict[str, Union[str, int]]
+
+
 class Metadata:
    __name__ = "Generic"
    __id__ = "generic"
@@ -31,7 +55,9 @@ class Metadata:
        self.active = state

    @abc.abstractmethod
-    def search(self, query: str, generic_cover: str = ""):
+    def search(
+        self, query: str, generic_cover: str = "", locale: str = "en"
+    ) -> Optional[List[MetaRecord]]:
+        """Look up ``query`` at the provider and return the matching records.
+
+        Abstract: every metadata provider supplies its own implementation.
+
+        :param query: free-text search string
+        :param generic_cover: cover URL to use for books without a cover
+        :param locale: locale of the requesting user; implementations may
+            use it to localize values such as language names
+        :return: the matching records, or ``None``
+        """
        pass

    @staticmethod
@@ -73,27 +99,3 @@ class Metadata:
                not strip_joiners or token.lower() not in ("a", "and", "the", "&")
            ):
                yield token
-
-
-class MetaSourceInfo(TypedDict):
-    id: str
-    description: str
-    link: str
-
-
-class MetaRecord(TypedDict):
-    id: Union[str, int]
-    title: str
-    authors: List[str]
-    url: str
-    cover: str
-    series: Optional[str]
-    series_index: Optional[Union[int, float]]
-    tags: Optional[List[str]]
-    publisher: Optional[str]
-    publishedDate: Optional[str]
-    rating: Optional[int]
-    description: Optional[str]
-    source: MetaSourceInfo
-    languages: Optional[List[str]]
-    identifiers: Dict[str, Union[str, int]]