calibre-web/cps/metadata_provider/amazon.py

# -*- coding: utf-8 -*-

#  This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
#    Copyright (C) 2022 quarz12
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program. If not, see <http://www.gnu.org/licenses/>.

import concurrent.futures
from operator import itemgetter
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup as BS  # requirement

try:
    import cchardet #optional for better speed
except ImportError:
    pass
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
import cps.logger as logger

#from time import time
log = logger.create()

class Amazon(Metadata):
    """Metadata provider that scrapes amazon.com search results.

    ``search`` queries the ``digital-text`` search index, then downloads and
    parses the product pages of the first five hits in parallel.
    """
    __name__ = "Amazon"
    __id__ = "amazon"
    # Browser-like headers sent with every request.
    headers = {'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
               'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
               'sec-gpc': '1',
               'sec-fetch-site': 'none',
               'sec-fetch-mode': 'navigate',
               'sec-fetch-user': '?1',
               'sec-fetch-dest': 'document',
               'accept-encoding': 'gzip, deflate, br',
               'accept-language': 'en-US,en;q=0.9'}
    # Class-level session: shared by every instance and by the worker threads
    # started in search(), so it must not be closed after a single request.
    session = requests.Session()
    session.headers=headers
    # Seconds to wait for Amazon before giving up on a request; without a
    # timeout a stalled connection would block the search indefinitely.
    _timeout = 15

    def _fetch_record(self, link, index):
        """Download one product page and turn it into a ``MetaRecord``.

        :param link: site-relative product link taken from the search page.
        :param index: position of the link in the search listing; returned
            unchanged so the caller can restore the listing order.
        :return: ``(MetaRecord, index)`` on success, ``None`` when the page
            has no book container, has no description, or any error occurs
            (errors are logged, never raised).
        """
        url = f"https://www.amazon.com{link}"
        try:
            # Use the shared session directly: wrapping it in ``with`` would
            # close it when this fetch ends while other threads still use it.
            r = self.session.get(url, timeout=self._timeout)
            r.raise_for_status()
            long_soup = BS(r.text, "lxml")  #~4sec :/
            soup2 = long_soup.find("div", attrs={"cel_widget_id": "dpx-books-ppd_csm_instrumentation_wrapper"})
            if soup2 is None:
                return None
            match = MetaRecord(
                title = "",
                authors = "",
                source=MetaSourceInfo(
                    id=self.__id__,
                    description="Amazon Books",
                    link="https://amazon.com/"
                ),
                url = url,
                #the more searches the slower, these are too hard to find in reasonable time or might not even exist
                publisher= "",  # very unreliable
                publishedDate= "",  # very unreliable
                id = None,  # ?
                tags = []  # dont exist on amazon
            )

            try:
                # The last 9 characters are dropped unconditionally --
                # presumably a trailing "Read more" label; TODO confirm
                # against the current page markup.
                match.description = "\n".join(
                    soup2.find("div", attrs={"data-feature-name": "bookDescription"}).stripped_strings)\
                                        .replace("\xa0"," ")[:-9].strip().strip("\n")
            except (AttributeError, TypeError):
                return None  # if there is no description it is not a book and therefore should be ignored
            try:
                match.title = soup2.find("span", attrs={"id": "productTitle"}).text
            except (AttributeError, TypeError):
                match.title = ""
            try:
                # For each author span take the first text node that is not
                # bare whitespace and does not start with "{".
                match.authors = [next(
                    filter(lambda i: i != " " and i != "\n" and not i.startswith("{"),
                           x.findAll(text=True))).strip()
                                for x in soup2.findAll("span", attrs={"class": "author"})]
            except (AttributeError, TypeError, StopIteration):
                match.authors = ""
            try:
                match.rating = int(
                    soup2.find("span", class_="a-icon-alt").text.split(" ")[0].split(".")[
                        0])  # first number in string
            except (AttributeError, ValueError):
                match.rating = 0
            try:
                match.cover = soup2.find("img", attrs={"class": "a-dynamic-image frontImage"})["src"]
            except (AttributeError, TypeError, KeyError):
                # KeyError: the image tag exists but has no "src" attribute;
                # keep the record and just leave the cover empty.
                match.cover = ""
            return match, index
        except Exception as e:
            # Runs inside a worker thread: log and drop this hit rather than
            # failing the whole search.
            log.error_or_exception(e)
            return None

    def search(
        self, query: str, generic_cover: str = "", locale: str = "en"
    ):
        """Search amazon.com for ``query`` and return the matching records.

        :param query: free-text search terms; URL-encoded before use.
        :param generic_cover: accepted for interface compatibility, unused.
        :param locale: accepted for interface compatibility, unused.
        :return: list of ``MetaRecord`` ordered like Amazon's own listing
            (at most five); an empty list when the provider is inactive or
            the search request fails.
        """
        #timer=time()
        if not self.active:
            return []
        # quote_plus keeps characters such as "&", "#" or "%" in the query
        # from being interpreted as part of the URL structure.
        encoded_query = quote_plus(query)
        try:
            results = self.session.get(
                f"https://www.amazon.com/s?k={encoded_query}"
                f"&i=digital-text&sprefix={encoded_query}"
                f"%2Cdigital-text&ref=nb_sb_noss",
                headers=self.headers,
                timeout=self._timeout)
            results.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Covers HTTP error statuses as well as connection failures and
            # timeouts.
            log.error_or_exception(e)
            return []
        soup = BS(results.text, 'html.parser')
        links_list = []
        for entry in soup.findAll("div", attrs={"data-component-type": "s-search-result"}):
            # First anchor pointing into the digital-text index; results
            # without such an anchor are skipped instead of aborting.
            link = next((anchor["href"] for anchor in entry.findAll("a", href=True)
                         if "digital-text" in anchor["href"]), None)
            if link is not None:
                links_list.append(link)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            fut = {executor.submit(self._fetch_record, link, index)
                   for index, link in enumerate(links_list[:5])}
            val = [done.result() for done in concurrent.futures.as_completed(fut)]
        result = [x for x in val if x]
        return [x[0] for x in sorted(result, key=itemgetter(1))] #sort by amazons listing order for best relevance
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`# -- coding: utf-8 --`

			`# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)`
			`# Copyright (C) 2022 quarz12`
			`#`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`

amazon metadata 2022-01-25 13:33:34 +00:00			`import concurrent.futures`
			`import requests`
			`from bs4 import BeautifulSoup as BS # requirement`
Added optional requirements for metadata amazon Better logging of errors in metadata source files 2022-01-29 20:02:56 +00:00
import try catch 2022-01-26 09:41:42 +00:00			`try:`
			`import cchardet #optional for better speed`
			`except ImportError:`
			`pass`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata`
Update catch errors for load metadata from amazon (#2333) 2022-03-12 07:27:18 +00:00			`import cps.logger as logger`

amazon metadata 2022-01-25 13:33:34 +00:00			`#from time import time`
			`from operator import itemgetter`
Update catch errors for load metadata from amazon (#2333) 2022-03-12 07:27:18 +00:00			`log = logger.create()`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00
amazon metadata 2022-01-25 13:33:34 +00:00			`class Amazon(Metadata):`
			`__name__ = "Amazon"`
			`__id__ = "amazon"`
			`headers = {'upgrade-insecure-requests': '1',`
			`'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',`
			`'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,/;q=0.8,application/signed-exchange;v=b3;q=0.9',`
			`'sec-gpc': '1',`
			`'sec-fetch-site': 'none',`
			`'sec-fetch-mode': 'navigate',`
			`'sec-fetch-user': '?1',`
			`'sec-fetch-dest': 'document',`
			`'accept-encoding': 'gzip, deflate, br',`
			`'accept-language': 'en-US,en;q=0.9'}`
			`session = requests.Session()`
			`session.headers=headers`

Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`def search(`
			`self, query: str, generic_cover: str = "", locale: str = "en"`
			`):`
amazon metadata 2022-01-25 13:33:34 +00:00			`#timer=time()`
Update catch errors for load metadata from amazon (#2333) 2022-03-12 07:27:18 +00:00			`def inner(link, index) -> [dict, int]:`
			`try:`
			`with self.session as session:`
			`r = session.get(f"https://www.amazon.com{link}")`
			`r.raise_for_status()`
			`long_soup = BS(r.text, "lxml") #~4sec :/`
			`soup2 = long_soup.find("div", attrs={"cel_widget_id": "dpx-books-ppd_csm_instrumentation_wrapper"})`
			`if soup2 is None:`
			`return`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`match = MetaRecord(`
			`title = "",`
			`authors = "",`
			`source=MetaSourceInfo(`
			`id=self.__id__,`
			`description="Amazon Books",`
			`link="https://amazon.com/"`
			`),`
Update catch errors for load metadata from amazon (#2333) 2022-03-12 07:27:18 +00:00			`url = f"https://www.amazon.com{link}",`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`#the more searches the slower, these are too hard to find in reasonable time or might not even exist`
			`publisher= "", # very unreliable`
			`publishedDate= "", # very unreliable`
			`id = None, # ?`
			`tags = [] # dont exist on amazon`
			`)`
amazon metadata 2022-01-25 13:33:34 +00:00
			`try:`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`match.description = "\n".join(`
			`soup2.find("div", attrs={"data-feature-name": "bookDescription"}).stripped_strings)\`
			`.replace("\xa0"," ")[:-9].strip().strip("\n")`
amazon metadata 2022-01-25 13:33:34 +00:00			`except (AttributeError, TypeError):`
			`return None # if there is no description it is not a book and therefore should be ignored`
			`try:`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`match.title = soup2.find("span", attrs={"id": "productTitle"}).text`
amazon metadata 2022-01-25 13:33:34 +00:00			`except (AttributeError, TypeError):`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`match.title = ""`
amazon metadata 2022-01-25 13:33:34 +00:00			`try:`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`match.authors = [next(`
amazon metadata 2022-01-25 13:33:34 +00:00			`filter(lambda i: i != " " and i != "\n" and not i.startswith("{"),`
			`x.findAll(text=True))).strip()`
			`for x in soup2.findAll("span", attrs={"class": "author"})]`
			`except (AttributeError, TypeError, StopIteration):`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`match.authors = ""`
amazon metadata 2022-01-25 13:33:34 +00:00			`try:`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`match.rating = int(`
amazon metadata 2022-01-25 13:33:34 +00:00			`soup2.find("span", class_="a-icon-alt").text.split(" ")[0].split(".")[`
			`0]) # first number in string`
			`except (AttributeError, ValueError):`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`match.rating = 0`
amazon metadata 2022-01-25 13:33:34 +00:00			`try:`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`match.cover = soup2.find("img", attrs={"class": "a-dynamic-image frontImage"})["src"]`
amazon metadata 2022-01-25 13:33:34 +00:00			`except (AttributeError, TypeError):`
Merge remote-tracking branch 'amazon/master' into Develop 2022-01-27 18:12:33 +00:00			`match.cover = ""`
			`return match, index`
Update catch errors for load metadata from amazon (#2333) 2022-03-12 07:27:18 +00:00			`except Exception as e:`
Database error is more detailed renamed debug_or_exception to error_or_exception 2022-03-12 16:14:54 +00:00			`log.error_or_exception(e)`
Update catch errors for load metadata from amazon (#2333) 2022-03-12 07:27:18 +00:00			`return`
amazon metadata 2022-01-25 13:33:34 +00:00
			`val = list()`
Update catch errors for load metadata from amazon (#2333) 2022-03-12 07:27:18 +00:00			`try:`
			`if self.active:`
			`results = self.session.get(`
			`f"https://www.amazon.com/s?k={query.replace(' ', '+')}"`
			`f"&i=digital-text&sprefix={query.replace(' ', '+')}"`
			`f"%2Cdigital-text&ref=nb_sb_noss",`
			`headers=self.headers)`
			`results.raise_for_status()`
			`soup = BS(results.text, 'html.parser')`
			`links_list = [next(filter(lambda i: "digital-text" in i["href"], x.findAll("a")))["href"] for x in`
			`soup.findAll("div", attrs={"data-component-type": "s-search-result"})]`
			`with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:`
			`fut = {executor.submit(inner, link, index) for index, link in enumerate(links_list[:5])}`
			`val = list(map(lambda x: x.result(), concurrent.futures.as_completed(fut)))`
			`result = list(filter(lambda x: x, val))`
			`return [x[0] for x in sorted(result, key=itemgetter(1))] #sort by amazons listing order for best relevance`
			`except requests.exceptions.HTTPError as e:`
Database error is more detailed renamed debug_or_exception to error_or_exception 2022-03-12 16:14:54 +00:00			`log.error_or_exception(e)`
Update catch errors for load metadata from amazon (#2333) 2022-03-12 07:27:18 +00:00			`return []`