2022-01-27 18:12:33 +00:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
|
|
|
|
# Copyright (C) 2022 quarz12
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
2022-01-25 13:33:34 +00:00
|
|
|
import concurrent.futures
|
|
|
|
import requests
|
|
|
|
from bs4 import BeautifulSoup as BS # requirement
|
2022-02-25 04:18:07 +00:00
|
|
|
from typing import List, Optional
|
2022-01-29 20:02:56 +00:00
|
|
|
|
2022-01-26 09:41:42 +00:00
|
|
|
try:
|
|
|
|
import cchardet #optional for better speed
|
|
|
|
except ImportError:
|
|
|
|
pass
|
2024-06-18 18:13:26 +00:00
|
|
|
|
2022-01-27 18:12:33 +00:00
|
|
|
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
|
2022-03-12 07:27:18 +00:00
|
|
|
import cps.logger as logger
|
|
|
|
|
2022-01-25 13:33:34 +00:00
|
|
|
#from time import time
|
|
|
|
from operator import itemgetter
|
2022-03-12 07:27:18 +00:00
|
|
|
log = logger.create()
|
2022-01-27 18:12:33 +00:00
|
|
|
|
2022-02-25 04:18:07 +00:00
|
|
|
|
2022-01-25 13:33:34 +00:00
|
|
|
class Amazon(Metadata):
    """Metadata provider that scrapes book details from amazon.com.

    A search runs one request against the Amazon search page (restricted to
    the ``digital-text`` department), then fetches and parses the product
    pages of the first few hits in parallel.
    """

    __name__ = "Amazon"
    __id__ = "amazon"

    # Browser-like request headers sent with every request of this provider.
    # NOTE(review): 'accept-encoding' advertises br and zstd; confirm that the
    # matching decoders are installed, otherwise such responses cannot be decoded.
    headers = {'Upgrade-Insecure-Requests': '1',
               'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0',
               'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8',
               'Sec-Fetch-Site': 'same-origin',
               'Sec-Fetch-Mode': 'navigate',
               'Sec-Fetch-User': '?1',
               'Sec-Fetch-Dest': 'document',
               'Alt-Used': 'www.amazon.com',
               'Priority': 'u=0, i',
               'accept-encoding': 'gzip, deflate, br, zstd',
               'accept-language': 'en-US,en;q=0.9'}

    # One session shared by all requests of this provider (class attribute).
    session = requests.Session()
    session.headers = headers

    def _fetch_record(self, link, index):
        """Fetch one product page and turn it into a ``MetaRecord``.

        :param link: ``href`` of a search hit, relative to https://www.amazon.com
        :param index: position of the hit in the search listing; handed back
            unchanged so the caller can restore the listing order
        :return: ``(record, index)`` on success, ``None`` if the page could not
            be fetched or parsed, or if it has no book description
        """
        # Build the URL once so the fetched page and the stored record URL agree.
        url = f"https://www.amazon.com/{link.lstrip('/')}"
        try:
            # The shared session is used directly: entering it as a context
            # manager would close it on exit while other workers still use it.
            r = self.session.get(url)
            r.raise_for_status()
        except Exception as ex:
            log.warning(ex)
            return None

        try:
            long_soup = BS(r.text, "lxml")  # ~4sec :/
            soup2 = long_soup.find("div", attrs={"cel_widget_id": "dpx-ppd_csm_instrumentation_wrapper"})
            if soup2 is None:
                return None

            match = MetaRecord(
                title="",
                authors=[],
                source=MetaSourceInfo(
                    id=self.__id__,
                    description="Amazon Books",
                    link="https://amazon.com/"
                ),
                url=url,
                # the more searches the slower, these are too hard to find in
                # reasonable time or might not even exist
                publisher="",  # very unreliable
                publishedDate="",  # very unreliable
                id=None,  # ?
                tags=[]  # don't exist on amazon
            )

            try:
                description_block = soup2.find("div", attrs={"data-feature-name": "bookDescription"})
                # The last 9 characters are always cut off - presumably the
                # trailing "Read more" expander label; verify against live markup.
                match.description = "\n".join(description_block.stripped_strings) \
                    .replace("\xa0", " ")[:-9].strip().strip("\n")
            except (AttributeError, TypeError):
                # if there is no description it is not a book and therefore should be ignored
                return None

            try:
                match.title = soup2.find("span", attrs={"id": "productTitle"}).text
            except (AttributeError, TypeError):
                match.title = ""

            try:
                # For every author span take the first text node that is not
                # pure whitespace and does not start with "{".
                match.authors = [
                    next(
                        text for text in span.findAll(string=True)
                        if text != " " and text != "\n" and not text.startswith("{")
                    ).strip()
                    for span in soup2.findAll("span", attrs={"class": "author"})
                ]
            except (AttributeError, TypeError, StopIteration):
                match.authors = []

            try:
                # first number in string, integer part only
                rating_text = soup2.find("span", class_="a-icon-alt").text
                match.rating = int(rating_text.split(" ")[0].split(".")[0])
            except (AttributeError, ValueError):
                match.rating = 0

            try:
                match.cover = soup2.find("img", attrs={"class": "a-dynamic-image"})["src"]
            except (AttributeError, TypeError, KeyError):
                # no image tag, or an image tag without a src attribute
                match.cover = ""

            return match, index
        except Exception as e:
            log.error_or_exception(e)
            return None

    def search(
        self, query: str, generic_cover: str = "", locale: str = "en"
    ) -> Optional[List[MetaRecord]]:
        """Search amazon.com for ``query`` and return the matching records.

        :param query: free-text search term
        :param generic_cover: accepted for interface compatibility; not used here
        :param locale: accepted for interface compatibility; not used here
        :return: records of up to the first three usable hits, in Amazon's
            listing order; an empty list if the provider is inactive (``self.active``
            is falsy) or the search request fails
        """
        if not self.active:
            return []

        try:
            # Let requests build the query string so that characters such as
            # '&', '#', '+' or non-ASCII text in the query are escaped properly.
            results = self.session.get(
                "https://www.amazon.com/s",
                params={
                    "k": query,
                    "i": "digital-text",
                    "sprefix": f"{query},digital-text",
                    "ref": "nb_sb_noss",
                },
                headers=self.headers)
            results.raise_for_status()
        except requests.exceptions.HTTPError as e:
            log.error_or_exception(e)
            return []
        except Exception as e:
            log.warning(e)
            return []

        soup = BS(results.text, 'html.parser')
        links_list = []
        for hit in soup.findAll("div", attrs={"data-component-type": "s-search-result"}):
            # First anchor of the hit pointing into the digital-text department;
            # hits without such an anchor are skipped instead of aborting the search.
            href = next(
                (a["href"] for a in hit.findAll("a", href=True) if "digital-text" in a["href"]),
                None)
            if href is not None:
                links_list.append(href)

        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(self._fetch_record, link, index)
                       for index, link in enumerate(links_list[:3])]
            fetched = [future.result() for future in concurrent.futures.as_completed(futures)]

        found = [entry for entry in fetched if entry]
        # sort by amazons listing order for best relevance
        return [record for record, _ in sorted(found, key=itemgetter(1))]
|