Improve error handling when loading metadata from Amazon (#2333)

This commit is contained in:
Ozzie Isaacs 2022-03-12 08:27:18 +01:00
parent 34478079d8
commit 49692b4a45
1 changed file with 35 additions and 27 deletions

View File

@ -25,8 +25,11 @@ try:
except ImportError: except ImportError:
pass pass
from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata
import cps.logger as logger
#from time import time #from time import time
from operator import itemgetter from operator import itemgetter
log = logger.create()
class Amazon(Metadata): class Amazon(Metadata):
__name__ = "Amazon" __name__ = "Amazon"
@ -48,15 +51,15 @@ class Amazon(Metadata):
self, query: str, generic_cover: str = "", locale: str = "en" self, query: str, generic_cover: str = "", locale: str = "en"
): ):
#timer=time() #timer=time()
def inner(link,index)->[dict,int]: def inner(link, index) -> [dict, int]:
with self.session as session: try:
r = session.get(f"https://www.amazon.com/{link}") with self.session as session:
r.raise_for_status() r = session.get(f"https://www.amazon.com{link}")
long_soup = BS(r.text, "lxml") #~4sec :/ r.raise_for_status()
soup2 = long_soup.find("div", attrs={"cel_widget_id": "dpx-books-ppd_csm_instrumentation_wrapper"}) long_soup = BS(r.text, "lxml") #~4sec :/
if soup2 is None: soup2 = long_soup.find("div", attrs={"cel_widget_id": "dpx-books-ppd_csm_instrumentation_wrapper"})
return if soup2 is None:
try: return
match = MetaRecord( match = MetaRecord(
title = "", title = "",
authors = "", authors = "",
@ -65,7 +68,7 @@ class Amazon(Metadata):
description="Amazon Books", description="Amazon Books",
link="https://amazon.com/" link="https://amazon.com/"
), ),
url = f"https://www.amazon.com/{link}", url = f"https://www.amazon.com{link}",
#the more searches the slower, these are too hard to find in reasonable time or might not even exist #the more searches the slower, these are too hard to find in reasonable time or might not even exist
publisher= "", # very unreliable publisher= "", # very unreliable
publishedDate= "", # very unreliable publishedDate= "", # very unreliable
@ -101,22 +104,27 @@ class Amazon(Metadata):
except (AttributeError, TypeError): except (AttributeError, TypeError):
match.cover = "" match.cover = ""
return match, index return match, index
except Exception as e: except Exception as e:
print(e) log.debug_or_exception(e)
return return
val = list() val = list()
if self.active: try:
results = self.session.get( if self.active:
f"https://www.amazon.com/s?k={query.replace(' ', '+')}&i=digital-text&sprefix={query.replace(' ', '+')}" results = self.session.get(
f"%2Cdigital-text&ref=nb_sb_noss", f"https://www.amazon.com/s?k={query.replace(' ', '+')}"
headers=self.headers) f"&i=digital-text&sprefix={query.replace(' ', '+')}"
results.raise_for_status() f"%2Cdigital-text&ref=nb_sb_noss",
soup = BS(results.text, 'html.parser') headers=self.headers)
links_list = [next(filter(lambda i: "digital-text" in i["href"], x.findAll("a")))["href"] for x in results.raise_for_status()
soup.findAll("div", attrs={"data-component-type": "s-search-result"})] soup = BS(results.text, 'html.parser')
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: links_list = [next(filter(lambda i: "digital-text" in i["href"], x.findAll("a")))["href"] for x in
fut = {executor.submit(inner, link, index) for index, link in enumerate(links_list[:5])} soup.findAll("div", attrs={"data-component-type": "s-search-result"})]
val=list(map(lambda x : x.result() ,concurrent.futures.as_completed(fut))) with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
result=list(filter(lambda x: x, val)) fut = {executor.submit(inner, link, index) for index, link in enumerate(links_list[:5])}
return [x[0] for x in sorted(result, key=itemgetter(1))] #sort by amazons listing order for best relevance val = list(map(lambda x: x.result(), concurrent.futures.as_completed(fut)))
result = list(filter(lambda x: x, val))
return [x[0] for x in sorted(result, key=itemgetter(1))] #sort by amazons listing order for best relevance
except requests.exceptions.HTTPError as e:
log.debug_or_exception(e)
return []