From 362fdc57166e778dd8f08f326adbaaa6b0bf3d5d Mon Sep 17 00:00:00 2001 From: collerek Date: Mon, 13 Dec 2021 02:14:53 +0100 Subject: [PATCH] run lubimyczytac detail pages in threadpool --- cps/metadata_provider/lubimyczytac.py | 60 ++++---------------- cps/search_metadata.py | 81 +++++++++++++++------------ cps/services/Metadata.py | 45 ++++++++++++++- 3 files changed, 99 insertions(+), 87 deletions(-) diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py index ee66d1b4..1d4e18e1 100644 --- a/cps/metadata_provider/lubimyczytac.py +++ b/cps/metadata_provider/lubimyczytac.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- - # This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web) # Copyright (C) 2021 OzzieIsaacs # @@ -18,7 +17,8 @@ import datetime import json import re -from typing import Dict, Generator, List, Optional, Tuple, Union +from multiprocessing.pool import ThreadPool +from typing import Dict, List, Optional, Tuple, Union from urllib.parse import quote import requests @@ -114,13 +114,14 @@ class LubimyCzytac(Metadata): lc_parser = LubimyCzytacParser(root=root, metadata=self) matches = lc_parser.parse_search_results() if matches: - final_matches = [] - for match in matches: - response = requests.get(match.get("url")) - match = lc_parser.parse_single_book( - match=match, response=response, generic_cover=generic_cover + with ThreadPool(processes=10) as pool: + final_matches = pool.starmap( + lc_parser.parse_single_book, + [ + (match, generic_cover) + for match in matches + ], ) - final_matches.append(match) return final_matches return matches @@ -146,46 +147,6 @@ class LubimyCzytac(Metadata): return "" return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}" - @staticmethod - def get_title_tokens( - title: str, strip_joiners: bool = True - ) -> Generator[str, None, None]: - """ - Taken from calibre source code - """ - title_patterns = [ - (re.compile(pat, re.IGNORECASE), repl) - for pat, repl in [ - # Remove things like: (2010) (Omnibus) etc. - ( - r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|" - r"audiobook|audio\scd|paperback|turtleback|" - r"mass\s*market|edition|ed\.)[\])}]", - "", - ), - # Remove any strings that contain the substring edition inside - # parentheses - (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""), - # Remove commas used a separators in numbers - (r"(\d+),(\d+)", r"\1\2"), - # Remove hyphens only if they have whitespace before them - (r"(\s-)", " "), - # Replace other special chars with a space - (r"""[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”""", " "), - ] - ] - - for pat, repl in title_patterns: - title = pat.sub(repl, title) - - tokens = title.split() - for token in tokens: - token = token.strip().strip('"').strip("'") - if token and ( - not strip_joiners or token.lower() not in ("a", "and", "the", "&") - ): - yield token - class LubimyCzytacParser: PAGES_TEMPLATE = "

Książka ma {0} stron(y).

" @@ -232,8 +193,9 @@ class LubimyCzytacParser: return matches def parse_single_book( - self, match: Dict, response, generic_cover: str + self, match: Dict, generic_cover: str ) -> MetaRecord: + response = requests.get(match.get("url")) self.root = fromstring(response.text) match["series"], match["series_index"] = self._parse_series() match["tags"] = self._parse_tags() diff --git a/cps/search_metadata.py b/cps/search_metadata.py index e837fe21..7d9b6e05 100644 --- a/cps/search_metadata.py +++ b/cps/search_metadata.py @@ -16,25 +16,23 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -import os -import json -import importlib -import sys -import inspect -import datetime import concurrent.futures +import importlib +import inspect +import json +import os +import sys -from flask import Blueprint, request, Response, url_for +from flask import Blueprint, Response, request, url_for from flask_login import current_user from flask_login import login_required +from sqlalchemy.exc import InvalidRequestError, OperationalError from sqlalchemy.orm.attributes import flag_modified -from sqlalchemy.exc import OperationalError, InvalidRequestError -from . import constants, logger, ub from cps.services.Metadata import Metadata +from . import constants, logger, ub - -meta = Blueprint('metadata', __name__) +meta = Blueprint("metadata", __name__) log = logger.create() @@ -42,7 +40,7 @@ new_list = list() meta_dir = os.path.join(constants.BASE_DIR, "cps", "metadata_provider") modules = os.listdir(os.path.join(constants.BASE_DIR, "cps", "metadata_provider")) for f in modules: - if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith('__init__.py'): + if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith("__init__.py"): a = os.path.basename(f)[:-3] try: importlib.import_module("cps.metadata_provider." + a) @@ -51,34 +49,46 @@ for f in modules: log.error("Import error for metadata source: {}".format(a)) pass + def list_classes(provider_list): classes = list() for element in provider_list: - for name, obj in inspect.getmembers(sys.modules["cps.metadata_provider." + element]): - if inspect.isclass(obj) and name != "Metadata" and issubclass(obj, Metadata): + for name, obj in inspect.getmembers( + sys.modules["cps.metadata_provider." + element] + ): + if ( + inspect.isclass(obj) + and name != "Metadata" + and issubclass(obj, Metadata) + ): classes.append(obj()) return classes + cl = list_classes(new_list) + @meta.route("/metadata/provider") @login_required def metadata_provider(): - active = current_user.view_settings.get('metadata', {}) + active = current_user.view_settings.get("metadata", {}) provider = list() for c in cl: ac = active.get(c.__id__, True) - provider.append({"name": c.__name__, "active": ac, "initial": ac, "id": c.__id__}) - return Response(json.dumps(provider), mimetype='application/json') + provider.append( + {"name": c.__name__, "active": ac, "initial": ac, "id": c.__id__} + ) + return Response(json.dumps(provider), mimetype="application/json") -@meta.route("/metadata/provider", methods=['POST']) -@meta.route("/metadata/provider/", methods=['POST']) + +@meta.route("/metadata/provider", methods=["POST"]) +@meta.route("/metadata/provider/", methods=["POST"]) @login_required def metadata_change_active_provider(prov_name): new_state = request.get_json() - active = current_user.view_settings.get('metadata', {}) - active[new_state['id']] = new_state['value'] - current_user.view_settings['metadata'] = active + active = current_user.view_settings.get("metadata", {}) + active[new_state["id"]] = new_state["value"] + current_user.view_settings["metadata"] = active try: try: flag_modified(current_user, "view_settings") @@ -91,27 +101,26 @@ def metadata_change_active_provider(prov_name): if "initial" in new_state and prov_name: for c in cl: if c.__id__ == prov_name: - data = c.search(new_state.get('query', "")) + data = c.search(new_state.get("query", "")) break - return Response(json.dumps(data), mimetype='application/json') + return Response(json.dumps(data), mimetype="application/json") return "" -@meta.route("/metadata/search", methods=['POST']) + +@meta.route("/metadata/search", methods=["POST"]) @login_required def metadata_search(): - query = request.form.to_dict().get('query') + query = request.form.to_dict().get("query") data = list() - active = current_user.view_settings.get('metadata', {}) + active = current_user.view_settings.get("metadata", {}) if query: - static_cover = url_for('static', filename='generic_cover.jpg') + static_cover = url_for("static", filename="generic_cover.jpg") with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: - meta = {executor.submit(c.search, query, static_cover): c for c in cl if active.get(c.__id__, True)} + meta = { + executor.submit(c.search, query, static_cover): c + for c in cl + if active.get(c.__id__, True) + } for future in concurrent.futures.as_completed(meta): data.extend(future.result()) - return Response(json.dumps(data), mimetype='application/json') - - - - - - + return Response(json.dumps(data), mimetype="application/json") diff --git a/cps/services/Metadata.py b/cps/services/Metadata.py index 17a9e38e..1464411a 100644 --- a/cps/services/Metadata.py +++ b/cps/services/Metadata.py @@ -16,7 +16,8 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . import abc -from typing import Dict, List, Optional, TypedDict, Union +import re +from typing import Dict, Generator, List, Optional, TypedDict, Union class Metadata: @@ -30,9 +31,49 @@ class Metadata: self.active = state @abc.abstractmethod - def search(self, query: str, generic_cover: str): + def search(self, query: str, generic_cover: str = ""): pass + @staticmethod + def get_title_tokens( + title: str, strip_joiners: bool = True + ) -> Generator[str, None, None]: + """ + Taken from calibre source code + """ + title_patterns = [ + (re.compile(pat, re.IGNORECASE), repl) + for pat, repl in [ + # Remove things like: (2010) (Omnibus) etc. + ( + r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|" + r"audiobook|audio\scd|paperback|turtleback|" + r"mass\s*market|edition|ed\.)[\])}]", + "", + ), + # Remove any strings that contain the substring edition inside + # parentheses + (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""), + # Remove commas used a separators in numbers + (r"(\d+),(\d+)", r"\1\2"), + # Remove hyphens only if they have whitespace before them + (r"(\s-)", " "), + # Replace other special chars with a space + (r"""[:,;!@$%^&*(){}.`~"\s\[\]/]《》「」“”""", " "), + ] + ] + + for pat, repl in title_patterns: + title = pat.sub(repl, title) + + tokens = title.split() + for token in tokens: + token = token.strip().strip('"').strip("'") + if token and ( + not strip_joiners or token.lower() not in ("a", "and", "the", "&") + ): + yield token + class MetaSourceInfo(TypedDict): id: str