run lubimyczytac detail pages in threadpool

This commit is contained in:
collerek 2021-12-13 02:14:53 +01:00
parent d55626d445
commit 362fdc5716
3 changed files with 99 additions and 87 deletions

View File

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
# This file is part of the Calibre-Web (https://github.com/janeczku/calibre-web)
# Copyright (C) 2021 OzzieIsaacs
#
@ -18,7 +17,8 @@
import datetime
import json
import re
from typing import Dict, Generator, List, Optional, Tuple, Union
from multiprocessing.pool import ThreadPool
from typing import Dict, List, Optional, Tuple, Union
from urllib.parse import quote
import requests
@ -114,13 +114,14 @@ class LubimyCzytac(Metadata):
lc_parser = LubimyCzytacParser(root=root, metadata=self)
matches = lc_parser.parse_search_results()
if matches:
final_matches = []
for match in matches:
response = requests.get(match.get("url"))
match = lc_parser.parse_single_book(
match=match, response=response, generic_cover=generic_cover
with ThreadPool(processes=10) as pool:
final_matches = pool.starmap(
lc_parser.parse_single_book,
[
(match, generic_cover)
for match in matches
],
)
final_matches.append(match)
return final_matches
return matches
@ -146,46 +147,6 @@ class LubimyCzytac(Metadata):
return ""
return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}"
@staticmethod
def get_title_tokens(
    title: str, strip_joiners: bool = True
) -> Generator[str, None, None]:
    """
    Yield the individual search tokens of a book title.

    Taken from calibre source code.

    The title is first cleaned with a fixed sequence of regex substitutions
    (edition/format markers in brackets, thousands separators, hyphens that
    follow whitespace, remaining punctuation), then split on whitespace.

    :param title: raw title string to tokenize.
    :param strip_joiners: when True, the tokens "a", "and", "the" and "&"
        (compared case-insensitively) are not yielded.
    :return: generator of non-empty tokens with surrounding quotes removed.
    """
    title_patterns = [
        (re.compile(pat, re.IGNORECASE), repl)
        for pat, repl in [
            # Remove things like: (2010) (Omnibus) etc.
            (
                r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|"
                r"audiobook|audio\scd|paperback|turtleback|"
                r"mass\s*market|edition|ed\.)[\])}]",
                "",
            ),
            # Remove any strings that contain the substring edition inside
            # parentheses
            (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""),
            # Remove commas used as separators in numbers
            (r"(\d+),(\d+)", r"\1\2"),
            # Remove hyphens only if they have whitespace before them
            (r"(\s-)", " "),
            # Replace other special chars with a space. The CJK brackets and
            # curly quotes must sit INSIDE the character class; with them
            # outside, the pattern required that literal sequence after a
            # special char and therefore never matched real titles.
            (r"""[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]""", " "),
        ]
    ]
    for pat, repl in title_patterns:
        title = pat.sub(repl, title)
    for token in title.split():
        token = token.strip().strip('"').strip("'")
        if token and (
            not strip_joiners or token.lower() not in ("a", "and", "the", "&")
        ):
            yield token
class LubimyCzytacParser:
PAGES_TEMPLATE = "<p id='strony'>Książka ma {0} stron(y).</p>"
@ -232,8 +193,9 @@ class LubimyCzytacParser:
return matches
def parse_single_book(
self, match: Dict, response, generic_cover: str
self, match: Dict, generic_cover: str
) -> MetaRecord:
response = requests.get(match.get("url"))
self.root = fromstring(response.text)
match["series"], match["series_index"] = self._parse_series()
match["tags"] = self._parse_tags()

View File

@ -16,25 +16,23 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import json
import importlib
import sys
import inspect
import datetime
import concurrent.futures
import importlib
import inspect
import json
import os
import sys
from flask import Blueprint, request, Response, url_for
from flask import Blueprint, Response, request, url_for
from flask_login import current_user
from flask_login import login_required
from sqlalchemy.exc import InvalidRequestError, OperationalError
from sqlalchemy.orm.attributes import flag_modified
from sqlalchemy.exc import OperationalError, InvalidRequestError
from . import constants, logger, ub
from cps.services.Metadata import Metadata
from . import constants, logger, ub
meta = Blueprint('metadata', __name__)
meta = Blueprint("metadata", __name__)
log = logger.create()
@ -42,7 +40,7 @@ new_list = list()
meta_dir = os.path.join(constants.BASE_DIR, "cps", "metadata_provider")
modules = os.listdir(os.path.join(constants.BASE_DIR, "cps", "metadata_provider"))
for f in modules:
if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith('__init__.py'):
if os.path.isfile(os.path.join(meta_dir, f)) and not f.endswith("__init__.py"):
a = os.path.basename(f)[:-3]
try:
importlib.import_module("cps.metadata_provider." + a)
@ -51,34 +49,46 @@ for f in modules:
log.error("Import error for metadata source: {}".format(a))
pass
def list_classes(provider_list):
    """
    Instantiate the metadata provider classes found in the given modules.

    :param provider_list: module names (without the ``cps.metadata_provider.``
        prefix); each one is looked up directly in ``sys.modules``, so it must
        already have been imported.
    :return: list holding one instance of every class in those modules that
        is a subclass of ``Metadata`` and is not itself named "Metadata".
    """
    classes = list()
    for element in provider_list:
        for name, obj in inspect.getmembers(
            sys.modules["cps.metadata_provider." + element]
        ):
            if (
                inspect.isclass(obj)
                and name != "Metadata"
                and issubclass(obj, Metadata)
            ):
                classes.append(obj())
    return classes
cl = list_classes(new_list)
@meta.route("/metadata/provider")
@login_required
def metadata_provider():
    """
    Return the known metadata providers as a JSON list.

    Each entry carries the provider's ``name`` and ``id`` plus its ``active``
    and ``initial`` flags, both taken from the current user's
    ``view_settings["metadata"]`` mapping (defaulting to True when the
    provider id is not stored there).
    """
    active = current_user.view_settings.get("metadata", {})
    provider = list()
    for c in cl:
        ac = active.get(c.__id__, True)
        provider.append(
            {"name": c.__name__, "active": ac, "initial": ac, "id": c.__id__}
        )
    return Response(json.dumps(provider), mimetype="application/json")
@meta.route("/metadata/provider", methods=['POST'])
@meta.route("/metadata/provider/<prov_name>", methods=['POST'])
@meta.route("/metadata/provider", methods=["POST"])
@meta.route("/metadata/provider/<prov_name>", methods=["POST"])
@login_required
def metadata_change_active_provider(prov_name):
new_state = request.get_json()
active = current_user.view_settings.get('metadata', {})
active[new_state['id']] = new_state['value']
current_user.view_settings['metadata'] = active
active = current_user.view_settings.get("metadata", {})
active[new_state["id"]] = new_state["value"]
current_user.view_settings["metadata"] = active
try:
try:
flag_modified(current_user, "view_settings")
@ -91,27 +101,26 @@ def metadata_change_active_provider(prov_name):
if "initial" in new_state and prov_name:
for c in cl:
if c.__id__ == prov_name:
data = c.search(new_state.get('query', ""))
data = c.search(new_state.get("query", ""))
break
return Response(json.dumps(data), mimetype='application/json')
return Response(json.dumps(data), mimetype="application/json")
return ""
@meta.route("/metadata/search", methods=["POST"])
@login_required
def metadata_search():
    """
    Run the posted ``query`` against every active metadata provider.

    The query is read from the submitted form data. Providers whose id is
    not disabled in the current user's ``view_settings["metadata"]`` are
    searched concurrently, and their results are concatenated in completion
    order. An empty or missing query yields an empty JSON list.
    """
    query = request.form.to_dict().get("query")
    data = list()
    active = current_user.view_settings.get("metadata", {})
    if query:
        static_cover = url_for("static", filename="generic_cover.jpg")
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            # NOTE: this local name shadows the module-level blueprint
            # ``meta`` inside this function only.
            meta = {
                executor.submit(c.search, query, static_cover): c
                for c in cl
                if active.get(c.__id__, True)
            }
            for future in concurrent.futures.as_completed(meta):
                data.extend(future.result())
    return Response(json.dumps(data), mimetype="application/json")

View File

@ -16,7 +16,8 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import abc
from typing import Dict, List, Optional, TypedDict, Union
import re
from typing import Dict, Generator, List, Optional, TypedDict, Union
class Metadata:
@ -30,9 +31,49 @@ class Metadata:
self.active = state
@abc.abstractmethod
def search(self, query: str, generic_cover: str):
def search(self, query: str, generic_cover: str = ""):
pass
@staticmethod
def get_title_tokens(
    title: str, strip_joiners: bool = True
) -> Generator[str, None, None]:
    """
    Yield the individual search tokens of a book title.

    Taken from calibre source code.

    The title is first cleaned with a fixed sequence of regex substitutions
    (edition/format markers in brackets, thousands separators, hyphens that
    follow whitespace, remaining punctuation), then split on whitespace.

    :param title: raw title string to tokenize.
    :param strip_joiners: when True, the tokens "a", "and", "the" and "&"
        (compared case-insensitively) are not yielded.
    :return: generator of non-empty tokens with surrounding quotes removed.
    """
    title_patterns = [
        (re.compile(pat, re.IGNORECASE), repl)
        for pat, repl in [
            # Remove things like: (2010) (Omnibus) etc.
            (
                r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|"
                r"audiobook|audio\scd|paperback|turtleback|"
                r"mass\s*market|edition|ed\.)[\])}]",
                "",
            ),
            # Remove any strings that contain the substring edition inside
            # parentheses
            (r"(?i)[({\[].*?(edition|ed.).*?[\]})]", ""),
            # Remove commas used as separators in numbers
            (r"(\d+),(\d+)", r"\1\2"),
            # Remove hyphens only if they have whitespace before them
            (r"(\s-)", " "),
            # Replace other special chars with a space. The CJK brackets and
            # curly quotes must sit INSIDE the character class; with them
            # outside, the pattern required that literal sequence after a
            # special char and therefore never matched real titles.
            (r"""[:,;!@$%^&*(){}.`~"\s\[\]/《》「」“”]""", " "),
        ]
    ]
    for pat, repl in title_patterns:
        title = pat.sub(repl, title)
    for token in title.split():
        token = token.strip().strip('"').strip("'")
        if token and (
            not strip_joiners or token.lower() not in ("a", "and", "the", "&")
        ):
            yield token
class MetaSourceInfo(TypedDict):
id: str