From 5233f78d033ca8604827177df239453294a9d5bd Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 25 Apr 2023 20:07:27 +0200 Subject: [PATCH 01/25] comments --- cps/db.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cps/db.py b/cps/db.py index 70b4105b..8eef992d 100644 --- a/cps/db.py +++ b/cps/db.py @@ -886,9 +886,16 @@ class CalibreDB: term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) q = list() + #splits search term into single words author_terms = re.split("[, ]+", term) + + #search authors for match for author_term in author_terms: q.append(Books.authors.any(func.lower(Authors.name).ilike("%" + author_term + "%"))) + + + + query = self.generate_linked_query(config.config_read_column, Books) if len(join) == 6: query = query.outerjoin(join[0], join[1]).outerjoin(join[2]).outerjoin(join[3], join[4]).outerjoin(join[5]) @@ -900,17 +907,23 @@ class CalibreDB: query = query.outerjoin(join[0]) cc = self.get_cc_columns(config, filter_config_custom_read=True) + + #search each category for exact matches with the tag filter_expression = [Books.tags.any(func.lower(Tags.name).ilike("%" + term + "%")), Books.series.any(func.lower(Series.name).ilike("%" + term + "%")), Books.authors.any(and_(*q)), Books.publishers.any(func.lower(Publishers.name).ilike("%" + term + "%")), func.lower(Books.title).ilike("%" + term + "%")] + + for c in cc: if c.datatype not in ["datetime", "rating", "bool", "int", "float"]: filter_expression.append( getattr(Books, 'custom_column_' + str(c.id)).any( func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) + + #filter out multiple languages and archived books, then return all that match at least one of filter_expression return query.filter(self.common_filters(True)).filter(or_(*filter_expression)) def get_cc_columns(self, config, filter_config_custom_read=False): From 4b2e7b883b26b163139b4815ceab5b69b7da6edf Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 26 Apr 2023 15:27:41 +0200 Subject: [PATCH 02/25] Changed the quick search behavior so that it allows title and author in the same query. Also word order does not matter anymore. +some more comments --- cps/db.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/cps/db.py b/cps/db.py index 8eef992d..644041da 100644 --- a/cps/db.py +++ b/cps/db.py @@ -887,14 +887,12 @@ class CalibreDB: self.session.connection().connection.connection.create_function("lower", 1, lcase) q = list() #splits search term into single words - author_terms = re.split("[, ]+", term) - + words = re.split("[, ]+", term) + #put the longest words first to make queries more efficient + words.sort(key=len,reverse=True) #search authors for match - for author_term in author_terms: - q.append(Books.authors.any(func.lower(Authors.name).ilike("%" + author_term + "%"))) - - - + for word in words: + q.append(Books.authors.any(func.lower(Authors.name).ilike("%" + word + "%"))) query = self.generate_linked_query(config.config_read_column, Books) if len(join) == 6: @@ -906,25 +904,29 @@ class CalibreDB: elif len(join) == 1: query = query.outerjoin(join[0]) + filter_expression=[] cc = self.get_cc_columns(config, filter_config_custom_read=True) - - #search each category for exact matches with the tag - filter_expression = [Books.tags.any(func.lower(Tags.name).ilike("%" + term + "%")), - Books.series.any(func.lower(Series.name).ilike("%" + term + "%")), - Books.authors.any(and_(*q)), - Books.publishers.any(func.lower(Publishers.name).ilike("%" + term + "%")), - func.lower(Books.title).ilike("%" + term + "%")] - - for c in cc: if c.datatype not in ["datetime", "rating", "bool", "int", "float"]: filter_expression.append( getattr(Books, 'custom_column_' + str(c.id)).any( - func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) + func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) #TODO ? + # filter out multiple languages and archived books, + results=query.filter(self.common_filters(True)) - #filter out multiple languages and archived books, then return all that match at least one of filter_expression - return query.filter(self.common_filters(True)).filter(or_(*filter_expression)) + for word in words: + filter_expression=[ + Books.tags.any(func.lower(Tags.name).ilike("%" + word + "%")), + Books.series.any(func.lower(Series.name).ilike("%" + word + "%")), + #change to or_ to allow mix of title and author in query term + Books.authors.any(or_(*q)), + Books.publishers.any(func.lower(Publishers.name).ilike("%" + word + "%")), + func.lower(Books.title).ilike("%" + word + "%") + ] + results=results.filter(or_(*filter_expression)) + + return results def get_cc_columns(self, config, filter_config_custom_read=False): tmp_cc = self.session.query(CustomColumns).filter(CustomColumns.datatype.notin_(cc_exceptions)).all() From d13d4653bebf9af2836edc1ab5900b811c2c6f3a Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 26 Apr 2023 22:04:41 +0200 Subject: [PATCH 03/25] proof of concept fuzzy matching --- cps/db.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/cps/db.py b/cps/db.py index 644041da..53fa422c 100644 --- a/cps/db.py +++ b/cps/db.py @@ -20,6 +20,7 @@ import os import re import json +import traceback from datetime import datetime from urllib.parse import quote import unidecode @@ -49,7 +50,7 @@ from . import logger, ub, isoLanguages from .pagination import Pagination from weakref import WeakSet - +from fuzzywuzzy.fuzz import ratio log = logger.create() @@ -885,6 +886,7 @@ class CalibreDB: def search_query(self, term, config, *join): term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) + self.session.connection().connection.connection.create_function("ratio", 2, ratio) q = list() #splits search term into single words words = re.split("[, ]+", term) @@ -915,18 +917,20 @@ class CalibreDB: # filter out multiple languages and archived books, results=query.filter(self.common_filters(True)) - for word in words: - filter_expression=[ - Books.tags.any(func.lower(Tags.name).ilike("%" + word + "%")), - Books.series.any(func.lower(Series.name).ilike("%" + word + "%")), - #change to or_ to allow mix of title and author in query term - Books.authors.any(or_(*q)), - Books.publishers.any(func.lower(Publishers.name).ilike("%" + word + "%")), - func.lower(Books.title).ilike("%" + word + "%") - ] - results=results.filter(or_(*filter_expression)) + # for word in words: + # filter_expression=[ + # Books.tags.any(func.lower(Tags.name).ilike("%" + word + "%")), + # Books.series.any(func.lower(Series.name).ilike("%" + word + "%")), + # #change to or_ to allow mix of title and author in query term + # Books.authors.any(or_(*q)), + # Books.publishers.any(func.lower(Publishers.name).ilike("%" + word + "%")), + # func.lower(Books.title).ilike("%" + word + "%") + # ] + # results=results.filter(or_(*filter_expression)) - return results + try: return results.filter(func.ratio(Books.title,term)>80) + except Exception: + print(traceback.format_exc()) def get_cc_columns(self, config, filter_config_custom_read=False): tmp_cc = self.session.query(CustomColumns).filter(CustomColumns.datatype.notin_(cc_exceptions)).all() From ada0cc477d93a8ba3119344ab3cb6fd3b5b6c81b Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 29 Apr 2023 17:11:52 +0200 Subject: [PATCH 04/25] fuzzy matching for all but cc --- cps/db.py | 32 ++++++++++++++++++-------------- requirements.txt | 8 ++++++++ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/cps/db.py b/cps/db.py index 53fa422c..9e9d70d4 100644 --- a/cps/db.py +++ b/cps/db.py @@ -50,7 +50,10 @@ from . import logger, ub, isoLanguages from .pagination import Pagination from weakref import WeakSet -from fuzzywuzzy.fuzz import ratio +from thefuzz.fuzz import partial_ratio + +# %-level, 100 means exact match +FUZZY_SEARCH_ACCURACY=80 log = logger.create() @@ -886,7 +889,7 @@ class CalibreDB: def search_query(self, term, config, *join): term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) - self.session.connection().connection.connection.create_function("ratio", 2, ratio) + self.session.connection().connection.connection.create_function("partial_ratio", 2, partial_ratio) q = list() #splits search term into single words words = re.split("[, ]+", term) @@ -894,7 +897,7 @@ class CalibreDB: words.sort(key=len,reverse=True) #search authors for match for word in words: - q.append(Books.authors.any(func.lower(Authors.name).ilike("%" + word + "%"))) + q.append(Books.authors.any(func.partial_ratio(func.lower(Authors.name),word)>=FUZZY_SEARCH_ACCURACY)) query = self.generate_linked_query(config.config_read_column, Books) if len(join) == 6: @@ -917,18 +920,19 @@ class CalibreDB: # filter out multiple languages and archived books, results=query.filter(self.common_filters(True)) - # for word in words: - # filter_expression=[ - # Books.tags.any(func.lower(Tags.name).ilike("%" + word + "%")), - # Books.series.any(func.lower(Series.name).ilike("%" + word + "%")), - # #change to or_ to allow mix of title and author in query term - # Books.authors.any(or_(*q)), - # Books.publishers.any(func.lower(Publishers.name).ilike("%" + word + "%")), - # func.lower(Books.title).ilike("%" + word + "%") - # ] - # results=results.filter(or_(*filter_expression)) + #search tags, series and titles, also add author queries + for word in words: + filter_expression=[ + Books.tags.any(func.partial_ratio(func.lower(Tags.name),word)>=FUZZY_SEARCH_ACCURACY), + Books.series.any(func.partial_ratio(func.lower(Series.name),word)>=FUZZY_SEARCH_ACCURACY), + #change to or_ to allow mix of title and author in query term + Books.authors.any(or_(*q)), + Books.publishers.any(func.partial_ratio(func.lower(Publishers.name),word)>=FUZZY_SEARCH_ACCURACY), + func.partial_ratio(func.lower(Books.title),word)>=FUZZY_SEARCH_ACCURACY + ] + results=results.filter(or_(*filter_expression)) - try: return results.filter(func.ratio(Books.title,term)>80) + try: return results except Exception: print(traceback.format_exc()) diff --git a/requirements.txt b/requirements.txt index f0cd81c0..2c6a859b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,11 @@ flask-wtf>=0.14.2,<1.2.0 chardet>=3.0.0,<4.1.0 advocate>=1.0.0,<1.1.0 Flask-Limiter>=2.3.0,<3.4.0 + +thefuzz~=0.19.0 +MarkupSafe~=2.1.1 +Jinja2~=3.1.2 +Levenshtein~=0.21.0 +greenlet~=1.1.3 +cryptography~=38.0.1 +setuptools~=57.0.0 From 8e8c9a14a8b01fba943cacd234b8a918d8945c23 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 29 Apr 2023 17:11:52 +0200 Subject: [PATCH 05/25] fuzzy matching for all categories but cc --- cps/db.py | 32 ++++++++++++++++++-------------- requirements.txt | 8 ++++++++ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/cps/db.py b/cps/db.py index 53fa422c..9e9d70d4 100644 --- a/cps/db.py +++ b/cps/db.py @@ -50,7 +50,10 @@ from . import logger, ub, isoLanguages from .pagination import Pagination from weakref import WeakSet -from fuzzywuzzy.fuzz import ratio +from thefuzz.fuzz import partial_ratio + +# %-level, 100 means exact match +FUZZY_SEARCH_ACCURACY=80 log = logger.create() @@ -886,7 +889,7 @@ class CalibreDB: def search_query(self, term, config, *join): term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) - self.session.connection().connection.connection.create_function("ratio", 2, ratio) + self.session.connection().connection.connection.create_function("partial_ratio", 2, partial_ratio) q = list() #splits search term into single words words = re.split("[, ]+", term) @@ -894,7 +897,7 @@ class CalibreDB: words.sort(key=len,reverse=True) #search authors for match for word in words: - q.append(Books.authors.any(func.lower(Authors.name).ilike("%" + word + "%"))) + q.append(Books.authors.any(func.partial_ratio(func.lower(Authors.name),word)>=FUZZY_SEARCH_ACCURACY)) query = self.generate_linked_query(config.config_read_column, Books) if len(join) == 6: @@ -917,18 +920,19 @@ class CalibreDB: # filter out multiple languages and archived books, results=query.filter(self.common_filters(True)) - # for word in words: - # filter_expression=[ - # Books.tags.any(func.lower(Tags.name).ilike("%" + word + "%")), - # Books.series.any(func.lower(Series.name).ilike("%" + word + "%")), - # #change to or_ to allow mix of title and author in query term - # Books.authors.any(or_(*q)), - # Books.publishers.any(func.lower(Publishers.name).ilike("%" + word + "%")), - # func.lower(Books.title).ilike("%" + word + "%") - # ] - # results=results.filter(or_(*filter_expression)) + #search tags, series and titles, also add author queries + for word in words: + filter_expression=[ + Books.tags.any(func.partial_ratio(func.lower(Tags.name),word)>=FUZZY_SEARCH_ACCURACY), + Books.series.any(func.partial_ratio(func.lower(Series.name),word)>=FUZZY_SEARCH_ACCURACY), + #change to or_ to allow mix of title and author in query term + Books.authors.any(or_(*q)), + Books.publishers.any(func.partial_ratio(func.lower(Publishers.name),word)>=FUZZY_SEARCH_ACCURACY), + func.partial_ratio(func.lower(Books.title),word)>=FUZZY_SEARCH_ACCURACY + ] + results=results.filter(or_(*filter_expression)) - try: return results.filter(func.ratio(Books.title,term)>80) + try: return results except Exception: print(traceback.format_exc()) diff --git a/requirements.txt b/requirements.txt index f0cd81c0..2c6a859b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,11 @@ flask-wtf>=0.14.2,<1.2.0 chardet>=3.0.0,<4.1.0 advocate>=1.0.0,<1.1.0 Flask-Limiter>=2.3.0,<3.4.0 + +thefuzz~=0.19.0 +MarkupSafe~=2.1.1 +Jinja2~=3.1.2 +Levenshtein~=0.21.0 +greenlet~=1.1.3 +cryptography~=38.0.1 +setuptools~=57.0.0 From 61f1e20489e805294b2631a99558faeb4657f64b Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 29 Apr 2023 17:11:52 +0200 Subject: [PATCH 06/25] fuzzy matching for all categories but cc --- cps/db.py | 32 ++++++++++++++++++-------------- requirements.txt | 8 ++++++++ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/cps/db.py b/cps/db.py index 53fa422c..9e9d70d4 100644 --- a/cps/db.py +++ b/cps/db.py @@ -50,7 +50,10 @@ from . import logger, ub, isoLanguages from .pagination import Pagination from weakref import WeakSet -from fuzzywuzzy.fuzz import ratio +from thefuzz.fuzz import partial_ratio + +# %-level, 100 means exact match +FUZZY_SEARCH_ACCURACY=80 log = logger.create() @@ -886,7 +889,7 @@ class CalibreDB: def search_query(self, term, config, *join): term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) - self.session.connection().connection.connection.create_function("ratio", 2, ratio) + self.session.connection().connection.connection.create_function("partial_ratio", 2, partial_ratio) q = list() #splits search term into single words words = re.split("[, ]+", term) @@ -894,7 +897,7 @@ class CalibreDB: words.sort(key=len,reverse=True) #search authors for match for word in words: - q.append(Books.authors.any(func.lower(Authors.name).ilike("%" + word + "%"))) + q.append(Books.authors.any(func.partial_ratio(func.lower(Authors.name),word)>=FUZZY_SEARCH_ACCURACY)) query = self.generate_linked_query(config.config_read_column, Books) if len(join) == 6: @@ -917,18 +920,19 @@ class CalibreDB: # filter out multiple languages and archived books, results=query.filter(self.common_filters(True)) - # for word in words: - # filter_expression=[ - # Books.tags.any(func.lower(Tags.name).ilike("%" + word + "%")), - # Books.series.any(func.lower(Series.name).ilike("%" + word + "%")), - # #change to or_ to allow mix of title and author in query term - # Books.authors.any(or_(*q)), - # Books.publishers.any(func.lower(Publishers.name).ilike("%" + word + "%")), - # func.lower(Books.title).ilike("%" + word + "%") - # ] - # results=results.filter(or_(*filter_expression)) + #search tags, series and titles, also add author queries + for word in words: + filter_expression=[ + Books.tags.any(func.partial_ratio(func.lower(Tags.name),word)>=FUZZY_SEARCH_ACCURACY), + Books.series.any(func.partial_ratio(func.lower(Series.name),word)>=FUZZY_SEARCH_ACCURACY), + #change to or_ to allow mix of title and author in query term + Books.authors.any(or_(*q)), + Books.publishers.any(func.partial_ratio(func.lower(Publishers.name),word)>=FUZZY_SEARCH_ACCURACY), + func.partial_ratio(func.lower(Books.title),word)>=FUZZY_SEARCH_ACCURACY + ] + results=results.filter(or_(*filter_expression)) - try: return results.filter(func.ratio(Books.title,term)>80) + try: return results except Exception: print(traceback.format_exc()) diff --git a/requirements.txt b/requirements.txt index f0cd81c0..2c6a859b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,11 @@ flask-wtf>=0.14.2,<1.2.0 chardet>=3.0.0,<4.1.0 advocate>=1.0.0,<1.1.0 Flask-Limiter>=2.3.0,<3.4.0 + +thefuzz~=0.19.0 +MarkupSafe~=2.1.1 +Jinja2~=3.1.2 +Levenshtein~=0.21.0 +greenlet~=1.1.3 +cryptography~=38.0.1 +setuptools~=57.0.0 From f497cc0b031744a0031e456c2d91005c7893c533 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 29 Apr 2023 17:47:04 +0200 Subject: [PATCH 07/25] removed todo --- cps/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cps/db.py b/cps/db.py index 9e9d70d4..35d9d6a2 100644 --- a/cps/db.py +++ b/cps/db.py @@ -916,7 +916,7 @@ class CalibreDB: filter_expression.append( getattr(Books, 'custom_column_' + str(c.id)).any( - func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) #TODO ? + func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) # filter out multiple languages and archived books, results=query.filter(self.common_filters(True)) From 2e3c93b9e4d299b7c53dd1b7b9f781df139b1b68 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 29 Apr 2023 17:47:04 +0200 Subject: [PATCH 08/25] removed debugging help --- cps/db.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cps/db.py b/cps/db.py index 9e9d70d4..a4c6633c 100644 --- a/cps/db.py +++ b/cps/db.py @@ -916,7 +916,7 @@ class CalibreDB: filter_expression.append( getattr(Books, 'custom_column_' + str(c.id)).any( - func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) #TODO ? + func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) # filter out multiple languages and archived books, results=query.filter(self.common_filters(True)) @@ -932,9 +932,7 @@ class CalibreDB: ] results=results.filter(or_(*filter_expression)) - try: return results - except Exception: - print(traceback.format_exc()) + return results def get_cc_columns(self, config, filter_config_custom_read=False): tmp_cc = self.session.query(CustomColumns).filter(CustomColumns.datatype.notin_(cc_exceptions)).all() From af40feee861b6981efe9cd4869c36df853686323 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 3 May 2023 19:43:25 +0200 Subject: [PATCH 09/25] I accidentally used pycharms auto-add-to-requirements feature which resulted in many more dependencies than needed. These are the actually relevant requirements for this pr. --- requirements.txt | 5 ----- 1 file changed, 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2c6a859b..da518c9a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,9 +20,4 @@ advocate>=1.0.0,<1.1.0 Flask-Limiter>=2.3.0,<3.4.0 thefuzz~=0.19.0 -MarkupSafe~=2.1.1 -Jinja2~=3.1.2 Levenshtein~=0.21.0 -greenlet~=1.1.3 -cryptography~=38.0.1 -setuptools~=57.0.0 From 9fc0d54fde2331fd09b7f064d2d64f89e288f772 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 9 May 2023 00:36:41 +0200 Subject: [PATCH 10/25] idea for weighted sorting --- cps/db.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cps/db.py b/cps/db.py index a4c6633c..6cbaa29f 100644 --- a/cps/db.py +++ b/cps/db.py @@ -931,7 +931,15 @@ class CalibreDB: func.partial_ratio(func.lower(Books.title),word)>=FUZZY_SEARCH_ACCURACY ] results=results.filter(or_(*filter_expression)) + #TODO sort + # score = 0 + # for word in words: + # score += max( + # attr1 % word, + # attr2 % word, + # ) + # sort by score desc return results def get_cc_columns(self, config, filter_config_custom_read=False): From 4ba3b4e4943c72b044b06c6fa66c33c5eda3103b Mon Sep 17 00:00:00 2001 From: quarz12 Date: Tue, 9 May 2023 17:35:22 +0200 Subject: [PATCH 11/25] typing for query --- cps/db.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cps/db.py b/cps/db.py index 6cbaa29f..a29f7912 100644 --- a/cps/db.py +++ b/cps/db.py @@ -23,6 +23,8 @@ import json import traceback from datetime import datetime from urllib.parse import quote + +import sqlalchemy import unidecode from sqlite3 import OperationalError as sqliteOperationalError @@ -918,7 +920,7 @@ class CalibreDB: 'custom_column_' + str(c.id)).any( func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) # filter out multiple languages and archived books, - results=query.filter(self.common_filters(True)) + results:sqlalchemy.orm.Query=query.filter(self.common_filters(True)) #search tags, series and titles, also add author queries for word in words: @@ -933,6 +935,10 @@ class CalibreDB: results=results.filter(or_(*filter_expression)) #TODO sort + results.order_by(lambda Book:Book.title+Book.tags+Book.authors) + # v1 + # results.order_by(desc(lambda Book:levenshtein(Book.title+Book.tags+Book.authors,term))) + # v2 # score = 0 # for word in words: # score += max( From 932c7968ce28f2acf13865198c6cbcb75cb18760 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 10 May 2023 18:55:40 +0200 Subject: [PATCH 12/25] fixed an issue where the lowering and stripping of the search term was not saved --- cps/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cps/db.py b/cps/db.py index a4c6633c..1cf11216 100644 --- a/cps/db.py +++ b/cps/db.py @@ -887,7 +887,7 @@ class CalibreDB: .filter(and_(Books.authors.any(and_(*q)), func.lower(Books.title).ilike("%" + title + "%"))).first() def search_query(self, term, config, *join): - term.strip().lower() + term=term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) self.session.connection().connection.connection.create_function("partial_ratio", 2, partial_ratio) q = list() From 97c94f2117c5374472a5b51d82a775ed76e3bb0f Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 11 May 2023 01:34:01 +0200 Subject: [PATCH 13/25] moved sorting back to original place --- cps/db.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cps/db.py b/cps/db.py index a29f7912..57e3f283 100644 --- a/cps/db.py +++ b/cps/db.py @@ -40,6 +40,7 @@ try: from sqlalchemy.orm import declarative_base except ImportError: from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy import desc,asc from sqlalchemy.pool import StaticPool from sqlalchemy.sql.expression import and_, true, false, text, func, or_ from sqlalchemy.ext.associationproxy import association_proxy @@ -585,7 +586,7 @@ class CalibreDB: return False, False try: check_engine = create_engine('sqlite://', - echo=False, + echo=True, isolation_level="SERIALIZABLE", connect_args={'check_same_thread': False}, poolclass=StaticPool) @@ -935,7 +936,6 @@ class CalibreDB: results=results.filter(or_(*filter_expression)) #TODO sort - results.order_by(lambda Book:Book.title+Book.tags+Book.authors) # v1 # results.order_by(desc(lambda Book:levenshtein(Book.title+Book.tags+Book.authors,term))) # v2 @@ -966,9 +966,12 @@ class CalibreDB: # read search results from calibre-database and return it (function is used for feed and simple search def get_search_results(self, term, config, offset=None, order=None, limit=None, *join): + self.session.connection().connection.connection.create_function("partial_ratio", 2, partial_ratio) + self.session.connection().connection.connection.create_function("sort", 1, lambda tags :print(f"") or 3) order = order[0] if order else [Books.sort] pagination = None - result = self.search_query(term, config, *join).order_by(*order).all() + result = self.search_query(term, config, *join).order_by(*order).all()#*order + #result = self.search_query(term, config, *join).order_by(desc(func.sort(Books.tags))).all()#*order result_count = len(result) if offset != None and limit != None: offset = int(offset) From 45d8d637839c1fe2985016206e4d85a63e5800ae Mon Sep 17 00:00:00 2001 From: quarz12 Date: Thu, 11 May 2023 16:17:59 +0200 Subject: [PATCH 14/25] sort using only authorsort and title --- cps/db.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/cps/db.py b/cps/db.py index 57e3f283..3204f94f 100644 --- a/cps/db.py +++ b/cps/db.py @@ -889,7 +889,7 @@ class CalibreDB: return self.session.query(Books) \ .filter(and_(Books.authors.any(and_(*q)), func.lower(Books.title).ilike("%" + title + "%"))).first() - def search_query(self, term, config, *join): + def search_query(self, term, config, *join)->sqlalchemy.orm.Query: term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) self.session.connection().connection.connection.create_function("partial_ratio", 2, partial_ratio) @@ -921,7 +921,7 @@ class CalibreDB: 'custom_column_' + str(c.id)).any( func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) # filter out multiple languages and archived books, - results:sqlalchemy.orm.Query=query.filter(self.common_filters(True)) + results=query.filter(self.common_filters(True)) #search tags, series and titles, also add author queries for word in words: @@ -934,18 +934,6 @@ class CalibreDB: func.partial_ratio(func.lower(Books.title),word)>=FUZZY_SEARCH_ACCURACY ] results=results.filter(or_(*filter_expression)) - #TODO sort - - # v1 - # results.order_by(desc(lambda Book:levenshtein(Book.title+Book.tags+Book.authors,term))) - # v2 - # score = 0 - # for word in words: - # score += max( - # attr1 % word, - # attr2 % word, - # ) - # sort by score desc return results def get_cc_columns(self, config, filter_config_custom_read=False): @@ -970,7 +958,11 @@ class CalibreDB: self.session.connection().connection.connection.create_function("sort", 1, lambda tags :print(f"") or 3) order = order[0] if order else [Books.sort] pagination = None - result = self.search_query(term, config, *join).order_by(*order).all()#*order + #result = self.search_query(term, config, *join).order_by(*order).all()#*order + result = self.search_query(term, config, *join).order_by(desc(func.partial_ratio(Books.title+" "+Books.author_sort,term))).all() + for row in result: + print(row) + #result = self.search_query(term, config, *join).order_by(desc(func.sort(Books.tags))).all()#*order result_count = len(result) if offset != None and limit != None: From c115fe92954484268f648f8d17346a1dd6c718a5 Mon Sep 17 00:00:00 2001 From: quarz12 Date: Thu, 11 May 2023 16:21:17 +0200 Subject: [PATCH 15/25] use partial token set ratio instead --- cps/db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cps/db.py b/cps/db.py index 3204f94f..c87c1f07 100644 --- a/cps/db.py +++ b/cps/db.py @@ -53,7 +53,7 @@ from . import logger, ub, isoLanguages from .pagination import Pagination from weakref import WeakSet -from thefuzz.fuzz import partial_ratio +from thefuzz.fuzz import partial_ratio, partial_token_set_ratio # %-level, 100 means exact match FUZZY_SEARCH_ACCURACY=80 @@ -954,7 +954,7 @@ class CalibreDB: # read search results from calibre-database and return it (function is used for feed and simple search def get_search_results(self, term, config, offset=None, order=None, limit=None, *join): - self.session.connection().connection.connection.create_function("partial_ratio", 2, partial_ratio) + self.session.connection().connection.connection.create_function("partial_token_set_ratio", 2, partial_token_set_ratio) self.session.connection().connection.connection.create_function("sort", 1, lambda tags :print(f"") or 3) order = order[0] if order else [Books.sort] pagination = None From 086527f5eebf65cc32ba4f889c9f1e7e77a3cbf6 Mon Sep 17 00:00:00 2001 From: quarz12 Date: Thu, 11 May 2023 16:43:58 +0200 Subject: [PATCH 16/25] test at home --- cps/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cps/db.py b/cps/db.py index c87c1f07..eee5b841 100644 --- a/cps/db.py +++ b/cps/db.py @@ -959,7 +959,7 @@ class CalibreDB: order = order[0] if order else [Books.sort] pagination = None #result = self.search_query(term, config, *join).order_by(*order).all()#*order - result = self.search_query(term, config, *join).order_by(desc(func.partial_ratio(Books.title+" "+Books.author_sort,term))).all() + result = self.search_query(term, config, *join).order_by(desc(func.partial_ratio(Books.title.name+" "+Books.author_sort.name+" "+Books.tags.get(),term))).all() for row in result: print(row) From ad5313ee79cbfa2c393568b13f485d7edf0ddb1b Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 15 May 2023 18:17:47 +0200 Subject: [PATCH 17/25] new idea --- cps/db.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cps/db.py b/cps/db.py index eee5b841..133c4b50 100644 --- a/cps/db.py +++ b/cps/db.py @@ -954,12 +954,10 @@ class CalibreDB: # read search results from calibre-database and return it (function is used for feed and simple search def get_search_results(self, term, config, offset=None, order=None, limit=None, *join): - self.session.connection().connection.connection.create_function("partial_token_set_ratio", 2, partial_token_set_ratio) - self.session.connection().connection.connection.create_function("sort", 1, lambda tags :print(f"") or 3) order = order[0] if order else [Books.sort] pagination = None - #result = self.search_query(term, config, *join).order_by(*order).all()#*order - result = self.search_query(term, config, *join).order_by(desc(func.partial_ratio(Books.title.name+" "+Books.author_sort.name+" "+Books.tags.get(),term))).all() + result = self.search_query(term, config, *join).order_by(*order).all() + #sort here for row in result: print(row) From e45619f2268cfffb218f5cd0c83206f7ce052133 Mon Sep 17 00:00:00 2001 From: quarz12 Date: Wed, 17 May 2023 11:11:14 +0200 Subject: [PATCH 18/25] progress building string of book --- cps/db.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/cps/db.py b/cps/db.py index 133c4b50..f77a30d0 100644 --- a/cps/db.py +++ b/cps/db.py @@ -23,8 +23,6 @@ import json import traceback from datetime import datetime from urllib.parse import quote - -import sqlalchemy import unidecode from sqlite3 import OperationalError as sqliteOperationalError @@ -40,7 +38,6 @@ try: from sqlalchemy.orm import declarative_base except ImportError: from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import desc,asc from sqlalchemy.pool import StaticPool from sqlalchemy.sql.expression import and_, true, false, text, func, or_ from sqlalchemy.ext.associationproxy import association_proxy @@ -384,9 +381,11 @@ class Books(Base): self.has_cover = (has_cover != None) def __repr__(self): - return "".format(self.title, self.sort, self.author_sort, + return "".format(self.title, self.sort, self.author_sort, self.timestamp, self.pubdate, self.series_index, - self.last_modified, self.path, self.has_cover) + self.last_modified, self.path, self.has_cover, + [tag.name for tag in self.tags], + [series.name for series in self.series]) @property def atom_timestamp(self): @@ -586,7 +585,7 @@ class CalibreDB: return False, False try: check_engine = create_engine('sqlite://', - echo=True, + echo=False, isolation_level="SERIALIZABLE", connect_args={'check_same_thread': False}, poolclass=StaticPool) @@ -889,8 +888,8 @@ class CalibreDB: return self.session.query(Books) \ .filter(and_(Books.authors.any(and_(*q)), func.lower(Books.title).ilike("%" + title + "%"))).first() - def search_query(self, term, config, *join)->sqlalchemy.orm.Query: - term.strip().lower() + def search_query(self, term, config, *join): + term=term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) self.session.connection().connection.connection.create_function("partial_ratio", 2, partial_ratio) q = list() @@ -957,11 +956,10 @@ class CalibreDB: order = order[0] if order else [Books.sort] pagination = None result = self.search_query(term, config, *join).order_by(*order).all() - #sort here - for row in result: - print(row) - - #result = self.search_query(term, config, *join).order_by(desc(func.sort(Books.tags))).all()#*order + sorted(result,key=lambda book:1) + for res in result: + print(res[0]) + print(f"{res[0].title} {[tag.name for tag in res[0].tags]} {[series.name for series in res[0].series]}") result_count = len(result) if offset != None and limit != None: offset = int(offset) From a936a333a8de4ea73c3e8a78334ed550bf218f51 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 27 May 2023 15:55:54 +0200 Subject: [PATCH 19/25] updated Books string repr --- cps/db.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cps/db.py b/cps/db.py index f77a30d0..4f7dfaab 100644 --- a/cps/db.py +++ b/cps/db.py @@ -381,11 +381,12 @@ class Books(Base): self.has_cover = (has_cover != None) def __repr__(self): - return "".format(self.title, self.sort, self.author_sort, + return "".format(self.title, self.sort, self.author_sort, self.timestamp, self.pubdate, self.series_index, self.last_modified, self.path, self.has_cover, - [tag.name for tag in self.tags], - [series.name for series in self.series]) + " ".join([tag.name for tag in self.tags]), + " ".join([series.name for series in self.series]), " ".join([author.name for author in self.authors])," ".join([publisher.name for publisher in self.publishers])) + @property def atom_timestamp(self): @@ -959,7 +960,6 @@ class CalibreDB: sorted(result,key=lambda book:1) for res in result: print(res[0]) - print(f"{res[0].title} {[tag.name for tag in res[0].tags]} {[series.name for series in res[0].series]}") result_count = len(result) if offset != None and limit != None: offset = int(offset) From cb5e66facd9713745239117119c84dce5f080fec Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 28 May 2023 13:11:50 +0200 Subject: [PATCH 20/25] add partial token set ratio to db --- cps/db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cps/db.py b/cps/db.py index 4f7dfaab..7c43ea8e 100644 --- a/cps/db.py +++ b/cps/db.py @@ -954,10 +954,10 @@ class CalibreDB: # read search results from calibre-database and return it (function is used for feed and simple search def get_search_results(self, term, config, offset=None, order=None, limit=None, *join): + self.session.connection().connection.connection.create_function("partial_token_set_ratio", 2, partial_token_set_ratio) order = order[0] if order else [Books.sort] pagination = None - result = self.search_query(term, config, *join).order_by(*order).all() - sorted(result,key=lambda book:1) + result = self.search_query(term, config, *join).order_by(func.desc(func.partial_token_set_ratio(str(Books),term))).all() for res in result: print(res[0]) result_count = len(result) From 025a888906326e923465663450ebfb221074d29c Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 28 May 2023 15:01:35 +0200 Subject: [PATCH 21/25] rolled back string repr of book, moved that part to a new method --- cps/db.py | 91 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 38 deletions(-) diff --git a/cps/db.py b/cps/db.py index 7c43ea8e..fa190015 100644 --- a/cps/db.py +++ b/cps/db.py @@ -33,6 +33,7 @@ from sqlalchemy.orm import relationship, sessionmaker, scoped_session from sqlalchemy.orm.collections import InstrumentedList from sqlalchemy.ext.declarative import DeclarativeMeta from sqlalchemy.exc import OperationalError + try: # Compatibility with sqlalchemy 2.0 from sqlalchemy.orm import declarative_base @@ -41,6 +42,7 @@ except ImportError: from sqlalchemy.pool import StaticPool from sqlalchemy.sql.expression import and_, true, false, text, func, or_ from sqlalchemy.ext.associationproxy import association_proxy +from sqlalchemy import desc from flask_login import current_user from flask_babel import gettext as _ from flask_babel import get_locale @@ -53,7 +55,7 @@ from weakref import WeakSet from thefuzz.fuzz import partial_ratio, partial_token_set_ratio # %-level, 100 means exact match -FUZZY_SEARCH_ACCURACY=80 +FUZZY_SEARCH_ACCURACY = 80 log = logger.create() @@ -381,12 +383,21 @@ class Books(Base): self.has_cover = (has_cover != None) def __repr__(self): - return "".format(self.title, self.sort, self.author_sort, - self.timestamp, self.pubdate, self.series_index, - self.last_modified, self.path, self.has_cover, - " ".join([tag.name for tag in self.tags]), - " ".join([series.name for series in self.series]), " ".join([author.name for author in self.authors])," ".join([publisher.name for publisher in self.publishers])) + return "".format(self.title, self.sort, self.author_sort, + self.timestamp, self.pubdate, self.series_index, + self.last_modified, self.path, self.has_cover) + def __sort_str(self): + return "{0} {1} {2} {3} {4}".format(self.title, " ".join([tag.name for tag in self.tags]), + " ".join( + [series.name for series + in self.series]), + " ".join( + [author.name for author + in self.authors]), + " ".join([publisher.name for + publisher in + self.publishers])) @property def atom_timestamp(self): @@ -428,13 +439,15 @@ class CustomColumns(Base): content['category_sort'] = "value" content['is_csp'] = False content['is_editable'] = self.editable - content['rec_index'] = sequence + 22 # toDo why ?? + content['rec_index'] = sequence + 22 # toDo why ?? if isinstance(value, datetime): - content['#value#'] = {"__class__": "datetime.datetime", "__value__": value.strftime("%Y-%m-%dT%H:%M:%S+00:00")} + content['#value#'] = {"__class__": "datetime.datetime", + "__value__": value.strftime("%Y-%m-%dT%H:%M:%S+00:00")} else: content['#value#'] = value content['#extra#'] = extra - content['is_multiple2'] = {} if not self.is_multiple else {"cache_to_list": "|", "ui_to_list": ",", "list_to_ui": ", "} + content['is_multiple2'] = {} if not self.is_multiple else {"cache_to_list": "|", "ui_to_list": ",", + "list_to_ui": ", "} return json.dumps(content, ensure_ascii=False) @@ -455,7 +468,7 @@ class AlchemyEncoder(json.JSONEncoder): el = list() # ele = None for ele in data: - if hasattr(ele, 'value'): # converter for custom_column values + if hasattr(ele, 'value'): # converter for custom_column values el.append(str(ele.value)) elif ele.get: el.append(ele.get()) @@ -494,7 +507,6 @@ class CalibreDB: if init: self.init_db(expire_on_commit) - def init_db(self, expire_on_commit=True): if self._init: self.init_session(expire_on_commit) @@ -666,13 +678,13 @@ class CalibreDB: if not read_column: bd = (self.session.query(Books, ub.ReadBook.read_status, ub.ArchivedBook.is_archived).select_from(Books) .join(ub.ReadBook, and_(ub.ReadBook.user_id == int(current_user.id), ub.ReadBook.book_id == book_id), - isouter=True)) + isouter=True)) else: try: read_column = cc_classes[read_column] bd = (self.session.query(Books, read_column.value, ub.ArchivedBook.is_archived).select_from(Books) .join(read_column, read_column.book == book_id, - isouter=True)) + isouter=True)) except (KeyError, AttributeError, IndexError): log.error("Custom Column No.{} does not exist in calibre database".format(read_column)) # Skip linking read column and return None instead of read status @@ -725,11 +737,11 @@ class CalibreDB: pos_cc_list = current_user.allowed_column_value.split(',') pos_content_cc_filter = true() if pos_cc_list == [''] else \ getattr(Books, 'custom_column_' + str(self.config.config_restricted_column)). \ - any(cc_classes[self.config.config_restricted_column].value.in_(pos_cc_list)) + any(cc_classes[self.config.config_restricted_column].value.in_(pos_cc_list)) neg_cc_list = current_user.denied_column_value.split(',') neg_content_cc_filter = false() if neg_cc_list == [''] else \ getattr(Books, 'custom_column_' + str(self.config.config_restricted_column)). \ - any(cc_classes[self.config.config_restricted_column].value.in_(neg_cc_list)) + any(cc_classes[self.config.config_restricted_column].value.in_(neg_cc_list)) except (KeyError, AttributeError, IndexError): pos_content_cc_filter = false() neg_content_cc_filter = true() @@ -809,18 +821,18 @@ class CalibreDB: element = 0 while indx: if indx >= 3: - query = query.outerjoin(join[element], join[element+1]).outerjoin(join[element+2]) + query = query.outerjoin(join[element], join[element + 1]).outerjoin(join[element + 2]) indx -= 3 element += 3 elif indx == 2: - query = query.outerjoin(join[element], join[element+1]) + query = query.outerjoin(join[element], join[element + 1]) indx -= 2 element += 2 elif indx == 1: query = query.outerjoin(join[element]) indx -= 1 element += 1 - query = query.filter(db_filter)\ + query = query.filter(db_filter) \ .filter(self.common_filters(allow_show_archived)) entries = list() pagination = list() @@ -890,17 +902,17 @@ class CalibreDB: .filter(and_(Books.authors.any(and_(*q)), func.lower(Books.title).ilike("%" + title + "%"))).first() def search_query(self, term, config, *join): - term=term.strip().lower() + term = term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) self.session.connection().connection.connection.create_function("partial_ratio", 2, partial_ratio) q = list() - #splits search term into single words + # splits search term into single words words = re.split("[, ]+", term) - #put the longest words first to make queries more efficient - words.sort(key=len,reverse=True) - #search authors for match + # put the longest words first to make queries more efficient + words.sort(key=len, reverse=True) + # search authors for match for word in words: - q.append(Books.authors.any(func.partial_ratio(func.lower(Authors.name),word)>=FUZZY_SEARCH_ACCURACY)) + q.append(Books.authors.any(func.partial_ratio(func.lower(Authors.name), word) >= FUZZY_SEARCH_ACCURACY)) query = self.generate_linked_query(config.config_read_column, Books) if len(join) == 6: @@ -912,7 +924,7 @@ class CalibreDB: elif len(join) == 1: query = query.outerjoin(join[0]) - filter_expression=[] + filter_expression = [] cc = self.get_cc_columns(config, filter_config_custom_read=True) for c in cc: if c.datatype not in ["datetime", "rating", "bool", "int", "float"]: @@ -921,19 +933,19 @@ class CalibreDB: 'custom_column_' + str(c.id)).any( func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) # filter out multiple languages and archived books, - results=query.filter(self.common_filters(True)) + results = query.filter(self.common_filters(True)) - #search tags, series and titles, also add author queries + # search tags, series and titles, also add author queries for word in words: - filter_expression=[ - Books.tags.any(func.partial_ratio(func.lower(Tags.name),word)>=FUZZY_SEARCH_ACCURACY), - Books.series.any(func.partial_ratio(func.lower(Series.name),word)>=FUZZY_SEARCH_ACCURACY), - #change to or_ to allow mix of title and author in query term + filter_expression = [ + Books.tags.any(func.partial_ratio(func.lower(Tags.name), word) >= FUZZY_SEARCH_ACCURACY), + Books.series.any(func.partial_ratio(func.lower(Series.name), word) >= FUZZY_SEARCH_ACCURACY), + # change to or_ to allow mix of title and author in query term Books.authors.any(or_(*q)), - Books.publishers.any(func.partial_ratio(func.lower(Publishers.name),word)>=FUZZY_SEARCH_ACCURACY), - func.partial_ratio(func.lower(Books.title),word)>=FUZZY_SEARCH_ACCURACY + Books.publishers.any(func.partial_ratio(func.lower(Publishers.name), word) >= FUZZY_SEARCH_ACCURACY), + func.partial_ratio(func.lower(Books.title), word) >= FUZZY_SEARCH_ACCURACY ] - results=results.filter(or_(*filter_expression)) + results = results.filter(or_(*filter_expression)) return results def get_cc_columns(self, config, filter_config_custom_read=False): @@ -954,10 +966,12 @@ class CalibreDB: # read search results from calibre-database and return it (function is used for feed and simple search def get_search_results(self, term, config, offset=None, order=None, limit=None, *join): - self.session.connection().connection.connection.create_function("partial_token_set_ratio", 2, partial_token_set_ratio) + self.session.connection().connection.connection.create_function("partial_token_set_ratio", 2, + partial_token_set_ratio) order = order[0] if order else [Books.sort] pagination = None - result = self.search_query(term, config, *join).order_by(func.desc(func.partial_token_set_ratio(str(Books),term))).all() + result = self.search_query(term, config, *join).order_by( + desc(func.partial_token_set_ratio(str(Books), term))).all() for res in result: print(res[0]) result_count = len(result) @@ -979,8 +993,8 @@ class CalibreDB: if with_count: if not languages: - languages = self.session.query(Languages, func.count('books_languages_link.book'))\ - .join(books_languages_link).join(Books)\ + languages = self.session.query(Languages, func.count('books_languages_link.book')) \ + .join(books_languages_link).join(Books) \ .filter(self.common_filters(return_all_languages=return_all_languages)) \ .group_by(text('books_languages_link.lang_code')).all() tags = list() @@ -1090,6 +1104,7 @@ class Category: self.rating = rating self.count = 1 + '''class Count: count = None From caf6079b6bcd29f27a87f3c9e0fa6322d7b8bb0d Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 28 May 2023 22:04:41 +0200 Subject: [PATCH 22/25] moved author filter to the rest of the filters, ignore words smaller than 4 letters for searching, introduced max_ratio at new filter function --- cps/db.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cps/db.py b/cps/db.py index fa190015..fef610a7 100644 --- a/cps/db.py +++ b/cps/db.py @@ -52,7 +52,7 @@ from . import logger, ub, isoLanguages from .pagination import Pagination from weakref import WeakSet -from thefuzz.fuzz import partial_ratio, partial_token_set_ratio +from thefuzz.fuzz import partial_ratio, partial_token_set_ratio, partial_token_sort_ratio, ratio # %-level, 100 means exact match FUZZY_SEARCH_ACCURACY = 80 @@ -387,7 +387,7 @@ class Books(Base): self.timestamp, self.pubdate, self.series_index, self.last_modified, self.path, self.has_cover) - def __sort_str(self): + def __str__(self): return "{0} {1} {2} {3} {4}".format(self.title, " ".join([tag.name for tag in self.tags]), " ".join( [series.name for series @@ -904,15 +904,13 @@ class CalibreDB: def search_query(self, term, config, *join): term = term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) - self.session.connection().connection.connection.create_function("partial_ratio", 2, partial_ratio) + self.session.connection().connection.connection.create_function("max_ratio", 2, max_ratio) q = list() # splits search term into single words words = re.split("[, ]+", term) # put the longest words first to make queries more efficient words.sort(key=len, reverse=True) - # search authors for match - for word in words: - q.append(Books.authors.any(func.partial_ratio(func.lower(Authors.name), word) >= FUZZY_SEARCH_ACCURACY)) + words=[x for x in filter(lambda w:len(w)>3,words)] query = self.generate_linked_query(config.config_read_column, Books) if len(join) == 6: @@ -934,18 +932,17 @@ class CalibreDB: func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) # filter out multiple languages and archived books, results = query.filter(self.common_filters(True)) - + filters=[filter_expression] if filter_expression else [] # search tags, series and titles, also add author queries for word in words: - filter_expression = [ - Books.tags.any(func.partial_ratio(func.lower(Tags.name), word) >= FUZZY_SEARCH_ACCURACY), - Books.series.any(func.partial_ratio(func.lower(Series.name), word) >= FUZZY_SEARCH_ACCURACY), - # change to or_ to allow mix of title and author in query term - Books.authors.any(or_(*q)), - Books.publishers.any(func.partial_ratio(func.lower(Publishers.name), word) >= FUZZY_SEARCH_ACCURACY), - func.partial_ratio(func.lower(Books.title), word) >= FUZZY_SEARCH_ACCURACY - ] - results = results.filter(or_(*filter_expression)) + filters.append(or_(*[ + Books.tags.any(func.max_ratio(func.lower(Tags.name), word) >= FUZZY_SEARCH_ACCURACY), + Books.series.any(func.max_ratio(func.lower(Series.name), word) >= FUZZY_SEARCH_ACCURACY), + Books.authors.any(func.max_ratio(func.lower(Authors.name), word) >= FUZZY_SEARCH_ACCURACY), + Books.publishers.any(func.max_ratio(func.lower(Publishers.name), word) >= FUZZY_SEARCH_ACCURACY), + func.max_ratio(func.lower(Books.title), word) >= FUZZY_SEARCH_ACCURACY + ])) + results = results.filter(and_(*filters)) return results def get_cc_columns(self, config, filter_config_custom_read=False): @@ -966,14 +963,12 @@ class CalibreDB: # read search results from calibre-database and return it (function is used for feed and simple search def get_search_results(self, term, config, offset=None, order=None, limit=None, *join): - self.session.connection().connection.connection.create_function("partial_token_set_ratio", 2, - partial_token_set_ratio) order = order[0] if order else [Books.sort] pagination = None - result = self.search_query(term, config, *join).order_by( - desc(func.partial_token_set_ratio(str(Books), term))).all() + result = self.search_query(term, config, *join).order_by(*order).all() + result = sorted(result,key=lambda query:partial_token_sort_ratio(str(query[0]),term),reverse=True) for res in result: - print(res[0]) + print(str(res[0])) result_count = len(result) if offset != None and limit != None: offset = int(offset) @@ -1092,6 +1087,11 @@ def lcase(s): return s.lower() +def max_ratio(string:str,term): + """applies ratio on each word of string and returns the max value""" + words=string.split() + return max([ratio(word.strip(":"),term) for word in words]) + class Category: name = None id = None From b96d02c9212c9cdab59848f37a6bc5675fdf4909 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 31 May 2023 01:03:33 +0200 Subject: [PATCH 23/25] now return empty list if all words of query are < 3 letters, only compare term to words > 3 letters of book attributes --- cps/db.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cps/db.py b/cps/db.py index fef610a7..852283e0 100644 --- a/cps/db.py +++ b/cps/db.py @@ -54,8 +54,8 @@ from .pagination import Pagination from weakref import WeakSet from thefuzz.fuzz import partial_ratio, partial_token_set_ratio, partial_token_sort_ratio, ratio -# %-level, 100 means exact match -FUZZY_SEARCH_ACCURACY = 80 +# %-level, 100 means exact match, 75 allows exactly 1 wrong character in a 4 letter word +FUZZY_SEARCH_ACCURACY = 75 log = logger.create() @@ -911,6 +911,9 @@ class CalibreDB: # put the longest words first to make queries more efficient words.sort(key=len, reverse=True) words=[x for x in filter(lambda w:len(w)>3,words)] + # no word in search term is longer than 3 letters -> return empty query #TODO give some kind of error message + if not any([len(word)>3 for word in words]): + return self.session.query(Books).filter(False) query = self.generate_linked_query(config.config_read_column, Books) if len(join) == 6: @@ -1090,7 +1093,7 @@ def lcase(s): def max_ratio(string:str,term): """applies ratio on each word of string and returns the max value""" words=string.split() - return max([ratio(word.strip(":"),term) for word in words]) + return max([ratio(word.strip(":"),term) if len(word.strip(":")) > 3 else 0 for word in words]) # ignore words of len < 3#do not compare words of len < 3 -> too generic class Category: name = None From 5e0430e60e5047bf9a5217509ad026680ed316cf Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 31 May 2023 15:23:19 +0200 Subject: [PATCH 24/25] message when query returns 0 results --- cps/templates/search.html | 1 + 1 file changed, 1 insertion(+) diff --git a/cps/templates/search.html b/cps/templates/search.html index 41f52b98..b35a67c8 100644 --- a/cps/templates/search.html +++ b/cps/templates/search.html @@ -5,6 +5,7 @@ {% if entries|length < 1 %}

{{_('No Results Found')}}

{{_('Search Term:')}} {{adv_searchterm}}

+

{{_('Words smaller than 3 letters are not considered')}}

{% else %}

{{result_count}} {{_('Results for:')}} {{adv_searchterm}}

{% if current_user.is_authenticated %} From 896e8fddc3c301d90e5cedec2b587ff62b1469cc Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 3 Jun 2023 20:25:39 +0200 Subject: [PATCH 25/25] minor cleanup, removed unused code --- cps/db.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cps/db.py b/cps/db.py index 852283e0..51645346 100644 --- a/cps/db.py +++ b/cps/db.py @@ -905,14 +905,13 @@ class CalibreDB: term = term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) self.session.connection().connection.connection.create_function("max_ratio", 2, max_ratio) - q = list() # splits search term into single words - words = re.split("[, ]+", term) + words = re.split("[,\s]+", term) # put the longest words first to make queries more efficient words.sort(key=len, reverse=True) - words=[x for x in filter(lambda w:len(w)>3,words)] + words=list(filter(lambda w:len(w)>3,words)) # no word in search term is longer than 3 letters -> return empty query #TODO give some kind of error message - if not any([len(word)>3 for word in words]): + if len(words)==0: return self.session.query(Books).filter(False) query = self.generate_linked_query(config.config_read_column, Books) @@ -970,8 +969,6 @@ class CalibreDB: pagination = None result = self.search_query(term, config, *join).order_by(*order).all() result = sorted(result,key=lambda query:partial_token_sort_ratio(str(query[0]),term),reverse=True) - for res in result: - print(str(res[0])) result_count = len(result) if offset != None and limit != None: offset = int(offset)