diff --git a/cps/db.py b/cps/db.py index f0295fe5..8036bccd 100644 --- a/cps/db.py +++ b/cps/db.py @@ -20,6 +20,7 @@ import os import re import json +import traceback from datetime import datetime from urllib.parse import quote import unidecode @@ -32,6 +33,7 @@ from sqlalchemy.orm import relationship, sessionmaker, scoped_session from sqlalchemy.orm.collections import InstrumentedList from sqlalchemy.ext.declarative import DeclarativeMeta from sqlalchemy.exc import OperationalError + try: # Compatibility with sqlalchemy 2.0 from sqlalchemy.orm import declarative_base @@ -40,6 +42,7 @@ except ImportError: from sqlalchemy.pool import StaticPool from sqlalchemy.sql.expression import and_, true, false, text, func, or_ from sqlalchemy.ext.associationproxy import association_proxy +from sqlalchemy import desc from flask_login import current_user from flask_babel import gettext as _ from flask_babel import get_locale @@ -49,7 +52,10 @@ from . import logger, ub, isoLanguages from .pagination import Pagination from weakref import WeakSet +from thefuzz.fuzz import partial_ratio, partial_token_set_ratio, partial_token_sort_ratio, ratio +# %-level, 100 means exact match, 75 allows exactly 1 wrong character in a 4 letter word +FUZZY_SEARCH_ACCURACY = 75 log = logger.create() @@ -399,8 +405,20 @@ class Books(Base): def __repr__(self): return "".format(self.title, self.sort, self.author_sort, - self.timestamp, self.pubdate, self.series_index, - self.last_modified, self.path, self.has_cover) + self.timestamp, self.pubdate, self.series_index, + self.last_modified, self.path, self.has_cover) + + def __str__(self): + return "{0} {1} {2} {3} {4}".format(self.title, " ".join([tag.name for tag in self.tags]), + " ".join( + [series.name for series + in self.series]), + " ".join( + [author.name for author + in self.authors]), + " ".join([publisher.name for + publisher in + self.publishers])) @property def atom_timestamp(self): @@ -442,13 +460,15 @@ class CustomColumns(Base): content['category_sort'] = "value" content['is_csp'] = False content['is_editable'] = self.editable - content['rec_index'] = sequence + 22 # toDo why ?? + content['rec_index'] = sequence + 22 # toDo why ?? if isinstance(value, datetime): - content['#value#'] = {"__class__": "datetime.datetime", "__value__": value.strftime("%Y-%m-%dT%H:%M:%S+00:00")} + content['#value#'] = {"__class__": "datetime.datetime", + "__value__": value.strftime("%Y-%m-%dT%H:%M:%S+00:00")} else: content['#value#'] = value content['#extra#'] = extra - content['is_multiple2'] = {} if not self.is_multiple else {"cache_to_list": "|", "ui_to_list": ",", "list_to_ui": ", "} + content['is_multiple2'] = {} if not self.is_multiple else {"cache_to_list": "|", "ui_to_list": ",", + "list_to_ui": ", "} return json.dumps(content, ensure_ascii=False) @@ -469,7 +489,7 @@ class AlchemyEncoder(json.JSONEncoder): el = list() # ele = None for ele in data: - if hasattr(ele, 'value'): # converter for custom_column values + if hasattr(ele, 'value'): # converter for custom_column values el.append(str(ele.value)) elif ele.get: el.append(ele.get()) @@ -508,7 +528,6 @@ class CalibreDB: if init: self.init_db(expire_on_commit) - def init_db(self, expire_on_commit=True): if self._init: self.init_session(expire_on_commit) @@ -680,13 +699,13 @@ class CalibreDB: if not read_column: bd = (self.session.query(Books, ub.ReadBook.read_status, ub.ArchivedBook.is_archived).select_from(Books) .join(ub.ReadBook, and_(ub.ReadBook.user_id == int(current_user.id), ub.ReadBook.book_id == book_id), - isouter=True)) + isouter=True)) else: try: read_column = cc_classes[read_column] bd = (self.session.query(Books, read_column.value, ub.ArchivedBook.is_archived).select_from(Books) .join(read_column, read_column.book == book_id, - isouter=True)) + isouter=True)) except (KeyError, AttributeError, IndexError): log.error("Custom Column No.{} does not exist in calibre database".format(read_column)) # Skip linking read column and return None instead of read status @@ -739,11 +758,11 @@ class CalibreDB: pos_cc_list = current_user.allowed_column_value.split(',') pos_content_cc_filter = true() if pos_cc_list == [''] else \ getattr(Books, 'custom_column_' + str(self.config.config_restricted_column)). \ - any(cc_classes[self.config.config_restricted_column].value.in_(pos_cc_list)) + any(cc_classes[self.config.config_restricted_column].value.in_(pos_cc_list)) neg_cc_list = current_user.denied_column_value.split(',') neg_content_cc_filter = false() if neg_cc_list == [''] else \ getattr(Books, 'custom_column_' + str(self.config.config_restricted_column)). \ - any(cc_classes[self.config.config_restricted_column].value.in_(neg_cc_list)) + any(cc_classes[self.config.config_restricted_column].value.in_(neg_cc_list)) except (KeyError, AttributeError, IndexError): pos_content_cc_filter = false() neg_content_cc_filter = true() @@ -823,18 +842,18 @@ class CalibreDB: element = 0 while indx: if indx >= 3: - query = query.outerjoin(join[element], join[element+1]).outerjoin(join[element+2]) + query = query.outerjoin(join[element], join[element + 1]).outerjoin(join[element + 2]) indx -= 3 element += 3 elif indx == 2: - query = query.outerjoin(join[element], join[element+1]) + query = query.outerjoin(join[element], join[element + 1]) indx -= 2 element += 2 elif indx == 1: query = query.outerjoin(join[element]) indx -= 1 element += 1 - query = query.filter(db_filter)\ + query = query.filter(db_filter) \ .filter(self.common_filters(allow_show_archived)) entries = list() pagination = list() @@ -904,12 +923,18 @@ class CalibreDB: .filter(and_(Books.authors.any(and_(*q)), func.lower(Books.title).ilike("%" + title + "%"))).first() def search_query(self, term, config, *join): - term.strip().lower() + term = term.strip().lower() self.session.connection().connection.connection.create_function("lower", 1, lcase) - q = list() - author_terms = re.split("[, ]+", term) - for author_term in author_terms: - q.append(Books.authors.any(func.lower(Authors.name).ilike("%" + author_term + "%"))) + self.session.connection().connection.connection.create_function("max_ratio", 2, max_ratio) + # splits search term into single words + words = re.split("[,\s]+", term) + # put the longest words first to make queries more efficient + words.sort(key=len, reverse=True) + words=list(filter(lambda w:len(w)>3,words)) + # no word in search term is longer than 3 letters -> return empty query #TODO give some kind of error message + if len(words)==0: + return self.session.query(Books).filter(False) + query = self.generate_linked_query(config.config_read_column, Books) if len(join) == 6: query = query.outerjoin(join[0], join[1]).outerjoin(join[2]).outerjoin(join[3], join[4]).outerjoin(join[5]) @@ -920,19 +945,28 @@ class CalibreDB: elif len(join) == 1: query = query.outerjoin(join[0]) + filter_expression = [] cc = self.get_cc_columns(config, filter_config_custom_read=True) - filter_expression = [Books.tags.any(func.lower(Tags.name).ilike("%" + term + "%")), - Books.series.any(func.lower(Series.name).ilike("%" + term + "%")), - Books.authors.any(and_(*q)), - Books.publishers.any(func.lower(Publishers.name).ilike("%" + term + "%")), - func.lower(Books.title).ilike("%" + term + "%")] for c in cc: if c.datatype not in ["datetime", "rating", "bool", "int", "float"]: filter_expression.append( getattr(Books, 'custom_column_' + str(c.id)).any( func.lower(cc_classes[c.id].value).ilike("%" + term + "%"))) - return query.filter(self.common_filters(True)).filter(or_(*filter_expression)) + # filter out multiple languages and archived books, + results = query.filter(self.common_filters(True)) + filters=[filter_expression] if filter_expression else [] + # search tags, series and titles, also add author queries + for word in words: + filters.append(or_(*[ + Books.tags.any(func.max_ratio(func.lower(Tags.name), word) >= FUZZY_SEARCH_ACCURACY), + Books.series.any(func.max_ratio(func.lower(Series.name), word) >= FUZZY_SEARCH_ACCURACY), + Books.authors.any(func.max_ratio(func.lower(Authors.name), word) >= FUZZY_SEARCH_ACCURACY), + Books.publishers.any(func.max_ratio(func.lower(Publishers.name), word) >= FUZZY_SEARCH_ACCURACY), + func.max_ratio(func.lower(Books.title), word) >= FUZZY_SEARCH_ACCURACY + ])) + results = results.filter(and_(*filters)) + return results def get_cc_columns(self, config, filter_config_custom_read=False): tmp_cc = self.session.query(CustomColumns).filter(CustomColumns.datatype.notin_(cc_exceptions)).all() @@ -955,6 +989,7 @@ class CalibreDB: order = order[0] if order else [Books.sort] pagination = None result = self.search_query(term, config, *join).order_by(*order).all() + result = sorted(result,key=lambda query:partial_token_sort_ratio(str(query[0]),term),reverse=True) result_count = len(result) if offset != None and limit != None: offset = int(offset) @@ -974,8 +1009,8 @@ class CalibreDB: if with_count: if not languages: - languages = self.session.query(Languages, func.count('books_languages_link.book'))\ - .join(books_languages_link).join(Books)\ + languages = self.session.query(Languages, func.count('books_languages_link.book')) \ + .join(books_languages_link).join(Books) \ .filter(self.common_filters(return_all_languages=return_all_languages)) \ .group_by(text('books_languages_link.lang_code')).all() tags = list() @@ -1073,6 +1108,11 @@ def lcase(s): return s.lower() +def max_ratio(string:str,term): + """applies ratio on each word of string and returns the max value""" + words=string.split() + return max([ratio(word.strip(":"),term) if len(word.strip(":")) > 3 else 0 for word in words]) # ignore words of len < 3#do not compare words of len < 3 -> too generic + class Category: name = None id = None @@ -1085,6 +1125,7 @@ class Category: self.rating = rating self.count = 1 + '''class Count: count = None diff --git a/cps/templates/search.html b/cps/templates/search.html index 41f52b98..b35a67c8 100644 --- a/cps/templates/search.html +++ b/cps/templates/search.html @@ -5,6 +5,7 @@ {% if entries|length < 1 %}

{{_('No Results Found')}}

{{_('Search Term:')}} {{adv_searchterm}}

+

{{_('Words smaller than 3 letters are not considered')}}

{% else %}

{{result_count}} {{_('Results for:')}} {{adv_searchterm}}

{% if current_user.is_authenticated %} diff --git a/requirements.txt b/requirements.txt index fef4bbf2..40e78706 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,6 @@ flask-wtf>=0.14.2,<1.2.0 chardet>=3.0.0,<4.1.0 advocate>=1.0.0,<1.1.0 Flask-Limiter>=2.3.0,<3.4.0 + +thefuzz~=0.19.0 +Levenshtein~=0.21.0