Optimized search query

2026-01-08 15:19:04 +00:00 · 2025-12-05 16:33:30 +01:00
parent ca63d343bd
commit d2015dd998
3 changed files with 2456 additions and 321 deletions
--- a/cps/db.py
+++ b/cps/db.py
@@ -30,7 +30,7 @@ from sqlite3 import OperationalError as sqliteOperationalError
 from sqlalchemy import create_engine
 from sqlalchemy import Table, Column, ForeignKey, CheckConstraint
 from sqlalchemy import String, Integer, Boolean, TIMESTAMP, Float
-from sqlalchemy.orm import relationship, sessionmaker, scoped_session
+from sqlalchemy.orm import relationship, sessionmaker, scoped_session, selectinload
 from sqlalchemy.orm.collections import InstrumentedList
 from sqlalchemy.ext.declarative import DeclarativeMeta
 from sqlalchemy.exc import OperationalError
@@ -901,29 +901,36 @@ class CalibreDB:
        for entry in entries:
            if combined:
                sort_authors = entry.Books.author_sort.split('&')
-                ids = [a.id for a in entry.Books.authors]
-
+                authors_list = entry.Books.authors
            else:
                sort_authors = entry.author_sort.split('&')
-                ids = [a.id for a in entry.authors]
-            authors_ordered = list()
-            # error = False
+                authors_list = entry.authors
+
+            # Create dictionary for O(1) lookup instead of nested loops
+            authors_by_sort = {}
+            authors_by_id = {}
+            for author in authors_list:
+                authors_by_sort[author.sort] = author
+                authors_by_id[author.id] = author
+
+            authors_ordered = []
+            ids_remaining = set(authors_by_id.keys())
+
+            # Order authors based on sort field using dictionary lookup
            for auth in sort_authors:
                auth = strip_whitespaces(auth)
-                results = self.session.query(Authors).filter(Authors.sort == auth).all()
-                # ToDo: How to handle not found author name
-                if not len(results):
-                    book_id = entry.id if isinstance(entry, Books) else entry[0].id
-                    log.error("Author '{}' of book {} not found to display name in right order".format(auth, book_id))
-                    # error = True
-                    break
-                for r in results:
-                    if r.id in ids:
-                        authors_ordered.append(r)
-                        ids.remove(r.id)
-            for author_id in ids:
-                result = self.session.query(Authors).filter(Authors.id == author_id).first()
-                authors_ordered.append(result)
+                if auth in authors_by_sort:
+                    author = authors_by_sort[auth]
+                    authors_ordered.append(author)
+                    ids_remaining.discard(author.id)
+                else:
+                    # This can happen if author_sort has stale data or formatting issues
+                    book_id = entry.id if isinstance(entry, Books) else (entry.Books.id if combined else entry.id)
+                    log.warning("Author '{}' of book {} not found in author list, skipping in sort order".format(auth, book_id))
+
+            # Add any remaining authors not in sort order
+            for author_id in ids_remaining:
+                authors_ordered.append(authors_by_id[author_id])

            if list_return:
                if combined:
@@ -956,36 +963,95 @@ class CalibreDB:
            .filter(and_(Books.authors.any(and_(*q)), func.lower(Books.title).ilike("%" + title + "%"))).first()

    def search_query(self, term, config, *join):
-        strip_whitespaces(term).lower()
+        term = strip_whitespaces(term).lower()
        self.create_functions()
-        # self.session.connection().connection.connection.create_function("lower", 1, lcase)
+
+        # Try FTS5 search first for better performance
+        fts_ids = None
+        # Check if FTS5 table exists before attempting search
+        if not hasattr(self, '_fts_available'):
+            try:
+                result = self.session.execute(
+                    text("SELECT name FROM sqlite_master WHERE type='table' AND name='books_fts'")
+                ).fetchone()
+                self._fts_available = result is not None
+            except Exception:
+                self._fts_available = False
+
+        if self._fts_available:
+            try:
+                # Escape FTS5 special characters to prevent query errors
+                term_fts = term.replace('"', '""')
+                # Wrap in quotes for phrase matching and better accuracy
+                fts_results = self.session.execute(
+                    text("SELECT DISTINCT rowid FROM books_fts WHERE books_fts MATCH :term"),
+                    {"term": f'"{term_fts}"'}
+                ).fetchall()
+                if fts_results:
+                    fts_ids = [r[0] for r in fts_results]
+            except Exception as ex:
+                # FTS5 query failed, fall back to traditional search
+                log.debug("FTS5 search failed for term '{}', using fallback: {}".format(term, ex))
+
+        # Build base query with optimized joins
+        base_query = self.generate_linked_query(config.config_read_column, Books)
+        base_query = base_query.filter(self.common_filters(True))
+
+        # Apply eager loading for authors to avoid N+1 queries
+        base_query = base_query.options(selectinload(Books.authors))
+        
+
+        if len(join) == 6:
+            base_query = base_query.outerjoin(join[0], join[1]).outerjoin(join[2]).outerjoin(join[3], join[4]).outerjoin(join[5])
+        if len(join) == 3:
+            base_query = base_query.outerjoin(join[0], join[1]).outerjoin(join[2])
+        elif len(join) == 2:
+            base_query = base_query.outerjoin(join[0], join[1])
+        elif len(join) == 1:
+            base_query = base_query.outerjoin(join[0])
+
+        # If FTS5 found results, use those IDs
+        if fts_ids:
+            return base_query.filter(Books.id.in_(fts_ids))
+
+        # Fallback to traditional search with optimized subqueries
        q = list()
        author_terms = re.split("[, ]+", term)
-        for author_term in author_terms:
-            q.append(Books.authors.any(func.lower(Authors.name).ilike("%" + author_term + "%")))
-        query = self.generate_linked_query(config.config_read_column, Books)
-        if len(join) == 6:
-            query = query.outerjoin(join[0], join[1]).outerjoin(join[2]).outerjoin(join[3], join[4]).outerjoin(join[5])
-        if len(join) == 3:
-            query = query.outerjoin(join[0], join[1]).outerjoin(join[2])
-        elif len(join) == 2:
-            query = query.outerjoin(join[0], join[1])
-        elif len(join) == 1:
-            query = query.outerjoin(join[0])

+        # Use subquery for authors to avoid expensive .any() with OR
+        author_subquery = self.session.query(books_authors_link.c.book).join(
+            Authors, books_authors_link.c.author == Authors.id
+        )
+        author_filters = []
+        for author_term in author_terms:
+            author_filters.append(func.lower(Authors.name).ilike("%" + author_term + "%"))
+        if author_filters:
+            author_subquery = author_subquery.filter(or_(*author_filters))
+
+        # Build optimized filter expressions
        cc = self.get_cc_columns(config, filter_config_custom_read=True)
-        filter_expression = [Books.tags.any(func.lower(Tags.name).ilike("%" + term + "%")),
-                             Books.series.any(func.lower(Series.name).ilike("%" + term + "%")),
-                             Books.authors.any(and_(*q)),
-                             Books.publishers.any(func.lower(Publishers.name).ilike("%" + term + "%")),
-                             func.lower(Books.title).ilike("%" + term + "%")]
+        filter_expression = [
+            Books.id.in_(self.session.query(books_tags_link.c.book).join(
+                Tags, books_tags_link.c.tag == Tags.id
+            ).filter(func.lower(Tags.name).ilike("%" + term + "%"))),
+            Books.id.in_(self.session.query(books_series_link.c.book).join(
+                Series, books_series_link.c.series == Series.id
+            ).filter(func.lower(Series.name).ilike("%" + term + "%"))),
+            Books.id.in_(author_subquery),
+            Books.id.in_(self.session.query(books_publishers_link.c.book).join(
+                Publishers, books_publishers_link.c.publisher == Publishers.id
+            ).filter(func.lower(Publishers.name).ilike("%" + term + "%"))),
+            func.lower(Books.title).ilike("%" + term + "%")
+        ]
+
        for c in cc:
            if c.datatype not in ["datetime", "rating", "bool", "int", "float"]:
                filter_expression.append(
                    getattr(Books,
                            'custom_column_' + str(c.id)).any(
                        func.lower(cc_classes[c.id].value).ilike("%" + term + "%")))
-        return query.filter(self.common_filters(True)).filter(or_(*filter_expression))
+
+        return base_query.filter(or_(*filter_expression))

    def get_cc_columns(self, config, filter_config_custom_read=False):
        tmp_cc = self.session.query(CustomColumns).filter(CustomColumns.datatype.notin_(cc_exceptions)).all()
@@ -1007,18 +1073,32 @@ class CalibreDB:
    def get_search_results(self, term, config, offset=None, order=None, limit=None, *join):
        order = order[0] if order else [Books.sort]
        pagination = None
-        result = self.search_query(term, config, *join).order_by(*order).all()
-        result_count = len(result)
+
        if offset is not None and limit is not None:
            offset = int(offset)
-            limit_all = offset + int(limit)
-            pagination = Pagination((offset / (int(limit)) + 1), limit, result_count)
+            limit_int = int(limit)
+
+            # Use LIMIT+1 pattern to estimate total count without expensive count()
+            query = self.search_query(term, config, *join).order_by(*order)
+            result = query.limit(offset + limit_int + 1).all()
+
+            # Check if there are more results
+            has_more = len(result) > (offset + limit_int)
+            if has_more:
+                result_count = offset + limit_int + 1  # Estimate: at least this many
+            else:
+                result_count = len(result)
+
+            # Extract the page of results
+            result = result[offset:offset + limit_int]
+            pagination = Pagination((offset / limit_int + 1), limit_int, result_count)
        else:
-            offset = 0
-            limit_all = result_count
+            # No pagination, fetch all results
+            result = self.search_query(term, config, *join).order_by(*order).all()
+            result_count = len(result)

        ub.store_combo_ids(result)
-        entries = self.order_authors(result[offset:limit_all], list_return=True, combined=True)
+        entries = self.order_authors(result, list_return=True, combined=True)

        return entries, result_count, pagination

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,7 @@ dependencies = [
    "python-magic-bin>=0.4.0,<0.5.0;sys_platform=='win32'",
    "flask-httpAuth>=4.4.0,<5.0.0",
    "cryptography>=39.0.0,<45.0.0",
+    "certifi>=2024.7.4,<2025.8.24",
 ]
 dynamic = ["version"]

@@ -69,7 +70,7 @@ content-type = "text/markdown"

 [project.optional-dependencies]
 gdrive = [
-    "google-api-python-client>=1.7.11,<2.200.0",
+    "google-api-python-client>=2.73.00,<2.200.0",
    "gevent>20.6.0,<24.12.0",
    "greenlet>=0.4.17,<3.3.0",
    "httplib2>=0.9.2,<0.23.0",
@@ -77,13 +78,13 @@ gdrive = [
    "uritemplate>=3.0.0,<4.3.0",
    "pyasn1-modules>=0.0.8,<0.7.0",
    "pyasn1>=0.1.9,<0.7.0",
-    "PyDrive2>=1.3.1,<1.22.0",
+    "PyDrive2>=1.15.0,<1.22.0",
    "PyYAML>=3.12,<6.1",
    "rsa>=3.4.2,<4.10.0",
 ]
 gmail = [
-    "google-auth-oauthlib>=0.4.3,<1.3.0",
-    "google-api-python-client>=1.7.11,<2.200.0",
+    "google-auth-oauthlib>=1.0.0,<1.3.0",
+    "google-api-python-client>=2.73.00,<2.200.0",
 ]
 goodreads = [
    "goodreads>=0.3.2,<0.4.0",
--- a/TestSummary_Linux.html
+++ b/TestSummary_Linux.html