preliminary work on OCR

2025-09-12 07:46:04 +00:00 · 2024-05-18 00:39:05 +01:00
parent a3574674d0
commit 6491e02e88
4 changed files with 1484 additions and 0 deletions
--- a/ocr.mjs
+++ b/ocr.mjs
@@ -0,0 +1,107 @@
+import Lens from 'chrome-lens-ocr'
+import sharp from "sharp"
+import fs from "fs/promises"
+import sqlite3 from "better-sqlite3"
+import path from "path"
+
+import memeSearchConfig from "./mse_config.json" with { type: "json" }
+import ocrConfig from "./ocr_config.json" with { type: "json" }
+
+const DB = sqlite3(memeSearchConfig.db_path)
+
+DB.exec(`
+CREATE TABLE IF NOT EXISTS ocr (
+    filename TEXT PRIMARY KEY REFERENCES files(filename),
+    scan_time INTEGER NOT NULL,
+    text TEXT NOT NULL,
+    raw_segments TEXT
+);
+
+CREATE VIRTUAL TABLE IF NOT EXISTS ocr_fts USING fts5 (
+    filename,
+    text,
+    tokenize='unicode61 remove_diacritics 2',
+    content='ocr'
+);
+
+CREATE TRIGGER IF NOT EXISTS ocr_fts_ins AFTER INSERT ON ocr BEGIN
+    INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, new.text);
+END;
+
+CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER DELETE ON ocr BEGIN
+    INSERT INTO ocr_fts (ocr_fts, rowid, filename, text) VALUES ('delete', old.rowid, old.filename, old.text);
+END;
+`)
+
+const preparedStatements = new Map()
+const SQL = (strings, ...params) => {
+    const sql = strings.join("?")
+    let stmt
+    const cachedValue = preparedStatements.get(sql)
+    if (!cachedValue) {
+            stmt = DB.prepare(sql)
+            preparedStatements.set(sql, stmt)
+    } else {
+            stmt = cachedValue
+    }
+    return {
+            get: () => stmt.get.apply(stmt, params),
+            run: () => stmt.run.apply(stmt, params),
+            all: () => stmt.all.apply(stmt, params),
+            statement: stmt
+    }
+}
+
+const wait = timeout => new Promise(resolve => setTimeout(resolve, timeout))
+
+const lens = new Lens(ocrConfig.lens_options || {})
+
+for (const file of SQL`SELECT files.filename FROM files LEFT JOIN ocr ON files.filename = ocr.filename WHERE ocr.scan_time IS NULL OR ocr.scan_time < files.modtime`.all()) {
+    console.log(file.filename)
+    const filepath = path.join(memeSearchConfig.files, file.filename)
+    const metadata = await sharp(filepath).metadata()
+    console.log(metadata.width, metadata.height)
+    let newWidth = Math.min(metadata.width, ocrConfig.image_dim)
+    let newHeight = Math.ceil(metadata.height * (newWidth / metadata.width))
+    let text = ""
+    let segments = []
+    let failed = false
+    for (let y = 0; y < newHeight; y += ocrConfig.image_dim) {
+        const result = await sharp(filepath).resize(newWidth, newHeight, { fit: "fill" }).extract({
+            left: 0,
+            width: newWidth,
+            top: y,
+            height: Math.min(ocrConfig.image_dim, newHeight - y)
+        }).png().toBuffer()
+        let chunk
+        let count = 10
+        while (!chunk) {
+            try {
+                chunk = await lens.scanByBuffer(result)
+            } catch(e) {
+                console.log("OCR failed, retry", e.body ? "?" : e, count)
+                await wait(500)
+                count--
+                if (count === 0) {
+                    console.log("retry limit")
+                    failed = true
+                    break
+                }
+            }
+        }
+        if (failed) break
+        // they appear to be in the "right order" out of the API anyway
+        for (const segment of chunk.segments) {
+            text += segment.text + "\n"
+            segments.push({
+                text: segment.text,
+                x: segment.boundingBox.pixelCoords.x,
+                y: segment.boundingBox.pixelCoords.y + y,
+                width: segment.boundingBox.pixelCoords.width,
+                height: segment.boundingBox.pixelCoords.height
+            })
+        }
+    }
+    if (failed) continue
+    SQL`INSERT OR REPLACE INTO ocr VALUES (${file.filename}, ${Date.now() / 1000}, ${text.trim()}, ${JSON.stringify(segments)})`.run()
+}
--- a/ocr_config.json
+++ b/ocr_config.json
@@ -0,0 +1,6 @@
+{
+    "image_dim": 1000,
+    "lens_options": {
+
+    }
+}
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@@ -0,0 +1,7 @@
+{
+  "dependencies": {
+    "better-sqlite3": "^10.0.0",
+    "chrome-lens-ocr": "^2.0.1",
+    "sharp": "^0.33.4"
+  }
+}