meme-search-engine/ocr.mjs

import Lens from 'chrome-lens-ocr'
import sharp from "sharp"
import fs from "fs/promises"
import sqlite3 from "better-sqlite3"
import path from "path"

import memeSearchConfig from "./mse_config.json" with { type: "json" }
import ocrConfig from "./ocr_config.json" with { type: "json" }

// Open the SQLite database whose path is configured in mse_config.json.
const DB = sqlite3(memeSearchConfig.db_path)

// Create the OCR schema if it does not exist yet:
//   - ocr: one row per file (keyed by filename, referencing files.filename) holding the
//     scan time, the extracted text and the raw OCR segments.
//   - ocr_fts: FTS5 index over filename/text whose content lives in the ocr table
//     (content='ocr'), so it is only kept current by the two triggers below.
//   - ocr_fts_ins / ocr_fts_del: mirror INSERTs and DELETEs on ocr into ocr_fts.
// NOTE(review): there is no UPDATE trigger, and SQLite fires DELETE triggers for rows
// displaced by REPLACE conflict resolution only when recursive_triggers is enabled, so
// writers that UPDATE or INSERT OR REPLACE rows in ocr can leave stale ocr_fts entries.
// NOTE(review): the files table is assumed to be created elsewhere — confirm.
DB.exec(`
CREATE TABLE IF NOT EXISTS ocr (
    filename TEXT PRIMARY KEY REFERENCES files(filename),
    scan_time INTEGER NOT NULL,
    text TEXT NOT NULL,
    raw_segments TEXT
);

CREATE VIRTUAL TABLE IF NOT EXISTS ocr_fts USING fts5 (
    filename,
    text,
    tokenize='unicode61 remove_diacritics 2',
    content='ocr'
);

CREATE TRIGGER IF NOT EXISTS ocr_fts_ins AFTER INSERT ON ocr BEGIN
    INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, new.text);
END;

CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER DELETE ON ocr BEGIN
    INSERT INTO ocr_fts (ocr_fts, rowid, filename, text) VALUES ('delete', old.rowid, old.filename, old.text);
END;
`)

// Compiled statements, keyed by their SQL text, so each distinct query is prepared once.
const preparedStatements = new Map()

/**
 * Tagged-template helper for parameterised queries.
 *
 * The literal parts of the template are joined with "?" placeholders and the
 * interpolated values are bound as positional parameters, so values never end
 * up inside the SQL text itself.
 *
 * @param {string[]} strings - literal parts of the template
 * @param {...any} params - interpolated values, bound in order
 * @returns {{get: Function, run: Function, all: Function, statement: object}}
 *   thunks that execute the statement with the bound values, plus the
 *   underlying prepared statement
 */
const SQL = (strings, ...params) => {
    const queryText = strings.join("?")
    let statement = preparedStatements.get(queryText)
    if (!statement) {
        // First time this query text is seen: compile it and remember it.
        statement = DB.prepare(queryText)
        preparedStatements.set(queryText, statement)
    }
    return {
        get: () => statement.get(...params),
        run: () => statement.run(...params),
        all: () => statement.all(...params),
        statement
    }
}

/**
 * Sleep helper: returns a promise that resolves (with no value) once
 * `timeout` milliseconds have elapsed.
 */
const wait = timeout => new Promise(resolve => {
    setTimeout(resolve, timeout)
})

// chrome-lens-ocr client; options come from ocr_config.json when present.
const lens = new Lens(ocrConfig.lens_options || {})

// Maximum width and height (in pixels) of each tile sent to the OCR backend.
const tileSize = ocrConfig.image_dim
if (!Number.isFinite(tileSize) || tileSize <= 0) {
    // Without this check an invalid value makes the tile loop run zero times and
    // every file would be recorded as scanned with empty text.
    throw new Error(`ocr_config.json: image_dim must be a positive number, got ${tileSize}`)
}

const MAX_OCR_ATTEMPTS = 10
const OCR_RETRY_DELAY_MS = 500

/**
 * Run OCR on one encoded tile, retrying when the request throws.
 *
 * @param {Buffer} tile - encoded image data for a single tile
 * @returns {Promise<object|null>} the scan result, or null once the retry limit is reached
 */
async function scanWithRetry(tile) {
    for (let remaining = MAX_OCR_ATTEMPTS; remaining > 0; remaining--) {
        try {
            return await lens.scanByBuffer(tile)
        } catch (e) {
            console.log("OCR failed, retry", e.body ? "?" : e, remaining)
            // No point sleeping after the final attempt.
            if (remaining > 1) await wait(OCR_RETRY_DELAY_MS)
        }
    }
    console.log("retry limit")
    return null
}

/**
 * OCR a whole image by scaling it to at most `tileSize` wide and scanning it in
 * horizontal strips of at most `tileSize` rows each.
 *
 * @param {string} filepath - path of the image on disk
 * @returns {Promise<{text: string, segments: object[]}|null>} the combined text and
 *   the segments with y offsets in scaled-image coordinates, or null if any strip
 *   could not be scanned
 */
async function ocrImage(filepath) {
    const metadata = await sharp(filepath).metadata()
    console.log(metadata.width, metadata.height)
    // Only ever scale down, preserving the aspect ratio.
    const newWidth = Math.min(metadata.width, tileSize)
    const newHeight = Math.ceil(metadata.height * (newWidth / metadata.width))
    const lines = []
    const segments = []
    for (let y = 0; y < newHeight; y += tileSize) {
        const tile = await sharp(filepath)
            .resize(newWidth, newHeight, { fit: "fill" })
            .extract({
                left: 0,
                width: newWidth,
                top: y,
                height: Math.min(tileSize, newHeight - y)
            })
            .png()
            .toBuffer()
        const chunk = await scanWithRetry(tile)
        if (!chunk) return null
        // they appear to be in the "right order" out of the API anyway
        for (const segment of chunk.segments) {
            const { pixelCoords } = segment.boundingBox
            lines.push(segment.text)
            segments.push({
                text: segment.text,
                x: pixelCoords.x,
                // Strip-relative y -> position within the scaled image.
                y: pixelCoords.y + y,
                width: pixelCoords.width,
                height: pixelCoords.height
            })
        }
    }
    return { text: lines.join("\n").trim(), segments }
}

// Replace a file's OCR row atomically. An explicit DELETE is used instead of
// INSERT OR REPLACE because SQLite does not fire DELETE triggers for rows removed
// by REPLACE conflict resolution unless recursive_triggers is enabled, which would
// leave the old text behind in the ocr_fts index on every re-scan.
const storeOcrResult = DB.transaction((filename, scanTime, text, segments) => {
    SQL`DELETE FROM ocr WHERE filename = ${filename}`.run()
    SQL`INSERT INTO ocr VALUES (${filename}, ${scanTime}, ${text}, ${JSON.stringify(segments)})`.run()
})

// Files that have never been scanned, or were modified after their last scan.
const pendingFiles = SQL`SELECT files.filename FROM files LEFT JOIN ocr ON files.filename = ocr.filename WHERE ocr.scan_time IS NULL OR ocr.scan_time < files.modtime`.all()

for (const file of pendingFiles) {
    console.log(file.filename)
    const result = await ocrImage(path.join(memeSearchConfig.files, file.filename))
    // Leave the file unrecorded so it is picked up again on the next run.
    if (result === null) continue
    // scan_time is Unix time in seconds (fractional, since Date.now() is in milliseconds).
    storeOcrResult(file.filename, Date.now() / 1000, result.text, result.segments)
}