meme-search-engine/ocr.mjs

import Lens from 'chrome-lens-ocr'
import sharp from "sharp"
import fs from "fs/promises"
import sqlite3 from "better-sqlite3"
import path from "path"

import memeSearchConfig from "./mse_config.json" with { type: "json" }
import ocrConfig from "./ocr_config.json" with { type: "json" }

// Open the SQLite database at the path given by mse_config.json (`db_path`).
const DB = sqlite3(memeSearchConfig.db_path)

// Create the OCR schema if it does not exist yet:
//   - `ocr`: one row per file (keyed by filename, referencing `files`), holding
//     the scan time, the extracted text, and the raw segments as text.
//   - `ocr_fts`: an FTS5 index over (filename, text) declared with content='ocr',
//     i.e. it stores only the index and reads row content from `ocr`.
//   - `ocr_fts_ins` / `ocr_fts_del`: triggers that mirror INSERTs and DELETEs on
//     `ocr` into the index. No UPDATE trigger is defined here.
// NOTE(review): per the SQLite ON CONFLICT documentation, DELETE triggers do not
// fire for rows removed by REPLACE conflict resolution unless recursive_triggers
// is enabled, so `ocr_fts_del` only runs for explicit DELETE statements.
DB.exec(`
CREATE TABLE IF NOT EXISTS ocr (
    filename TEXT PRIMARY KEY REFERENCES files(filename),
    scan_time INTEGER NOT NULL,
    text TEXT NOT NULL,
    raw_segments TEXT
);

CREATE VIRTUAL TABLE IF NOT EXISTS ocr_fts USING fts5 (
    filename,
    text,
    tokenize='unicode61 remove_diacritics 2',
    content='ocr'
);

CREATE TRIGGER IF NOT EXISTS ocr_fts_ins AFTER INSERT ON ocr BEGIN
    INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, new.text);
END;

CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER DELETE ON ocr BEGIN
    INSERT INTO ocr_fts (ocr_fts, rowid, filename, text) VALUES ('delete', old.rowid, old.filename, old.text);
END;
`)

// Prepared statements, keyed by their SQL text, so each distinct query is
// compiled only once for the lifetime of the process.
const preparedStatements = new Map()

/**
 * Tagged-template helper for running parameterised SQL.
 *
 * The literal parts of the template are joined with `?` placeholders and the
 * interpolated values are bound as positional parameters, so values never end
 * up inside the SQL text itself.
 *
 * @param {string[]} strings - literal segments of the template
 * @param {...any} params - interpolated values, bound in order
 * @returns {{get: Function, run: Function, all: Function, statement: object}}
 *   thunks that execute the statement with the bound values, plus the
 *   underlying prepared statement
 */
const SQL = (strings, ...params) => {
    const sql = strings.join("?")
    let stmt = preparedStatements.get(sql)
    if (!stmt) {
        stmt = DB.prepare(sql)
        preparedStatements.set(sql, stmt)
    }
    return {
        get: () => stmt.get(...params),
        run: () => stmt.run(...params),
        all: () => stmt.all(...params),
        statement: stmt
    }
}

/**
 * Sleep helper: returns a promise that resolves (with no value) once
 * `timeout` milliseconds have elapsed.
 */
const wait = timeout => new Promise(done => {
    setTimeout(done, timeout)
})

// OCR client, configured from ocr_config.json (`lens_options`, optional).
const lens = new Lens(ocrConfig.lens_options || {})

// Retry policy for a single OCR request.
const OCR_MAX_ATTEMPTS = 10
const OCR_RETRY_DELAY_MS = 500

/**
 * Run OCR on one image buffer, retrying on failure.
 *
 * A thrown error or an empty (falsy) response both count as a failed attempt,
 * so the loop is always bounded.
 *
 * @param {Buffer} buffer - encoded image tile to scan
 * @returns {Promise<object|null>} the OCR result, or null once the retry limit is hit
 */
const scanWithRetry = async buffer => {
    for (let remaining = OCR_MAX_ATTEMPTS; remaining > 0; remaining--) {
        try {
            const chunk = await lens.scanByBuffer(buffer)
            if (chunk) return chunk
        } catch (e) {
            console.log("OCR failed, retry", e.body ? "?" : e, remaining)
        }
        // No point sleeping after the final attempt.
        if (remaining > 1) await wait(OCR_RETRY_DELAY_MS)
    }
    console.log("retry limit")
    return null
}

/**
 * Store the OCR result for a file, replacing any previous result.
 *
 * This deliberately uses DELETE + INSERT instead of INSERT OR REPLACE: SQLite
 * does not fire DELETE triggers for rows removed by REPLACE conflict resolution
 * (unless recursive_triggers is on), so the `ocr_fts_del` trigger would be
 * skipped and the old text would remain in the full-text index after a rescan.
 * Both statements run in one transaction so a row is never half-replaced.
 */
const storeOcrResult = DB.transaction((filename, text, segments) => {
    SQL`DELETE FROM ocr WHERE filename = ${filename}`.run()
    SQL`INSERT INTO ocr VALUES (${filename}, ${Date.now() / 1000}, ${text}, ${JSON.stringify(segments)})`.run()
})

// Scan every file that has no OCR row yet, or whose OCR row is older than the
// file's modification time.
for (const file of SQL`SELECT files.filename FROM files LEFT JOIN ocr ON files.filename = ocr.filename WHERE ocr.scan_time IS NULL OR ocr.scan_time < files.modtime`.all()) {
    console.log(file.filename)
    const filepath = path.join(memeSearchConfig.files, file.filename)
    const metadata = await sharp(filepath).metadata()
    console.log(metadata.width, metadata.height)
    // Scale down (never up) so the width is at most image_dim, keeping the
    // aspect ratio; the result is then cut into tiles at most image_dim tall.
    const newWidth = Math.min(metadata.width, ocrConfig.image_dim)
    const newHeight = Math.ceil(metadata.height * (newWidth / metadata.width))
    let text = ""
    const segments = []
    let failed = false
    for (let y = 0; y < newHeight; y += ocrConfig.image_dim) {
        const tile = await sharp(filepath).resize(newWidth, newHeight, { fit: "fill" }).extract({
            left: 0,
            width: newWidth,
            top: y,
            height: Math.min(ocrConfig.image_dim, newHeight - y)
        }).png().toBuffer()
        const chunk = await scanWithRetry(tile)
        if (!chunk) {
            failed = true
            break
        }
        // they appear to be in the "right order" out of the API anyway
        for (const segment of chunk.segments) {
            const coords = segment.boundingBox.pixelCoords
            text += segment.text + "\n"
            segments.push({
                text: segment.text,
                x: coords.x,
                // Tile-relative y is shifted by the tile's top offset so that
                // coordinates refer to the whole (resized) image.
                y: coords.y + y,
                width: coords.width,
                height: coords.height
            })
        }
    }
    // Leave the file without a fresh row so that a later run retries it.
    if (failed) continue
    storeOcrResult(file.filename, text.trim(), segments)
}
preliminary work on OCR 2024-05-17 23:39:05 +00:00			`import Lens from 'chrome-lens-ocr'`
			`import sharp from "sharp"`
			`import fs from "fs/promises"`
			`import sqlite3 from "better-sqlite3"`
			`import path from "path"`

			`import memeSearchConfig from "./mse_config.json" with { type: "json" }`
			`import ocrConfig from "./ocr_config.json" with { type: "json" }`

			`const DB = sqlite3(memeSearchConfig.db_path)`

			DB.exec(`
			`CREATE TABLE IF NOT EXISTS ocr (`
			`filename TEXT PRIMARY KEY REFERENCES files(filename),`
			`scan_time INTEGER NOT NULL,`
			`text TEXT NOT NULL,`
			`raw_segments TEXT`
			`);`

			`CREATE VIRTUAL TABLE IF NOT EXISTS ocr_fts USING fts5 (`
			`filename,`
			`text,`
			`tokenize='unicode61 remove_diacritics 2',`
			`content='ocr'`
			`);`

			`CREATE TRIGGER IF NOT EXISTS ocr_fts_ins AFTER INSERT ON ocr BEGIN`
			`INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, new.text);`
			`END;`

			`CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER DELETE ON ocr BEGIN`
			`INSERT INTO ocr_fts (ocr_fts, rowid, filename, text) VALUES ('delete', old.rowid, old.filename, old.text);`
			`END;`
			`)

			`const preparedStatements = new Map()`
			`const SQL = (strings, ...params) => {`
			`const sql = strings.join("?")`
			`let stmt`
			`const cachedValue = preparedStatements.get(sql)`
			`if (!cachedValue) {`
			`stmt = DB.prepare(sql)`
			`preparedStatements.set(sql, stmt)`
			`} else {`
			`stmt = cachedValue`
			`}`
			`return {`
			`get: () => stmt.get.apply(stmt, params),`
			`run: () => stmt.run.apply(stmt, params),`
			`all: () => stmt.all.apply(stmt, params),`
			`statement: stmt`
			`}`
			`}`

			`const wait = timeout => new Promise(resolve => setTimeout(resolve, timeout))`

			`const lens = new Lens(ocrConfig.lens_options || {})`

			for (const file of SQL`SELECT files.filename FROM files LEFT JOIN ocr ON files.filename = ocr.filename WHERE ocr.scan_time IS NULL OR ocr.scan_time < files.modtime`.all()) {
			`console.log(file.filename)`
			`const filepath = path.join(memeSearchConfig.files, file.filename)`
			`const metadata = await sharp(filepath).metadata()`
			`console.log(metadata.width, metadata.height)`
			`let newWidth = Math.min(metadata.width, ocrConfig.image_dim)`
			`let newHeight = Math.ceil(metadata.height * (newWidth / metadata.width))`
			`let text = ""`
			`let segments = []`
			`let failed = false`
			`for (let y = 0; y < newHeight; y += ocrConfig.image_dim) {`
			`const result = await sharp(filepath).resize(newWidth, newHeight, { fit: "fill" }).extract({`
			`left: 0,`
			`width: newWidth,`
			`top: y,`
			`height: Math.min(ocrConfig.image_dim, newHeight - y)`
			`}).png().toBuffer()`
			`let chunk`
			`let count = 10`
			`while (!chunk) {`
			`try {`
			`chunk = await lens.scanByBuffer(result)`
			`} catch(e) {`
			`console.log("OCR failed, retry", e.body ? "?" : e, count)`
			`await wait(500)`
			`count--`
			`if (count === 0) {`
			`console.log("retry limit")`
			`failed = true`
			`break`
			`}`
			`}`
			`}`
			`if (failed) break`
			`// they appear to be in the "right order" out of the API anyway`
			`for (const segment of chunk.segments) {`
			`text += segment.text + "\n"`
			`segments.push({`
			`text: segment.text,`
			`x: segment.boundingBox.pixelCoords.x,`
			`y: segment.boundingBox.pixelCoords.y + y,`
			`width: segment.boundingBox.pixelCoords.width,`
			`height: segment.boundingBox.pixelCoords.height`
			`})`
			`}`
			`}`
			`if (failed) continue`
			SQL`INSERT OR REPLACE INTO ocr VALUES (${file.filename}, ${Date.now() / 1000}, ${text.trim()}, ${JSON.stringify(segments)})`.run()
			`}`