mirror of
https://github.com/osmarks/meme-search-engine.git
synced 2024-11-13 23:34:49 +00:00
107 lines
3.7 KiB
JavaScript
107 lines
3.7 KiB
JavaScript
import Lens from 'chrome-lens-ocr'
|
|
import sharp from "sharp"
|
|
import fs from "fs/promises"
|
|
import sqlite3 from "better-sqlite3"
|
|
import path from "path"
|
|
|
|
import memeSearchConfig from "./mse_config.json" with { type: "json" }
|
|
import ocrConfig from "./ocr_config.json" with { type: "json" }
|
|
|
|
const DB = sqlite3(memeSearchConfig.db_path)
|
|
|
|
DB.exec(`
|
|
CREATE TABLE IF NOT EXISTS ocr (
|
|
filename TEXT PRIMARY KEY REFERENCES files(filename),
|
|
scan_time INTEGER NOT NULL,
|
|
text TEXT NOT NULL,
|
|
raw_segments TEXT
|
|
);
|
|
|
|
CREATE VIRTUAL TABLE IF NOT EXISTS ocr_fts USING fts5 (
|
|
filename,
|
|
text,
|
|
tokenize='unicode61 remove_diacritics 2',
|
|
content='ocr'
|
|
);
|
|
|
|
CREATE TRIGGER IF NOT EXISTS ocr_fts_ins AFTER INSERT ON ocr BEGIN
|
|
INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, new.text);
|
|
END;
|
|
|
|
CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER DELETE ON ocr BEGIN
|
|
INSERT INTO ocr_fts (ocr_fts, rowid, filename, text) VALUES ('delete', old.rowid, old.filename, old.text);
|
|
END;
|
|
`)
|
|
|
|
const preparedStatements = new Map()
|
|
const SQL = (strings, ...params) => {
|
|
const sql = strings.join("?")
|
|
let stmt
|
|
const cachedValue = preparedStatements.get(sql)
|
|
if (!cachedValue) {
|
|
stmt = DB.prepare(sql)
|
|
preparedStatements.set(sql, stmt)
|
|
} else {
|
|
stmt = cachedValue
|
|
}
|
|
return {
|
|
get: () => stmt.get.apply(stmt, params),
|
|
run: () => stmt.run.apply(stmt, params),
|
|
all: () => stmt.all.apply(stmt, params),
|
|
statement: stmt
|
|
}
|
|
}
|
|
|
|
const wait = timeout => new Promise(resolve => setTimeout(resolve, timeout))
|
|
|
|
const lens = new Lens(ocrConfig.lens_options || {})
|
|
|
|
for (const file of SQL`SELECT files.filename FROM files LEFT JOIN ocr ON files.filename = ocr.filename WHERE ocr.scan_time IS NULL OR ocr.scan_time < files.modtime`.all()) {
|
|
console.log(file.filename)
|
|
const filepath = path.join(memeSearchConfig.files, file.filename)
|
|
const metadata = await sharp(filepath).metadata()
|
|
console.log(metadata.width, metadata.height)
|
|
let newWidth = Math.min(metadata.width, ocrConfig.image_dim)
|
|
let newHeight = Math.ceil(metadata.height * (newWidth / metadata.width))
|
|
let text = ""
|
|
let segments = []
|
|
let failed = false
|
|
for (let y = 0; y < newHeight; y += ocrConfig.image_dim) {
|
|
const result = await sharp(filepath).resize(newWidth, newHeight, { fit: "fill" }).extract({
|
|
left: 0,
|
|
width: newWidth,
|
|
top: y,
|
|
height: Math.min(ocrConfig.image_dim, newHeight - y)
|
|
}).png().toBuffer()
|
|
let chunk
|
|
let count = 10
|
|
while (!chunk) {
|
|
try {
|
|
chunk = await lens.scanByBuffer(result)
|
|
} catch(e) {
|
|
console.log("OCR failed, retry", e.body ? "?" : e, count)
|
|
await wait(500)
|
|
count--
|
|
if (count === 0) {
|
|
console.log("retry limit")
|
|
failed = true
|
|
break
|
|
}
|
|
}
|
|
}
|
|
if (failed) break
|
|
// they appear to be in the "right order" out of the API anyway
|
|
for (const segment of chunk.segments) {
|
|
text += segment.text + "\n"
|
|
segments.push({
|
|
text: segment.text,
|
|
x: segment.boundingBox.pixelCoords.x,
|
|
y: segment.boundingBox.pixelCoords.y + y,
|
|
width: segment.boundingBox.pixelCoords.width,
|
|
height: segment.boundingBox.pixelCoords.height
|
|
})
|
|
}
|
|
}
|
|
if (failed) continue
|
|
SQL`INSERT OR REPLACE INTO ocr VALUES (${file.filename}, ${Date.now() / 1000}, ${text.trim()}, ${JSON.stringify(segments)})`.run()
|
|
} |