1
0
mirror of https://github.com/osmarks/meme-search-engine.git synced 2024-11-10 22:09:54 +00:00

preliminary work on OCR

This commit is contained in:
osmarks 2024-05-18 00:39:05 +01:00
parent a3574674d0
commit 6491e02e88
4 changed files with 1484 additions and 0 deletions

107
ocr.mjs Normal file
View File

@ -0,0 +1,107 @@
import Lens from 'chrome-lens-ocr'
import sharp from "sharp"
import fs from "fs/promises"
import sqlite3 from "better-sqlite3"
import path from "path"
import memeSearchConfig from "./mse_config.json" with { type: "json" }
import ocrConfig from "./ocr_config.json" with { type: "json" }
// Open the SQLite database at the path configured in mse_config.json.
const DB = sqlite3(memeSearchConfig.db_path)

// Create the OCR schema. Every statement is IF NOT EXISTS, so this is safe to run on each start.
//  - ocr: one row per file, keyed by filename (referencing the `files` table, which this
//    script does not create). Stores the scan time, the recognised text and the raw
//    segment data as text.
//  - ocr_fts: FTS5 full-text index over ocr.filename / ocr.text. It is an external-content
//    table (content='ocr'), which SQLite does not keep in sync automatically, so the
//    triggers below mirror every insert, delete and update on `ocr` into the index.
// NOTE: SQLite fires DELETE triggers for rows removed by REPLACE conflict resolution only
// when recursive_triggers is enabled. Writers should remove an existing row with an
// explicit DELETE (or UPDATE it) rather than rely on INSERT OR REPLACE, otherwise the
// old text stays in the index.
DB.exec(`
CREATE TABLE IF NOT EXISTS ocr (
    filename TEXT PRIMARY KEY REFERENCES files(filename),
    scan_time INTEGER NOT NULL,
    text TEXT NOT NULL,
    raw_segments TEXT
);
CREATE VIRTUAL TABLE IF NOT EXISTS ocr_fts USING fts5 (
    filename,
    text,
    tokenize='unicode61 remove_diacritics 2',
    content='ocr'
);
CREATE TRIGGER IF NOT EXISTS ocr_fts_ins AFTER INSERT ON ocr BEGIN
    INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, new.text);
END;
CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER DELETE ON ocr BEGIN
    INSERT INTO ocr_fts (ocr_fts, rowid, filename, text) VALUES ('delete', old.rowid, old.filename, old.text);
END;
CREATE TRIGGER IF NOT EXISTS ocr_fts_upd AFTER UPDATE ON ocr BEGIN
    INSERT INTO ocr_fts (ocr_fts, rowid, filename, text) VALUES ('delete', old.rowid, old.filename, old.text);
    INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, new.text);
END;
`)
// Compiled statements keyed by their SQL text, so each distinct query is prepared only once.
const preparedStatements = new Map()

/**
 * Tagged-template helper for parameterised queries.
 *
 * Every `${value}` in the template becomes a `?` placeholder and the value is bound
 * positionally when the query is executed, so values are never spliced into the SQL text.
 *
 * @param {string[]} strings - literal parts of the template
 * @param {...any} params - interpolated values, bound in order
 * @returns {{get: Function, run: Function, all: Function, statement: object}}
 *   zero-argument runners that call the matching method of the prepared statement with
 *   `params`, plus the prepared statement itself.
 */
const SQL = (strings, ...params) => {
    const sql = strings.join("?")
    let stmt = preparedStatements.get(sql)
    if (!stmt) {
        stmt = DB.prepare(sql)
        preparedStatements.set(sql, stmt)
    }
    // Build a runner that invokes the named statement method with the bound values.
    const runner = method => () => stmt[method](...params)
    return {
        get: runner("get"),
        run: runner("run"),
        all: runner("all"),
        statement: stmt
    }
}
/**
 * Pause helper for async code.
 * @param {number} timeout - delay in milliseconds
 * @returns {Promise<undefined>} a promise that resolves once the delay has elapsed
 */
const wait = (timeout) =>
    new Promise((done) => {
        setTimeout(done, timeout)
    })
// Shared Lens OCR client; options come from ocr_config.json.
const lens = new Lens(ocrConfig.lens_options || {})

// Upper bound (in pixels) on both the width and the height of any region sent for OCR.
const chunkDim = ocrConfig.image_dim
if (!Number.isInteger(chunkDim) || chunkDim <= 0) {
    // A missing value would give NaN dimensions (and silently store empty text), and zero
    // would make the chunk loop never advance, so fail loudly up front.
    throw new Error(`ocr_config.json: image_dim must be a positive integer, got ${chunkDim}`)
}

// Number of OCR attempts per chunk and the pause between them.
const maxScanAttempts = 10
const retryDelayMs = 500

/**
 * Run OCR on one encoded image buffer, retrying on errors.
 * @param {Buffer} buffer - encoded image to scan
 * @returns {Promise<object|null>} the scan result, or null once the retry limit is reached
 */
async function scanWithRetry(buffer) {
    for (let remaining = maxScanAttempts; remaining > 0; remaining--) {
        try {
            const chunk = await lens.scanByBuffer(buffer)
            if (chunk) return chunk
            console.log("OCR failed, retry", "?", remaining)
        } catch (e) {
            console.log("OCR failed, retry", e.body ? "?" : e, remaining)
        }
        // No point in sleeping after the final attempt.
        if (remaining > 1) await wait(retryDelayMs)
    }
    console.log("retry limit")
    return null
}

/**
 * OCR a single image file.
 *
 * The image is scaled down (never up) so its width is at most `chunkDim`, keeping the
 * aspect ratio, then scanned in horizontal strips at most `chunkDim` pixels tall.
 *
 * @param {string} filepath - path of the image on disk
 * @returns {Promise<{text: string, segments: object[]}|null>} recognised text and
 *   segment boxes (in coordinates of the scaled image), or null if any strip could not
 *   be scanned within the retry limit
 * @throws if the image cannot be read or processed
 */
async function ocrImage(filepath) {
    const metadata = await sharp(filepath).metadata()
    console.log(metadata.width, metadata.height)
    const newWidth = Math.min(metadata.width, chunkDim)
    const newHeight = Math.ceil(metadata.height * (newWidth / metadata.width))
    const lines = []
    const segments = []
    for (let y = 0; y < newHeight; y += chunkDim) {
        const strip = await sharp(filepath)
            .resize(newWidth, newHeight, { fit: "fill" })
            .extract({
                left: 0,
                width: newWidth,
                top: y,
                height: Math.min(chunkDim, newHeight - y)
            })
            .png()
            .toBuffer()
        const chunk = await scanWithRetry(strip)
        if (!chunk) return null
        // they appear to be in the "right order" out of the API anyway
        for (const segment of chunk.segments) {
            const box = segment.boundingBox.pixelCoords
            lines.push(segment.text)
            segments.push({
                text: segment.text,
                x: box.x,
                // Strip-relative coordinate shifted into the coordinates of the whole scaled image.
                y: box.y + y,
                width: box.width,
                height: box.height
            })
        }
    }
    return { text: lines.join("\n").trim(), segments }
}

// Replace a file's OCR row atomically. An explicit DELETE is used instead of
// INSERT OR REPLACE because SQLite does not fire DELETE triggers for rows removed by
// REPLACE conflict resolution (unless recursive_triggers is on), which would leave the
// previous text in the ocr_fts index.
const storeResult = DB.transaction((filename, scanTime, text, rawSegments) => {
    SQL`DELETE FROM ocr WHERE filename = ${filename}`.run()
    SQL`INSERT INTO ocr VALUES (${filename}, ${scanTime}, ${text}, ${rawSegments})`.run()
})

// Scan every file that has no OCR row yet or was modified after its last scan.
for (const file of SQL`SELECT files.filename FROM files LEFT JOIN ocr ON files.filename = ocr.filename WHERE ocr.scan_time IS NULL OR ocr.scan_time < files.modtime`.all()) {
    console.log(file.filename)
    const filepath = path.join(memeSearchConfig.files, file.filename)
    let result
    try {
        result = await ocrImage(filepath)
    } catch (e) {
        // One unreadable file must not abort the whole run (and every later run).
        console.log("could not process image, skipping", file.filename, e)
        continue
    }
    // Retry limit reached: leave the file unscanned so a later run picks it up again.
    if (!result) continue
    // scan_time is an INTEGER column holding Unix seconds, so drop the fractional part.
    storeResult(file.filename, Math.floor(Date.now() / 1000), result.text, JSON.stringify(result.segments))
}

6
ocr_config.json Normal file
View File

@ -0,0 +1,6 @@
{
"image_dim": 1000,
"lens_options": {
}
}

1364
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

7
package.json Normal file
View File

@ -0,0 +1,7 @@
{
"dependencies": {
"better-sqlite3": "^10.0.0",
"chrome-lens-ocr": "^2.0.1",
"sharp": "^0.33.4"
}
}