mirror of
https://github.com/osmarks/meme-search-engine.git
synced 2024-12-30 12:00:31 +00:00
preliminary work on OCR
This commit is contained in:
parent
a3574674d0
commit
6491e02e88
107
ocr.mjs
Normal file
107
ocr.mjs
Normal file
@@ -0,0 +1,107 @@
|
||||
import Lens from 'chrome-lens-ocr'
|
||||
import sharp from "sharp"
|
||||
import fs from "fs/promises"
|
||||
import sqlite3 from "better-sqlite3"
|
||||
import path from "path"
|
||||
|
||||
import memeSearchConfig from "./mse_config.json" with { type: "json" }
|
||||
import ocrConfig from "./ocr_config.json" with { type: "json" }
|
||||
|
||||
// Open the SQLite database at the path given in mse_config.json.
const DB = sqlite3(memeSearchConfig.db_path)

// With REPLACE conflict resolution, SQLite only fires DELETE triggers for the
// rows it removes when recursive triggers are enabled. Without this pragma an
// `INSERT OR REPLACE INTO ocr` would drop the old row without running
// ocr_fts_del, leaving the old text searchable in ocr_fts.
DB.pragma("recursive_triggers = ON")

// Schema for OCR results:
//  - `ocr` holds one row per file (keyed by filename, which references the
//    `files` table; that table is not created here and is expected to exist).
//  - `ocr_fts` is an FTS5 external-content index over `ocr`; it stores no copy
//    of the text itself, so it must be kept in sync by the triggers below.
//  - The insert/delete/update triggers mirror every change of `ocr` into the
//    index. Updates are expressed as "delete old entry, insert new entry".
DB.exec(`
CREATE TABLE IF NOT EXISTS ocr (
    filename TEXT PRIMARY KEY REFERENCES files(filename),
    scan_time INTEGER NOT NULL,
    text TEXT NOT NULL,
    raw_segments TEXT
);

CREATE VIRTUAL TABLE IF NOT EXISTS ocr_fts USING fts5 (
    filename,
    text,
    tokenize='unicode61 remove_diacritics 2',
    content='ocr'
);

CREATE TRIGGER IF NOT EXISTS ocr_fts_ins AFTER INSERT ON ocr BEGIN
    INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, new.text);
END;

CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER DELETE ON ocr BEGIN
    INSERT INTO ocr_fts (ocr_fts, rowid, filename, text) VALUES ('delete', old.rowid, old.filename, old.text);
END;

CREATE TRIGGER IF NOT EXISTS ocr_fts_upd AFTER UPDATE ON ocr BEGIN
    INSERT INTO ocr_fts (ocr_fts, rowid, filename, text) VALUES ('delete', old.rowid, old.filename, old.text);
    INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, new.text);
END;
`)
|
||||
|
||||
// Compiled statements, keyed by their SQL text (with "?" placeholders), so each
// distinct query is only prepared once.
const preparedStatements = new Map()

/**
 * Tagged-template helper for parameterised queries.
 *
 * The literal parts of the template are joined with "?" to form the SQL text
 * and the interpolated values become the bound parameters, e.g.
 * SQL`SELECT * FROM t WHERE a = ${x}`.get().
 *
 * @param {string[]} strings literal segments of the template
 * @param {...any} params interpolated values, bound positionally
 * @returns {{get: Function, run: Function, all: Function, statement: object}}
 *   zero-argument runners that call the statement's get/run/all with the
 *   bound parameters, plus the underlying prepared statement
 */
const SQL = (strings, ...params) => {
    const sql = strings.join("?")
    let stmt = preparedStatements.get(sql)
    if (!stmt) {
        stmt = DB.prepare(sql)
        preparedStatements.set(sql, stmt)
    }
    // Build a runner that invokes the named statement method with the params.
    const runner = method => () => stmt[method](...params)
    return {
        get: runner("get"),
        run: runner("run"),
        all: runner("all"),
        statement: stmt
    }
}
|
||||
|
||||
/**
 * Pause helper for async code.
 * @param {number} timeout delay in milliseconds
 * @returns {Promise<undefined>} resolves (with no value) once the delay elapsed
 */
const wait = timeout => new Promise(done => {
    setTimeout(done, timeout)
})
|
||||
|
||||
// OCR client; options come from ocr_config.json (empty object when unset).
const lens = new Lens(ocrConfig.lens_options || {})

/**
 * Run OCR on one image buffer, retrying on failure.
 *
 * A falsy response is treated like a thrown error so that it is retried with
 * the same delay and limit instead of being looped on indefinitely.
 *
 * @param {Buffer} buffer encoded image data to scan
 * @param {number} [attempts=10] maximum number of scan attempts
 * @returns {Promise<object|null>} the scan result, or null when every attempt failed
 */
const scanWithRetry = async (buffer, attempts = 10) => {
    for (let remaining = attempts; remaining > 0; remaining--) {
        try {
            const result = await lens.scanByBuffer(buffer)
            if (!result) throw new Error("empty OCR response")
            return result
        } catch (e) {
            console.log("OCR failed, retry", e.body ? "?" : e, remaining)
            await wait(500)
        }
    }
    console.log("retry limit")
    return null
}

/**
 * Store the OCR result for a file, replacing any previous row.
 *
 * Uses an explicit DELETE + INSERT inside one transaction rather than
 * INSERT OR REPLACE: by default SQLite does not fire DELETE triggers for rows
 * removed by REPLACE, which would leave the old text in the ocr_fts index.
 */
const storeResult = DB.transaction((filename, text, segments) => {
    SQL`DELETE FROM ocr WHERE filename = ${filename}`.run()
    // scan_time is seconds since the epoch, including the fractional part.
    SQL`INSERT INTO ocr VALUES (${filename}, ${Date.now() / 1000}, ${text}, ${JSON.stringify(segments)})`.run()
})

// Files that have no OCR row yet, or were modified after their last scan.
const pendingFiles = SQL`SELECT files.filename FROM files LEFT JOIN ocr ON files.filename = ocr.filename WHERE ocr.scan_time IS NULL OR ocr.scan_time < files.modtime`.all()

for (const file of pendingFiles) {
    console.log(file.filename)
    const filepath = path.join(memeSearchConfig.files, file.filename)
    const metadata = await sharp(filepath).metadata()
    console.log(metadata.width, metadata.height)

    // Scale down so the width is at most image_dim, keeping the aspect ratio.
    const newWidth = Math.min(metadata.width, ocrConfig.image_dim)
    const newHeight = Math.ceil(metadata.height * (newWidth / metadata.width))

    let text = ""
    const segments = []
    let failed = false

    // Tall images are scanned as a stack of horizontal tiles, each at most
    // image_dim pixels high; `y` is the top edge of the current tile.
    for (let y = 0; y < newHeight; y += ocrConfig.image_dim) {
        const tile = await sharp(filepath).resize(newWidth, newHeight, { fit: "fill" }).extract({
            left: 0,
            width: newWidth,
            top: y,
            height: Math.min(ocrConfig.image_dim, newHeight - y)
        }).png().toBuffer()

        const chunk = await scanWithRetry(tile)
        if (!chunk) {
            failed = true
            break
        }

        // they appear to be in the "right order" out of the API anyway
        for (const segment of chunk.segments) {
            const coords = segment.boundingBox.pixelCoords
            text += segment.text + "\n"
            segments.push({
                text: segment.text,
                x: coords.x,
                // Offset by the tile's top edge so that y refers to the whole
                // (resized) image rather than to the tile.
                y: coords.y + y,
                width: coords.width,
                height: coords.height
            })
        }
    }

    // A file with any failed tile is left unrecorded, so the query above
    // selects it again on the next run.
    if (failed) continue

    storeResult(file.filename, text.trim(), segments)
}
|
6
ocr_config.json
Normal file
6
ocr_config.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"image_dim": 1000,
|
||||
"lens_options": {
|
||||
|
||||
}
|
||||
}
|
1364
package-lock.json
generated
Normal file
1364
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
7
package.json
Normal file
7
package.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"dependencies": {
|
||||
"better-sqlite3": "^10.0.0",
|
||||
"chrome-lens-ocr": "^2.0.1",
|
||||
"sharp": "^0.33.4"
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user