From d3fcedda09842e8a502bb5047ca20f3d7752cf9c Mon Sep 17 00:00:00 2001 From: osmarks Date: Sat, 18 Jan 2025 07:25:21 +0000 Subject: [PATCH] cleanup --- .gitignore | 3 + misc/bad-go-version/go.mod | 26 - misc/bad-go-version/go.sum | 100 --- misc/bad-go-version/meme_search.go | 877 -------------------- misc/bad-go-version/ocr.go | 264 ------ misc/bad-go-version/problematic_thing.go | 891 --------------------- misc/bad-go-version/problematic_thing_2.go | 265 ------ misc/mse_accursed.py | 212 ----- misc/train_xgboost.py | 19 - 9 files changed, 3 insertions(+), 2654 deletions(-) delete mode 100644 misc/bad-go-version/go.mod delete mode 100644 misc/bad-go-version/go.sum delete mode 100644 misc/bad-go-version/meme_search.go delete mode 100644 misc/bad-go-version/ocr.go delete mode 100644 misc/bad-go-version/problematic_thing.go delete mode 100644 misc/bad-go-version/problematic_thing_2.go delete mode 100644 misc/mse_accursed.py delete mode 100644 misc/train_xgboost.py diff --git a/.gitignore b/.gitignore index 55883d8..9c2d619 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,6 @@ diskann/target */flamegraph.svg *.hdf5 *.v +shards +index +queries.txt diff --git a/misc/bad-go-version/go.mod b/misc/bad-go-version/go.mod deleted file mode 100644 index 5dd9b45..0000000 --- a/misc/bad-go-version/go.mod +++ /dev/null @@ -1,26 +0,0 @@ -module meme-search - -go 1.22.2 - -require ( - github.com/DataIntelligenceCrew/go-faiss v0.2.0 - github.com/jmoiron/sqlx v1.4.0 - github.com/mattn/go-sqlite3 v1.14.22 - github.com/samber/lo v1.39.0 - github.com/titanous/json5 v1.0.0 - github.com/vmihailenco/msgpack v4.0.4+incompatible - github.com/x448/float16 v0.8.4 - golang.org/x/sync v0.7.0 -) - -require ( - github.com/davidbyttow/govips/v2 v2.14.0 // indirect - github.com/golang/protobuf v1.5.2 // indirect - github.com/h2non/bimg v1.1.9 // indirect - golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17 // indirect - golang.org/x/image v0.16.0 // indirect - golang.org/x/net v0.25.0 // indirect - golang.org/x/text v0.15.0 // indirect - google.golang.org/appengine v1.6.8 // indirect - google.golang.org/protobuf v1.26.0 // indirect -) diff --git a/misc/bad-go-version/go.sum b/misc/bad-go-version/go.sum deleted file mode 100644 index 02a29a2..0000000 --- a/misc/bad-go-version/go.sum +++ /dev/null @@ -1,100 +0,0 @@ -filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= -github.com/DataIntelligenceCrew/go-faiss v0.2.0 h1:c0pxAr0vldXIuE4DZnqpl6FuuH1uZd45d+NiQHKg1uU= -github.com/DataIntelligenceCrew/go-faiss v0.2.0/go.mod h1:4Gi7G3PF78IwZigTL2M1AJXOaAgxyL66vCqUYVaNgwk= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davidbyttow/govips/v2 v2.14.0 h1:il3pX0XMZ5nlwipkFJHRZ3vGzcdXWApARalJxNpRHJU= -github.com/davidbyttow/govips/v2 v2.14.0/go.mod h1:eglyvgm65eImDiJJk4wpj9LSz4pWivPzWgDqkxWJn5k= -github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= -github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/h2non/bimg v1.1.9 h1:WH20Nxko9l/HFm4kZCA3Phbgu2cbHvYzxwxn9YROEGg= -github.com/h2non/bimg v1.1.9/go.mod 
h1:R3+UiYwkK4rQl6KVFTOFJHitgLbZXBZNFh2cv3AEbp8= -github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= -github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= -github.com/json5/json5-go v0.0.0-20160331055859-40c2958e3bf8 h1:BQuwfXQRDQMI8YNqINKNlFV23P0h07ZvOQAtezAEsP8= -github.com/json5/json5-go v0.0.0-20160331055859-40c2958e3bf8/go.mod h1:7n1PdYNh4RIHTvILru80IEstTADqQz/wmjeNXTcC9rA= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= -github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= -github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/samber/lo v1.39.0 h1:4gTz1wUhNYLhFSKl6O+8peW0v2F4BCY034GRpU9WnuA= -github.com/samber/lo v1.39.0/go.mod h1:+m/ZKRl6ClXCE2Lgf3MsQlWfh4bn1bz6CXEOxnEXnEA= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/thoas/go-funk v0.9.3 h1:7+nAEx3kn5ZJcnDm2Bh23N2yOtweO14bi//dvRtgLpw= -github.com/thoas/go-funk v0.9.3/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q= -github.com/titanous/json5 v1.0.0 h1:hJf8Su1d9NuI/ffpxgxQfxh/UiBFZX7bMPid0rIL/7s= -github.com/titanous/json5 v1.0.0/go.mod h1:7JH1M8/LHKc6cyP5o5g3CSaRj+mBrIimTxzpvmckH8c= -github.com/vmihailenco/msgpack v4.0.4+incompatible h1:dSLoQfGFAo3F6OoNhwUmLwVgaUXK79GlxNBwueZn0xI= -github.com/vmihailenco/msgpack v4.0.4+incompatible/go.mod h1:fy3FlTQTDXWkZ7Bh6AcGMlsjHatGryHQYUTf1ShIgkk= -github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= -github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= -golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17 h1:3MTrJm4PyNL9NBqvYDSj3DHl46qQakyfqfWo4jgfaEM= -golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17/go.mod h1:lgLbSvA5ygNOMpwM/9anMpWVlVJ7Z+cHWq/eFuinpGE= -golang.org/x/image v0.10.0/go.mod h1:jtrku+n79PfroUbvDdeUWMAI+heR786BofxrbiSF+J0= -golang.org/x/image v0.16.0 h1:9kloLAKhUufZhA12l5fwnx2NZW39/we1UhBesW433jw= -golang.org/x/image v0.16.0/go.mod h1:ugSZItdV4nOxyqp56HmXwH0Ry0nBCpjnZdpDaIHdoPs= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod 
h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= -golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= -golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= -golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= -golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= -golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= -golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= -golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= -golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod 
h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= -google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0 h1:bxAC2xTBsZGibn2RTntX0oH50xLsqy1OxA9tTL3p/lk= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/misc/bad-go-version/meme_search.go b/misc/bad-go-version/meme_search.go deleted file mode 100644 index 81fc2b7..0000000 --- a/misc/bad-go-version/meme_search.go +++ /dev/null @@ -1,877 +0,0 @@ -package main - -import ( - "bytes" - "encoding/base64" - "encoding/json" - "fmt" - "hash/fnv" - "io" - "log" - "net/http" - "os" - "path/filepath" - "runtime" - "runtime/pprof" - "strings" - "sync" - "time" - - "github.com/DataIntelligenceCrew/go-faiss" - "github.com/h2non/bimg" - "github.com/jmoiron/sqlx" - _ "github.com/mattn/go-sqlite3" - "github.com/samber/lo" - "github.com/vmihailenco/msgpack" - "github.com/x448/float16" - "golang.org/x/sync/errgroup" -) - -type Config struct { - ClipServer string `json:"clip_server"` - DbPath string `json:"db_path"` - Port int16 `json:"port"` - Files string `json:"files"` - EnableOCR bool `json:"enable_ocr"` - ThumbsPath string `json:"thumbs_path"` - EnableThumbnails bool `json:"enable_thumbs"` -} - -type Index struct { - vectors *faiss.IndexImpl - filenames []string - formatCodes []int64 - formatNames []string -} - -var schema = ` -CREATE TABLE IF NOT EXISTS files ( - filename TEXT PRIMARY KEY, - embedding_time INTEGER, - ocr_time INTEGER, - thumbnail_time INTEGER, - embedding BLOB, - ocr TEXT, - raw_ocr_segments BLOB, - thumbnails BLOB -); - -CREATE VIRTUAL TABLE IF NOT EXISTS ocr_fts USING fts5 ( - filename, - ocr, - tokenize='unicode61 remove_diacritics 2', - content='ocr' -); - -CREATE TRIGGER IF NOT EXISTS ocr_fts_ins AFTER INSERT ON files BEGIN - INSERT INTO ocr_fts (rowid, filename, ocr) VALUES (new.rowid, new.filename, COALESCE(new.ocr, '')); -END; - -CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER DELETE ON files BEGIN - INSERT INTO ocr_fts (ocr_fts, rowid, filename, ocr) VALUES ('delete', old.rowid, old.filename, COALESCE(old.ocr, '')); -END; - -CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER UPDATE ON files BEGIN - INSERT INTO ocr_fts (ocr_fts, rowid, filename, ocr) VALUES ('delete', old.rowid, old.filename, COALESCE(old.ocr, '')); - INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, COALESCE(new.ocr, '')); -END; -` - -type FileRecord struct { - Filename string `db:"filename"` - EmbedTime int64 `db:"embedding_time"` - OcrTime int64 `db:"ocr_time"` - ThumbnailTime int64 `db:"thumbnail_time"` - Embedding []byte `db:"embedding"` - Ocr string `db:"ocr"` - RawOcrSegments []byte `db:"raw_ocr_segments"` - Thumbnails []byte `db:"thumbnails"` -} - -type InferenceServerConfig struct { - BatchSize uint `msgpack:"batch"` - ImageSize []uint `msgpack:"image_size"` - EmbeddingSize uint `msgpack:"embedding_size"` -} - -func decodeMsgpackFrom[O 
interface{}](resp *http.Response) (O, error) { - var result O - respData, err := io.ReadAll(resp.Body) - if err != nil { - return result, err - } - err = msgpack.Unmarshal(respData, &result) - return result, err -} - -func queryClipServer[I interface{}, O interface{}](config Config, path string, data I) (O, error) { - var result O - b, err := msgpack.Marshal(data) - if err != nil { - return result, err - } - resp, err := http.Post(config.ClipServer+path, "application/msgpack", bytes.NewReader(b)) - if err != nil { - return result, err - } - defer resp.Body.Close() - return decodeMsgpackFrom[O](resp) -} - -type LoadedImage struct { - image *bimg.Image - filename string - originalSize int -} - -type EmbeddingInput struct { - image []byte - filename string -} - -type EmbeddingRequest struct { - Images [][]byte `msgpack:"images"` - Text []string `msgpack:"text"` -} - -type EmbeddingResponse = [][]byte - -func timestamp() int64 { - return time.Now().UnixMicro() -} - -type ImageFormatConfig struct { - targetWidth int - targetFilesize int - quality int - format bimg.ImageType - extension string -} - -func generateFilenameHash(filename string) string { - hasher := fnv.New128() - hasher.Write([]byte(filename)) - hash := hasher.Sum(make([]byte, 0)) - return base64.RawURLEncoding.EncodeToString(hash) -} - -func generateThumbnailFilename(filename string, formatName string, formatConfig ImageFormatConfig) string { - return fmt.Sprintf("%s%s.%s", generateFilenameHash(filename), formatName, formatConfig.extension) -} - -func initializeDatabase(config Config) (*sqlx.DB, error) { - db, err := sqlx.Connect("sqlite3", config.DbPath) - if err != nil { - return nil, err - } - _, err = db.Exec("PRAGMA busy_timeout = 2000; PRAGMA journal_mode = WAL") - if err != nil { - return nil, err - } - return db, nil -} - -func imageFormats(config Config) map[string]ImageFormatConfig { - return map[string]ImageFormatConfig{ - "jpegl": { - targetWidth: 800, - quality: 70, - format: bimg.JPEG, - extension: "jpg", - }, - "jpegh": { - targetWidth: 1600, - quality: 80, - format: bimg.JPEG, - extension: "jpg", - }, - "jpeg256kb": { - targetWidth: 500, - targetFilesize: 256000, - format: bimg.JPEG, - extension: "jpg", - }, - "avifh": { - targetWidth: 1600, - quality: 80, - format: bimg.AVIF, - extension: "avif", - }, - "avifl": { - targetWidth: 800, - quality: 30, - format: bimg.AVIF, - extension: "avif", - }, - } -} - -func ingestFiles(config Config, backend InferenceServerConfig) error { - var wg errgroup.Group - var iwg errgroup.Group - - // We assume everything is either a modern browser (low-DPI or high-DPI), an ancient browser or a ComputerCraft machine abusing Extra Utilities 2 screens. 
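// NOTE (editorial sketch, not part of the original patch): the jpeg256kb entry
// in imageFormats above sets targetFilesize, and the thumbnail workers further
// down in this function approximate a size target by binary-searching encoder
// quality. The idea in isolation, with a hypothetical encodeAtQuality helper --
// observe that the real loop below compares len(resized) against the *original*
// file size, so the 256 kB target value itself is never actually consulted:
//
//	lb, ub := 1, 100
//	var out []byte
//	for lb < ub {
//		q := (lb + ub) / 2
//		out = encodeAtQuality(img, q) // hypothetical
//		if len(out) > targetFilesize {
//			ub = q // too large: reduce quality
//		} else {
//			lb = q + 1 // under target: try higher quality
//		}
//	}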
- var formats = imageFormats(config) - - db, err := initializeDatabase(config) - if err != nil { - return err - } - defer db.Close() - - toProcess := make(chan FileRecord, 100) - toEmbed := make(chan EmbeddingInput, backend.BatchSize) - toThumbnail := make(chan LoadedImage, 30) - toOCR := make(chan LoadedImage, 30) - embedBatches := make(chan []EmbeddingInput, 1) - - // image loading and preliminary resizing - for range runtime.NumCPU() { - iwg.Go(func() error { - for record := range toProcess { - path := filepath.Join(config.Files, record.Filename) - buffer, err := bimg.Read(path) - if err != nil { - log.Println("could not read ", record.Filename) - } - img := bimg.NewImage(buffer) - if record.Embedding == nil { - resized, err := img.Process(bimg.Options{ - Width: int(backend.ImageSize[0]), - Height: int(backend.ImageSize[1]), - Force: true, - Type: bimg.PNG, - Interpretation: bimg.InterpretationSRGB, - }) - if err != nil { - log.Println("resize failure", record.Filename, err) - } else { - toEmbed <- EmbeddingInput{ - image: resized, - filename: record.Filename, - } - } - } - if record.Thumbnails == nil && config.EnableThumbnails { - toThumbnail <- LoadedImage{ - image: img, - filename: record.Filename, - originalSize: len(buffer), - } - } - if record.RawOcrSegments == nil && config.EnableOCR { - toOCR <- LoadedImage{ - image: img, - filename: record.Filename, - } - } - } - return nil - }) - } - - if config.EnableThumbnails { - for range runtime.NumCPU() { - wg.Go(func() error { - for image := range toThumbnail { - generatedFormats := make([]string, 0) - for formatName, formatConfig := range formats { - var err error - var resized []byte - if formatConfig.targetFilesize != 0 { - lb := 1 - ub := 100 - for { - quality := (lb + ub) / 2 - resized, err = image.image.Process(bimg.Options{ - Width: formatConfig.targetWidth, - Type: formatConfig.format, - Speed: 4, - Quality: quality, - StripMetadata: true, - Enlarge: false, - }) - if len(resized) > image.originalSize { - ub = quality - } else { - lb = quality + 1 - } - if lb >= ub { - break - } - } - } else { - resized, err = image.image.Process(bimg.Options{ - Width: formatConfig.targetWidth, - Type: formatConfig.format, - Speed: 4, - Quality: formatConfig.quality, - StripMetadata: true, - Enlarge: false, - }) - } - if err != nil { - log.Println("thumbnailing failure", image.filename, err) - continue - } - if len(resized) < image.originalSize { - generatedFormats = append(generatedFormats, formatName) - err = bimg.Write(filepath.Join(config.ThumbsPath, generateThumbnailFilename(image.filename, formatName, formatConfig)), resized) - if err != nil { - return err - } - } - } - formatsData, err := msgpack.Marshal(generatedFormats) - if err != nil { - return err - } - _, err = db.Exec("UPDATE files SET thumbnails = ?, thumbnail_time = ? WHERE filename = ?", formatsData, timestamp(), image.filename) - if err != nil { - return err - } - } - return nil - }) - } - } - - if config.EnableOCR { - for range 100 { - wg.Go(func() error { - for image := range toOCR { - scan, err := scanImage(image.image) - if err != nil { - log.Println("OCR failure", image.filename, err) - continue - } - ocrText := "" - for _, segment := range scan { - ocrText += segment.text - ocrText += "\n" - } - ocrData, err := msgpack.Marshal(scan) - if err != nil { - return err - } - _, err = db.Exec("UPDATE files SET ocr = ?, raw_ocr_segments = ?, ocr_time = ? 
WHERE filename = ?", ocrText, ocrData, timestamp(), image.filename) - if err != nil { - return err - } - } - return nil - }) - } - } - - wg.Go(func() error { - buffer := make([]EmbeddingInput, 0, backend.BatchSize) - for input := range toEmbed { - buffer = append(buffer, input) - if len(buffer) == int(backend.BatchSize) { - embedBatches <- buffer - buffer = make([]EmbeddingInput, 0, backend.BatchSize) - } - } - if len(buffer) > 0 { - embedBatches <- buffer - } - close(embedBatches) - return nil - }) - - for range 3 { - wg.Go(func() error { - for batch := range embedBatches { - result, err := queryClipServer[EmbeddingRequest, EmbeddingResponse](config, "", EmbeddingRequest{ - Images: lo.Map(batch, func(item EmbeddingInput, _ int) []byte { return item.image }), - }) - if err != nil { - return err - } - - tx, err := db.Begin() - if err != nil { - return err - } - for i, vector := range result { - _, err = tx.Exec("UPDATE files SET embedding_time = ?, embedding = ? WHERE filename = ?", timestamp(), vector, batch[i].filename) - if err != nil { - return err - } - } - err = tx.Commit() - if err != nil { - return err - } - } - return nil - }) - } - - filenamesOnDisk := make(map[string]struct{}) - - err = filepath.WalkDir(config.Files, func(path string, d os.DirEntry, err error) error { - filename := strings.TrimPrefix(path, config.Files) - if err != nil { - return err - } - if d.IsDir() { - return nil - } - filenamesOnDisk[filename] = struct{}{} - records := []FileRecord{} - err = db.Select(&records, "SELECT * FROM files WHERE filename = ?", filename) - if err != nil { - return err - } - stat, err := d.Info() - if err != nil { - return err - } - modtime := stat.ModTime().UnixMicro() - if len(records) == 0 || modtime > records[0].EmbedTime || modtime > records[0].OcrTime || modtime > records[0].ThumbnailTime { - _, err = db.Exec("INSERT OR IGNORE INTO files VALUES (?, 0, 0, 0, '', '', '', '')", filename) - if err != nil { - return err - } - record := FileRecord{ - Filename: filename, - } - if len(records) > 0 { - record = records[0] - } - if modtime > record.EmbedTime || len(record.Embedding) == 0 { - record.Embedding = nil - } - if modtime > record.OcrTime || len(record.RawOcrSegments) == 0 { - record.RawOcrSegments = nil - } - if modtime > record.ThumbnailTime || len(record.Thumbnails) == 0 { - record.Thumbnails = nil - } - toProcess <- record - } - return nil - }) - if err != nil { - return err - } - close(toProcess) - - err = iwg.Wait() - close(toEmbed) - close(toThumbnail) - if err != nil { - return err - } - err = wg.Wait() - if err != nil { - return err - } - - rows, err := db.Queryx("SELECT filename FROM files") - if err != nil { - return err - } - tx, err := db.Begin() - if err != nil { - return err - } - for rows.Next() { - var filename string - err := rows.Scan(&filename) - if err != nil { - return err - } - if _, ok := filenamesOnDisk[filename]; !ok { - _, err = tx.Exec("DELETE FROM files WHERE filename = ?", filename) - if err != nil { - return err - } - } - } - if err = tx.Commit(); err != nil { - return err - } - - return nil -} - -const INDEX_ADD_BATCH = 512 - -func buildIndex(config Config, backend InferenceServerConfig) (Index, error) { - var index Index - - db, err := initializeDatabase(config) - if err != nil { - return index, err - } - defer db.Close() - - newFAISSIndex, err := faiss.IndexFactory(int(backend.EmbeddingSize), "SQfp16", faiss.MetricInnerProduct) - if err != nil { - return index, err - } - index.vectors = newFAISSIndex - - var count int - err = db.Get(&count, 
"SELECT COUNT(*) FROM files") - if err != nil { - return index, err - } - - index.filenames = make([]string, 0, count) - index.formatCodes = make([]int64, 0, count) - buffer := make([]float32, 0, INDEX_ADD_BATCH*backend.EmbeddingSize) - index.formatNames = make([]string, 0, 5) - - record := FileRecord{} - rows, err := db.Queryx("SELECT * FROM files") - if err != nil { - return index, err - } - for rows.Next() { - err := rows.StructScan(&record) - if err != nil { - return index, err - } - if len(record.Embedding) > 0 { - index.filenames = append(index.filenames, record.Filename) - for i := 0; i < len(record.Embedding); i += 2 { - buffer = append(buffer, float16.Frombits(uint16(record.Embedding[i])+uint16(record.Embedding[i+1])<<8).Float32()) - } - if len(buffer) == cap(buffer) { - index.vectors.Add(buffer) - buffer = make([]float32, 0, INDEX_ADD_BATCH*backend.EmbeddingSize) - } - - formats := make([]string, 0, 5) - if len(record.Thumbnails) > 0 { - err := msgpack.Unmarshal(record.Thumbnails, &formats) - if err != nil { - return index, err - } - } - - formatCode := int64(0) - for _, formatString := range formats { - found := false - for i, name := range index.formatNames { - if name == formatString { - formatCode |= 1 << i - found = true - break - } - } - if !found { - newIndex := len(index.formatNames) - formatCode |= 1 << newIndex - index.formatNames = append(index.formatNames, formatString) - } - } - index.formatCodes = append(index.formatCodes, formatCode) - } - } - if len(buffer) > 0 { - index.vectors.Add(buffer) - } - - return index, nil -} - -func decodeFP16Buffer(buf []byte) []float32 { - out := make([]float32, 0, len(buf)/2) - for i := 0; i < len(buf); i += 2 { - out = append(out, float16.Frombits(uint16(buf[i])+uint16(buf[i+1])<<8).Float32()) - } - return out -} - -type EmbeddingVector []float32 - -type QueryResult struct { - Matches [][]interface{} `json:"matches"` - Formats []string `json:"formats"` - Extensions map[string]string `json:"extensions"` -} - -// this terrible language cannot express tagged unions -type QueryTerm struct { - Embedding *EmbeddingVector `json:"embedding"` - Image *string `json:"image"` // base64 - Text *string `json:"text"` - Weight *float32 `json:"weight"` -} - -type QueryRequest struct { - Terms []QueryTerm `json:"terms"` - K *int `json:"k"` -} - -func queryIndex(index *Index, query EmbeddingVector, k int) (QueryResult, error) { - var qr QueryResult - distances, ids, err := index.vectors.Search(query, int64(k)) - if err != nil { - return qr, err - } - items := lo.Map(lo.Zip2(distances, ids), func(x lo.Tuple2[float32, int64], i int) []interface{} { - return []interface{}{ - x.A, - index.filenames[x.B], - generateFilenameHash(index.filenames[x.B]), - index.formatCodes[x.B], - } - }) - - return QueryResult{ - Matches: items, - Formats: index.formatNames, - }, nil -} - -func handleRequest(config Config, backendConfig InferenceServerConfig, index *Index, w http.ResponseWriter, req *http.Request) error { - if req.Body == nil { - io.WriteString(w, "OK") // health check - return nil - } - dec := json.NewDecoder(req.Body) - var qreq QueryRequest - err := dec.Decode(&qreq) - if err != nil { - return err - } - - totalEmbedding := make(EmbeddingVector, backendConfig.EmbeddingSize) - - imageBatch := make([][]byte, 0) - imageWeights := make([]float32, 0) - textBatch := make([]string, 0) - textWeights := make([]float32, 0) - - for _, term := range qreq.Terms { - if term.Image != nil { - bytes, err := base64.StdEncoding.DecodeString(*term.Image) - if err != nil { - 
return err - } - loaded := bimg.NewImage(bytes) - resized, err := loaded.Process(bimg.Options{ - Width: int(backendConfig.ImageSize[0]), - Height: int(backendConfig.ImageSize[1]), - Force: true, - Type: bimg.PNG, - Interpretation: bimg.InterpretationSRGB, - }) - if err != nil { - return err - } - imageBatch = append(imageBatch, resized) - if term.Weight != nil { - imageWeights = append(imageWeights, *term.Weight) - } else { - imageWeights = append(imageWeights, 1) - } - } - if term.Text != nil { - textBatch = append(textBatch, *term.Text) - if term.Weight != nil { - textWeights = append(textWeights, *term.Weight) - } else { - textWeights = append(textWeights, 1) - } - } - if term.Embedding != nil { - weight := float32(1.0) - if term.Weight != nil { - weight = *term.Weight - } - for i := 0; i < int(backendConfig.EmbeddingSize); i += 1 { - totalEmbedding[i] += (*term.Embedding)[i] * weight - } - } - } - - if len(imageBatch) > 0 { - embs, err := queryClipServer[EmbeddingRequest, EmbeddingResponse](config, "/", EmbeddingRequest{ - Images: imageBatch, - }) - if err != nil { - return err - } - for j, emb := range embs { - embd := decodeFP16Buffer(emb) - for i := 0; i < int(backendConfig.EmbeddingSize); i += 1 { - totalEmbedding[i] += embd[i] * imageWeights[j] - } - } - } - if len(textBatch) > 0 { - embs, err := queryClipServer[EmbeddingRequest, EmbeddingResponse](config, "/", EmbeddingRequest{ - Text: textBatch, - }) - if err != nil { - return err - } - for j, emb := range embs { - embd := decodeFP16Buffer(emb) - for i := 0; i < int(backendConfig.EmbeddingSize); i += 1 { - totalEmbedding[i] += embd[i] * textWeights[j] - } - } - } - - k := 1000 - if qreq.K != nil { - k = *qreq.K - } - - w.Header().Add("Content-Type", "application/json") - enc := json.NewEncoder(w) - - qres, err := queryIndex(index, totalEmbedding, k) - - qres.Extensions = make(map[string]string) - for k, v := range imageFormats(config) { - qres.Extensions[k] = v.extension - } - - if err != nil { - return err - } - - err = enc.Encode(qres) - if err != nil { - return err - } - return nil -} - -func init() { - os.Setenv("VIPS_WARNING", "FALSE") // this does not actually work - bimg.VipsCacheSetMax(0) - bimg.VipsCacheSetMaxMem(0) -} - -func main() { - content, err := os.ReadFile(os.Args[1]) - if err != nil { - log.Fatal("config file unreadable ", err) - } - var config Config - err = json.Unmarshal(content, &config) - if err != nil { - log.Fatal("config file wrong ", err) - } - fmt.Println(config) - - db, err := sqlx.Connect("sqlite3", config.DbPath) - if err != nil { - log.Fatal("DB connection failure ", db) - } - db.MustExec(schema) - - var backend InferenceServerConfig - for { - resp, err := http.Get(config.ClipServer + "/config") - if err != nil { - log.Println("backend failed (fetch) ", err) - } - backend, err = decodeMsgpackFrom[InferenceServerConfig](resp) - resp.Body.Close() - if err != nil { - log.Println("backend failed (parse) ", err) - } else { - break - } - time.Sleep(time.Second) - } - - requestIngest := make(chan struct{}, 1) - - var index *Index - // maybe this ought to be mutexed? 
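// NOTE (editorial sketch, not part of the original patch): the unsynchronized
// *Index swap that the comment above worries about is a data race as written
// (the ingest goroutine stores the pointer while request handlers read it).
// One safe alternative, assuming atomic.Pointer from sync/atomic (Go 1.19+,
// within this module's go 1.22.2 requirement):
//
//	var index atomic.Pointer[Index]
//	// ingest goroutine, after a successful rebuild:
//	index.Store(&newIndex)
//	// request handler:
//	if idx := index.Load(); idx != nil {
//		err := handleRequest(config, backend, idx, w, req)
//		// ...
//	}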
- var lastError *error - // there's not a neat way to reusably broadcast to multiple channels, but I *can* abuse WaitGroups probably - // this might cause horrible concurrency issues, but you brought me to this point, Go designers - var wg sync.WaitGroup - - go func() { - for { - wg.Add(1) - log.Println("ingest running") - err := ingestFiles(config, backend) - if err != nil { - log.Println("ingest failed ", err) - lastError = &err - } else { - newIndex, err := buildIndex(config, backend) - if err != nil { - log.Println("index build failed ", err) - lastError = &err - } else { - lastError = nil - index = &newIndex - } - } - wg.Done() - <-requestIngest - } - }() - newIndex, err := buildIndex(config, backend) - index = &newIndex - if err != nil { - log.Fatal("index build failed ", err) - } - - http.HandleFunc("/", func(w http.ResponseWriter, req *http.Request) { - w.Header().Add("Access-Control-Allow-Origin", "*") - w.Header().Add("Access-Control-Allow-Headers", "Content-Type") - if req.Method == "OPTIONS" { - w.WriteHeader(204) - return - } - err := handleRequest(config, backend, index, w, req) - if err != nil { - w.Header().Add("Content-Type", "application/json") - w.WriteHeader(500) - json.NewEncoder(w).Encode(map[string]string{ - "error": err.Error(), - }) - } - }) - http.HandleFunc("/reload", func(w http.ResponseWriter, req *http.Request) { - if req.Method == "POST" { - log.Println("requesting index reload") - select { - case requestIngest <- struct{}{}: - default: - } - wg.Wait() - if lastError == nil { - w.Write([]byte("OK")) - } else { - w.WriteHeader(500) - w.Write([]byte((*lastError).Error())) - } - } - }) - http.HandleFunc("/profile", func(w http.ResponseWriter, req *http.Request) { - f, err := os.Create("mem.pprof") - if err != nil { - log.Fatal("could not create memory profile: ", err) - } - defer f.Close() - var m runtime.MemStats - runtime.ReadMemStats(&m) - log.Printf("Memory usage: Alloc=%v, TotalAlloc=%v, Sys=%v", m.Alloc, m.TotalAlloc, m.Sys) - log.Println(bimg.VipsMemory()) - bimg.VipsDebugInfo() - runtime.GC() // Trigger garbage collection - if err := pprof.WriteHeapProfile(f); err != nil { - log.Fatal("could not write memory profile: ", err) - } - }) - log.Println("starting server") - http.ListenAndServe(fmt.Sprintf(":%d", config.Port), nil) -} diff --git a/misc/bad-go-version/ocr.go b/misc/bad-go-version/ocr.go deleted file mode 100644 index 55ca675..0000000 --- a/misc/bad-go-version/ocr.go +++ /dev/null @@ -1,264 +0,0 @@ -package main - -import ( - "bytes" - "errors" - "fmt" - "io" - "math" - "mime/multipart" - "net/http" - "net/textproto" - "regexp" - "strings" - "time" - - "github.com/h2non/bimg" - "github.com/samber/lo" - "github.com/titanous/json5" -) - -const CALLBACK_REGEX string = ">AF_initDataCallback\\(({key: 'ds:1'.*?)\\);" - -type SegmentCoords struct { - x int - y int - w int - h int -} - -type Segment struct { - coords SegmentCoords - text string -} - -type ScanResult []Segment - -// TODO coordinates are negative sometimes and I think they shouldn't be -func rationalizeCoordsFormat1(imageW float64, imageH float64, centerXFraction float64, centerYFraction float64, widthFraction float64, heightFraction float64) SegmentCoords { - return SegmentCoords{ - x: int(math.Round((centerXFraction - widthFraction/2) * imageW)), - y: int(math.Round((centerYFraction - heightFraction/2) * imageH)), - w: int(math.Round(widthFraction * imageW)), - h: int(math.Round(heightFraction * imageH)), - } -} - -func scanImageChunk(image []byte, imageWidth int, imageHeight int) 
(ScanResult, error) { - var result ScanResult - timestamp := time.Now().UnixMicro() - var b bytes.Buffer - w := multipart.NewWriter(&b) - defer w.Close() - h := make(textproto.MIMEHeader) - h.Set("Content-Disposition", fmt.Sprintf(`form-data; name="encoded_image"; filename="ocr%d.png"`, timestamp)) - h.Set("Content-Type", "image/png") - fw, err := w.CreatePart(h) - if err != nil { - return result, err - } - fw.Write(image) - w.Close() - - req, err := http.NewRequest("POST", fmt.Sprintf("https://lens.google.com/v3/upload?stcs=%d", timestamp), &b) - if err != nil { - return result, err - } - req.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 13; RMX3771) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.144 Mobile Safari/537.36") - req.AddCookie(&http.Cookie{ - Name: "SOCS", - Value: "CAESEwgDEgk0ODE3Nzk3MjQaAmVuIAEaBgiA_LyaBg", - }) - req.Header.Set("Content-Type", w.FormDataContentType()) - client := http.Client{} - res, err := client.Do(req) - if err != nil { - return result, err - } - defer res.Body.Close() - body, err := io.ReadAll(res.Body) - if err != nil { - return result, err - } - re, _ := regexp.Compile(CALLBACK_REGEX) - matches := re.FindStringSubmatch(string(body[:])) - if len(matches) == 0 { - return result, fmt.Errorf("invalid API response") - } - match := matches[1] - var lensObject map[string]interface{} - err = json5.Unmarshal([]byte(match), &lensObject) - if err != nil { - return result, err - } - - if _, ok := lensObject["errorHasStatus"]; ok { - return result, errors.New("lens failed") - } - - root := lensObject["data"].([]interface{}) - - var textSegments []string - var textRegions []SegmentCoords - - // I don't know why Google did this. - // Text segments are in one place and their locations are in another, using a very strange coordinate system. - // At least I don't need whatever is contained in the base64 parts (which I assume are protobufs). - // TODO: on a few images, this seems to not work for some reason. - defer func() { - if r := recover(); r != nil { - // https://github.com/dimdenGD/chrome-lens-ocr/blob/main/src/core.js#L316 has code for a fallback text segment read mode. - // In testing, this proved unnecessary (quirks of the HTTP request? I don't know), and this only happens on textless images. 
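// NOTE (editorial sketch, not part of the original patch): the recover above
// guards the blind type-assertion chains below (root[3][4][0][0] and
// root[2][3][0]). An alternative is comma-ok assertions through a small
// hypothetical helper, which turns a panic on an unexpected shape into an
// explicit miss:
//
//	func dig(v interface{}, path ...int) (interface{}, bool) {
//		for _, i := range path {
//			arr, ok := v.([]interface{})
//			if !ok || i < 0 || i >= len(arr) {
//				return nil, false
//			}
//			v = arr[i]
//		}
//		return v, true
//	}
//
//	// e.g. segs, ok := dig(root, 3, 4, 0, 0)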
- textSegments = []string{} - textRegions = []SegmentCoords{} - } - }() - - textSegmentsRaw := root[3].([]interface{})[4].([]interface{})[0].([]interface{})[0].([]interface{}) - textRegionsRaw := root[2].([]interface{})[3].([]interface{})[0].([]interface{}) - for _, x := range textRegionsRaw { - if strings.HasPrefix(x.([]interface{})[11].(string), "text:") { - rawCoords := x.([]interface{})[1].([]interface{}) - coords := rationalizeCoordsFormat1(float64(imageWidth), float64(imageHeight), rawCoords[0].(float64), rawCoords[1].(float64), rawCoords[2].(float64), rawCoords[3].(float64)) - textRegions = append(textRegions, coords) - } - } - for _, x := range textSegmentsRaw { - textSegment := x.(string) - textSegments = append(textSegments, textSegment) - } - - return lo.Map(lo.Zip2(textSegments, textRegions), func(x lo.Tuple2[string, SegmentCoords], _ int) Segment { - return Segment{ - text: x.A, - coords: x.B, - } - }), nil -} - -const MAX_DIM int = 1024 - -func scanImage(image *bimg.Image) (ScanResult, error) { - result := ScanResult{} - metadata, err := image.Metadata() - if err != nil { - return result, err - } - width := metadata.Size.Width - height := metadata.Size.Height - if width > MAX_DIM { - width = MAX_DIM - height = int(math.Round(float64(height) * (float64(width) / float64(metadata.Size.Width)))) - } - for y := 0; y < height; y += MAX_DIM { - chunkHeight := MAX_DIM - if y+chunkHeight > height { - chunkHeight = height - y - } - chunk, err := image.Process(bimg.Options{ - Height: height, // these are for overall image dimensions (resize then crop) - Width: width, - Top: y, - AreaHeight: chunkHeight, - AreaWidth: width, - Crop: true, - Type: bimg.PNG, - }) - if err != nil { - return result, err - } - res, err := scanImageChunk(chunk, width, chunkHeight) - if err != nil { - return result, err - } - for _, segment := range res { - result = append(result, Segment{ - text: segment.text, - coords: SegmentCoords{ - y: segment.coords.y + y, - x: segment.coords.x, - w: segment.coords.w, - h: segment.coords.h, - }, - }) - } - } - - return result, nil -} - -/* -async def scan_image_chunk(sess, image): - # send data to inscrutable undocumented Google service - # https://github.com/AuroraWright/owocr/blob/master/owocr/ocr.py#L193 - async with aiohttp.ClientSession() as sess: - data = aiohttp.FormData() - data.add_field( - "encoded_image", - encode_img(image), - filename="ocr" + str(timestamp) + ".png", - content_type="image/png" - ) - async with sess.post(url, headers=headers, cookies=cookies, data=data, timeout=10) as res: - body = await res.text() - - # I really worry about Google sometimes. This is not a sensible format. - match = CALLBACK_REGEX.search(body) - if match == None: - raise ValueError("Invalid callback") - - lens_object = pyjson5.loads(match.group(1)) - if "errorHasStatus" in lens_object: - raise RuntimeError("Lens failed") - - text_segments = [] - text_regions = [] - - root = lens_object["data"] - - # I don't know why Google did this. - # Text segments are in one place and their locations are in another, using a very strange coordinate system. - # At least I don't need whatever is contained in the base64 partss (which I assume are protobufs). - # TODO: on a few images, this seems to not work for some reason. 
- try: - text_segments = root[3][4][0][0] - text_regions = [ rationalize_coords_format1(image.width, image.height, *x[1]) for x in root[2][3][0] if x[11].startswith("text:") ] - except (KeyError, IndexError): - # https://github.com/dimdenGD/chrome-lens-ocr/blob/main/src/core.js#L316 has code for a fallback text segment read mode. - # In testing, this proved unnecessary (quirks of the HTTP request? I don't know), and this only happens on textless images. - return [], [] - - return text_segments, text_regions - -MAX_SCAN_DIM = 1000 # not actually true but close enough -def chunk_image(image: Image): - chunks = [] - # Cut image down in X axis (I'm assuming images aren't too wide to scan in downscaled form because merging text horizontally would be annoying) - if image.width > MAX_SCAN_DIM: - image = image.resize((MAX_SCAN_DIM, round(image.height * (image.width / MAX_SCAN_DIM))), Image.LANCZOS) - for y in range(0, image.height, MAX_SCAN_DIM): - chunks.append(image.crop((0, y, image.width, min(y + MAX_SCAN_DIM, image.height)))) - return chunks - -async def scan_chunks(sess: aiohttp.ClientSession, chunks: [Image]): - # If text happens to be split across the cut line it won't get read. - # This is because doing overlap read areas would be really annoying. - text = "" - regions = [] - for chunk in chunks: - new_segments, new_regions = await scan_image_chunk(sess, chunk) - for segment in new_segments: - text += segment + "\n" - for i, (segment, region) in enumerate(zip(new_segments, new_regions)): - regions.append({ **region, "y": region["y"] + (MAX_SCAN_DIM * i), "text": segment }) - return text, regions - -async def scan_image(sess: aiohttp.ClientSession, image: Image): - return await scan_chunks(sess, chunk_image(image)) - -if __name__ == "__main__": - async def main(): - async with aiohttp.ClientSession() as sess: - print(await scan_image(sess, Image.open("/data/public/memes-or-something/linear-algebra-chess.png"))) - asyncio.run(main()) -*/ diff --git a/misc/bad-go-version/problematic_thing.go b/misc/bad-go-version/problematic_thing.go deleted file mode 100644 index 487be9c..0000000 --- a/misc/bad-go-version/problematic_thing.go +++ /dev/null @@ -1,891 +0,0 @@ -package main - -import ( - "bytes" - "encoding/base64" - "encoding/json" - "fmt" - "hash/fnv" - "io" - "log" - "net/http" - "os" - "path/filepath" - "runtime" - "runtime/pprof" - "strings" - "sync" - "time" - - "github.com/DataIntelligenceCrew/go-faiss" - "github.com/davidbyttow/govips/v2/vips" - "github.com/h2non/bimg" - "github.com/jmoiron/sqlx" - _ "github.com/mattn/go-sqlite3" - "github.com/samber/lo" - "github.com/vmihailenco/msgpack" - "github.com/x448/float16" - "golang.org/x/sync/errgroup" -) - -type Config struct { - ClipServer string `json:"clip_server"` - DbPath string `json:"db_path"` - Port int16 `json:"port"` - Files string `json:"files"` - EnableOCR bool `json:"enable_ocr"` - ThumbsPath string `json:"thumbs_path"` - EnableThumbnails bool `json:"enable_thumbs"` -} - -type Index struct { - vectors *faiss.IndexImpl - filenames []string - formatCodes []int64 - formatNames []string -} - -var schema = ` -CREATE TABLE IF NOT EXISTS files ( - filename TEXT PRIMARY KEY, - embedding_time INTEGER, - ocr_time INTEGER, - thumbnail_time INTEGER, - embedding BLOB, - ocr TEXT, - raw_ocr_segments BLOB, - thumbnails BLOB -); - -CREATE VIRTUAL TABLE IF NOT EXISTS ocr_fts USING fts5 ( - filename, - ocr, - tokenize='unicode61 remove_diacritics 2', - content='ocr' -); - -CREATE TRIGGER IF NOT EXISTS ocr_fts_ins AFTER INSERT ON files BEGIN - 
INSERT INTO ocr_fts (rowid, filename, ocr) VALUES (new.rowid, new.filename, COALESCE(new.ocr, '')); -END; - -CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER DELETE ON files BEGIN - INSERT INTO ocr_fts (ocr_fts, rowid, filename, ocr) VALUES ('delete', old.rowid, old.filename, COALESCE(old.ocr, '')); -END; - -CREATE TRIGGER IF NOT EXISTS ocr_fts_del AFTER UPDATE ON files BEGIN - INSERT INTO ocr_fts (ocr_fts, rowid, filename, ocr) VALUES ('delete', old.rowid, old.filename, COALESCE(old.ocr, '')); - INSERT INTO ocr_fts (rowid, filename, text) VALUES (new.rowid, new.filename, COALESCE(new.ocr, '')); -END; -` - -type FileRecord struct { - Filename string `db:"filename"` - EmbedTime int64 `db:"embedding_time"` - OcrTime int64 `db:"ocr_time"` - ThumbnailTime int64 `db:"thumbnail_time"` - Embedding []byte `db:"embedding"` - Ocr string `db:"ocr"` - RawOcrSegments []byte `db:"raw_ocr_segments"` - Thumbnails []byte `db:"thumbnails"` - filesize int64 -} - -type InferenceServerConfig struct { - BatchSize uint `msgpack:"batch"` - ImageSize []uint `msgpack:"image_size"` - EmbeddingSize uint `msgpack:"embedding_size"` -} - -func decodeMsgpackFrom[O interface{}](resp *http.Response) (O, error) { - var result O - respData, err := io.ReadAll(resp.Body) - if err != nil { - return result, err - } - err = msgpack.Unmarshal(respData, &result) - return result, err -} - -func queryClipServer[I interface{}, O interface{}](config Config, path string, data I) (O, error) { - var result O - b, err := msgpack.Marshal(data) - if err != nil { - return result, err - } - resp, err := http.Post(config.ClipServer+path, "application/msgpack", bytes.NewReader(b)) - if err != nil { - return result, err - } - defer resp.Body.Close() - return decodeMsgpackFrom[O](resp) -} - -type LoadedImage struct { - image *vips.ImageRef - filename string - originalSize int -} - -type EmbeddingInput struct { - image []byte - filename string -} - -type EmbeddingRequest struct { - Images [][]byte `msgpack:"images"` - Text []string `msgpack:"text"` -} - -type EmbeddingResponse = [][]byte - -func timestamp() int64 { - return time.Now().UnixMicro() -} - -type ImageFormatConfig struct { - targetWidth int - targetFilesize int - quality int - format vips.ImageType - extension string -} - -func generateFilenameHash(filename string) string { - hasher := fnv.New128() - hasher.Write([]byte(filename)) - hash := hasher.Sum(make([]byte, 0)) - return base64.RawURLEncoding.EncodeToString(hash) -} - -func generateThumbnailFilename(filename string, formatName string, formatConfig ImageFormatConfig) string { - return fmt.Sprintf("%s%s.%s", generateFilenameHash(filename), formatName, formatConfig.extension) -} - -func initializeDatabase(config Config) (*sqlx.DB, error) { - db, err := sqlx.Connect("sqlite3", config.DbPath) - if err != nil { - return nil, err - } - _, err = db.Exec("PRAGMA busy_timeout = 2000; PRAGMA journal_mode = WAL") - if err != nil { - return nil, err - } - return db, nil -} - -func imageFormats(config Config) map[string]ImageFormatConfig { - return map[string]ImageFormatConfig{ - "jpegl": { - targetWidth: 800, - quality: 70, - format: vips.ImageTypeJPEG, - extension: "jpg", - }, - "jpegh": { - targetWidth: 1600, - quality: 80, - format: vips.ImageTypeJPEG, - extension: "jpg", - }, - "jpeg256kb": { - targetWidth: 500, - targetFilesize: 256000, - format: vips.ImageTypeJPEG, - extension: "jpg", - }, - "avifh": { - targetWidth: 1600, - quality: 80, - format: vips.ImageTypeAVIF, - extension: "avif", - }, - "avifl": { - targetWidth: 800, - quality: 30, 
- format: vips.ImageTypeAVIF, - extension: "avif", - }, - } -} - -func ingestFiles(config Config, backend InferenceServerConfig) error { - var wg errgroup.Group - var iwg errgroup.Group - - // We assume everything is either a modern browser (low-DPI or high-DPI), an ancient browser or a ComputerCraft machine abusing Extra Utilities 2 screens. - var formats = imageFormats(config) - - db, err := initializeDatabase(config) - if err != nil { - return err - } - defer db.Close() - - toProcess := make(chan FileRecord, 100) - toEmbed := make(chan EmbeddingInput, backend.BatchSize) - toThumbnail := make(chan LoadedImage, 30) - toOCR := make(chan LoadedImage, 30) - embedBatches := make(chan []EmbeddingInput, 1) - - // image loading and preliminary resizing - for range runtime.NumCPU() { - iwg.Go(func() error { - for record := range toProcess { - path := filepath.Join(config.Files, record.Filename) - img, err := vips.LoadImageFromFile(path, &vips.ImportParams{}) - if err != nil { - log.Println("could not read", record.Filename) - continue - } - if record.Embedding == nil { - i, err := img.Copy() // TODO this is ugly, we should not need to do in-place operations - if err != nil { - return err - } - err = i.ResizeWithVScale(float64(backend.ImageSize[0])/float64(i.Width()), float64(backend.ImageSize[1])/float64(i.Height()), vips.KernelLanczos3) - if err != nil { - return err - } - resized, _, err := i.ExportPng(vips.NewPngExportParams()) - if err != nil { - log.Println("resize failure", record.Filename, err) - } else { - toEmbed <- EmbeddingInput{ - image: resized, - filename: record.Filename, - } - } - } - if record.Thumbnails == nil && config.EnableThumbnails { - toThumbnail <- LoadedImage{ - image: img, - filename: record.Filename, - originalSize: int(record.filesize), - } - } - if record.RawOcrSegments == nil && config.EnableOCR { - toOCR <- LoadedImage{ - image: img, - filename: record.Filename, - } - } - } - return nil - }) - } - - if config.EnableThumbnails { - for range runtime.NumCPU() { - wg.Go(func() error { - for image := range toThumbnail { - generatedFormats := make([]string, 0) - for formatName, formatConfig := range formats { - var err error - var resized []byte - if formatConfig.targetFilesize != 0 { - lb := 1 - ub := 100 - for { - quality := (lb + ub) / 2 - i, err := image.image.Copy() - if err != nil { - return err - } - i.Resize(float64(formatConfig.targetWidth)/float64(i.Width()), vips.KernelLanczos3) - resized, _, err = i.Export(&vips.ExportParams{ - Format: formatConfig.format, - Speed: 4, - Quality: quality, - StripMetadata: true, - }) - if len(resized) > image.originalSize { - ub = quality - } else { - lb = quality + 1 - } - if lb >= ub { - break - } - } - } else { - i, err := image.image.Copy() - if err != nil { - return err - } - i.Resize(float64(formatConfig.targetWidth)/float64(i.Width()), vips.KernelLanczos3) - resized, _, err = i.Export(&vips.ExportParams{ - Format: formatConfig.format, - Speed: 4, - Quality: formatConfig.quality, - StripMetadata: true, - }) - } - if err != nil { - log.Println("thumbnailing failure", image.filename, err) - continue - } - if len(resized) < image.originalSize { - generatedFormats = append(generatedFormats, formatName) - err = bimg.Write(filepath.Join(config.ThumbsPath, generateThumbnailFilename(image.filename, formatName, formatConfig)), resized) - if err != nil { - return err - } - } - } - formatsData, err := msgpack.Marshal(generatedFormats) - if err != nil { - return err - } - _, err = db.Exec("UPDATE files SET thumbnails = ?, 
thumbnail_time = ? WHERE filename = ?", formatsData, timestamp(), image.filename) - if err != nil { - return err - } - } - return nil - }) - } - } - - if config.EnableOCR { - for range 100 { - wg.Go(func() error { - for image := range toOCR { - scan, err := scanImage(image.image) - if err != nil { - log.Println("OCR failure", image.filename, err) - continue - } - ocrText := "" - for _, segment := range scan { - ocrText += segment.text - ocrText += "\n" - } - ocrData, err := msgpack.Marshal(scan) - if err != nil { - return err - } - _, err = db.Exec("UPDATE files SET ocr = ?, raw_ocr_segments = ?, ocr_time = ? WHERE filename = ?", ocrText, ocrData, timestamp(), image.filename) - if err != nil { - return err - } - } - return nil - }) - } - } - - wg.Go(func() error { - buffer := make([]EmbeddingInput, 0, backend.BatchSize) - for input := range toEmbed { - buffer = append(buffer, input) - if len(buffer) == int(backend.BatchSize) { - embedBatches <- buffer - buffer = make([]EmbeddingInput, 0, backend.BatchSize) - } - } - if len(buffer) > 0 { - embedBatches <- buffer - } - close(embedBatches) - return nil - }) - - for range 3 { - wg.Go(func() error { - for batch := range embedBatches { - result, err := queryClipServer[EmbeddingRequest, EmbeddingResponse](config, "", EmbeddingRequest{ - Images: lo.Map(batch, func(item EmbeddingInput, _ int) []byte { return item.image }), - }) - if err != nil { - return err - } - - tx, err := db.Begin() - if err != nil { - return err - } - for i, vector := range result { - _, err = tx.Exec("UPDATE files SET embedding_time = ?, embedding = ? WHERE filename = ?", timestamp(), vector, batch[i].filename) - if err != nil { - return err - } - } - err = tx.Commit() - if err != nil { - return err - } - } - return nil - }) - } - - filenamesOnDisk := make(map[string]struct{}) - - err = filepath.WalkDir(config.Files, func(path string, d os.DirEntry, err error) error { - filename := strings.TrimPrefix(path, config.Files) - if err != nil { - return err - } - if d.IsDir() { - return nil - } - filenamesOnDisk[filename] = struct{}{} - records := []FileRecord{} - err = db.Select(&records, "SELECT * FROM files WHERE filename = ?", filename) - if err != nil { - return err - } - stat, err := d.Info() - if err != nil { - return err - } - modtime := stat.ModTime().UnixMicro() - if len(records) == 0 || modtime > records[0].EmbedTime || modtime > records[0].OcrTime || modtime > records[0].ThumbnailTime { - _, err = db.Exec("INSERT OR IGNORE INTO files VALUES (?, 0, 0, 0, '', '', '', '')", filename) - if err != nil { - return err - } - record := FileRecord{ - Filename: filename, - filesize: stat.Size(), - } - if len(records) > 0 { - record = records[0] - } - if modtime > record.EmbedTime || len(record.Embedding) == 0 { - record.Embedding = nil - } - if modtime > record.OcrTime || len(record.RawOcrSegments) == 0 { - record.RawOcrSegments = nil - } - if modtime > record.ThumbnailTime || len(record.Thumbnails) == 0 { - record.Thumbnails = nil - } - toProcess <- record - } - return nil - }) - if err != nil { - return err - } - close(toProcess) - - err = iwg.Wait() - close(toEmbed) - close(toThumbnail) - if err != nil { - return err - } - err = wg.Wait() - if err != nil { - return err - } - - rows, err := db.Queryx("SELECT filename FROM files") - if err != nil { - return err - } - tx, err := db.Begin() - if err != nil { - return err - } - for rows.Next() { - var filename string - err := rows.Scan(&filename) - if err != nil { - return err - } - if _, ok := filenamesOnDisk[filename]; !ok { - _, 
err = tx.Exec("DELETE FROM files WHERE filename = ?", filename) - if err != nil { - return err - } - } - } - if err = tx.Commit(); err != nil { - return err - } - - return nil -} - -const INDEX_ADD_BATCH = 512 - -func buildIndex(config Config, backend InferenceServerConfig) (Index, error) { - var index Index - - db, err := initializeDatabase(config) - if err != nil { - return index, err - } - defer db.Close() - - newFAISSIndex, err := faiss.IndexFactory(int(backend.EmbeddingSize), "SQfp16", faiss.MetricInnerProduct) - if err != nil { - return index, err - } - index.vectors = newFAISSIndex - - var count int - err = db.Get(&count, "SELECT COUNT(*) FROM files") - if err != nil { - return index, err - } - - index.filenames = make([]string, 0, count) - index.formatCodes = make([]int64, 0, count) - buffer := make([]float32, 0, INDEX_ADD_BATCH*backend.EmbeddingSize) - index.formatNames = make([]string, 0, 5) - - record := FileRecord{} - rows, err := db.Queryx("SELECT * FROM files") - if err != nil { - return index, err - } - for rows.Next() { - err := rows.StructScan(&record) - if err != nil { - return index, err - } - if len(record.Embedding) > 0 { - index.filenames = append(index.filenames, record.Filename) - for i := 0; i < len(record.Embedding); i += 2 { - buffer = append(buffer, float16.Frombits(uint16(record.Embedding[i])+uint16(record.Embedding[i+1])<<8).Float32()) - } - if len(buffer) == cap(buffer) { - index.vectors.Add(buffer) - buffer = make([]float32, 0, INDEX_ADD_BATCH*backend.EmbeddingSize) - } - - formats := make([]string, 0, 5) - if len(record.Thumbnails) > 0 { - err := msgpack.Unmarshal(record.Thumbnails, &formats) - if err != nil { - return index, err - } - } - - formatCode := int64(0) - for _, formatString := range formats { - found := false - for i, name := range index.formatNames { - if name == formatString { - formatCode |= 1 << i - found = true - break - } - } - if !found { - newIndex := len(index.formatNames) - formatCode |= 1 << newIndex - index.formatNames = append(index.formatNames, formatString) - } - } - index.formatCodes = append(index.formatCodes, formatCode) - } - } - if len(buffer) > 0 { - index.vectors.Add(buffer) - } - - return index, nil -} - -func decodeFP16Buffer(buf []byte) []float32 { - out := make([]float32, 0, len(buf)/2) - for i := 0; i < len(buf); i += 2 { - out = append(out, float16.Frombits(uint16(buf[i])+uint16(buf[i+1])<<8).Float32()) - } - return out -} - -type EmbeddingVector []float32 - -type QueryResult struct { - Matches [][]interface{} `json:"matches"` - Formats []string `json:"formats"` - Extensions map[string]string `json:"extensions"` -} - -// this terrible language cannot express tagged unions -type QueryTerm struct { - Embedding *EmbeddingVector `json:"embedding"` - Image *string `json:"image"` // base64 - Text *string `json:"text"` - Weight *float32 `json:"weight"` -} - -type QueryRequest struct { - Terms []QueryTerm `json:"terms"` - K *int `json:"k"` -} - -func queryIndex(index *Index, query EmbeddingVector, k int) (QueryResult, error) { - var qr QueryResult - distances, ids, err := index.vectors.Search(query, int64(k)) - if err != nil { - return qr, err - } - items := lo.Map(lo.Zip2(distances, ids), func(x lo.Tuple2[float32, int64], i int) []interface{} { - return []interface{}{ - x.A, - index.filenames[x.B], - generateFilenameHash(index.filenames[x.B]), - index.formatCodes[x.B], - } - }) - - return QueryResult{ - Matches: items, - Formats: index.formatNames, - }, nil -} - -func handleRequest(config Config, backendConfig 
InferenceServerConfig, index *Index, w http.ResponseWriter, req *http.Request) error { - if req.Body == nil { - io.WriteString(w, "OK") // health check - return nil - } - dec := json.NewDecoder(req.Body) - var qreq QueryRequest - err := dec.Decode(&qreq) - if err != nil { - return err - } - - totalEmbedding := make(EmbeddingVector, backendConfig.EmbeddingSize) - - imageBatch := make([][]byte, 0) - imageWeights := make([]float32, 0) - textBatch := make([]string, 0) - textWeights := make([]float32, 0) - - for _, term := range qreq.Terms { - if term.Image != nil { - bytes, err := base64.StdEncoding.DecodeString(*term.Image) - if err != nil { - return err - } - loaded := bimg.NewImage(bytes) - resized, err := loaded.Process(bimg.Options{ - Width: int(backendConfig.ImageSize[0]), - Height: int(backendConfig.ImageSize[1]), - Force: true, - Type: bimg.PNG, - Interpretation: bimg.InterpretationSRGB, - }) - if err != nil { - return err - } - imageBatch = append(imageBatch, resized) - if term.Weight != nil { - imageWeights = append(imageWeights, *term.Weight) - } else { - imageWeights = append(imageWeights, 1) - } - } - if term.Text != nil { - textBatch = append(textBatch, *term.Text) - if term.Weight != nil { - textWeights = append(textWeights, *term.Weight) - } else { - textWeights = append(textWeights, 1) - } - } - if term.Embedding != nil { - weight := float32(1.0) - if term.Weight != nil { - weight = *term.Weight - } - for i := 0; i < int(backendConfig.EmbeddingSize); i += 1 { - totalEmbedding[i] += (*term.Embedding)[i] * weight - } - } - } - - if len(imageBatch) > 0 { - embs, err := queryClipServer[EmbeddingRequest, EmbeddingResponse](config, "/", EmbeddingRequest{ - Images: imageBatch, - }) - if err != nil { - return err - } - for j, emb := range embs { - embd := decodeFP16Buffer(emb) - for i := 0; i < int(backendConfig.EmbeddingSize); i += 1 { - totalEmbedding[i] += embd[i] * imageWeights[j] - } - } - } - if len(textBatch) > 0 { - embs, err := queryClipServer[EmbeddingRequest, EmbeddingResponse](config, "/", EmbeddingRequest{ - Text: textBatch, - }) - if err != nil { - return err - } - for j, emb := range embs { - embd := decodeFP16Buffer(emb) - for i := 0; i < int(backendConfig.EmbeddingSize); i += 1 { - totalEmbedding[i] += embd[i] * textWeights[j] - } - } - } - - k := 1000 - if qreq.K != nil { - k = *qreq.K - } - - w.Header().Add("Content-Type", "application/json") - enc := json.NewEncoder(w) - - qres, err := queryIndex(index, totalEmbedding, k) - - qres.Extensions = make(map[string]string) - for k, v := range imageFormats(config) { - qres.Extensions[k] = v.extension - } - - if err != nil { - return err - } - - err = enc.Encode(qres) - if err != nil { - return err - } - return nil -} - -func init() { - os.Setenv("VIPS_WARNING", "FALSE") // this does not actually work - bimg.VipsCacheSetMax(0) - bimg.VipsCacheSetMaxMem(0) -} - -func main() { - vips.Startup(&vips.Config{}) - defer vips.Shutdown() - - content, err := os.ReadFile(os.Args[1]) - if err != nil { - log.Fatal("config file unreadable ", err) - } - var config Config - err = json.Unmarshal(content, &config) - if err != nil { - log.Fatal("config file wrong ", err) - } - fmt.Println(config) - - db, err := sqlx.Connect("sqlite3", config.DbPath) - if err != nil { - log.Fatal("DB connection failure ", db) - } - db.MustExec(schema) - - var backend InferenceServerConfig - for { - resp, err := http.Get(config.ClipServer + "/config") - if err != nil { - log.Println("backend failed (fetch) ", err) - } - backend, err = 
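handleRequest reduces all query terms, image, text, and raw embedding alike, to a single vector: each term's embedding is scaled by its optional weight (defaulting to 1.0) and summed. A minimal numpy sketch of that combination (combine_terms is an illustrative name):

from typing import Optional

import numpy as np

def combine_terms(terms: list[tuple[np.ndarray, Optional[float]]],
                  dim: int) -> np.ndarray:
    # Sum every term's embedding scaled by its weight, defaulting the
    # weight to 1.0 when none was supplied.
    total = np.zeros(dim, dtype=np.float32)
    for embedding, weight in terms:
        total += embedding * (1.0 if weight is None else weight)
    return total

Since the index is searched by inner product, the weights scale scores linearly and the combined vector can be searched unnormalized, as the code above does.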
decodeMsgpackFrom[InferenceServerConfig](resp) - resp.Body.Close() - if err != nil { - log.Println("backend failed (parse) ", err) - } else { - break - } - time.Sleep(time.Second) - } - - requestIngest := make(chan struct{}, 1) - - var index *Index - // maybe this ought to be mutexed? - var lastError *error - // there's not a neat way to reusably broadcast to multiple channels, but I *can* abuse WaitGroups probably - // this might cause horrible concurrency issues, but you brought me to this point, Go designers - var wg sync.WaitGroup - - go func() { - for { - wg.Add(1) - log.Println("ingest running") - err := ingestFiles(config, backend) - if err != nil { - log.Println("ingest failed ", err) - lastError = &err - } else { - newIndex, err := buildIndex(config, backend) - if err != nil { - log.Println("index build failed ", err) - lastError = &err - } else { - lastError = nil - index = &newIndex - } - } - wg.Done() - <-requestIngest - } - }() - newIndex, err := buildIndex(config, backend) - index = &newIndex - if err != nil { - log.Fatal("index build failed ", err) - } - - http.HandleFunc("/", func(w http.ResponseWriter, req *http.Request) { - w.Header().Add("Access-Control-Allow-Origin", "*") - w.Header().Add("Access-Control-Allow-Headers", "Content-Type") - if req.Method == "OPTIONS" { - w.WriteHeader(204) - return - } - err := handleRequest(config, backend, index, w, req) - if err != nil { - w.Header().Add("Content-Type", "application/json") - w.WriteHeader(500) - json.NewEncoder(w).Encode(map[string]string{ - "error": err.Error(), - }) - } - }) - http.HandleFunc("/reload", func(w http.ResponseWriter, req *http.Request) { - if req.Method == "POST" { - log.Println("requesting index reload") - select { - case requestIngest <- struct{}{}: - default: - } - wg.Wait() - if lastError == nil { - w.Write([]byte("OK")) - } else { - w.WriteHeader(500) - w.Write([]byte((*lastError).Error())) - } - } - }) - http.HandleFunc("/profile", func(w http.ResponseWriter, req *http.Request) { - f, err := os.Create("mem.pprof") - if err != nil { - log.Fatal("could not create memory profile: ", err) - } - defer f.Close() - var m runtime.MemStats - runtime.ReadMemStats(&m) - log.Printf("Memory usage: Alloc=%v, TotalAlloc=%v, Sys=%v", m.Alloc, m.TotalAlloc, m.Sys) - log.Println(bimg.VipsMemory()) - bimg.VipsDebugInfo() - runtime.GC() // Trigger garbage collection - if err := pprof.WriteHeapProfile(f); err != nil { - log.Fatal("could not write memory profile: ", err) - } - }) - log.Println("starting server") - http.ListenAndServe(fmt.Sprintf(":%d", config.Port), nil) -} diff --git a/misc/bad-go-version/problematic_thing_2.go b/misc/bad-go-version/problematic_thing_2.go deleted file mode 100644 index 3f79685..0000000 --- a/misc/bad-go-version/problematic_thing_2.go +++ /dev/null @@ -1,265 +0,0 @@ -package main - -import ( - "bytes" - "errors" - "fmt" - "io" - "math" - "mime/multipart" - "net/http" - "net/textproto" - "regexp" - "strings" - "time" - - "github.com/davidbyttow/govips/v2/vips" - "github.com/samber/lo" - "github.com/titanous/json5" -) - -const CALLBACK_REGEX string = ">AF_initDataCallback\\(({key: 'ds:1'.*?)\\);" - -type SegmentCoords struct { - x int - y int - w int - h int -} - -type Segment struct { - coords SegmentCoords - text string -} - -type ScanResult []Segment - -// TODO coordinates are negative sometimes and I think they shouldn't be -func rationalizeCoordsFormat1(imageW float64, imageH float64, centerXFraction float64, centerYFraction float64, widthFraction float64, heightFraction float64) 
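The requestIngest channel has capacity 1, so reload requests coalesce: the non-blocking send in /reload either queues exactly one pending rebuild or is dropped, and the handler then waits on the (ab)used WaitGroup for the in-flight run. A threading sketch of just the coalescing channel, under the assumption that a capacity-1 queue is all that is needed (ReloadRequests is an illustrative name):

import threading

class ReloadRequests:
    # Sketch of a capacity-1 channel: request() is the non-blocking send
    # (a second request while one is pending is silently dropped, like
    # `select { case requestIngest <- struct{}{}: default: }`), and
    # take() is the blocking receive at the top of the rebuild loop.
    def __init__(self) -> None:
        self._pending = threading.Semaphore(0)
        self._slot = threading.BoundedSemaphore(1)

    def request(self) -> None:
        if self._slot.acquire(blocking=False):
            self._pending.release()

    def take(self) -> None:
        self._pending.acquire()
        self._slot.release()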
SegmentCoords { - return SegmentCoords{ - x: int(math.Round((centerXFraction - widthFraction/2) * imageW)), - y: int(math.Round((centerYFraction - heightFraction/2) * imageH)), - w: int(math.Round(widthFraction * imageW)), - h: int(math.Round(heightFraction * imageH)), - } -} - -func scanImageChunk(image []byte, imageWidth int, imageHeight int) (ScanResult, error) { - var result ScanResult - timestamp := time.Now().UnixMicro() - var b bytes.Buffer - w := multipart.NewWriter(&b) - defer w.Close() - h := make(textproto.MIMEHeader) - h.Set("Content-Disposition", fmt.Sprintf(`form-data; name="encoded_image"; filename="ocr%d.png"`, timestamp)) - h.Set("Content-Type", "image/png") - fw, err := w.CreatePart(h) - if err != nil { - return result, err - } - fw.Write(image) - w.Close() - - req, err := http.NewRequest("POST", fmt.Sprintf("https://lens.google.com/v3/upload?stcs=%d", timestamp), &b) - if err != nil { - return result, err - } - req.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 13; RMX3771) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.144 Mobile Safari/537.36") - req.AddCookie(&http.Cookie{ - Name: "SOCS", - Value: "CAESEwgDEgk0ODE3Nzk3MjQaAmVuIAEaBgiA_LyaBg", - }) - req.Header.Set("Content-Type", w.FormDataContentType()) - client := http.Client{} - res, err := client.Do(req) - if err != nil { - return result, err - } - defer res.Body.Close() - body, err := io.ReadAll(res.Body) - if err != nil { - return result, err - } - re, _ := regexp.Compile(CALLBACK_REGEX) - matches := re.FindStringSubmatch(string(body[:])) - if len(matches) == 0 { - return result, fmt.Errorf("invalid API response") - } - match := matches[1] - var lensObject map[string]interface{} - err = json5.Unmarshal([]byte(match), &lensObject) - if err != nil { - return result, err - } - - if _, ok := lensObject["errorHasStatus"]; ok { - return result, errors.New("lens failed") - } - - root := lensObject["data"].([]interface{}) - - var textSegments []string - var textRegions []SegmentCoords - - // I don't know why Google did this. - // Text segments are in one place and their locations are in another, using a very strange coordinate system. - // At least I don't need whatever is contained in the base64 parts (which I assume are protobufs). - // TODO: on a few images, this seems to not work for some reason. - defer func() { - if r := recover(); r != nil { - // https://github.com/dimdenGD/chrome-lens-ocr/blob/main/src/core.js#L316 has code for a fallback text segment read mode. - // In testing, this proved unnecessary (quirks of the HTTP request? I don't know), and this only happens on textless images. 
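Lens reports each region as fractional center coordinates plus fractional size; rationalizeCoordsFormat1 converts that into a top-left pixel box. The same arithmetic in Python (the dict return shape is an assumption for illustration):

def rationalize_coords_format1(image_w: float, image_h: float,
                               center_x: float, center_y: float,
                               width_frac: float, height_frac: float) -> dict:
    # Convert a fractional (center, size) region into a top-left pixel
    # box, as the Go rationalizeCoordsFormat1 does.
    return {
        "x": round((center_x - width_frac / 2) * image_w),
        "y": round((center_y - height_frac / 2) * image_h),
        "w": round(width_frac * image_w),
        "h": round(height_frac * image_h),
    }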
- textSegments = []string{} - textRegions = []SegmentCoords{} - } - }() - - textSegmentsRaw := root[3].([]interface{})[4].([]interface{})[0].([]interface{})[0].([]interface{}) - textRegionsRaw := root[2].([]interface{})[3].([]interface{})[0].([]interface{}) - for _, x := range textRegionsRaw { - if strings.HasPrefix(x.([]interface{})[11].(string), "text:") { - rawCoords := x.([]interface{})[1].([]interface{}) - coords := rationalizeCoordsFormat1(float64(imageWidth), float64(imageHeight), rawCoords[0].(float64), rawCoords[1].(float64), rawCoords[2].(float64), rawCoords[3].(float64)) - textRegions = append(textRegions, coords) - } - } - for _, x := range textSegmentsRaw { - textSegment := x.(string) - textSegments = append(textSegments, textSegment) - } - - return lo.Map(lo.Zip2(textSegments, textRegions), func(x lo.Tuple2[string, SegmentCoords], _ int) Segment { - return Segment{ - text: x.A, - coords: x.B, - } - }), nil -} - -const MAX_DIM int = 1024 - -func scanImage(image *vips.ImageRef) (ScanResult, error) { - result := ScanResult{} - width := image.Width() - height := image.Height() - if width > MAX_DIM { - width = MAX_DIM - height = int(math.Round(float64(height) * (float64(width) / float64(image.Width())))) - } - downscaled, err := image.Copy() - if err != nil { - return result, err - } - downscaled.Resize(float64(width)/float64(image.Width()), vips.KernelLanczos3) - for y := 0; y < height; y += MAX_DIM { - chunkHeight := MAX_DIM - if y+chunkHeight > height { - chunkHeight = height - y - } - chunk, err := image.Copy() // TODO this really really should not be in-place - if err != nil { - return result, err - } - err = chunk.ExtractArea(0, y, width, height) - if err != nil { - return result, err - } - buf, _, err := chunk.ExportPng(&vips.PngExportParams{}) - if err != nil { - return result, err - } - res, err := scanImageChunk(buf, width, chunkHeight) - if err != nil { - return result, err - } - for _, segment := range res { - result = append(result, Segment{ - text: segment.text, - coords: SegmentCoords{ - y: segment.coords.y + y, - x: segment.coords.x, - w: segment.coords.w, - h: segment.coords.h, - }, - }) - } - } - - return result, nil -} - -/* -async def scan_image_chunk(sess, image): - # send data to inscrutable undocumented Google service - # https://github.com/AuroraWright/owocr/blob/master/owocr/ocr.py#L193 - async with aiohttp.ClientSession() as sess: - data = aiohttp.FormData() - data.add_field( - "encoded_image", - encode_img(image), - filename="ocr" + str(timestamp) + ".png", - content_type="image/png" - ) - async with sess.post(url, headers=headers, cookies=cookies, data=data, timeout=10) as res: - body = await res.text() - - # I really worry about Google sometimes. This is not a sensible format. - match = CALLBACK_REGEX.search(body) - if match == None: - raise ValueError("Invalid callback") - - lens_object = pyjson5.loads(match.group(1)) - if "errorHasStatus" in lens_object: - raise RuntimeError("Lens failed") - - text_segments = [] - text_regions = [] - - root = lens_object["data"] - - # I don't know why Google did this. - # Text segments are in one place and their locations are in another, using a very strange coordinate system. - # At least I don't need whatever is contained in the base64 partss (which I assume are protobufs). - # TODO: on a few images, this seems to not work for some reason. 
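scanImage downscales to at most MAX_DIM wide, slices the result into MAX_DIM-tall strips, OCRs each strip, and shifts the returned boxes back down by the strip's y offset. Note that the removed Go version appears to extract strips from the full-size image rather than the downscaled copy, and passes the full height rather than chunkHeight to ExtractArea, both of which look like bugs. A Pillow sketch of the intended chunking (chunk_image is an illustrative name; it returns each strip with its y offset):

from PIL import Image

MAX_DIM = 1024

def chunk_image(im: Image.Image) -> list[tuple[Image.Image, int]]:
    # Downscale to at most MAX_DIM wide (preserving aspect ratio), then
    # slice into MAX_DIM-tall strips, keeping each strip's y offset so
    # OCR boxes can be shifted back into full-image coordinates.
    if im.width > MAX_DIM:
        im = im.resize((MAX_DIM, round(im.height * MAX_DIM / im.width)),
                       Image.LANCZOS)
    return [(im.crop((0, y, im.width, min(y + MAX_DIM, im.height))), y)
            for y in range(0, im.height, MAX_DIM)]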
- try: - text_segments = root[3][4][0][0] - text_regions = [ rationalize_coords_format1(image.width, image.height, *x[1]) for x in root[2][3][0] if x[11].startswith("text:") ] - except (KeyError, IndexError): - # https://github.com/dimdenGD/chrome-lens-ocr/blob/main/src/core.js#L316 has code for a fallback text segment read mode. - # In testing, this proved unnecessary (quirks of the HTTP request? I don't know), and this only happens on textless images. - return [], [] - - return text_segments, text_regions - -MAX_SCAN_DIM = 1000 # not actually true but close enough -def chunk_image(image: Image): - chunks = [] - # Cut image down in X axis (I'm assuming images aren't too wide to scan in downscaled form because merging text horizontally would be annoying) - if image.width > MAX_SCAN_DIM: - image = image.resize((MAX_SCAN_DIM, round(image.height * (image.width / MAX_SCAN_DIM))), Image.LANCZOS) - for y in range(0, image.height, MAX_SCAN_DIM): - chunks.append(image.crop((0, y, image.width, min(y + MAX_SCAN_DIM, image.height)))) - return chunks - -async def scan_chunks(sess: aiohttp.ClientSession, chunks: [Image]): - # If text happens to be split across the cut line it won't get read. - # This is because doing overlap read areas would be really annoying. - text = "" - regions = [] - for chunk in chunks: - new_segments, new_regions = await scan_image_chunk(sess, chunk) - for segment in new_segments: - text += segment + "\n" - for i, (segment, region) in enumerate(zip(new_segments, new_regions)): - regions.append({ **region, "y": region["y"] + (MAX_SCAN_DIM * i), "text": segment }) - return text, regions - -async def scan_image(sess: aiohttp.ClientSession, image: Image): - return await scan_chunks(sess, chunk_image(image)) - -if __name__ == "__main__": - async def main(): - async with aiohttp.ClientSession() as sess: - print(await scan_image(sess, Image.open("/data/public/memes-or-something/linear-algebra-chess.png"))) - asyncio.run(main()) -*/ diff --git a/misc/mse_accursed.py b/misc/mse_accursed.py deleted file mode 100644 index ed6a407..0000000 --- a/misc/mse_accursed.py +++ /dev/null @@ -1,212 +0,0 @@ -from aiohttp import web -import aiohttp -import asyncio -import traceback -import umsgpack -from PIL import Image -import base64 -import aiosqlite -import faiss -import numpy -import os -import aiohttp_cors -import json -import io -import sys -from concurrent.futures import ProcessPoolExecutor - -with open(sys.argv[1], "r") as config_file: - CONFIG = json.load(config_file) - -app = web.Application(client_max_size=32*1024**2) -routes = web.RouteTableDef() - -async def clip_server(query, unpack_buffer=True): - async with aiohttp.ClientSession() as sess: - async with sess.post(CONFIG["clip_server"], data=umsgpack.dumps(query)) as res: - response = umsgpack.loads(await res.read()) - if res.status == 200: - if unpack_buffer: - response = [ numpy.frombuffer(x, dtype="float16") for x in response ] - return response - else: - raise Exception(response if res.headers.get("content-type") == "application/msgpack" else (await res.text())) - -@routes.post("/") -async def run_query(request): - data = await request.json() - embeddings = [] - if images := data.get("images", []): - embeddings.extend(await clip_server({ "images": [ base64.b64decode(x) for x, w in images ] })) - if text := data.get("text", []): - embeddings.extend(await clip_server({ "text": [ x for x, w in text ] })) - weights = [ w for x, w in images ] + [ w for x, w in text ] - embeddings = [ e * w for e, w in zip(embeddings, weights) ] - if not 
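A merged-scan helper to pair with the chunk_image sketch above; scan_one stands in for scan_image_chunk and is an assumption, not part of this codebase. The commented-out version offsets each region's y by MAX_SCAN_DIM times the segment index rather than by its chunk's own offset, which looks like a bug; this sketch shifts by the strip origin y0 instead:

from typing import Awaitable, Callable

from PIL import Image

async def scan_chunks(
    scan_one: Callable[[Image.Image], Awaitable[tuple[list[str], list[dict]]]],
    chunks: list[tuple[Image.Image, int]],
) -> tuple[str, list[dict]]:
    # Merge per-chunk OCR output, shifting each region back into
    # full-image coordinates by its strip's origin y0.
    text, regions = "", []
    for chunk, y0 in chunks:
        segments, seg_regions = await scan_one(chunk)
        for segment, region in zip(segments, seg_regions):
            text += segment + "\n"
            regions.append({**region, "y": region["y"] + y0, "text": segment})
    return text, regions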
embeddings: - return web.json_response([]) - return web.json_response(app["index"].search(sum(embeddings))) - -@routes.get("/") -async def health_check(request): - return web.Response(text="OK") - -@routes.post("/reload_index") -async def reload_index_route(request): - await request.app["index"].reload() - return web.json_response(True) - -def load_image(path, image_size): - im = Image.open(path) - im.draft("RGB", image_size) - buf = io.BytesIO() - im.resize(image_size).convert("RGB").save(buf, format="BMP") - return buf.getvalue(), path - -class Index: - def __init__(self, inference_server_config): - self.faiss_index = faiss.IndexFlatIP(inference_server_config["embedding_size"]) - self.associated_filenames = [] - self.inference_server_config = inference_server_config - self.lock = asyncio.Lock() - - def search(self, query): - distances, indices = self.faiss_index.search(numpy.array([query]), 4000) - distances = distances[0] - indices = indices[0] - try: - indices = indices[:numpy.where(indices==-1)[0][0]] - except IndexError: pass - return [ { "score": float(distance), "file": self.associated_filenames[index] } for index, distance in zip(indices, distances) ] - - async def reload(self): - async with self.lock: - with ProcessPoolExecutor(max_workers=12) as executor: - print("Indexing") - conn = await aiosqlite.connect(CONFIG["db_path"], parent_loop=asyncio.get_running_loop()) - conn.row_factory = aiosqlite.Row - await conn.executescript(""" - CREATE TABLE IF NOT EXISTS files ( - filename TEXT PRIMARY KEY, - modtime REAL NOT NULL, - embedding_vector BLOB NOT NULL - ); - """) - try: - async with asyncio.TaskGroup() as tg: - batch_sem = asyncio.Semaphore(32) - - modified = set() - - async def do_batch(batch): - try: - query = { "images": [ arg[2] for arg in batch ] } - embeddings = await clip_server(query, False) - await conn.executemany("INSERT OR REPLACE INTO files VALUES (?, ?, ?)", [ - (filename, modtime, embedding) for (filename, modtime, _), embedding in zip(batch, embeddings) - ]) - await conn.commit() - for filename, _, _ in batch: - modified.add(filename) - sys.stdout.write(".") - finally: - batch_sem.release() - - async def dispatch_batch(batch): - await batch_sem.acquire() - tg.create_task(do_batch(batch)) - - files = {} - for filename, modtime in await conn.execute_fetchall("SELECT filename, modtime FROM files"): - files[filename] = modtime - await conn.commit() - batch = [] - - for dirpath, _, filenames in os.walk(CONFIG["files"]): - paths = [] - for file in filenames: - path = os.path.join(dirpath, file) - file = os.path.relpath(path, CONFIG["files"]) - st = os.stat(path) - if st.st_mtime != files.get(file): - paths.append(path) - for task in asyncio.as_completed([ asyncio.get_running_loop().run_in_executor(executor, load_image, path, self.inference_server_config["image_size"]) for path in paths ]): - try: - b, path = await task - st = os.stat(path) - file = os.path.relpath(path, CONFIG["files"]) - except Exception as e: - print(file, "failed", e) - continue - batch.append((file, st.st_mtime, b)) - if len(batch) == self.inference_server_config["batch"]: - await dispatch_batch(batch) - batch = [] - if batch: - await dispatch_batch(batch) - - remove_indices = [] - for index, filename in enumerate(self.associated_filenames): - if filename not in files or filename in modified: - remove_indices.append(index) - self.associated_filenames[index] = None - if filename not in files: - await conn.execute("DELETE FROM files WHERE filename = ?", (filename,)) - await conn.commit() - # TODO 
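Index.search trims FAISS's -1 padding by slicing at the first -1 and catching IndexError when none is present. A slightly more direct numpy sketch using a boolean mask (search_index and filenames are illustrative names):

import faiss
import numpy as np

def search_index(index: faiss.IndexFlatIP, filenames: list[str],
                 query: np.ndarray, k: int = 4000) -> list[dict]:
    # FAISS pads the result with -1 ids when the index holds fewer than
    # k vectors; a boolean mask drops the padding in one step.
    distances, indices = index.search(query[np.newaxis, :].astype(np.float32), k)
    distances, indices = distances[0], indices[0]
    valid = indices != -1
    return [{"score": float(d), "file": filenames[i]}
            for d, i in zip(distances[valid], indices[valid])]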
concurrency - # TODO understand what that comment meant - if remove_indices: - self.faiss_index.remove_ids(numpy.array(remove_indices)) - self.associated_filenames = [ x for x in self.associated_filenames if x is not None ] - - filenames_set = set(self.associated_filenames) - new_data = [] - new_filenames = [] - async with conn.execute("SELECT * FROM files") as csr: - while row := await csr.fetchone(): - filename, modtime, embedding_vector = row - if filename not in filenames_set: - new_data.append(numpy.frombuffer(embedding_vector, dtype="float16")) - new_filenames.append(filename) - new_data = numpy.array(new_data) - self.associated_filenames.extend(new_filenames) - self.faiss_index.add(new_data) - finally: - await conn.close() - -app.router.add_routes(routes) - -cors = aiohttp_cors.setup(app, defaults={ - "*": aiohttp_cors.ResourceOptions( - allow_credentials=False, - expose_headers="*", - allow_headers="*", - ) -}) -for route in list(app.router.routes()): - cors.add(route) - -async def main(): - while True: - async with aiohttp.ClientSession() as sess: - try: - async with await sess.get(CONFIG["clip_server"] + "config") as res: - inference_server_config = umsgpack.unpackb(await res.read()) - print("Backend config:", inference_server_config) - break - except: - traceback.print_exc() - await asyncio.sleep(1) - index = Index(inference_server_config) - app["index"] = index - await index.reload() - print("Ready") - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite(runner, "", CONFIG["port"]) - await site.start() - -if __name__ == "__main__": - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - loop.run_until_complete(main()) - loop.run_forever() \ No newline at end of file diff --git a/misc/train_xgboost.py b/misc/train_xgboost.py deleted file mode 100644 index a320c57..0000000 --- a/misc/train_xgboost.py +++ /dev/null @@ -1,19 +0,0 @@ -import numpy -import xgboost as xgb - -import shared - -trains, validations = shared.fetch_ratings() - -ranker = xgb.XGBRanker( - tree_method="hist", - lambdarank_num_pair_per_sample=8, - objective="rank:ndcg", - lambdarank_pair_method="topk", - device="cuda" -) -flat_samples = [ sample for trainss in trains for sample in trainss ] -X = numpy.concatenate([ numpy.stack((meme1, meme2)) for meme1, meme2, rating in flat_samples ]) -Y = numpy.concatenate([ numpy.stack((int(rating), int(1 - rating))) for meme1, meme2, rating in flat_samples ]) -qid = numpy.concatenate([ numpy.stack((i, i)) for i in range(len(flat_samples)) ]) -ranker.fit(X, Y, qid=qid, verbose=True) \ No newline at end of file
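train_xgboost.py lays each pairwise preference out as a two-row group: both memes' feature vectors, labels (rating, 1 - rating), and a shared qid per comparison. A self-contained sketch of the same layout, with hypothetical random features standing in for shared.fetch_ratings() and with device="cuda" dropped so it runs anywhere:

import numpy as np
import xgboost as xgb

# Hypothetical stand-in for shared.fetch_ratings(): each comparison is
# (features_a, features_b, a_wins) with a_wins in {0, 1}.
dim = 8
rng = np.random.default_rng(0)
pairs = [(rng.normal(size=dim), rng.normal(size=dim), int(rng.integers(0, 2)))
         for _ in range(100)]

# Each comparison becomes one two-row group: the two feature vectors,
# labels (rating, 1 - rating), and a shared query id.
X = np.concatenate([np.stack((a, b)) for a, b, _ in pairs])
y = np.concatenate([np.stack((r, 1 - r)) for _, _, r in pairs])
qid = np.concatenate([np.full(2, i) for i in range(len(pairs))])

ranker = xgb.XGBRanker(
    tree_method="hist",
    objective="rank:ndcg",
    lambdarank_pair_method="topk",
    lambdarank_num_pair_per_sample=8,
)
ranker.fit(X, y, qid=qid, verbose=True)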