mirror of
https://github.com/osmarks/meme-search-engine.git
synced 2024-11-14 15:54:48 +00:00
266 lines · 8.6 KiB · Go
package main

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"math"
	"mime/multipart"
	"net/http"
	"net/textproto"
	"regexp"
	"strings"
	"time"

	"github.com/davidbyttow/govips/v2/vips"
	"github.com/samber/lo"
	"github.com/titanous/json5"
)

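// The Lens endpoint returns an HTML page that embeds its payload as a JSON5-ish
// object literal passed to AF_initDataCallback; the capture group below grabs
// that literal so it can be parsed with json5.Unmarshal.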
const CALLBACK_REGEX string = ">AF_initDataCallback\\(({key: 'ds:1'.*?)\\);</script>"

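// SegmentCoords is a bounding box in pixel coordinates (top-left origin),
// as produced by rationalizeCoordsFormat1 below.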
type SegmentCoords struct {
	x int
	y int
	w int
	h int
}

type Segment struct {
	coords SegmentCoords
	text   string
}

type ScanResult []Segment

// TODO coordinates are negative sometimes and I think they shouldn't be
func rationalizeCoordsFormat1(imageW float64, imageH float64, centerXFraction float64, centerYFraction float64, widthFraction float64, heightFraction float64) SegmentCoords {
	return SegmentCoords{
		x: int(math.Round((centerXFraction - widthFraction/2) * imageW)),
		y: int(math.Round((centerYFraction - heightFraction/2) * imageH)),
		w: int(math.Round(widthFraction * imageW)),
		h: int(math.Round(heightFraction * imageH)),
	}
}

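// For example: on a 1000x500 image, a region reported as center (0.5, 0.5)
// with fractional size (0.2, 0.1) becomes SegmentCoords{x: 400, y: 225, w: 200, h: 50}.
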
func scanImageChunk(image []byte, imageWidth int, imageHeight int) (ScanResult, error) {
	var result ScanResult
	timestamp := time.Now().UnixMicro()
	var b bytes.Buffer
	w := multipart.NewWriter(&b)
	defer w.Close()
	h := make(textproto.MIMEHeader)
	h.Set("Content-Disposition", fmt.Sprintf(`form-data; name="encoded_image"; filename="ocr%d.png"`, timestamp))
	h.Set("Content-Type", "image/png")
	fw, err := w.CreatePart(h)
	if err != nil {
		return result, err
	}
	fw.Write(image)
	w.Close()

	req, err := http.NewRequest("POST", fmt.Sprintf("https://lens.google.com/v3/upload?stcs=%d", timestamp), &b)
	if err != nil {
		return result, err
	}
	req.Header.Add("User-Agent", "Mozilla/5.0 (Linux; Android 13; RMX3771) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.144 Mobile Safari/537.36")
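	// The SOCS cookie carries Google's cookie-consent state; sending a canned
	// value appears to bypass the consent interstitial that would otherwise be
	// returned instead of the OCR payload.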
	req.AddCookie(&http.Cookie{
		Name:  "SOCS",
		Value: "CAESEwgDEgk0ODE3Nzk3MjQaAmVuIAEaBgiA_LyaBg",
	})
	req.Header.Set("Content-Type", w.FormDataContentType())
	client := http.Client{}
	res, err := client.Do(req)
	if err != nil {
		return result, err
	}
	defer res.Body.Close()
	body, err := io.ReadAll(res.Body)
	if err != nil {
		return result, err
	}
	re := regexp.MustCompile(CALLBACK_REGEX)
	matches := re.FindStringSubmatch(string(body))
	if len(matches) == 0 {
		return result, fmt.Errorf("invalid API response")
	}
	match := matches[1]
	var lensObject map[string]interface{}
	err = json5.Unmarshal([]byte(match), &lensObject)
	if err != nil {
		return result, err
	}

	if _, ok := lensObject["errorHasStatus"]; ok {
		return result, errors.New("lens failed")
	}

	root := lensObject["data"].([]interface{})

	var textSegments []string
	var textRegions []SegmentCoords

	// I don't know why Google did this.
	// Text segments are in one place and their locations are in another, using a very strange coordinate system.
	// At least I don't need whatever is contained in the base64 parts (which I assume are protobufs).
	// TODO: on a few images, this seems to not work for some reason.
	defer func() {
		if r := recover(); r != nil {
			// https://github.com/dimdenGD/chrome-lens-ocr/blob/main/src/core.js#L316 has code for a fallback text segment read mode.
			// In testing, this proved unnecessary (quirks of the HTTP request? I don't know), and this only happens on textless images.
			textSegments = []string{}
			textRegions = []SegmentCoords{}
		}
	}()

	textSegmentsRaw := root[3].([]interface{})[4].([]interface{})[0].([]interface{})[0].([]interface{})
	textRegionsRaw := root[2].([]interface{})[3].([]interface{})[0].([]interface{})
	for _, x := range textRegionsRaw {
		if strings.HasPrefix(x.([]interface{})[11].(string), "text:") {
			rawCoords := x.([]interface{})[1].([]interface{})
			coords := rationalizeCoordsFormat1(float64(imageWidth), float64(imageHeight), rawCoords[0].(float64), rawCoords[1].(float64), rawCoords[2].(float64), rawCoords[3].(float64))
			textRegions = append(textRegions, coords)
		}
	}
	for _, x := range textSegmentsRaw {
		textSegment := x.(string)
		textSegments = append(textSegments, textSegment)
	}

	return lo.Map(lo.Zip2(textSegments, textRegions), func(x lo.Tuple2[string, SegmentCoords], _ int) Segment {
		return Segment{
			text:   x.A,
			coords: x.B,
		}
	}), nil
}

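// Lens appears to cap scans at roughly 1000px per side (see the legacy Python
// below: "not actually true but close enough"), so images are downscaled to
// MAX_DIM wide and cut into MAX_DIM-tall chunks.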
const MAX_DIM int = 1024

func scanImage(image *vips.ImageRef) (ScanResult, error) {
	result := ScanResult{}
	width := image.Width()
	height := image.Height()
	if width > MAX_DIM {
		width = MAX_DIM
		height = int(math.Round(float64(height) * (float64(width) / float64(image.Width()))))
	}
	downscaled, err := image.Copy()
	if err != nil {
		return result, err
	}
	// Resize mutates the ImageRef in place, hence the copy above.
	err = downscaled.Resize(float64(width)/float64(image.Width()), vips.KernelLanczos3)
	if err != nil {
		return result, err
	}
	for y := 0; y < height; y += MAX_DIM {
		chunkHeight := MAX_DIM
		if y+chunkHeight > height {
			chunkHeight = height - y
		}
		// ExtractArea also mutates in place, so each chunk gets its own copy
		// of the downscaled image. TODO this really really should not be in-place
		chunk, err := downscaled.Copy()
		if err != nil {
			return result, err
		}
		err = chunk.ExtractArea(0, y, width, chunkHeight)
		if err != nil {
			return result, err
		}
		buf, _, err := chunk.ExportPng(&vips.PngExportParams{})
		if err != nil {
			return result, err
		}
		res, err := scanImageChunk(buf, width, chunkHeight)
		if err != nil {
			return result, err
		}
		for _, segment := range res {
			result = append(result, Segment{
				text: segment.text,
				coords: SegmentCoords{
					y: segment.coords.y + y,
					x: segment.coords.x,
					w: segment.coords.w,
					h: segment.coords.h,
				},
			})
		}
	}

	return result, nil
}

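// Minimal usage sketch (hypothetical file path, error handling abbreviated;
// govips requires Startup before decoding images):
//
//	vips.Startup(nil)
//	defer vips.Shutdown()
//	img, err := vips.NewImageFromFile("meme.png")
//	if err != nil { panic(err) }
//	segments, err := scanImage(img)
//	if err != nil { panic(err) }
//	for _, s := range segments {
//		fmt.Printf("%q at (%d, %d) %dx%d\n", s.text, s.coords.x, s.coords.y, s.coords.w, s.coords.h)
//	}
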
/*
async def scan_image_chunk(sess, image):
    # send data to inscrutable undocumented Google service
    # https://github.com/AuroraWright/owocr/blob/master/owocr/ocr.py#L193
    async with aiohttp.ClientSession() as sess:
        data = aiohttp.FormData()
        data.add_field(
            "encoded_image",
            encode_img(image),
            filename="ocr" + str(timestamp) + ".png",
            content_type="image/png"
        )
        async with sess.post(url, headers=headers, cookies=cookies, data=data, timeout=10) as res:
            body = await res.text()

    # I really worry about Google sometimes. This is not a sensible format.
    match = CALLBACK_REGEX.search(body)
    if match is None:
        raise ValueError("Invalid callback")

    lens_object = pyjson5.loads(match.group(1))
    if "errorHasStatus" in lens_object:
        raise RuntimeError("Lens failed")

    text_segments = []
    text_regions = []

    root = lens_object["data"]

    # I don't know why Google did this.
    # Text segments are in one place and their locations are in another, using a very strange coordinate system.
    # At least I don't need whatever is contained in the base64 parts (which I assume are protobufs).
    # TODO: on a few images, this seems to not work for some reason.
    try:
        text_segments = root[3][4][0][0]
        text_regions = [ rationalize_coords_format1(image.width, image.height, *x[1]) for x in root[2][3][0] if x[11].startswith("text:") ]
    except (KeyError, IndexError):
        # https://github.com/dimdenGD/chrome-lens-ocr/blob/main/src/core.js#L316 has code for a fallback text segment read mode.
        # In testing, this proved unnecessary (quirks of the HTTP request? I don't know), and this only happens on textless images.
        return [], []

    return text_segments, text_regions

MAX_SCAN_DIM = 1000 # not actually true but close enough
def chunk_image(image: Image):
    chunks = []
    # Cut image down in X axis (I'm assuming images aren't too wide to scan in downscaled form because merging text horizontally would be annoying)
    if image.width > MAX_SCAN_DIM:
        image = image.resize((MAX_SCAN_DIM, round(image.height * (MAX_SCAN_DIM / image.width))), Image.LANCZOS)
    for y in range(0, image.height, MAX_SCAN_DIM):
        chunks.append(image.crop((0, y, image.width, min(y + MAX_SCAN_DIM, image.height))))
    return chunks

async def scan_chunks(sess: aiohttp.ClientSession, chunks: [Image]):
    # If text happens to be split across the cut line it won't get read.
    # This is because doing overlap read areas would be really annoying.
    text = ""
    regions = []
    for chunk_index, chunk in enumerate(chunks):
        new_segments, new_regions = await scan_image_chunk(sess, chunk)
        for segment in new_segments:
            text += segment + "\n"
        for segment, region in zip(new_segments, new_regions):
            regions.append({ **region, "y": region["y"] + (MAX_SCAN_DIM * chunk_index), "text": segment })
    return text, regions

async def scan_image(sess: aiohttp.ClientSession, image: Image):
    return await scan_chunks(sess, chunk_image(image))

if __name__ == "__main__":
    async def main():
        async with aiohttp.ClientSession() as sess:
            print(await scan_image(sess, Image.open("/data/public/memes-or-something/linear-algebra-chess.png")))
    asyncio.run(main())
*/