mirror of https://github.com/osmarks/website synced 2026-05-02 19:51:31 +00:00

Minor updates, prototype internal search mechanism

Commit 8d81924804, parent 5436237317, authored 2024-08-31 18:36:07 +01:00
13 changed files with 1032 additions and 39 deletions

src/fts.mjs (new file, 99 lines)

@@ -0,0 +1,99 @@
import * as R from "ramda"
import * as htmlToText from "html-to-text"
import * as binaryFuseFilter from "binary-fuse-filter"
import { xxHash32 as hash } from "js-xxhash"
import * as msgpack from "msgpackr"
import { BIGRAM_SEPARATOR, FREQUENCY_SEPARATOR, FREQUENCY_THRESHOLDS, tokenize } from "./fts_common.mjs"
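// Prototype full-text search. Each document is indexed as a binary fuse filter
// over hashed (term, frequency threshold) keys plus a PMI-selected subset of its
// bigrams; the client answers queries purely through filter membership tests,
// so no posting lists are shipped to the browser.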
const index = []
const BIGRAM_INCLUSION = 0.3
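// cap bigram keys at this fraction of a document's term/threshold key count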
const URL = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/g
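// URLs are blanked out before text extraction so link targets do not pollute the token stream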
export const stripHTML = html => htmlToText.convert(html.replace(URL, " "), {
wordwrap: false,
selectors: [
{ selector: "a", options: { ignoreHref: true } },
{ selector: "img", format: "skip" }
]
})
export const pushEntry = (sourceType, entry) => {
const { html, url, timestamp, title, description, ignoreDescription } = entry
// TODO: this puts URLs inline, maybe do something with that
const text = (title ?? "") + " " + (!ignoreDescription ? (description && stripHTML(description)) ?? "" : "") + " " + stripHTML(html)
const words = tokenize(text)
const counts = {}
for (const word of words) {
counts[word] = (counts[word] ?? 0) + 1
}
const bigrams = {}
for (const [a, b] of R.zip(words, R.drop(1, words))) {
bigrams[a + BIGRAM_SEPARATOR + b] = (bigrams[a + BIGRAM_SEPARATOR + b] ?? 0) + 1
}
index.push({
url,
timestamp,
counts,
bigrams,
title,
description,
sourceType
})
}
export const build = () => {
let totalTerms = 0
let totalBigrams = 0
const totalBigramCounts = {}
const totalTermCounts = {}
for (const entry of index) {
for (const [bigram, count] of Object.entries(entry.bigrams)) {
totalBigramCounts[bigram] = (totalBigramCounts[bigram] ?? 0) + count
totalBigrams += count
}
for (const [word, count] of Object.entries(entry.counts)) {
totalTermCounts[word] = (totalTermCounts[word] ?? 0) + count
totalTerms += count
}
}
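// Association score in the spirit of pointwise mutual information (the log is
// omitted since only the ordering matters), negated so an ascending sort puts
// the most informative bigrams first.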
const pmi = (bigram, count) => {
const [a, b] = bigram.split(BIGRAM_SEPARATOR, 2)
// bigram provides no useful information if term is unique anyway
// want ascending order (lower is better)
if (totalTermCounts[a] === 1 || totalTermCounts[b] === 1) { return 0 }
return -(count / totalBigrams) / ((totalTermCounts[a] / totalTerms) * (totalTermCounts[b] / totalTerms))
}
const pmis = new Map(Object.entries(totalBigramCounts).map(([k, v]) => [k, pmi(k, v)]))
const records = []
for (const entry of index) {
const keys = []
for (const [word, count] of Object.entries(entry.counts)) {
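// Quantize term frequency into the filter: a term seen n times gets one key per
// threshold <= n, e.g. n = 6 yields keys for thresholds 1 and 4 but not 9.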
for (const threshold of FREQUENCY_THRESHOLDS) {
if (count >= threshold) {
keys.push(hash(word + FREQUENCY_SEPARATOR + threshold))
}
}
}
const sorted = R.sortBy(([bigram]) => pmis.get(bigram), Object.entries(entry.bigrams))
for (const [bigram, count] of sorted.slice(0, Math.ceil(keys.length * BIGRAM_INCLUSION))) {
keys.push(hash(bigram))
}
const [filter, err] = binaryFuseFilter.populateBinaryFuse8(keys)
if (err) {
throw err // Golang...
}
filter.Fingerprints = new Uint8Array(filter.Fingerprints)
records.push({
filter,
url: entry.url,
timestamp: entry.timestamp ? entry.timestamp.format("YYYY-MM-DD") : null,
title: entry.title,
description: entry.description,
sourceType: entry.sourceType
})
}
console.log(`Total terms: ${totalTerms}`)
console.log(`Total bigrams: ${totalBigrams}`)
return msgpack.pack(records)
}

src/fts_client.mjs (new file, 81 lines)

@@ -0,0 +1,81 @@
import { BIGRAM_SEPARATOR, FREQUENCY_SEPARATOR, FREQUENCY_THRESHOLDS, tokenize } from "./fts_common.mjs"
import { populateBinaryFuse8 } from "binary-fuse-filter"
import { xxHash32 as hash } from "js-xxhash"
import { unpack } from "msgpackr"
import { drop, zip } from "ramda"
const SCORE_EXP = 1.9
const BIGRAM_FACTOR = 3
let index = null
const query = input => {
const tokens = tokenize(input)
const bigrams = new Set()
for (const [a, b] of zip(tokens, drop(1, tokens))) {
bigrams.add(a + BIGRAM_SEPARATOR + b)
}
const cache = {}
const hashCached = x => {
if (cache[x]) { return cache[x] }
const ret = hash(x)
cache[x] = ret
return ret
}
const results = []
for (const doc of index) {
let score = 0
for (const token of tokens) {
let count = 0
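// Probe thresholds in ascending order and only trust a higher threshold if the
// lowest one also matched, which suppresses filter false positives.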
for (const frequency of FREQUENCY_THRESHOLDS) {
const query = hashCached(token + FREQUENCY_SEPARATOR + frequency)
if (doc.filter.contains(query)) {
if (count > 0 || frequency == FREQUENCY_THRESHOLDS[0]) {
count = frequency
}
}
}
if (count > 0) {
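// count is the largest matched threshold, i.e. a quantized term frequency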
score += SCORE_EXP ** count
}
}
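// query bigrams act as phrase evidence: each one found multiplies the whole score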
for (const bigram of bigrams) {
if (doc.filter.contains(hashCached(bigram))) {
score *= BIGRAM_FACTOR
}
}
if (score > 0) {
results.push({
score,
url: doc.url,
title: doc.title,
description: doc.description,
timestamp: doc.timestamp,
sourceType: doc.sourceType
})
}
}
results.sort((a, b) => b.score - a.score)
return results.slice(0, 10)
}
const loadIndex = async () => {
if (index) { return }
index = await fetch("/fts.bin").then(x => x.arrayBuffer()).then(x => new Uint8Array(x)).then(x => unpack(x))
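// msgpack round-trips each filter as a plain object, so rebuild a real filter
// instance by populating an empty one and copying the serialized fields onto it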
index.forEach(entry => {
const x = entry.filter
const newFilter = populateBinaryFuse8([])[0]
newFilter.Seed = BigInt(x.Seed)
newFilter.SegmentLength = x.SegmentLength
newFilter.SegmentLengthMask = x.SegmentLengthMask
newFilter.SegmentCount = x.SegmentCount
newFilter.SegmentCountLength = x.SegmentCountLength
newFilter.len = x.len
newFilter.Fingerprints = x.Fingerprints
entry.filter = newFilter
})
}
await loadIndex()
export default query

src/fts_common.mjs (new file, 18 lines)

@@ -0,0 +1,18 @@
import * as stemmer from "porter2"
export const BIGRAM_SEPARATOR = "\x00"
export const FREQUENCY_SEPARATOR = "\x01"
export const FREQUENCY_THRESHOLDS = [1, 4, 9]
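// term counts are bucketed at these thresholds on both the build and query sides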
const NUMERIC = /^[0-9\.e]+$/
function segmentWords(text) {
if (typeof Intl !== "undefined" && Intl.Segmenter) {
const segmenter = new Intl.Segmenter("en", { granularity: "word" })
return Array.from(segmenter.segment(text)).filter(x => x.isWordLike).filter(x => !NUMERIC.test(x.segment)).map(x => x.segment)
} else {
// Fallback path
return text.split(/[\s\p{P}]+/u).filter(Boolean)
}
}
export const tokenize = x => segmentWords(x).map(x => x.toLowerCase()).map(stemmer.stem)


@@ -67,5 +67,6 @@
["bee.png", "https://citrons.xyz/a/memetic-apioform-page.html"],
["perceptron.png", "https://en.wikipedia.org/wiki/Perceptron"],
["rhombic_dodecahedron.gif", "https://en.wikipedia.org/wiki/Rhombic_dodecahedron"]
],
"mycorrhiza": "https://docs.osmarks.net"
}


@@ -22,6 +22,12 @@ const sqlite = require("better-sqlite3")
const axios = require("axios")
const msgpack = require("@msgpack/msgpack")
const esbuild = require("esbuild")
const htmlparser2 = require("htmlparser2")
const cssSelect = require("css-select")
const domSerializer = require("dom-serializer")
const domutils = require("domutils")
const fts = require("./fts.mjs")
dayjs.extend(customParseFormat)
@@ -33,6 +39,7 @@ const errorPagesDir = path.join(root, "error")
const assetsDir = path.join(root, "assets")
const outDir = path.join(root, "out")
const srcDir = path.join(root, "src")
const nodeModules = path.join(root, "node_modules")
const buildID = nanoid()
globalData.buildID = buildID
@@ -76,7 +83,6 @@ globalData.hashBG = hashBG
const removeExtension = x => x.replace(/\.[^/.]+$/, "")
const mdutils = MarkdownIt().utils
const renderContainer = (tokens, idx) => {
let opening = true
if (tokens[idx].type === "container__close") {
@@ -144,6 +150,7 @@ const renderContainer = (tokens, idx) => {
const readFile = path => fsp.readFile(path, { encoding: "utf8" })
const anchor = require("markdown-it-anchor")
const { htmlToText } = require("html-to-text")
const md = new MarkdownIt({ html: true })
.use(require("markdown-it-container"), "", { render: renderContainer, validate: params => true })
.use(require("markdown-it-footnote"))
@@ -152,6 +159,7 @@ const md = new MarkdownIt({ html: true })
symbol: "§"
})
})
.use(require("@vscode/markdown-it-katex").default)
const minifyHTML = x => htmlMinifier(x, {
collapseWhitespace: true,
sortAttributes: true,
@@ -236,7 +244,15 @@ const processExperiments = async () => {
return fse.copy(path.join(subdirectory, file), path.join(out, file))
}
}))
const indexPath = path.join(out, "index.html")
fts.pushEntry("experiment", {
url: "/" + page.data.slug,
title: page.data.title,
description: page.data.description,
html: page.content,
timestamp: dayjs(await fsp.stat(indexPath).then(x => x.mtimeMs))
})
return indexPath
},
{ processMeta: meta => {
meta.slug = meta.slug || basename
@@ -254,6 +270,13 @@ const processBlog = async () => {
meta.wordCount = page.content.split(/\s+/).map(x => x.trim()).filter(x => x).length
meta.haveSidenotes = true
meta.content = renderMarkdown(page.content)
fts.pushEntry("blog", {
html: meta.content,
url: "/" + meta.slug,
timestamp: meta.updated ?? meta.created,
title: meta.title,
description: meta.description
})
return meta
})
@@ -332,16 +355,32 @@ const writeCache = (k, v, ts=Date.now()) => {
writeCacheStmt.run(k, Buffer.from(enc.buffer, enc.byteOffset, enc.byteLength), ts)
}
const DESC_CUT_LEN = 256
const fetchMicroblog = async () => {
const cached = readCache("microblog", 60*60*1000)
if (cached) {
globalData.microblog = cached
} else {
// We have a server patch which removes the 20-post hardcoded limit.
// For some exciting reason microblog.pub does not expose pagination in the *API* components.
// This is a workaround.
const posts = (await axios({ url: globalData.microblogSource, headers: { "Accept": 'application/ld+json; profile="https://www.w3.org/ns/activitystreams"' } })).data.orderedItems
writeCache("microblog", posts)
globalData.microblog = posts
}
for (const post of globalData.microblog) {
if (!post.object.content) { continue }
const desc = fts.stripHTML(post.object.content)
fts.pushEntry("microblog", {
url: post.object.id,
timestamp: dayjs(post.object.published),
html: post.object.content,
description: desc.length > DESC_CUT_LEN ? desc.slice(0, DESC_CUT_LEN) + "..." : desc,
ignoreDescription: true
})
}
globalData.microblog = globalData.microblog.slice(0, 6).map((post, i) => minifyHTML(globalData.templates.activitypub({
...globalData,
permalink: post.object.id,
@@ -399,13 +438,24 @@ const minifyJSTask = async () => {
}
const compilePageJSTask = async () => {
await Promise.all([
esbuild.build({
entryPoints: [ path.join(srcDir, "page.js") ],
bundle: true,
outfile: path.join(outAssets, "js/page.js"),
minify: true,
sourcemap: true,
external: ["/assets/js/fts_client.js"]
}),
esbuild.build({
entryPoints: [ path.join(srcDir, "fts_client.mjs") ],
bundle: true,
outfile: path.join(outAssets, "js/fts_client.js"),
minify: true,
sourcemap: true,
format: "esm"
})
])
}
const compileServiceWorkerJSTask = async () => {
@@ -421,18 +471,14 @@ const compileServiceWorkerJSTask = async () => {
})
}
const genServiceWorker = async () => {
const serviceWorker = mustache.render(await readFile(path.join(assetsDir, "sw.js")), globalData)
await minifyJSFile(serviceWorker, "sw.js", path.join(outDir, "sw.js"))
}
const copyAsset = subpath => fse.copy(path.join(assetsDir, subpath), path.join(outAssets, subpath))
const doImages = async () => {
copyAsset("images")
copyAsset("titillium-web.woff2")
copyAsset("titillium-web-semibold.woff2")
copyAsset("miracode.woff2")
await Promise.all(["images", "titillium-web.woff2", "titillium-web-semibold.woff2", "miracode.woff2", "misc"].map(subpath => fse.copy(path.join(assetsDir, subpath), path.join(outAssets, subpath))))
await fse.copy(path.join(nodeModules, "katex", "dist", "fonts"), path.join(outAssets, "fonts"))
await fse.copy(path.join(nodeModules, "katex", "dist", "katex.min.css"), path.join(outAssets, "katex.min.css"))
globalData.images = {}
await Promise.all(
(await fse.readdir(path.join(assetsDir, "images"), { encoding: "utf-8" })).map(async image => {
@@ -471,6 +517,37 @@ const doImages = async () => {
)
}
const fetchMycorrhiza = async () => {
const allPages = await axios({ url: globalData.mycorrhiza + "/list" })
const dom = htmlparser2.parseDocument(allPages.data)
const urls = cssSelect.selectAll("main > ol a", dom).map(x => x.attribs.href)
for (const url of urls) {
// TODO: this can run in parallel
const page = await axios({ url: globalData.mycorrhiza + url })
const dom = htmlparser2.parseDocument(page.data)
const title = domutils.innerText(cssSelect.selectAll(".navi-title a, .navi-title span", dom).slice(2))
const article = cssSelect.selectOne("main #hypha article", dom)
const content = article ? domSerializer.render(article) : ""
let description = null
if (description = cssSelect.selectOne('meta[property="og:description"]', dom)) {
description = description.attribs.content
}
fts.pushEntry("mycorrhiza", {
url: globalData.mycorrhiza + url,
title,
description,
html: content,
timestamp: null
})
}
}
const buildFTS = async () => {
console.log(chalk.yellow("Building full-text search index"))
const blob = fts.build()
await fsp.writeFile(path.join(outDir, "fts.bin"), blob)
}
const tasks = {
errorPages: { deps: ["pagedeps"], fn: processErrorPages },
templates: { deps: [], fn: loadTemplates },
@@ -491,7 +568,9 @@ const tasks = {
images: { deps: ["assetsDir"], fn: doImages },
offlinePage: { deps: ["assetsDir", "pagedeps"], fn: () => applyTemplate(globalData.templates.experiment, path.join(assetsDir, "offline.html"), () => path.join(outAssets, "offline.html"), {}) },
assets: { deps: ["manifest", "minifyJS", "serviceWorker", "images", "compilePageJS"] },
main: { deps: ["writeBuildID", "index", "errorPages", "assets", "experiments", "blog", "rss"] },
searchIndex: { deps: ["blog", "fetchMicroblog", "fetchMycorrhiza", "experiments"], fn: buildFTS },
fetchMycorrhiza: { deps: [], fn: fetchMycorrhiza }
}
const compile = async () => {


@@ -63,6 +63,14 @@ const hashString = function(str, seed = 0) {
const colHash = (str, saturation = 100, lightness = 70) => `hsl(${hashString(str) % 360}, ${saturation}%, ${lightness}%)`
window.colHash = colHash
const e = (cls, parent, content, type="div") => {
const element = document.createElement(type)
element.classList.add(cls)
if (content) { element.appendChild(document.createTextNode(content)) }
if (parent) { parent.appendChild(element) }
return element
}
// Arbitrary Points code, wrapped in an IIFE to not pollute the global environment much more than it already is
window.points = (async () => {
const achievementInfo = {
@@ -242,14 +250,6 @@ window.points = (async () => {
},
});
const achievementsContainer = e("achievements", document.body)
const displayAchievement = (title, description, conditions, points) => {
const elem = e("achievement", achievementsContainer)
@@ -589,4 +589,59 @@ if (customStyle) {
document.head.appendChild(customStyleEl)
}
window.customStyleEl = customStyleEl
window.customStyle = customStyle
const nameMappings = {
"blog": "Blog",
"microblog": "Microblog",
"experiment": "Experiments",
"mycorrhiza": "Documentation"
}
// replace login navbar option with search because whatever
const loginButton = document.querySelector("nav a:last-of-type")
loginButton.href = "#"
loginButton.innerText = "Search"
loginButton.onclick = async ev => {
ev.preventDefault()
const query = (await import("/assets/js/fts_client.js")).default
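// the search client (and, via its top-level await, the index blob) is only loaded on first use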
const overlay = document.createElement("div")
overlay.classList.add("search-overlay")
document.body.appendChild(overlay)
const input = document.createElement("input")
input.type = "text"
input.placeholder = "Search"
let resultsEl
input.oninput = () => {
if (resultsEl) {
resultsEl.remove()
}
resultsEl = document.createElement("div")
resultsEl.classList.add("search-results")
for (const result of query(input.value)) {
const item = e("search-result", resultsEl)
const titleLine = nameMappings[result.sourceType] + " / " + (result.title ?? result.timestamp)
const link = e("search-result-link", item, titleLine, "a")
link.setAttribute("href", result.url)
if (result.title && result.timestamp) {
e("deemph", item, result.timestamp)
}
if (result.description) {
e("description", item, result.description)
}
item.style.border = `${colHash(result.sourceType)} solid 4px`
item.style.background = `${colHash(result.sourceType, 50, 10)}`
}
overlay.appendChild(resultsEl)
}
input.onkeydown = ev => {
if (ev.key === "Enter" || ev.key === "Backspace") {
if (input.value === "") {
// quit search mode
overlay.remove()
}
}
}
overlay.appendChild(input)
input.focus()
}


@@ -361,4 +361,29 @@ table
text-align: right
.buttons .button
margin: 0.2em
.search-overlay
position: fixed
overflow: scroll
top: 0
left: 0
bottom: 0
right: 0
background: rgba(0, 0, 0, 0.9)
padding: 2em
color: white
z-index: 1000
input
margin: 0.5em
font-size: 1.5em
border: 4px solid green
a
color: white
.search-results
width: 100%
.search-result
padding: 0.5em
margin: 0.5em
.description
font-style: italic