mirror of https://github.com/osmarks/website synced 2026-05-02 19:51:31 +00:00

Minor updates, prototype internal search mechanism

Commit 8d81924804, parent 5436237317, authored 2024-08-31 18:36:07 +01:00
13 changed files with 1032 additions and 39 deletions

src/fts.mjs (new file, 99 lines)

@@ -0,0 +1,99 @@
import * as R from "ramda"
import * as htmlToText from "html-to-text"
import * as binaryFuseFilter from "binary-fuse-filter"
import { xxHash32 as hash } from "js-xxhash"
import * as msgpack from "msgpackr"
import { BIGRAM_SEPARATOR, FREQUENCY_SEPARATOR, FREQUENCY_THRESHOLDS, tokenize } from "./fts_common.mjs"
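// Prototype full-text search. Each document is indexed as a binary fuse filter
// over hashed (term, frequency threshold) keys plus a PMI-selected subset of its
// bigrams; the client answers queries purely through filter membership tests,
// so no posting lists are shipped to the browser.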
const index = []
const BIGRAM_INCLUSION = 0.3
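// cap bigram keys at this fraction of a document's term/threshold key count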
const URL = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/g
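// URLs are blanked out before text extraction so link targets do not pollute the token stream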
export const stripHTML = html => htmlToText.convert(html.replace(URL, " "), {
wordwrap: false,
selectors: [
{ selector: "a", options: { ignoreHref: true } },
{ selector: "img", format: "skip" }
]
})
export const pushEntry = (sourceType, entry) => {
const { html, url, timestamp, title, description, ignoreDescription } = entry
// TODO: this puts URLs inline, maybe do something with that
const text = (title ?? "") + " " + (!ignoreDescription ? (description && stripHTML(description)) ?? "" : "") + " " + stripHTML(html)
const words = tokenize(text)
const counts = {}
for (const word of words) {
counts[word] = (counts[word] ?? 0) + 1
}
const bigrams = {}
for (const [a, b] of R.zip(words, R.drop(1, words))) {
bigrams[a + BIGRAM_SEPARATOR + b] = (bigrams[a + BIGRAM_SEPARATOR + b] ?? 0) + 1
}
index.push({
url,
timestamp,
counts,
bigrams,
title,
description,
sourceType
})
}
export const build = () => {
let totalTerms = 0
let totalBigrams = 0
const totalBigramCounts = {}
const totalTermCounts = {}
for (const entry of index) {
for (const [bigram, count] of Object.entries(entry.bigrams)) {
totalBigramCounts[bigram] = (totalBigramCounts[bigram] ?? 0) + count
totalBigrams += count
}
for (const [word, count] of Object.entries(entry.counts)) {
totalTermCounts[word] = (totalTermCounts[word] ?? 0) + count
totalTerms += count
}
}
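// Association score in the spirit of pointwise mutual information (the log is
// omitted since only the ordering matters), negated so an ascending sort puts
// the most informative bigrams first.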
const pmi = (bigram, count) => {
const [a, b] = bigram.split(BIGRAM_SEPARATOR, 2)
// bigram provides no useful information if term is unique anyway
// want ascending order (lower is better)
if (totalTermCounts[a] === 1 || totalTermCounts[b] === 1) { return 0 }
return -(count / totalBigrams) / ((totalTermCounts[a] / totalTerms) * (totalTermCounts[b] / totalTerms))
}
const pmis = new Map(Object.entries(totalBigramCounts).map(([k, v]) => [k, pmi(k, v)]))
const records = []
for (const entry of index) {
const keys = []
for (const [word, count] of Object.entries(entry.counts)) {
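// Quantize term frequency into the filter: a term seen n times gets one key per
// threshold <= n, e.g. n = 6 yields keys for thresholds 1 and 4 but not 9.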
for (const threshold of FREQUENCY_THRESHOLDS) {
if (count >= threshold) {
keys.push(hash(word + FREQUENCY_SEPARATOR + threshold))
}
}
}
const sorted = R.sortBy(([bigram]) => pmis.get(bigram), Object.entries(entry.bigrams))
for (const [bigram, count] of sorted.slice(0, Math.ceil(keys.length * BIGRAM_INCLUSION))) {
keys.push(hash(bigram))
}
const [filter, err] = binaryFuseFilter.populateBinaryFuse8(keys)
if (err) {
throw err // Golang...
}
filter.Fingerprints = new Uint8Array(filter.Fingerprints)
records.push({
filter,
url: entry.url,
timestamp: entry.timestamp ? entry.timestamp.format("YYYY-MM-DD") : null,
title: entry.title,
description: entry.description,
sourceType: entry.sourceType
})
}
console.log(`Total terms: ${totalTerms}`)
console.log(`Total bigrams: ${totalBigrams}`)
return msgpack.pack(records)
}

src/fts_client.mjs (new file, 81 lines)

@@ -0,0 +1,81 @@
import { BIGRAM_SEPARATOR, FREQUENCY_SEPARATOR, FREQUENCY_THRESHOLDS, tokenize } from "./fts_common.mjs"
import { populateBinaryFuse8 } from "binary-fuse-filter"
import { xxHash32 as hash } from "js-xxhash"
import { unpack } from "msgpackr"
import { drop, zip } from "ramda"
const SCORE_EXP = 1.9
const BIGRAM_FACTOR = 3
let index = null
const query = input => {
const tokens = tokenize(input)
const bigrams = new Set()
for (const [a, b] of zip(tokens, drop(1, tokens))) {
bigrams.add(a + BIGRAM_SEPARATOR + b)
}
const cache = {}
const hashCached = x => {
if (cache[x]) { return cache[x] }
const ret = hash(x)
cache[x] = ret
return ret
}
const results = []
for (const doc of index) {
let score = 0
for (const token of tokens) {
let count = 0
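// Probe thresholds in ascending order and only trust a higher threshold if the
// lowest one also matched, which suppresses filter false positives.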
for (const frequency of FREQUENCY_THRESHOLDS) {
const query = hashCached(token + FREQUENCY_SEPARATOR + frequency)
if (doc.filter.contains(query)) {
if (count > 0 || frequency == FREQUENCY_THRESHOLDS[0]) {
count = frequency
}
}
}
if (count > 0) {
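// count is the largest matched threshold, i.e. a quantized term frequency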
score += SCORE_EXP ** count
}
}
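// query bigrams act as phrase evidence: each one found multiplies the whole score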
for (const bigram of bigrams) {
if (doc.filter.contains(hashCached(bigram))) {
score *= BIGRAM_FACTOR
}
}
if (score > 0) {
results.push({
score,
url: doc.url,
title: doc.title,
description: doc.description,
timestamp: doc.timestamp,
sourceType: doc.sourceType
})
}
}
results.sort((a, b) => b.score - a.score)
return results.slice(0, 10)
}
const loadIndex = async () => {
if (index) { return }
index = await fetch("/fts.bin").then(x => x.arrayBuffer()).then(x => new Uint8Array(x)).then(x => unpack(x))
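// msgpack round-trips each filter as a plain object, so rebuild a real filter
// instance by populating an empty one and copying the serialized fields onto it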
index.forEach(entry => {
const x = entry.filter
const newFilter = populateBinaryFuse8([])[0]
newFilter.Seed = BigInt(x.Seed)
newFilter.SegmentLength = x.SegmentLength
newFilter.SegmentLengthMask = x.SegmentLengthMask
newFilter.SegmentCount = x.SegmentCount
newFilter.SegmentCountLength = x.SegmentCountLength
newFilter.len = x.len
newFilter.Fingerprints = x.Fingerprints
entry.filter = newFilter
})
}
await loadIndex()
export default query

src/fts_common.mjs (new file, 18 lines)

@@ -0,0 +1,18 @@
import * as stemmer from "porter2"
export const BIGRAM_SEPARATOR = "\x00"
export const FREQUENCY_SEPARATOR = "\x01"
export const FREQUENCY_THRESHOLDS = [1, 4, 9]
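// term counts are bucketed at these thresholds on both the build and query sides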
const NUMERIC = /^[0-9\.e]+$/
function segmentWords(text) {
if (typeof Intl !== "undefined" && Intl.Segmenter) {
const segmenter = new Intl.Segmenter("en", { granularity: "word" })
return Array.from(segmenter.segment(text)).filter(x => x.isWordLike).filter(x => !NUMERIC.test(x.segment)).map(x => x.segment)
} else {
// Fallback path
return text.split(/[\s\p{P}]+/u).filter(Boolean)
}
}
export const tokenize = x => segmentWords(x).map(x => x.toLowerCase()).map(stemmer.stem)


@@ -67,5 +67,6 @@
["bee.png", "https://citrons.xyz/a/memetic-apioform-page.html"],
["perceptron.png", "https://en.wikipedia.org/wiki/Perceptron"],
["rhombic_dodecahedron.gif", "https://en.wikipedia.org/wiki/Rhombic_dodecahedron"]
],
"mycorrhiza": "https://docs.osmarks.net"
}


@@ -22,6 +22,12 @@ const sqlite = require("better-sqlite3")
const axios = require("axios")
const msgpack = require("@msgpack/msgpack")
const esbuild = require("esbuild")
const htmlparser2 = require("htmlparser2")
const cssSelect = require("css-select")
const domSerializer = require("dom-serializer")
const domutils = require("domutils")
const fts = require("./fts.mjs")
dayjs.extend(customParseFormat)
@@ -33,6 +39,7 @@ const errorPagesDir = path.join(root, "error")
const assetsDir = path.join(root, "assets")
const outDir = path.join(root, "out")
const srcDir = path.join(root, "src")
const nodeModules = path.join(root, "node_modules")
const buildID = nanoid()
globalData.buildID = buildID
@@ -76,7 +83,6 @@ globalData.hashBG = hashBG
const removeExtension = x => x.replace(/\.[^/.]+$/, "")
const mdutils = MarkdownIt().utils
const renderContainer = (tokens, idx) => {
let opening = true
if (tokens[idx].type === "container__close") {
@@ -144,6 +150,7 @@ const renderContainer = (tokens, idx) => {
const readFile = path => fsp.readFile(path, { encoding: "utf8" })
const anchor = require("markdown-it-anchor")
const { htmlToText } = require("html-to-text")
const md = new MarkdownIt({ html: true })
.use(require("markdown-it-container"), "", { render: renderContainer, validate: params => true })
.use(require("markdown-it-footnote"))
@@ -152,6 +159,7 @@ const md = new MarkdownIt({ html: true })
symbol: "§"
})
})
.use(require("@vscode/markdown-it-katex").default)
const minifyHTML = x => htmlMinifier(x, {
collapseWhitespace: true,
sortAttributes: true,
@@ -236,7 +244,15 @@ const processExperiments = async () => {
return fse.copy(path.join(subdirectory, file), path.join(out, file))
}
}))
const indexPath = path.join(out, "index.html")
fts.pushEntry("experiment", {
url: "/" + page.data.slug,
title: page.data.title,
description: page.data.description,
html: page.content,
timestamp: dayjs(await fsp.stat(indexPath).then(x => x.mtimeMs))
})
return indexPath
},
{ processMeta: meta => {
meta.slug = meta.slug || basename
@@ -254,6 +270,13 @@ const processBlog = async () => {
meta.wordCount = page.content.split(/\s+/).map(x => x.trim()).filter(x => x).length
meta.haveSidenotes = true
meta.content = renderMarkdown(page.content)
fts.pushEntry("blog", {
html: meta.content,
url: "/" + meta.slug,
timestamp: meta.updated ?? meta.created,
title: meta.title,
description: meta.description
})
return meta
})
@@ -332,16 +355,32 @@ const writeCache = (k, v, ts=Date.now()) => {
writeCacheStmt.run(k, Buffer.from(enc.buffer, enc.byteOffset, enc.byteLength), ts)
}
const DESC_CUT_LEN = 256
const fetchMicroblog = async () => {
const cached = readCache("microblog", 60*60*1000)
if (cached) {
globalData.microblog = cached
} else {
// We have a server patch which removes the 20-post hardcoded limit.
// For some exciting reason microblog.pub does not expose pagination in the *API* components.
// This is a workaround.
const posts = (await axios({ url: globalData.microblogSource, headers: { "Accept": 'application/ld+json; profile="https://www.w3.org/ns/activitystreams"' } })).data.orderedItems
writeCache("microblog", posts)
globalData.microblog = posts
}
for (const post of globalData.microblog) {
if (!post.object.content) { continue }
const desc = fts.stripHTML(post.object.content)
fts.pushEntry("microblog", {
url: post.object.id,
timestamp: dayjs(post.object.published),
html: post.object.content,
description: desc.length > DESC_CUT_LEN ? desc.slice(0, DESC_CUT_LEN) + "..." : desc,
ignoreDescription: true
})
}
globalData.microblog = globalData.microblog.slice(0, 6).map((post, i) => minifyHTML(globalData.templates.activitypub({
...globalData,
permalink: post.object.id,
@@ -399,13 +438,24 @@ const minifyJSTask = async () => {
}
const compilePageJSTask = async () => {
await Promise.all([
esbuild.build({
entryPoints: [ path.join(srcDir, "page.js") ],
bundle: true,
outfile: path.join(outAssets, "js/page.js"),
minify: true,
sourcemap: true,
external: ["/assets/js/fts_client.js"]
}),
esbuild.build({
entryPoints: [ path.join(srcDir, "fts_client.mjs") ],
bundle: true,
outfile: path.join(outAssets, "js/fts_client.js"),
minify: true,
sourcemap: true,
format: "esm"
})
])
}
const compileServiceWorkerJSTask = async () => {
@@ -421,18 +471,14 @@ const compileServiceWorkerJSTask = async () => {
})
}
const genServiceWorker = async () => {
const serviceWorker = mustache.render(await readFile(path.join(assetsDir, "sw.js")), globalData)
await minifyJSFile(serviceWorker, "sw.js", path.join(outDir, "sw.js"))
}
const copyAsset = subpath => fse.copy(path.join(assetsDir, subpath), path.join(outAssets, subpath))
const doImages = async () => {
copyAsset("images")
copyAsset("titillium-web.woff2")
copyAsset("titillium-web-semibold.woff2")
copyAsset("miracode.woff2")
await Promise.all(["images", "titillium-web.woff2", "titillium-web-semibold.woff2", "miracode.woff2", "misc"].map(subpath => fse.copy(path.join(assetsDir, subpath), path.join(outAssets, subpath))))
await fse.copy(path.join(nodeModules, "katex", "dist", "fonts"), path.join(outAssets, "fonts"))
await fse.copy(path.join(nodeModules, "katex", "dist", "katex.min.css"), path.join(outAssets, "katex.min.css"))
globalData.images = {}
await Promise.all(
(await fse.readdir(path.join(assetsDir, "images"), { encoding: "utf-8" })).map(async image => {
@@ -471,6 +517,37 @@ const doImages = async () => {
)
}
const fetchMycorrhiza = async () => {
const allPages = await axios({ url: globalData.mycorrhiza + "/list" })
const dom = htmlparser2.parseDocument(allPages.data)
const urls = cssSelect.selectAll("main > ol a", dom).map(x => x.attribs.href)
for (const url of urls) {
// TODO: this can run in parallel
const page = await axios({ url: globalData.mycorrhiza + url })
const dom = htmlparser2.parseDocument(page.data)
const title = domutils.innerText(cssSelect.selectAll(".navi-title a, .navi-title span", dom).slice(2))
const article = cssSelect.selectOne("main #hypha article", dom)
const content = article ? domSerializer.render(article) : ""
let description = null
if (description = cssSelect.selectOne('meta[property="og:description"]', dom)) {
description = description.attribs.content
}
fts.pushEntry("mycorrhiza", {
url: globalData.mycorrhiza + url,
title,
description,
html: content,
timestamp: null
})
}
}
const buildFTS = async () => {
console.log(chalk.yellow("Building full-text search index"))
const blob = fts.build()
await fsp.writeFile(path.join(outDir, "fts.bin"), blob)
}
const tasks = {
errorPages: { deps: ["pagedeps"], fn: processErrorPages },
templates: { deps: [], fn: loadTemplates },
@@ -491,7 +568,9 @@ const tasks = {
images: { deps: ["assetsDir"], fn: doImages },
offlinePage: { deps: ["assetsDir", "pagedeps"], fn: () => applyTemplate(globalData.templates.experiment, path.join(assetsDir, "offline.html"), () => path.join(outAssets, "offline.html"), {}) },
assets: { deps: ["manifest", "minifyJS", "serviceWorker", "images", "compilePageJS"] },
main: { deps: ["writeBuildID", "index", "errorPages", "assets", "experiments", "blog", "rss"] },
searchIndex: { deps: ["blog", "fetchMicroblog", "fetchMycorrhiza", "experiments"], fn: buildFTS },
fetchMycorrhiza: { deps: [], fn: fetchMycorrhiza }
}
const compile = async () => {


@@ -63,6 +63,14 @@ const hashString = function(str, seed = 0) {
const colHash = (str, saturation = 100, lightness = 70) => `hsl(${hashString(str) % 360}, ${saturation}%, ${lightness}%)`
window.colHash = colHash
const e = (cls, parent, content, type="div") => {
const element = document.createElement(type)
element.classList.add(cls)
if (content) { element.appendChild(document.createTextNode(content)) }
if (parent) { parent.appendChild(element) }
return element
}
// Arbitrary Points code, wrapped in an IIFE to not pollute the global environment much more than it already is
window.points = (async () => {
const achievementInfo = {
@@ -242,14 +250,6 @@ window.points = (async () => {
},
});
const achievementsContainer = e("achievements", document.body)
const displayAchievement = (title, description, conditions, points) => {
const elem = e("achievement", achievementsContainer)
@@ -589,4 +589,59 @@ if (customStyle) {
document.head.appendChild(customStyleEl)
}
window.customStyleEl = customStyleEl
window.customStyle = customStyle
const nameMappings = {
"blog": "Blog",
"microblog": "Microblog",
"experiment": "Experiments",
"mycorrhiza": "Documentation"
}
// replace login navbar option with search because whatever
const loginButton = document.querySelector("nav a:last-of-type")
loginButton.href = "#"
loginButton.innerText = "Search"
loginButton.onclick = async ev => {
ev.preventDefault()
const query = (await import("/assets/js/fts_client.js")).default
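// the search client (and, via its top-level await, the index blob) is only loaded on first use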
const overlay = document.createElement("div")
overlay.classList.add("search-overlay")
document.body.appendChild(overlay)
const input = document.createElement("input")
input.type = "text"
input.placeholder = "Search"
let resultsEl
input.oninput = () => {
if (resultsEl) {
resultsEl.remove()
}
resultsEl = document.createElement("div")
resultsEl.classList.add("search-results")
for (const result of query(input.value)) {
const item = e("search-result", resultsEl)
const titleLine = nameMappings[result.sourceType] + " / " + (result.title ?? result.timestamp)
const link = e("search-result-link", item, titleLine, "a")
link.setAttribute("href", result.url)
if (result.title && result.timestamp) {
e("deemph", item, result.timestamp)
}
if (result.description) {
e("description", item, result.description)
}
item.style.border = `${colHash(result.sourceType)} solid 4px`
item.style.background = `${colHash(result.sourceType, 50, 10)}`
}
overlay.appendChild(resultsEl)
}
input.onkeydown = ev => {
if (ev.key === "Enter" || ev.key === "Backspace") {
if (input.value === "") {
// quit search mode
overlay.remove()
}
}
}
overlay.appendChild(input)
input.focus()
}


@@ -361,4 +361,29 @@ table
text-align: right
.buttons .button
margin: 0.2em
.search-overlay
position: fixed
overflow: scroll
top: 0
left: 0
bottom: 0
right: 0
background: rgba(0, 0, 0, 0.9)
padding: 2em
color: white
z-index: 1000
input
margin: 0.5em
font-size: 1.5em
border: 4px solid green
a
color: white
.search-results
width: 100%
.search-result
padding: 0.5em
margin: 0.5em
.description
font-style: italic