mirror of
https://github.com/osmarks/website
synced 2026-04-17 04:21:24 +00:00
106 lines
3.9 KiB
JavaScript
106 lines
3.9 KiB
JavaScript
import * as R from "ramda"
|
|
import * as htmlToText from "html-to-text"
|
|
import * as binaryFuseFilter from "binary-fuse-filter"
|
|
import { xxHash32 as hash } from "js-xxhash"
|
|
import * as msgpack from "msgpackr"
|
|
import * as fs from "fs"
|
|
import { BIGRAM_SEPARATOR, FREQUENCY_SEPARATOR, FREQUENCY_THRESHOLDS, tokenize } from "./fts_common.mjs"
|
|
|
|
// Multiplier applied to a record's frequency-key count in build() to cap how
// many bigram keys are added to that record's filter.
const BIGRAM_INCLUSION = 0.3

// Matches http(s) URLs. stripHTML() replaces every match with a single space.
// NOTE: inside this module the name shadows the global `URL` constructor.
const URL = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/g

// One statistics object per pushEntry() call; read back by build().
const index = []

// The raw indexed text of every pushed entry, in push order; build() dumps it
// to strings.json.
const recordStrings = []
|
|
|
|
/**
 * Turn an HTML string into plain text for indexing.
 *
 * Every match of the module-level `URL` pattern is first replaced with a
 * single space, then the result is handed to `htmlToText.convert` with word
 * wrapping turned off, `ignoreHref` set for `<a>` elements and `<img>`
 * elements formatted as "skip".
 *
 * @param {string} html - markup to convert (must support `.replace`)
 * @returns {string} whatever `htmlToText.convert` produces for the input
 */
export const stripHTML = html => {
    const withoutUrls = html.replace(URL, " ")
    const conversionOptions = {
        wordwrap: false,
        selectors: [
            { selector: "a", options: { ignoreHref: true } },
            { selector: "img", format: "skip" }
        ]
    }
    return htmlToText.convert(withoutUrls, conversionOptions)
}
|
|
|
|
/**
 * Tokenize one document and append its term statistics to the in-memory index.
 *
 * The indexed text is `title`, the HTML-stripped `description` (left out when
 * `ignoreDescription` is truthy) and the HTML-stripped `html`, joined with
 * single spaces. That text is also appended to `recordStrings`.
 *
 * @param {*} sourceType - label stored unchanged on the index record
 * @param {object} entry
 * @param {string} entry.html - document body; always passed to `stripHTML`
 * @param {*} entry.url - stored unchanged
 * @param {*} entry.timestamp - stored unchanged (build() later calls `.format` on it when truthy)
 * @param {string} [entry.title] - a nullish title contributes an empty string
 * @param {string} [entry.description] - HTML description; also stored unchanged on the record
 * @param {boolean} [entry.ignoreDescription] - when truthy the description is not indexed
 * @returns {void}
 */
export const pushEntry = (sourceType, entry) => {
    const { html, url, timestamp, title, description, ignoreDescription } = entry
    // TODO: this puts URLs inline, maybe do something with that
    const descriptionText = ignoreDescription ? "" : (description && stripHTML(description)) ?? ""
    const text = (title ?? "") + " " + descriptionText + " " + stripHTML(html)
    recordStrings.push(text)
    const words = tokenize(text)

    // The dictionaries are keyed by arbitrary tokens, so they must not inherit
    // from Object.prototype: with a plain `{}` a token such as "constructor" or
    // "toString" would read the inherited member instead of `undefined` and the
    // count would turn into a garbage string; "__proto__" would be lost entirely.
    const counts = Object.create(null)
    for (const word of words) {
        counts[word] = (counts[word] ?? 0) + 1
    }

    // Count every pair of adjacent tokens.
    const bigrams = Object.create(null)
    for (const [a, b] of R.zip(words, R.drop(1, words))) {
        const key = a + BIGRAM_SEPARATOR + b
        bigrams[key] = (bigrams[key] ?? 0) + 1
    }

    index.push({
        url,
        timestamp,
        counts,
        bigrams,
        title,
        description,
        sourceType
    })
}
|
|
|
|
/**
 * Build the packed search index from everything collected by pushEntry().
 *
 * Side effects: writes `recordStrings` as JSON to `strings.json` in the current
 * working directory and logs the corpus-wide term and bigram totals.
 *
 * For each index entry a BinaryFuse8 filter is populated with
 *  - one hashed key per (word, threshold) pair for every threshold in
 *    FREQUENCY_THRESHOLDS that the word's count reaches, and
 *  - the hashes of the entry's best-scoring bigrams, at most
 *    ceil(number of frequency keys * BIGRAM_INCLUSION) of them.
 *
 * @returns {*} the result of `Packr#pack` over the array of records
 *   `{ filter, url, timestamp, title, description, sourceType }`
 * @throws whatever error `populateBinaryFuse8` reports for an entry
 */
export const build = () => {
    fs.writeFileSync("strings.json", JSON.stringify(recordStrings))

    let totalTerms = 0
    let totalBigrams = 0
    // Null-prototype dictionaries: keys are arbitrary tokens, and with a plain
    // `{}` a token like "constructor" would read the inherited Object.prototype
    // member instead of `undefined`, corrupting the total (and every score
    // derived from it) into a string / NaN.
    const totalBigramCounts = Object.create(null)
    const totalTermCounts = Object.create(null)
    for (const entry of index) {
        for (const [bigram, count] of Object.entries(entry.bigrams)) {
            totalBigramCounts[bigram] = (totalBigramCounts[bigram] ?? 0) + count
            totalBigrams += count
        }
        for (const [word, count] of Object.entries(entry.counts)) {
            totalTermCounts[word] = (totalTermCounts[word] ?? 0) + count
            totalTerms += count
        }
    }

    // Negated ratio of the bigram's relative frequency to the product of its
    // terms' relative frequencies. Negated so that an ascending sort puts the
    // most strongly associated bigrams first (lower is better).
    const pmi = (bigram, count) => {
        const [a, b] = bigram.split(BIGRAM_SEPARATOR, 2)
        // bigram provides no useful information if term is unique anyway;
        // 0 sorts after every negative score.
        if (totalTermCounts[a] === 1 || totalTermCounts[b] === 1) { return 0 }
        return -(count / totalBigrams) / ((totalTermCounts[a] / totalTerms) * (totalTermCounts[b] / totalTerms))
    }
    const pmis = new Map(Object.entries(totalBigramCounts).map(([k, v]) => [k, pmi(k, v)]))

    const records = []
    for (const entry of index) {
        const keys = []
        for (const [word, count] of Object.entries(entry.counts)) {
            for (const threshold of FREQUENCY_THRESHOLDS) {
                if (count >= threshold) {
                    keys.push(hash(word + FREQUENCY_SEPARATOR + threshold))
                }
            }
        }

        // Object.entries yields [bigram, count] pairs, so the score has to be
        // looked up by the pair's first element. Looking up the pair itself
        // always returns undefined and leaves the bigrams in insertion order.
        const sorted = R.sortBy(([bigram]) => pmis.get(bigram), Object.entries(entry.bigrams))
        // The cap is computed once, from the number of frequency keys only.
        const bigramBudget = Math.ceil(keys.length * BIGRAM_INCLUSION)
        for (const [bigram] of sorted.slice(0, bigramBudget)) {
            keys.push(hash(bigram))
        }

        const [filter, err] = binaryFuseFilter.populateBinaryFuse8(keys)
        if (err) {
            throw err // Golang...
        }
        // NOTE(review): presumably converted so the fingerprints are packed as
        // binary data rather than as a plain array — confirm against the reader.
        filter.Fingerprints = new Uint8Array(filter.Fingerprints)
        records.push({
            filter,
            url: entry.url,
            timestamp: entry.timestamp ? entry.timestamp.format("YYYY-MM-DD") : null,
            title: entry.title,
            description: entry.description,
            sourceType: entry.sourceType
        })
    }

    console.log(`Total terms: ${totalTerms}`)
    console.log(`Total bigrams: ${totalBigrams}`)
    const packr = new msgpack.Packr()
    return packr.pack(records)
}
|