minoteaur/src/domain.nim

import tiny_sqlite
import logging
import options
import times
import zstd/compress as zstd_compress
import zstd/decompress as zstd_decompress
import sequtils
import strutils except splitWhitespace
import json
import std/jsonutils
import nimlevenshtein
import sugar
import unicode
import math
import ./util
from ./md import parsePage
let migrations = @[
  #[
  `pages` stores the content of all pages, along with when they were created and last updated - everything needed to render the current version of a page.
  It's mildly inefficient space-wise to store the latest content here AND in the revisions table (in compressed form), but avoiding that would probably require complex logic elsewhere,
  which I don't think is worth it - media files will likely be much bigger, as will accumulated old revisions (it would be worth investigating storing compact diffs).
  `revisions` stores all changes to a page, with metadata as JSON (MessagePack is generally better, but SQLite can only query JSON) and optionally a separate blob holding larger associated data
  (currently, the entire page content, zstd-compressed).
  rowids (INTEGER PRIMARY KEY) are written out explicitly because FTS external-content tables require stable rowids; they are not otherwise used much.
  Links' toPage is deliberately not a foreign key, as it's valid for the target page not to exist.
  ]#
"""
CREATE TABLE pages (
uid INTEGER PRIMARY KEY,
page TEXT NOT NULL UNIQUE,
updated INTEGER NOT NULL,
created INTEGER NOT NULL,
content TEXT NOT NULL
);
CREATE TABLE revisions (
uid INTEGER PRIMARY KEY,
page TEXT NOT NULL REFERENCES pages(page),
timestamp INTEGER NOT NULL,
meta TEXT NOT NULL,
fullData BLOB
);
""",
"""
CREATE VIRTUAL TABLE pages_fts USING fts5 (
page, content,
tokenize='porter unicode61 remove_diacritics 2',
content=pages, content_rowid=uid
);
""",
"""
CREATE TABLE links (
uid INTEGER PRIMARY KEY,
fromPage TEXT NOT NULL REFERENCES pages(page),
toPage TEXT NOT NULL,
linkText TEXT NOT NULL,
context TEXT NOT NULL,
UNIQUE (fromPage, toPage)
);
""",
"""
CREATE TABLE files (
uid INTEGER PRIMARY KEY,
page TEXT NOT NULL REFERENCES pages(page),
filename TEXT NOT NULL,
storagePath TEXT NOT NULL,
mimeType TEXT NOT NULL,
metadata TEXT NOT NULL,
uploadedTime INTEGER NOT NULL,
UNIQUE (page, filename)
);
""",
"""
CREATE TABLE sessions (
sid INTEGER PRIMARY KEY,
timestamp INTEGER NOT NULL,
data TEXT NOT NULL
);
"""
]
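# enums below are numbered explicitly because their ordinals are persisted in revision
# metadata (queries compare json_extract(meta, '$.kind') to 0); never renumber them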
type
  Encoding* {.pure.} = enum
    Plain = 0, Zstd = 1
  RevisionType* {.pure.} = enum
    NewContent = 0
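  # an object variant keyed on `kind`, serialized to JSON into revisions.meta;
  # the optional fields tolerate metadata written by older versions (see `parse` below)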
  RevisionMeta* = object
    case kind*: RevisionType
    of NewContent:
      encoding*: Encoding
      editDistance*: Option[int]
      size*: Option[int]
      words*: Option[int]
  Revision* = object
    meta*: RevisionMeta
    time*: Time
  SearchResult* = object
    page*: string
    rank*: float
    snippet*: seq[(bool, string)]
  Page* = object
    page*, content*: string
    created*, updated*: Time
    uid*: int64
  Backlink* = object
    fromPage*, text*, context*: string
  FileInfo* = object
    filename*, mimeType*: string
    uploadedTime*: Time
    metadata*: JsonNode
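# migrations are applied in order; the current schema version is tracked in SQLite's
# user_version pragma, so a fresh database (user_version 0) runs every script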
proc migrate*(db: DbConn) =
  let currentVersion = fromDbValue(get db.value("PRAGMA user_version"), int)
  for mid in (currentVersion + 1) .. migrations.len:
    db.transaction:
      logger().log(lvlInfo, "Migrating to schema " & $mid)
      db.execScript migrations[mid - 1]
      # for some reason this pragma does not work using normal parameter binding
      db.exec("PRAGMA user_version = " & $mid)
  logger().log(lvlDebug, "DB ready")
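# lenient JSON decoding: extra and missing keys are tolerated so that metadata written
# by older or newer versions of the schema still parses as RevisionMeta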
proc parse*(s: string, T: typedesc): T = fromJson(result, parseJSON(s), Joptions(allowExtraKeys: true, allowMissingKeys: true))
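# decode a (meta, fullData) revision row, decompressing the stored page snapshot if
# it was written zstd-compressed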
proc processFullRevisionRow(row: ResultRow): (RevisionMeta, string) =
  let (metaJSON, full) = row.unpack((string, seq[byte]))
  let meta = parse(metaJSON, RevisionMeta)
  var content = cast[string](full)
  if meta.encoding == Zstd:
    content = cast[string](zstd_decompress.decompress(content))
  (meta, content)
proc fetchPage*(db: DbConn, page: string): Option[Page] =
  # retrieve the current version of the page directly
  db.one("SELECT uid, updated, created, content FROM pages WHERE page = ?", page).map(proc(row: ResultRow): Page =
    let (uid, updated, created, content) = row.unpack((int64, Time, Time, string))
    Page(page: page, created: created, updated: updated, content: content, uid: uid)
  )
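# fetch the page as it was at the `revision` timestamp; only full-content revisions
# (kind = 0) carry a fullData snapshot, hence the json_extract filter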
proc fetchPage*(db: DbConn, page: string, revision: Time): Option[Page] =
  # retrieve page row
  db.one("SELECT uid, updated, created FROM pages WHERE page = ?", page).flatMap(proc(row: ResultRow): Option[Page] =
    let (uid, updated, created) = row.unpack((int64, Time, Time))
    # retrieve the older revision
    let rev = db.one("SELECT meta, fullData FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp = ?", page, revision)
    rev.map(proc(row: ResultRow): Page =
      let (meta, content) = processFullRevisionRow(row)
      Page(page: page, created: created, updated: updated, content: content, uid: uid)
    )
  )
proc backlinks*(db: DbConn, page: string): seq[Backlink] =
  db.all("SELECT fromPage, linkText, context FROM links WHERE toPage = ?", page).map(proc(row: ResultRow): Backlink =
    let (fromPage, text, context) = row.unpack((string, string, string))
    Backlink(fromPage: fromPage, text: text, context: context))
# count words, defined as whitespace-separated tokens containing at least one character
# which is not Markdown-ish punctuation
# alternative definitions might drop number-only words and/or also split at full stops
func wordCount(s: string): int =
  for word in splitWhitespace(s):
    if len(word) == 0: continue
    for bytechar in word:
      if not (bytechar in {'#', '*', '-', '>', '`', '|', '+', '[', ']'}):
        inc result
        break
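# e.g. wordCount("# Heading **bold** - item") == 3: "#" and "-" are pure punctuation
# and don't count, while "Heading", "**bold**" and "item" do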
proc updatePage*(db: DbConn, page: string, content: string) =
  let parsed = parsePage(content)
  let previous = fetchPage(db, page)
  # if there is no previous content, use the empty string instead
  let previousContent = previous.map(p => p.content).get("")
  # use the zstandard-compressed version if it is smaller
  let compressed = zstd_compress.compress(content, level=10)
  var enc = Plain
  var data = cast[seq[byte]](content)
  if len(compressed) < len(data):
    enc = Zstd
    data = compressed
  # generate some useful metadata and encode to JSON
  let meta = $toJson(RevisionMeta(kind: NewContent, encoding: enc,
    editDistance: some distance(previousContent, content), size: some len(content), words: some wordCount(content)))
  let ts = getTime()
  let revisionID = snowflake()
  let pageID = previous.map(p => p.uid).get(snowflake())
  # actually write to database
  db.transaction:
    if isSome previous:
      # update existing data and remove the FTS index entry for it
      db.exec("UPDATE pages SET content = ?, updated = ? WHERE uid = ?", content, ts, pageID)
      # pages_fts is an external-content FTS table, so deletion has to be done like this
      db.exec("INSERT INTO pages_fts (pages_fts, rowid, page, content) VALUES ('delete', ?, ?, ?)", pageID, page, previousContent)
      # delete existing links from the page
      db.exec("DELETE FROM links WHERE fromPage = ?", page)
    else:
      db.exec("INSERT INTO pages VALUES (?, ?, ?, ?, ?)", pageID, page, ts, ts, content)
    # push to full text search index - TODO: perhaps use the parsed text content (as used for context) instead of the raw markdown
    db.exec("INSERT INTO pages_fts (rowid, page, content) VALUES (?, ?, ?)", pageID, page, content)
    db.exec("INSERT INTO revisions VALUES (?, ?, ?, ?, ?)", revisionID, page, ts, meta, data)
    # insert the new set of links
    for link in parsed.links:
      db.exec("INSERT INTO links VALUES (?, ?, ?, ?, ?)", snowflake(), page, link.target, link.text, link.context)
proc fetchRevisions*(db: DbConn, page: string): seq[Revision] =
  db.all("SELECT timestamp, meta FROM revisions WHERE page = ? ORDER BY timestamp DESC", page).map(proc (row: ResultRow): Revision =
    let (ts, metaJSON) = row.unpack((Time, string))
    Revision(time: ts, meta: parse(metaJSON, RevisionMeta))
  )
proc processRevisionRow(r: ResultRow): Revision =
  let (ts, meta) = r.unpack((Time, string))
  Revision(time: ts, meta: parse(meta, RevisionMeta))
proc adjacentRevisions*(db: DbConn, page: string, ts: Time): (Option[Revision], Option[Revision]) =
  # revision after the given timestamp
  let next = db.one("SELECT timestamp, meta FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp > ? ORDER BY timestamp ASC LIMIT 1", page, ts)
  # revision before the given timestamp
  let prev = db.one("SELECT timestamp, meta FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp < ? ORDER BY timestamp DESC LIMIT 1", page, ts)
  (next.map(processRevisionRow), prev.map(processRevisionRow))
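# FTS5's snippet() wraps matches in the <hlstart>/<hlend> markers requested in `search`
# below; split them out into (highlighted, fragment) pairs for rendering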
proc processSearchRow(row: ResultRow): SearchResult =
  let (page, rank, snippet) = row.unpack((string, float, string))
  var pos = 0
  # split the snippet up into an array of highlighted/unhighlighted fragments
  var snips: seq[(bool, string)] = @[]
  while true:
    let newpos = find(snippet, "<hlstart>", pos)
    if newpos == -1:
      break
    snips.add((false, snippet[pos .. newpos - 1]))
    var endpos = find(snippet, "<hlend>", newpos)
    # if there is no <hlend> (this *probably* shouldn't happen), highlight the rest of the string
    if endpos == -1:
      endpos = len(snippet)
    snips.add((true, snippet[newpos + len("<hlstart>") .. endpos - 1]))
    pos = endpos + len("<hlend>")
  # pos can point past the end if a <hlend> was missing; avoid slicing out of bounds
  if pos < len(snippet):
    snips.add((false, snippet[pos .. len(snippet) - 1]))
  # filter out empty snippet fragments because they're not useful; FTS5 bm25 ranks are
  # negative (more negative = better match), so negate and log-scale for nicer display
  SearchResult(page: page, rank: log10(-rank * 1e7), snippet: snips.filter(x => len(x[1]) > 0))
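# the query is FTS5 MATCH syntax; bm25(5.0, 1.0) weights title (page) matches 5x more
# heavily than content matches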
proc search*(db: DbConn, query: string): seq[SearchResult] =
  db.all("SELECT page, rank, snippet(pages_fts, 1, '<hlstart>', '<hlend>', ' ... ', 32) FROM pages_fts WHERE pages_fts MATCH ? AND rank MATCH 'bm25(5.0, 1.0)' ORDER BY rank", query).map(processSearchRow)
proc getBasicFileInfo*(db: DbConn, page, filename: string): Option[(string, string)] =
  db.one("SELECT storagePath, mimeType FROM files WHERE page = ? AND filename = ?", page, filename).map(proc (r: ResultRow): (string, string) = r.unpack((string, string)))
proc getPageFiles*(db: DbConn, page: string): seq[FileInfo] =
  db.all("SELECT filename, mimeType, uploadedTime, metadata FROM files WHERE page = ?", page).map(proc (r: ResultRow): FileInfo =
    let (filename, mime, upload, meta) = r.unpack((string, string, Time, string))
    FileInfo(filename: filename, mimeType: mime, uploadedTime: upload, metadata: parse(meta, JsonNode)))