import tiny_sqlite
import logging
import options
import times
import zstd/compress as zstd_compress
import zstd/decompress as zstd_decompress
import sequtils
import strutils except splitWhitespace
import json
import std/jsonutils
import nimlevenshtein
import sugar
import unicode
import math

import util
from ./md import parsePage

# Schema migration scripts, applied in order by `migrate`. The script at index
# i takes the database from schema version i to schema version i + 1.
let migrations = @[
  #[
  `pages` stores the content of all pages, as well as when they were last
  updated and created - this is all the information needed to render the
  current version of a page.
  It's mildly inefficient space-wise to store the latest content here AND in
  the revisions table (in compressed form), but dealing with this better would
  probably require complex logic elsewhere which I don't think is worth it - I
  anticipate that media files, and probably large numbers of old revisions,
  will take up much more space (it would be worth investigating storing
  compact diffs).

  `revisions` stores all changes to a page, with metadata as JSON (messagepack
  is generally better, but SQLite can only query JSON) and optionally a
  separate blob storing larger associated data (currently, the entire page
  content, zstd-compressed).

  rowids (INTEGER PRIMARY KEY) are explicitly declared here because FTS
  external content requires them to be stable in order to work, but they are
  not meant to be used much otherwise.
  ]#
  """
CREATE TABLE pages (
    uid INTEGER PRIMARY KEY,
    page TEXT NOT NULL UNIQUE,
    updated INTEGER NOT NULL,
    created INTEGER NOT NULL,
    content TEXT NOT NULL
);
CREATE TABLE revisions (
    uid INTEGER PRIMARY KEY,
    page TEXT NOT NULL REFERENCES pages(page),
    timestamp INTEGER NOT NULL,
    meta TEXT NOT NULL,
    fullData BLOB
);
  """,
  """
CREATE VIRTUAL TABLE pages_fts USING fts5 (
    page, content,
    tokenize='porter unicode61 remove_diacritics 2',
    content=pages, content_rowid=uid
);
  """,
  # `from` is a reserved word in SQLite (and `to` is a keyword), so both
  # column names must be quoted or the CREATE TABLE statement is rejected.
  """
CREATE TABLE links (
    uid INTEGER PRIMARY KEY,
    "from" TEXT NOT NULL,
    "to" TEXT NOT NULL,
    linkText TEXT NOT NULL,
    context TEXT NOT NULL
);
  """
]

type
  Encoding* {.pure.} = enum
    ## How the `fullData` blob of a revision is stored.
    Plain = 0, Zstd = 1

  RevisionType* {.pure.} = enum
    ## Kind of change a revision records.
    NewContent = 0

  RevisionMeta* = object
    ## Metadata stored as JSON in the `meta` column of `revisions`.
    case kind*: RevisionType
    of NewContent:
      encoding*: Encoding
      editDistance*: Option[int]
      size*: Option[int]
      words*: Option[int]

  Revision* = object
    ## A revision's metadata together with its timestamp.
    meta*: RevisionMeta
    time*: Time

  SearchResult* = object
    ## One full text search hit. `snippet` is a list of
    ## (isHighlighted, text) fragments.
    page*: string
    rank*: float
    snippet*: seq[(bool, string)]

var logger = newConsoleLogger()

# Delimiters which SQLite's snippet() is asked to put around matched terms.
# `search` embeds them in its query and `processSearchRow` splits on them, so
# they have to be non-empty and distinct from each other.
const
  highlightStart = "<hlstart>"
  highlightEnd = "<hlend>"

proc migrate*(db: DbConn) =
  ## Applies every migration newer than the database's `PRAGMA user_version`,
  ## each inside its own transaction, bumping `user_version` as it goes.
  let currentVersion = fromDbValue(get db.value("PRAGMA user_version"), int)
  for mid in (currentVersion + 1) .. migrations.len:
    db.transaction:
      logger.log(lvlInfo, "Migrating to schema " & $mid)
      db.execScript migrations[mid - 1]
      # for some reason this pragma does not work using normal parameter binding
      db.exec("PRAGMA user_version = " & $mid)
  logger.log(lvlDebug, "DB ready")

type
  Page = object
    page*, content*: string
    created*, updated*: Time
    uid*: int64

proc parse*(s: string, T: typedesc): T =
  ## Decodes the JSON text `s` into a `T`, accepting both extra and missing
  ## keys.
  fromJson(result, parseJSON(s),
    Joptions(allowExtraKeys: true, allowMissingKeys: true))

proc processFullRevisionRow(row: ResultRow): (RevisionMeta, string) =
  ## Decodes a `(meta, fullData)` row from `revisions`, decompressing the
  ## data when the metadata says it is zstd-encoded.
  let (metaJSON, full) = row.unpack((string, seq[byte]))
  let meta = parse(metaJSON, RevisionMeta)
  var content = cast[string](full)
  if meta.encoding == Zstd:
    content = cast[string](zstd_decompress.decompress(content))
  (meta, content)

proc fetchPage*(db: DbConn, page: string): Option[Page] =
  ## Retrieves the current version of `page` directly from the `pages` table,
  ## or `none` if there is no such page.
  db.one("SELECT uid, updated, created, content FROM pages WHERE page = ?",
      page).map(proc(row: ResultRow): Page =
    let (uid, updated, created, content) =
      row.unpack((int64, Time, Time, string))
    Page(page: page, created: created, updated: updated, content: content,
      uid: uid)
  )

proc fetchPage*(db: DbConn, page: string, revision: Time): Option[Page] =
  ## Retrieves `page` with the content it had in the revision whose timestamp
  ## equals `revision`. Returns `none` if either the page or that revision
  ## does not exist. `created`/`updated` come from the current page row.
  # retrieve page row
  db.one("SELECT uid, updated, created FROM pages WHERE page = ?",
      page).flatMap(proc(row: ResultRow): Option[Page] =
    let (uid, updated, created) = row.unpack((int64, Time, Time))
    # retrieve the older revision
    let rev = db.one("SELECT meta, fullData FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp = ?", page, revision)
    rev.map(proc(revRow: ResultRow): Page =
      let (meta, content) = processFullRevisionRow(revRow)
      Page(page: page, created: created, updated: updated, content: content,
        uid: uid)
    )
  )

# count words, defined as things separated by whitespace which are not purely Markdown-ish punctuation characters
# alternative definitions may include dropping number-only words, and/or splitting at full stops too
func wordCount(s: string): int =
  const markdownPunctuation = {'#', '*', '-', '>', '`', '|'}
  for word in splitWhitespace(s):
    # a word counts as soon as it has one byte outside the punctuation set;
    # empty fragments have no bytes and so never count
    for bytechar in word:
      if bytechar notin markdownPunctuation:
        inc result
        break

proc updatePage*(db: DbConn, page: string, content: string) =
  ## Creates `page` or replaces its content, keeps the full text index in
  ## step, and records a new revision, all within one transaction.
  # debugging aid: dump the parsed page via the logger rather than raw stdout
  logger.log(lvlDebug, $parsePage(content))
  let previous = fetchPage(db, page)
  # if there is no previous content, empty string instead
  let previousContent = previous.map(p => p.content).get("")

  # use zstandard-compressed version if it is smaller
  let compressed = zstd_compress.compress(content, level=10)
  var enc = Plain
  var data = cast[seq[byte]](content)
  if len(compressed) < len(data):
    enc = Zstd
    data = compressed

  # generate some useful metadata and encode to JSON
  let meta = $toJson(RevisionMeta(kind: NewContent, encoding: enc,
    editDistance: some distance(previousContent, content),
    size: some len(content), words: some wordCount(content)))
  let ts = getTime()
  let revisionID = snowflake()
  # only generate a fresh ID when the page does not exist yet
  let pageID =
    if previous.isSome: previous.get.uid
    else: snowflake()

  # actually write to database
  db.transaction:
    if isSome previous:
      # update existing data and remove FTS index entry for it
      db.exec("UPDATE pages SET content = ?, updated = ? WHERE uid = ?",
        content, ts, pageID)
      # pages_fts is an external content FTS table, so deletion has to be done like this
      db.exec("INSERT INTO pages_fts (pages_fts, rowid, page, content) VALUES ('delete', ?, ?, ?)", pageID, page, previousContent)
    else:
      db.exec("INSERT INTO pages VALUES (?, ?, ?, ?, ?)",
        pageID, page, ts, ts, content)
    # push to full text search index
    db.exec("INSERT INTO pages_fts (rowid, page, content) VALUES (?, ?, ?)",
      pageID, page, content)
    db.exec("INSERT INTO revisions VALUES (?, ?, ?, ?, ?)",
      revisionID, page, ts, meta, data)

proc processRevisionRow(r: ResultRow): Revision =
  ## Decodes a `(timestamp, meta)` row from `revisions`.
  let (ts, meta) = r.unpack((Time, string))
  Revision(time: ts, meta: parse(meta, RevisionMeta))

proc fetchRevisions*(db: DbConn, page: string): seq[Revision] =
  ## Lists all revisions of `page`, newest first.
  db.all("SELECT timestamp, meta FROM revisions WHERE page = ? ORDER BY timestamp DESC", page).map(processRevisionRow)

proc adjacentRevisions*(db: DbConn, page: string, ts: Time): (Option[Revision], Option[Revision]) =
  ## Returns `(next, previous)`: the closest content revisions of `page`
  ## strictly after and strictly before `ts`, where they exist.
  # revision after given timestamp
  let next = db.one("SELECT timestamp, meta FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp > ? ORDER BY timestamp ASC LIMIT 1", page, ts)
  # revision before given timestamp
  let prev = db.one("SELECT timestamp, meta FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp < ? ORDER BY timestamp DESC LIMIT 1", page, ts)
  (next.map(processRevisionRow), prev.map(processRevisionRow))

proc processSearchRow(row: ResultRow): SearchResult =
  ## Decodes a `(page, rank, snippet)` search row, splitting the snippet at
  ## the highlight markers into (isHighlighted, text) fragments.
  let (page, rank, snippet) = row.unpack((string, float, string))
  var pos = 0
  # split snippet up into an array of highlighted/unhighlighted bits
  var snips: seq[(bool, string)] = @[]
  while pos < snippet.len:
    let startPos = find(snippet, highlightStart, pos)
    if startPos == -1:
      break
    snips.add((false, snippet[pos ..< startPos]))
    let textStart = startPos + highlightStart.len
    let endPos = find(snippet, highlightEnd, textStart)
    if endPos == -1:
      # no end marker (this *probably* shouldn't happen): highlight the rest
      # of the string and stop, keeping pos inside the string
      snips.add((true, snippet[textStart .. ^1]))
      pos = snippet.len
    else:
      snips.add((true, snippet[textStart ..< endPos]))
      pos = endPos + highlightEnd.len
  snips.add((false, snippet[pos .. ^1]))
  # filter out empty snippet fragments because they're not useful, rescale rank for nicer display
  SearchResult(page: page, rank: log10(-rank * 1e7),
    snippet: snips.filter(x => len(x[1]) > 0))

proc search*(db: DbConn, query: string): seq[SearchResult] =
  ## Runs the FTS5 query `query` against `pages_fts` and returns the hits
  ## ordered by rank, with snippets taken from the content column.
  # the markers are compile-time constants, not user input, so splicing them
  # into the SQL text is safe; the user's query is still bound as a parameter
  const searchSql =
    "SELECT page, rank, snippet(pages_fts, 1, '" & highlightStart & "', '" &
    highlightEnd & "', ' ... ', 32) FROM pages_fts WHERE pages_fts MATCH ? " &
    "AND rank MATCH 'bm25(5.0, 1.0)' ORDER BY rank"
  db.all(searchSql, query).map(processSearchRow)