minoteaur/src/domain.nim

import tiny_sqlite
import logging
import options
import times
import zstd/compress as zstd_compress
import zstd/decompress as zstd_decompress
import sequtils
import strutils except splitWhitespace
import json
import std/jsonutils
import nimlevenshtein
import sugar
import unicode
import math
import ./util
from ./md import parsePage
let migrations = @[
  #[
  `pages` stores the content of all pages, along with when they were created and last updated - everything needed to render the current version of a page.
  It's mildly inefficient space-wise to store the latest content here AND in the revisions table (in compressed form), but avoiding that would probably require complex logic elsewhere,
  which I don't think is worth it - media files will likely be much bigger, as will accumulated old revisions (it would be worth investigating storing compact diffs).
  `revisions` stores all changes to a page, with metadata as JSON (MessagePack is generally better, but SQLite can only query JSON) and optionally a separate blob holding larger associated data
  (currently, the entire page content, zstd-compressed).
  rowids (INTEGER PRIMARY KEY) are written out explicitly because FTS external-content tables require stable rowids; they are not otherwise used much.
  Links' toPage is deliberately not a foreign key, as it's valid for the target page not to exist.
  ]#
"""
CREATE TABLE pages (
uid INTEGER PRIMARY KEY,
page TEXT NOT NULL UNIQUE,
updated INTEGER NOT NULL,
created INTEGER NOT NULL,
content TEXT NOT NULL
);
CREATE TABLE revisions (
uid INTEGER PRIMARY KEY,
page TEXT NOT NULL REFERENCES pages(page),
timestamp INTEGER NOT NULL,
meta TEXT NOT NULL,
fullData BLOB
);
""",
"""
CREATE VIRTUAL TABLE pages_fts USING fts5 (
page, content,
tokenize='porter unicode61 remove_diacritics 2',
content=pages, content_rowid=uid
);
""",
"""
CREATE TABLE links (
uid INTEGER PRIMARY KEY,
fromPage TEXT NOT NULL REFERENCES pages(page),
toPage TEXT NOT NULL,
linkText TEXT NOT NULL,
context TEXT NOT NULL,
UNIQUE (fromPage, toPage)
);
""",
"""
CREATE TABLE files (
uid INTEGER PRIMARY KEY,
page TEXT NOT NULL REFERENCES pages(page),
filename TEXT NOT NULL,
storagePath TEXT NOT NULL,
mimeType TEXT NOT NULL,
metadata TEXT NOT NULL,
uploadedTime INTEGER NOT NULL,
UNIQUE (page, filename)
);
""",
"""
CREATE TABLE sessions (
sid INTEGER PRIMARY KEY,
timestamp INTEGER NOT NULL,
data TEXT NOT NULL
);
"""
]
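# enums below are numbered explicitly because their ordinals are persisted in revision
# metadata (queries compare json_extract(meta, '$.kind') to 0); never renumber them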
type
  Encoding* {.pure.} = enum
    Plain = 0, Zstd = 1
  RevisionType* {.pure.} = enum
    NewContent = 0
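  # an object variant keyed on `kind`, serialized to JSON into revisions.meta;
  # the optional fields tolerate metadata written by older versions (see `parse` below)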
  RevisionMeta* = object
    case kind*: RevisionType
    of NewContent:
      encoding*: Encoding
      editDistance*: Option[int]
      size*: Option[int]
      words*: Option[int]
  Revision* = object
    meta*: RevisionMeta
    time*: Time
  SearchResult* = object
    page*: string
    rank*: float
    snippet*: seq[(bool, string)]
  Page* = object
    page*, content*: string
    created*, updated*: Time
    uid*: int64
  Backlink* = object
    fromPage*, text*, context*: string
  FileInfo* = object
    filename*, mimeType*: string
    uploadedTime*: Time
    metadata*: JsonNode
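# migrations are applied in order; the current schema version is tracked in SQLite's
# user_version pragma, so a fresh database (user_version 0) runs every script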
proc migrate*(db: DbConn) =
  let currentVersion = fromDbValue(get db.value("PRAGMA user_version"), int)
  for mid in (currentVersion + 1) .. migrations.len:
    db.transaction:
      logger().log(lvlInfo, "Migrating to schema " & $mid)
      db.execScript migrations[mid - 1]
      # for some reason this pragma does not work using normal parameter binding
      db.exec("PRAGMA user_version = " & $mid)
  logger().log(lvlDebug, "DB ready")
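# lenient JSON decoding: extra and missing keys are tolerated so that metadata written
# by older or newer versions of the schema still parses as RevisionMeta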
proc parse*(s: string, T: typedesc): T = fromJson(result, parseJSON(s), Joptions(allowExtraKeys: true, allowMissingKeys: true))
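# decode a (meta, fullData) revision row, decompressing the stored page snapshot if
# it was written zstd-compressed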
proc processFullRevisionRow(row: ResultRow): (RevisionMeta, string) =
  let (metaJSON, full) = row.unpack((string, seq[byte]))
  let meta = parse(metaJSON, RevisionMeta)
  var content = cast[string](full)
  if meta.encoding == Zstd:
    content = cast[string](zstd_decompress.decompress(content))
  (meta, content)
proc fetchPage*(db: DbConn, page: string): Option[Page] =
  # retrieve the current version of the page directly
  db.one("SELECT uid, updated, created, content FROM pages WHERE page = ?", page).map(proc(row: ResultRow): Page =
    let (uid, updated, created, content) = row.unpack((int64, Time, Time, string))
    Page(page: page, created: created, updated: updated, content: content, uid: uid)
  )
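# fetch the page as it was at the `revision` timestamp; only full-content revisions
# (kind = 0) carry a fullData snapshot, hence the json_extract filter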
proc fetchPage*(db: DbConn, page: string, revision: Time): Option[Page] =
  # retrieve page row
  db.one("SELECT uid, updated, created FROM pages WHERE page = ?", page).flatMap(proc(row: ResultRow): Option[Page] =
    let (uid, updated, created) = row.unpack((int64, Time, Time))
    # retrieve the older revision
    let rev = db.one("SELECT meta, fullData FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp = ?", page, revision)
    rev.map(proc(row: ResultRow): Page =
      let (meta, content) = processFullRevisionRow(row)
      Page(page: page, created: created, updated: updated, content: content, uid: uid)
    )
  )
proc backlinks*(db: DbConn, page: string): seq[Backlink] =
  db.all("SELECT fromPage, linkText, context FROM links WHERE toPage = ?", page).map(proc(row: ResultRow): Backlink =
    let (fromPage, text, context) = row.unpack((string, string, string))
    Backlink(fromPage: fromPage, text: text, context: context))
# count words, defined as whitespace-separated tokens containing at least one character
# which is not Markdown-ish punctuation
# alternative definitions might drop number-only words and/or also split at full stops
func wordCount(s: string): int =
  for word in splitWhitespace(s):
    if len(word) == 0: continue
    for bytechar in word:
      if not (bytechar in {'#', '*', '-', '>', '`', '|', '+', '[', ']'}):
        inc result
        break
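# e.g. wordCount("# Heading **bold** - item") == 3: "#" and "-" are pure punctuation
# and don't count, while "Heading", "**bold**" and "item" do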
proc updatePage*(db: DbConn, page: string, content: string) =
  let parsed = parsePage(content)
  let previous = fetchPage(db, page)
  # if there is no previous content, use the empty string instead
  let previousContent = previous.map(p => p.content).get("")
  # use the zstandard-compressed version if it is smaller
  let compressed = zstd_compress.compress(content, level=10)
  var enc = Plain
  var data = cast[seq[byte]](content)
  if len(compressed) < len(data):
    enc = Zstd
    data = compressed
  # generate some useful metadata and encode to JSON
  let meta = $toJson(RevisionMeta(kind: NewContent, encoding: enc,
    editDistance: some distance(previousContent, content), size: some len(content), words: some wordCount(content)))
  let ts = getTime()
  let revisionID = snowflake()
  let pageID = previous.map(p => p.uid).get(snowflake())
  # actually write to database
  db.transaction:
    if isSome previous:
      # update existing data and remove the FTS index entry for it
      db.exec("UPDATE pages SET content = ?, updated = ? WHERE uid = ?", content, ts, pageID)
      # pages_fts is an external-content FTS table, so deletion has to be done like this
      db.exec("INSERT INTO pages_fts (pages_fts, rowid, page, content) VALUES ('delete', ?, ?, ?)", pageID, page, previousContent)
      # delete existing links from the page
      db.exec("DELETE FROM links WHERE fromPage = ?", page)
    else:
      db.exec("INSERT INTO pages VALUES (?, ?, ?, ?, ?)", pageID, page, ts, ts, content)
    # push to full text search index - TODO: perhaps use the parsed text content (as used for context) instead of the raw markdown
    db.exec("INSERT INTO pages_fts (rowid, page, content) VALUES (?, ?, ?)", pageID, page, content)
    db.exec("INSERT INTO revisions VALUES (?, ?, ?, ?, ?)", revisionID, page, ts, meta, data)
    # insert the new set of links
    for link in parsed.links:
      db.exec("INSERT INTO links VALUES (?, ?, ?, ?, ?)", snowflake(), page, link.target, link.text, link.context)
proc fetchRevisions*(db: DbConn, page: string): seq[Revision] =
  db.all("SELECT timestamp, meta FROM revisions WHERE page = ? ORDER BY timestamp DESC", page).map(proc (row: ResultRow): Revision =
    let (ts, metaJSON) = row.unpack((Time, string))
    Revision(time: ts, meta: parse(metaJSON, RevisionMeta))
  )
proc processRevisionRow(r: ResultRow): Revision =
  let (ts, meta) = r.unpack((Time, string))
  Revision(time: ts, meta: parse(meta, RevisionMeta))
proc adjacentRevisions*(db: DbConn, page: string, ts: Time): (Option[Revision], Option[Revision]) =
  # revision after the given timestamp
  let next = db.one("SELECT timestamp, meta FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp > ? ORDER BY timestamp ASC LIMIT 1", page, ts)
  # revision before the given timestamp
  let prev = db.one("SELECT timestamp, meta FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp < ? ORDER BY timestamp DESC LIMIT 1", page, ts)
  (next.map(processRevisionRow), prev.map(processRevisionRow))
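# FTS5's snippet() wraps matches in the <hlstart>/<hlend> markers requested in `search`
# below; split them out into (highlighted, fragment) pairs for rendering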
proc processSearchRow(row: ResultRow): SearchResult =
  let (page, rank, snippet) = row.unpack((string, float, string))
  var pos = 0
  # split the snippet up into an array of highlighted/unhighlighted fragments
  var snips: seq[(bool, string)] = @[]
  while true:
    let newpos = find(snippet, "<hlstart>", pos)
    if newpos == -1:
      break
    snips.add((false, snippet[pos .. newpos - 1]))
    var endpos = find(snippet, "<hlend>", newpos)
    # if there is no <hlend> (this *probably* shouldn't happen), highlight the rest of the string
    if endpos == -1:
      endpos = len(snippet)
    snips.add((true, snippet[newpos + len("<hlstart>") .. endpos - 1]))
    pos = endpos + len("<hlend>")
  # pos can point past the end if a <hlend> was missing; avoid slicing out of bounds
  if pos < len(snippet):
    snips.add((false, snippet[pos .. len(snippet) - 1]))
  # filter out empty snippet fragments because they're not useful; FTS5 bm25 ranks are
  # negative (more negative = better match), so negate and log-scale for nicer display
  SearchResult(page: page, rank: log10(-rank * 1e7), snippet: snips.filter(x => len(x[1]) > 0))
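# the query is FTS5 MATCH syntax; bm25(5.0, 1.0) weights title (page) matches 5x more
# heavily than content matches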
proc search*(db: DbConn, query: string): seq[SearchResult] =
  db.all("SELECT page, rank, snippet(pages_fts, 1, '<hlstart>', '<hlend>', ' ... ', 32) FROM pages_fts WHERE pages_fts MATCH ? AND rank MATCH 'bm25(5.0, 1.0)' ORDER BY rank", query).map(processSearchRow)
proc getBasicFileInfo*(db: DbConn, page, filename: string): Option[(string, string)] =
  db.one("SELECT storagePath, mimeType FROM files WHERE page = ? AND filename = ?", page, filename).map(proc (r: ResultRow): (string, string) = r.unpack((string, string)))
proc getPageFiles*(db: DbConn, page: string): seq[FileInfo] =
  db.all("SELECT filename, mimeType, uploadedTime, metadata FROM files WHERE page = ?", page).map(proc (r: ResultRow): FileInfo =
    let (filename, mime, upload, meta) = r.unpack((string, string, Time, string))
    FileInfo(filename: filename, mimeType: mime, uploadedTime: upload, metadata: parse(meta, JsonNode)))