2021-01-06 16:09:48 +00:00
import tiny_sqlite
import logging
import options
import times
2021-02-16 14:26:01 +00:00
import zstd / compress as zstd_compress
import zstd / decompress as zstd_decompress
2021-01-06 16:09:48 +00:00
import sequtils
import strutils except splitWhitespace
import json
import std / jsonutils
import nimlevenshtein
import sugar
import unicode
2021-02-16 14:26:01 +00:00
import math
2021-01-06 16:09:48 +00:00
2021-03-13 22:36:41 +00:00
import . / util
2021-02-16 14:26:01 +00:00
from . / md import parsePage
2021-01-06 16:09:48 +00:00
# Ordered list of schema migration scripts; index + 1 is the schema version
# recorded in PRAGMA user_version (see migrate below).
let migrations = @[
    #[
    `pages` stores the content of all pages, as well as when they were last updated and created - this is all the information needed to render the current version of a page
    It's mildly inefficient space-wise to store the latest content here AND in the revisions table (in compressed form), but dealing with this better would probably require complex logic elsewhere
    which I don't think is worth it - I anticipate that media files will be much bigger, and probably significant amounts of old revisions (it would be worth investigating storing compact diffs).
    `revisions` stores all changes to a page, with metadata as JSON (messagepack is generally better, but SQLite can only query JSON) and optionally a separate blob storing larger associated data
    (currently, the entire page content, zstd-compressed)
    rowids (INTEGER PRIMARY KEY) are explicitly extant here due to FTS external content requiring them to be stable to work but are not to be used much.
    Links' toPage is not a foreign key as it's valid for the page to not exist.
    ]#
    """
    CREATE TABLE pages (
        uid INTEGER PRIMARY KEY,
        page TEXT NOT NULL UNIQUE,
        updated INTEGER NOT NULL,
        created INTEGER NOT NULL,
        content TEXT NOT NULL
    );
    CREATE TABLE revisions (
        uid INTEGER PRIMARY KEY,
        page TEXT NOT NULL REFERENCES pages(page),
        timestamp INTEGER NOT NULL,
        meta TEXT NOT NULL,
        fullData BLOB
    );
    """,
    """
    CREATE VIRTUAL TABLE pages_fts USING fts5 (
        page, content,
        tokenize = 'porter unicode61 remove_diacritics 2',
        content = pages, content_rowid = uid
    );
    """,
    """
    CREATE TABLE links (
        uid INTEGER PRIMARY KEY,
        fromPage TEXT NOT NULL REFERENCES pages(page),
        toPage TEXT NOT NULL,
        linkText TEXT NOT NULL,
        context TEXT NOT NULL,
        UNIQUE (fromPage, toPage)
    );
    """,
    """
    CREATE TABLE files (
        uid INTEGER PRIMARY KEY,
        page TEXT NOT NULL REFERENCES pages(page),
        filename TEXT NOT NULL,
        storagePath TEXT NOT NULL,
        mimeType TEXT NOT NULL,
        metadata TEXT NOT NULL,
        uploadedTime INTEGER NOT NULL,
        UNIQUE (page, filename)
    );
    """,
    """
    CREATE TABLE sessions (
        sid INTEGER PRIMARY KEY,
        timestamp INTEGER NOT NULL,
        data TEXT NOT NULL
    );
    """
]
type
    # How a revision's fullData blob is stored.
    Encoding* {.pure.} = enum
        Plain = 0, Zstd = 1
    # Discriminator for revision metadata; stored as the JSON "kind" field,
    # which queries match with json_extract(meta, '$.kind').
    RevisionType* {.pure.} = enum
        NewContent = 0
    # Metadata stored (as JSON) alongside each revision row.
    RevisionMeta* = object
        case kind*: RevisionType
        of NewContent:
            encoding*: Encoding
            # Levenshtein distance from the previous content, if computed
            editDistance*: Option[int]
            # byte length of the (uncompressed) content
            size*: Option[int]
            # word count per wordCount's definition
            words*: Option[int]
    # A revision's metadata plus its timestamp (content blob not included).
    Revision* = object
        meta*: RevisionMeta
        time*: Time
    # One full-text search hit; snippet is a sequence of
    # (highlighted?, fragment) pieces.
    SearchResult* = object
        page*: string
        rank*: float
        snippet*: seq[(bool, string)]
    # The current state of a wiki page, mirroring a row of `pages`.
    Page* = object
        page*, content*: string
        created*, updated*: Time
        uid*: int64
    # An incoming link to a page, from the `links` table.
    Backlink* = object
        fromPage*, text*, context*: string
    # A file attached to a page, from the `files` table (storagePath omitted).
    FileInfo* = object
        filename*, mimeType*: string
        uploadedTime*: Time
        metadata*: JsonNode
2021-01-06 16:09:48 +00:00
proc migrate*(db: DbConn) =
    ## Bring the database schema up to date: read the current schema version
    ## from PRAGMA user_version, then apply each not-yet-run script from
    ## `migrations` in order, each in its own transaction, bumping
    ## user_version as it goes.
    let currentVersion = fromDbValue(get db.value("PRAGMA user_version"), int)
    for mid in (currentVersion + 1) .. migrations.len:
        db.transaction:
            logger().log(lvlInfo, "Migrating to schema " & $mid)
            db.execScript migrations[mid - 1]
            # for some reason this pragma does not work using normal parameter binding
            db.exec("PRAGMA user_version = " & $mid)
    logger().log(lvlDebug, "DB ready")
2021-01-06 16:09:48 +00:00
proc parse*(s: string, T: typedesc): T =
    ## Deserialize JSON text into a value of type T, tolerating both extra
    ## keys in the JSON and keys missing from it (fields keep defaults).
    fromJson(result, parseJSON(s), Joptions(allowExtraKeys: true, allowMissingKeys: true))
proc processFullRevisionRow(row: ResultRow): (RevisionMeta, string) =
    ## Unpack a (meta JSON, fullData blob) revision row into its parsed
    ## metadata and the page content, zstd-decompressing the content when the
    ## metadata says it was stored compressed.
    let (metaJSON, full) = row.unpack((string, seq[byte]))
    let meta = parse(metaJSON, RevisionMeta)
    var content = cast[string](full)
    if meta.encoding == Zstd:
        content = cast[string](zstd_decompress.decompress(content))
    (meta, content)
2021-02-16 14:26:01 +00:00
proc fetchPage*(db: DbConn, page: string): Option[Page] =
    ## Retrieve the current version of `page` directly from the pages table,
    ## or none if no such page exists.
    db.one("SELECT uid, updated, created, content FROM pages WHERE page = ?", page).map(proc(row: ResultRow): Page =
        let (uid, updated, created, content) = row.unpack((int64, Time, Time, string))
        Page(page: page, created: created, updated: updated, content: content, uid: uid)
    )
proc fetchPage*(db: DbConn, page: string, revision: Time): Option[Page] =
    ## Retrieve `page` as of a historical full-content revision with the exact
    ## timestamp `revision`; none if the page or that revision doesn't exist.
    ## The returned Page keeps the *current* created/updated times and uid but
    ## carries the revision's content.
    db.one("SELECT uid, updated, created FROM pages WHERE page = ?", page).flatMap(proc(row: ResultRow): Option[Page] =
        let (uid, updated, created) = row.unpack((int64, Time, Time))
        # only NewContent revisions (kind 0) carry a full content blob
        let rev = db.one("SELECT meta, fullData FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp = ?", page, revision)
        rev.map(proc(row: ResultRow): Page =
            let (meta, content) = processFullRevisionRow(row)
            Page(page: page, created: created, updated: updated, content: content, uid: uid)
        )
    )
2021-02-17 23:37:42 +00:00
proc backlinks*(db: DbConn, page: string): seq[Backlink] =
    ## All links pointing at `page`, with their source page, link text and
    ## surrounding context snippet.
    db.all("SELECT fromPage, linkText, context FROM links WHERE toPage = ?", page).map(proc(row: ResultRow): Backlink =
        let (fromPage, text, context) = row.unpack((string, string, string))
        Backlink(fromPage: fromPage, text: text, context: context))
2021-01-06 16:09:48 +00:00
# count words, defined as things separated by whitespace which are not purely Markdown-ish punctuation characters
# alternative definitions may include dropping number-only words, and/or splitting at full stops too
func wordCount(s: string): int =
    for word in splitWhitespace(s):
        if len(word) == 0: continue
        # count the word as soon as one non-punctuation byte is seen
        for bytechar in word:
            if not (bytechar in {'#', '*', '-', '>', '`', '|', '+', '[', ']'}):
                inc result
                break
proc updatePage*(db: DbConn, page: string, content: string) =
    ## Store a new version of `page`: upsert the current-content row, record a
    ## full-content revision (zstd-compressed when that is smaller), refresh
    ## the FTS index entry, and rebuild the outgoing-links rows.
    let parsed = parsePage(content)
    let previous = fetchPage(db, page)
    # if there is no previous content, empty string instead
    let previousContent = previous.map(p => p.content).get("")
    # use zstandard-compressed version if it is smaller
    let compressed = zstd_compress.compress(content, level = 10)
    var enc = Plain
    var data = cast[seq[byte]](content)
    if len(compressed) < len(data):
        enc = Zstd
        data = compressed
    # generate some useful metadata and encode to JSON
    let meta = $toJson(RevisionMeta(kind: NewContent, encoding: enc,
        editDistance: some distance(previousContent, content), size: some len(content), words: some wordCount(content)))
    let ts = getTime()
    let revisionID = snowflake()
    let pageID = previous.map(p => p.uid).get(snowflake())
    # actually write to database
    db.transaction:
        if isSome previous:
            # update existing data and remove FTS index entry for it
            db.exec("UPDATE pages SET content = ?, updated = ? WHERE uid = ?", content, ts, pageID)
            # pages_fts is an external content FTS table, so deletion has to be done like this
            db.exec("INSERT INTO pages_fts (pages_fts, rowid, page, content) VALUES ('delete', ?, ?, ?)", pageID, page, previousContent)
            # delete existing links from the page
            db.exec("DELETE FROM links WHERE fromPage = ?", page)
        else:
            db.exec("INSERT INTO pages VALUES (?, ?, ?, ?, ?)", pageID, page, ts, ts, content)
        # push to full text search index - TODO perhaps use the parsed text content (as used for context) instead of the raw markdown
        db.exec("INSERT INTO pages_fts (rowid, page, content) VALUES (?, ?, ?)", pageID, page, content)
        db.exec("INSERT INTO revisions VALUES (?, ?, ?, ?, ?)", revisionID, page, ts, meta, data)
        # insert new set of links; OR IGNORE because links has UNIQUE(fromPage, toPage)
        # and a page may link to the same target more than once - keep the first occurrence
        # rather than aborting the whole transaction
        for link in parsed.links:
            db.exec("INSERT OR IGNORE INTO links VALUES (?, ?, ?, ?, ?)", snowflake(), page, link.target, link.text, link.context)
2021-01-06 16:09:48 +00:00
proc fetchRevisions*(db: DbConn, page: string): seq[Revision] =
    ## All revisions of `page`, newest first. Only timestamps and parsed
    ## metadata are returned; content blobs are not loaded.
    db.all("SELECT timestamp, meta FROM revisions WHERE page = ? ORDER BY timestamp DESC", page).map(proc(row: ResultRow): Revision =
        let (ts, metaJSON) = row.unpack((Time, string))
        Revision(time: ts, meta: parse(metaJSON, RevisionMeta))
    )
proc processRevisionRow(r: ResultRow): Revision =
    ## Convert a (timestamp, meta JSON) row into a Revision value.
    let (ts, meta) = r.unpack((Time, string))
    Revision(time: ts, meta: parse(meta, RevisionMeta))
proc adjacentRevisions*(db: DbConn, page: string, ts: Time): (Option[Revision], Option[Revision]) =
    ## The full-content (kind 0) revisions of `page` nearest to `ts`,
    ## returned as (next, previous); either may be none at the ends of history.
    # revision after given timestamp
    let next = db.one("SELECT timestamp, meta FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp > ? ORDER BY timestamp ASC LIMIT 1", page, ts)
    # revision before given timestamp
    let prev = db.one("SELECT timestamp, meta FROM revisions WHERE page = ? AND json_extract(meta, '$.kind') = 0 AND timestamp < ? ORDER BY timestamp DESC LIMIT 1", page, ts)
    (next.map(processRevisionRow), prev.map(processRevisionRow))
proc processSearchRow(row: ResultRow): SearchResult =
    ## Turn an FTS query row (page, rank, snippet-with-markers) into a
    ## SearchResult, splitting the snippet on the <hlstart>/<hlend> markers
    ## emitted by search's snippet() call.
    let (page, rank, snippet) = row.unpack((string, float, string))
    var pos = 0
    # split snippet up into an array of highlighted/unhighlighted bits
    var snips: seq[(bool, string)] = @[]
    while true:
        let newpos = find(snippet, "<hlstart>", pos)
        if newpos == -1:
            break
        snips.add((false, snippet[pos .. newpos - 1]))
        var endpos = find(snippet, "<hlend>", newpos)
        # if no <hlend> (this *probably* shouldn't happen) then just highlight remaining rest of string
        if endpos == -1:
            endpos = len(snippet)
        snips.add((true, snippet[newpos + len("<hlstart>") .. endpos - 1]))
        # clamp so the trailing slice below stays in bounds when <hlend> was missing
        pos = min(endpos + len("<hlend>"), len(snippet))
    snips.add((false, snippet[pos .. len(snippet) - 1]))
    # filter out empty snippet fragments because they're not useful, rescale rank for nicer display
    SearchResult(page: page, rank: log10(-rank * 1e7), snippet: snips.filter(x => len(x[1]) > 0))
proc search*(db: DbConn, query: string): seq[SearchResult] =
    ## Full-text search over pages_fts; BM25 ranking weights the page title
    ## 5x over content. Snippets use <hlstart>/<hlend> markers which
    ## processSearchRow converts into (highlighted, text) fragments.
    db.all("SELECT page, rank, snippet(pages_fts, 1, '<hlstart>', '<hlend>', '...', 32) FROM pages_fts WHERE pages_fts MATCH ? AND rank MATCH 'bm25(5.0, 1.0)' ORDER BY rank", query).map(processSearchRow)
proc getBasicFileInfo*(db: DbConn, page, filename: string): Option[(string, string)] =
    ## Look up the (storagePath, mimeType) for a file attached to `page`,
    ## or none if no such attachment exists.
    db.one("SELECT storagePath, mimeType FROM files WHERE page = ? AND filename = ?", page, filename).map(proc(r: ResultRow): (string, string) = r.unpack((string, string)))
proc getPageFiles*(db: DbConn, page: string): seq[FileInfo] =
    ## All files attached to `page`, with their metadata parsed from JSON
    ## (storagePath is deliberately omitted - use getBasicFileInfo for that).
    db.all("SELECT filename, mimeType, uploadedTime, metadata FROM files WHERE page = ?", page).map(proc(r: ResultRow): FileInfo =
        let (filename, mime, upload, meta) = r.unpack((string, string, Time, string))
        FileInfo(filename: filename, mimeType: mime, uploadedTime: upload, metadata: parse(meta, JsonNode)))