2021-01-06 16:09:48 +00:00
import karax / [ karaxdsl , vdom ]
2021-02-16 14:26:01 +00:00
import cmark / native as cmark except Node , Parser
2021-01-06 16:09:48 +00:00
# the builtin re library would probably be better for this - it can directly take cstrings (so better perf when dealing with the cstrings from cmark) and may be faster
# unfortunately it does not expose a findAll thing which returns the *positions* of everything for some weird reason
import regex
2021-02-16 14:26:01 +00:00
from strutils import join , find
import unicode
import sets
from . / util import pageToSlug , slugToPage , autoInitializedThreadvar
2021-01-06 16:09:48 +00:00
# Register cmark-gfm's bundled syntax extensions (table, strikethrough, ...)
# so cmark_find_syntax_extension can look them up by name later.
cmark_gfm_core_extensions_ensure_registered ( )
2021-02-16 14:26:01 +00:00
type
  # Owning handle to a cmark AST node; the node is freed on destruction.
  Node = object
    raw: NodePtr
  # Non-owning view of a cmark node; destruction is a no-op.
  BorrowedNode = object
    raw: NodePtr
  # Owning handle to a cmark parser; freed on destruction.
  Parser = object
    raw: ParserPtr

# Node is move-only: copying would double-free the underlying cmark node.
proc `=copy`(dest: var Node, source: Node) {.error.}
proc `=destroy`(x: var Node) = cmark_node_free(x.raw)
proc `=destroy`(x: var BorrowedNode) = discard
proc `=destroy`(x: var Parser) = cmark_parser_free(x.raw)

# Produce a non-owning view of an owned node (ownership stays with `n`).
proc borrow(n: Node): BorrowedNode = BorrowedNode(raw: n.raw)
proc newParser(options: int64, extensions: seq[string]): Parser =
  ## Create a cmark parser configured with `options` and attach the named
  ## syntax extensions. Raises CatchableError if the parser cannot be
  ## created or an extension fails to attach, LibraryError if an extension
  ## name is unknown.
  let parser: ParserPtr = cmark_parser_new(options.cint)
  if parser == nil: raise newException(CatchableError, "failed to initialize parser")
  # load and enable desired syntax extensions
  # these are freed with the parser (probably)
  for ext in extensions:
    let extName: cstring = ext
    let extension = cmark_find_syntax_extension(extName)
    if extension == nil:
      # the raw pointer is not yet owned by a Parser object, so free manually
      cmark_parser_free(parser)
      raise newException(LibraryError, "failed to find extension " & ext)
    if cmark_parser_attach_syntax_extension(parser, extension) == 0:
      cmark_parser_free(parser)
      raise newException(CatchableError, "failed to attach extension " & ext)
  Parser(raw: parser)
proc parse(p: Parser, document: string): Node =
  ## Feed `document` to the parser and return the finished AST as an
  ## owning Node. Raises CatchableError if cmark returns no AST.
  let buffer: cstring = document
  cmark_parser_feed(p.raw, buffer, len(document).csize_t)
  let ast = cmark_parser_finish(p.raw)
  if ast == nil: raise newException(CatchableError, "parsing failed - should not occur")
  Node(raw: ast)
# Type tag of the underlying cmark node.
proc nodeType(n: BorrowedNode): NodeType = cmark_node_get_type(n.raw)
# Literal text content of the node (copied into a Nim string).
proc nodeContent(n: BorrowedNode): string = $cmark_node_get_literal(n.raw)
proc newNode(ty: NodeType, content: string): Node =
  ## Allocate a fresh cmark node of type `ty` with `content` as its literal.
  ## Raises CatchableError on allocation or literal-setting failure.
  let raw = cmark_node_new(ty)
  if raw == nil: raise newException(CatchableError, "node creation failed")
  if cmark_node_set_literal(raw, content) != 1:
    # not yet wrapped in a Node, so free manually before raising
    cmark_node_free(raw)
    raise newException(CatchableError, "node content setting failed")
  Node(raw: raw)
# Non-owning view of `parentOf`'s parent node.
proc parentNode(parentOf: BorrowedNode): BorrowedNode =
  BorrowedNode(raw: cmark_node_parent(parentOf.raw))

# Insert `node` into the tree, transferring ownership to the tree
# ({.nodestroy.} stops the sink parameter being destroyed here).
# NOTE(review): despite the name this calls cmark_node_insert_before, i.e.
# `node` is placed immediately before `after`; successive calls against the
# same anchor therefore come out in call order - confirm intent.
proc pushNodeAfter(after: BorrowedNode, node: sink Node) {.nodestroy.} =
  assert cmark_node_insert_before(after.raw, node.raw) == 1

# Detach `node` from its tree and return an owning handle so it is freed.
proc unlinkNode(node: sink BorrowedNode): Node {.nodestroy.} =
  cmark_node_unlink(node.raw)
  Node(raw: node.raw)
proc render(ast: Node, options: int64, parser: Parser): string =
  ## Render the AST to HTML, honouring the syntax extensions attached to
  ## `parser`. The cstring returned by cmark is freed after copying.
  let html: cstring = cmark_render_html(ast.raw, options.cint,
    cmark_parser_get_syntax_extensions(parser.raw))
  defer: free(html)
  result = $html
iterator cmarkTree(root: BorrowedNode): (EventType, BorrowedNode) {.inline.} =
  ## Walk the subtree under `root` with cmark's built-in iterator, yielding
  ## (event, node) pairs until the etDone event. The iterator is always freed.
  var walker = cmark_iter_new(root.raw)
  if walker == nil: raise newException(CatchableError, "iterator initialization failed")
  defer: cmark_iter_free(walker)
  while true:
    let event = cmark_iter_next(walker)
    if event == etDone: break
    yield (event, BorrowedNode(raw: cmark_iter_get_node(walker)))
2021-01-06 16:09:48 +00:00
func wikilink(page, linkText: string): string =
  ## Render a wikilink to `page` as an HTML anchor with class "wikilink",
  ## displaying `linkText`. The href is the page's slug.
  let vdom = buildHtml(a(href = pageToSlug(page), class = "wikilink")): text linkText
  $vdom
2021-02-16 14:26:01 +00:00
# Matches [[page]] or [[page:link text]]; capture 0 is the page name,
# capture 1 the optional display text. Thread-local because Regex is not
# safely shareable across threads.
autoInitializedThreadvar(wlRegex, Regex, re"\[\[([^:\]]+):?([^\]]+)?\]\]")
# Collapses runs of 2+ newlines when normalizing extracted plain text.
autoInitializedThreadvar(newlinesRegex, Regex, re"\n{2,}")
2021-01-06 16:09:48 +00:00
proc renderToHtml*(input: string): string =
  ## Render markdown `input` to HTML, rewriting [[wikilink]] syntax inside
  ## text nodes into <a class="wikilink"> elements.
  let wlRegex = wlRegex()
  let opt = CMARK_OPT_UNSAFE or CMARK_OPT_FOOTNOTES or
    CMARK_OPT_STRIKETHROUGH_DOUBLE_TILDE or CMARK_OPT_TABLE_PREFER_STYLE_ATTRIBUTES
  # initialize parser with the extensions in use, parse things
  let parser = newParser(opt, @["table", "strikethrough"])
  let doc = parse(parser, input)
  # iterate over AST using built-in cmark-gfm AST iteration thing
  for (evType, node) in cmarkTree(borrow(doc)):
    # only text nodes can contain wikilink syntax
    if nodeType(node) == ntText:
      let ntext = nodeContent(node)
      # check for wikilinks in text node
      let matches = findAll(ntext, wlRegex)
      # if there are any, put in the appropriate HTML nodes
      if len(matches) > 0:
        var lastpos = 0
        # I think this does similar things to the snippet highlight code,
        # perhaps it could be factored out somehow
        for match in matches:
          # capture 0 = target page. I don't know why this doesn't use
          # Option. Perhaps sometimes there are somehow > 1 ranges.
          let page = ntext[match.captures[0][0]]
          # if there is a separate linkText field, use it, otherwise the page
          let linkText =
            if len(match.captures[1]) > 0: ntext[match.captures[1][0]]
            else: page
          let html = wikilink(page, linkText)
          # push text before this onto the tree, then the wikilink's HTML
          pushNodeAfter(node, newNode(ntText, ntext[lastpos ..< match.boundaries.a]))
          pushNodeAfter(node, newNode(ntHtmlInline, html))
          lastpos = match.boundaries.b + 1
        # push final text, if extant
        if lastpos != len(ntext):
          pushNodeAfter(node, newNode(ntText, ntext[lastpos ..< len(ntext)]))
        # remove original text node; its content now lives in the new nodes
        discard unlinkNode(node)
  render(doc, opt, parser)
proc textContent(node: BorrowedNode): string =
  ## Extract the plain text under `node`: literal text/code content, with
  ## block boundaries as newlines, soft breaks as spaces and hard breaks as
  ## newlines. The result is stripped and blank-line runs are collapsed.
  let newlinesRegex = newlinesRegex()
  for (evType, child) in cmarkTree(node):
    let kind = nodeType(child)
    if kind == ntText or kind == ntCode:
      result &= nodeContent(child)
    elif int64(kind) < CMARK_NODE_TYPE_INLINE and evType == etExit and kind != ntItem:
      # leaving a block-level node (except list items) ends a line
      result &= "\n"
    elif kind == ntSoftBreak:
      result &= " "
    elif kind == ntLineBreak:
      result &= "\n"
  result = replace(strip(result), newlinesRegex, "\n")
proc findParagraphParent(node: BorrowedNode): BorrowedNode =
  ## Walk up from `node` to the nearest enclosing paragraph node.
  ## NOTE(review): assumes a paragraph ancestor always exists (true for text
  ## nodes produced by cmark?); otherwise this never terminates - confirm.
  result = node
  while nodeType(result) != ntParagraph:
    result = parentNode(result)
type
  Link* = object
    ## One wikilink occurrence: canonical target page, display text, and a
    ## snippet of surrounding paragraph text for context.
    page*, text*, context*: string
  ParsedPage* = object
    ## Result of parsePage: deduplicated outgoing links plus the page's
    ## plain-text content.
    links*: seq[Link]
    fullText: string
# Generates context for a link given the surrounding string and its position in it
# (startPos..endPos, both inclusive).
# Takes a given quantity of space-separated words from both sides;
# if not enough exist on one side, takes more from the other (total budget is
# 2 * lookaround tokens). Tokens are joined with single spaces.
# Fixed: the unclamped backwards slice raised IndexDefect whenever the budget
# borrowed from a short side exceeded the tokens available on the other side
# (e.g. lookaround earlier tokens but zero later tokens).
# TODO: treat a wikilink as one token
proc linkContext(str: string, startPos: int, endPos: int, lookaround: int): string =
  let earlierToks = splitWhitespace(str[0 ..< startPos])   # words before the link
  let linkText = str[startPos .. endPos]                   # the link text itself
  let laterToks = splitWhitespace(str[endPos + 1 .. ^1])   # words after the link
  let budget = lookaround * 2
  # desired take per side: at least `lookaround`, plus whatever budget the
  # other side cannot use; clamped to what actually exists on this side
  let takeBefore = min(earlierToks.len, max(lookaround, budget - laterToks.len))
  let takeAfter = min(laterToks.len, max(lookaround, budget - earlierToks.len))
  earlierToks[earlierToks.len - takeBefore .. ^1].join(" ") &
    linkText &
    laterToks[0 ..< takeAfter].join(" ")
proc parsePage*(input: string): ParsedPage =
  ## Parse a page's markdown and extract its wikilinks - deduplicated by
  ## canonical page name, each with surrounding textual context - plus the
  ## page's full plain-text content.
  ## Fixes vs previous revision: removed a leftover debug `echo`; `matchEnd`
  ## is an exclusive index, but linkContext takes an inclusive end, so pass
  ## `matchEnd - 1` (the old code included one stray character and raised
  ## IndexDefect when a link ended its paragraph); a failed paragraph lookup
  ## now yields an empty context instead of crashing.
  let wlRegex = wlRegex()
  let opt = CMARK_OPT_UNSAFE or CMARK_OPT_FOOTNOTES or
    CMARK_OPT_STRIKETHROUGH_DOUBLE_TILDE or CMARK_OPT_TABLE_PREFER_STYLE_ATTRIBUTES
  let parser = newParser(opt, @["table", "strikethrough"])
  let doc = parse(parser, input)
  var wikilinks: seq[Link] = @[]
  var seenPages = initHashSet[string]()
  for (evType, node) in cmarkTree(borrow(doc)):
    if nodeType(node) == ntText:
      let ntext = nodeContent(node)
      let matches = findAll(ntext, wlRegex)
      if len(matches) > 0:
        # plain text of the enclosing paragraph, used for link context
        let paragraph = textContent(findParagraphParent(node))
        var matchEnd = 0
        for match in matches:
          let page = ntext[match.captures[0][0]]
          # if there is a separate linkText field, use it, otherwise the page
          let linkText =
            if len(match.captures[1]) > 0: ntext[match.captures[1][0]]
            else: page
          let canonicalPage = slugToPage(page)
          if canonicalPage notin seenPages:
            # matches in this text node will not necessarily line up with ones in
            # the surrounding textual content, so look up the wikilink's source in
            # the paragraph. kind of hacky but should work in any scenario which
            # isn't deliberately constructed pathologically, especially since it
            # will only return stuff after the last link
            let fullLink = ntext[match.boundaries]
            let matchInParagraph = find(paragraph, fullLink, matchEnd)
            var context = ""
            if matchInParagraph != -1:
              matchEnd = matchInParagraph + fullLink.len
              # matchEnd is exclusive; linkContext wants an inclusive end index
              context = linkContext(paragraph, matchInParagraph, matchEnd - 1, 12)
            # add to wikilinks list, and deduplicate
            wikilinks.add(Link(page: canonicalPage, text: linkText, context: context))
            seenPages.incl(canonicalPage)
  ParsedPage(links: wikilinks, fullText: textContent(borrow(doc)))