2021-01-06 16:09:48 +00:00
import karax / [ karaxdsl , vdom ]
2021-02-16 14:26:01 +00:00
import cmark / native as cmark except Node , Parser
2021-01-06 16:09:48 +00:00
# the builtin re library would probably be better for this - it can directly take cstrings (so better perf when dealing with the cstrings from cmark) and may be faster
# unfortunately it does not expose a findAll thing which returns the *positions* of everything for some weird reason
import regex
2021-02-16 14:26:01 +00:00
from strutils import join , find
import unicode
import sets
from . / util import pageToSlug , slugToPage , autoInitializedThreadvar
2021-01-06 16:09:48 +00:00
# Register cmark-gfm's bundled syntax extensions (table, strikethrough, ...)
# so cmark_find_syntax_extension can look them up by name later.
cmark_gfm_core_extensions_ensure_registered ( )
2021-02-16 14:26:01 +00:00
type
  # Owning handle to a cmark AST node; the node is freed on destruction.
  Node = object
    raw: NodePtr
  # Non-owning view of a cmark node; destruction is a no-op.
  BorrowedNode = object
    raw: NodePtr
  # Owning handle to a cmark parser; freed on destruction.
  Parser = object
    raw: ParserPtr

# Node is move-only: copying would double-free the underlying cmark node.
proc `=copy`(dest: var Node, source: Node) {.error.}
proc `=destroy`(x: var Node) = cmark_node_free(x.raw)
proc `=destroy`(x: var BorrowedNode) = discard
proc `=destroy`(x: var Parser) = cmark_parser_free(x.raw)

# Produce a non-owning view of an owned node (ownership stays with `n`).
proc borrow(n: Node): BorrowedNode = BorrowedNode(raw: n.raw)
proc newParser(options: int64, extensions: seq[string]): Parser =
  ## Create a cmark parser configured with `options` and attach the named
  ## syntax extensions. Raises CatchableError if the parser cannot be
  ## created or an extension fails to attach, LibraryError if an extension
  ## name is unknown.
  let parser: ParserPtr = cmark_parser_new(options.cint)
  if parser == nil: raise newException(CatchableError, "failed to initialize parser")
  # load and enable desired syntax extensions
  # these are freed with the parser (probably)
  for ext in extensions:
    let extName: cstring = ext
    let extension = cmark_find_syntax_extension(extName)
    if extension == nil:
      # the raw pointer is not yet owned by a Parser object, so free manually
      cmark_parser_free(parser)
      raise newException(LibraryError, "failed to find extension " & ext)
    if cmark_parser_attach_syntax_extension(parser, extension) == 0:
      cmark_parser_free(parser)
      raise newException(CatchableError, "failed to attach extension " & ext)
  Parser(raw: parser)
proc parse(p: Parser, document: string): Node =
  ## Feed `document` to the parser and return the finished AST as an
  ## owning Node. Raises CatchableError if cmark returns no AST.
  let buffer: cstring = document
  cmark_parser_feed(p.raw, buffer, len(document).csize_t)
  let ast = cmark_parser_finish(p.raw)
  if ast == nil: raise newException(CatchableError, "parsing failed - should not occur")
  Node(raw: ast)
# Type tag of the underlying cmark node.
proc nodeType(n: BorrowedNode): NodeType = cmark_node_get_type(n.raw)
# Literal text content of the node (copied into a Nim string).
proc nodeContent(n: BorrowedNode): string = $cmark_node_get_literal(n.raw)
proc newNode(ty: NodeType, content: string): Node =
  ## Allocate a fresh cmark node of type `ty` with `content` as its literal.
  ## Raises CatchableError on allocation or literal-setting failure.
  let raw = cmark_node_new(ty)
  if raw == nil: raise newException(CatchableError, "node creation failed")
  if cmark_node_set_literal(raw, content) != 1:
    # not yet wrapped in a Node, so free manually before raising
    cmark_node_free(raw)
    raise newException(CatchableError, "node content setting failed")
  Node(raw: raw)
# Non-owning view of `parentOf`'s parent node.
proc parentNode(parentOf: BorrowedNode): BorrowedNode =
  BorrowedNode(raw: cmark_node_parent(parentOf.raw))

# Insert `node` into the tree, transferring ownership to the tree
# ({.nodestroy.} stops the sink parameter being destroyed here).
# NOTE(review): despite the name this calls cmark_node_insert_before, i.e.
# `node` is placed immediately before `after`; successive calls against the
# same anchor therefore come out in call order - confirm intent.
proc pushNodeAfter(after: BorrowedNode, node: sink Node) {.nodestroy.} =
  assert cmark_node_insert_before(after.raw, node.raw) == 1

# Detach `node` from its tree and return an owning handle so it is freed.
proc unlinkNode(node: sink BorrowedNode): Node {.nodestroy.} =
  cmark_node_unlink(node.raw)
  Node(raw: node.raw)
proc render(ast: Node, options: int64, parser: Parser): string =
  ## Render the AST to HTML, honouring the syntax extensions attached to
  ## `parser`. The cstring returned by cmark is freed after copying.
  let html: cstring = cmark_render_html(ast.raw, options.cint,
    cmark_parser_get_syntax_extensions(parser.raw))
  defer: free(html)
  result = $html
iterator cmarkTree(root: BorrowedNode): (EventType, BorrowedNode) {.inline.} =
  ## Walk the subtree under `root` with cmark's built-in iterator, yielding
  ## (event, node) pairs until the etDone event. The iterator is always freed.
  var walker = cmark_iter_new(root.raw)
  if walker == nil: raise newException(CatchableError, "iterator initialization failed")
  defer: cmark_iter_free(walker)
  while true:
    let event = cmark_iter_next(walker)
    if event == etDone: break
    yield (event, BorrowedNode(raw: cmark_iter_get_node(walker)))
2021-01-06 16:09:48 +00:00
func wikilink(page, linkText: string): string =
  ## Render a wikilink to `page` as an HTML anchor with class "wikilink",
  ## displaying `linkText`. The href is the page's slug.
  let vdom = buildHtml(a(href = pageToSlug(page), class = "wikilink")): text linkText
  $vdom
2021-02-16 14:26:01 +00:00
# Matches [[page]] or [[page:link text]]; capture 0 is the page name,
# capture 1 the optional display text. Thread-local because Regex is not
# safely shareable across threads.
autoInitializedThreadvar(wlRegex, Regex, re"\[\[([^:\]]+):?([^\]]+)?\]\]")
# Collapses runs of 2+ newlines when normalizing extracted plain text.
autoInitializedThreadvar(newlinesRegex, Regex, re"\n{2,}")
2021-01-06 16:09:48 +00:00
proc renderToHtml*(input: string): string =
  ## Render markdown `input` to HTML, rewriting [[wikilink]] syntax inside
  ## text nodes into <a class="wikilink"> elements.
  let wlRegex = wlRegex()
  let opt = CMARK_OPT_UNSAFE or CMARK_OPT_FOOTNOTES or
    CMARK_OPT_STRIKETHROUGH_DOUBLE_TILDE or CMARK_OPT_TABLE_PREFER_STYLE_ATTRIBUTES
  # initialize parser with the extensions in use, parse things
  let parser = newParser(opt, @["table", "strikethrough"])
  let doc = parse(parser, input)
  # iterate over AST using built-in cmark-gfm AST iteration thing
  for (evType, node) in cmarkTree(borrow(doc)):
    # only text nodes can contain wikilink syntax
    if nodeType(node) == ntText:
      let ntext = nodeContent(node)
      # check for wikilinks in text node
      let matches = findAll(ntext, wlRegex)
      # if there are any, put in the appropriate HTML nodes
      if len(matches) > 0:
        var lastpos = 0
        # I think this does similar things to the snippet highlight code,
        # perhaps it could be factored out somehow
        for match in matches:
          # capture 0 = target page. I don't know why this doesn't use
          # Option. Perhaps sometimes there are somehow > 1 ranges.
          let page = ntext[match.captures[0][0]]
          # if there is a separate linkText field, use it, otherwise the page
          let linkText =
            if len(match.captures[1]) > 0: ntext[match.captures[1][0]]
            else: page
          let html = wikilink(page, linkText)
          # push text before this onto the tree, then the wikilink's HTML
          pushNodeAfter(node, newNode(ntText, ntext[lastpos ..< match.boundaries.a]))
          pushNodeAfter(node, newNode(ntHtmlInline, html))
          lastpos = match.boundaries.b + 1
        # push final text, if extant
        if lastpos != len(ntext):
          pushNodeAfter(node, newNode(ntText, ntext[lastpos ..< len(ntext)]))
        # remove original text node; its content now lives in the new nodes
        discard unlinkNode(node)
  render(doc, opt, parser)
proc textContent(node: BorrowedNode): string =
  ## Extract the plain text under `node`: literal text/code content, with
  ## block boundaries as newlines, soft breaks as spaces and hard breaks as
  ## newlines. The result is stripped and blank-line runs are collapsed.
  let newlinesRegex = newlinesRegex()
  for (evType, child) in cmarkTree(node):
    let kind = nodeType(child)
    if kind == ntText or kind == ntCode:
      result &= nodeContent(child)
    elif int64(kind) < CMARK_NODE_TYPE_INLINE and evType == etExit and kind != ntItem:
      # leaving a block-level node (except list items) ends a line
      result &= "\n"
    elif kind == ntSoftBreak:
      result &= " "
    elif kind == ntLineBreak:
      result &= "\n"
  result = replace(strip(result), newlinesRegex, "\n")
proc findParagraphParent(node: BorrowedNode): BorrowedNode =
  ## Walk up from `node` to the nearest enclosing paragraph node.
  ## NOTE(review): assumes a paragraph ancestor always exists (true for text
  ## nodes produced by cmark?); otherwise this never terminates - confirm.
  result = node
  while nodeType(result) != ntParagraph:
    result = parentNode(result)
type
  Link* = object
    ## One wikilink occurrence: canonical target page, display text, and a
    ## snippet of surrounding paragraph text for context.
    page*, text*, context*: string
  ParsedPage* = object
    ## Result of parsePage: deduplicated outgoing links plus the page's
    ## plain-text content.
    links*: seq[Link]
    fullText: string
# Generates context for a link given the surrounding string and its position in it
# (startPos..endPos, both inclusive).
# Takes a given quantity of space-separated words from both sides;
# if not enough exist on one side, takes more from the other (total budget is
# 2 * lookaround tokens). Tokens are joined with single spaces.
# Fixed: the unclamped backwards slice raised IndexDefect whenever the budget
# borrowed from a short side exceeded the tokens available on the other side
# (e.g. lookaround earlier tokens but zero later tokens).
# TODO: treat a wikilink as one token
proc linkContext(str: string, startPos: int, endPos: int, lookaround: int): string =
  let earlierToks = splitWhitespace(str[0 ..< startPos])   # words before the link
  let linkText = str[startPos .. endPos]                   # the link text itself
  let laterToks = splitWhitespace(str[endPos + 1 .. ^1])   # words after the link
  let budget = lookaround * 2
  # desired take per side: at least `lookaround`, plus whatever budget the
  # other side cannot use; clamped to what actually exists on this side
  let takeBefore = min(earlierToks.len, max(lookaround, budget - laterToks.len))
  let takeAfter = min(laterToks.len, max(lookaround, budget - earlierToks.len))
  earlierToks[earlierToks.len - takeBefore .. ^1].join(" ") &
    linkText &
    laterToks[0 ..< takeAfter].join(" ")
proc parsePage*(input: string): ParsedPage =
  ## Parse a page's markdown and extract its wikilinks - deduplicated by
  ## canonical page name, each with surrounding textual context - plus the
  ## page's full plain-text content.
  ## Fixes vs previous revision: removed a leftover debug `echo`; `matchEnd`
  ## is an exclusive index, but linkContext takes an inclusive end, so pass
  ## `matchEnd - 1` (the old code included one stray character and raised
  ## IndexDefect when a link ended its paragraph); a failed paragraph lookup
  ## now yields an empty context instead of crashing.
  let wlRegex = wlRegex()
  let opt = CMARK_OPT_UNSAFE or CMARK_OPT_FOOTNOTES or
    CMARK_OPT_STRIKETHROUGH_DOUBLE_TILDE or CMARK_OPT_TABLE_PREFER_STYLE_ATTRIBUTES
  let parser = newParser(opt, @["table", "strikethrough"])
  let doc = parse(parser, input)
  var wikilinks: seq[Link] = @[]
  var seenPages = initHashSet[string]()
  for (evType, node) in cmarkTree(borrow(doc)):
    if nodeType(node) == ntText:
      let ntext = nodeContent(node)
      let matches = findAll(ntext, wlRegex)
      if len(matches) > 0:
        # plain text of the enclosing paragraph, used for link context
        let paragraph = textContent(findParagraphParent(node))
        var matchEnd = 0
        for match in matches:
          let page = ntext[match.captures[0][0]]
          # if there is a separate linkText field, use it, otherwise the page
          let linkText =
            if len(match.captures[1]) > 0: ntext[match.captures[1][0]]
            else: page
          let canonicalPage = slugToPage(page)
          if canonicalPage notin seenPages:
            # matches in this text node will not necessarily line up with ones in
            # the surrounding textual content, so look up the wikilink's source in
            # the paragraph. kind of hacky but should work in any scenario which
            # isn't deliberately constructed pathologically, especially since it
            # will only return stuff after the last link
            let fullLink = ntext[match.boundaries]
            let matchInParagraph = find(paragraph, fullLink, matchEnd)
            var context = ""
            if matchInParagraph != -1:
              matchEnd = matchInParagraph + fullLink.len
              # matchEnd is exclusive; linkContext wants an inclusive end index
              context = linkContext(paragraph, matchInParagraph, matchEnd - 1, 12)
            # add to wikilinks list, and deduplicate
            wikilinks.add(Link(page: canonicalPage, text: linkText, context: context))
            seenPages.incl(canonicalPage)
  ParsedPage(links: wikilinks, fullText: textContent(borrow(doc)))