# minoteaur/src/md.nim

import karax/[karaxdsl, vdom]
import cmark/native as cmark except Node, Parser
# the builtin re library would probably be better for this - it can directly take cstrings (so better perf when dealing with the cstrings from cmark) and may be faster
# unfortunately it does not expose a findAll thing which returns the *positions* of everything for some weird reason
import regex
from strutils import join, find, startsWith, endsWith
import unicode
import sets
from ./util import pageToSlug, slugToPage, autoInitializedThreadvar
cmark_gfm_core_extensions_ensure_registered()
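
# RAII wrappers around the raw cmark-gfm pointers: Node owns its AST node and
# frees it on destruction, BorrowedNode is a non-owning view into a tree owned
# elsewhere (destructor is a no-op), and Parser owns the underlying cmark parser.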
type
  Node = object
    raw: NodePtr
  BorrowedNode = object
    raw: NodePtr
  Parser = object
    raw: ParserPtr
proc `=copy`(dest: var Node, source: Node) {.error.}
proc `=destroy`(x: var Node) = cmark_node_free(x.raw)
proc `=destroy`(x: var BorrowedNode) = discard
proc `=destroy`(x: var Parser) = cmark_parser_free(x.raw)
proc borrow(n: Node): BorrowedNode = BorrowedNode(raw: n.raw)
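
# Construct a cmark parser with the given options and the listed GFM syntax
# extensions attached; raises if an extension cannot be found or attached.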
proc newParser(options: int64, extensions: seq[string]): Parser =
  let parser: ParserPtr = cmark_parser_new(options.cint)
  if parser == nil: raise newException(CatchableError, "failed to initialize parser")
  # load and enable desired syntax extensions
  # these are freed with the parser (probably)
  for ext in extensions:
    let e: cstring = ext
    let eptr = cmark_find_syntax_extension(e)
    if eptr == nil:
      cmark_parser_free(parser)
      raise newException(LibraryError, "failed to find extension " & ext)
    if cmark_parser_attach_syntax_extension(parser, eptr) == 0:
      cmark_parser_free(parser)
      raise newException(CatchableError, "failed to attach extension " & ext)
  Parser(raw: parser)
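
# Feed the whole document to the parser in one go and finalize it into an owned AST.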
proc parse(p: Parser, document: string): Node =
  let
    str: cstring = document
    length = len(document).csize_t
  cmark_parser_feed(p.raw, str, length)
  let ast = cmark_parser_finish(p.raw)
  if ast == nil: raise newException(CatchableError, "parsing failed - should not occur")
  Node(raw: ast)
proc nodeType(n: BorrowedNode): NodeType = cmark_node_get_type(n.raw)
proc nodeContent(n: BorrowedNode): string = $cmark_node_get_literal(n.raw)
proc newNode(ty: NodeType, content: string): Node =
  let raw = cmark_node_new(ty)
  if raw == nil: raise newException(CatchableError, "node creation failed")
  if cmark_node_set_literal(raw, content) != 1:
    cmark_node_free(raw)
    raise newException(CatchableError, "node content setting failed")
  Node(raw: raw)
proc parentNode(parentOf: BorrowedNode): BorrowedNode = BorrowedNode(raw: cmark_node_parent(parentOf.raw))
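
# Despite the name, this wraps cmark_node_insert_before: the new node becomes the
# preceding sibling of `after`. Repeated pushes therefore accumulate, in order,
# before the node which eventually gets unlinked.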
proc pushNodeAfter(after: BorrowedNode, node: sink Node) {.nodestroy.} = assert cmark_node_insert_before(after.raw, node.raw) == 1
proc unlinkNode(node: sink BorrowedNode): Node {.nodestroy.} =
  cmark_node_unlink(node.raw)
  Node(raw: node.raw)
proc render(ast: Node, options: int64, parser: Parser): string =
  let html: cstring = cmark_render_html(ast.raw, options.cint, cmark_parser_get_syntax_extensions(parser.raw))
  defer: free(html)
  result = $html
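
# Depth-first traversal of the AST using cmark's built-in iterator; yields
# enter/exit events paired with borrowed (non-owning) views of each node.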
iterator cmarkTree(root: BorrowedNode): (EventType, BorrowedNode) {.inline.} =
  var iter = cmark_iter_new(root.raw)
  if iter == nil: raise newException(CatchableError, "iterator initialization failed")
  defer: cmark_iter_free(iter)
  while true:
    let ev = cmark_iter_next(iter)
    if ev == etDone: break
    let node: NodePtr = cmark_iter_get_node(iter)
    yield (ev, BorrowedNode(raw: node))
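
# Render the HTML for a wikilink out-of-band using karax, since cmark itself has
# no notion of wikilinks; the page name is slugified for the href.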
func wikilink(page, linkText: string): string =
  let vdom = buildHtml(a(href=pageToSlug(page), class="wikilink")): text linkText
  $vdom
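
# Wikilinks look like [[Page]] or [[Page:display text]]: the first capture is the
# page name, the optional second the display text. The compiled regexes are kept
# per-thread, presumably because Nim's default GC uses thread-local heaps, so
# GC'd values like compiled Regexes can't easily be shared across threads.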
autoInitializedThreadvar(wlRegex, Regex, re"\[\[([^:\]]+):?([^\]]+)?\]\]")
autoInitializedThreadvar(newlinesRegex, Regex, re"\n{2,}")
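
# Convert a page's Markdown to HTML: parse with cmark-gfm, splice wikilink HTML
# into the AST as inline HTML nodes, then render the whole tree.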
proc renderToHtml*(input: string): string =
  let wlRegex = wlRegex()
  let opt = CMARK_OPT_UNSAFE or CMARK_OPT_FOOTNOTES or CMARK_OPT_STRIKETHROUGH_DOUBLE_TILDE or CMARK_OPT_TABLE_PREFER_STYLE_ATTRIBUTES
  # initialize parser with the extensions in use, parse things
  let parser = newParser(opt, @["table", "strikethrough"])
  let doc = parse(parser, input)
  # iterate over AST using built-in cmark-gfm AST iteration thing
  for (evType, node) in cmarkTree(borrow(doc)):
    # if it is a text node
    if nodeType(node) == ntText:
      let ntext = nodeContent(node)
      # check for wikilinks in text node
      let matches = findAll(ntext, wlRegex)
      # if there are any, put in the appropriate HTML nodes
      if len(matches) > 0:
        var lastpos = 0
        # I think this does similar things to the snippet highlight code, perhaps it could be factored out somehow
        for match in matches:
          let page = ntext[match.captures[0][0]] # I don't know why this doesn't use Option. Perhaps sometimes there are somehow > 1 ranges.
          # if there is a separate linkText field, use this, otherwise just use the page
          let linkText =
            if len(match.captures[1]) > 0: ntext[match.captures[1][0]]
            else: page
          let html = wikilink(page, linkText)
          # push text before this onto the tree, as well as the HTML of the wikilink
          pushNodeAfter(node, newNode(ntText, ntext[lastpos..<match.boundaries.a]))
          pushNodeAfter(node, newNode(ntHtmlInline, html))
          lastpos = match.boundaries.b + 1
        # push final text, if extant
        if lastpos != len(ntext): pushNodeAfter(node, newNode(ntText, ntext[lastpos..<len(ntext)]))
        # remove original text node
        discard unlinkNode(node)
  render(doc, opt, parser)
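
# Extract the plain text of a subtree: concatenate text/code nodes, turn soft
# breaks into spaces and hard breaks into newlines, separate block-level elements
# with newlines, and collapse runs of blank lines at the end.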
proc textContent(node: BorrowedNode): string =
  let newlinesRegex = newlinesRegex()
  for (evType, node) in cmarkTree(node):
    let ntype = nodeType(node)
    if ntype == ntText or ntype == ntCode:
      result &= nodeContent(node)
    elif int64(ntype) < CMARK_NODE_TYPE_INLINE and evType == etExit and ntype != ntItem:
      result &= "\n"
    elif ntype == ntSoftBreak:
      result &= " "
    elif ntype == ntLineBreak:
      result &= "\n"
  replace(strip(result), newlinesRegex, "\n")
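
# Walk up the tree to the enclosing paragraph; this assumes one exists, which
# should hold for the text nodes containing wikilinks it is called on.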
proc findParagraphParent(node: BorrowedNode): BorrowedNode =
  result = node
  while nodeType(result) != ntParagraph: result = parentNode(result)
type
  Link* = object
    target*, text*, context*: string
  ParsedPage* = object
    links*: seq[Link]
    #fullText*: string
# Generates context for a link given the surrounding string and its position in it
# Takes a given quantity of space-separated words from both sides
# If not enough exist on one side, takes more from the other
# TODO: treat a wikilink as one token
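# e.g. with lookaround = 2, the link [[x]] in "one two three [[x]] four five six"
# keeps "two three" and "four five" around the link, with "..." marking the
# truncated remainder on each side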
proc linkContext(str: string, startPos: int, endPos: int, lookaround: int): string =
  var earlierToks = if startPos > 0: splitWhitespace(str[0..<startPos]) else: @[]
  var linkText = str[startPos..endPos]
  var laterToks = if endPos < str.len: splitWhitespace(str[endPos + 1..^1]) else: @[]
  let bdlook = lookaround * 2
  result =
    # both are longer than necessary so take tokens symmetrically
    if earlierToks.len >= lookaround and laterToks.len >= lookaround:
      earlierToks[^lookaround..^1].join(" ") & linkText & laterToks[0..<lookaround].join(" ")
    # later is shorter than wanted, take more from earlier
    elif earlierToks.len >= lookaround and laterToks.len < lookaround:
      earlierToks[max(earlierToks.len - bdlook + laterToks.len, 0)..^1].join(" ") & linkText & laterToks.join(" ")
    # mirrored version of previous case
    elif earlierToks.len < lookaround and laterToks.len >= lookaround:
      earlierToks.join(" ") & linkText & laterToks[0..<min(bdlook - earlierToks.len, laterToks.len)].join(" ")
    # both too short, use all of both
    else: earlierToks.join(" ") & linkText & laterToks.join(" ")
  # TODO: optimize
  if not result.startsWith(earlierToks.join(" ")): result = "... " & result
  if not result.endsWith(laterToks.join(" ")): result = result & " ..."
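
# Collect every distinct wikilink in a document (first occurrence wins), along
# with its display text and some surrounding plaintext context.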
proc parsePage*(input: string): ParsedPage =
  let wlRegex = wlRegex()
  let opt = CMARK_OPT_UNSAFE or CMARK_OPT_FOOTNOTES or CMARK_OPT_STRIKETHROUGH_DOUBLE_TILDE or CMARK_OPT_TABLE_PREFER_STYLE_ATTRIBUTES
  let parser = newParser(opt, @["table", "strikethrough"])
  let doc = parse(parser, input)
  var wikilinks: seq[Link] = @[]
  var seenPages: HashSet[string]
  for (evType, node) in cmarkTree(borrow(doc)):
    if nodeType(node) == ntText:
      let ntext = nodeContent(node)
      let matches = findAll(ntext, wlRegex)
      if len(matches) > 0:
        let paragraph = textContent(findParagraphParent(node))
        var matchEnd = 0
        for match in matches:
          let page = ntext[match.captures[0][0]]
          let linkText =
            if len(match.captures[1]) > 0: ntext[match.captures[1][0]]
            else: page
          let canonicalPage = slugToPage(page)
          if not (canonicalPage in seenPages):
            # matches in this text node will not necessarily line up with ones in the surrounding textual content, so look up the wikilink's source in the paragraph
            # kind of hacky but should work in any scenario which isn't deliberately constructed pathologically, especially since it will only return stuff after the last link
            let fullLink = ntext[match.boundaries]
            let matchInParagraph = find(paragraph, fullLink, matchEnd)
            matchEnd = matchInParagraph + fullLink.len - 1
            let context = linkContext(paragraph, matchInParagraph, matchEnd, 12)
            # add to wikilinks list, and deduplicate
            wikilinks.add(Link(target: canonicalPage, text: linkText, context: context))
            seenPages.incl(canonicalPage)
  ParsedPage(links: wikilinks) #fullText: textContent(borrow(doc)))