mirror of
https://github.com/Jermolene/TiddlyWiki5
synced 2024-12-24 00:50:28 +00:00
Refactor utilities out of HTML parser
Some of the functions are useful general purpose parser helpers.
This commit is contained in:
parent
bd4a031df8
commit
ace57dd205
268
core/modules/parsers/parseutils.js
Normal file
268
core/modules/parsers/parseutils.js
Normal file
@ -0,0 +1,268 @@
|
||||
/*\
|
||||
title: $:/core/modules/utils/parseutils.js
|
||||
type: application/javascript
|
||||
module-type: utils
|
||||
|
||||
Utility functions concerned with parsing text into tokens.
|
||||
|
||||
Most functions have the following pattern:
|
||||
|
||||
* The parameters are:
|
||||
** `source`: the source string being parsed
|
||||
** `pos`: the current parse position within the string
|
||||
** Any further parameters are used to identify the token that is being parsed
|
||||
* The return value is:
|
||||
** null if the token was not found at the specified position
|
||||
** an object representing the token with the following standard fields:
|
||||
*** `type`: string indicating the type of the token
|
||||
*** `start`: start position of the token in the source string
|
||||
*** `end`: end position of the token in the source string
|
||||
*** Any further fields required to describe the token
|
||||
|
||||
The exception is `skipWhiteSpace`, which just returns the position after the whitespace.
|
||||
|
||||
\*/
|
||||
(function(){
|
||||
|
||||
/*jslint node: true, browser: true */
|
||||
/*global $tw: false */
|
||||
"use strict";
|
||||
|
||||
/*
|
||||
Look for a whitespace token. Returns null if not found, otherwise returns {type: "whitespace", start:, end:,}
|
||||
*/
|
||||
exports.parseWhiteSpace = function(source,pos) {
|
||||
var node = {
|
||||
type: "whitespace",
|
||||
start: pos
|
||||
};
|
||||
var re = /(\s)+/g;
|
||||
re.lastIndex = pos;
|
||||
var match = re.exec(source);
|
||||
if(match && match.index === pos) {
|
||||
node.end = pos + match[0].length;
|
||||
return node;
|
||||
}
|
||||
return null;
|
||||
};
|
||||
|
||||
/*
|
||||
Convenience wrapper for parseWhiteSpace. Returns the position after the whitespace
|
||||
*/
|
||||
exports.skipWhiteSpace = function(source,pos) {
|
||||
var whitespace = $tw.utils.parseWhiteSpace(source,pos);
|
||||
if(whitespace) {
|
||||
return whitespace.end;
|
||||
}
|
||||
return pos;
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a given string token. Returns null if not found, otherwise returns {type: "token", value:, start:, end:,}
|
||||
*/
|
||||
exports.parseTokenString = function(source,pos,token) {
|
||||
var match = source.indexOf(token,pos) === pos;
|
||||
if(match) {
|
||||
return {
|
||||
type: "token",
|
||||
value: token,
|
||||
start: pos,
|
||||
end: pos + token.length
|
||||
};
|
||||
}
|
||||
return null;
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a token matching a regex. Returns null if not found, otherwise returns {type: "regexp", match:, start:, end:,}
|
||||
*/
|
||||
exports.parseTokenRegExp = function(source,pos,reToken) {
|
||||
var node = {
|
||||
type: "regexp",
|
||||
start: pos
|
||||
};
|
||||
reToken.lastIndex = pos;
|
||||
node.match = reToken.exec(source);
|
||||
if(node.match && node.match.index === pos) {
|
||||
node.end = pos + node.match[0].length;
|
||||
return node;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a string literal. Returns null if not found, otherwise returns {type: "string", value:, start:, end:,}
|
||||
*/
|
||||
exports.parseStringLiteral = function(source,pos) {
|
||||
var node = {
|
||||
type: "string",
|
||||
start: pos
|
||||
};
|
||||
var reString = /(?:"([^"]*)")|(?:'([^']*)')/g;
|
||||
reString.lastIndex = pos;
|
||||
var match = reString.exec(source);
|
||||
if(match && match.index === pos) {
|
||||
node.value = match[1] === undefined ? match[2] : match[1];
|
||||
node.end = pos + match[0].length;
|
||||
return node;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a macro invocation parameter. Returns null if not found, or {type: "macro-parameter", name:, value:, start:, end:}
|
||||
*/
|
||||
exports.parseMacroParameter = function(source,pos) {
|
||||
var node = {
|
||||
type: "macro-parameter",
|
||||
start: pos
|
||||
};
|
||||
// Define our regexp
|
||||
var reMacroParameter = /(?:([A-Za-z0-9\-_]+)\s*:)?(?:\s*(?:"([^"]*)"|'([^']*)'|\[\[([^\]]*)\]\]|([^\s>"'=]+)))/g;
|
||||
// Skip whitespace
|
||||
pos = $tw.utils.skipWhiteSpace(source,pos);
|
||||
// Look for the parameter
|
||||
var token = $tw.utils.parseTokenRegExp(source,pos,reMacroParameter);
|
||||
if(!token) {
|
||||
return null;
|
||||
}
|
||||
pos = token.end;
|
||||
// Get the parameter details
|
||||
node.value = token.match[2] !== undefined ? token.match[2] : (
|
||||
token.match[3] !== undefined ? token.match[3] : (
|
||||
token.match[4] !== undefined ? token.match[4] : (
|
||||
token.match[5] !== undefined ? token.match[5] : (
|
||||
""
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
if(token.match[1]) {
|
||||
node.name = token.match[1];
|
||||
}
|
||||
// Update the end position
|
||||
node.end = pos;
|
||||
return node;
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a macro invocation. Returns null if not found, or {type: "macrocall", name:, parameters:, start:, end:}
|
||||
*/
|
||||
exports.parseMacroInvocation = function(source,pos) {
|
||||
var node = {
|
||||
type: "macrocall",
|
||||
start: pos,
|
||||
params: []
|
||||
};
|
||||
// Define our regexps
|
||||
var reMacroName = /([^\s>"'=]+)/g;
|
||||
// Skip whitespace
|
||||
pos = $tw.utils.skipWhiteSpace(source,pos);
|
||||
// Look for a double less than sign
|
||||
var token = $tw.utils.parseTokenString(source,pos,"<<");
|
||||
if(!token) {
|
||||
return null;
|
||||
}
|
||||
pos = token.end;
|
||||
// Get the macro name
|
||||
var name = $tw.utils.parseTokenRegExp(source,pos,reMacroName);
|
||||
if(!name) {
|
||||
return null;
|
||||
}
|
||||
node.name = name.match[1];
|
||||
pos = name.end;
|
||||
// Process parameters
|
||||
var parameter = $tw.utils.parseMacroParameter(source,pos);
|
||||
while(parameter) {
|
||||
node.params.push(parameter);
|
||||
pos = parameter.end;
|
||||
// Get the next parameter
|
||||
parameter = $tw.utils.parseMacroParameter(source,pos);
|
||||
}
|
||||
// Skip whitespace
|
||||
pos = $tw.utils.skipWhiteSpace(source,pos);
|
||||
// Look for a double greater than sign
|
||||
token = $tw.utils.parseTokenString(source,pos,">>");
|
||||
if(!token) {
|
||||
return null;
|
||||
}
|
||||
pos = token.end;
|
||||
// Update the end position
|
||||
node.end = pos;
|
||||
return node;
|
||||
};
|
||||
|
||||
/*
|
||||
Look for an HTML attribute definition. Returns null if not found, otherwise returns {type: "attribute", name:, valueType: "string|indirect|macro", value:, start:, end:,}
|
||||
*/
|
||||
exports.parseAttribute = function(source,pos) {
|
||||
var node = {
|
||||
start: pos
|
||||
};
|
||||
// Define our regexps
|
||||
var reAttributeName = /([^\/\s>"'=]+)/g,
|
||||
reUnquotedAttribute = /([^\/\s<>"'=]+)/g,
|
||||
reIndirectValue = /\{\{([^\}]+)\}\}/g;
|
||||
// Skip whitespace
|
||||
pos = $tw.utils.skipWhiteSpace(source,pos);
|
||||
// Get the attribute name
|
||||
var name = $tw.utils.parseTokenRegExp(source,pos,reAttributeName);
|
||||
if(!name) {
|
||||
return null;
|
||||
}
|
||||
node.name = name.match[1];
|
||||
pos = name.end;
|
||||
// Skip whitespace
|
||||
pos = $tw.utils.skipWhiteSpace(source,pos);
|
||||
// Look for an equals sign
|
||||
var token = $tw.utils.parseTokenString(source,pos,"=");
|
||||
if(token) {
|
||||
pos = token.end;
|
||||
// Skip whitespace
|
||||
pos = $tw.utils.skipWhiteSpace(source,pos);
|
||||
// Look for a string literal
|
||||
var stringLiteral = $tw.utils.parseStringLiteral(source,pos);
|
||||
if(stringLiteral) {
|
||||
pos = stringLiteral.end;
|
||||
node.type = "string";
|
||||
node.value = stringLiteral.value;
|
||||
} else {
|
||||
// Look for an indirect value
|
||||
var indirectValue = $tw.utils.parseTokenRegExp(source,pos,reIndirectValue);
|
||||
if(indirectValue) {
|
||||
pos = indirectValue.end;
|
||||
node.type = "indirect";
|
||||
node.textReference = indirectValue.match[1];
|
||||
} else {
|
||||
// Look for a unquoted value
|
||||
var unquotedValue = $tw.utils.parseTokenRegExp(source,pos,reUnquotedAttribute);
|
||||
if(unquotedValue) {
|
||||
pos = unquotedValue.end;
|
||||
node.type = "string";
|
||||
node.value = unquotedValue.match[1];
|
||||
} else {
|
||||
// Look for a macro invocation value
|
||||
var macroInvocation = $tw.utils.parseMacroInvocation(source,pos);
|
||||
if(macroInvocation) {
|
||||
pos = macroInvocation.end;
|
||||
node.type = "macro";
|
||||
node.value = macroInvocation;
|
||||
} else {
|
||||
node.type = "string";
|
||||
node.value = "true";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
node.type = "string";
|
||||
node.value = "true";
|
||||
}
|
||||
// Update the end position
|
||||
node.end = pos;
|
||||
return node;
|
||||
};
|
||||
|
||||
})();
|
@ -48,7 +48,7 @@ exports.parse = function() {
|
||||
// Advance the parser position to past the tag
|
||||
this.parser.pos = tag.end;
|
||||
// Check for an immediately following double linebreak
|
||||
var hasLineBreak = !tag.isSelfClosing && !!this.parseTokenRegExp(this.parser.source,this.parser.pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g);
|
||||
var hasLineBreak = !tag.isSelfClosing && !!$tw.utils.parseTokenRegExp(this.parser.source,this.parser.pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g);
|
||||
// Set whether we're in block mode
|
||||
tag.isBlock = this.is.block || hasLineBreak;
|
||||
// Parse the body if we need to
|
||||
@ -71,244 +71,7 @@ exports.parse = function() {
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a whitespace token. Returns null if not found, otherwise returns {type: "whitespace", start:, end:,}
|
||||
*/
|
||||
exports.parseWhiteSpace = function(source,pos) {
|
||||
var node = {
|
||||
type: "whitespace",
|
||||
start: pos
|
||||
};
|
||||
var re = /(\s)+/g;
|
||||
re.lastIndex = pos;
|
||||
var match = re.exec(source);
|
||||
if(match && match.index === pos) {
|
||||
node.end = pos + match[0].length;
|
||||
return node;
|
||||
}
|
||||
return null;
|
||||
};
|
||||
|
||||
/*
|
||||
Convenience wrapper for parseWhiteSpace
|
||||
*/
|
||||
exports.skipWhiteSpace = function(source,pos) {
|
||||
var whitespace = this.parseWhiteSpace(source,pos);
|
||||
if(whitespace) {
|
||||
return whitespace.end;
|
||||
}
|
||||
return pos;
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a given string token. Returns null if not found, otherwise returns {type: "token", value:, start:, end:,}
|
||||
*/
|
||||
exports.parseTokenString = function(source,pos,token) {
|
||||
var match = source.indexOf(token,pos) === pos;
|
||||
if(match) {
|
||||
return {
|
||||
type: "token",
|
||||
value: token,
|
||||
start: pos,
|
||||
end: pos + token.length
|
||||
};
|
||||
}
|
||||
return null;
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a token matching a regex. Returns null if not found, otherwise returns {type: "regexp", match:, start:, end:,}
|
||||
*/
|
||||
exports.parseTokenRegExp = function(source,pos,reToken) {
|
||||
var node = {
|
||||
type: "regexp",
|
||||
start: pos
|
||||
};
|
||||
reToken.lastIndex = pos;
|
||||
node.match = reToken.exec(source);
|
||||
if(node.match && node.match.index === pos) {
|
||||
node.end = pos + node.match[0].length;
|
||||
return node;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a string literal. Returns null if not found, otherwise returns {type: "string", value:, start:, end:,}
|
||||
*/
|
||||
exports.parseStringLiteral = function(source,pos) {
|
||||
var node = {
|
||||
type: "string",
|
||||
start: pos
|
||||
};
|
||||
var reString = /(?:"([^"]*)")|(?:'([^']*)')/g;
|
||||
reString.lastIndex = pos;
|
||||
var match = reString.exec(source);
|
||||
if(match && match.index === pos) {
|
||||
node.value = match[1] === undefined ? match[2] : match[1];
|
||||
node.end = pos + match[0].length;
|
||||
return node;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a macro invocation parameter. Returns null if not found, or {type: "macro-parameter", name:, value:, start:, end:}
|
||||
*/
|
||||
exports.parseMacroParameter = function(source,pos) {
|
||||
var node = {
|
||||
type: "macro-parameter",
|
||||
start: pos
|
||||
};
|
||||
// Define our regexp
|
||||
var reMacroParameter = /(?:([A-Za-z0-9\-_]+)\s*:)?(?:\s*(?:"([^"]*)"|'([^']*)'|\[\[([^\]]*)\]\]|([^\s>"'=]+)))/g;
|
||||
// Skip whitespace
|
||||
pos = this.skipWhiteSpace(source,pos);
|
||||
// Look for the parameter
|
||||
var token = this.parseTokenRegExp(source,pos,reMacroParameter);
|
||||
if(!token) {
|
||||
return null;
|
||||
}
|
||||
pos = token.end;
|
||||
// Get the parameter details
|
||||
node.value = token.match[2] !== undefined ? token.match[2] : (
|
||||
token.match[3] !== undefined ? token.match[3] : (
|
||||
token.match[4] !== undefined ? token.match[4] : (
|
||||
token.match[5] !== undefined ? token.match[5] : (
|
||||
""
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
if(token.match[1]) {
|
||||
node.name = token.match[1];
|
||||
}
|
||||
// Update the end position
|
||||
node.end = pos;
|
||||
return node;
|
||||
};
|
||||
|
||||
/*
|
||||
Look for a macro invocation. Returns null if not found, or {type: "macrocall", name:, parameters:, start:, end:}
|
||||
*/
|
||||
exports.parseMacroInvocation = function(source,pos) {
|
||||
var node = {
|
||||
type: "macrocall",
|
||||
start: pos,
|
||||
params: []
|
||||
};
|
||||
// Define our regexps
|
||||
var reMacroName = /([^\s>"'=]+)/g;
|
||||
// Skip whitespace
|
||||
pos = this.skipWhiteSpace(source,pos);
|
||||
// Look for a double less than sign
|
||||
var token = this.parseTokenString(source,pos,"<<");
|
||||
if(!token) {
|
||||
return null;
|
||||
}
|
||||
pos = token.end;
|
||||
// Get the macro name
|
||||
var name = this.parseTokenRegExp(source,pos,reMacroName);
|
||||
if(!name) {
|
||||
return null;
|
||||
}
|
||||
node.name = name.match[1];
|
||||
pos = name.end;
|
||||
// Process parameters
|
||||
var parameter = this.parseMacroParameter(source,pos);
|
||||
while(parameter) {
|
||||
node.params.push(parameter);
|
||||
pos = parameter.end;
|
||||
// Get the next parameter
|
||||
parameter = this.parseMacroParameter(source,pos);
|
||||
}
|
||||
// Skip whitespace
|
||||
pos = this.skipWhiteSpace(source,pos);
|
||||
// Look for a double greater than sign
|
||||
token = this.parseTokenString(source,pos,">>");
|
||||
if(!token) {
|
||||
return null;
|
||||
}
|
||||
pos = token.end;
|
||||
// Update the end position
|
||||
node.end = pos;
|
||||
return node;
|
||||
};
|
||||
|
||||
/*
|
||||
Look for an HTML attribute definition. Returns null if not found, otherwise returns {type: "attribute", name:, valueType: "string|indirect|macro", value:, start:, end:,}
|
||||
*/
|
||||
exports.parseAttribute = function(source,pos) {
|
||||
var node = {
|
||||
start: pos
|
||||
};
|
||||
// Define our regexps
|
||||
var reAttributeName = /([^\/\s>"'=]+)/g,
|
||||
reUnquotedAttribute = /([^\/\s<>"'=]+)/g,
|
||||
reIndirectValue = /\{\{([^\}]+)\}\}/g;
|
||||
// Skip whitespace
|
||||
pos = this.skipWhiteSpace(source,pos);
|
||||
// Get the attribute name
|
||||
var name = this.parseTokenRegExp(source,pos,reAttributeName);
|
||||
if(!name) {
|
||||
return null;
|
||||
}
|
||||
node.name = name.match[1];
|
||||
pos = name.end;
|
||||
// Skip whitespace
|
||||
pos = this.skipWhiteSpace(source,pos);
|
||||
// Look for an equals sign
|
||||
var token = this.parseTokenString(source,pos,"=");
|
||||
if(token) {
|
||||
pos = token.end;
|
||||
// Skip whitespace
|
||||
pos = this.skipWhiteSpace(source,pos);
|
||||
// Look for a string literal
|
||||
var stringLiteral = this.parseStringLiteral(source,pos);
|
||||
if(stringLiteral) {
|
||||
pos = stringLiteral.end;
|
||||
node.type = "string";
|
||||
node.value = stringLiteral.value;
|
||||
} else {
|
||||
// Look for an indirect value
|
||||
var indirectValue = this.parseTokenRegExp(source,pos,reIndirectValue);
|
||||
if(indirectValue) {
|
||||
pos = indirectValue.end;
|
||||
node.type = "indirect";
|
||||
node.textReference = indirectValue.match[1];
|
||||
} else {
|
||||
// Look for a unquoted value
|
||||
var unquotedValue = this.parseTokenRegExp(source,pos,reUnquotedAttribute);
|
||||
if(unquotedValue) {
|
||||
pos = unquotedValue.end;
|
||||
node.type = "string";
|
||||
node.value = unquotedValue.match[1];
|
||||
} else {
|
||||
// Look for a macro invocation value
|
||||
var macroInvocation = this.parseMacroInvocation(source,pos);
|
||||
if(macroInvocation) {
|
||||
pos = macroInvocation.end;
|
||||
node.type = "macro";
|
||||
node.value = macroInvocation;
|
||||
} else {
|
||||
node.type = "string";
|
||||
node.value = "true";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
node.type = "string";
|
||||
node.value = "true";
|
||||
}
|
||||
// Update the end position
|
||||
node.end = pos;
|
||||
return node;
|
||||
};
|
||||
|
||||
/*
|
||||
Look for an HTML tag. Returns null if not found, otherwise returns {type: "tag", name:, attributes: [], isSelfClosing:, start:, end:,}
|
||||
Look for an HTML tag. Returns null if not found, otherwise returns {type: "element", name:, attributes: [], isSelfClosing:, start:, end:,}
|
||||
*/
|
||||
exports.parseTag = function(source,pos,options) {
|
||||
options = options || {};
|
||||
@ -321,45 +84,45 @@ exports.parseTag = function(source,pos,options) {
|
||||
// Define our regexps
|
||||
var reTagName = /([a-zA-Z0-9\-\$]+)/g;
|
||||
// Skip whitespace
|
||||
pos = this.skipWhiteSpace(source,pos);
|
||||
pos = $tw.utils.skipWhiteSpace(source,pos);
|
||||
// Look for a less than sign
|
||||
token = this.parseTokenString(source,pos,"<");
|
||||
token = $tw.utils.parseTokenString(source,pos,"<");
|
||||
if(!token) {
|
||||
return null;
|
||||
}
|
||||
pos = token.end;
|
||||
// Get the tag name
|
||||
token = this.parseTokenRegExp(source,pos,reTagName);
|
||||
token = $tw.utils.parseTokenRegExp(source,pos,reTagName);
|
||||
if(!token) {
|
||||
return null;
|
||||
}
|
||||
node.tag = token.match[1];
|
||||
pos = token.end;
|
||||
// Process attributes
|
||||
var attribute = this.parseAttribute(source,pos);
|
||||
var attribute = $tw.utils.parseAttribute(source,pos);
|
||||
while(attribute) {
|
||||
node.attributes[attribute.name] = attribute;
|
||||
pos = attribute.end;
|
||||
// Get the next attribute
|
||||
attribute = this.parseAttribute(source,pos);
|
||||
attribute = $tw.utils.parseAttribute(source,pos);
|
||||
}
|
||||
// Skip whitespace
|
||||
pos = this.skipWhiteSpace(source,pos);
|
||||
pos = $tw.utils.skipWhiteSpace(source,pos);
|
||||
// Look for a closing slash
|
||||
token = this.parseTokenString(source,pos,"/");
|
||||
token = $tw.utils.parseTokenString(source,pos,"/");
|
||||
if(token) {
|
||||
pos = token.end;
|
||||
node.isSelfClosing = true;
|
||||
}
|
||||
// Look for a greater than sign
|
||||
token = this.parseTokenString(source,pos,">");
|
||||
token = $tw.utils.parseTokenString(source,pos,">");
|
||||
if(!token) {
|
||||
return null;
|
||||
}
|
||||
pos = token.end;
|
||||
// Check for a required line break
|
||||
if(options.requireLineBreak) {
|
||||
token = this.parseTokenRegExp(source,pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g);
|
||||
token = $tw.utils.parseTokenRegExp(source,pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g);
|
||||
if(!token) {
|
||||
return null;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user