Refactor utilities out of HTML parser

Some of the functions are useful general-purpose parser helpers.
2025-11-07 11:03:01 +00:00 · 2014-04-17 12:00:32 +01:00
parent bd4a031df8
commit ace57dd205
2 changed files with 279 additions and 248 deletions
--- a/core/modules/parsers/parseutils.js
+++ b/core/modules/parsers/parseutils.js
@@ -0,0 +1,268 @@
 /*\
 title: $:/core/modules/utils/parseutils.js
 type: application/javascript
 module-type: utils
 Utility functions concerned with parsing text into tokens.
 Most functions have the following pattern:
 * The parameters are:
 ** `source`: the source string being parsed
 ** `pos`: the current parse position within the string
 ** Any further parameters are used to identify the token that is being parsed
 * The return value is:
 ** null if the token was not found at the specified position
 ** an object representing the token with the following standard fields:
 *** `type`: string indicating the type of the token
 *** `start`: start position of the token in the source string
 *** `end`: position in the source string immediately after the last character of the token
 *** Any further fields required to describe the token
 The exception is `skipWhiteSpace`, which just returns the position after the whitespace.
 \*/
 (function(){
 /*jslint node: true, browser: true */
 /*global $tw: false */
 "use strict";
 /*
 Look for a whitespace token. Returns null if not found, otherwise returns {type: "whitespace", start:, end:,}
 */
 exports.parseWhiteSpace = function(source,pos) {
 	var node = {
 		type: "whitespace",
 		start: pos
 	};
 	var re = /(\s)+/g;
 	re.lastIndex = pos;
 	var match = re.exec(source);
 	if(match && match.index === pos) {
 		node.end = pos + match[0].length;
 		return node;
 	}
 	return null;
 };
 /*
 Convenience wrapper for parseWhiteSpace. Returns the position just past any whitespace starting at `pos`, or `pos` itself when there is none
 */
 exports.skipWhiteSpace = function(source,pos) {
 	var run = $tw.utils.parseWhiteSpace(source,pos);
 	return run ? run.end : pos;
 };
 /*
 Look for a given string token. Returns null if not found, otherwise returns {type: "token", value:, start:, end:,}
 */
 exports.parseTokenString = function(source,pos,token) {
 	var match = source.indexOf(token,pos) === pos;
 	if(match) {
 		return {
 			type: "token",
 			value: token,
 			start: pos,
 			end: pos + token.length
 		};
 	}
 	return null;
 };
 /*
 Look for a token matching a regex. Returns null if not found, otherwise returns {type: "regexp", match:, start:, end:,}
 */
 exports.parseTokenRegExp = function(source,pos,reToken) {
 	var node = {
 		type: "regexp",
 		start: pos
 	};
 	reToken.lastIndex = pos;
 	node.match = reToken.exec(source);
 	if(node.match && node.match.index === pos) {
 		node.end = pos + node.match[0].length;
 		return node;
 	} else {
 		return null;
 	}
 };
 /*
 Look for a string literal. Returns null if not found, otherwise returns {type: "string", value:, start:, end:,}
 */
 exports.parseStringLiteral = function(source,pos) {
 	var node = {
 		type: "string",
 		start: pos
 	};
 	var reString = /(?:"([^"]*)")|(?:'([^']*)')/g;
 	reString.lastIndex = pos;
 	var match = reString.exec(source);
 	if(match && match.index === pos) {
 		node.value = match[1] === undefined ? match[2] : match[1];
 		node.end = pos + match[0].length;
 		return node;
 	} else {
 		return null;
 	}
 };
 /*
 Look for a macro invocation parameter. Returns null if not found, or {type: "macro-parameter", name:, value:, start:, end:}
 */
 exports.parseMacroParameter = function(source,pos) {
 	var node = {
 		type: "macro-parameter",
 		start: pos
 	};
 	// Define our regexp
 	var reMacroParameter = /(?:([A-Za-z0-9\-_]+)\s*:)?(?:\s*(?:"([^"]*)"|'([^']*)'|\[\[([^\]]*)\]\]|([^\s>"'=]+)))/g;
 	// Skip whitespace
 	pos = $tw.utils.skipWhiteSpace(source,pos);
 	// Look for the parameter
 	var token = $tw.utils.parseTokenRegExp(source,pos,reMacroParameter);
 	if(!token) {
 		return null;
 	}
 	pos = token.end;
 	// Get the parameter details
 	node.value = token.match[2] !== undefined ? token.match[2] : (
 					token.match[3] !== undefined ? token.match[3] : (
 						token.match[4] !== undefined ? token.match[4] : (
 							token.match[5] !== undefined ? token.match[5] : (
 								""
 							)
 						)
 					)
 				);
 	if(token.match[1]) {
 		node.name = token.match[1];
 	}
 	// Update the end position
 	node.end = pos;
 	return node;
 };
 /*
 Look for a macro invocation. Returns null if not found, or {type: "macrocall", name:, parameters:, start:, end:}
 */
 exports.parseMacroInvocation = function(source,pos) {
 	var node = {
 		type: "macrocall",
 		start: pos,
 		params: []
 	};
 	// Define our regexps
 	var reMacroName = /([^\s>"'=]+)/g;
 	// Skip whitespace
 	pos = $tw.utils.skipWhiteSpace(source,pos);
 	// Look for a double less than sign
 	var token = $tw.utils.parseTokenString(source,pos,"<<");
 	if(!token) {
 		return null;
 	}
 	pos = token.end;
 	// Get the macro name
 	var name = $tw.utils.parseTokenRegExp(source,pos,reMacroName);
 	if(!name) {
 		return null;
 	}
 	node.name = name.match[1];
 	pos = name.end;
 	// Process parameters
 	var parameter = $tw.utils.parseMacroParameter(source,pos);
 	while(parameter) {
 		node.params.push(parameter);
 		pos = parameter.end;
 		// Get the next parameter
 		parameter = $tw.utils.parseMacroParameter(source,pos);
 	}
 	// Skip whitespace
 	pos = $tw.utils.skipWhiteSpace(source,pos);
 	// Look for a double greater than sign
 	token = $tw.utils.parseTokenString(source,pos,">>");
 	if(!token) {
 		return null;
 	}
 	pos = token.end;
 	// Update the end position
 	node.end = pos;
 	return node;
 };
 /*
 Look for an HTML attribute definition. Returns null if not found, otherwise returns {type: "attribute", name:, valueType: "string|indirect|macro", value:, start:, end:,}
 */
 exports.parseAttribute = function(source,pos) {
 	var node = {
 		start: pos
 	};
 	// Define our regexps
 	var reAttributeName = /([^\/\s>"'=]+)/g,
 		reUnquotedAttribute = /([^\/\s<>"'=]+)/g,
 		reIndirectValue = /\{\{([^\}]+)\}\}/g;
 	// Skip whitespace
 	pos = $tw.utils.skipWhiteSpace(source,pos);
 	// Get the attribute name
 	var name = $tw.utils.parseTokenRegExp(source,pos,reAttributeName);
 	if(!name) {
 		return null;
 	}
 	node.name = name.match[1];
 	pos = name.end;
 	// Skip whitespace
 	pos = $tw.utils.skipWhiteSpace(source,pos);
 	// Look for an equals sign
 	var token = $tw.utils.parseTokenString(source,pos,"=");
 	if(token) {
 		pos = token.end;
 		// Skip whitespace
 		pos = $tw.utils.skipWhiteSpace(source,pos);
 		// Look for a string literal
 		var stringLiteral = $tw.utils.parseStringLiteral(source,pos);
 		if(stringLiteral) {
 			pos = stringLiteral.end;
 			node.type = "string";
 			node.value = stringLiteral.value;
 		} else {
 			// Look for an indirect value
 			var indirectValue = $tw.utils.parseTokenRegExp(source,pos,reIndirectValue);
 			if(indirectValue) {
 				pos = indirectValue.end;
 				node.type = "indirect";
 				node.textReference = indirectValue.match[1];
 			} else {
 				// Look for a unquoted value
 				var unquotedValue = $tw.utils.parseTokenRegExp(source,pos,reUnquotedAttribute);
 				if(unquotedValue) {
 					pos = unquotedValue.end;
 					node.type = "string";
 					node.value = unquotedValue.match[1];
 				} else {
 					// Look for a macro invocation value
 					var macroInvocation = $tw.utils.parseMacroInvocation(source,pos);
 					if(macroInvocation) {
 						pos = macroInvocation.end;
 						node.type = "macro";
 						node.value = macroInvocation;
 					} else {
 						node.type = "string";
 						node.value = "true";
 					}
 				}
 			}
 		}
 	} else {
 		node.type = "string";
 		node.value = "true";
 	}
 	// Update the end position
 	node.end = pos;
 	return node;
 };
 })();
--- a/core/modules/parsers/wikiparser/rules/html.js
+++ b/core/modules/parsers/wikiparser/rules/html.js
@@ -48,7 +48,7 @@ exports.parse = function() {
 	// Advance the parser position to past the tag
 	this.parser.pos = tag.end;
 	// Check for an immediately following double linebreak
-	var hasLineBreak = !tag.isSelfClosing && !!this.parseTokenRegExp(this.parser.source,this.parser.pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g);
+	var hasLineBreak = !tag.isSelfClosing && !!$tw.utils.parseTokenRegExp(this.parser.source,this.parser.pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g);
 	// Set whether we're in block mode
 	tag.isBlock = this.is.block || hasLineBreak;
 	// Parse the body if we need to
@@ -71,244 +71,7 @@ exports.parse = function() {
 };
 /*
-Look for a whitespace token. Returns null if not found, otherwise returns {type: "whitespace", start:, end:,}
+Look for an HTML tag. Returns null if not found, otherwise returns {type: "element", name:, attributes: [], isSelfClosing:, start:, end:,}
 */
 exports.parseWhiteSpace = function(source,pos) {
 	var node = {
 		type: "whitespace",
 		start: pos
 	};
 	var re = /(\s)+/g;
 	re.lastIndex = pos;
 	var match = re.exec(source);
 	if(match && match.index === pos) {
 		node.end = pos + match[0].length;
 		return node;
 	}
 	return null;
 };
 /*
 Convenience wrapper for parseWhiteSpace
 */
 exports.skipWhiteSpace = function(source,pos) {
 	var whitespace = this.parseWhiteSpace(source,pos);
 	if(whitespace) {
 		return whitespace.end;
 	}
 	return pos;
 };
 /*
 Look for a given string token. Returns null if not found, otherwise returns {type: "token", value:, start:, end:,}
 */
 exports.parseTokenString = function(source,pos,token) {
 	var match = source.indexOf(token,pos) === pos;
 	if(match) {
 		return {
 			type: "token",
 			value: token,
 			start: pos,
 			end: pos + token.length
 		};
 	}
 	return null;
 };
 /*
 Look for a token matching a regex. Returns null if not found, otherwise returns {type: "regexp", match:, start:, end:,}
 */
 exports.parseTokenRegExp = function(source,pos,reToken) {
 	var node = {
 		type: "regexp",
 		start: pos
 	};
 	reToken.lastIndex = pos;
 	node.match = reToken.exec(source);
 	if(node.match && node.match.index === pos) {
 		node.end = pos + node.match[0].length;
 		return node;
 	} else {
 		return null;
 	}
 };
 /*
 Look for a string literal. Returns null if not found, otherwise returns {type: "string", value:, start:, end:,}
 */
 exports.parseStringLiteral = function(source,pos) {
 	var node = {
 		type: "string",
 		start: pos
 	};
 	var reString = /(?:"([^"]*)")|(?:'([^']*)')/g;
 	reString.lastIndex = pos;
 	var match = reString.exec(source);
 	if(match && match.index === pos) {
 		node.value = match[1] === undefined ? match[2] : match[1];
 		node.end = pos + match[0].length;
 		return node;
 	} else {
 		return null;
 	}
 };
 /*
 Look for a macro invocation parameter. Returns null if not found, or {type: "macro-parameter", name:, value:, start:, end:}
 */
 exports.parseMacroParameter = function(source,pos) {
 	var node = {
 		type: "macro-parameter",
 		start: pos
 	};
 	// Define our regexp
 	var reMacroParameter = /(?:([A-Za-z0-9\-_]+)\s*:)?(?:\s*(?:"([^"]*)"|'([^']*)'|\[\[([^\]]*)\]\]|([^\s>"'=]+)))/g;
 	// Skip whitespace
 	pos = this.skipWhiteSpace(source,pos);
 	// Look for the parameter
 	var token = this.parseTokenRegExp(source,pos,reMacroParameter);
 	if(!token) {
 		return null;
 	}
 	pos = token.end;
 	// Get the parameter details
 	node.value = token.match[2] !== undefined ? token.match[2] : (
 					token.match[3] !== undefined ? token.match[3] : (
 						token.match[4] !== undefined ? token.match[4] : (
 							token.match[5] !== undefined ? token.match[5] : (
 								""
 							)
 						)
 					)
 				);
 	if(token.match[1]) {
 		node.name = token.match[1];
 	}
 	// Update the end position
 	node.end = pos;
 	return node;
 };
 /*
 Look for a macro invocation. Returns null if not found, or {type: "macrocall", name:, parameters:, start:, end:}
 */
 exports.parseMacroInvocation = function(source,pos) {
 	var node = {
 		type: "macrocall",
 		start: pos,
 		params: []
 	};
 	// Define our regexps
 	var reMacroName = /([^\s>"'=]+)/g;
 	// Skip whitespace
 	pos = this.skipWhiteSpace(source,pos);
 	// Look for a double less than sign
 	var token = this.parseTokenString(source,pos,"<<");
 	if(!token) {
 		return null;
 	}
 	pos = token.end;
 	// Get the macro name
 	var name = this.parseTokenRegExp(source,pos,reMacroName);
 	if(!name) {
 		return null;
 	}
 	node.name = name.match[1];
 	pos = name.end;
 	// Process parameters
 	var parameter = this.parseMacroParameter(source,pos);
 	while(parameter) {
 		node.params.push(parameter);
 		pos = parameter.end;
 		// Get the next parameter
 		parameter = this.parseMacroParameter(source,pos);
 	}
 	// Skip whitespace
 	pos = this.skipWhiteSpace(source,pos);
 	// Look for a double greater than sign
 	token = this.parseTokenString(source,pos,">>");
 	if(!token) {
 		return null;
 	}
 	pos = token.end;
 	// Update the end position
 	node.end = pos;
 	return node;
 };
 /*
 Look for an HTML attribute definition. Returns null if not found, otherwise returns {type: "attribute", name:, valueType: "string|indirect|macro", value:, start:, end:,}
 */
 exports.parseAttribute = function(source,pos) {
 	var node = {
 		start: pos
 	};
 	// Define our regexps
 	var reAttributeName = /([^\/\s>"'=]+)/g,
 		reUnquotedAttribute = /([^\/\s<>"'=]+)/g,
 		reIndirectValue = /\{\{([^\}]+)\}\}/g;
 	// Skip whitespace
 	pos = this.skipWhiteSpace(source,pos);
 	// Get the attribute name
 	var name = this.parseTokenRegExp(source,pos,reAttributeName);
 	if(!name) {
 		return null;
 	}
 	node.name = name.match[1];
 	pos = name.end;
 	// Skip whitespace
 	pos = this.skipWhiteSpace(source,pos);
 	// Look for an equals sign
 	var token = this.parseTokenString(source,pos,"=");
 	if(token) {
 		pos = token.end;
 		// Skip whitespace
 		pos = this.skipWhiteSpace(source,pos);
 		// Look for a string literal
 		var stringLiteral = this.parseStringLiteral(source,pos);
 		if(stringLiteral) {
 			pos = stringLiteral.end;
 			node.type = "string";
 			node.value = stringLiteral.value;
 		} else {
 			// Look for an indirect value
 			var indirectValue = this.parseTokenRegExp(source,pos,reIndirectValue);
 			if(indirectValue) {
 				pos = indirectValue.end;
 				node.type = "indirect";
 				node.textReference = indirectValue.match[1];
 			} else {
 				// Look for a unquoted value
 				var unquotedValue = this.parseTokenRegExp(source,pos,reUnquotedAttribute);
 				if(unquotedValue) {
 					pos = unquotedValue.end;
 					node.type = "string";
 					node.value = unquotedValue.match[1];
 				} else {
 					// Look for a macro invocation value
 					var macroInvocation = this.parseMacroInvocation(source,pos);
 					if(macroInvocation) {
 						pos = macroInvocation.end;
 						node.type = "macro";
 						node.value = macroInvocation;
 					} else {
 						node.type = "string";
 						node.value = "true";
 					}
 				}
 			}
 		}
 	} else {
 		node.type = "string";
 		node.value = "true";
 	}
 	// Update the end position
 	node.end = pos;
 	return node;
 };
 /*
 Look for an HTML tag. Returns null if not found, otherwise returns {type: "tag", name:, attributes: [], isSelfClosing:, start:, end:,}
 */
 exports.parseTag = function(source,pos,options) {
 	options = options || {};
@@ -321,45 +84,45 @@ exports.parseTag = function(source,pos,options) {
 	// Define our regexps
 	var reTagName = /([a-zA-Z0-9\-\$]+)/g;
 	// Skip whitespace
-	pos = this.skipWhiteSpace(source,pos);
+	pos = $tw.utils.skipWhiteSpace(source,pos);
 	// Look for a less than sign
-	token = this.parseTokenString(source,pos,"<");
+	token = $tw.utils.parseTokenString(source,pos,"<");
 	if(!token) {
 		return null;
 	}
 	pos = token.end;
 	// Get the tag name
-	token = this.parseTokenRegExp(source,pos,reTagName);
+	token = $tw.utils.parseTokenRegExp(source,pos,reTagName);
 	if(!token) {
 		return null;
 	}
 	node.tag = token.match[1];
 	pos = token.end;
 	// Process attributes
-	var attribute = this.parseAttribute(source,pos);
+	var attribute = $tw.utils.parseAttribute(source,pos);
 	while(attribute) {
 		node.attributes[attribute.name] = attribute;
 		pos = attribute.end;
 		// Get the next attribute
-		attribute = this.parseAttribute(source,pos);
+		attribute = $tw.utils.parseAttribute(source,pos);
 	}
 	// Skip whitespace
-	pos = this.skipWhiteSpace(source,pos);
+	pos = $tw.utils.skipWhiteSpace(source,pos);
 	// Look for a closing slash
-	token = this.parseTokenString(source,pos,"/");
+	token = $tw.utils.parseTokenString(source,pos,"/");
 	if(token) {
 		pos = token.end;
 		node.isSelfClosing = true;
 	}
 	// Look for a greater than sign
-	token = this.parseTokenString(source,pos,">");
+	token = $tw.utils.parseTokenString(source,pos,">");
 	if(!token) {
 		return null;
 	}
 	pos = token.end;
 	// Check for a required line break
 	if(options.requireLineBreak) {
-		token = this.parseTokenRegExp(source,pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g);
+		token = $tw.utils.parseTokenRegExp(source,pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g);
 		if(!token) {
 			return null;
 		}