mirror of
https://github.com/Jermolene/TiddlyWiki5
synced 2024-12-02 22:39:56 +00:00
f6338d9109
It preserves comments and text positions, enabling us to do syntax highlighting. Hopefully.
589 lines
20 KiB
JavaScript
589 lines
20 KiB
JavaScript
/* vim: set sw=4 ts=4 et tw=78: */
|
|
/* ***** BEGIN LICENSE BLOCK *****
|
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
*
|
|
* The contents of this file are subject to the Mozilla Public License Version
|
|
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
* the License. You may obtain a copy of the License at
|
|
* http://www.mozilla.org/MPL/
|
|
*
|
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
* for the specific language governing rights and limitations under the
|
|
* License.
|
|
*
|
|
* The Original Code is the Narcissus JavaScript engine.
|
|
*
|
|
* The Initial Developer of the Original Code is
|
|
* Brendan Eich <brendan@mozilla.org>.
|
|
* Portions created by the Initial Developer are Copyright (C) 2004
|
|
* the Initial Developer. All Rights Reserved.
|
|
*
|
|
* Contributor(s):
|
|
* Tom Austin <taustin@ucsc.edu>
|
|
* Brendan Eich <brendan@mozilla.org>
|
|
* Shu-Yu Guo <shu@rfrn.org>
|
|
* Stephan Herhut <stephan.a.herhut@intel.com>
|
|
* Dave Herman <dherman@mozilla.com>
|
|
* Dimitris Vardoulakis <dimvar@ccs.neu.edu>
|
|
* Patrick Walton <pcwalton@mozilla.com>
|
|
*
|
|
* Alternatively, the contents of this file may be used under the terms of
|
|
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
* of those above. If you wish to allow use of your version of this file only
|
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
* use your version of this file under the terms of the MPL, indicate your
|
|
* decision by deleting the provisions above and replace them with the notice
|
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
* the provisions above, a recipient may use your version of this file under
|
|
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
*
|
|
* ***** END LICENSE BLOCK ***** */
|
|
|
|
/*
|
|
* Narcissus - JS implemented in JS.
|
|
*
|
|
* Lexical scanner.
|
|
*/
|
|
|
|
Narcissus.lexer = (function() {
|
|
|
|
// Shared lexer tables (token ids, keywords, operator names, whitespace).
var definitions = Narcissus.definitions;

// Set constants in the local scope.
// NOTE(review): presumably this is what brings END, NUMBER, LET, ... into
// scope for the rest of this closure; it must remain a direct eval.
eval(definitions.consts);

// Banned keywords by language version: a keyword id present in the table
// for the active version is lexed as a plain identifier instead.
const blackLists = { 160: {}, 185: {}, harmony: {} };
blackLists[160][LET] = true;
blackLists[160][MODULE] = true;
blackLists[160][YIELD] = true;
blackLists[185][MODULE] = true;

// Build up a trie of operator tokens, one level per character.  Every node
// on an operator's path is tagged with that operator, so an operator
// enumerated later overwrites the tag on any prefix node it shares.
var opTokens = {};
for (var op in definitions.opTypeNames) {
    // Newline and dot get dedicated handling in Tokenizer.get.
    if (op !== '\n' && op !== '.') {
        var node = opTokens;
        for (var i = 0; i < op.length; i++) {
            var ch = op.charAt(i);
            node = (ch in node) ? node[ch] : (node[ch] = {});
            node.op = op;
        }
    }
}
|
|
|
|
/*
 * Since JavaScript provides no convenient way to determine if a
 * character is in a particular Unicode category, we use
 * metacircularity to accomplish this (oh yeaaaah!)
 *
 * isValidIdentifierChar :: (ch, first) -> boolean
 *
 *   ch    - a one-character string (a single UTF-16 code unit).
 *   first - truthy when ch would start the identifier (digits are then
 *           rejected); falsy when it would continue one.
 *
 * Anything that is not a one-character string is reported as invalid.
 */
function isValidIdentifierChar(ch, first) {
    // Without this guard `undefined` would be stringified into the probe
    // below ("x.xundefined") and wrongly reported as valid, and longer
    // strings would be range-compared as if they were one character.
    if (typeof ch !== "string" || ch.length !== 1)
        return false;

    // check directly for ASCII
    if (ch <= "\u007F") {
        return (ch >= 'a' && ch <= 'z') ||
               (ch >= 'A' && ch <= 'Z') ||
               ch === '$' || ch === '_' ||
               (!first && (ch >= '0' && ch <= '9'));
    }

    // create an object to test this in: one property named by the bare
    // character and one named "x" + character
    var probe = {};
    probe["x" + ch] = true;
    probe[ch] = true;

    // then let the host engine parse a dotted property access using the
    // character.  A syntax error, or a parse that reads some other
    // property (e.g. when ch is whitespace), means it is not valid here.
    try {
        var source = "return (x." + (first ? "" : "x") + ch + ");";
        return Function("x", source)(probe) === true;
    } catch (ex) {
        // Deliberately swallowed: a failed parse is the "invalid" answer.
        return false;
    }
}
|
|
|
|
/*
 * isIdentifier :: any -> boolean
 *
 * True iff str is a non-empty string whose first character may start an
 * identifier and whose remaining characters may each continue one.
 */
function isIdentifier(str) {
    if (typeof str !== "string" || str.length === 0)
        return false;

    for (var pos = 0; pos < str.length; pos++) {
        // Only position 0 is checked with the stricter "first" rule.
        if (!isValidIdentifierChar(str[pos], pos === 0))
            return false;
    }

    return true;
}
|
|
|
|
/*
 * Tokenizer :: (source, filename, line number) -> Tokenizer
 *
 *   s - source text; coerced to a string.
 *   f - filename used in error messages; defaults to "".
 *   l - line number of the first line; defaults to 1.
 */
function Tokenizer(s, f, l) {
    // Input and read position.
    this.cursor = 0;
    this.source = String(s);

    // Token slots, indexed modulo 4 by get/unget/peek; lookahead counts
    // how many already-lexed tokens have been pushed back.
    this.tokens = [];
    this.tokenIndex = 0;
    this.lookahead = 0;

    // Lexing mode flags.
    this.scanNewlines = false;
    this.unexpectedEOF = false;

    // Position information for diagnostics.
    this.filename = f || "";
    this.lineno = l || 1;

    // Keywords demoted to identifiers for the configured language version.
    this.blackList = blackLists[Narcissus.options.version];

    // Filled in by skip(); null until the first skip() call.
    this.blockComments = null;
}
|
|
|
|
Tokenizer.prototype = {
|
|
get done() {
|
|
// We need to set scanOperand to true here because the first thing
|
|
// might be a regexp.
|
|
return this.peek(true) === END;
|
|
},
|
|
|
|
get token() {
|
|
return this.tokens[this.tokenIndex];
|
|
},
|
|
|
|
match: function (tt, scanOperand, keywordIsName) {
|
|
return this.get(scanOperand, keywordIsName) === tt || this.unget();
|
|
},
|
|
|
|
mustMatch: function (tt, keywordIsName) {
|
|
if (!this.match(tt, false, keywordIsName)) {
|
|
throw this.newSyntaxError("Missing " +
|
|
definitions.tokens[tt].toLowerCase());
|
|
}
|
|
return this.token;
|
|
},
|
|
|
|
peek: function (scanOperand) {
|
|
var tt, next;
|
|
if (this.lookahead) {
|
|
next = this.tokens[(this.tokenIndex + this.lookahead) & 3];
|
|
tt = (this.scanNewlines && next.lineno !== this.lineno)
|
|
? NEWLINE
|
|
: next.type;
|
|
} else {
|
|
tt = this.get(scanOperand);
|
|
this.unget();
|
|
}
|
|
return tt;
|
|
},
|
|
|
|
peekOnSameLine: function (scanOperand) {
|
|
this.scanNewlines = true;
|
|
var tt = this.peek(scanOperand);
|
|
this.scanNewlines = false;
|
|
return tt;
|
|
},
|
|
|
|
lastBlockComment: function() {
|
|
var length = this.blockComments.length;
|
|
return length ? this.blockComments[length - 1] : null;
|
|
},
|
|
|
|
// Eat comments and whitespace.
|
|
skip: function () {
|
|
var input = this.source;
|
|
this.blockComments = [];
|
|
for (;;) {
|
|
var ch = input[this.cursor++];
|
|
var next = input[this.cursor];
|
|
// handle \r, \r\n and (always preferable) \n
|
|
if (ch === '\r') {
|
|
// if the next character is \n, we don't care about this at all
|
|
if (next === '\n') continue;
|
|
|
|
// otherwise, we want to consider this as a newline
|
|
ch = '\n';
|
|
}
|
|
|
|
if (ch === '\n' && !this.scanNewlines) {
|
|
this.lineno++;
|
|
} else if (ch === '/' && next === '*') {
|
|
var commentStart = ++this.cursor;
|
|
for (;;) {
|
|
ch = input[this.cursor++];
|
|
if (ch === undefined)
|
|
throw this.newSyntaxError("Unterminated comment");
|
|
|
|
if (ch === '*') {
|
|
next = input[this.cursor];
|
|
if (next === '/') {
|
|
var commentEnd = this.cursor - 1;
|
|
this.cursor++;
|
|
break;
|
|
}
|
|
} else if (ch === '\n') {
|
|
this.lineno++;
|
|
}
|
|
}
|
|
this.blockComments.push(input.substring(commentStart, commentEnd));
|
|
} else if ((ch === '/' && next === '/') ||
|
|
(Narcissus.options.allowHTMLComments && ch === '<' && next === '!' &&
|
|
input[this.cursor + 1] === '-' && input[this.cursor + 2] === '-' &&
|
|
(this.cursor += 2))) {
|
|
this.cursor++;
|
|
for (;;) {
|
|
ch = input[this.cursor++];
|
|
next = input[this.cursor];
|
|
if (ch === undefined)
|
|
return;
|
|
|
|
if (ch === '\r') {
|
|
// check for \r\n
|
|
if (next !== '\n') ch = '\n';
|
|
}
|
|
|
|
if (ch === '\n') {
|
|
if (this.scanNewlines) {
|
|
this.cursor--;
|
|
} else {
|
|
this.lineno++;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
} else if (!(ch in definitions.whitespace)) {
|
|
this.cursor--;
|
|
return;
|
|
}
|
|
}
|
|
},
|
|
|
|
// Lex the exponential part of a number, if present. Return true iff an
|
|
// exponential part was found.
|
|
lexExponent: function() {
|
|
var input = this.source;
|
|
var next = input[this.cursor];
|
|
if (next === 'e' || next === 'E') {
|
|
this.cursor++;
|
|
ch = input[this.cursor++];
|
|
if (ch === '+' || ch === '-')
|
|
ch = input[this.cursor++];
|
|
|
|
if (ch < '0' || ch > '9')
|
|
throw this.newSyntaxError("Missing exponent");
|
|
|
|
do {
|
|
ch = input[this.cursor++];
|
|
} while (ch >= '0' && ch <= '9');
|
|
this.cursor--;
|
|
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
},
|
|
|
|
lexZeroNumber: function (ch) {
|
|
var token = this.token, input = this.source;
|
|
token.type = NUMBER;
|
|
|
|
ch = input[this.cursor++];
|
|
if (ch === '.') {
|
|
do {
|
|
ch = input[this.cursor++];
|
|
} while (ch >= '0' && ch <= '9');
|
|
this.cursor--;
|
|
|
|
this.lexExponent();
|
|
token.value = parseFloat(
|
|
input.substring(token.start, this.cursor));
|
|
} else if (ch === 'x' || ch === 'X') {
|
|
do {
|
|
ch = input[this.cursor++];
|
|
} while ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
|
|
(ch >= 'A' && ch <= 'F'));
|
|
this.cursor--;
|
|
|
|
token.value = parseInt(input.substring(token.start, this.cursor));
|
|
} else if (ch >= '0' && ch <= '7') {
|
|
do {
|
|
ch = input[this.cursor++];
|
|
} while (ch >= '0' && ch <= '7');
|
|
this.cursor--;
|
|
|
|
token.value = parseInt(input.substring(token.start, this.cursor));
|
|
} else {
|
|
this.cursor--;
|
|
this.lexExponent(); // 0E1, &c.
|
|
token.value = 0;
|
|
}
|
|
},
|
|
|
|
lexNumber: function (ch) {
|
|
var token = this.token, input = this.source;
|
|
token.type = NUMBER;
|
|
|
|
var floating = false;
|
|
do {
|
|
ch = input[this.cursor++];
|
|
if (ch === '.' && !floating) {
|
|
floating = true;
|
|
ch = input[this.cursor++];
|
|
}
|
|
} while (ch >= '0' && ch <= '9');
|
|
|
|
this.cursor--;
|
|
|
|
var exponent = this.lexExponent();
|
|
floating = floating || exponent;
|
|
|
|
var str = input.substring(token.start, this.cursor);
|
|
token.value = floating ? parseFloat(str) : parseInt(str);
|
|
},
|
|
|
|
lexDot: function (ch) {
|
|
var token = this.token, input = this.source;
|
|
var next = input[this.cursor];
|
|
if (next >= '0' && next <= '9') {
|
|
do {
|
|
ch = input[this.cursor++];
|
|
} while (ch >= '0' && ch <= '9');
|
|
this.cursor--;
|
|
|
|
this.lexExponent();
|
|
|
|
token.type = NUMBER;
|
|
token.value = parseFloat(
|
|
input.substring(token.start, this.cursor));
|
|
} else {
|
|
token.type = DOT;
|
|
token.assignOp = null;
|
|
token.value = '.';
|
|
}
|
|
},
|
|
|
|
lexString: function (ch) {
|
|
var token = this.token, input = this.source;
|
|
token.type = STRING;
|
|
|
|
var hasEscapes = false;
|
|
var delim = ch;
|
|
if (input.length <= this.cursor)
|
|
throw this.newSyntaxError("Unterminated string literal");
|
|
while ((ch = input[this.cursor++]) !== delim) {
|
|
if (this.cursor == input.length)
|
|
throw this.newSyntaxError("Unterminated string literal");
|
|
if (ch === '\\') {
|
|
hasEscapes = true;
|
|
if (++this.cursor == input.length)
|
|
throw this.newSyntaxError("Unterminated string literal");
|
|
}
|
|
}
|
|
|
|
token.value = hasEscapes
|
|
? eval(input.substring(token.start, this.cursor))
|
|
: input.substring(token.start + 1, this.cursor - 1);
|
|
},
|
|
|
|
lexRegExp: function (ch) {
|
|
var token = this.token, input = this.source;
|
|
token.type = REGEXP;
|
|
|
|
do {
|
|
ch = input[this.cursor++];
|
|
if (ch === '\\') {
|
|
this.cursor++;
|
|
} else if (ch === '[') {
|
|
do {
|
|
if (ch === undefined)
|
|
throw this.newSyntaxError("Unterminated character class");
|
|
|
|
if (ch === '\\')
|
|
this.cursor++;
|
|
|
|
ch = input[this.cursor++];
|
|
} while (ch !== ']');
|
|
} else if (ch === undefined) {
|
|
throw this.newSyntaxError("Unterminated regex");
|
|
}
|
|
} while (ch !== '/');
|
|
|
|
do {
|
|
ch = input[this.cursor++];
|
|
} while (ch >= 'a' && ch <= 'z');
|
|
|
|
this.cursor--;
|
|
|
|
token.value = eval(input.substring(token.start, this.cursor));
|
|
},
|
|
|
|
lexOp: function (ch) {
|
|
var token = this.token, input = this.source;
|
|
|
|
// A bit ugly, but it seems wasteful to write a trie lookup routine
|
|
// for only 3 characters...
|
|
var node = opTokens[ch];
|
|
var next = input[this.cursor];
|
|
if (next in node) {
|
|
node = node[next];
|
|
this.cursor++;
|
|
next = input[this.cursor];
|
|
if (next in node) {
|
|
node = node[next];
|
|
this.cursor++;
|
|
next = input[this.cursor];
|
|
}
|
|
}
|
|
|
|
var op = node.op;
|
|
if (definitions.assignOps[op] && input[this.cursor] === '=') {
|
|
this.cursor++;
|
|
token.type = ASSIGN;
|
|
token.assignOp = definitions.tokenIds[definitions.opTypeNames[op]];
|
|
op += '=';
|
|
} else {
|
|
token.type = definitions.tokenIds[definitions.opTypeNames[op]];
|
|
token.assignOp = null;
|
|
}
|
|
|
|
token.value = op;
|
|
},
|
|
|
|
// FIXME: Unicode escape sequences
|
|
lexIdent: function (ch, keywordIsName) {
|
|
var token = this.token;
|
|
var id = ch;
|
|
|
|
while ((ch = this.getValidIdentifierChar(false)) !== null) {
|
|
id += ch;
|
|
}
|
|
|
|
token.type = IDENTIFIER;
|
|
token.value = id;
|
|
|
|
if (keywordIsName)
|
|
return;
|
|
|
|
var kw = definitions.keywords[id];
|
|
if (kw && !(kw in this.blackList))
|
|
token.type = kw;
|
|
},
|
|
|
|
/*
|
|
* Tokenizer.get :: [boolean[, boolean]] -> token type
|
|
*
|
|
* Consume input *only* if there is no lookahead.
|
|
* Dispatch to the appropriate lexing function depending on the input.
|
|
*/
|
|
get: function (scanOperand, keywordIsName) {
|
|
var token;
|
|
while (this.lookahead) {
|
|
--this.lookahead;
|
|
this.tokenIndex = (this.tokenIndex + 1) & 3;
|
|
token = this.tokens[this.tokenIndex];
|
|
if (token.type !== NEWLINE || this.scanNewlines)
|
|
return token.type;
|
|
}
|
|
|
|
this.skip();
|
|
|
|
this.tokenIndex = (this.tokenIndex + 1) & 3;
|
|
token = this.tokens[this.tokenIndex];
|
|
if (!token)
|
|
this.tokens[this.tokenIndex] = token = {};
|
|
|
|
var input = this.source;
|
|
if (this.cursor >= input.length)
|
|
return token.type = END;
|
|
|
|
token.start = this.cursor;
|
|
token.lineno = this.lineno;
|
|
|
|
var ich = this.getValidIdentifierChar(true);
|
|
var ch = (ich === null) ? input[this.cursor++] : null;
|
|
if (ich !== null) {
|
|
this.lexIdent(ich, keywordIsName);
|
|
} else if (scanOperand && ch === '/') {
|
|
this.lexRegExp(ch);
|
|
} else if (ch in opTokens) {
|
|
this.lexOp(ch);
|
|
} else if (ch === '.') {
|
|
this.lexDot(ch);
|
|
} else if (ch >= '1' && ch <= '9') {
|
|
this.lexNumber(ch);
|
|
} else if (ch === '0') {
|
|
this.lexZeroNumber(ch);
|
|
} else if (ch === '"' || ch === "'") {
|
|
this.lexString(ch);
|
|
} else if (this.scanNewlines && (ch === '\n' || ch === '\r')) {
|
|
// if this was a \r, look for \r\n
|
|
if (ch === '\r' && input[this.cursor] === '\n') this.cursor++;
|
|
token.type = NEWLINE;
|
|
token.value = '\n';
|
|
this.lineno++;
|
|
} else {
|
|
throw this.newSyntaxError("Illegal token");
|
|
}
|
|
|
|
token.end = this.cursor;
|
|
return token.type;
|
|
},
|
|
|
|
/*
|
|
* Tokenizer.unget :: void -> undefined
|
|
*
|
|
* Match depends on unget returning undefined.
|
|
*/
|
|
unget: function () {
|
|
if (++this.lookahead === 4) throw "PANIC: too much lookahead!";
|
|
this.tokenIndex = (this.tokenIndex - 1) & 3;
|
|
},
|
|
|
|
newSyntaxError: function (m) {
|
|
m = (this.filename ? this.filename + ":" : "") + this.lineno + ": " + m;
|
|
var e = new SyntaxError(m, this.filename, this.lineno);
|
|
e.source = this.source;
|
|
e.cursor = this.lookahead
|
|
? this.tokens[(this.tokenIndex + this.lookahead) & 3].start
|
|
: this.cursor;
|
|
return e;
|
|
},
|
|
|
|
|
|
/* Gets a single valid identifier char from the input stream, or null
|
|
* if there is none.
|
|
*/
|
|
getValidIdentifierChar: function(first) {
|
|
var input = this.source;
|
|
if (this.cursor >= input.length) return null;
|
|
var ch = input[this.cursor];
|
|
|
|
// first check for \u escapes
|
|
if (ch === '\\' && input[this.cursor+1] === 'u') {
|
|
// get the character value
|
|
try {
|
|
ch = String.fromCharCode(parseInt(
|
|
input.substring(this.cursor + 2, this.cursor + 6),
|
|
16));
|
|
} catch (ex) {
|
|
return null;
|
|
}
|
|
this.cursor += 5;
|
|
}
|
|
|
|
var valid = isValidIdentifierChar(ch, first);
|
|
if (valid) this.cursor++;
|
|
return (valid ? ch : null);
|
|
},
|
|
};
|
|
|
|
|
|
return {
|
|
isIdentifier: isIdentifier,
|
|
Tokenizer: Tokenizer
|
|
};
|
|
|
|
}());
|