diff --git a/editions/text-slicer/tiddlers/Sample Text.tid b/editions/text-slicer/tiddlers/Sample Text.tid index b44e6a9ae..3848e73b5 100644 --- a/editions/text-slicer/tiddlers/Sample Text.tid +++ b/editions/text-slicer/tiddlers/Sample Text.tid @@ -1,7 +1,9 @@ title: Sample Text type: text/html - + + + @@ -9,19 +11,19 @@ type: text/html

Introduction to TiddlyWiki

-Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

-

What is TiddlyWiki used for?

+

What is TiddlyWiki used for?

Dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

-

What's with all the < angle brackets

+

What's with all the < angle brackets

-An opportunity to see how characters that need HTML encoding (like < angle brackets) are dealt with. +An opportunity to see how characters that need HTML encoding (like < angle brackets) are dealt with.

Who uses TiddlyWiki?

@@ -30,9 +32,9 @@ An opportunity to see how characters that need HTML encoding (like < angle brack Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

-

+

Consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. -

+

Who makes TiddlyWiki?

@@ -152,7 +154,7 @@ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
  • - +
  • diff --git a/editions/text-slicer/tiddlywiki.info b/editions/text-slicer/tiddlywiki.info index 11e1808de..7906ba510 100644 --- a/editions/text-slicer/tiddlywiki.info +++ b/editions/text-slicer/tiddlywiki.info @@ -2,7 +2,7 @@ "description": "Tools for slicing up long texts into individual tiddlers", "plugins": [ "tiddlywiki/text-slicer", - "tiddlywiki/xmldom" + "tiddlywiki/sax" ], "languages": [ ], diff --git a/plugins/tiddlywiki/sax/files/LICENSE b/plugins/tiddlywiki/sax/files/LICENSE new file mode 100644 index 000000000..ccffa082c --- /dev/null +++ b/plugins/tiddlywiki/sax/files/LICENSE @@ -0,0 +1,41 @@ +The ISC License + +Copyright (c) Isaac Z. Schlueter and Contributors + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR +IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +==== + +`String.fromCodePoint` by Mathias Bynens used according to terms of MIT +License, as follows: + + Copyright Mathias Bynens + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/plugins/tiddlywiki/sax/files/README.md b/plugins/tiddlywiki/sax/files/README.md new file mode 100644 index 000000000..afcd3f3dd --- /dev/null +++ b/plugins/tiddlywiki/sax/files/README.md @@ -0,0 +1,225 @@ +# sax js + +A sax-style parser for XML and HTML. + +Designed with [node](http://nodejs.org/) in mind, but should work fine in +the browser or other CommonJS implementations. + +## What This Is + +* A very simple tool to parse through an XML string. +* A stepping stone to a streaming HTML parser. +* A handy way to deal with RSS and other mostly-ok-but-kinda-broken XML + docs. + +## What This Is (probably) Not + +* An HTML Parser - That's a fine goal, but this isn't it. It's just + XML. +* A DOM Builder - You can use it to build an object model out of XML, + but it doesn't do that out of the box. +* XSLT - No DOM = no querying. +* 100% Compliant with (some other SAX implementation) - Most SAX + implementations are in Java and do a lot more than this does. +* An XML Validator - It does a little validation when in strict mode, but + not much. +* A Schema-Aware XSD Thing - Schemas are an exercise in fetishistic + masochism. +* A DTD-aware Thing - Fetching DTDs is a much bigger job. + +## Regarding `Hello, world!').close(); + +// stream usage +// takes the same options as the parser +var saxStream = require("sax").createStream(strict, options) +saxStream.on("error", function (e) { + // unhandled errors will throw, since this is a proper node + // event emitter. + console.error("error!", e) + // clear the error + this._parser.error = null + this._parser.resume() +}) +saxStream.on("opentag", function (node) { + // same object as above +}) +// pipe is supported, and it's readable/writable +// same chunks coming in also go out. +fs.createReadStream("file.xml") + .pipe(saxStream) + .pipe(fs.createWriteStream("file-copy.xml")) +``` + + +## Arguments + +Pass the following arguments to the parser function. All are optional. + +`strict` - Boolean. Whether or not to be a jerk. Default: `false`. + +`opt` - Object bag of settings regarding string formatting. All default to `false`. + +Settings supported: + +* `trim` - Boolean. Whether or not to trim text and comment nodes. +* `normalize` - Boolean. If true, then turn any whitespace into a single + space. +* `lowercase` - Boolean. If true, then lowercase tag names and attribute names + in loose mode, rather than uppercasing them. +* `xmlns` - Boolean. If true, then namespaces are supported. +* `position` - Boolean. If false, then don't track line/col/position. +* `strictEntities` - Boolean. If true, only parse [predefined XML + entities](http://www.w3.org/TR/REC-xml/#sec-predefined-ent) + (`&`, `'`, `>`, `<`, and `"`) + +## Methods + +`write` - Write bytes onto the stream. You don't have to do this all at +once. You can keep writing as much as you want. + +`close` - Close the stream. Once closed, no more data may be written until +it is done processing the buffer, which is signaled by the `end` event. + +`resume` - To gracefully handle errors, assign a listener to the `error` +event. Then, when the error is taken care of, you can call `resume` to +continue parsing. Otherwise, the parser will not continue while in an error +state. + +## Members + +At all times, the parser object will have the following members: + +`line`, `column`, `position` - Indications of the position in the XML +document where the parser currently is looking. + +`startTagPosition` - Indicates the position where the current tag starts. + +`closed` - Boolean indicating whether or not the parser can be written to. +If it's `true`, then wait for the `ready` event to write again. + +`strict` - Boolean indicating whether or not the parser is a jerk. + +`opt` - Any options passed into the constructor. + +`tag` - The current tag being dealt with. + +And a bunch of other stuff that you probably shouldn't touch. + +## Events + +All events emit with a single argument. To listen to an event, assign a +function to `on`. Functions get executed in the this-context of +the parser object. The list of supported events are also in the exported +`EVENTS` array. + +When using the stream interface, assign handlers using the EventEmitter +`on` function in the normal fashion. + +`error` - Indication that something bad happened. The error will be hanging +out on `parser.error`, and must be deleted before parsing can continue. By +listening to this event, you can keep an eye on that kind of stuff. Note: +this happens *much* more in strict mode. Argument: instance of `Error`. + +`text` - Text node. Argument: string of text. + +`doctype` - The ``. Argument: +object with `name` and `body` members. Attributes are not parsed, as +processing instructions have implementation dependent semantics. + +`sgmldeclaration` - Random SGML declarations. Stuff like `` +would trigger this kind of event. This is a weird thing to support, so it +might go away at some point. SAX isn't intended to be used to parse SGML, +after all. + +`opentagstart` - Emitted immediately when the tag name is available, +but before any attributes are encountered. Argument: object with a +`name` field and an empty `attributes` set. Note that this is the +same object that will later be emitted in the `opentag` event. + +`opentag` - An opening tag. Argument: object with `name` and `attributes`. +In non-strict mode, tag names are uppercased, unless the `lowercase` +option is set. If the `xmlns` option is set, then it will contain +namespace binding information on the `ns` member, and will have a +`local`, `prefix`, and `uri` member. + +`closetag` - A closing tag. In loose mode, tags are auto-closed if their +parent closes. In strict mode, well-formedness is enforced. Note that +self-closing tags will have `closeTag` emitted immediately after `openTag`. +Argument: tag name. + +`attribute` - An attribute node. Argument: object with `name` and `value`. +In non-strict mode, attribute names are uppercased, unless the `lowercase` +option is set. If the `xmlns` option is set, it will also contains namespace +information. + +`comment` - A comment node. Argument: the string of the comment. + +`opencdata` - The opening tag of a ``) of a `` tags trigger a `"script"` +event, and their contents are not checked for special xml characters. +If you pass `noscript: true`, then this behavior is suppressed. + +## Reporting Problems + +It's best to write a failing test if you find an issue. I will always +accept pull requests with failing tests if they demonstrate intended +behavior, but it is very hard to figure out what issue you're describing +without a test. Writing a test is also the best way for you yourself +to figure out if you really understand the issue you think you have with +sax-js. diff --git a/plugins/tiddlywiki/sax/files/lib/sax.js b/plugins/tiddlywiki/sax/files/lib/sax.js new file mode 100644 index 000000000..795d607ef --- /dev/null +++ b/plugins/tiddlywiki/sax/files/lib/sax.js @@ -0,0 +1,1565 @@ +;(function (sax) { // wrapper for non-node envs + sax.parser = function (strict, opt) { return new SAXParser(strict, opt) } + sax.SAXParser = SAXParser + sax.SAXStream = SAXStream + sax.createStream = createStream + + // When we pass the MAX_BUFFER_LENGTH position, start checking for buffer overruns. + // When we check, schedule the next check for MAX_BUFFER_LENGTH - (max(buffer lengths)), + // since that's the earliest that a buffer overrun could occur. This way, checks are + // as rare as required, but as often as necessary to ensure never crossing this bound. + // Furthermore, buffers are only tested at most once per write(), so passing a very + // large string into write() might have undesirable effects, but this is manageable by + // the caller, so it is assumed to be safe. Thus, a call to write() may, in the extreme + // edge case, result in creating at most one complete copy of the string passed in. + // Set to Infinity to have unlimited buffers. + sax.MAX_BUFFER_LENGTH = 64 * 1024 + + var buffers = [ + 'comment', 'sgmlDecl', 'textNode', 'tagName', 'doctype', + 'procInstName', 'procInstBody', 'entity', 'attribName', + 'attribValue', 'cdata', 'script' + ] + + sax.EVENTS = [ + 'text', + 'processinginstruction', + 'sgmldeclaration', + 'doctype', + 'comment', + 'opentagstart', + 'attribute', + 'opentag', + 'closetag', + 'opencdata', + 'cdata', + 'closecdata', + 'error', + 'end', + 'ready', + 'script', + 'opennamespace', + 'closenamespace' + ] + + function SAXParser (strict, opt) { + if (!(this instanceof SAXParser)) { + return new SAXParser(strict, opt) + } + + var parser = this + clearBuffers(parser) + parser.q = parser.c = '' + parser.bufferCheckPosition = sax.MAX_BUFFER_LENGTH + parser.opt = opt || {} + parser.opt.lowercase = parser.opt.lowercase || parser.opt.lowercasetags + parser.looseCase = parser.opt.lowercase ? 'toLowerCase' : 'toUpperCase' + parser.tags = [] + parser.closed = parser.closedRoot = parser.sawRoot = false + parser.tag = parser.error = null + parser.strict = !!strict + parser.noscript = !!(strict || parser.opt.noscript) + parser.state = S.BEGIN + parser.strictEntities = parser.opt.strictEntities + parser.ENTITIES = parser.strictEntities ? Object.create(sax.XML_ENTITIES) : Object.create(sax.ENTITIES) + parser.attribList = [] + + // namespaces form a prototype chain. + // it always points at the current tag, + // which protos to its parent tag. + if (parser.opt.xmlns) { + parser.ns = Object.create(rootNS) + } + + // mostly just for error reporting + parser.trackPosition = parser.opt.position !== false + if (parser.trackPosition) { + parser.position = parser.line = parser.column = 0 + } + emit(parser, 'onready') + } + + if (!Object.create) { + Object.create = function (o) { + function F () {} + F.prototype = o + var newf = new F() + return newf + } + } + + if (!Object.keys) { + Object.keys = function (o) { + var a = [] + for (var i in o) if (o.hasOwnProperty(i)) a.push(i) + return a + } + } + + function checkBufferLength (parser) { + var maxAllowed = Math.max(sax.MAX_BUFFER_LENGTH, 10) + var maxActual = 0 + for (var i = 0, l = buffers.length; i < l; i++) { + var len = parser[buffers[i]].length + if (len > maxAllowed) { + // Text/cdata nodes can get big, and since they're buffered, + // we can get here under normal conditions. + // Avoid issues by emitting the text node now, + // so at least it won't get any bigger. + switch (buffers[i]) { + case 'textNode': + closeText(parser) + break + + case 'cdata': + emitNode(parser, 'oncdata', parser.cdata) + parser.cdata = '' + break + + case 'script': + emitNode(parser, 'onscript', parser.script) + parser.script = '' + break + + default: + error(parser, 'Max buffer length exceeded: ' + buffers[i]) + } + } + maxActual = Math.max(maxActual, len) + } + // schedule the next check for the earliest possible buffer overrun. + var m = sax.MAX_BUFFER_LENGTH - maxActual + parser.bufferCheckPosition = m + parser.position + } + + function clearBuffers (parser) { + for (var i = 0, l = buffers.length; i < l; i++) { + parser[buffers[i]] = '' + } + } + + function flushBuffers (parser) { + closeText(parser) + if (parser.cdata !== '') { + emitNode(parser, 'oncdata', parser.cdata) + parser.cdata = '' + } + if (parser.script !== '') { + emitNode(parser, 'onscript', parser.script) + parser.script = '' + } + } + + SAXParser.prototype = { + end: function () { end(this) }, + write: write, + resume: function () { this.error = null; return this }, + close: function () { return this.write(null) }, + flush: function () { flushBuffers(this) } + } + + var Stream + try { + Stream = require('stream').Stream + } catch (ex) { + Stream = function () {} + } + + var streamWraps = sax.EVENTS.filter(function (ev) { + return ev !== 'error' && ev !== 'end' + }) + + function createStream (strict, opt) { + return new SAXStream(strict, opt) + } + + function SAXStream (strict, opt) { + if (!(this instanceof SAXStream)) { + return new SAXStream(strict, opt) + } + + Stream.apply(this) + + this._parser = new SAXParser(strict, opt) + this.writable = true + this.readable = true + + var me = this + + this._parser.onend = function () { + me.emit('end') + } + + this._parser.onerror = function (er) { + me.emit('error', er) + + // if didn't throw, then means error was handled. + // go ahead and clear error, so we can write again. + me._parser.error = null + } + + this._decoder = null + + streamWraps.forEach(function (ev) { + Object.defineProperty(me, 'on' + ev, { + get: function () { + return me._parser['on' + ev] + }, + set: function (h) { + if (!h) { + me.removeAllListeners(ev) + me._parser['on' + ev] = h + return h + } + me.on(ev, h) + }, + enumerable: true, + configurable: false + }) + }) + } + + SAXStream.prototype = Object.create(Stream.prototype, { + constructor: { + value: SAXStream + } + }) + + SAXStream.prototype.write = function (data) { + if (typeof Buffer === 'function' && + typeof Buffer.isBuffer === 'function' && + Buffer.isBuffer(data)) { + if (!this._decoder) { + var SD = require('string_decoder').StringDecoder + this._decoder = new SD('utf8') + } + data = this._decoder.write(data) + } + + this._parser.write(data.toString()) + this.emit('data', data) + return true + } + + SAXStream.prototype.end = function (chunk) { + if (chunk && chunk.length) { + this.write(chunk) + } + this._parser.end() + return true + } + + SAXStream.prototype.on = function (ev, handler) { + var me = this + if (!me._parser['on' + ev] && streamWraps.indexOf(ev) !== -1) { + me._parser['on' + ev] = function () { + var args = arguments.length === 1 ? [arguments[0]] : Array.apply(null, arguments) + args.splice(0, 0, ev) + me.emit.apply(me, args) + } + } + + return Stream.prototype.on.call(me, ev, handler) + } + + // this really needs to be replaced with character classes. + // XML allows all manner of ridiculous numbers and digits. + var CDATA = '[CDATA[' + var DOCTYPE = 'DOCTYPE' + var XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace' + var XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/' + var rootNS = { xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE } + + // http://www.w3.org/TR/REC-xml/#NT-NameStartChar + // This implementation works on strings, a single character at a time + // as such, it cannot ever support astral-plane characters (10000-EFFFF) + // without a significant breaking change to either this parser, or the + // JavaScript language. Implementation of an emoji-capable xml parser + // is left as an exercise for the reader. + var nameStart = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/ + + var nameBody = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/ + + var entityStart = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/ + var entityBody = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/ + + function isWhitespace (c) { + return c === ' ' || c === '\n' || c === '\r' || c === '\t' + } + + function isQuote (c) { + return c === '"' || c === '\'' + } + + function isAttribEnd (c) { + return c === '>' || isWhitespace(c) + } + + function isMatch (regex, c) { + return regex.test(c) + } + + function notMatch (regex, c) { + return !isMatch(regex, c) + } + + var S = 0 + sax.STATE = { + BEGIN: S++, // leading byte order mark or whitespace + BEGIN_WHITESPACE: S++, // leading whitespace + TEXT: S++, // general stuff + TEXT_ENTITY: S++, // & and such. + OPEN_WAKA: S++, // < + SGML_DECL: S++, // + SCRIPT: S++, //