Add tagged captures for a better (more correct) form of look behind.

This commit is contained in:
Calvin Rose 2019-01-16 22:38:11 -05:00
parent 612a245961
commit 2c1b506213
3 changed files with 234 additions and 154 deletions

View File

@ -128,23 +128,28 @@ a piece of text, some forms may push Janet values onto the capture stack as a si
text matches the main peg language, `(peg/match)` will return the final capture stack as an array.
Capture specials will only push captures to the capture stack if their child pattern matches the text.
Most captures specials will match the same text as their first argument pattern.
Most capture specials will match the same text as their first argument pattern. Also, most specials
that produce captures can take an optional argument `tag` that applies a keyword tag to the capture.
These tagged captures can then be recaptured via the `(backref tag)` special in subsequent matches.
Tagged captures, when combined with the `(cmt)` special, provide a powerful form of look-behind
that can make many grammars simpler.
| Pattern Signature | What it captures |
| ------- | ---------------- |
| `(capture patt)` | Captures all of the text in patt if patt matches, If patt contains any captures, then those captures will be pushed to the capture stack before the total text. |
| `(<- patt)` | Alias for `(capture patt)` |
| `(group patt) ` | Captures an array of all of the captures in patt.
| `(replace patt subst)` | Replaces the captures produced by patt by applying subst to them. If subst is a table or struct, will push `(get subst last-capture)` to the capture stack after removing the old captures. If a subst is a function, will call subst with the captures of patt as arguments and push the result to the capture stack. Otherwise, will push subst literally to the capture stack. |
| `(/ patt subst)` | Alias for `(replace patt subst)` |
| `(constant k)` | Captures a constant value and advances no characters. |
| `(argument n)` | Captures the nth extra argument to the match function and does not advance. |
| `(position)` | Captures the current index into the text and advances no input. |
| `($)` | Alias for `(position)`. |
| `(accumulate patt)` | Capture a string that is the concatenation of all captures in patt. This will try to be efficient and not create intermediate strings if possible. |
| `(% patt)` | Alias for `(accumulate patt)`
| `(cmt patt fun)` | Invokes fun with all of the captures of patt as arguments (if patt matches). If the result is truthy, then captures the result. The whole expression fails if fun returns false or nil. |
| `(backref n)` | Duplicates the nth capture and pushes it to the stack again (0 is the first capture). If n is negative, indexes from the top of the stack (-1 pushes the previously captured value to the stack). If n does not map to a valid stack index then the match fails. |
| `(capture patt ?tag)` | Captures all of the text in patt if patt matches. If patt contains any captures, then those captures will be pushed to the capture stack before the total text. |
| `(<- patt ?tag)` | Alias for `(capture patt ?tag)` |
| `(group patt ?tag)` | Captures an array of all of the captures in patt. |
| `(replace patt subst ?tag)` | Replaces the captures produced by patt by applying subst to them. If subst is a table or struct, will push `(get subst last-capture)` to the capture stack after removing the old captures. If a subst is a function, will call subst with the captures of patt as arguments and push the result to the capture stack. Otherwise, will push subst literally to the capture stack. |
| `(/ patt subst ?tag)` | Alias for `(replace patt subst ?tag)` |
| `(constant k ?tag)` | Captures a constant value and advances no characters. |
| `(argument n ?tag)` | Captures the nth extra argument to the match function and does not advance. |
| `(position ?tag)` | Captures the current index into the text and advances no input. |
| `($ ?tag)` | Alias for `(position ?tag)`. |
| `(accumulate patt ?tag)` | Capture a string that is the concatenation of all captures in patt. This will try to be efficient and not create intermediate strings if possible. |
| `(% patt ?tag)` | Alias for `(accumulate patt ?tag)`. |
| `(cmt patt fun ?tag)` | Invokes fun with all of the captures of patt as arguments (if patt matches). If the result is truthy, then captures the result. The whole expression fails if fun returns false or nil. |
| `(backref tag ?tag)` | Duplicates the last capture with the tag `tag`. If no such capture exists then the match fails. |
| `(-> tag ?tag)` | Alias for `(backref tag ?tag)`. |
| `(error patt)` | Throws a Janet error if patt matches. The error thrown will be the last capture of patt, or a generic error if patt produces no captures. |
## Grammars and Recursion

View File

@ -43,14 +43,15 @@ typedef enum {
RULE_IFNOT, /* [rule_a, rule_b (b if not a)] */
RULE_NOT, /* [rule] */
RULE_BETWEEN, /* [lo, hi, rule] */
RULE_CAPTURE, /* [rule] */
RULE_GETTAG, /* [searchtag, tag] */
RULE_CAPTURE, /* [rule, tag] */
RULE_POSITION, /* [tag] */
RULE_ARGUMENT, /* [argument-index] */
RULE_CONSTANT, /* [constant] */
RULE_ACCUMULATE, /* [rule] */
RULE_GROUP, /* [rule] */
RULE_REPLACE, /* [rule, constant] */
RULE_MATCHTIME, /* [rule, constant] */
RULE_ARGUMENT, /* [argument-index, tag] */
RULE_CONSTANT, /* [constant, tag] */
RULE_ACCUMULATE, /* [rule, tag] */
RULE_GROUP, /* [rule, tag] */
RULE_REPLACE, /* [rule, constant, tag] */
RULE_MATCHTIME, /* [rule, constant, tag] */
RULE_ERROR, /* [rule] */
} Opcode;
@ -62,6 +63,7 @@ typedef struct {
const Janet *constants;
JanetArray *captures;
JanetBuffer *scratch;
JanetBuffer *tags;
const Janet *extrav;
int32_t extrac;
int32_t depth;
@ -92,14 +94,18 @@ static CapState cap_save(PegState *s) {
/* Roll the capture state back to a saved snapshot: the scratch buffer is
 * truncated to cs.scratch, and the capture array and the tag buffer are both
 * truncated to cs.cap so that they keep the same length. */
static void cap_load(PegState *s, CapState cs) {
    s->tags->count = cs.cap;
    s->captures->count = cs.cap;
    s->scratch->count = cs.scratch;
}
/* Add a capture.
 *
 * s       - match state; s->mode decides where the capture goes.
 * capture - the value being captured.
 * tag     - numeric capture tag, or 0 for an untagged capture.
 *
 * In accumulate mode the capture's string form is appended to the scratch
 * buffer. The value is pushed onto the capture stack in normal mode, and
 * also in accumulate mode when it carries a tag, so that a later tag lookup
 * can still find it. Every push onto the capture stack is paired with a push
 * of the tag byte onto s->tags, which keeps the two the same length. */
static void pushcap(PegState *s, Janet capture, uint32_t tag) {
    if (s->mode == PEG_MODE_ACCUMULATE)
        janet_to_string_b(s->scratch, capture);
    if (s->mode == PEG_MODE_NORMAL ||
            (tag && s->mode == PEG_MODE_ACCUMULATE)) {
        janet_array_push(s->captures, capture);
        /* Only the low 8 bits are stored. */
        janet_buffer_push_u8(s->tags, tag);
    }
}
/* Prevent stack overflow */
@ -267,9 +273,24 @@ tail:
return text;
}
/* Capturing rules */
case RULE_GETTAG:
{
uint32_t search = rule[1];
uint32_t tag = rule[2];
for (int32_t i = s->tags->count - 1; i >= 0; i--) {
if (s->tags->data[i] == search) {
pushcap(s, s->captures->data[i], tag);
return text;
}
}
return NULL;
}
case RULE_POSITION:
{
pushcap(s, janet_wrap_number((double)(text - s->text_start)));
pushcap(s, janet_wrap_number((double)(text - s->text_start)), rule[1]);
return text;
}
@ -277,19 +298,20 @@ tail:
{
int32_t index = ((int32_t *)rule)[1];
Janet capture = (index >= s->extrac) ? janet_wrap_nil() : s->extrav[index];
pushcap(s, capture);
pushcap(s, capture, rule[2]);
return text;
}
case RULE_CONSTANT:
{
pushcap(s, s->constants[rule[1]]);
pushcap(s, s->constants[rule[1]], rule[2]);
return text;
}
case RULE_CAPTURE:
{
if (s->mode == PEG_MODE_NOCAPTURE) {
uint32_t tag = rule[2];
if (!tag && s->mode == PEG_MODE_NOCAPTURE) {
rule = s->bytecode + rule[1];
goto tail;
}
@ -298,19 +320,20 @@ tail:
up1(s);
if (!result) return NULL;
/* Specialized pushcap - avoid intermediate string creation */
if (s->mode == PEG_MODE_ACCUMULATE) {
if (!tag && s->mode == PEG_MODE_ACCUMULATE) {
janet_buffer_push_bytes(s->scratch, text, (int32_t)(result - text));
} else {
janet_array_push(s->captures, janet_stringv(text, (int32_t)(result - text)));
pushcap(s, janet_stringv(text, (int32_t)(result - text)), tag);
}
return result;
}
case RULE_ACCUMULATE:
{
uint32_t tag = rule[2];
int oldmode = s->mode;
/* No capture mode, skip captures. Accumulate inside accumulate also does nothing. */
if (oldmode != PEG_MODE_NORMAL) {
if (!tag && oldmode != PEG_MODE_NORMAL) {
rule = s->bytecode + rule[1];
goto tail;
}
@ -323,14 +346,15 @@ tail:
if (!result) return NULL;
Janet cap = janet_stringv(s->scratch->data + cs.scratch, s->scratch->count - cs.scratch);
cap_load(s, cs);
pushcap(s, cap);
pushcap(s, cap, tag);
return result;
}
case RULE_GROUP:
{
uint32_t tag = rule[2];
int oldmode = s->mode;
if (oldmode == PEG_MODE_NOCAPTURE) {
if (!tag && oldmode == PEG_MODE_NOCAPTURE) {
rule = s->bytecode + rule[1];
goto tail;
}
@ -348,15 +372,16 @@ tail:
sizeof(Janet) * num_sub_captures);
sub_captures->count = num_sub_captures;
cap_load(s, cs);
pushcap(s, janet_wrap_array(sub_captures));
pushcap(s, janet_wrap_array(sub_captures), tag);
return result;
}
case RULE_REPLACE:
case RULE_MATCHTIME:
{
uint32_t tag = rule[3];
int oldmode = s->mode;
if (rule[0] == RULE_REPLACE && oldmode == PEG_MODE_NOCAPTURE) {
if (!tag && rule[0] == RULE_REPLACE && oldmode == PEG_MODE_NOCAPTURE) {
rule = s->bytecode + rule[1];
goto tail;
}
@ -394,7 +419,7 @@ tail:
}
cap_load(s, cs);
if (rule[0] == RULE_MATCHTIME && !janet_truthy(cap)) return NULL;
pushcap(s, cap);
pushcap(s, cap, tag);
return result;
}
@ -429,76 +454,17 @@ tail:
/* State carried through compilation of a peg; compile_peg fills it in before
 * calling compile1 on the top-level form. */
typedef struct {
JanetTable *grammar;
JanetTable *memoized;
/* Maps capture-tag keywords to the number emit_tag assigned to them. */
JanetTable *tags;
/* Constant pool; emit_constant appends a value and returns its index. */
Janet *constants;
/* Bytecode being built; reserve and emit_bytes append words to it. */
uint32_t *bytecode;
/* Set by compile_peg to the value being compiled. */
Janet form;
/* Initialised to JANET_RECURSION_GUARD by compile_peg; presumably a
 * remaining-recursion budget -- TODO confirm against compile1. */
int depth;
/* Next tag number emit_tag will hand out. compile_peg starts it at 1;
 * callers use 0 to mean "no tag". */
uint32_t nexttag;
} Builder;
/* Forward declaration to allow recursion */
static uint32_t compile1(Builder *b, Janet peg);
/*
* Emission
*/
static uint32_t emit_constant(Builder *b, Janet c) {
uint32_t cindex = (uint32_t) janet_v_count(b->constants);
janet_v_push(b->constants, c);
return cindex;
}
/* Reserve space in bytecode for a rule. When a special emits a rule,
* it must place that rule immediately on the bytecode stack. This lets
* the compiler know where the rule is going to be before it is complete,
* allowing recursive rules. */
typedef struct {
Builder *builder;
uint32_t index;
int32_t size;
} Reserve;
static Reserve reserve(Builder *b, int32_t size) {
Reserve r;
r.index = janet_v_count(b->bytecode);
r.builder = b;
r.size = size;
for (int32_t i = 0; i < size; i++)
janet_v_push(b->bytecode, 0);
return r;
}
/* Emit a rule in the builder. Returns the index of the new rule */
static void emit_rule(Reserve r, int32_t op, int32_t n, const uint32_t *body) {
janet_assert(r.size == n + 1, "bad reserve");
r.builder->bytecode[r.index] = op;
memcpy(r.builder->bytecode + r.index + 1, body, n * sizeof(uint32_t));
}
/* For RULE_LITERAL */
static void emit_bytes(Builder *b, uint32_t op, int32_t len, const uint8_t *bytes) {
uint32_t next_rule = janet_v_count(b->bytecode);
janet_v_push(b->bytecode, op);
janet_v_push(b->bytecode, len);
int32_t words = ((len + 3) >> 2);
for (int32_t i = 0; i < words; i++)
janet_v_push(b->bytecode, 0);
memcpy(b->bytecode + next_rule + 2, bytes, len);
}
/* For fixed arity rules of arities 1, 2, and 3 */
static void emit_1(Reserve r, uint32_t op, uint32_t arg) {
emit_rule(r, op, 1, &arg);
}
static void emit_2(Reserve r, uint32_t op, uint32_t arg1, uint32_t arg2) {
uint32_t arr[2] = {arg1, arg2};
emit_rule(r, op, 2, arr);
}
static void emit_3(Reserve r, uint32_t op, uint32_t arg1, uint32_t arg2, uint32_t arg3) {
uint32_t arr[3] = {arg1, arg2, arg3};
emit_rule(r, op, 3, arr);
}
/*
* Errors
*/
@ -562,6 +528,84 @@ static int32_t peg_getnat(Builder *b, Janet x) {
return i;
}
/*
* Emission
*/
/* Append `c` to the builder's constant vector and return the index at which
 * it was stored. */
static uint32_t emit_constant(Builder *b, Janet c) {
    const uint32_t slot = (uint32_t) janet_v_count(b->constants);
    janet_v_push(b->constants, c);
    return slot;
}
/* Translate a capture-tag keyword into its numeric tag.
 *
 * A keyword seen before yields the number recorded for it in b->tags. A new
 * keyword takes the next value of b->nexttag and is recorded. Panics through
 * peg_panicf if `t` is not a keyword, or if the freshly allocated number is
 * greater than 255. */
static uint32_t emit_tag(Builder *b, Janet t) {
    if (!janet_checktype(t, JANET_KEYWORD)) {
        peg_panicf(b, "expected keyword for capture tag, got %v", t);
    }

    Janet known = janet_table_get(b->tags, t);
    if (!janet_checktype(known, JANET_NIL)) {
        return (uint32_t) janet_unwrap_number(known);
    }

    uint32_t fresh = b->nexttag++;
    if (fresh > 255) {
        peg_panicf(b, "too many tags - up to 255 tags are supported per peg");
    }
    janet_table_put(b->tags, t, janet_wrap_number(fresh));
    return fresh;
}
/* Reserve space in bytecode for a rule. When a special emits a rule,
 * it must place that rule immediately on the bytecode stack. This lets
 * the compiler know where the rule is going to be before it is complete,
 * allowing recursive rules. */
typedef struct {
/* Builder whose bytecode vector holds the reserved words. */
Builder *builder;
/* Index in that bytecode of the first reserved word (where the opcode goes). */
uint32_t index;
/* Number of words reserved; emit_rule asserts it equals argument count + 1. */
int32_t size;
} Reserve;
/* Append `size` zero words to the builder's bytecode and return a Reserve
 * describing them: the builder, the index of the first new word, and the
 * word count. */
static Reserve reserve(Builder *b, int32_t size) {
    Reserve slot;
    slot.builder = b;
    slot.size = size;
    slot.index = janet_v_count(b->bytecode);
    for (int32_t left = size; left > 0; left--) {
        janet_v_push(b->bytecode, 0);
    }
    return slot;
}
/* Fill in a rule whose words were set aside earlier with reserve().
 *
 * r    - the reservation; must be exactly n + 1 words long, otherwise
 *        janet_assert fails with "bad reserve".
 * op   - opcode, written to the first reserved word.
 * n    - number of argument words.
 * body - the n argument words, copied into the words after the opcode.
 *
 * Nothing is returned; the rule's position is the index already held in `r`. */
static void emit_rule(Reserve r, int32_t op, int32_t n, const uint32_t *body) {
    janet_assert(r.size == n + 1, "bad reserve");
    r.builder->bytecode[r.index] = op;
    /* memcpy must not be given a null source even for a zero-length copy, so
     * a rule with no arguments (n == 0, body possibly NULL) skips the copy. */
    if (n > 0) {
        memcpy(r.builder->bytecode + r.index + 1, body, n * sizeof(uint32_t));
    }
}
/* For RULE_LITERAL: append [op, len, payload] to the bytecode, where the
 * payload is (len + 3) >> 2 zero-initialised words with the `len` bytes
 * copied over their start. */
static void emit_bytes(Builder *b, uint32_t op, int32_t len, const uint8_t *bytes) {
    const uint32_t start = janet_v_count(b->bytecode);
    const int32_t payload_words = (len + 3) >> 2;

    janet_v_push(b->bytecode, op);
    janet_v_push(b->bytecode, len);
    for (int32_t left = payload_words; left > 0; left--) {
        janet_v_push(b->bytecode, 0);
    }
    memcpy(b->bytecode + start + 2, bytes, len);
}
/* For fixed arity rules of arities 1, 2, and 3 */

/* Emit [op, arg] into a two-word reservation. */
static void emit_1(Reserve r, uint32_t op, uint32_t arg) {
    uint32_t body[1] = {arg};
    emit_rule(r, op, 1, body);
}
/* Emit [op, arg1, arg2] into a three-word reservation. */
static void emit_2(Reserve r, uint32_t op, uint32_t arg1, uint32_t arg2) {
    uint32_t body[2];
    body[0] = arg1;
    body[1] = arg2;
    emit_rule(r, op, 2, body);
}
/* Emit [op, arg1, arg2, arg3] into a four-word reservation. */
static void emit_3(Reserve r, uint32_t op, uint32_t arg1, uint32_t arg2, uint32_t arg3) {
    uint32_t body[3];
    body[0] = arg1;
    body[1] = arg2;
    body[2] = arg3;
    emit_rule(r, op, 3, body);
}
/*
* Specials
*/
@ -645,30 +689,6 @@ static void spec_ifnot(Builder *b, int32_t argc, const Janet *argv) {
spec_branch(b, argc, argv, RULE_IFNOT);
}
/* Rule of the form [rule] */
static void spec_onerule(Builder *b, int32_t argc, const Janet *argv, uint32_t op) {
peg_fixarity(b, argc, 1);
Reserve r = reserve(b, 2);
uint32_t rule = compile1(b, argv[0]);
emit_1(r, op, rule);
}
static void spec_not(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_NOT);
}
static void spec_capture(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_CAPTURE);
}
static void spec_accumulate(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_ACCUMULATE);
}
static void spec_group(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_GROUP);
}
static void spec_error(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_ERROR);
}
static void spec_between(Builder *b, int32_t argc, const Janet *argv) {
peg_fixarity(b, argc, 3);
Reserve r = reserve(b, 4);
@ -678,34 +698,6 @@ static void spec_between(Builder *b, int32_t argc, const Janet *argv) {
emit_3(r, RULE_BETWEEN, lo, hi, subrule);
}
static void spec_position(Builder *b, int32_t argc, const Janet *argv) {
peg_fixarity(b, argc, 0);
Reserve r = reserve(b, 1);
(void) argv;
emit_rule(r, RULE_POSITION, 0, NULL);
}
static void spec_argument(Builder *b, int32_t argc, const Janet *argv) {
peg_fixarity(b, argc, 1);
Reserve r = reserve(b, 2);
int32_t index = peg_getnat(b, argv[0]);
emit_1(r, RULE_ARGUMENT, index);
}
static void spec_constant(Builder *b, int32_t argc, const Janet *argv) {
janet_fixarity(argc, 1);
Reserve r = reserve(b, 2);
emit_1(r, RULE_CONSTANT, emit_constant(b, argv[0]));
}
static void spec_replace(Builder *b, int32_t argc, const Janet *argv) {
peg_fixarity(b, argc, 2);
Reserve r = reserve(b, 3);
uint32_t subrule = compile1(b, argv[0]);
uint32_t constant = emit_constant(b, argv[1]);
emit_2(r, RULE_REPLACE, subrule, constant);
}
static void spec_repeater(Builder *b, int32_t argc, const Janet *argv, int32_t min) {
peg_fixarity(b, argc, 1);
Reserve r = reserve(b, 4);
@ -743,17 +735,93 @@ static void spec_opt(Builder *b, int32_t argc, const Janet *argv) {
emit_3(r, RULE_BETWEEN, 0, 1, subrule);
}
static void spec_matchtime(Builder *b, int32_t argc, const Janet *argv) {
peg_fixarity(b, argc, 2);
/* Rule of the form [rule]: takes exactly one argument, a pattern, and emits
 * [op, compiled-pattern]. The two words are reserved before the pattern is
 * compiled. */
static void spec_onerule(Builder *b, int32_t argc, const Janet *argv, uint32_t op) {
    peg_fixarity(b, argc, 1);
    Reserve slot = reserve(b, 2);
    uint32_t subrule = compile1(b, argv[0]);
    emit_1(slot, op, subrule);
}
/* One-pattern special compiled to RULE_NOT. */
static void spec_not(Builder *b, int32_t argc, const Janet *argv) {
    spec_onerule(b, argc, argv, RULE_NOT);
}
/* One-pattern special compiled to RULE_ERROR. */
static void spec_error(Builder *b, int32_t argc, const Janet *argv) {
    spec_onerule(b, argc, argv, RULE_ERROR);
}
/* Rule of the form [rule, tag]: takes a pattern and an optional tag keyword
 * and emits [op, compiled-pattern, tag], with tag 0 when none is given. The
 * tag is resolved before the pattern is compiled. */
static void spec_cap1(Builder *b, int32_t argc, const Janet *argv, uint32_t op) {
    peg_arity(b, argc, 1, 2);
    Reserve slot = reserve(b, 3);
    uint32_t tag = 0;
    if (argc == 2) {
        tag = emit_tag(b, argv[1]);
    }
    uint32_t subrule = compile1(b, argv[0]);
    emit_2(slot, op, subrule, tag);
}
/* Pattern-plus-optional-tag special compiled to RULE_CAPTURE. */
static void spec_capture(Builder *b, int32_t argc, const Janet *argv) {
    spec_cap1(b, argc, argv, RULE_CAPTURE);
}
/* Pattern-plus-optional-tag special compiled to RULE_ACCUMULATE. */
static void spec_accumulate(Builder *b, int32_t argc, const Janet *argv) {
    spec_cap1(b, argc, argv, RULE_ACCUMULATE);
}
/* Pattern-plus-optional-tag special compiled to RULE_GROUP. */
static void spec_group(Builder *b, int32_t argc, const Janet *argv) {
    spec_cap1(b, argc, argv, RULE_GROUP);
}
/* Compile a tag reference into [RULE_GETTAG, searchtag, tag]. The first
 * argument is the tag keyword to look up; the optional second argument tags
 * the capture that the lookup produces (0 when absent). */
static void spec_reference(Builder *b, int32_t argc, const Janet *argv) {
    peg_arity(b, argc, 1, 2);
    Reserve slot = reserve(b, 3);
    uint32_t search = emit_tag(b, argv[0]);
    uint32_t tag = 0;
    if (argc == 2) {
        tag = emit_tag(b, argv[1]);
    }
    emit_2(slot, RULE_GETTAG, search, tag);
}
/* Compile a position capture into [RULE_POSITION, tag]. Takes no pattern;
 * the only, optional, argument is a tag keyword (tag 0 when absent). */
static void spec_position(Builder *b, int32_t argc, const Janet *argv) {
    peg_arity(b, argc, 0, 1);
    Reserve slot = reserve(b, 2);
    uint32_t tag = 0;
    if (argc) {
        tag = emit_tag(b, argv[0]);
    }
    emit_1(slot, RULE_POSITION, tag);
}
/* Compile an argument capture into [RULE_ARGUMENT, argument-index, tag].
 * The first argument is read with peg_getnat; the optional second argument
 * is a tag keyword (tag 0 when absent). The tag is resolved before the index
 * is validated. */
static void spec_argument(Builder *b, int32_t argc, const Janet *argv) {
    peg_arity(b, argc, 1, 2);
    Reserve slot = reserve(b, 3);
    uint32_t tag = 0;
    if (argc == 2) {
        tag = emit_tag(b, argv[1]);
    }
    int32_t which = peg_getnat(b, argv[0]);
    emit_2(slot, RULE_ARGUMENT, which, tag);
}
/* Compile a constant capture into [RULE_CONSTANT, constant, tag].
 *
 * argv[0] is the value to capture; it is stored in the constant pool.
 * argv[1], if present, is a tag keyword (tag 0 when absent).
 *
 * Arity is checked with peg_arity, as in the other specials, so a wrong
 * argument count is reported through the PEG compiler's own error path
 * rather than the generic janet_arity one. */
static void spec_constant(Builder *b, int32_t argc, const Janet *argv) {
    peg_arity(b, argc, 1, 2);
    Reserve r = reserve(b, 3);
    /* Resolve the tag first, then store the constant. */
    uint32_t tag = (argc == 2) ? emit_tag(b, argv[1]) : 0;
    uint32_t cindex = emit_constant(b, argv[0]);
    emit_2(r, RULE_CONSTANT, cindex, tag);
}
/* Compile a replace special into [RULE_REPLACE, rule, constant, tag].
 * argv[0] is the pattern, argv[1] the substitution value (stored in the
 * constant pool), and the optional argv[2] a tag keyword (tag 0 when
 * absent). The pattern is compiled first, then the constant is stored, then
 * the tag is resolved. */
static void spec_replace(Builder *b, int32_t argc, const Janet *argv) {
    peg_arity(b, argc, 2, 3);
    Reserve slot = reserve(b, 4);
    uint32_t subrule = compile1(b, argv[0]);
    uint32_t subst = emit_constant(b, argv[1]);
    uint32_t tag = 0;
    if (argc == 3) {
        tag = emit_tag(b, argv[2]);
    }
    emit_3(slot, RULE_REPLACE, subrule, subst, tag);
}
/* Compile a match-time special into [RULE_MATCHTIME, rule, constant, tag].
 *
 * argv[0] is the pattern. argv[1] must be a function or cfunction, otherwise
 * compilation panics through peg_panicf; it is stored in the constant pool.
 * argv[2], if present, is a tag keyword (tag 0 when absent). */
static void spec_matchtime(Builder *b, int32_t argc, const Janet *argv) {
    peg_arity(b, argc, 2, 3);
    Reserve r = reserve(b, 4);
    uint32_t subrule = compile1(b, argv[0]);
    Janet fun = argv[1];
    if (!janet_checktype(fun, JANET_FUNCTION) &&
            !janet_checktype(fun, JANET_CFUNCTION)) {
        peg_panicf(b, "expected function|cfunction, got %v", fun);
    }
    uint32_t tag = (argc == 3) ? emit_tag(b, argv[2]) : 0;
    uint32_t cindex = emit_constant(b, fun);
    /* Four words are reserved (opcode plus three arguments), so the rule has
     * to be written with the three-argument emitter; a two-argument emit
     * would fail emit_rule's "bad reserve" check. */
    emit_3(r, RULE_MATCHTIME, subrule, cindex, tag);
}
/* Special compiler form */
@ -770,6 +838,7 @@ static const SpecialPair specials[] = {
{"%", spec_accumulate},
{"*", spec_sequence},
{"+", spec_choice},
{"->", spec_reference},
{"/", spec_replace},
{"<-", spec_capture},
{">", spec_look},
@ -779,6 +848,7 @@ static const SpecialPair specials[] = {
{"argument", spec_argument},
{"at-least", spec_atleast},
{"at-most", spec_atmost},
{"backref", spec_reference},
{"between", spec_between},
{"capture", spec_capture},
{"choice", spec_choice},
@ -939,8 +1009,10 @@ static Peg *compile_peg(Janet x) {
Builder builder;
builder.grammar = janet_table(0);
builder.memoized = janet_table(0);
builder.tags = janet_table(0);
builder.constants = NULL;
builder.bytecode = NULL;
builder.nexttag = 1;
builder.form = x;
builder.depth = JANET_RECURSION_GUARD;
uint32_t main_rule = compile1(&builder, x);
@ -986,6 +1058,7 @@ static Janet cfun_match(int32_t argc, Janet *argv) {
s.depth = JANET_RECURSION_GUARD;
s.captures = janet_array(0);
s.scratch = janet_buffer(10);
s.tags = janet_buffer(10);
s.constants = peg->constants;
s.bytecode = peg->bytecode;

View File

@ -294,7 +294,7 @@
~{:pad (any "=")
:open (* "[" (<- :pad :n) "[")
:close (* "]" (cmt (* (-> :n) (<- :pad)) ,=) "]")
:main (cmt (* :open (any (if-not :close 1)) :close -1) ,=)})
:main (* :open (any (if-not :close 1)) :close -1)})
(check-match wrapped-string "[[]]" true)
(check-match wrapped-string "[==[a]==]" true)
@ -308,8 +308,10 @@
(check-match wrapped-string "[===[]==]===]" true)
(def janet-longstring
~{:delim (capture (some "`"))
:main (cmt (* :delim (any (if-not (* (not (> -1 "`")) :delim) 1)) (not (> -1 "`")) :delim -1) ,=)})
~{:delim (some "`")
:open (capture :delim :n)
:close (cmt (* (not (> -1 "`")) (-> :n) (<- :delim)) ,=)
:main (* :open (any (if-not :close 1)) :close -1)})
(check-match janet-longstring "`john" false)
(check-match janet-longstring "abc" false)