From ea750863008affed0daa8521f1d1915f81616866 Mon Sep 17 00:00:00 2001 From: Ian Henry Date: Mon, 4 Dec 2023 23:34:40 -0800 Subject: [PATCH] add a new (sub) PEG special (sub) will first match one pattern, then match another pattern against the text that the first pattern advanced over. --- src/core/peg.c | 54 ++++++++++++++++++++++++++++++++++++++++---- src/include/janet.h | 3 ++- test/suite-peg.janet | 50 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 5 deletions(-) diff --git a/src/core/peg.c b/src/core/peg.c index 11504cda..d77200e9 100644 --- a/src/core/peg.c +++ b/src/core/peg.c @@ -39,6 +39,10 @@ typedef struct { const uint8_t *text_start; const uint8_t *text_end; + /* text_end will be restricted in a (sub) rule, but + outer_text_end will always contain the real end of + input, which we need to generate a line mapping */ + const uint8_t *outer_text_end; const uint32_t *bytecode; const Janet *constants; JanetArray *captures; @@ -114,12 +118,12 @@ static LineCol get_linecol_from_position(PegState *s, int32_t position) { /* Generate if not made yet */ if (s->linemaplen < 0) { int32_t newline_count = 0; - for (const uint8_t *c = s->text_start; c < s->text_end; c++) { + for (const uint8_t *c = s->text_start; c < s->outer_text_end; c++) { if (*c == '\n') newline_count++; } int32_t *mem = janet_smalloc(sizeof(int32_t) * newline_count); size_t index = 0; - for (const uint8_t *c = s->text_start; c < s->text_end; c++) { + for (const uint8_t *c = s->text_start; c < s->outer_text_end; c++) { if (*c == '\n') mem[index++] = (int32_t)(c - s->text_start); } s->linemaplen = newline_count; @@ -179,7 +183,7 @@ static const uint8_t *peg_rule( const uint32_t *rule, const uint8_t *text) { tail: - switch (*rule & 0x1F) { + switch (*rule) { default: janet_panic("unexpected opcode"); return NULL; @@ -482,6 +486,30 @@ tail: return result; } + case RULE_SUB: { + const uint8_t *text_start = text; + const uint32_t *rule_window = s->bytecode + rule[1]; + const uint32_t *rule_subpattern = s->bytecode + rule[2]; + down1(s); + const uint8_t *window_end = peg_rule(s, rule_window, text); + up1(s); + if (!window_end) { + return NULL; + } + const uint8_t *saved_end = s->text_end; + s->text_end = window_end; + down1(s); + const uint8_t *next_text = peg_rule(s, rule_subpattern, text_start); + up1(s); + s->text_end = saved_end; + + if (!next_text) { + return NULL; + } + + return window_end; + } + case RULE_REPLACE: case RULE_MATCHTIME: { uint32_t tag = rule[3]; @@ -1107,6 +1135,14 @@ static void spec_matchtime(Builder *b, int32_t argc, const Janet *argv) { emit_3(r, RULE_MATCHTIME, subrule, cindex, tag); } +static void spec_sub(Builder *b, int32_t argc, const Janet *argv) { + peg_fixarity(b, argc, 2); + Reserve r = reserve(b, 3); + uint32_t subrule1 = peg_compile1(b, argv[0]); + uint32_t subrule2 = peg_compile1(b, argv[1]); + emit_2(r, RULE_SUB, subrule1, subrule2); +} + #ifdef JANET_INT_TYPES #define JANET_MAX_READINT_WIDTH 8 #else @@ -1190,6 +1226,7 @@ static const SpecialPair peg_specials[] = { {"sequence", spec_sequence}, {"set", spec_set}, {"some", spec_some}, + {"sub", spec_sub}, {"thru", spec_thru}, {"to", spec_to}, {"uint", spec_uint_le}, @@ -1431,7 +1468,7 @@ static void *peg_unmarshal(JanetMarshalContext *ctx) { uint32_t instr = bytecode[i]; uint32_t *rule = bytecode + i; op_flags[i] |= 0x02; - switch (instr & 0x1F) { + switch (instr) { case RULE_LITERAL: i += 2 + ((rule[1] + 3) >> 2); break; @@ -1524,6 +1561,14 @@ static void *peg_unmarshal(JanetMarshalContext *ctx) { op_flags[rule[1]] |= 0x01; i += 4; break; + case RULE_SUB: + /* [rule, rule] */ + if (rule[1] >= blen) goto bad; + if (rule[2] >= blen) goto bad; + op_flags[rule[1]] |= 0x01; + op_flags[rule[2]] |= 0x01; + i += 3; + break; case RULE_ERROR: case RULE_DROP: case RULE_NOT: @@ -1677,6 +1722,7 @@ static PegCall peg_cfun_init(int32_t argc, Janet *argv, int get_replace) { ret.s.mode = PEG_MODE_NORMAL; ret.s.text_start = ret.bytes.bytes; ret.s.text_end = ret.bytes.bytes + ret.bytes.len; + ret.s.outer_text_end = ret.s.text_end; ret.s.depth = JANET_RECURSION_GUARD; ret.s.captures = janet_array(0); ret.s.tagged_captures = janet_array(0); diff --git a/src/include/janet.h b/src/include/janet.h index 5724af2b..4e486145 100644 --- a/src/include/janet.h +++ b/src/include/janet.h @@ -2140,7 +2140,8 @@ typedef enum { RULE_LINE, /* [tag] */ RULE_COLUMN, /* [tag] */ RULE_UNREF, /* [rule, tag] */ - RULE_CAPTURE_NUM /* [rule, tag] */ + RULE_CAPTURE_NUM, /* [rule, tag] */ + RULE_SUB /* [rule, rule] */ } JanetPegOpcod; typedef struct { diff --git a/test/suite-peg.janet b/test/suite-peg.janet index a237c853..cdd6f3cd 100644 --- a/test/suite-peg.janet +++ b/test/suite-peg.janet @@ -263,6 +263,8 @@ (marshpeg '(if-not "abcdf" 123)) (marshpeg ~(cmt "abcdf" ,identity)) (marshpeg '(group "abc")) +(marshpeg '(sub "abcdf" "abc")) +(marshpeg '(* (sub 1 1))) # Peg swallowing errors # 159651117 @@ -660,5 +662,53 @@ (peg/match '(if (not (* (constant 7) "a")) "hello") "hello") @[]) "peg if not") +(defn test [name peg input expected] + (assert (deep= (peg/match peg input) expected) name)) + +(test "sub: matches the same input twice" + ~(sub "abcd" "abc") + "abcdef" + @[]) + +(test "sub: second pattern cannot match more than the first pattern" + ~(sub "abcd" "abcde") + "abcdef" + nil) + +(test "sub: fails if first pattern fails" + ~(sub "x" "abc") + "abcdef" + nil) + +(test "sub: fails if second pattern fails" + ~(sub "abc" "x") + "abcdef" + nil) + +(test "sub: keeps captures from both patterns" + ~(sub '"abcd" '"abc") + "abcdef" + @["abcd" "abc"]) + +(test "sub: second pattern can reference captures from first" + ~(* (constant 5 :tag) (sub (capture "abc" :tag) (backref :tag))) + "abcdef" + @[5 "abc" "abc"]) + +(test "sub: second pattern can't see past what the first pattern matches" + ~(sub "abc" (* "abc" -1)) + "abcdef" + @[]) + +(test "sub: positions inside second match are still relative to the entire input" + ~(* "one\ntw" (sub "o" (* ($) (line) (column)))) + "one\ntwo\nthree\n" + @[6 2 3]) + +(test "sub: advances to the end of the first pattern's match" + ~(* (sub "abc" "ab") "d") + "abcdef" + @[]) + (end-suite)