add a new (sub) PEG special

(sub) will first match one pattern, then match another pattern against the
text that the first pattern advanced over.
This commit is contained in:
Ian Henry 2023-12-04 23:34:40 -08:00
parent 772f4c26e8
commit ea75086300
No known key found for this signature in database
3 changed files with 102 additions and 5 deletions

View File

@ -39,6 +39,10 @@
typedef struct {
const uint8_t *text_start;
const uint8_t *text_end;
/* text_end will be restricted in a (sub) rule, but
outer_text_end will always contain the real end of
input, which we need to generate a line mapping */
const uint8_t *outer_text_end;
const uint32_t *bytecode;
const Janet *constants;
JanetArray *captures;
@ -114,12 +118,12 @@ static LineCol get_linecol_from_position(PegState *s, int32_t position) {
/* Generate if not made yet */
if (s->linemaplen < 0) {
int32_t newline_count = 0;
for (const uint8_t *c = s->text_start; c < s->text_end; c++) {
for (const uint8_t *c = s->text_start; c < s->outer_text_end; c++) {
if (*c == '\n') newline_count++;
}
int32_t *mem = janet_smalloc(sizeof(int32_t) * newline_count);
size_t index = 0;
for (const uint8_t *c = s->text_start; c < s->text_end; c++) {
for (const uint8_t *c = s->text_start; c < s->outer_text_end; c++) {
if (*c == '\n') mem[index++] = (int32_t)(c - s->text_start);
}
s->linemaplen = newline_count;
@ -179,7 +183,7 @@ static const uint8_t *peg_rule(
const uint32_t *rule,
const uint8_t *text) {
tail:
switch (*rule & 0x1F) {
switch (*rule) {
default:
janet_panic("unexpected opcode");
return NULL;
@ -482,6 +486,30 @@ tail:
return result;
}
case RULE_SUB: {
const uint8_t *text_start = text;
const uint32_t *rule_window = s->bytecode + rule[1];
const uint32_t *rule_subpattern = s->bytecode + rule[2];
down1(s);
const uint8_t *window_end = peg_rule(s, rule_window, text);
up1(s);
if (!window_end) {
return NULL;
}
const uint8_t *saved_end = s->text_end;
s->text_end = window_end;
down1(s);
const uint8_t *next_text = peg_rule(s, rule_subpattern, text_start);
up1(s);
s->text_end = saved_end;
if (!next_text) {
return NULL;
}
return window_end;
}
case RULE_REPLACE:
case RULE_MATCHTIME: {
uint32_t tag = rule[3];
@ -1107,6 +1135,14 @@ static void spec_matchtime(Builder *b, int32_t argc, const Janet *argv) {
emit_3(r, RULE_MATCHTIME, subrule, cindex, tag);
}
static void spec_sub(Builder *b, int32_t argc, const Janet *argv) {
peg_fixarity(b, argc, 2);
Reserve r = reserve(b, 3);
uint32_t subrule1 = peg_compile1(b, argv[0]);
uint32_t subrule2 = peg_compile1(b, argv[1]);
emit_2(r, RULE_SUB, subrule1, subrule2);
}
#ifdef JANET_INT_TYPES
#define JANET_MAX_READINT_WIDTH 8
#else
@ -1190,6 +1226,7 @@ static const SpecialPair peg_specials[] = {
{"sequence", spec_sequence},
{"set", spec_set},
{"some", spec_some},
{"sub", spec_sub},
{"thru", spec_thru},
{"to", spec_to},
{"uint", spec_uint_le},
@ -1431,7 +1468,7 @@ static void *peg_unmarshal(JanetMarshalContext *ctx) {
uint32_t instr = bytecode[i];
uint32_t *rule = bytecode + i;
op_flags[i] |= 0x02;
switch (instr & 0x1F) {
switch (instr) {
case RULE_LITERAL:
i += 2 + ((rule[1] + 3) >> 2);
break;
@ -1524,6 +1561,14 @@ static void *peg_unmarshal(JanetMarshalContext *ctx) {
op_flags[rule[1]] |= 0x01;
i += 4;
break;
case RULE_SUB:
/* [rule, rule] */
if (rule[1] >= blen) goto bad;
if (rule[2] >= blen) goto bad;
op_flags[rule[1]] |= 0x01;
op_flags[rule[2]] |= 0x01;
i += 3;
break;
case RULE_ERROR:
case RULE_DROP:
case RULE_NOT:
@ -1677,6 +1722,7 @@ static PegCall peg_cfun_init(int32_t argc, Janet *argv, int get_replace) {
ret.s.mode = PEG_MODE_NORMAL;
ret.s.text_start = ret.bytes.bytes;
ret.s.text_end = ret.bytes.bytes + ret.bytes.len;
ret.s.outer_text_end = ret.s.text_end;
ret.s.depth = JANET_RECURSION_GUARD;
ret.s.captures = janet_array(0);
ret.s.tagged_captures = janet_array(0);

View File

@ -2140,7 +2140,8 @@ typedef enum {
RULE_LINE, /* [tag] */
RULE_COLUMN, /* [tag] */
RULE_UNREF, /* [rule, tag] */
RULE_CAPTURE_NUM /* [rule, tag] */
RULE_CAPTURE_NUM, /* [rule, tag] */
RULE_SUB /* [rule, rule] */
} JanetPegOpcod;
typedef struct {

View File

@ -263,6 +263,8 @@
(marshpeg '(if-not "abcdf" 123))
(marshpeg ~(cmt "abcdf" ,identity))
(marshpeg '(group "abc"))
(marshpeg '(sub "abcdf" "abc"))
(marshpeg '(* (sub 1 1)))
# Peg swallowing errors
# 159651117
@ -660,5 +662,53 @@
(peg/match '(if (not (* (constant 7) "a")) "hello") "hello")
@[]) "peg if not")
(defn test [name peg input expected]
(assert (deep= (peg/match peg input) expected) name))
(test "sub: matches the same input twice"
~(sub "abcd" "abc")
"abcdef"
@[])
(test "sub: second pattern cannot match more than the first pattern"
~(sub "abcd" "abcde")
"abcdef"
nil)
(test "sub: fails if first pattern fails"
~(sub "x" "abc")
"abcdef"
nil)
(test "sub: fails if second pattern fails"
~(sub "abc" "x")
"abcdef"
nil)
(test "sub: keeps captures from both patterns"
~(sub '"abcd" '"abc")
"abcdef"
@["abcd" "abc"])
(test "sub: second pattern can reference captures from first"
~(* (constant 5 :tag) (sub (capture "abc" :tag) (backref :tag)))
"abcdef"
@[5 "abc" "abc"])
(test "sub: second pattern can't see past what the first pattern matches"
~(sub "abc" (* "abc" -1))
"abcdef"
@[])
(test "sub: positions inside second match are still relative to the entire input"
~(* "one\ntw" (sub "o" (* ($) (line) (column))))
"one\ntwo\nthree\n"
@[6 2 3])
(test "sub: advances to the end of the first pattern's match"
~(* (sub "abc" "ab") "d")
"abcdef"
@[])
(end-suite)