From 61f38fab37f8e01c43df9e65cbbb6b357e888c8d Mon Sep 17 00:00:00 2001
From: Ian Henry
Date: Wed, 27 Dec 2023 08:26:50 -0800
Subject: [PATCH] add a new (split) PEG special

This works similarly to string/split, but the separator is a PEG.
---
Reviewer note: RULE_SPLIT now ignores separator matches that consume no
input. Previously a zero-width separator (e.g. "" or (any " ")) left the
scan position unchanged, so the do/while loop never terminated. Comments
and two regression tests were added; hunk offsets and the diffstat were
updated to match.

 src/core/peg.c       | 64 +++++++++++++++++++++++++++++++++++++++++++-
 src/include/janet.h  |  3 ++-
 test/suite-peg.janet | 56 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/src/core/peg.c b/src/core/peg.c
index d9562482..1916b93e 100644
--- a/src/core/peg.c
+++ b/src/core/peg.c
@@ -39,7 +39,7 @@
 typedef struct {
     const uint8_t *text_start;
     const uint8_t *text_end;
-    /* text_end will be restricted in a (sub) rule, but
+    /* text_end can be restricted by some rules, but
        outer_text_end will always contain the real end of
        input, which we need to generate a line mapping */
     const uint8_t *outer_text_end;
@@ -510,6 +510,57 @@ tail:
             return window_end;
         }
 
+        case RULE_SPLIT: {
+            /* (split separator subpattern): cut the remaining input at every
+               match of the separator rule and require the subpattern rule to
+               match each piece. On success the whole input is consumed. */
+            const uint8_t *saved_end = s->text_end;
+            const uint32_t *rule_separator = s->bytecode + rule[1];
+            const uint32_t *rule_subpattern = s->bytecode + rule[2];
+
+            const uint8_t *separator_end;
+            do {
+                const uint8_t *text_start = text;
+                separator_end = NULL;
+
+                /* Scan forward for the next separator. Captures made while
+                   probing are always discarded. */
+                CapState cs = cap_save(s);
+                down1(s);
+                while (text <= s->text_end) {
+                    const uint8_t *candidate = peg_rule(s, rule_separator, text);
+                    cap_load(s, cs);
+                    /* Ignore zero-width separator matches: accepting one would
+                       leave text unchanged and the outer loop would never
+                       terminate. */
+                    if (candidate && candidate > text) {
+                        separator_end = candidate;
+                        break;
+                    }
+                    text++;
+                }
+                up1(s);
+
+                if (separator_end) {
+                    /* Hide the separator and everything after it from the
+                       subpattern, then resume scanning after the separator. */
+                    s->text_end = text;
+                    text = separator_end;
+                }
+
+                down1(s);
+                const uint8_t *subpattern_end = peg_rule(s, rule_subpattern, text_start);
+                up1(s);
+                s->text_end = saved_end;
+
+                if (!subpattern_end) {
+                    return NULL;
+                }
+            } while (separator_end);
+
+            return s->text_end;
+        }
+
         case RULE_REPLACE:
         case RULE_MATCHTIME: {
             uint32_t tag = rule[3];
@@ -1143,6 +1194,15 @@ static void spec_sub(Builder *b, int32_t argc, const Janet *argv) {
     emit_2(r, RULE_SUB, subrule1, subrule2);
 }
 
+/* Compile (split separator subpattern): exactly two arguments, emitted as RULE_SPLIT. */
+static void spec_split(Builder *b, int32_t argc, const Janet *argv) {
+    peg_fixarity(b, argc, 2);
+    Reserve r = reserve(b, 3);
+    uint32_t subrule1 = peg_compile1(b, argv[0]);
+    uint32_t subrule2 = peg_compile1(b, argv[1]);
+    emit_2(r, RULE_SPLIT, subrule1, subrule2);
+}
+
 #ifdef JANET_INT_TYPES
 #define JANET_MAX_READINT_WIDTH 8
 #else
@@ -1226,6 +1286,7 @@ static const SpecialPair peg_specials[] = {
     {"sequence", spec_sequence},
     {"set", spec_set},
     {"some", spec_some},
+    {"split", spec_split},
     {"sub", spec_sub},
     {"thru", spec_thru},
     {"to", spec_to},
@@ -1562,6 +1623,7 @@ static void *peg_unmarshal(JanetMarshalContext *ctx) {
                 i += 4;
                 break;
             case RULE_SUB:
+            case RULE_SPLIT:
                 /* [rule, rule] */
                 if (rule[1] >= blen) goto bad;
                 if (rule[2] >= blen) goto bad;
diff --git a/src/include/janet.h b/src/include/janet.h
index 4e486145..1cfb970f 100644
--- a/src/include/janet.h
+++ b/src/include/janet.h
@@ -2141,7 +2141,8 @@ typedef enum {
     RULE_COLUMN,       /* [tag] */
     RULE_UNREF,        /* [rule, tag] */
     RULE_CAPTURE_NUM,  /* [rule, tag] */
-    RULE_SUB           /* [rule, rule] */
+    RULE_SUB,          /* [rule, rule] */
+    RULE_SPLIT         /* [rule, rule] */
 } JanetPegOpcod;
 
 typedef struct {
diff --git a/test/suite-peg.janet b/test/suite-peg.janet
index cdd6f3cd..e0c85e66 100644
--- a/test/suite-peg.janet
+++ b/test/suite-peg.janet
@@ -265,6 +265,7 @@
 (marshpeg '(group "abc"))
 (marshpeg '(sub "abcdf" "abc"))
 (marshpeg '(* (sub 1 1)))
+(marshpeg '(split "," (+ "a" "b" "c")))
 
 # Peg swallowing errors
 # 159651117
@@ -710,5 +711,60 @@
   "abcdef"
   @[])
 
+(test "split: basic functionality"
+  ~(split "," '1)
+  "a,b,c"
+  @["a" "b" "c"])
+
+(test "split: drops captures from separator pattern"
+  ~(split '"," '1)
+  "a,b,c"
+  @["a" "b" "c"])
+
+(test "split: can match empty subpatterns"
+  ~(split "," ':w*)
+  ",a,,bar,,,c,,"
+  @["" "a" "" "bar" "" "" "c" "" ""])
+
+(test "split: subpattern is limited to only text before the separator"
+  ~(split "," '(to -1))
+  "a,,bar,c"
+  @["a" "" "bar" "c"])
+
+(test "split: fails if any subpattern fails"
+  ~(split "," '"a")
+  "a,a,b"
+  nil)
+
+(test "split: separator does not have to match anything"
+  ~(split "x" '(to -1))
+  "a,a,b"
+  @["a,a,b"])
+
+(test "split: always consumes entire input"
+  ~(split 1 '"")
+  "abc"
+  @["" "" "" ""])
+
+(test "split: separator can be an arbitrary PEG"
+  ~(split :s+ '(to -1))
+  "a b c"
+  @["a" "b" "c"])
+
+(test "split: does not advance past the end of the input"
+  ~(* (split "," ':w+) 0)
+  "a,b,c"
+  @["a" "b" "c"])
+
+(test "split: zero-width separator matches are ignored"
+  ~(split "" '(to -1))
+  "abc"
+  @["abc"])
+
+(test "split: separator must consume input to split"
+  ~(split (any " ") '(to -1))
+  "a  b"
+  @["a" "b"])
+
 (end-suite)
 