diff --git a/doc/Peg.md b/doc/Peg.md index a1ac23b2..9e95a293 100644 --- a/doc/Peg.md +++ b/doc/Peg.md @@ -7,14 +7,15 @@ parsers, simple operations on strings (splitting a string on commas), and regula The pre-built or custom-built parser is usually the most robust solution, but can be very complex to maintain and may not exist for many languages. String functions are not powerful enough for a large class of languages, and regular expressions can be hard to read -(which characters are escaped?) and underpowered (don't parse HTML with regex!). +(which characters are escaped?) and under-powered (don't parse HTML with regex!). PEGs, or Parsing Expression Grammars, are another formalism for recognizing languages that are easier to write as a custom parser and more powerful than regular expressions. They also -can produce grammars that are easily unerstandable and moderatly fast. PEGs can also be compiled -to a bytecode format that can be reused. +can produce grammars that are easily understandable and fast. PEGs can also be compiled +to a bytecode format that can be reused. Janet offers the `peg` module for writing and +evaluating PEGs. -Below is a siimple example for checking if a string is a valid IP address. Notice how +Below is a simple example for checking if a string is a valid IP address. Notice how the grammar is descriptive enough that you can read it even if you don't know the peg syntax (example is translated from a (RED language blog post)[https://www.red-lang.org/2013/11/041-introducing-parse.html]). ``` @@ -34,6 +35,26 @@ syntax (example is translated from a (RED language blog post)[https://www.red-la (peg/match ip-address "256.0.0.0") # -> nil ``` +## The API + +The `peg` module has few functions because the complexity is exposed through the +pattern syntax. Note that there is only one match function, `peg/match`. Variations +on matching, such as parsing or searching, can be implemented inside patterns. 
+PEGs can also be compiled ahead of time with `peg/compile` if a PEG will be reused +many times. + +### `(peg/match peg text [,start=0])` + +Match a peg against some text. Returns an array of captured data if the text +matches, or nil if there is no match. The caller can provide an optional start +index to begin matching the text at, otherwise the PEG starts on the first character +of text. A peg can be either a compiled PEG object or peg source. + +### `(peg/compile peg)` + +Compiles a peg source data structure into a new PEG. Throws an error if there are problems +with the peg code. + ## Primitive Patterns Larger patterns are built up with primitive patterns, which recognize individual @@ -42,6 +63,7 @@ is considered a byte, so PEGs will work on any string of bytes. No special meani given to the 0 byte, or the string terminator in many languages. | Pattern | Alias | What it Matches | +| -----------------| ----- | ----------------| | string ("cat") | | The literal string. | | integer (3) | | Matches a number of characters, and advances that many characters. If negative, matches if not that many characters and does not advance. For example, -1 will match the end of a string | | `(range "az" "AZ")` | | Matches characters in a range and advances 1 character. Multiple ranges can be combined together. | @@ -49,8 +71,42 @@ given to the 0 byte, or the string terminator in many languages. ## Combining Patterns -These primitve patterns are combined with a few specials to match a wide number of languages. +These primitive patterns are combined with a few specials to match a wide number of languages. These specials +can be thought of as the looping and branching forms in a traditional language +(that is how they are implemented when compiled to bytecode). +| Pattern | Alias | What it matches | +| ------- | ----- | --------------- | +| `(choice a b c ...)` | `(+ a b c ...)` | Tries to match a, then b, and so on. 
Will succeed on the first successful match, and fails if none of the arguments match the text. | +| `(sequence a b c ...)` | `(* a b c ...)` | Tries to match a, b, c and so on in sequence. If any of these arguments fail to match the text, the whole pattern fails. | +| `(any x)` | | Matches 0 or more repetitions of x. | +| `(some x)` | | Matches 1 or more repetitions of x. | +| `(between min max x)` | | Matches between min and max (inclusive) occurrences of x. | +| `(at-least n x)` | | Matches at least n occurrences of x. | +| `(at-most n x)` | | Matches at most n occurrences of x. | +| `(if cond patt)` | | Tries to match patt only if cond matches as well. cond will not produce any captures. | +| `(if-not cond patt)` | | Tries to match patt only if cond does not match. cond will not produce any captures. | +| `(not patt)` | `(! patt)` | Matches only if patt does not match. Will not produce captures or advance any characters. | +| `(look offset patt)` | `(> offset patt)` | Matches only if patt matches at a fixed offset. offset can be any integer. patt will not produce captures and the peg will not advance any characters. | + +## Captures + +So far we have only been concerned with "does this text match this language?". This is useful, but +it is often more useful to extract data from text if it does match a peg. The `peg` module +uses the concept of a capture stack to extract data from text. As the PEG is trying to match +a piece of text, some forms may push Janet values onto the capture stack as a side effect. If the +text matches the main peg language, `(peg/match)` will return the final capture stack as an array. + +Capture specials will only push captures to the capture stack if their child pattern matches the text. +Most capture specials will match the same text as their first argument pattern. 
+ +| Pattern | Alias | What it captures | +| ------- | ----- | --------------- | +| `(capture patt)` | `(<- patt)` | Captures all of the text in patt if patt matches. If patt contains any captures, then those +captures will be pushed to the capture stack before the total text. | +| `(group patt)` | | Pops all of the captures in patt off of the capture stack and pushes them in an array +if patt matches. | +| `(replace patt subst)` | `(/ patt subst)` | Replaces the captures produced by patt by applying subst to them. If subst is a table or struct, will push `(get subst last-capture)` to the capture stack after removing the old captures. If subst is a function, will call subst with the captures of patt as arguments and push the result to the capture stack. Otherwise, will push subst literally to the capture stack. | ## Grammars and Recursion diff --git a/src/core/compile.c b/src/core/compile.c index ca456752..9ee1b285 100644 --- a/src/core/compile.c +++ b/src/core/compile.c @@ -403,7 +403,9 @@ static JanetSlot janetc_call(JanetFopts opts, JanetSlot *slots, JanetSlot fun) { } if (!specialized) { janetc_pushslots(c, slots); - if (opts.flags & JANET_FOPTS_TAIL) { + if ((opts.flags & JANET_FOPTS_TAIL) && + /* Prevent top level tail calls for better errors */ + !(c->scope->flags & JANET_SCOPE_TOP)) { janetc_emit_s(c, JOP_TAILCALL, fun, 0); retslot = janetc_cslot(janet_wrap_nil()); retslot.flags = JANET_SLOT_RETURNED; @@ -553,7 +555,7 @@ JanetSlot janetc_value(JanetFopts opts, Janet x) { } break; case JANET_SYMBOL: - ret = janetc_resolve(opts.compiler, janet_unwrap_symbol(x)); + ret = janetc_resolve(c, janet_unwrap_symbol(x)); break; case JANET_ARRAY: ret = janetc_array(opts, x); @@ -576,13 +578,13 @@ JanetSlot janetc_value(JanetFopts opts, Janet x) { if (c->result.status == JANET_COMPILE_ERROR) return janetc_cslot(janet_wrap_nil()); if (opts.flags & JANET_FOPTS_TAIL) - ret = janetc_return(opts.compiler, ret); + ret = janetc_return(c, ret); if (opts.flags & 
JANET_FOPTS_HINT) { - janetc_copy(opts.compiler, opts.hint, ret); + janetc_copy(c, opts.hint, ret); ret = opts.hint; } c->current_mapping = last_mapping; - opts.compiler->recursion_guard++; + c->recursion_guard++; return ret; } diff --git a/src/core/peg.c b/src/core/peg.c index 7f599a88..9cbe62b1 100644 --- a/src/core/peg.c +++ b/src/core/peg.c @@ -39,7 +39,8 @@ typedef enum { RULE_LOOK, /* [offset, rule] */ RULE_CHOICE, /* [len, rules...] */ RULE_SEQUENCE, /* [len, rules...] */ - RULE_IFNOT, /* [rule_a, rule_b (a if not b)] */ + RULE_IF, /* [rule_a, rule_b (b if a)] */ + RULE_IFNOT, /* [rule_a, rule_b (b if not a)] */ RULE_NOT, /* [rule] */ RULE_BETWEEN, /* [lo, hi, rule] */ RULE_CAPTURE, /* [rule] */ @@ -207,6 +208,7 @@ tail: rule = s->bytecode + args[len - 1]; goto tail; } + case RULE_IF: case RULE_IFNOT: { const uint32_t *rule_a = s->bytecode + rule[1]; @@ -214,11 +216,11 @@ tail: int oldmode = s->mode; s->mode = PEG_MODE_NOCAPTURE; down1(s); - const uint8_t *result = peg_rule(s, rule_b, text); + const uint8_t *result = peg_rule(s, rule_a, text); up1(s); s->mode = oldmode; - if (result) return NULL; - rule = rule_a; + if (rule[0] == RULE_IF ? 
!result : !!result) return NULL; + rule = rule_b; goto tail; } case RULE_NOT: @@ -356,29 +358,23 @@ tail: } else { /* RULE_REPLACE */ Janet constant = s->constants[rule[2]]; - int32_t nbytes = (int32_t)(result - text); switch (janet_type(constant)) { default: cap = constant; break; case JANET_STRUCT: cap = janet_struct_get(janet_unwrap_struct(constant), - janet_stringv(text, nbytes)); + s->captures->data[s->captures->count - 1]); break; case JANET_TABLE: cap = janet_table_get(janet_unwrap_table(constant), - janet_stringv(text, nbytes)); + s->captures->data[s->captures->count - 1]); break; case JANET_CFUNCTION: - janet_array_push(s->captures, - janet_stringv(text, nbytes)); - JanetCFunction cfunc = janet_unwrap_cfunction(constant); - cap = cfunc(s->captures->count - cs.cap, + cap = janet_unwrap_cfunction(constant)(s->captures->count - cs.cap, s->captures->data + cs.cap); break; case JANET_FUNCTION: - janet_array_push(s->captures, - janet_stringv(text, nbytes)); cap = janet_call(janet_unwrap_function(constant), s->captures->count - cs.cap, s->captures->data + cs.cap); @@ -634,12 +630,20 @@ static void spec_sequence(Builder *b, int32_t argc, const Janet *argv) { spec_variadic(b, argc, argv, RULE_SEQUENCE); } -static void spec_ifnot(Builder *b, int32_t argc, const Janet *argv) { +/* For (if a b) and (if-not a b) */ +static void spec_branch(Builder *b, int32_t argc, const Janet *argv, uint32_t rule) { peg_fixarity(b, argc, 2); Reserve r = reserve(b, 3); uint32_t rule_a = compile1(b, argv[0]); uint32_t rule_b = compile1(b, argv[1]); - emit_2(r, RULE_IFNOT, rule_a, rule_b); + emit_2(r, rule, rule_a, rule_b); +} + +static void spec_if(Builder *b, int32_t argc, const Janet *argv) { + spec_branch(b, argc, argv, RULE_IF); +} +static void spec_ifnot(Builder *b, int32_t argc, const Janet *argv) { + spec_branch(b, argc, argv, RULE_IFNOT); } /* Rule of the form [rule] */ @@ -663,18 +667,6 @@ static void spec_group(Builder *b, int32_t argc, const Janet *argv) { spec_onerule(b, 
argc, argv, RULE_GROUP); } -static void spec_exponent(Builder *b, int32_t argc, const Janet *argv) { - peg_fixarity(b, argc, 2); - Reserve r = reserve(b, 4); - int32_t n = peg_getinteger(b, argv[1]); - uint32_t subrule = compile1(b, argv[0]); - if (n < 0) { - emit_3(r, RULE_BETWEEN, 0, -n, subrule); - } else { - emit_3(r, RULE_BETWEEN, n, UINT32_MAX, subrule); - } -} - static void spec_between(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 3); Reserve r = reserve(b, 4); @@ -778,10 +770,9 @@ static const SpecialPair specials[] = { {"!", spec_not}, {"*", spec_sequence}, {"+", spec_choice}, - {"-", spec_ifnot}, {"/", spec_replace}, + {"<-", spec_capture}, {">", spec_look}, - {"^", spec_exponent}, {"any", spec_any}, {"argument", spec_argument}, {"at-least", spec_atleast}, @@ -793,6 +784,7 @@ static const SpecialPair specials[] = { {"cmt", spec_matchtime}, {"constant", spec_constant}, {"group", spec_group}, + {"if", spec_if}, {"if-not", spec_ifnot}, {"look", spec_look}, {"not", spec_not}, diff --git a/test/suite3.janet b/test/suite3.janet index fd012dc2..aac0e083 100644 --- a/test/suite3.janet +++ b/test/suite3.janet @@ -238,8 +238,8 @@ (def csv '{:field (+ - (* `"` (| (any (+ (- 1 `"`) (/ `""` `"`)))) `"`) - (| (any (- 1 (set ",\n"))))) + (* `"` (| (any (+ (if-not `"` 1) (/ `""` `"`)))) `"`) + (| (any (if-not (set ",\n") 1)))) :main (* :field (any (* "," :field)) (+ "\n" -1))}) (defn check-csv @@ -258,12 +258,12 @@ # Functions in grammar -(def grmr-triple ~(| (any (/ 1 ,(fn [x] (string x x x)))))) +(def grmr-triple ~(| (any (/ (<- 1) ,(fn [x] (string x x x)))))) (check-deep grmr-triple "abc" @["aaabbbccc"]) (check-deep grmr-triple "" @[""]) (check-deep grmr-triple " " @[" "]) -(def counter ~(/ (group (^ (capture 1) 0)) ,length)) +(def counter ~(/ (group (any (<- 1))) ,length)) (check-deep counter "abcdefg" @[7]) # Capture Backtracking @@ -294,7 +294,7 @@ ~{:pad (any "=") :open (* "[" (capture :pad) "[") :close (* "]" (cmt (* (backref 0) (capture 
:pad)) ,=) "]") - :main (* :open (any (if-not 1 :close)) :close -1)}) + :main (* :open (any (if-not :close 1)) :close -1)}) (check-match wrapped-string "[[]]" true) (check-match wrapped-string "[==[a]==]" true) @@ -309,7 +309,7 @@ (def janet-longstring ~{:open (capture (some "`")) :close (cmt (* (backref 0) :open) ,=) - :main (* :open (any (if-not 1 :close)) (not (> -1 "`")) :close -1)}) + :main (* :open (any (if-not :close 1)) (not (> -1 "`")) :close -1)}) (check-match janet-longstring "`john" false) (check-match janet-longstring "abc" false)