diff --git a/Makefile b/Makefile index 7a6463b9..1603906c 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ LIBDIR=$(PREFIX)/lib BINDIR=$(PREFIX)/bin JANET_BUILD?="\"$(shell git log --pretty=format:'%h' -n 1)\"" -CFLAGS=-std=c99 -Wall -Wextra -Isrc/include -fpic -O2 -fvisibility=hidden \ +CFLAGS=-std=c99 -Wall -Wextra -Isrc/include -fpic -g -O2 -fvisibility=hidden \ -DJANET_BUILD=$(JANET_BUILD) CLIBS=-lm -ldl JANET_TARGET=build/janet diff --git a/doc/Peg.md b/doc/Peg.md new file mode 100644 index 00000000..a1ac23b2 --- /dev/null +++ b/doc/Peg.md @@ -0,0 +1,62 @@ +# Peg (Parsing Expression Grammars) + +A common task for developers is to recognize patterns in text, be it +filtering emails from a list or extracting data from a CSV file. Programming +languages and libraries usually offer a number of tools for this, including prebuilt +parsers, simple operations on strings (splitting a string on commas), and regular expressions. +The pre-built or custom-built parser is usually the most robust solution, but can +be very complex to maintain and may not exist for many languages. String functions are not +powerful enough for a large class of languages, and regular expressions can be hard to read +(which characters are escaped?) and underpowered (don't parse HTML with regex!). + +PEGs, or Parsing Expression Grammars, are another formalism for recognizing languages that +are easier to write than a custom parser and more powerful than regular expressions. They also +can produce grammars that are easily understandable and moderately fast. PEGs can also be compiled +to a bytecode format that can be reused. + +Below is a simple example for checking if a string is a valid IP address. Notice how +the grammar is descriptive enough that you can read it even if you don't know the peg +syntax (example is translated from a [RED language blog post](https://www.red-lang.org/2013/11/041-introducing-parse.html)). 
+``` +(def ip-address + '{:dig (range "09") + :0-4 (range "04") + :0-5 (range "05") + :byte (choice + (sequence "25" :0-5) + (sequence "2" :0-4 :dig) + (sequence "1" :dig :dig) + (between 1 2 :dig)) + :main (sequence :byte "." :byte "." :byte "." :byte)}) + +(peg/match ip-address "0.0.0.0") # -> @[] +(peg/match ip-address "elephant") # -> nil +(peg/match ip-address "256.0.0.0") # -> nil +``` + +## Primitive Patterns + +Larger patterns are built up with primitive patterns, which recognize individual +characters, string literals, or a given number of characters. A character in Janet +is considered a byte, so PEGs will work on any string of bytes. No special meaning is +given to the 0 byte, which is the string terminator in many languages. + +| Pattern | Alias | What it Matches | +| string ("cat") | | The literal string. | +| integer (3) | | Matches a number of characters, and advances that many characters. If negative, matches only if there are not that many characters left, and does not advance. For example, -1 will match the end of a string. | +| `(range "az" "AZ")` | | Matches characters in a range and advances 1 character. Multiple ranges can be combined together. | +| `(set "abcd")` | | Matches any character in the argument string. Advances 1 character. | + +## Combining Patterns + +These primitive patterns are combined with a few specials to match a wide range of languages. + + +## Grammars and Recursion + +Parsing Expression Grammars try to match an input text with a pattern in a greedy manner. +This means that if a rule fails to match, that rule will fail and not try again. The only +backtracking in a PEG is provided by the `(choice x y z ...)` special, which will +try rules in order until one succeeds, at which point the whole pattern succeeds. If no sub pattern +succeeds, then the whole pattern fails. Note that this means that the order of `x y z` in choice +DOES matter. If y matches everything that z matches, z will never succeed. 
diff --git a/src/core/peg.c b/src/core/peg.c index 315c7f58..904befa0 100644 --- a/src/core/peg.c +++ b/src/core/peg.c @@ -455,17 +455,35 @@ static uint32_t emit_constant(Builder *b, Janet c) { return cindex; } +/* Reserve space in bytecode for a rule. When a special emits a rule, + * it must place that rule immediately on the bytecode stack. This lets + * the compiler know where the rule is going to be before it is complete, + * allowing recursive rules. */ +typedef struct { + Builder *builder; + uint32_t index; + int32_t size; +} Reserve; + +static Reserve reserve(Builder *b, int32_t size) { + Reserve r; + r.index = janet_v_count(b->bytecode); + r.builder = b; + r.size = size; + for (int32_t i = 0; i < size; i++) + janet_v_push(b->bytecode, 0); + return r; +} + /* Emit a rule in the builder. Returns the index of the new rule */ -static uint32_t emit_rule(Builder *b, uint32_t op, int32_t n, const uint32_t *body) { - uint32_t next_rule = janet_v_count(b->bytecode); - janet_v_push(b->bytecode, op); - for (int32_t i = 0; i < n; i++) - janet_v_push(b->bytecode, body[i]); - return next_rule; +static void emit_rule(Reserve r, int32_t op, int32_t n, const uint32_t *body) { + janet_assert(r.size == n + 1, "bad reserve"); + r.builder->bytecode[r.index] = op; + memcpy(r.builder->bytecode + r.index + 1, body, n * sizeof(uint32_t)); } /* For RULE_LITERAL */ -static uint32_t emit_bytes(Builder *b, uint32_t op, int32_t len, const uint8_t *bytes) { +static void emit_bytes(Builder *b, uint32_t op, int32_t len, const uint8_t *bytes) { uint32_t next_rule = janet_v_count(b->bytecode); janet_v_push(b->bytecode, op); janet_v_push(b->bytecode, len); @@ -473,20 +491,19 @@ static uint32_t emit_bytes(Builder *b, uint32_t op, int32_t len, const uint8_t * for (int32_t i = 0; i < words; i++) janet_v_push(b->bytecode, 0); memcpy(b->bytecode + next_rule + 2, bytes, len); - return next_rule; } /* For fixed arity rules of arities 1, 2, and 3 */ -static uint32_t emit_1(Builder *b, uint32_t op, 
uint32_t arg) { - return emit_rule(b, op, 1, &arg); +static void emit_1(Reserve r, uint32_t op, uint32_t arg) { + return emit_rule(r, op, 1, &arg); } -static uint32_t emit_2(Builder *b, uint32_t op, uint32_t arg1, uint32_t arg2) { +static void emit_2(Reserve r, uint32_t op, uint32_t arg1, uint32_t arg2) { uint32_t arr[2] = {arg1, arg2}; - return emit_rule(b, op, 2, arr); + return emit_rule(r, op, 2, arr); } -static uint32_t emit_3(Builder *b, uint32_t op, uint32_t arg1, uint32_t arg2, uint32_t arg3) { +static void emit_3(Reserve r, uint32_t op, uint32_t arg1, uint32_t arg2, uint32_t arg3) { uint32_t arr[3] = {arg1, arg2, arg3}; - return emit_rule(b, op, 3, arr); + return emit_rule(r, op, 3, arr); } /* @@ -560,43 +577,47 @@ static void bitmap_set(uint32_t *bitmap, uint8_t c) { bitmap[c >> 5] |= ((uint32_t)1) << (c & 0x1F); } -static uint32_t spec_range(Builder *b, int32_t argc, const Janet *argv) { +static void spec_range(Builder *b, int32_t argc, const Janet *argv) { peg_arity(b, argc, 1, -1); if (argc == 1) { + Reserve r = reserve(b, 2); const uint8_t *str = peg_getrange(b, argv[0]); uint32_t arg = str[0] | (str[1] << 16); - return emit_1(b, RULE_RANGE, arg); + emit_1(r, RULE_RANGE, arg); } else { /* Compile as a set */ + Reserve r = reserve(b, 9); uint32_t bitmap[8] = {0}; for (int32_t i = 0; i < argc; i++) { const uint8_t *str = peg_getrange(b, argv[i]); for (uint32_t c = str[0]; c <= str[1]; c++) bitmap_set(bitmap, c); } - return emit_rule(b, RULE_SET, 8, bitmap); + emit_rule(r, RULE_SET, 8, bitmap); } } -static uint32_t spec_set(Builder *b, int32_t argc, const Janet *argv) { +static void spec_set(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 1); + Reserve r = reserve(b, 9); const uint8_t *str = peg_getset(b, argv[0]); uint32_t bitmap[8] = {0}; for (int32_t i = 0; i < janet_string_length(str); i++) bitmap_set(bitmap, str[i]); - return emit_rule(b, RULE_SET, 8, bitmap); + emit_rule(r, RULE_SET, 8, bitmap); } -static uint32_t 
spec_look(Builder *b, int32_t argc, const Janet *argv) { +static void spec_look(Builder *b, int32_t argc, const Janet *argv) { peg_arity(b, argc, 1, 2); + Reserve r = reserve(b, 3); int32_t rulearg = argc == 2 ? 1 : 0; int32_t offset = argc == 2 ? peg_getinteger(b, argv[0]) : 0; uint32_t subrule = compile1(b, argv[rulearg]); - return emit_2(b, RULE_LOOK, (uint32_t) offset, subrule); + emit_2(r, RULE_LOOK, (uint32_t) offset, subrule); } /* Rule of the form [len, rules...] */ -static uint32_t spec_variadic(Builder *b, int32_t argc, const Janet *argv, uint32_t op) { +static void spec_variadic(Builder *b, int32_t argc, const Janet *argv, uint32_t op) { uint32_t rule = janet_v_count(b->bytecode); janet_v_push(b->bytecode, op); janet_v_push(b->bytecode, argc); @@ -606,128 +627,138 @@ static uint32_t spec_variadic(Builder *b, int32_t argc, const Janet *argv, uint3 uint32_t rulei = compile1(b, argv[i]); b->bytecode[rule + 2 + i] = rulei; } - return rule; } -static uint32_t spec_choice(Builder *b, int32_t argc, const Janet *argv) { - return spec_variadic(b, argc, argv, RULE_CHOICE); +static void spec_choice(Builder *b, int32_t argc, const Janet *argv) { + spec_variadic(b, argc, argv, RULE_CHOICE); } -static uint32_t spec_sequence(Builder *b, int32_t argc, const Janet *argv) { - return spec_variadic(b, argc, argv, RULE_SEQUENCE); +static void spec_sequence(Builder *b, int32_t argc, const Janet *argv) { + spec_variadic(b, argc, argv, RULE_SEQUENCE); } -static uint32_t spec_ifnot(Builder *b, int32_t argc, const Janet *argv) { +static void spec_ifnot(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 2); + Reserve r = reserve(b, 3); uint32_t rule_a = compile1(b, argv[0]); uint32_t rule_b = compile1(b, argv[1]); - return emit_2(b, RULE_IFNOT, rule_a, rule_b); + emit_2(r, RULE_IFNOT, rule_a, rule_b); } /* Rule of the form [rule] */ -static uint32_t spec_onerule(Builder *b, int32_t argc, const Janet *argv, uint32_t op) { +static void spec_onerule(Builder *b, 
int32_t argc, const Janet *argv, uint32_t op) { peg_fixarity(b, argc, 1); + Reserve r = reserve(b, 2); uint32_t rule = compile1(b, argv[0]); - return emit_1(b, op, rule); + emit_1(r, op, rule); } -static uint32_t spec_not(Builder *b, int32_t argc, const Janet *argv) { - return spec_onerule(b, argc, argv, RULE_NOT); +static void spec_not(Builder *b, int32_t argc, const Janet *argv) { + spec_onerule(b, argc, argv, RULE_NOT); } -static uint32_t spec_capture(Builder *b, int32_t argc, const Janet *argv) { - return spec_onerule(b, argc, argv, RULE_CAPTURE); +static void spec_capture(Builder *b, int32_t argc, const Janet *argv) { + spec_onerule(b, argc, argv, RULE_CAPTURE); } -static uint32_t spec_substitute(Builder *b, int32_t argc, const Janet *argv) { - return spec_onerule(b, argc, argv, RULE_SUBSTITUTE); +static void spec_substitute(Builder *b, int32_t argc, const Janet *argv) { + spec_onerule(b, argc, argv, RULE_SUBSTITUTE); } -static uint32_t spec_group(Builder *b, int32_t argc, const Janet *argv) { - return spec_onerule(b, argc, argv, RULE_GROUP); +static void spec_group(Builder *b, int32_t argc, const Janet *argv) { + spec_onerule(b, argc, argv, RULE_GROUP); } -static uint32_t spec_exponent(Builder *b, int32_t argc, const Janet *argv) { +static void spec_exponent(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 2); + Reserve r = reserve(b, 4); int32_t n = peg_getinteger(b, argv[1]); uint32_t subrule = compile1(b, argv[0]); if (n < 0) { - return emit_3(b, RULE_BETWEEN, 0, -n, subrule); + emit_3(r, RULE_BETWEEN, 0, -n, subrule); } else { - return emit_3(b, RULE_BETWEEN, n, UINT32_MAX, subrule); + emit_3(r, RULE_BETWEEN, n, UINT32_MAX, subrule); } } -static uint32_t spec_between(Builder *b, int32_t argc, const Janet *argv) { +static void spec_between(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 3); + Reserve r = reserve(b, 4); int32_t lo = peg_getnat(b, argv[0]); int32_t hi = peg_getnat(b, argv[1]); uint32_t subrule = 
compile1(b, argv[2]); - return emit_3(b, RULE_BETWEEN, lo, hi, subrule); + emit_3(r, RULE_BETWEEN, lo, hi, subrule); } -static uint32_t spec_position(Builder *b, int32_t argc, const Janet *argv) { +static void spec_position(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 0); + Reserve r = reserve(b, 1); (void) argv; - return emit_rule(b, RULE_POSITION, 0, NULL); + emit_rule(r, RULE_POSITION, 0, NULL); } -static uint32_t spec_reference(Builder *b, int32_t argc, const Janet *argv) { +static void spec_reference(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 1); + Reserve r = reserve(b, 2); int32_t index = peg_getinteger(b, argv[0]); if (index < 0) { - return emit_1(b, RULE_BACKINDEX, -index); + emit_1(r, RULE_BACKINDEX, -index); } else { - return emit_1(b, RULE_REPINDEX, index); + emit_1(r, RULE_REPINDEX, index); } } -static uint32_t spec_argument(Builder *b, int32_t argc, const Janet *argv) { +static void spec_argument(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 1); - int32_t index = peg_getinteger(b, argv[0]); - if (index < 0) - peg_panicf(b, "argument index must be natural number, got %v", argv[0]); - return emit_1(b, RULE_ARGUMENT, index); + Reserve r = reserve(b, 2); + int32_t index = peg_getnat(b, argv[0]); + emit_1(r, RULE_ARGUMENT, index); } -static uint32_t spec_constant(Builder *b, int32_t argc, const Janet *argv) { +static void spec_constant(Builder *b, int32_t argc, const Janet *argv) { janet_fixarity(argc, 1); - return emit_1(b, RULE_CONSTANT, emit_constant(b, argv[0])); + Reserve r = reserve(b, 2); + emit_1(r, RULE_CONSTANT, emit_constant(b, argv[0])); } -static uint32_t spec_replace(Builder *b, int32_t argc, const Janet *argv) { +static void spec_replace(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 2); + Reserve r = reserve(b, 3); uint32_t subrule = compile1(b, argv[0]); uint32_t constant = emit_constant(b, argv[1]); - return emit_2(b, RULE_REPLACE, subrule, 
constant); + emit_2(r, RULE_REPLACE, subrule, constant); } /* For some and any, really just short-hand for (^ rule n) */ -static uint32_t spec_repeater(Builder *b, int32_t argc, const Janet *argv, int32_t min) { +static void spec_repeater(Builder *b, int32_t argc, const Janet *argv, int32_t min) { peg_fixarity(b, argc, 1); + Reserve r = reserve(b, 4); uint32_t subrule = compile1(b, argv[0]); - return emit_3(b, RULE_BETWEEN, min, UINT32_MAX, subrule); + emit_3(r, RULE_BETWEEN, min, UINT32_MAX, subrule); } -static uint32_t spec_some(Builder *b, int32_t argc, const Janet *argv) { - return spec_repeater(b, argc, argv, 1); +static void spec_some(Builder *b, int32_t argc, const Janet *argv) { + spec_repeater(b, argc, argv, 1); } -static uint32_t spec_any(Builder *b, int32_t argc, const Janet *argv) { - return spec_repeater(b, argc, argv, 0); +static void spec_any(Builder *b, int32_t argc, const Janet *argv) { + spec_repeater(b, argc, argv, 0); } -static uint32_t spec_atleast(Builder *b, int32_t argc, const Janet *argv) { +static void spec_atleast(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 2); + Reserve r = reserve(b, 4); int32_t n = peg_getnat(b, argv[0]); uint32_t subrule = compile1(b, argv[1]); - return emit_3(b, RULE_BETWEEN, n, UINT32_MAX, subrule); + emit_3(r, RULE_BETWEEN, n, UINT32_MAX, subrule); } -static uint32_t spec_atmost(Builder *b, int32_t argc, const Janet *argv) { +static void spec_atmost(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 2); + Reserve r = reserve(b, 4); int32_t n = peg_getnat(b, argv[0]); uint32_t subrule = compile1(b, argv[1]); - return emit_3(b, RULE_BETWEEN, 0, n, subrule); + emit_3(r, RULE_BETWEEN, 0, n, subrule); } -static uint32_t spec_matchtime(Builder *b, int32_t argc, const Janet *argv) { +static void spec_matchtime(Builder *b, int32_t argc, const Janet *argv) { peg_fixarity(b, argc, 2); + Reserve r = reserve(b, 3); uint32_t subrule = compile1(b, argv[0]); Janet fun = argv[1]; if 
(!janet_checktype(fun, JANET_FUNCTION) && @@ -735,11 +766,11 @@ static uint32_t spec_matchtime(Builder *b, int32_t argc, const Janet *argv) { peg_panicf(b, "expected function|cfunction, got %v", fun); } uint32_t cindex = emit_constant(b, fun); - return emit_2(b, RULE_MATCHTIME, subrule, cindex); + emit_2(r, RULE_MATCHTIME, subrule, cindex); } /* Special compiler form */ -typedef uint32_t (*Special)(Builder *b, int32_t argc, const Janet *argv); +typedef void (*Special)(Builder *b, int32_t argc, const Janet *argv); typedef struct { const char *name; Special special; @@ -797,7 +828,11 @@ static uint32_t compile1(Builder *b, Janet peg) { } /* The final rule to return */ - uint32_t rule; + uint32_t rule = janet_v_count(b->bytecode); + if (!janet_checktype(peg, JANET_KEYWORD) && + !janet_checktype(peg, JANET_STRUCT)) { + janet_table_put(b->memoized, peg, janet_wrap_number(rule)); + } switch (janet_type(peg)) { default: @@ -806,10 +841,11 @@ static uint32_t compile1(Builder *b, Janet peg) { case JANET_NUMBER: { int32_t n = peg_getinteger(b, peg); + Reserve r = reserve(b, 2); if (n < 0) { - rule = emit_1(b, RULE_NOTNCHAR, -n); + emit_1(r, RULE_NOTNCHAR, -n); } else { - rule = emit_1(b, RULE_NCHAR, n); + emit_1(r, RULE_NCHAR, n); } break; } @@ -817,7 +853,7 @@ static uint32_t compile1(Builder *b, Janet peg) { { const uint8_t *str = janet_unwrap_string(peg); int32_t len = janet_string_length(str); - rule = emit_bytes(b, RULE_LITERAL, len, str); + emit_bytes(b, RULE_LITERAL, len, str); break; } case JANET_KEYWORD: @@ -826,11 +862,7 @@ static uint32_t compile1(Builder *b, Janet peg) { if (janet_checktype(check, JANET_NIL)) peg_panicf(b, "unknown rule"); rule = compile1(b, check); - /* We don't want to memoize references, as they will become invalid - * if we go out of scope */ - b->depth++; - b->form = old_form; - return rule; + break; } case JANET_STRUCT: { @@ -859,18 +891,14 @@ static uint32_t compile1(Builder *b, Janet peg) { sym); if (!sp) peg_panicf(b, "unknown special 
%S", sym); - rule = sp->special(b, len - 1, tup + 1); + sp->special(b, len - 1, tup + 1); break; } } - /* Add rule to memoized table */ - janet_table_put(b->memoized, peg, janet_wrap_number(rule)); - /* Increase depth again */ b->depth++; b->form = old_form; - return rule; } diff --git a/test/suite3.janet b/test/suite3.janet index 3aa09b0a..985c568e 100644 --- a/test/suite3.janet +++ b/test/suite3.janet @@ -278,4 +278,13 @@ (check-deep scanner "-1.3e-7" @[-1.3e-7]) (check-deep scanner "123A" nil) +# Recursive grammars + +(def g '{:main (+ (* "a" :main "b") "c")}) + +(check-match g "c" true) +(check-match g "acb" true) +(check-match g "aacbb" true) +(check-match g "aadbb" false) + (end-suite)