1
0
mirror of https://github.com/janet-lang/janet synced 2025-10-31 07:33:01 +00:00

More work on peg. Disable indexed backrefs and replace substitution

with accumulation.
This commit is contained in:
Calvin Rose
2019-01-16 21:11:55 -05:00
parent 4b8edef58c
commit 612a245961
5 changed files with 130 additions and 146 deletions

View File

@@ -44,11 +44,10 @@ typedef enum {
RULE_NOT, /* [rule] */
RULE_BETWEEN, /* [lo, hi, rule] */
RULE_CAPTURE, /* [rule] */
RULE_POSITION, /* [] */
RULE_POSITION, /* [tag] */
RULE_ARGUMENT, /* [argument-index] */
RULE_REPINDEX, /* [capture-index] */
RULE_CONSTANT, /* [constant] */
RULE_SUBSTITUTE, /* [rule] */
RULE_ACCUMULATE, /* [rule] */
RULE_GROUP, /* [rule] */
RULE_REPLACE, /* [rule, constant] */
RULE_MATCHTIME, /* [rule, constant] */
@@ -59,7 +58,6 @@ typedef enum {
typedef struct {
const uint8_t *text_start;
const uint8_t *text_end;
const uint8_t *subst_end;
const uint32_t *bytecode;
const Janet *constants;
JanetArray *captures;
@@ -69,7 +67,7 @@ typedef struct {
int32_t depth;
enum {
PEG_MODE_NORMAL,
PEG_MODE_SUBSTITUTE,
PEG_MODE_ACCUMULATE,
PEG_MODE_NOCAPTURE
} mode;
} PegState;
@@ -78,7 +76,6 @@ typedef struct {
* to save state at branches, and then reload
* if one branch fails and try a new branch. */
typedef struct {
const uint8_t *subst_end;
int32_t cap;
int32_t scratch;
} CapState;
@@ -86,7 +83,6 @@ typedef struct {
/* Save the current capture state */
static CapState cap_save(PegState *s) {
CapState cs;
cs.subst_end = s->subst_end;
cs.scratch = s->scratch->count;
cs.cap = s->captures->count;
return cs;
@@ -94,24 +90,16 @@ static CapState cap_save(PegState *s) {
/* Load a saved capture state in the case of failure */
static void cap_load(PegState *s, CapState cs) {
s->subst_end = cs.subst_end;
s->scratch->count = cs.scratch;
s->captures->count = cs.cap;
}
/* Add a capture */
static void pushcap(PegState *s,
Janet capture,
const uint8_t *text,
const uint8_t *result) {
if (s->mode == PEG_MODE_SUBSTITUTE) {
janet_buffer_push_bytes(s->scratch, s->subst_end,
(int32_t)(text - s->subst_end));
static void pushcap(PegState *s, Janet capture) {
if (s->mode == PEG_MODE_ACCUMULATE)
janet_to_string_b(s->scratch, capture);
s->subst_end = result;
} else if (s->mode == PEG_MODE_NORMAL) {
if (s->mode == PEG_MODE_NORMAL)
janet_array_push(s->captures, capture);
}
}
/* Prevent stack overflow */
@@ -120,7 +108,14 @@ static void pushcap(PegState *s,
} while (0)
#define up1(s) ((s)->depth++)
/* Evaluate a peg rule */
/* Evaluate a peg rule
* Pre-conditions: s is in a valid state
* Post-conditions: If there is a match, returns a pointer to the next text.
* All captures on the capture stack are valid. If there is no match,
* returns NULL. Extra captures from successful child expressions can be
* left on the capture stack. If s->mode was PEG_MODE_NOCAPTURE, captures MUST
* not be changed, though.
*/
static const uint8_t *peg_rule(
PegState *s,
const uint32_t *rule,
@@ -130,22 +125,26 @@ tail:
default:
janet_panic("unexpected opcode");
return NULL;
case RULE_LITERAL:
{
uint32_t len = rule[1];
if (text + len > s->text_end) return NULL;
return memcmp(text, rule + 2, len) ? NULL : text + len;
}
case RULE_NCHAR:
{
uint32_t n = rule[1];
return (text + n > s->text_end) ? NULL : text + n;
}
case RULE_NOTNCHAR:
{
uint32_t n = rule[1];
return (text + n > s->text_end) ? text : NULL;
}
case RULE_RANGE:
{
uint8_t lo = rule[1] & 0xFF;
@@ -156,6 +155,7 @@ tail:
? text + 1
: NULL;
}
case RULE_SET:
{
uint32_t word = rule[1 + (text[0] >> 5)];
@@ -164,6 +164,7 @@ tail:
? text + 1
: NULL;
}
case RULE_LOOK:
{
text += ((int32_t *)rule)[1];
@@ -176,6 +177,7 @@ tail:
s->mode = oldmode;
return result ? text : NULL;
}
case RULE_CHOICE:
{
uint32_t len = rule[1];
@@ -195,6 +197,7 @@ tail:
rule = s->bytecode + args[len - 1];
goto tail;
}
case RULE_SEQUENCE:
{
uint32_t len = rule[1];
@@ -208,6 +211,7 @@ tail:
rule = s->bytecode + args[len - 1];
goto tail;
}
case RULE_IF:
case RULE_IFNOT:
{
@@ -223,6 +227,7 @@ tail:
rule = rule_b;
goto tail;
}
case RULE_NOT:
{
const uint32_t *rule_a = s->bytecode + rule[1];
@@ -234,6 +239,7 @@ tail:
s->mode = oldmode;
return (result) ? NULL : text;
}
case RULE_BETWEEN:
{
uint32_t lo = rule[1];
@@ -243,7 +249,13 @@ tail:
const uint8_t *next_text;
CapState cs = cap_save(s);
down1(s);
while (captured < hi && (next_text = peg_rule(s, rule_a, text))) {
while (captured < hi) {
CapState cs2 = cap_save(s);
next_text = peg_rule(s, rule_a, text);
if (!next_text || next_text == text) {
cap_load(s, cs2);
break;
}
captured++;
text = next_text;
}
@@ -254,133 +266,100 @@ tail:
}
return text;
}
case RULE_POSITION:
{
pushcap(s, janet_wrap_number((double)(text - s->text_start)), text, text);
pushcap(s, janet_wrap_number((double)(text - s->text_start)));
return text;
}
case RULE_ARGUMENT:
{
int32_t index = ((int32_t *)rule)[1];
Janet capture = (index >= s->extrac) ? janet_wrap_nil() : s->extrav[index];
pushcap(s, capture, text, text);
return text;
}
case RULE_REPINDEX:
{
int32_t index = ((int32_t *)rule)[1];
if (index < 0) index += s->captures->count;
if (index >= s->captures->count || index < 0) return NULL;
Janet capture = s->captures->data[index];
pushcap(s, capture, text, text);
pushcap(s, capture);
return text;
}
case RULE_CONSTANT:
{
pushcap(s, s->constants[rule[1]], text, text);
pushcap(s, s->constants[rule[1]]);
return text;
}
case RULE_CAPTURE:
{
int oldmode = s->mode;
if (oldmode == PEG_MODE_NOCAPTURE) {
if (s->mode == PEG_MODE_NOCAPTURE) {
rule = s->bytecode + rule[1];
goto tail;
}
if (oldmode == PEG_MODE_SUBSTITUTE) s->mode = PEG_MODE_NOCAPTURE;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
if (oldmode != PEG_MODE_SUBSTITUTE)
pushcap(s, janet_stringv(text, (int32_t)(result - text)), text, result);
/* Specialized pushcap - avoid intermediate string creation */
if (s->mode == PEG_MODE_ACCUMULATE) {
janet_buffer_push_bytes(s->scratch, text, (int32_t)(result - text));
} else {
janet_array_push(s->captures, janet_stringv(text, (int32_t)(result - text)));
}
return result;
}
case RULE_SUBSTITUTE:
case RULE_GROUP:
case RULE_REPLACE:
case RULE_ACCUMULATE:
{
/* In no-capture mode, all captures simply become their matching pattern */
int oldmode = s->mode;
if (oldmode == PEG_MODE_NOCAPTURE) {
/* No capture mode, skip captures. Accumulate inside accumulate also does nothing. */
if (oldmode != PEG_MODE_NORMAL) {
rule = s->bytecode + rule[1];
goto tail;
}
/* Save previous state. Will use this to reload state before
* pushing grammar. Each of these rules pushes exactly 1 new
* capture, regardless of the sub rule. */
CapState cs = cap_save(s);
/* Set sub mode as needed. Modes affect how captures are recorded (pushed to stack,
* pushed to byte buffer, or ignored) */
if (rule[0] == RULE_GROUP) s->mode = PEG_MODE_NORMAL;
if (rule[0] == RULE_REPLACE) s->mode = PEG_MODE_NORMAL;
if (rule[0] == RULE_SUBSTITUTE) {
s->mode = PEG_MODE_SUBSTITUTE;
s->subst_end = text;
}
s->mode = PEG_MODE_ACCUMULATE;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
/* The replacement capture */
Janet cap;
/* Figure out what to push based on opcode */
if (rule[0] == RULE_GROUP) {
int32_t num_sub_captures = s->captures->count - cs.cap;
JanetArray *sub_captures = janet_array(num_sub_captures);
memcpy(sub_captures->data,
s->captures->data + cs.cap,
sizeof(Janet) * num_sub_captures);
sub_captures->count = num_sub_captures;
cap = janet_wrap_array(sub_captures);
} else if (rule[0] == RULE_SUBSTITUTE) {
janet_buffer_push_bytes(s->scratch, s->subst_end,
(int32_t)(result - s->subst_end));
cap = janet_stringv(s->scratch->data + cs.scratch,
s->scratch->count - cs.scratch);
} else { /* RULE_REPLACE */
Janet constant = s->constants[rule[2]];
switch (janet_type(constant)) {
default:
cap = constant;
break;
case JANET_STRUCT:
cap = janet_struct_get(janet_unwrap_struct(constant),
s->captures->data[s->captures->count - 1]);
break;
case JANET_TABLE:
cap = janet_table_get(janet_unwrap_table(constant),
s->captures->data[s->captures->count - 1]);
break;
case JANET_CFUNCTION:
cap = janet_unwrap_cfunction(constant)(s->captures->count - cs.cap,
s->captures->data + cs.cap);
break;
case JANET_FUNCTION:
cap = janet_call(janet_unwrap_function(constant),
s->captures->count - cs.cap,
s->captures->data + cs.cap);
break;
}
}
/* Reset old state and then push capture */
Janet cap = janet_stringv(s->scratch->data + cs.scratch, s->scratch->count - cs.scratch);
cap_load(s, cs);
pushcap(s, cap, text, result);
pushcap(s, cap);
return result;
}
case RULE_GROUP:
{
int oldmode = s->mode;
if (oldmode == PEG_MODE_NOCAPTURE) {
rule = s->bytecode + rule[1];
goto tail;
}
CapState cs = cap_save(s);
s->mode = PEG_MODE_NORMAL;
down1(s);
const uint8_t *result = peg_rule(s, s->bytecode + rule[1], text);
up1(s);
s->mode = oldmode;
if (!result) return NULL;
int32_t num_sub_captures = s->captures->count - cs.cap;
JanetArray *sub_captures = janet_array(num_sub_captures);
memcpy(sub_captures->data,
s->captures->data + cs.cap,
sizeof(Janet) * num_sub_captures);
sub_captures->count = num_sub_captures;
cap_load(s, cs);
pushcap(s, janet_wrap_array(sub_captures));
return result;
}
case RULE_REPLACE:
case RULE_MATCHTIME:
{
int oldmode = s->mode;
if (rule[0] == RULE_REPLACE && oldmode == PEG_MODE_NOCAPTURE) {
rule = s->bytecode + rule[1];
goto tail;
}
CapState cs = cap_save(s);
s->mode = PEG_MODE_NORMAL;
down1(s);
@@ -389,28 +368,36 @@ tail:
s->mode = oldmode;
if (!result) return NULL;
/* Now check captures with provided function */
int32_t argc = s->captures->count - cs.cap;
Janet *argv = s->captures->data + cs.cap;
Janet fval = s->constants[rule[2]];
Janet cap;
if (janet_checktype(fval, JANET_FUNCTION)) {
cap = janet_call(janet_unwrap_function(fval), argc, argv);
} else {
JanetCFunction cfun = janet_unwrap_cfunction(fval);
cap = cfun(argc, argv);
Janet constant = s->constants[rule[2]];
switch (janet_type(constant)) {
default:
cap = constant;
break;
case JANET_STRUCT:
cap = janet_struct_get(janet_unwrap_struct(constant),
s->captures->data[s->captures->count - 1]);
break;
case JANET_TABLE:
cap = janet_table_get(janet_unwrap_table(constant),
s->captures->data[s->captures->count - 1]);
break;
case JANET_CFUNCTION:
cap = janet_unwrap_cfunction(constant)(s->captures->count - cs.cap,
s->captures->data + cs.cap);
break;
case JANET_FUNCTION:
cap = janet_call(janet_unwrap_function(constant),
s->captures->count - cs.cap,
s->captures->data + cs.cap);
break;
}
cap_load(s, cs);
/* Capture failed */
if (!janet_truthy(cap)) return NULL;
/* Capture worked, so use new capture */
pushcap(s, cap, text, result);
if (rule[0] == RULE_MATCHTIME && !janet_truthy(cap)) return NULL;
pushcap(s, cap);
return result;
}
case RULE_ERROR:
{
int oldmode = s->mode;
@@ -672,8 +659,8 @@ static void spec_not(Builder *b, int32_t argc, const Janet *argv) {
static void spec_capture(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_CAPTURE);
}
static void spec_substitute(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_SUBSTITUTE);
static void spec_accumulate(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_ACCUMULATE);
}
static void spec_group(Builder *b, int32_t argc, const Janet *argv) {
spec_onerule(b, argc, argv, RULE_GROUP);
@@ -698,13 +685,6 @@ static void spec_position(Builder *b, int32_t argc, const Janet *argv) {
emit_rule(r, RULE_POSITION, 0, NULL);
}
static void spec_reference(Builder *b, int32_t argc, const Janet *argv) {
peg_fixarity(b, argc, 1);
Reserve r = reserve(b, 2);
int32_t index = peg_getinteger(b, argv[0]);
emit_1(r, RULE_REPINDEX, index);
}
static void spec_argument(Builder *b, int32_t argc, const Janet *argv) {
peg_fixarity(b, argc, 1);
Reserve r = reserve(b, 2);
@@ -787,18 +767,18 @@ typedef struct {
static const SpecialPair specials[] = {
{"!", spec_not},
{"$", spec_position},
{"%", spec_substitute},
{"%", spec_accumulate},
{"*", spec_sequence},
{"+", spec_choice},
{"/", spec_replace},
{"<-", spec_capture},
{">", spec_look},
{"?", spec_opt},
{"accumulate", spec_accumulate},
{"any", spec_any},
{"argument", spec_argument},
{"at-least", spec_atleast},
{"at-most", spec_atmost},
{"backref", spec_reference},
{"between", spec_between},
{"capture", spec_capture},
{"choice", spec_choice},
@@ -817,7 +797,6 @@ static const SpecialPair specials[] = {
{"sequence", spec_sequence},
{"set", spec_set},
{"some", spec_some},
{"substitute", spec_substitute},
};
/* Compile a janet value into a rule and return the rule index. */