janet/src/core/parse.c

1323 lines
43 KiB
C

/*
* Copyright (c) 2023 Calvin Rose
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef JANET_AMALG
#include "features.h"
#include <janet.h>
#include "util.h"
#endif
#define JANET_PARSER_DEAD 0x1
#define JANET_PARSER_GENERATED_ERROR 0x2
/* Check if a character is whitespace */
static int is_whitespace(uint8_t c) {
return c == ' '
|| c == '\t'
|| c == '\n'
|| c == '\r'
|| c == '\0'
|| c == '\v'
|| c == '\f';
}
/* Code generated by tools/symcharsgen.c.
* The table contains 256 bits, where each bit is 1
* if the corresponding ascii code is a symbol char, and 0
* if not. The upper characters are also considered symbol
* chars and are then checked for utf-8 compliance. */
static const uint32_t symchars[8] = {
0x00000000, 0xf7ffec72, 0xc7ffffff, 0x07fffffe,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
/* Check if a character is a valid symbol character
* symbol chars are A-Z, a-z, 0-9, or one of !$&*+-./:<=>@\^_| */
int janet_is_symbol_char(uint8_t c) {
return symchars[c >> 5] & ((uint32_t)1 << (c & 0x1F));
}
/* Validate some utf8. Useful for identifiers. Only validates
* the encoding, does not check for valid code points (they
* are less well defined than the encoding). */
int janet_valid_utf8(const uint8_t *str, int32_t len) {
int32_t i = 0;
int32_t j;
while (i < len) {
int32_t nexti;
uint8_t c = str[i];
/* Check the number of bytes in code point */
if (c < 0x80) nexti = i + 1;
else if ((c >> 5) == 0x06) nexti = i + 2;
else if ((c >> 4) == 0x0E) nexti = i + 3;
else if ((c >> 3) == 0x1E) nexti = i + 4;
/* Don't allow 5 or 6 byte code points */
else return 0;
/* No overflow */
if (nexti > len) return 0;
/* Ensure trailing bytes are well formed (10XX XXXX) */
for (j = i + 1; j < nexti; j++) {
if ((str[j] >> 6) != 2) return 0;
}
/* Check for overlong encoding */
if ((nexti == i + 2) && str[i] < 0xC2) return 0;
if ((str[i] == 0xE0) && str[i + 1] < 0xA0) return 0;
if ((str[i] == 0xF0) && str[i + 1] < 0x90) return 0;
i = nexti;
}
return 1;
}
/* Get hex digit from a letter */
static int to_hex(uint8_t c) {
if (c >= '0' && c <= '9') {
return c - '0';
} else if (c >= 'A' && c <= 'F') {
return 10 + c - 'A';
} else if (c >= 'a' && c <= 'f') {
return 10 + c - 'a';
} else {
return -1;
}
}
typedef int (*Consumer)(JanetParser *p, JanetParseState *state, uint8_t c);
struct JanetParseState {
int32_t counter;
int32_t argn;
int flags;
size_t line;
size_t column;
Consumer consumer;
};
/* Define a stack on the main parser struct */
#define DEF_PARSER_STACK(NAME, T, STACK, STACKCOUNT, STACKCAP) \
static void NAME(JanetParser *p, T x) { \
size_t oldcount = p->STACKCOUNT; \
size_t newcount = oldcount + 1; \
if (newcount > p->STACKCAP) { \
T *next; \
size_t newcap = 2 * newcount; \
next = janet_realloc(p->STACK, sizeof(T) * newcap); \
if (NULL == next) { \
JANET_OUT_OF_MEMORY; \
} \
p->STACK = next; \
p->STACKCAP = newcap; \
} \
p->STACK[oldcount] = x; \
p->STACKCOUNT = newcount; \
}
DEF_PARSER_STACK(push_buf, uint8_t, buf, bufcount, bufcap)
DEF_PARSER_STACK(push_arg, Janet, args, argcount, argcap)
DEF_PARSER_STACK(_pushstate, JanetParseState, states, statecount, statecap)
#undef DEF_PARSER_STACK
#define PFLAG_CONTAINER 0x100
#define PFLAG_BUFFER 0x200
#define PFLAG_PARENS 0x400
#define PFLAG_SQRBRACKETS 0x800
#define PFLAG_CURLYBRACKETS 0x1000
#define PFLAG_STRING 0x2000
#define PFLAG_LONGSTRING 0x4000
#define PFLAG_READERMAC 0x8000
#define PFLAG_ATSYM 0x10000
#define PFLAG_COMMENT 0x20000
#define PFLAG_TOKEN 0x40000
static void pushstate(JanetParser *p, Consumer consumer, int flags) {
JanetParseState s;
s.counter = 0;
s.argn = 0;
s.flags = flags;
s.consumer = consumer;
s.line = p->line;
s.column = p->column;
_pushstate(p, s);
}
static void popstate(JanetParser *p, Janet val) {
for (;;) {
JanetParseState top = p->states[--p->statecount];
JanetParseState *newtop = p->states + p->statecount - 1;
/* Source mapping info */
if (janet_checktype(val, JANET_TUPLE)) {
janet_tuple_sm_line(janet_unwrap_tuple(val)) = (int32_t) top.line;
janet_tuple_sm_column(janet_unwrap_tuple(val)) = (int32_t) top.column;
}
if (newtop->flags & PFLAG_CONTAINER) {
newtop->argn++;
/* Keep track of number of values in the root state */
if (p->statecount == 1) {
p->pending++;
/* Root items are always wrapped in a tuple for source map info. */
const Janet *tup = janet_tuple_n(&val, 1);
janet_tuple_sm_line(tup) = (int32_t) top.line;
janet_tuple_sm_column(tup) = (int32_t) top.column;
val = janet_wrap_tuple(tup);
}
push_arg(p, val);
return;
} else if (newtop->flags & PFLAG_READERMAC) {
Janet *t = janet_tuple_begin(2);
int c = newtop->flags & 0xFF;
const char *which =
(c == '\'') ? "quote" :
(c == ',') ? "unquote" :
(c == ';') ? "splice" :
(c == '|') ? "short-fn" :
(c == '~') ? "quasiquote" : "<unknown>";
t[0] = janet_csymbolv(which);
t[1] = val;
/* Quote source mapping info */
janet_tuple_sm_line(t) = (int32_t) newtop->line;
janet_tuple_sm_column(t) = (int32_t) newtop->column;
val = janet_wrap_tuple(janet_tuple_end(t));
} else {
return;
}
}
}
static void delim_error(JanetParser *parser, size_t stack_index, char c, const char *msg) {
JanetParseState *s = parser->states + stack_index;
JanetBuffer *buffer = janet_buffer(40);
if (msg) {
janet_buffer_push_cstring(buffer, msg);
}
if (c) {
janet_buffer_push_u8(buffer, c);
}
if (stack_index > 0) {
janet_buffer_push_cstring(buffer, ", ");
if (s->flags & PFLAG_PARENS) {
janet_buffer_push_u8(buffer, '(');
} else if (s->flags & PFLAG_SQRBRACKETS) {
janet_buffer_push_u8(buffer, '[');
} else if (s->flags & PFLAG_CURLYBRACKETS) {
janet_buffer_push_u8(buffer, '{');
} else if (s->flags & PFLAG_STRING) {
janet_buffer_push_u8(buffer, '"');
} else if (s->flags & PFLAG_LONGSTRING) {
int32_t i;
for (i = 0; i < s->argn; i++) {
janet_buffer_push_u8(buffer, '`');
}
}
janet_formatb(buffer, " opened at line %d, column %d", s->line, s->column);
}
parser->error = (const char *) janet_string(buffer->data, buffer->count);
parser->flag |= JANET_PARSER_GENERATED_ERROR;
}
static int checkescape(uint8_t c) {
switch (c) {
default:
return -1;
case 'x':
case 'u':
case 'U':
return 1;
case 'n':
return '\n';
case 't':
return '\t';
case 'r':
return '\r';
case '0':
return '\0';
case 'z':
return '\0';
case 'f':
return '\f';
case 'v':
return '\v';
case 'e':
return 27;
case '"':
return '"';
case '\\':
return '\\';
}
}
/* Forward declare */
static int stringchar(JanetParser *p, JanetParseState *state, uint8_t c);
static void write_codepoint(JanetParser *p, int32_t codepoint) {
if (codepoint <= 0x7F) {
push_buf(p, (uint8_t) codepoint);
} else if (codepoint <= 0x7FF) {
push_buf(p, (uint8_t)((codepoint >> 6) & 0x1F) | 0xC0);
push_buf(p, (uint8_t)((codepoint >> 0) & 0x3F) | 0x80);
} else if (codepoint <= 0xFFFF) {
push_buf(p, (uint8_t)((codepoint >> 12) & 0x0F) | 0xE0);
push_buf(p, (uint8_t)((codepoint >> 6) & 0x3F) | 0x80);
push_buf(p, (uint8_t)((codepoint >> 0) & 0x3F) | 0x80);
} else {
push_buf(p, (uint8_t)((codepoint >> 18) & 0x07) | 0xF0);
push_buf(p, (uint8_t)((codepoint >> 12) & 0x3F) | 0x80);
push_buf(p, (uint8_t)((codepoint >> 6) & 0x3F) | 0x80);
push_buf(p, (uint8_t)((codepoint >> 0) & 0x3F) | 0x80);
}
}
static int escapeh(JanetParser *p, JanetParseState *state, uint8_t c) {
int digit = to_hex(c);
if (digit < 0) {
p->error = "invalid hex digit in hex escape";
return 1;
}
state->argn = (state->argn << 4) + digit;
state->counter--;
if (!state->counter) {
push_buf(p, (uint8_t)(state->argn & 0xFF));
state->argn = 0;
state->consumer = stringchar;
}
return 1;
}
static int escapeu(JanetParser *p, JanetParseState *state, uint8_t c) {
int digit = to_hex(c);
if (digit < 0) {
p->error = "invalid hex digit in unicode escape";
return 1;
}
state->argn = (state->argn << 4) + digit;
state->counter--;
if (!state->counter) {
if (state->argn > 0x10FFFF) {
p->error = "invalid unicode codepoint";
return 1;
}
write_codepoint(p, state->argn);
state->argn = 0;
state->consumer = stringchar;
}
return 1;
}
static int escape1(JanetParser *p, JanetParseState *state, uint8_t c) {
int e = checkescape(c);
if (e < 0) {
p->error = "invalid string escape sequence";
return 1;
}
if (c == 'x') {
state->counter = 2;
state->argn = 0;
state->consumer = escapeh;
} else if (c == 'u' || c == 'U') {
state->counter = c == 'u' ? 4 : 6;
state->argn = 0;
state->consumer = escapeu;
} else {
push_buf(p, (uint8_t) e);
state->consumer = stringchar;
}
return 1;
}
static int stringend(JanetParser *p, JanetParseState *state) {
Janet ret;
uint8_t *bufstart = p->buf;
int32_t buflen = (int32_t) p->bufcount;
if (state->flags & PFLAG_LONGSTRING) {
/* Post process to remove leading whitespace */
JanetParseState top = p->states[p->statecount - 1];
int32_t indent_col = (int32_t) top.column - 1;
uint8_t *r = bufstart, *end = r + buflen;
/* Check if there are any characters before the start column -
* if so, do not reindent. */
int reindent = 1;
while (reindent && (r < end)) {
if (*r++ == '\n') {
for (int32_t j = 0; (r < end) && (*r != '\n') && (j < indent_col); j++, r++) {
if (*r != ' ') {
reindent = 0;
break;
}
}
}
}
/* Now reindent if able to, otherwise just drop leading newline. */
if (!reindent) {
if (buflen > 0 && bufstart[0] == '\n') {
buflen--;
bufstart++;
}
} else {
uint8_t *w = bufstart;
r = bufstart;
while (r < end) {
if (*r == '\n') {
if (r == bufstart) {
/* Skip leading newline */
r++;
} else {
*w++ = *r++;
}
for (int32_t j = 0; (r < end) && (*r != '\n') && (j < indent_col); j++, r++);
} else {
*w++ = *r++;
}
}
buflen = (int32_t)(w - bufstart);
}
/* Check for trailing newline character so we can remove it */
if (buflen > 0 && bufstart[buflen - 1] == '\n') {
buflen--;
}
}
if (state->flags & PFLAG_BUFFER) {
JanetBuffer *b = janet_buffer(buflen);
janet_buffer_push_bytes(b, bufstart, buflen);
ret = janet_wrap_buffer(b);
} else {
ret = janet_wrap_string(janet_string(bufstart, buflen));
}
p->bufcount = 0;
popstate(p, ret);
return 1;
}
static int stringchar(JanetParser *p, JanetParseState *state, uint8_t c) {
/* Enter escape */
if (c == '\\') {
state->consumer = escape1;
return 1;
}
/* String end */
if (c == '"') {
return stringend(p, state);
}
/* normal char */
if (c != '\n' && c != '\r')
push_buf(p, c);
return 1;
}
/* Check for string equality in the buffer */
static int check_str_const(const char *cstr, const uint8_t *str, int32_t len) {
int32_t index;
for (index = 0; index < len; index++) {
uint8_t c = str[index];
uint8_t k = ((const uint8_t *)cstr)[index];
if (c < k) return -1;
if (c > k) return 1;
if (k == '\0') break;
}
return (cstr[index] == '\0') ? 0 : -1;
}
static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) {
Janet ret;
double numval;
int32_t blen;
if (janet_is_symbol_char(c)) {
push_buf(p, (uint8_t) c);
if (c > 127) state->argn = 1; /* Use to indicate non ascii */
return 1;
}
/* Token finished */
blen = (int32_t) p->bufcount;
int start_dig = p->buf[0] >= '0' && p->buf[0] <= '9';
int start_num = start_dig || p->buf[0] == '-' || p->buf[0] == '+' || p->buf[0] == '.';
if (p->buf[0] == ':') {
/* Don't do full utf-8 check unless we have seen non ascii characters. */
int valid = (!state->argn) || janet_valid_utf8(p->buf + 1, blen - 1);
if (!valid) {
p->error = "invalid utf-8 in keyword";
return 0;
}
ret = janet_keywordv(p->buf + 1, blen - 1);
} else if (start_num && !janet_scan_number(p->buf, blen, &numval)) {
ret = janet_wrap_number(numval);
} else if (!check_str_const("nil", p->buf, blen)) {
ret = janet_wrap_nil();
} else if (!check_str_const("false", p->buf, blen)) {
ret = janet_wrap_false();
} else if (!check_str_const("true", p->buf, blen)) {
ret = janet_wrap_true();
} else {
if (start_dig) {
p->error = "symbol literal cannot start with a digit";
return 0;
} else {
/* Don't do full utf-8 check unless we have seen non ascii characters. */
int valid = (!state->argn) || janet_valid_utf8(p->buf, blen);
if (!valid) {
p->error = "invalid utf-8 in symbol";
return 0;
}
ret = janet_symbolv(p->buf, blen);
}
}
p->bufcount = 0;
popstate(p, ret);
return 0;
}
static int comment(JanetParser *p, JanetParseState *state, uint8_t c) {
(void) state;
if (c == '\n') {
p->statecount--;
p->bufcount = 0;
} else {
push_buf(p, c);
}
return 1;
}
static Janet close_tuple(JanetParser *p, JanetParseState *state, int32_t flag) {
Janet *ret = janet_tuple_begin(state->argn);
janet_tuple_flag(ret) |= flag;
for (int32_t i = state->argn - 1; i >= 0; i--)
ret[i] = p->args[--p->argcount];
return janet_wrap_tuple(janet_tuple_end(ret));
}
static Janet close_array(JanetParser *p, JanetParseState *state) {
JanetArray *array = janet_array(state->argn);
for (int32_t i = state->argn - 1; i >= 0; i--)
array->data[i] = p->args[--p->argcount];
array->count = state->argn;
return janet_wrap_array(array);
}
static Janet close_struct(JanetParser *p, JanetParseState *state) {
JanetKV *st = janet_struct_begin(state->argn >> 1);
for (size_t i = p->argcount - state->argn; i < p->argcount; i += 2) {
Janet key = p->args[i];
Janet value = p->args[i + 1];
janet_struct_put(st, key, value);
}
p->argcount -= state->argn;
return janet_wrap_struct(janet_struct_end(st));
}
static Janet close_table(JanetParser *p, JanetParseState *state) {
JanetTable *table = janet_table(state->argn >> 1);
for (size_t i = p->argcount - state->argn; i < p->argcount; i += 2) {
Janet key = p->args[i];
Janet value = p->args[i + 1];
janet_table_put(table, key, value);
}
p->argcount -= state->argn;
return janet_wrap_table(table);
}
#define PFLAG_INSTRING 0x100000
#define PFLAG_END_CANDIDATE 0x200000
static int longstring(JanetParser *p, JanetParseState *state, uint8_t c) {
if (state->flags & PFLAG_INSTRING) {
/* We are inside the long string */
if (c == '`') {
state->flags |= PFLAG_END_CANDIDATE;
state->flags &= ~PFLAG_INSTRING;
state->counter = 1; /* Use counter to keep track of number of '=' seen */
return 1;
}
push_buf(p, c);
return 1;
} else if (state->flags & PFLAG_END_CANDIDATE) {
int i;
/* We are checking a potential end of the string */
if (state->counter == state->argn) {
stringend(p, state);
return 0;
}
if (c == '`' && state->counter < state->argn) {
state->counter++;
return 1;
}
/* Failed end candidate */
for (i = 0; i < state->counter; i++) {
push_buf(p, '`');
}
push_buf(p, c);
state->counter = 0;
state->flags &= ~PFLAG_END_CANDIDATE;
state->flags |= PFLAG_INSTRING;
return 1;
} else {
/* We are at beginning of string */
state->argn++;
if (c != '`') {
state->flags |= PFLAG_INSTRING;
push_buf(p, c);
}
return 1;
}
}
static int root(JanetParser *p, JanetParseState *state, uint8_t c);
static int atsign(JanetParser *p, JanetParseState *state, uint8_t c) {
(void) state;
p->statecount--;
switch (c) {
case '{':
pushstate(p, root, PFLAG_CONTAINER | PFLAG_CURLYBRACKETS | PFLAG_ATSYM);
return 1;
case '"':
pushstate(p, stringchar, PFLAG_BUFFER | PFLAG_STRING);
return 1;
case '`':
pushstate(p, longstring, PFLAG_BUFFER | PFLAG_LONGSTRING);
return 1;
case '[':
pushstate(p, root, PFLAG_CONTAINER | PFLAG_SQRBRACKETS | PFLAG_ATSYM);
return 1;
case '(':
pushstate(p, root, PFLAG_CONTAINER | PFLAG_PARENS | PFLAG_ATSYM);
return 1;
default:
break;
}
pushstate(p, tokenchar, PFLAG_TOKEN);
push_buf(p, '@'); /* Push the leading at-sign that was dropped */
return 0;
}
/* The root state of the parser */
static int root(JanetParser *p, JanetParseState *state, uint8_t c) {
switch (c) {
default:
if (is_whitespace(c)) return 1;
if (!janet_is_symbol_char(c)) {
p->error = "unexpected character";
return 1;
}
pushstate(p, tokenchar, PFLAG_TOKEN);
return 0;
case '\'':
case ',':
case ';':
case '~':
case '|':
pushstate(p, root, PFLAG_READERMAC | c);
return 1;
case '"':
pushstate(p, stringchar, PFLAG_STRING);
return 1;
case '#':
pushstate(p, comment, PFLAG_COMMENT);
return 1;
case '@':
pushstate(p, atsign, PFLAG_ATSYM);
return 1;
case '`':
pushstate(p, longstring, PFLAG_LONGSTRING);
return 1;
case ')':
case ']':
case '}': {
Janet ds;
if (p->statecount == 1) {
delim_error(p, 0, c, "unexpected closing delimiter ");
return 1;
}
if ((c == ')' && (state->flags & PFLAG_PARENS)) ||
(c == ']' && (state->flags & PFLAG_SQRBRACKETS))) {
if (state->flags & PFLAG_ATSYM) {
ds = close_array(p, state);
} else {
ds = close_tuple(p, state, c == ']' ? JANET_TUPLE_FLAG_BRACKETCTOR : 0);
}
} else if (c == '}' && (state->flags & PFLAG_CURLYBRACKETS)) {
if (state->argn & 1) {
p->error = "struct and table literals expect even number of arguments";
return 1;
}
if (state->flags & PFLAG_ATSYM) {
ds = close_table(p, state);
} else {
ds = close_struct(p, state);
}
} else {
delim_error(p, p->statecount - 1, c, "mismatched delimiter ");
return 1;
}
popstate(p, ds);
}
return 1;
case '(':
pushstate(p, root, PFLAG_CONTAINER | PFLAG_PARENS);
return 1;
case '[':
pushstate(p, root, PFLAG_CONTAINER | PFLAG_SQRBRACKETS);
return 1;
case '{':
pushstate(p, root, PFLAG_CONTAINER | PFLAG_CURLYBRACKETS);
return 1;
}
}
static void janet_parser_checkdead(JanetParser *parser) {
if (parser->flag) janet_panic("parser is dead, cannot consume");
if (parser->error) janet_panic("parser has unchecked error, cannot consume");
}
/* Public API */
void janet_parser_consume(JanetParser *parser, uint8_t c) {
int consumed = 0;
janet_parser_checkdead(parser);
if (c == '\r') {
parser->line++;
parser->column = 0;
} else if (c == '\n') {
parser->column = 0;
if (parser->lookback != '\r')
parser->line++;
} else {
parser->column++;
}
while (!consumed && !parser->error) {
JanetParseState *state = parser->states + parser->statecount - 1;
consumed = state->consumer(parser, state, c);
}
parser->lookback = c;
}
void janet_parser_eof(JanetParser *parser) {
janet_parser_checkdead(parser);
size_t oldcolumn = parser->column;
size_t oldline = parser->line;
janet_parser_consume(parser, '\n');
if (parser->statecount > 1) {
delim_error(parser, parser->statecount - 1, 0, "unexpected end of source");
}
parser->line = oldline;
parser->column = oldcolumn;
parser->flag |= JANET_PARSER_DEAD;
}
enum JanetParserStatus janet_parser_status(JanetParser *parser) {
if (parser->error) return JANET_PARSE_ERROR;
if (parser->flag) return JANET_PARSE_DEAD;
if (parser->statecount > 1) return JANET_PARSE_PENDING;
return JANET_PARSE_ROOT;
}
void janet_parser_flush(JanetParser *parser) {
parser->argcount = 0;
parser->statecount = 1;
parser->bufcount = 0;
parser->pending = 0;
}
const char *janet_parser_error(JanetParser *parser) {
enum JanetParserStatus status = janet_parser_status(parser);
if (status == JANET_PARSE_ERROR) {
const char *e = parser->error;
parser->error = NULL;
parser->flag &= ~JANET_PARSER_GENERATED_ERROR;
janet_parser_flush(parser);
return e;
}
return NULL;
}
Janet janet_parser_produce(JanetParser *parser) {
Janet ret;
size_t i;
if (parser->pending == 0) return janet_wrap_nil();
ret = janet_unwrap_tuple(parser->args[0])[0];
for (i = 1; i < parser->argcount; i++) {
parser->args[i - 1] = parser->args[i];
}
parser->pending--;
parser->argcount--;
parser->states[0].argn--;
return ret;
}
Janet janet_parser_produce_wrapped(JanetParser *parser) {
Janet ret;
size_t i;
if (parser->pending == 0) return janet_wrap_nil();
ret = parser->args[0];
for (i = 1; i < parser->argcount; i++) {
parser->args[i - 1] = parser->args[i];
}
parser->pending--;
parser->argcount--;
parser->states[0].argn--;
return ret;
}
void janet_parser_init(JanetParser *parser) {
parser->args = NULL;
parser->states = NULL;
parser->buf = NULL;
parser->argcount = 0;
parser->argcap = 0;
parser->bufcount = 0;
parser->bufcap = 0;
parser->statecount = 0;
parser->statecap = 0;
parser->error = NULL;
parser->lookback = -1;
parser->line = 1;
parser->column = 0;
parser->pending = 0;
parser->flag = 0;
pushstate(parser, root, PFLAG_CONTAINER);
}
void janet_parser_deinit(JanetParser *parser) {
janet_free(parser->args);
janet_free(parser->buf);
janet_free(parser->states);
}
void janet_parser_clone(const JanetParser *src, JanetParser *dest) {
/* Misc fields */
dest->flag = src->flag;
dest->pending = src->pending;
dest->lookback = src->lookback;
dest->line = src->line;
dest->column = src->column;
dest->error = src->error;
/* Keep counts */
dest->argcount = src->argcount;
dest->bufcount = src->bufcount;
dest->statecount = src->statecount;
/* Capacities are equal to counts */
dest->bufcap = dest->bufcount;
dest->statecap = dest->statecount;
dest->argcap = dest->argcount;
/* Deep cloned fields */
dest->args = NULL;
dest->states = NULL;
dest->buf = NULL;
if (dest->bufcap) {
dest->buf = janet_malloc(dest->bufcap);
if (!dest->buf) goto nomem;
memcpy(dest->buf, src->buf, dest->bufcap);
}
if (dest->argcap) {
dest->args = janet_malloc(sizeof(Janet) * dest->argcap);
if (!dest->args) goto nomem;
memcpy(dest->args, src->args, dest->argcap * sizeof(Janet));
}
if (dest->statecap) {
dest->states = janet_malloc(sizeof(JanetParseState) * dest->statecap);
if (!dest->states) goto nomem;
memcpy(dest->states, src->states, dest->statecap * sizeof(JanetParseState));
}
return;
nomem:
JANET_OUT_OF_MEMORY;
}
int janet_parser_has_more(JanetParser *parser) {
return !!parser->pending;
}
/* C functions */
static int parsermark(void *p, size_t size) {
size_t i;
JanetParser *parser = (JanetParser *)p;
(void) size;
for (i = 0; i < parser->argcount; i++) {
janet_mark(parser->args[i]);
}
if (parser->flag & JANET_PARSER_GENERATED_ERROR) {
janet_mark(janet_wrap_string((const uint8_t *) parser->error));
}
return 0;
}
static int parsergc(void *p, size_t size) {
JanetParser *parser = (JanetParser *)p;
(void) size;
janet_parser_deinit(parser);
return 0;
}
static int parserget(void *p, Janet key, Janet *out);
static Janet parsernext(void *p, Janet key);
const JanetAbstractType janet_parser_type = {
"core/parser",
parsergc,
parsermark,
parserget,
NULL, /* put */
NULL, /* marshal */
NULL, /* unmarshal */
NULL, /* tostring */
NULL, /* compare */
NULL, /* hash */
parsernext,
JANET_ATEND_NEXT
};
/* C Function parser */
JANET_CORE_FN(cfun_parse_parser,
"(parser/new)",
"Creates and returns a new parser object. Parsers are state machines "
"that can receive bytes and generate a stream of values.") {
(void) argv;
janet_fixarity(argc, 0);
JanetParser *p = janet_abstract(&janet_parser_type, sizeof(JanetParser));
janet_parser_init(p);
return janet_wrap_abstract(p);
}
JANET_CORE_FN(cfun_parse_consume,
"(parser/consume parser bytes &opt index)",
"Input bytes into the parser and parse them. Will not throw errors "
"if there is a parse error. Starts at the byte index given by `index`. Returns "
"the number of bytes read.") {
janet_arity(argc, 2, 3);
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
JanetByteView view = janet_getbytes(argv, 1);
if (argc == 3) {
int32_t offset = janet_getinteger(argv, 2);
if (offset < 0 || offset > view.len)
janet_panicf("invalid offset %d out of range [0,%d]", offset, view.len);
view.len -= offset;
view.bytes += offset;
}
int32_t i;
for (i = 0; i < view.len; i++) {
janet_parser_consume(p, view.bytes[i]);
switch (janet_parser_status(p)) {
case JANET_PARSE_ROOT:
case JANET_PARSE_PENDING:
break;
default:
return janet_wrap_integer(i + 1);
}
}
return janet_wrap_integer(i);
}
JANET_CORE_FN(cfun_parse_eof,
"(parser/eof parser)",
"Indicate to the parser that the end of file was reached. This puts the parser in the :dead state.") {
janet_fixarity(argc, 1);
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
janet_parser_eof(p);
return argv[0];
}
JANET_CORE_FN(cfun_parse_insert,
"(parser/insert parser value)",
"Insert a value into the parser. This means that the parser state can be manipulated "
"in between chunks of bytes. This would allow a user to add extra elements to arrays "
"and tuples, for example. Returns the parser.") {
janet_fixarity(argc, 2);
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
JanetParseState *s = p->states + p->statecount - 1;
if (s->consumer == tokenchar) {
janet_parser_consume(p, ' ');
p->column--;
s = p->states + p->statecount - 1;
}
if (s->flags & PFLAG_COMMENT) s--;
if (s->flags & PFLAG_CONTAINER) {
s->argn++;
if (p->statecount == 1) {
p->pending++;
Janet tup = janet_wrap_tuple(janet_tuple_n(argv + 1, 1));
push_arg(p, tup);
} else {
push_arg(p, argv[1]);
}
} else if (s->flags & (PFLAG_STRING | PFLAG_LONGSTRING)) {
const uint8_t *str = janet_to_string(argv[1]);
int32_t slen = janet_string_length(str);
size_t newcount = p->bufcount + slen;
if (p->bufcap < newcount) {
size_t newcap = 2 * newcount;
p->buf = janet_realloc(p->buf, newcap);
if (p->buf == NULL) {
JANET_OUT_OF_MEMORY;
}
p->bufcap = newcap;
}
safe_memcpy(p->buf + p->bufcount, str, slen);
p->bufcount = newcount;
} else {
janet_panic("cannot insert value into parser");
}
return argv[0];
}
JANET_CORE_FN(cfun_parse_has_more,
"(parser/has-more parser)",
"Check if the parser has more values in the value queue.") {
janet_fixarity(argc, 1);
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
return janet_wrap_boolean(janet_parser_has_more(p));
}
JANET_CORE_FN(cfun_parse_byte,
"(parser/byte parser b)",
"Input a single byte `b` into the parser byte stream. Returns the parser.") {
janet_fixarity(argc, 2);
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
int32_t i = janet_getinteger(argv, 1);
janet_parser_consume(p, 0xFF & i);
return argv[0];
}
JANET_CORE_FN(cfun_parse_status,
"(parser/status parser)",
"Gets the current status of the parser state machine. The status will "
"be one of:\n\n"
"* :pending - a value is being parsed.\n\n"
"* :error - a parsing error was encountered.\n\n"
"* :root - the parser can either read more values or safely terminate.") {
janet_fixarity(argc, 1);
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
const char *stat = NULL;
switch (janet_parser_status(p)) {
case JANET_PARSE_PENDING:
stat = "pending";
break;
case JANET_PARSE_ERROR:
stat = "error";
break;
case JANET_PARSE_ROOT:
stat = "root";
break;
case JANET_PARSE_DEAD:
stat = "dead";
break;
}
return janet_ckeywordv(stat);
}
JANET_CORE_FN(cfun_parse_error,
"(parser/error parser)",
"If the parser is in the error state, returns the message associated with "
"that error. Otherwise, returns nil. Also flushes the parser state and parser "
"queue, so be sure to handle everything in the queue before calling "
"`parser/error`.") {
janet_fixarity(argc, 1);
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
const char *err = janet_parser_error(p);
if (err) {
return (p->flag & JANET_PARSER_GENERATED_ERROR)
? janet_wrap_string((const uint8_t *) err)
: janet_cstringv(err);
}
return janet_wrap_nil();
}
JANET_CORE_FN(cfun_parse_produce,
"(parser/produce parser &opt wrap)",
"Dequeue the next value in the parse queue. Will return nil if "
"no parsed values are in the queue, otherwise will dequeue the "
"next value. If `wrap` is truthy, will return a 1-element tuple that "
"wraps the result. This tuple can be used for source-mapping "
"purposes.") {
janet_arity(argc, 1, 2);
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
if (argc == 2 && janet_truthy(argv[1])) {
return janet_parser_produce_wrapped(p);
} else {
return janet_parser_produce(p);
}
}
JANET_CORE_FN(cfun_parse_flush,
"(parser/flush parser)",
"Clears the parser state and parse queue. Can be used to reset the parser "
"if an error was encountered. Does not reset the line and column counter, so "
"to begin parsing in a new context, create a new parser.") {
janet_fixarity(argc, 1);
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
janet_parser_flush(p);
return argv[0];
}
JANET_CORE_FN(cfun_parse_where,
"(parser/where parser &opt line col)",
"Returns the current line number and column of the parser's internal state. If line is "
"provided, the current line number of the parser is first set to that value. If column is "
"also provided, the current column number of the parser is also first set to that value.") {
janet_arity(argc, 1, 3);
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
if (argc > 1) {
int32_t line = janet_getinteger(argv, 1);
if (line < 1)
janet_panicf("invalid line number %d", line);
p->line = (size_t) line;
}
if (argc > 2) {
int32_t column = janet_getinteger(argv, 2);
if (column < 0)
janet_panicf("invalid column number %d", column);
p->column = (size_t) column;
}
Janet *tup = janet_tuple_begin(2);
tup[0] = janet_wrap_integer(p->line);
tup[1] = janet_wrap_integer(p->column);
return janet_wrap_tuple(janet_tuple_end(tup));
}
static Janet janet_wrap_parse_state(JanetParseState *s, Janet *args,
uint8_t *buff, uint32_t bufcount) {
JanetTable *state = janet_table(0);
const uint8_t *buffer;
int add_buffer = 0;
const char *type = NULL;
if (s->flags & PFLAG_CONTAINER) {
JanetArray *container_args = janet_array(s->argn);
for (int32_t i = 0; i < s->argn; i++) {
janet_array_push(container_args, args[i]);
}
janet_table_put(state, janet_ckeywordv("args"),
janet_wrap_array(container_args));
}
if (s->flags & PFLAG_PARENS || s->flags & PFLAG_SQRBRACKETS) {
if (s->flags & PFLAG_ATSYM) {
type = "array";
} else {
type = "tuple";
}
} else if (s->flags & PFLAG_CURLYBRACKETS) {
if (s->flags & PFLAG_ATSYM) {
type = "table";
} else {
type = "struct";
}
} else if (s->flags & PFLAG_STRING || s->flags & PFLAG_LONGSTRING) {
if (s->flags & PFLAG_BUFFER) {
type = "buffer";
} else {
type = "string";
}
add_buffer = 1;
} else if (s->flags & PFLAG_COMMENT) {
type = "comment";
add_buffer = 1;
} else if (s->flags & PFLAG_TOKEN) {
type = "token";
add_buffer = 1;
} else if (s->flags & PFLAG_ATSYM) {
type = "at";
} else if (s->flags & PFLAG_READERMAC) {
int c = s->flags & 0xFF;
type = (c == '\'') ? "quote" :
(c == ',') ? "unquote" :
(c == ';') ? "splice" :
(c == '~') ? "quasiquote" : "<reader>";
} else {
type = "root";
}
if (type) {
janet_table_put(state, janet_ckeywordv("type"),
janet_ckeywordv(type));
}
if (add_buffer) {
buffer = janet_string(buff, bufcount);
janet_table_put(state, janet_ckeywordv("buffer"), janet_wrap_string(buffer));
}
janet_table_put(state, janet_ckeywordv("line"), janet_wrap_integer(s->line));
janet_table_put(state, janet_ckeywordv("column"), janet_wrap_integer(s->column));
return janet_wrap_table(state);
}
struct ParserStateGetter {
const char *name;
Janet(*fn)(const JanetParser *p);
};
static Janet parser_state_delimiters(const JanetParser *_p) {
JanetParser *p = (JanetParser *)_p;
size_t i;
const uint8_t *str;
size_t oldcount;
oldcount = p->bufcount;
for (i = 0; i < p->statecount; i++) {
JanetParseState *s = p->states + i;
if (s->flags & PFLAG_PARENS) {
push_buf(p, '(');
} else if (s->flags & PFLAG_SQRBRACKETS) {
push_buf(p, '[');
} else if (s->flags & PFLAG_CURLYBRACKETS) {
push_buf(p, '{');
} else if (s->flags & PFLAG_STRING) {
push_buf(p, '"');
} else if (s->flags & PFLAG_LONGSTRING) {
int32_t i;
for (i = 0; i < s->argn; i++) {
push_buf(p, '`');
}
}
}
/* avoid ptr arithmetic on NULL */
str = janet_string(oldcount ? p->buf + oldcount : p->buf, (int32_t)(p->bufcount - oldcount));
p->bufcount = oldcount;
return janet_wrap_string(str);
}
static Janet parser_state_frames(const JanetParser *p) {
int32_t count = (int32_t) p->statecount;
JanetArray *states = janet_array(count);
states->count = count;
uint8_t *buf = p->buf;
/* Iterate arg stack backwards */
Janet *args = p->argcount ? p->args + p->argcount : p->args; /* avoid ptr arithmetic on NULL */
for (int32_t i = count - 1; i >= 0; --i) {
JanetParseState *s = p->states + i;
/* avoid ptr arithmetic on args if NULL */
if ((s->flags & PFLAG_CONTAINER) && s->argn) {
args -= s->argn;
}
states->data[i] = janet_wrap_parse_state(s, args, buf, (uint32_t) p->bufcount);
}
return janet_wrap_array(states);
}
static const struct ParserStateGetter parser_state_getters[] = {
{"frames", parser_state_frames},
{"delimiters", parser_state_delimiters},
{NULL, NULL}
};
JANET_CORE_FN(cfun_parse_state,
"(parser/state parser &opt key)",
"Returns a representation of the internal state of the parser. If a key is passed, "
"only that information about the state is returned. Allowed keys are:\n\n"
"* :delimiters - Each byte in the string represents a nested data structure. For example, "
"if the parser state is '([\"', then the parser is in the middle of parsing a "
"string inside of square brackets inside parentheses. Can be used to augment a REPL prompt.\n\n"
"* :frames - Each table in the array represents a 'frame' in the parser state. Frames "
"contain information about the start of the expression being parsed as well as the "
"type of that expression and some type-specific information.") {
janet_arity(argc, 1, 2);
const uint8_t *key = NULL;
JanetParser *p = janet_getabstract(argv, 0, &janet_parser_type);
if (argc == 2) {
key = janet_getkeyword(argv, 1);
}
if (key) {
/* Get one result */
for (const struct ParserStateGetter *sg = parser_state_getters;
sg->name != NULL; sg++) {
if (janet_cstrcmp(key, sg->name)) continue;
return sg->fn(p);
}
janet_panicf("unexpected keyword %v", janet_wrap_keyword(key));
return janet_wrap_nil();
} else {
/* Put results in table */
JanetTable *tab = janet_table(0);
for (const struct ParserStateGetter *sg = parser_state_getters;
sg->name != NULL; sg++) {
janet_table_put(tab, janet_ckeywordv(sg->name), sg->fn(p));
}
return janet_wrap_table(tab);
}
}
JANET_CORE_FN(cfun_parse_clone,
"(parser/clone p)",
"Creates a deep clone of a parser that is identical to the input parser. "
"This cloned parser can be used to continue parsing from a good checkpoint "
"if parsing later fails. Returns a new parser.") {
janet_fixarity(argc, 1);
JanetParser *src = janet_getabstract(argv, 0, &janet_parser_type);
JanetParser *dest = janet_abstract(&janet_parser_type, sizeof(JanetParser));
janet_parser_clone(src, dest);
return janet_wrap_abstract(dest);
}
static const JanetMethod parser_methods[] = {
{"byte", cfun_parse_byte},
{"clone", cfun_parse_clone},
{"consume", cfun_parse_consume},
{"eof", cfun_parse_eof},
{"error", cfun_parse_error},
{"flush", cfun_parse_flush},
{"has-more", cfun_parse_has_more},
{"insert", cfun_parse_insert},
{"produce", cfun_parse_produce},
{"state", cfun_parse_state},
{"status", cfun_parse_status},
{"where", cfun_parse_where},
{NULL, NULL}
};
static int parserget(void *p, Janet key, Janet *out) {
(void) p;
if (!janet_checktype(key, JANET_KEYWORD)) return 0;
return janet_getmethod(janet_unwrap_keyword(key), parser_methods, out);
}
static Janet parsernext(void *p, Janet key) {
(void) p;
return janet_nextmethod(parser_methods, key);
}
/* Load the library */
void janet_lib_parse(JanetTable *env) {
JanetRegExt parse_cfuns[] = {
JANET_CORE_REG("parser/new", cfun_parse_parser),
JANET_CORE_REG("parser/clone", cfun_parse_clone),
JANET_CORE_REG("parser/has-more", cfun_parse_has_more),
JANET_CORE_REG("parser/produce", cfun_parse_produce),
JANET_CORE_REG("parser/consume", cfun_parse_consume),
JANET_CORE_REG("parser/byte", cfun_parse_byte),
JANET_CORE_REG("parser/error", cfun_parse_error),
JANET_CORE_REG("parser/status", cfun_parse_status),
JANET_CORE_REG("parser/flush", cfun_parse_flush),
JANET_CORE_REG("parser/state", cfun_parse_state),
JANET_CORE_REG("parser/where", cfun_parse_where),
JANET_CORE_REG("parser/eof", cfun_parse_eof),
JANET_CORE_REG("parser/insert", cfun_parse_insert),
JANET_REG_END
};
janet_core_cfuns_ext(env, NULL, parse_cfuns);
}