/* * Copyright (c) 2018 Calvin Rose * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include /* Quote a value */ static Janet quote(Janet x) { Janet *t = janet_tuple_begin(2); t[0] = janet_csymbolv("quote"); t[1] = x; return janet_wrap_tuple(janet_tuple_end(t)); } /* Check if a character is whitespace */ static int is_whitespace(uint8_t c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\0' || c == '\f' || c == ';' || c == ','; } /* Code generated by tools/symcharsgen.c. * The table contains 256 bits, where each bit is 1 * if the corresponding ascci code is a symbol char, and 0 * if not. The upper characters are also considered symbol * chars and are then checked for utf-8 compliance. */ static const uint32_t symchars[8] = { 0x00000000, 0xf7ffec72, 0xc7ffffff, 0x57fffffe, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; /* Check if a character is a valid symbol character * symbol chars are A-Z, a-z, 0-9, or one of !$&*+-./:<=>@\^_~| */ static int is_symbol_char(uint8_t c) { return symchars[c >> 5] & (1 << (c & 0x1F)); } /* Validate some utf8. Useful for identifiers. Only validates * the encoding, does not check for valid codepoints (they * are less well defined than the encoding). */ static int valid_utf8(const uint8_t *str, int32_t len) { int32_t i = 0; int32_t j; while (i < len) { int32_t nexti; uint8_t c = str[i]; /* Check the number of bytes in code point */ if (c < 0x80) nexti = i + 1; else if ((c >> 5) == 0x06) nexti = i + 2; else if ((c >> 4) == 0x0E) nexti = i + 3; else if ((c >> 3) == 0x1E) nexti = i + 4; /* Don't allow 5 or 6 byte code points */ else return 0; /* No overflow */ if (nexti > len) return 0; /* Ensure trailing bytes are well formed (10XX XXXX) */ for (j = i + 1; j < nexti; j++) { if ((str[j] >> 6) != 2) return 0; } /* Check for overlong encodings */ if ((nexti == i + 2) && str[i] < 0xC2) return 0; if ((str[i] == 0xE0) && str[i + 1] < 0xA0) return 0; if ((str[i] == 0xF0) && str[i + 1] < 0x90) return 0; i = nexti; } return 1; } /* Get hex digit from a letter */ static int to_hex(uint8_t c) { if (c >= '0' && c <= '9') { return c - '0'; } else if (c >= 'A' && c <= 'F') { return 10 + c - 'A'; } else if (c >= 'a' && c <= 'f') { return 10 + c - 'a'; } else { return -1; } } typedef int (*Consumer)(JanetParser *p, JanetParseState *state, uint8_t c); struct JanetParseState { int32_t qcount; int32_t argn; int flags; size_t start_line; size_t start_col; Consumer consumer; }; /* Define a stack on the main parser struct */ #define DEF_PARSER_STACK(NAME, T, STACK, STACKCOUNT, STACKCAP) \ static void NAME(JanetParser *p, T x) { \ size_t oldcount = p->STACKCOUNT; \ size_t newcount = oldcount + 1; \ if (newcount > p->STACKCAP) { \ T *next; \ size_t newcap = 2 * newcount; \ next = realloc(p->STACK, sizeof(T) * newcap); \ if (NULL == next) { \ JANET_OUT_OF_MEMORY; \ } \ p->STACK = next; \ p->STACKCAP = newcap; \ } \ p->STACK[oldcount] = x; \ p->STACKCOUNT = newcount; \ } DEF_PARSER_STACK(push_buf, uint8_t, buf, bufcount, bufcap) DEF_PARSER_STACK(push_arg, Janet, args, argcount, argcap) DEF_PARSER_STACK(_pushstate, JanetParseState, states, statecount, statecap) #undef DEF_PARSER_STACK #define PFLAG_CONTAINER 1 #define PFLAG_BUFFER 2 #define PFLAG_PARENS 4 #define PFLAG_SQRBRACKETS 8 #define PFLAG_CURLYBRACKETS 16 #define PFLAG_STRING 32 #define PFLAG_LONGSTRING 64 static void pushstate(JanetParser *p, Consumer consumer, int flags) { JanetParseState s; s.qcount = 0; s.argn = 0; s.flags = flags; s.consumer = consumer; s.start_line = p->line; s.start_col = p->col; _pushstate(p, s); } static void popstate(JanetParser *p, Janet val) { JanetParseState top = p->states[--p->statecount]; JanetParseState *newtop = p->states + p->statecount - 1; if (newtop->flags & PFLAG_CONTAINER) { int32_t i, len; len = newtop->qcount; /* Quote the returned value qcount times */ for (i = 0; i < len; i++) { if (janet_checktype(val, JANET_TUPLE)) { janet_tuple_sm_line(janet_unwrap_tuple(val)) = (int32_t) top.start_line; janet_tuple_sm_col(janet_unwrap_tuple(val)) = (int32_t) top.start_col; } val = quote(val); } newtop->qcount = 0; /* Ast wrap */ if (janet_checktype(val, JANET_TUPLE)) { janet_tuple_sm_line(janet_unwrap_tuple(val)) = (int32_t) top.start_line; janet_tuple_sm_col(janet_unwrap_tuple(val)) = (int32_t) top.start_col; } newtop->argn++; push_arg(p, val); } } static int checkescape(uint8_t c) { switch (c) { default: return -1; case 'x': return 1; case 'n': return '\n'; case 't': return '\t'; case 'r': return '\r'; case '0': return '\0'; case 'z': return '\0'; case 'f': return '\f'; case 'e': return 27; case '"': return '"'; case '\\': return '\\'; } } /* Forward declare */ static int stringchar(JanetParser *p, JanetParseState *state, uint8_t c); static int escapeh(JanetParser *p, JanetParseState *state, uint8_t c) { int digit = to_hex(c); if (digit < 0) { p->error = "invalid hex digit in hex escape"; return 1; } state->argn = (state->argn << 4) + digit;; state->qcount--; if (!state->qcount) { push_buf(p, (state->argn & 0xFF)); state->argn = 0; state->consumer = stringchar; } return 1; } static int escape1(JanetParser *p, JanetParseState *state, uint8_t c) { int e = checkescape(c); if (e < 0) { p->error = "invalid string escape sequence"; return 1; } if (c == 'x') { state->qcount = 2; state->argn = 0; state->consumer = escapeh; } else { push_buf(p, (uint8_t) e); state->consumer = stringchar; } return 1; } static int stringend(JanetParser *p, JanetParseState *state) { Janet ret; if (state->flags & PFLAG_BUFFER) { JanetBuffer *b = janet_buffer((int32_t)p->bufcount); janet_buffer_push_bytes(b, p->buf, (int32_t)p->bufcount); ret = janet_wrap_buffer(b); } else { ret = janet_wrap_string(janet_string(p->buf, (int32_t)p->bufcount)); } p->bufcount = 0; popstate(p, ret); return 1; } static int stringchar(JanetParser *p, JanetParseState *state, uint8_t c) { /* Enter escape */ if (c == '\\') { state->consumer = escape1; return 1; } /* String end */ if (c == '"') { return stringend(p, state); } /* normal char */ push_buf(p, c); return 1; } /* Check for string equality in the buffer */ static int check_str_const(const char *cstr, const uint8_t *str, int32_t len) { int32_t index; for (index = 0; index < len; index++) { uint8_t c = str[index]; uint8_t k = ((const uint8_t *)cstr)[index]; if (c < k) return -1; if (c > k) return 1; if (k == '\0') break; } return (cstr[index] == '\0') ? 0 : -1; } static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) { Janet numcheck, ret; int32_t blen; if (is_symbol_char(c)) { push_buf(p, (uint8_t) c); if (c > 127) state->argn = 1; /* Use to indicate non ascii */ return 1; } /* Token finished */ blen = (int32_t) p->bufcount; numcheck = janet_scan_number(p->buf, blen); if (!janet_checktype(numcheck, JANET_NIL)) { ret = numcheck; } else if (!check_str_const("nil", p->buf, blen)) { ret = janet_wrap_nil(); } else if (!check_str_const("false", p->buf, blen)) { ret = janet_wrap_false(); } else if (!check_str_const("true", p->buf, blen)) { ret = janet_wrap_true(); } else if (p->buf) { if (p->buf[0] >= '0' && p->buf[0] <= '9') { p->error = "symbol literal cannot start with a digit"; return 0; } else { /* Don't do full utf8 check unless we have seen non ascii characters. */ int valid = (!state->argn) || valid_utf8(p->buf, blen); if (!valid) { p->error = "invalid utf-8 in symbol"; return 0; } ret = janet_symbolv(p->buf, blen); } } else { p->error = "empty symbol invalid"; return 0; } p->bufcount = 0; popstate(p, ret); return 0; } static int comment(JanetParser *p, JanetParseState *state, uint8_t c) { (void) state; if (c == '\n') p->statecount--; return 1; } /* Forward declaration */ static int root(JanetParser *p, JanetParseState *state, uint8_t c); static int dotuple(JanetParser *p, JanetParseState *state, uint8_t c) { if (state->flags & PFLAG_SQRBRACKETS ? c == ']' : c == ')') { int32_t i; Janet *ret = janet_tuple_begin(state->argn); for (i = state->argn - 1; i >= 0; i--) { ret[i] = p->args[--p->argcount]; } popstate(p, janet_wrap_tuple(janet_tuple_end(ret))); return 1; } return root(p, state, c); } static int doarray(JanetParser *p, JanetParseState *state, uint8_t c) { if (state->flags & PFLAG_SQRBRACKETS ? c == ']' : c == ')') { int32_t i; JanetArray *array = janet_array(state->argn); for (i = state->argn - 1; i >= 0; i--) { array->data[i] = p->args[--p->argcount]; } array->count = state->argn; popstate(p, janet_wrap_array(array)); return 1; } return root(p, state, c); } static int dostruct(JanetParser *p, JanetParseState *state, uint8_t c) { if (c == '}') { int32_t i; JanetKV *st; if (state->argn & 1) { p->error = "struct literal expects even number of arguments"; return 1; } st = janet_struct_begin(state->argn >> 1); for (i = state->argn; i > 0; i -= 2) { Janet value = p->args[--p->argcount]; Janet key = p->args[--p->argcount]; janet_struct_put(st, key, value); } popstate(p, janet_wrap_struct(janet_struct_end(st))); return 1; } return root(p, state, c); } static int dotable(JanetParser *p, JanetParseState *state, uint8_t c) { if (c == '}') { int32_t i; JanetTable *table; if (state->argn & 1) { p->error = "table literal expects even number of arguments"; return 1; } table = janet_table(state->argn >> 1); for (i = state->argn; i > 0; i -= 2) { Janet value = p->args[--p->argcount]; Janet key = p->args[--p->argcount]; janet_table_put(table, key, value); } popstate(p, janet_wrap_table(table)); return 1; } return root(p, state, c); } #define PFLAG_INSTRING 128 #define PFLAG_END_CANDIDATE 256 static int longstring(JanetParser *p, JanetParseState *state, uint8_t c) { if (state->flags & PFLAG_INSTRING) { /* We are inside the long string */ if (c == '`') { state->flags |= PFLAG_END_CANDIDATE; state->flags &= ~PFLAG_INSTRING; state->qcount = 1; /* Use qcount to keep track of number of '=' seen */ return 1; } push_buf(p, c); return 1; } else if (state->flags & PFLAG_END_CANDIDATE) { int i; /* We are checking a potential end of the string */ if (state->qcount == state->argn) { stringend(p, state); return 0; } if (c == '`' && state->qcount < state->argn) { state->qcount++; return 1; } /* Failed end candidate */ for (i = 0; i < state->qcount; i++) { push_buf(p, '`'); } push_buf(p, c); state->qcount = 0; state->flags &= ~PFLAG_END_CANDIDATE; state->flags |= PFLAG_INSTRING; return 1; } else { /* We are at beginning of string */ state->argn++; if (c != '`') { state->flags |= PFLAG_INSTRING; push_buf(p, c); } return 1; } } static int ampersand(JanetParser *p, JanetParseState *state, uint8_t c) { (void) state; p->statecount--; switch (c) { case '{': pushstate(p, dotable, PFLAG_CONTAINER | PFLAG_CURLYBRACKETS); return 1; case '"': pushstate(p, stringchar, PFLAG_BUFFER | PFLAG_STRING); return 1; case '`': pushstate(p, longstring, PFLAG_BUFFER | PFLAG_LONGSTRING); return 1; case '[': pushstate(p, doarray, PFLAG_CONTAINER | PFLAG_SQRBRACKETS); return 1; case '(': pushstate(p, doarray, PFLAG_CONTAINER | PFLAG_PARENS); return 1; default: break; } pushstate(p, tokenchar, 0); push_buf(p, '@'); /* Push the leading ampersand that was dropped */ return 0; } /* The root state of the parser */ static int root(JanetParser *p, JanetParseState *state, uint8_t c) { switch (c) { default: if (is_whitespace(c)) return 1; if (!is_symbol_char(c)) { p->error = "unexpected character"; return 1; } pushstate(p, tokenchar, 0); return 0; case '\'': state->qcount++; return 1; case '"': pushstate(p, stringchar, PFLAG_STRING); return 1; case '#': pushstate(p, comment, 0); return 1; case '@': pushstate(p, ampersand, 0); return 1; case '`': pushstate(p, longstring, PFLAG_LONGSTRING); return 1; case ')': case ']': case '}': p->error = "mismatched delimiter"; return 1; case '(': pushstate(p, dotuple, PFLAG_CONTAINER | PFLAG_PARENS); return 1; case '[': pushstate(p, dotuple, PFLAG_CONTAINER | PFLAG_SQRBRACKETS); return 1; case '{': pushstate(p, dostruct, PFLAG_CONTAINER | PFLAG_CURLYBRACKETS); return 1; } } int janet_parser_consume(JanetParser *parser, uint8_t c) { int consumed = 0; if (parser->error) return 0; if (c == '\n') { parser->line++; parser->col = 0; } else if (c != '\r') { parser->col++; } while (!consumed && !parser->error) { JanetParseState *state = parser->states + parser->statecount - 1; consumed = state->consumer(parser, state, c); } parser->lookback = c; return 1; } enum JanetParserStatus janet_parser_status(JanetParser *parser) { if (parser->error) return JANET_PARSE_ERROR; if (parser->statecount > 1) return JANET_PARSE_PENDING; if (parser->argcount) return JANET_PARSE_FULL; return JANET_PARSE_ROOT; } void janet_parser_flush(JanetParser *parser) { parser->argcount = 0; parser->statecount = 1; parser->bufcount = 0; } const char *janet_parser_error(JanetParser *parser) { enum JanetParserStatus status = janet_parser_status(parser); if (status == JANET_PARSE_ERROR) { const char *e = parser->error; parser->error = NULL; janet_parser_flush(parser); return e; } return NULL; } Janet janet_parser_produce(JanetParser *parser) { Janet ret; size_t i; enum JanetParserStatus status = janet_parser_status(parser); if (status != JANET_PARSE_FULL) return janet_wrap_nil(); ret = parser->args[0]; for (i = 1; i < parser->argcount; i++) { parser->args[i - 1] = parser->args[i]; } parser->argcount--; return ret; } void janet_parser_init(JanetParser *parser) { parser->args = NULL; parser->states = NULL; parser->buf = NULL; parser->argcount = 0; parser->argcap = 0; parser->bufcount = 0; parser->bufcap = 0; parser->statecount = 0; parser->statecap = 0; parser->error = NULL; parser->line = 1; parser->col = 0; parser->lookback = -1; pushstate(parser, root, PFLAG_CONTAINER); } void janet_parser_deinit(JanetParser *parser) { free(parser->args); free(parser->buf); free(parser->states); } /* C functions */ static int parsermark(void *p, size_t size) { size_t i; JanetParser *parser = (JanetParser *)p; (void) size; for (i = 0; i < parser->argcount; i++) { janet_mark(parser->args[i]); } return 0; } static int parsergc(void *p, size_t size) { JanetParser *parser = (JanetParser *)p; (void) size; janet_parser_deinit(parser); return 0; } static JanetAbstractType janet_parse_parsertype = { ":core.parser", parsergc, parsermark }; JanetParser *janet_check_parser(Janet x) { if (!janet_checktype(x, JANET_ABSTRACT)) return NULL; void *abstract = janet_unwrap_abstract(x); if (janet_abstract_type(abstract) != &janet_parse_parsertype) return NULL; return (JanetParser *)abstract; } /* C Function parser */ static int cfun_parser(JanetArgs args) { JANET_FIXARITY(args, 0); JanetParser *p = janet_abstract(&janet_parse_parsertype, sizeof(JanetParser)); janet_parser_init(p); JANET_RETURN_ABSTRACT(args, p); } static int cfun_consume(JanetArgs args) { const uint8_t *bytes; int32_t len; JanetParser *p; int32_t i; JANET_FIXARITY(args, 2); JANET_CHECKABSTRACT(args, 0, &janet_parse_parsertype); p = (JanetParser *) janet_unwrap_abstract(args.v[0]); JANET_ARG_BYTES(bytes, len, args, 1); for (i = 0; i < len; i++) { janet_parser_consume(p, bytes[i]); switch (janet_parser_status(p)) { case JANET_PARSE_ROOT: case JANET_PARSE_PENDING: break; default: { JanetBuffer *b = janet_buffer(len - i); janet_buffer_push_bytes(b, bytes + i + 1, len - i - 1); JANET_RETURN_BUFFER(args, b); } } } JANET_RETURN(args, janet_wrap_nil()); } static int cfun_byte(JanetArgs args) { int32_t i; JanetParser *p; JANET_FIXARITY(args, 2); JANET_CHECKABSTRACT(args, 0, &janet_parse_parsertype); p = (JanetParser *) janet_unwrap_abstract(args.v[0]); JANET_ARG_INTEGER(i, args, 1); janet_parser_consume(p, 0xFF & i); JANET_RETURN(args, args.v[0]); } static int cfun_status(JanetArgs args) { const char *stat = NULL; JanetParser *p; JANET_FIXARITY(args, 1); JANET_CHECKABSTRACT(args, 0, &janet_parse_parsertype); p = (JanetParser *) janet_unwrap_abstract(args.v[0]); switch (janet_parser_status(p)) { case JANET_PARSE_FULL: stat = ":full"; break; case JANET_PARSE_PENDING: stat = ":pending"; break; case JANET_PARSE_ERROR: stat = ":error"; break; case JANET_PARSE_ROOT: stat = ":root"; break; } JANET_RETURN_CSYMBOL(args, stat); } static int cfun_error(JanetArgs args) { const char *err; JanetParser *p; JANET_FIXARITY(args, 1); JANET_CHECKABSTRACT(args, 0, &janet_parse_parsertype); p = (JanetParser *) janet_unwrap_abstract(args.v[0]); err = janet_parser_error(p); if (err) { JANET_RETURN_CSYMBOL(args, err); } else { JANET_RETURN_NIL(args); } } static int cfun_produce(JanetArgs args) { Janet val; JanetParser *p; JANET_FIXARITY(args, 1); JANET_CHECKABSTRACT(args, 0, &janet_parse_parsertype); p = (JanetParser *) janet_unwrap_abstract(args.v[0]); val = janet_parser_produce(p); JANET_RETURN(args, val); } static int cfun_flush(JanetArgs args) { JanetParser *p; JANET_FIXARITY(args, 1); JANET_CHECKABSTRACT(args, 0, &janet_parse_parsertype); p = (JanetParser *) janet_unwrap_abstract(args.v[0]); janet_parser_flush(p); JANET_RETURN(args, args.v[0]); } static int cfun_where(JanetArgs args) { JanetParser *p; JANET_FIXARITY(args, 1); JANET_CHECKABSTRACT(args, 0, &janet_parse_parsertype); p = (JanetParser *) janet_unwrap_abstract(args.v[0]); Janet *tup = janet_tuple_begin(2); tup[0] = janet_wrap_integer((int32_t)p->line); tup[1] = janet_wrap_integer((int32_t)p->col); JANET_RETURN_TUPLE(args, janet_tuple_end(tup)); } static int cfun_state(JanetArgs args) { size_t i; const uint8_t *str; size_t oldcount; JanetParser *p; JANET_FIXARITY(args, 1); JANET_CHECKABSTRACT(args, 0, &janet_parse_parsertype); p = (JanetParser *) janet_unwrap_abstract(args.v[0]); oldcount = p->bufcount; for (i = 0; i < p->statecount; i++) { JanetParseState *s = p->states + i; if (s->flags & PFLAG_PARENS) { push_buf(p, '('); } else if (s->flags & PFLAG_SQRBRACKETS) { push_buf(p, '['); } else if (s->flags & PFLAG_CURLYBRACKETS) { push_buf(p, '{'); } else if (s->flags & PFLAG_STRING) { push_buf(p, '"'); } else if (s->flags & PFLAG_LONGSTRING) { int32_t i; for (i = 0; i < s->argn; i++) { push_buf(p, '`'); } } } str = janet_string(p->buf + oldcount, (int32_t)(p->bufcount - oldcount)); p->bufcount = oldcount; JANET_RETURN_STRING(args, str); } static const JanetReg cfuns[] = { {"parser.new", cfun_parser}, {"parser.produce", cfun_produce}, {"parser.consume", cfun_consume}, {"parser.byte", cfun_byte}, {"parser.error", cfun_error}, {"parser.status", cfun_status}, {"parser.flush", cfun_flush}, {"parser.state", cfun_state}, {"parser.where", cfun_where}, {NULL, NULL} }; /* Load the library */ int janet_lib_parse(JanetArgs args) { JanetTable *env = janet_env(args); janet_cfuns(env, NULL, cfuns); return 0; }