/* * Copyright (c) 2017 Calvin Rose * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include #include "strtod.h" /* Checks if a string slice is equal to a string constant */ static int check_str_const(const char *ref, const uint8_t *start, const uint8_t *end) { while (*ref && start < end) { if (*ref != *(char *)start) return 0; ++ref; ++start; } return !*ref && start == end; } /* Quote a value */ static DstValue quote(DstValue x) { DstValue *t = dst_tuple_begin(2); t[0] = dst_csymbolv("quote"); t[1] = x; return dst_wrap_tuple(dst_tuple_end(t)); } /* Check if a character is whitespace */ static int is_whitespace(uint8_t c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\0' || c == ';' || c == ','; } /* Code gen printf("static uint32_t symchars[8] = {\n\t"); for (int i = 0; i < 256; i += 32) { uint32_t block = 0; for (int j = 0; j < 32; j++) { block |= is_symbol_char_gen(i + j) << j; } printf("0x%08x%s", block, (i == (256 - 32)) ? "" : ", "); } printf("\n};\n"); static int is_symbol_char_gen(uint8_t c) { if (c >= 'a' && c <= 'z') return 1; if (c >= 'A' && c <= 'Z') return 1; if (c >= '0' && c <= '9') return 1; return (c == '!' || c == '$' || c == '%' || c == '&' || c == '*' || c == '+' || c == '-' || c == '.' || c == '/' || c == ':' || c == '<' || c == '=' || c == '>' || c == '@' || c == '\\' || c == '^' || c == '_' || c == '~' || c == '|'); } The table contains 256 bits, where each bit is 1 if the corresponding ascci code is a symbol char, and 0 if not. The upper characters are also considered symbol chars and are then checked for utf-8 compliance. */ static uint32_t symchars[256] = { 0x00000000, 0x77ffec72, 0xd7ffffff, 0x57fffffe, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; /* Check if a character is a valid symbol character */ /* TODO - allow utf8 - shouldn't be difficult, err on side * of inclusivity * symbol chars are A-Z, a-z, 0-9, or one of !$&*+-./:<=>@\^_~| */ static int is_symbol_char(uint8_t c) { return symchars[c >> 5] & (1 << (c & 0x1F)); } /* Validate some utf8. Useful for identifiers. Only validates * the encoding, does not check for valid codepoints (they * are less well defined than the encoding). */ static int valid_utf8(const uint8_t *str, int32_t len) { int32_t i = 0; int32_t j; while (i < len) { int32_t nexti; uint8_t c = str[i]; /* Check the number of bytes in code point */ if (c < 0x80) nexti = i + 1; else if ((c >> 5) == 0x06) nexti = i + 2; else if ((c >> 4) == 0x0E) nexti = i + 3; else if ((c >> 3) == 0x1E) nexti = i + 4; /* Don't allow 5 or 6 byte code points */ else return 0; /* No overflow */ if (nexti > len) return 0; /* Ensure trailing bytes are well formed (10XX XXXX) */ for (j = i + 1; j < nexti; j++) { if ((str[j] >> 6) != 2) return 0; } /* Check for overlong encodings */ if ((nexti == i + 2) && str[i] < 0xC2) return 0; if ((str[i] == 0xE0) && str[i + 1] < 0xA0) return 0; if ((str[i] == 0xF0) && str[i + 1] < 0x90) return 0; i = nexti; } return 1; } /* Get hex digit from a letter */ static int to_hex(uint8_t c) { if (c >= '0' && c <= '9') { return c - '0'; } else if (c >= 'A' && c <= 'F') { return 10 + c - 'A'; } else if (c >= 'a' && c <= 'f') { return 10 + c - 'a'; } else { return -1; } } /* Make source mapping for atom (non recursive structure) */ static DstValue atom_map(int32_t start, int32_t end) { DstValue *t = dst_tuple_begin(2); t[0] = dst_wrap_integer(start); t[1] = dst_wrap_integer(end); return dst_wrap_tuple(dst_tuple_end(t)); } /* Create mappingd for recursive data structure */ static DstValue ds_map(int32_t start, int32_t end, DstValue submap) { DstValue *t = dst_tuple_begin(3); t[0] = dst_wrap_integer(start); t[1] = dst_wrap_integer(end); t[2] = submap; return dst_wrap_tuple(dst_tuple_end(t)); } /* Create a sourcemapping for a key value pair */ static DstValue kv_map(DstValue k, DstValue v) { DstValue *t = dst_tuple_begin(2); t[0] = k; t[1] = v; return dst_wrap_tuple(dst_tuple_end(t)); } typedef struct { DstArray stack; DstArray mapstack; const uint8_t *srcstart; const uint8_t *end; const char *errmsg; DstParseStatus status; } ParseArgs; /* Entry point of the recursive descent parser */ static const uint8_t *parse_recur( ParseArgs *args, const uint8_t *src, int32_t recur) { const uint8_t *end = args->end; const uint8_t *mapstart; int32_t qcount = 0; DstValue ret; DstValue submapping; /* Prevent stack overflow */ if (recur == 0) goto too_much_recur; /* try parsing again */ begin: /* Trim leading whitespace and count quotes */ while (src < end && (is_whitespace(*src) || *src == '\'')) { if (*src == '\'') { ++qcount; } ++src; } /* Check for end of source */ if (src >= end) { if (qcount || recur != DST_RECURSION_GUARD) { goto unexpected_eos; } else { goto nodata; } } /* Open mapping */ mapstart = src; submapping = dst_wrap_nil(); /* Detect token type based on first character */ switch (*src) { /* Numbers, symbols, simple literals */ default: atom: { DstValue numcheck; const uint8_t *tokenend = src; if (!is_symbol_char(*src)) goto unexpected_character; while (tokenend < end && is_symbol_char(*tokenend)) tokenend++; numcheck = dst_scan_number(src, tokenend - src); if (!dst_checktype(numcheck, DST_NIL)) { ret = numcheck; } else if (check_str_const("nil", src, tokenend)) { ret = dst_wrap_nil(); } else if (check_str_const("false", src, tokenend)) { ret = dst_wrap_boolean(0); } else if (check_str_const("true", src, tokenend)) { ret = dst_wrap_boolean(1); } else { if (*src >= '0' && *src <= '9') { goto sym_nodigits; } else { if (!valid_utf8(src, tokenend - src)) goto invalid_utf8; if (*src == ':') { ret = dst_stringv(src + 1, tokenend - src - 1); } else { ret = dst_symbolv(src, tokenend - src); } } } src = tokenend; break; } case '#': { /* Jump to next newline */ while (src < end && *src != '\n') ++src; goto begin; } /* String literals */ case '"': { const uint8_t *strend = ++src; const uint8_t *strstart = strend; int32_t len = 0; int containsEscape = 0; /* Preprocess string to check for escapes and string end */ while (strend < end && *strend != '"') { len++; if (*strend++ == '\\') { containsEscape = 1; if (strend >= end) goto unexpected_eos; if (*strend == 'h') { strend += 3; if (strend >= end) goto unexpected_eos; } else { strend++; if (strend >= end) goto unexpected_eos; } } } if (containsEscape) { uint8_t *buf = dst_string_begin(len); uint8_t *write = buf; while (src < strend) { if (*src == '\\') { src++; switch (*src++) { case 'n': *write++ = '\n'; break; case 'r': *write++ = '\r'; break; case 't': *write++ = '\t'; break; case 'f': *write++ = '\f'; break; case '0': *write++ = '\0'; break; case '"': *write++ = '"'; break; case '\'': *write++ = '\''; break; case 'z': *write++ = '\0'; break; case 'e': *write++ = 27; break; case 'h': { int d1 = to_hex(*src++); int d2 = to_hex(*src++); if (d1 < 0 || d2 < 0) goto invalid_hex; *write++ = 16 * d1 + d2; break; } default: goto unknown_strescape; } } else { *write++ = *src++; } } ret = dst_wrap_string(dst_string_end(buf)); } else { ret = dst_wrap_string(dst_string(strstart, strend - strstart)); } src = strend + 1; break; } /* Data Structure literals */ case '@': if (src[1] != '{') goto atom; case '(': case '[': case '{': { int32_t n = 0, i = 0; int32_t istable = 0; uint8_t close; switch (*src++) { case '[': close = ']'; break; case '{': close = '}'; break; case '@': close = '}'; src++; istable = 1; break; default: close = ')'; break; } /* Trim trailing whitespace */ while (src < end && (is_whitespace(*src))) ++src; /* Recursively parse inside literal */ while (*src != close) { src = parse_recur(args, src, recur - 1); if (args->errmsg || !src) return src; n++; /* Trim trailing whitespace */ while (src < end && (is_whitespace(*src))) ++src; } src++; switch (close) { case ')': { DstValue *tup = dst_tuple_begin(n); DstValue *subtup = dst_tuple_begin(n); for (i = n; i > 0; i--) { tup[i - 1] = dst_array_pop(&args->stack); subtup[i - 1] = dst_array_pop(&args->mapstack); } ret = dst_wrap_tuple(dst_tuple_end(tup)); submapping = dst_wrap_tuple(dst_tuple_end(subtup)); break; } case ']': { DstArray *arr = dst_array(n); DstArray *subarr = dst_array(n); for (i = n; i > 0; i--) { arr->data[i - 1] = dst_array_pop(&args->stack); subarr->data[i - 1] = dst_array_pop(&args->mapstack); } arr->count = n; subarr->count = n; ret = dst_wrap_array(arr); submapping = dst_wrap_array(subarr); break; } case '}': { if (n & 1) { if (istable) goto table_oddargs; goto struct_oddargs; } if (istable) { DstTable *t = dst_table(n); DstTable *subt = dst_table(n); for (i = n; i > 0; i -= 2) { DstValue val = dst_array_pop(&args->stack); DstValue key = dst_array_pop(&args->stack); DstValue subval = dst_array_pop(&args->mapstack); DstValue subkey = dst_array_pop(&args->mapstack); dst_table_put(t, key, val); dst_table_put(subt, key, kv_map(subkey, subval)); } ret = dst_wrap_table(t); submapping = dst_wrap_table(subt); } else { DstValue *st = dst_struct_begin(n >> 1); DstValue *subst = dst_struct_begin(n >> 1); for (i = n; i > 0; i -= 2) { DstValue val = dst_array_pop(&args->stack); DstValue key = dst_array_pop(&args->stack); DstValue subval = dst_array_pop(&args->mapstack); DstValue subkey = dst_array_pop(&args->mapstack); dst_struct_put(st, key, val); dst_struct_put(subst, key, kv_map(subkey, subval)); } ret = dst_wrap_struct(dst_struct_end(st)); submapping = dst_wrap_struct(dst_struct_end(subst)); } break; } } break; } } /* Push source mapping */ if (dst_checktype(submapping, DST_NIL)) { /* We just parsed an atom */ dst_array_push(&args->mapstack, atom_map( mapstart - args->srcstart, src - args->srcstart)); } else { /* We just parsed a recursive data structure */ dst_array_push(&args->mapstack, ds_map( mapstart - args->srcstart, src - args->srcstart, submapping)); } /* Quote the returned value qcount times */ while (qcount--) { int32_t start = mapstart - args->srcstart; int32_t end = src - args->srcstart; DstValue sourcemap = dst_array_pop(&args->mapstack); DstValue* tup = dst_tuple_begin(2); tup[0] = atom_map(start, end); tup[1] = sourcemap; ret = quote(ret); dst_array_push(&args->mapstack, ds_map( start, end, dst_wrap_tuple(dst_tuple_end(tup)))); } /* Push the result to the stack */ dst_array_push(&args->stack, ret); /* Return the new source position for further calls */ return src; /* Errors below */ nodata: args->status = DST_PARSE_NODATA; return NULL; unexpected_eos: args->errmsg = "unexpected end of source"; args->status = DST_PARSE_UNEXPECTED_EOS; return NULL; unexpected_character: args->errmsg = "unexpected character"; args->status = DST_PARSE_ERROR; return src; sym_nodigits: args->errmsg = "symbols cannot start with digits"; args->status = DST_PARSE_ERROR; return src; table_oddargs: args->errmsg = "table literal needs an even number of arguments"; args->status = DST_PARSE_ERROR; return src; struct_oddargs: args->errmsg = "struct literal needs an even number of arguments"; args->status = DST_PARSE_ERROR; return src; unknown_strescape: args->errmsg = "unknown string escape sequence"; args->status = DST_PARSE_ERROR; return src; invalid_hex: args->errmsg = "invalid hex escape in string"; args->status = DST_PARSE_ERROR; return src; invalid_utf8: args->errmsg = "identifier is not valid utf-8"; args->status = DST_PARSE_ERROR; return src; too_much_recur: args->errmsg = "recursed too deeply in parsing"; args->status = DST_PARSE_ERROR; return src; } /* Parse an array of bytes. Return value in the fiber return value. */ DstParseResult dst_parse(const uint8_t *src, int32_t len) { DstParseResult res; ParseArgs args; const uint8_t *newsrc; dst_array_init(&args.stack, 10); args.status = DST_PARSE_OK; args.srcstart = src; args.end = src + len; args.errmsg = NULL; dst_array_init(&args.mapstack, 10); newsrc = parse_recur(&args, src, DST_RECURSION_GUARD); res.status = args.status; res.bytes_read = (int32_t) (newsrc - src); if (args.errmsg) { res.error = dst_cstring(args.errmsg); res.value = dst_wrap_nil(); res.map = NULL; } else { res.value = dst_array_pop(&args.stack); res.error = NULL; res.map = dst_unwrap_tuple(dst_array_pop(&args.mapstack)); } dst_array_deinit(&args.stack); dst_array_deinit(&args.mapstack); return res; } /* Parse a c string */ DstParseResult dst_parsec(const char *src) { int32_t len = 0; while (src[len]) ++len; return dst_parse((const uint8_t *)src, len); }