From 976dfc7195bd72cd799964eef59eeb261538781e Mon Sep 17 00:00:00 2001 From: Andrew Chambers Date: Sat, 23 Nov 2019 16:55:23 +1300 Subject: [PATCH] Minor fixes for parser Check length before dereferencing buffer in tokenchar. Check keywords are valid utf-8. Fix minor typos. --- src/core/parse.c | 15 +++++++++------ test/suite6.janet | 10 ++++++++++ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/core/parse.c b/src/core/parse.c index eb4ad24a..92486500 100644 --- a/src/core/parse.c +++ b/src/core/parse.c @@ -38,7 +38,7 @@ static int is_whitespace(uint8_t c) { /* Code generated by tools/symcharsgen.c. * The table contains 256 bits, where each bit is 1 - * if the corresponding ascci code is a symbol char, and 0 + * if the corresponding ascii code is a symbol char, and 0 * if not. The upper characters are also considered symbol * chars and are then checked for utf-8 compliance. */ static const uint32_t symchars[8] = { @@ -233,7 +233,7 @@ static int escapeh(JanetParser *p, JanetParseState *state, uint8_t c) { p->error = "invalid hex digit in hex escape"; return 1; } - state->argn = (state->argn << 4) + digit;; + state->argn = (state->argn << 4) + digit; state->counter--; if (!state->counter) { push_buf(p, (state->argn & 0xFF)); @@ -329,6 +329,12 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) { int start_dig = p->buf[0] >= '0' && p->buf[0] <= '9'; int start_num = start_dig || p->buf[0] == '-' || p->buf[0] == '+' || p->buf[0] == '.'; if (p->buf[0] == ':') { + /* Don't do full utf-8 check unless we have seen non ascii characters. */ + int valid = (!state->argn) || valid_utf8(p->buf + 1, blen - 1); + if (!valid) { + p->error = "invalid utf-8 in keyword"; + return 0; + } ret = janet_keywordv(p->buf + 1, blen - 1); } else if (start_num && !janet_scan_number(p->buf, blen, &numval)) { ret = janet_wrap_number(numval); @@ -338,7 +344,7 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) { ret = janet_wrap_false(); } else if (!check_str_const("true", p->buf, blen)) { ret = janet_wrap_true(); - } else if (p->buf) { + } else { if (start_dig) { p->error = "symbol literal cannot start with a digit"; return 0; @@ -351,9 +357,6 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) { } ret = janet_symbolv(p->buf, blen); } - } else { - p->error = "empty symbol invalid"; - return 0; } p->bufcount = 0; popstate(p, ret); diff --git a/test/suite6.janet b/test/suite6.janet index e53ad0c8..b3e836e0 100644 --- a/test/suite6.janet +++ b/test/suite6.janet @@ -118,6 +118,16 @@ (assert (deep= (parser/status p) (parser/status p2)) "parser 2") (assert (deep= (parser/state p) (parser/state p2)) "parser 3") +# Parser errors +(defn parse-error [input] + (def p (parser/new)) + (parser/consume p input) + (parser/error p)) + +# Invalid utf-8 sequences +(assert (not= nil (parse-error @"\xc3\x28")) "reject invalid utf-8 symbol") +(assert (not= nil (parse-error @":\xc3\x28")) "reject invalid utf-8 keyword") + # String check-set (assert (string/check-set "abc" "a") "string/check-set 1") (assert (not (string/check-set "abc" "z")) "string/check-set 2")