Minor fixes for parser

Check length before dereferencing buffer in tokenchar. Check that keywords are valid UTF-8. Fix minor typos.
2025-08-09 15:34:28 +00:00 · 2019-11-23 16:55:23 +13:00 · 2019-11-23 16:55:23 +13:00 · 976dfc7195
commit 976dfc7195
parent 8372d1e499
2 changed files with 19 additions and 6 deletions
--- a/src/core/parse.c
+++ b/src/core/parse.c
@ -38,7 +38,7 @@ static int is_whitespace(uint8_t c) {

 /* Code generated by tools/symcharsgen.c.
 * The table contains 256 bits, where each bit is 1
- * if the corresponding ascci code is a symbol char, and 0
+ * if the corresponding ascii code is a symbol char, and 0
 * if not. The upper characters are also considered symbol
 * chars and are then checked for utf-8 compliance. */
 static const uint32_t symchars[8] = {
@ -233,7 +233,7 @@ static int escapeh(JanetParser *p, JanetParseState *state, uint8_t c) {
        p->error = "invalid hex digit in hex escape";
        return 1;
    }
-    state->argn = (state->argn << 4) + digit;;
+    state->argn = (state->argn << 4) + digit;
    state->counter--;
    if (!state->counter) {
        push_buf(p, (state->argn & 0xFF));
@ -329,6 +329,12 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) {
    int start_dig = p->buf[0] >= '0' && p->buf[0] <= '9';
    int start_num = start_dig || p->buf[0] == '-' || p->buf[0] == '+' || p->buf[0] == '.';
    if (p->buf[0] == ':') {
+        /* Skip the full UTF-8 check unless non-ASCII characters have been seen. */
+        int valid = (!state->argn) || valid_utf8(p->buf + 1, blen - 1);
+        if (!valid) {
+            p->error = "invalid utf-8 in keyword";
+            return 0;
+        }
        ret = janet_keywordv(p->buf + 1, blen - 1);
    } else if (start_num && !janet_scan_number(p->buf, blen, &numval)) {
        ret = janet_wrap_number(numval);
@ -338,7 +344,7 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) {
        ret = janet_wrap_false();
    } else if (!check_str_const("true", p->buf, blen)) {
        ret = janet_wrap_true();
-    } else if (p->buf) {
+    } else {
        if (start_dig) {
            p->error = "symbol literal cannot start with a digit";
            return 0;
@ -351,9 +357,6 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) {
            }
            ret = janet_symbolv(p->buf, blen);
        }
-    } else {
-        p->error = "empty symbol invalid";
-        return 0;
    }
    p->bufcount = 0;
    popstate(p, ret);
--- a/test/suite6.janet
+++ b/test/suite6.janet
@ -118,6 +118,16 @@
 (assert (deep= (parser/status p) (parser/status p2)) "parser 2")
 (assert (deep= (parser/state p) (parser/state p2)) "parser 3")

+# Parser errors
+(defn parse-error [input]
+  (def p (parser/new))
+  (parser/consume p input)
+  (parser/error p))
+
+# Invalid utf-8 sequences
+(assert (not= nil (parse-error @"\xc3\x28")) "reject invalid utf-8 symbol")
+(assert (not= nil (parse-error @":\xc3\x28")) "reject invalid utf-8 keyword")
+
 # String check-set
 (assert (string/check-set "abc" "a") "string/check-set 1")
 (assert (not (string/check-set "abc" "z")) "string/check-set 2")