From 6e38bf15789405bdd00ce099646ad3a5a465b675 Mon Sep 17 00:00:00 2001 From: Calvin Rose Date: Fri, 29 Oct 2021 11:08:53 -0500 Subject: [PATCH] Use more inclusive check for the %j formatter for valid symbols. We did not allow arbitrary utf8 to be printed with %j, even though the parser allows it. This change uses the existing built-in utf8 detection to exclude only unprintable symbols from the docstring. --- src/core/math.c | 4 ++-- src/core/parse.c | 14 +++++++------- src/core/pp.c | 12 ++---------- src/core/util.h | 2 ++ test/suite0000.janet | 1 + 5 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/core/math.c b/src/core/math.c index 6c91664b..ce6a61fe 100644 --- a/src/core/math.c +++ b/src/core/math.c @@ -331,7 +331,7 @@ static double janet_lcm(double x, double y) { } JANET_CORE_FN(janet_cfun_gcd, "(math/gcd x y)", - "Returns the greatest common divisor between x and y.") { + "Returns the greatest common divisor between x and y.") { janet_fixarity(argc, 2); double x = janet_getnumber(argv, 0); double y = janet_getnumber(argv, 1); @@ -339,7 +339,7 @@ JANET_CORE_FN(janet_cfun_gcd, "(math/gcd x y)", } JANET_CORE_FN(janet_cfun_lcm, "(math/lcm x y)", - "Returns the least common multiple of x and y.") { + "Returns the least common multiple of x and y.") { janet_fixarity(argc, 2); double x = janet_getnumber(argv, 0); double y = janet_getnumber(argv, 1); diff --git a/src/core/parse.c b/src/core/parse.c index 8abc7221..f667fc18 100644 --- a/src/core/parse.c +++ b/src/core/parse.c @@ -51,15 +51,15 @@ static const uint32_t symchars[8] = { }; /* Check if a character is a valid symbol character - * symbol chars are A-Z, a-z, 0-9, or one of !$&*+-./:<=>@\^_~| */ -static int is_symbol_char(uint8_t c) { + * symbol chars are A-Z, a-z, 0-9, or one of !$&*+-./:<=>@\^_| */ +int janet_is_symbol_char(uint8_t c) { return symchars[c >> 5] & ((uint32_t)1 << (c & 0x1F)); } /* Validate some utf8. Useful for identifiers. 
Only validates * the encoding, does not check for valid code points (they * are less well defined than the encoding). */ -static int valid_utf8(const uint8_t *str, int32_t len) { +int janet_valid_utf8(const uint8_t *str, int32_t len) { int32_t i = 0; int32_t j; while (i < len) { @@ -411,7 +411,7 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) { Janet ret; double numval; int32_t blen; - if (is_symbol_char(c)) { + if (janet_is_symbol_char(c)) { push_buf(p, (uint8_t) c); if (c > 127) state->argn = 1; /* Use to indicate non ascii */ return 1; @@ -422,7 +422,7 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) { int start_num = start_dig || p->buf[0] == '-' || p->buf[0] == '+' || p->buf[0] == '.'; if (p->buf[0] == ':') { /* Don't do full utf-8 check unless we have seen non ascii characters. */ - int valid = (!state->argn) || valid_utf8(p->buf + 1, blen - 1); + int valid = (!state->argn) || janet_valid_utf8(p->buf + 1, blen - 1); if (!valid) { p->error = "invalid utf-8 in keyword"; return 0; @@ -442,7 +442,7 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) { return 0; } else { /* Don't do full utf-8 check unless we have seen non ascii characters. 
*/ - int valid = (!state->argn) || valid_utf8(p->buf, blen); + int valid = (!state->argn) || janet_valid_utf8(p->buf, blen); if (!valid) { p->error = "invalid utf-8 in symbol"; return 0; @@ -582,7 +582,7 @@ static int root(JanetParser *p, JanetParseState *state, uint8_t c) { switch (c) { default: if (is_whitespace(c)) return 1; - if (!is_symbol_char(c)) { + if (!janet_is_symbol_char(c)) { p->error = "unexpected character"; return 1; } diff --git a/src/core/pp.c b/src/core/pp.c index bd3fd8c2..dc8034bd 100644 --- a/src/core/pp.c +++ b/src/core/pp.c @@ -261,21 +261,13 @@ void janet_to_string_b(JanetBuffer *buffer, Janet x) { /* See parse.c for full table */ -static const uint32_t pp_symchars[8] = { - 0x00000000, 0xf7ffec72, 0xc7ffffff, 0x07fffffe, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 -}; - -static int pp_is_symbol_char(uint8_t c) { - return pp_symchars[c >> 5] & ((uint32_t)1 << (c & 0x1F)); -} - /* Check if a symbol or keyword contains no symbol characters */ static int contains_bad_chars(const uint8_t *sym, int issym) { int32_t len = janet_string_length(sym); if (len && issym && sym[0] >= '0' && sym[0] <= '9') return 1; + if (!janet_valid_utf8(sym, len)) return 1; for (int32_t i = 0; i < len; i++) { - if (!pp_is_symbol_char(sym[i])) return 1; + if (!janet_is_symbol_char(sym[i])) return 1; } return 0; } diff --git a/src/core/util.h b/src/core/util.h index 6a4e57f0..9a144e73 100644 --- a/src/core/util.h +++ b/src/core/util.h @@ -57,6 +57,8 @@ /* Utils */ #define janet_maphash(cap, hash) ((uint32_t)(hash) & (cap - 1)) +int janet_valid_utf8(const uint8_t *str, int32_t len); +int janet_is_symbol_char(uint8_t c); extern const char janet_base64[65]; int32_t janet_array_calchash(const Janet *array, int32_t len); int32_t janet_kv_calchash(const JanetKV *kvs, int32_t len); diff --git a/test/suite0000.janet b/test/suite0000.janet index fd720b9b..39d63b6e 100644 --- a/test/suite0000.janet +++ b/test/suite0000.janet @@ -202,6 +202,7 @@ #🐙🐙🐙🐙 +(defn foo [Θa Θb Θc] 0) 
(def 🦊 :fox) (def 🐮 :cow) (assert (= (string "🐼" 🦊 🐮) "🐼foxcow") "emojis 🙉 :)")