From 6e38bf15789405bdd00ce099646ad3a5a465b675 Mon Sep 17 00:00:00 2001
From: Calvin Rose <calsrose@gmail.com>
Date: Fri, 29 Oct 2021 11:08:53 -0500
Subject: [PATCH] Use more inclusive check for the %j formatter for valid
 symbols.

We did not allow arbitrary utf8 to be printed with %j, even though the parser
allows it. This change uses the existing built-in utf8 detection to
exclude only unprintable symbols from the docstring.
---
 src/core/math.c      |  4 ++--
 src/core/parse.c     | 14 +++++++-------
 src/core/pp.c        | 12 ++----------
 src/core/util.h      |  2 ++
 test/suite0000.janet |  1 +
 5 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/core/math.c b/src/core/math.c
index 6c91664b..ce6a61fe 100644
--- a/src/core/math.c
+++ b/src/core/math.c
@@ -331,7 +331,7 @@ static double janet_lcm(double x, double y) {
 }
 
 JANET_CORE_FN(janet_cfun_gcd, "(math/gcd x y)",
-        "Returns the greatest common divisor between x and y.") {
+              "Returns the greatest common divisor between x and y.") {
     janet_fixarity(argc, 2);
     double x = janet_getnumber(argv, 0);
     double y = janet_getnumber(argv, 1);
@@ -339,7 +339,7 @@ JANET_CORE_FN(janet_cfun_gcd, "(math/gcd x y)",
 }
 
 JANET_CORE_FN(janet_cfun_lcm, "(math/lcm x y)",
-        "Returns the least common multiple of x and y.") {
+              "Returns the least common multiple of x and y.") {
     janet_fixarity(argc, 2);
     double x = janet_getnumber(argv, 0);
     double y = janet_getnumber(argv, 1);
diff --git a/src/core/parse.c b/src/core/parse.c
index 8abc7221..f667fc18 100644
--- a/src/core/parse.c
+++ b/src/core/parse.c
@@ -51,15 +51,15 @@ static const uint32_t symchars[8] = {
 };
 
 /* Check if a character is a valid symbol character
- * symbol chars are A-Z, a-z, 0-9, or one of !$&*+-./:<=>@\^_~| */
-static int is_symbol_char(uint8_t c) {
+ * symbol chars are A-Z, a-z, 0-9, or one of !$&*+-./:<=>@\^_| */
+int janet_is_symbol_char(uint8_t c) {
     return symchars[c >> 5] & ((uint32_t)1 << (c & 0x1F));
 }
 
 /* Validate some utf8. Useful for identifiers. Only validates
  * the encoding, does not check for valid code points (they
  * are less well defined than the encoding). */
-static int valid_utf8(const uint8_t *str, int32_t len) {
+int janet_valid_utf8(const uint8_t *str, int32_t len) {
     int32_t i = 0;
     int32_t j;
     while (i < len) {
@@ -411,7 +411,7 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) {
     Janet ret;
     double numval;
     int32_t blen;
-    if (is_symbol_char(c)) {
+    if (janet_is_symbol_char(c)) {
         push_buf(p, (uint8_t) c);
         if (c > 127) state->argn = 1; /* Use to indicate non ascii */
         return 1;
@@ -422,7 +422,7 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) {
     int start_num = start_dig || p->buf[0] == '-' || p->buf[0] == '+' || p->buf[0] == '.';
     if (p->buf[0] == ':') {
         /* Don't do full utf-8 check unless we have seen non ascii characters. */
-        int valid = (!state->argn) || valid_utf8(p->buf + 1, blen - 1);
+        int valid = (!state->argn) || janet_valid_utf8(p->buf + 1, blen - 1);
         if (!valid) {
             p->error = "invalid utf-8 in keyword";
             return 0;
@@ -442,7 +442,7 @@ static int tokenchar(JanetParser *p, JanetParseState *state, uint8_t c) {
             return 0;
         } else {
             /* Don't do full utf-8 check unless we have seen non ascii characters. */
-            int valid = (!state->argn) || valid_utf8(p->buf, blen);
+            int valid = (!state->argn) || janet_valid_utf8(p->buf, blen);
             if (!valid) {
                 p->error = "invalid utf-8 in symbol";
                 return 0;
@@ -582,7 +582,7 @@ static int root(JanetParser *p, JanetParseState *state, uint8_t c) {
     switch (c) {
         default:
             if (is_whitespace(c)) return 1;
-            if (!is_symbol_char(c)) {
+            if (!janet_is_symbol_char(c)) {
                 p->error = "unexpected character";
                 return 1;
             }
diff --git a/src/core/pp.c b/src/core/pp.c
index bd3fd8c2..dc8034bd 100644
--- a/src/core/pp.c
+++ b/src/core/pp.c
@@ -261,21 +261,13 @@ void janet_to_string_b(JanetBuffer *buffer, Janet x) {
 
 /* See parse.c for full table */
 
-static const uint32_t pp_symchars[8] = {
-    0x00000000, 0xf7ffec72, 0xc7ffffff, 0x07fffffe,
-    0x00000000, 0x00000000, 0x00000000, 0x00000000
-};
-
-static int pp_is_symbol_char(uint8_t c) {
-    return pp_symchars[c >> 5] & ((uint32_t)1 << (c & 0x1F));
-}
-
 /* Check if a symbol or keyword contains no symbol characters */
 static int contains_bad_chars(const uint8_t *sym, int issym) {
     int32_t len = janet_string_length(sym);
     if (len && issym && sym[0] >= '0' && sym[0] <= '9') return 1;
+    if (!janet_valid_utf8(sym, len)) return 1;
     for (int32_t i = 0; i < len; i++) {
-        if (!pp_is_symbol_char(sym[i])) return 1;
+        if (!janet_is_symbol_char(sym[i])) return 1;
     }
     return 0;
 }
diff --git a/src/core/util.h b/src/core/util.h
index 6a4e57f0..9a144e73 100644
--- a/src/core/util.h
+++ b/src/core/util.h
@@ -57,6 +57,8 @@
 
 /* Utils */
 #define janet_maphash(cap, hash) ((uint32_t)(hash) & (cap - 1))
+int janet_valid_utf8(const uint8_t *str, int32_t len);
+int janet_is_symbol_char(uint8_t c);
 extern const char janet_base64[65];
 int32_t janet_array_calchash(const Janet *array, int32_t len);
 int32_t janet_kv_calchash(const JanetKV *kvs, int32_t len);
diff --git a/test/suite0000.janet b/test/suite0000.janet
index fd720b9b..39d63b6e 100644
--- a/test/suite0000.janet
+++ b/test/suite0000.janet
@@ -202,6 +202,7 @@
 
 #🐙🐙🐙🐙
 
+(defn foo [Θa Θb Θc] 0)
 (def 🦊 :fox)
 (def 🐮 :cow)
 (assert (= (string "🐼" 🦊 🐮) "🐼foxcow") "emojis 🙉 :)")