Address #306 - Add unicode escapes.

Unicode escapes use the same syntax as Go: \uXXXX (exactly 4 hex digits) or \UXXXXXXXX (exactly 8 hex digits).
2025-10-19 17:57:40 +00:00 · 2020-04-04 21:46:08 -05:00
parent 081d132538
commit ae70a03383
3 changed files with 46 additions and 2 deletions
--- a/src/core/parse.c
+++ b/src/core/parse.c
@@ -201,6 +201,8 @@ static int checkescape(uint8_t c) {
        default:
            return -1;
        case 'x':
+        case 'u':
+        case 'U':
            return 1;
        case 'n':
            return '\n';
@@ -228,6 +230,24 @@ static int checkescape(uint8_t c) {
 /* Forward declare */
 static int stringchar(JanetParser *p, JanetParseState *state, uint8_t c);

+/*
+ * Append the UTF-8 encoding of `codepoint` to the parser's buffer through
+ * push_buf, using 1 to 4 bytes depending on the magnitude of the value.
+ *
+ * Values that are not Unicode scalar values - negative numbers, anything
+ * above 0x10FFFF, and the UTF-16 surrogate range 0xD800-0xDFFF - have no
+ * valid UTF-8 encoding. For those, nothing is pushed and p->error is set.
+ */
+static void write_codepoint(JanetParser *p, int32_t codepoint) {
+    if (codepoint < 0 || codepoint > 0x10FFFF ||
+            (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
+        /* Without this check the 4-byte branch below would keep only the
+         * low 21 bits and silently encode a different character, and a
+         * negative value would be emitted as a single raw byte. */
+        p->error = "invalid unicode codepoint";
+        return;
+    }
+    if (codepoint <= 0x7F) {
+        /* 1 byte: 0xxxxxxx */
+        push_buf(p, (uint8_t) codepoint);
+    } else if (codepoint <= 0x7FF) {
+        /* 2 bytes: 110xxxxx 10xxxxxx */
+        push_buf(p, (uint8_t)(0xC0 | ((codepoint >> 6) & 0x1F)));
+        push_buf(p, (uint8_t)(0x80 | (codepoint & 0x3F)));
+    } else if (codepoint <= 0xFFFF) {
+        /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+        push_buf(p, (uint8_t)(0xE0 | ((codepoint >> 12) & 0x0F)));
+        push_buf(p, (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F)));
+        push_buf(p, (uint8_t)(0x80 | (codepoint & 0x3F)));
+    } else {
+        /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+        push_buf(p, (uint8_t)(0xF0 | ((codepoint >> 18) & 0x07)));
+        push_buf(p, (uint8_t)(0x80 | ((codepoint >> 12) & 0x3F)));
+        push_buf(p, (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F)));
+        push_buf(p, (uint8_t)(0x80 | (codepoint & 0x3F)));
+    }
+}
+
 static int escapeh(JanetParser *p, JanetParseState *state, uint8_t c) {
    int digit = to_hex(c);
    if (digit < 0) {
@@ -237,7 +257,23 @@ static int escapeh(JanetParser *p, JanetParseState *state, uint8_t c) {
    state->argn = (state->argn << 4) + digit;
    state->counter--;
    if (!state->counter) {
-        push_buf(p, (state->argn & 0xFF));
+        push_buf(p, (uint8_t)(state->argn & 0xFF));
+        state->argn = 0;
+        state->consumer = stringchar;
+    }
+    return 1;
+}
+
+static int escapeu(JanetParser *p, JanetParseState *state, uint8_t c) {
+    int digit = to_hex(c);
+    if (digit < 0) {
+        p->error = "invalid hex digit in unicode escape";
+        return 1;
+    }
+    state->argn = (state->argn << 4) + digit;
+    state->counter--;
+    if (!state->counter) {
+        write_codepoint(p, state->argn);
        state->argn = 0;
        state->consumer = stringchar;
    }
@@ -254,6 +290,10 @@ static int escape1(JanetParser *p, JanetParseState *state, uint8_t c) {
        state->counter = 2;
        state->argn = 0;
        state->consumer = escapeh;
+    } else if (c == 'u' || c == 'U') {
+        state->counter = c == 'u' ? 4 : 8;
+        state->argn = 0;
+        state->consumer = escapeu;
    } else {
        push_buf(p, (uint8_t) e);
        state->consumer = stringchar;
--- a/test/suite0.janet
+++ b/test/suite0.janet
@@ -206,6 +206,10 @@
 (def 🐮 :cow)
 (assert (= (string "🐼" 🦊 🐮) "🐼foxcow") "emojis 🙉 :)")
 (assert (not= 🦊 "🦊") "utf8 strings are not symbols and vice versa")
+(assert (= "\U0001F637" "😷") "unicode escape 1")
+(assert (= "\u2623" "\U00002623" "☣") "unicode escape 2")
+(assert (= "\u24c2" "\U000024c2" "Ⓜ") "unicode escape 3")
+(assert (= "\u0061" "a") "unicode escape 4")

 # Symbols with @ character

--- a/tools/tm_lang_gen.janet
+++ b/tools/tm_lang_gen.janet
@@ -308,7 +308,7 @@
      <array>
        <dict>
          <key>match</key>
-          <string>(\\[nevr0zft"\\']|\\x[0-9a-fA-F][0-9a-fA-f])</string>
+          <string>(\\[nevr0zft"\\']|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})</string>
          <key>name</key>
          <string>constant.character.escape.janet</string>
        </dict>