From 02ae53cec5c1b25b05a38e4c554664ffd9a07a0c Mon Sep 17 00:00:00 2001 From: s-hadinger <49731213+s-hadinger@users.noreply.github.com> Date: Tue, 24 Dec 2024 18:03:21 +0100 Subject: [PATCH] Berry add unicode encoding to string parsing (#22713) --- CHANGELOG.md | 1 + lib/libesp32/berry/src/be_jsonlib.c | 35 +----------- lib/libesp32/berry/src/be_lexer.c | 82 ++++++++++++++++++++++------- lib/libesp32/berry/src/be_lexer.h | 1 + lib/libesp32/berry/tests/lexer.be | 21 ++++++++ 5 files changed, 87 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fc6955aa..3d2d51224 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. - Berry scroll to Leds_matrix (#22693) - HASPmota support for `tabview` (#22707) - Berry bit-shift operators to `int64` (#22709) +- Berry add unicode encoding to string parsing (#22713) ### Breaking Changed diff --git a/lib/libesp32/berry/src/be_jsonlib.c b/lib/libesp32/berry/src/be_jsonlib.c index 615e8bebc..e6667dc2e 100644 --- a/lib/libesp32/berry/src/be_jsonlib.c +++ b/lib/libesp32/berry/src/be_jsonlib.c @@ -7,6 +7,7 @@ ********************************************************************/ #include "be_object.h" #include "be_mem.h" +#include "be_lexer.h" #include #include @@ -116,38 +117,6 @@ static const char* parser_null(bvm *vm, const char *json) return NULL; } -static char* load_unicode(char *dst, const char *json) -{ - int ucode = 0, i = 4; - while (i--) { - int ch = *json++; - if (ch >= '0' && ch <= '9') { - ucode = (ucode << 4) | (ch - '0'); - } else if (ch >= 'A' && ch <= 'F') { - ucode = (ucode << 4) | (ch - 'A' + 0x0A); - } else if (ch >= 'a' && ch <= 'f') { - ucode = (ucode << 4) | (ch - 'a' + 0x0A); - } else { - return NULL; - } - } - /* convert unicode to utf8 */ - if (ucode < 0x007F) { - /* unicode: 0000 - 007F -> utf8: 0xxxxxxx */ - *dst++ = (char)(ucode & 0x7F); - } else if (ucode < 0x7FF) { - /* unicode: 0080 - 07FF -> utf8: 110xxxxx 
10xxxxxx */ - *dst++ = (char)(((ucode >> 6) & 0x1F) | 0xC0); - *dst++ = (char)((ucode & 0x3F) | 0x80); - } else { - /* unicode: 0800 - FFFF -> utf8: 1110xxxx 10xxxxxx 10xxxxxx */ - *dst++ = (char)(((ucode >> 12) & 0x0F) | 0xE0); - *dst++ = (char)(((ucode >> 6) & 0x03F) | 0x80); - *dst++ = (char)((ucode & 0x3F) | 0x80); - } - return dst; -} - static const char* parser_string(bvm *vm, const char *json) { if (*json == '"') { @@ -169,7 +138,7 @@ static const char* parser_string(bvm *vm, const char *json) case 'r': *dst++ = '\r'; break; case 't': *dst++ = '\t'; break; case 'u': { /* load unicode */ - dst = load_unicode(dst, json); + dst = be_load_unicode(dst, json); if (dst == NULL) { be_free(vm, buf, len); return NULL; diff --git a/lib/libesp32/berry/src/be_lexer.c b/lib/libesp32/berry/src/be_lexer.c index 4e41bf8f6..f0b74bab9 100644 --- a/lib/libesp32/berry/src/be_lexer.c +++ b/lib/libesp32/berry/src/be_lexer.c @@ -203,6 +203,38 @@ static int read_oct(blexer *lexer, const char *src) return c; } +char* be_load_unicode(char *dst, const char *src) +{ + int ucode = 0, i = 4; + while (i--) { + int ch = *src++; + if (ch >= '0' && ch <= '9') { + ucode = (ucode << 4) | (ch - '0'); + } else if (ch >= 'A' && ch <= 'F') { + ucode = (ucode << 4) | (ch - 'A' + 0x0A); + } else if (ch >= 'a' && ch <= 'f') { + ucode = (ucode << 4) | (ch - 'a' + 0x0A); + } else { + return NULL; + } + } + /* convert unicode to utf8 */ + if (ucode <= 0x007F) { + /* unicode: 0000 - 007F -> utf8: 0xxxxxxx */ + *dst++ = (char)(ucode & 0x7F); + } else if (ucode <= 0x7FF) { + /* unicode: 0080 - 07FF -> utf8: 110xxxxx 10xxxxxx */ + *dst++ = (char)(((ucode >> 6) & 0x1F) | 0xC0); + *dst++ = (char)((ucode & 0x3F) | 0x80); + } else { + /* unicode: 0800 - FFFF -> utf8: 1110xxxx 10xxxxxx 10xxxxxx */ + *dst++ = (char)(((ucode >> 12) & 0x0F) | 0xE0); + *dst++ = (char)(((ucode >> 6) & 0x03F) | 0x80); + *dst++ = (char)((ucode & 0x3F) | 0x80); + } + return dst; +} + static void tr_string(blexer *lexer) { char *dst, *src, 
*end; @@ -215,32 +247,42 @@ static void tr_string(blexer *lexer) be_lexerror(lexer, "unfinished string"); break; case '\\': - switch (*src) { - case 'a': c = '\a'; break; - case 'b': c = '\b'; break; - case 'f': c = '\f'; break; - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - case 'v': c = '\v'; break; - case '\\': c = '\\'; break; - case '\'': c = '\''; break; - case '"': c = '"'; break; - case '?': c = '?'; break; - case 'x': c = read_hex(lexer, ++src); ++src; break; - default: - c = read_oct(lexer, src); - if (c != EOS) { - src += 2; + if (*src != 'u') { + switch (*src) { + case 'a': c = '\a'; break; + case 'b': c = '\b'; break; + case 'f': c = '\f'; break; + case 'n': c = '\n'; break; + case 'r': c = '\r'; break; + case 't': c = '\t'; break; + case 'v': c = '\v'; break; + case '\\': c = '\\'; break; + case '\'': c = '\''; break; + case '"': c = '"'; break; + case '?': c = '?'; break; + case 'x': c = read_hex(lexer, ++src); ++src; break; + default: + c = read_oct(lexer, src); + if (c != EOS) { + src += 2; + } + break; + } + ++src; + *dst++ = (char)c; + } else { + /* unicode encoding, ex "\uF054" is equivalent to "\xEF\x81\x94"*/ + dst = be_load_unicode(dst, src + 1); + src += 5; + if (dst == NULL) { + be_lexerror(lexer, "incorrect '\\u' encoding"); } - break; } - ++src; break; default: + *dst++ = (char)c; break; } - *dst++ = (char)c; } lexer->buf.len = dst - lexbuf(lexer); } diff --git a/lib/libesp32/berry/src/be_lexer.h b/lib/libesp32/berry/src/be_lexer.h index e166a1a64..fdbcc1d4f 100644 --- a/lib/libesp32/berry/src/be_lexer.h +++ b/lib/libesp32/berry/src/be_lexer.h @@ -136,5 +136,6 @@ int be_lexer_scan_next(blexer *lexer); bstring* be_lexer_newstr(blexer *lexer, const char *str); const char *be_token2str(bvm *vm, btoken *token); const char* be_tokentype2str(btokentype type); +char* be_load_unicode(char *dst, const char *src); #endif diff --git a/lib/libesp32/berry/tests/lexer.be b/lib/libesp32/berry/tests/lexer.be 
index db2945bc7..8470d779c 100644 --- a/lib/libesp32/berry/tests/lexer.be +++ b/lib/libesp32/berry/tests/lexer.be @@ -36,6 +36,27 @@ check(45.1e2, 4510) check(45.e2, 4500) check(45.e+2, 4500) +# unicode encoding from JSON +assert(bytes().fromstring("a").tohex() == "61") +assert(bytes().fromstring("\uF054").tohex() == "EF8194") +assert(bytes().fromstring("\uF054\uF055").tohex() == "EF8194EF8195") +assert(bytes().fromstring("a\uF054b").tohex() == "61EF819462") +# 1 byte +assert(bytes().fromstring("\u0061").tohex() == "61") +# 2 bytes +assert(bytes().fromstring("\u0088").tohex() == "C288") +assert(bytes().fromstring("\u0288").tohex() == "CA88") +# 3 bytes +assert(bytes().fromstring("\u1288").tohex() == "E18A88") + +assert(bytes().fromstring("\uFFFF").tohex() == "EFBFBF") + +# bad unicode encoding +test_source('"\\u"', "incorrect '\\u' encoding") +test_source('"\\u1"', "incorrect '\\u' encoding") +test_source('"\\u22"', "incorrect '\\u' encoding") +test_source('"\\u333"', "incorrect '\\u' encoding") + # Ensure pathologically long numbers don't crash the lexer (or cause an buffer overflow) assert(000000000000000000000000000000000000E0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 == 0.0);