mirror of
https://github.com/arendst/Tasmota.git
synced 2025-07-15 14:56:30 +00:00
Berry add unicode encoding to string parsing (#22713)
This commit is contained in:
parent
94f3744235
commit
02ae53cec5
@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file.
|
|||||||
- Berry scroll to Leds_matrix (#22693)
|
- Berry scroll to Leds_matrix (#22693)
|
||||||
- HASPmota support for `tabview` (#22707)
|
- HASPmota support for `tabview` (#22707)
|
||||||
- Berry bit-shift operators to `int64` (#22709)
|
- Berry bit-shift operators to `int64` (#22709)
|
||||||
|
- Berry add unicode encoding to string parsing
|
||||||
|
|
||||||
### Breaking Changed
|
### Breaking Changed
|
||||||
|
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
********************************************************************/
|
********************************************************************/
|
||||||
#include "be_object.h"
|
#include "be_object.h"
|
||||||
#include "be_mem.h"
|
#include "be_mem.h"
|
||||||
|
#include "be_lexer.h"
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
@ -116,38 +117,6 @@ static const char* parser_null(bvm *vm, const char *json)
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Decode the four hexadecimal digits of a JSON "\uXXXX" escape and append
 * the UTF-8 encoding of that code point to `dst`.
 *
 * dst   output cursor; the caller must guarantee room for up to 3 bytes.
 * json  points at the first of the four hex digits (just after the 'u').
 *
 * Returns the advanced output cursor (one past the last byte written), or
 * NULL if any of the four characters is not a hex digit. Because a NUL
 * terminator is not a hex digit, a truncated escape is rejected without
 * reading past the end of a NUL-terminated input.
 *
 * Only code points U+0000..U+FFFF can be expressed; surrogate halves are
 * encoded individually as three-byte sequences (no pairing is attempted). */
static char* load_unicode(char *dst, const char *json)
{
    int ucode = 0, i = 4;
    while (i--) {
        int ch = *json++;
        if (ch >= '0' && ch <= '9') {
            ucode = (ucode << 4) | (ch - '0');
        } else if (ch >= 'A' && ch <= 'F') {
            ucode = (ucode << 4) | (ch - 'A' + 0x0A);
        } else if (ch >= 'a' && ch <= 'f') {
            ucode = (ucode << 4) | (ch - 'a' + 0x0A);
        } else {
            return NULL;
        }
    }
    /* convert unicode to utf8; the range tests are inclusive so that
     * U+007F and U+07FF use the shortest form (overlong forms are
     * invalid UTF-8) */
    if (ucode <= 0x007F) {
        /* unicode: 0000 - 007F -> utf8: 0xxxxxxx */
        *dst++ = (char)(ucode & 0x7F);
    } else if (ucode <= 0x07FF) {
        /* unicode: 0080 - 07FF -> utf8: 110xxxxx 10xxxxxx */
        *dst++ = (char)(((ucode >> 6) & 0x1F) | 0xC0);
        *dst++ = (char)((ucode & 0x3F) | 0x80);
    } else {
        /* unicode: 0800 - FFFF -> utf8: 1110xxxx 10xxxxxx 10xxxxxx */
        *dst++ = (char)(((ucode >> 12) & 0x0F) | 0xE0);
        *dst++ = (char)(((ucode >> 6) & 0x03F) | 0x80);
        *dst++ = (char)((ucode & 0x3F) | 0x80);
    }
    return dst;
}
|
|
||||||
|
|
||||||
static const char* parser_string(bvm *vm, const char *json)
|
static const char* parser_string(bvm *vm, const char *json)
|
||||||
{
|
{
|
||||||
if (*json == '"') {
|
if (*json == '"') {
|
||||||
@ -169,7 +138,7 @@ static const char* parser_string(bvm *vm, const char *json)
|
|||||||
case 'r': *dst++ = '\r'; break;
|
case 'r': *dst++ = '\r'; break;
|
||||||
case 't': *dst++ = '\t'; break;
|
case 't': *dst++ = '\t'; break;
|
||||||
case 'u': { /* load unicode */
|
case 'u': { /* load unicode */
|
||||||
dst = load_unicode(dst, json);
|
dst = be_load_unicode(dst, json);
|
||||||
if (dst == NULL) {
|
if (dst == NULL) {
|
||||||
be_free(vm, buf, len);
|
be_free(vm, buf, len);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -203,6 +203,38 @@ static int read_oct(blexer *lexer, const char *src)
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Decode the four hexadecimal digits of a "\uXXXX" escape and append the
 * UTF-8 encoding of that code point to `dst`.
 *
 * dst  output cursor; the caller must guarantee room for up to 3 bytes.
 * src  points at the first of the four hex digits (just after the 'u').
 *
 * Returns the advanced output cursor (one past the last byte written), or
 * NULL if any of the four characters is not a hex digit. Because a NUL
 * terminator is not a hex digit, a truncated escape is rejected without
 * reading past the end of a NUL-terminated input.
 *
 * Only code points U+0000..U+FFFF can be expressed; surrogate halves are
 * encoded individually as three-byte sequences (no pairing is attempted). */
char* be_load_unicode(char *dst, const char *src)
{
    int ucode = 0, i = 4;
    while (i--) {
        int ch = *src++;
        if (ch >= '0' && ch <= '9') {
            ucode = (ucode << 4) | (ch - '0');
        } else if (ch >= 'A' && ch <= 'F') {
            ucode = (ucode << 4) | (ch - 'A' + 0x0A);
        } else if (ch >= 'a' && ch <= 'f') {
            ucode = (ucode << 4) | (ch - 'a' + 0x0A);
        } else {
            return NULL;
        }
    }
    /* convert unicode to utf8; the range tests are inclusive so that
     * U+007F and U+07FF use the shortest form (overlong forms are
     * invalid UTF-8) */
    if (ucode <= 0x007F) {
        /* unicode: 0000 - 007F -> utf8: 0xxxxxxx */
        *dst++ = (char)(ucode & 0x7F);
    } else if (ucode <= 0x07FF) {
        /* unicode: 0080 - 07FF -> utf8: 110xxxxx 10xxxxxx */
        *dst++ = (char)(((ucode >> 6) & 0x1F) | 0xC0);
        *dst++ = (char)((ucode & 0x3F) | 0x80);
    } else {
        /* unicode: 0800 - FFFF -> utf8: 1110xxxx 10xxxxxx 10xxxxxx */
        *dst++ = (char)(((ucode >> 12) & 0x0F) | 0xE0);
        *dst++ = (char)(((ucode >> 6) & 0x03F) | 0x80);
        *dst++ = (char)((ucode & 0x3F) | 0x80);
    }
    return dst;
}
|
||||||
|
|
||||||
static void tr_string(blexer *lexer)
|
static void tr_string(blexer *lexer)
|
||||||
{
|
{
|
||||||
char *dst, *src, *end;
|
char *dst, *src, *end;
|
||||||
@ -215,32 +247,42 @@ static void tr_string(blexer *lexer)
|
|||||||
be_lexerror(lexer, "unfinished string");
|
be_lexerror(lexer, "unfinished string");
|
||||||
break;
|
break;
|
||||||
case '\\':
|
case '\\':
|
||||||
switch (*src) {
|
if (*src != 'u') {
|
||||||
case 'a': c = '\a'; break;
|
switch (*src) {
|
||||||
case 'b': c = '\b'; break;
|
case 'a': c = '\a'; break;
|
||||||
case 'f': c = '\f'; break;
|
case 'b': c = '\b'; break;
|
||||||
case 'n': c = '\n'; break;
|
case 'f': c = '\f'; break;
|
||||||
case 'r': c = '\r'; break;
|
case 'n': c = '\n'; break;
|
||||||
case 't': c = '\t'; break;
|
case 'r': c = '\r'; break;
|
||||||
case 'v': c = '\v'; break;
|
case 't': c = '\t'; break;
|
||||||
case '\\': c = '\\'; break;
|
case 'v': c = '\v'; break;
|
||||||
case '\'': c = '\''; break;
|
case '\\': c = '\\'; break;
|
||||||
case '"': c = '"'; break;
|
case '\'': c = '\''; break;
|
||||||
case '?': c = '?'; break;
|
case '"': c = '"'; break;
|
||||||
case 'x': c = read_hex(lexer, ++src); ++src; break;
|
case '?': c = '?'; break;
|
||||||
default:
|
case 'x': c = read_hex(lexer, ++src); ++src; break;
|
||||||
c = read_oct(lexer, src);
|
default:
|
||||||
if (c != EOS) {
|
c = read_oct(lexer, src);
|
||||||
src += 2;
|
if (c != EOS) {
|
||||||
|
src += 2;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
++src;
|
||||||
|
*dst++ = (char)c;
|
||||||
|
} else {
|
||||||
|
/* unicode encoding, ex "\uF054" is equivalent to "\xEF\x81\x94"*/
|
||||||
|
dst = be_load_unicode(dst, src + 1);
|
||||||
|
src += 5;
|
||||||
|
if (dst == NULL) {
|
||||||
|
be_lexerror(lexer, "incorrect '\\u' encoding");
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
++src;
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
*dst++ = (char)c;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
*dst++ = (char)c;
|
|
||||||
}
|
}
|
||||||
lexer->buf.len = dst - lexbuf(lexer);
|
lexer->buf.len = dst - lexbuf(lexer);
|
||||||
}
|
}
|
||||||
|
@ -136,5 +136,6 @@ int be_lexer_scan_next(blexer *lexer);
|
|||||||
bstring* be_lexer_newstr(blexer *lexer, const char *str);
|
bstring* be_lexer_newstr(blexer *lexer, const char *str);
|
||||||
const char *be_token2str(bvm *vm, btoken *token);
|
const char *be_token2str(bvm *vm, btoken *token);
|
||||||
const char* be_tokentype2str(btokentype type);
|
const char* be_tokentype2str(btokentype type);
|
||||||
|
/* Decode the four hex digits at `src` (the payload of a "\uXXXX" escape)
 * and write the resulting code point to `dst` as 1 to 3 UTF-8 bytes.
 * Returns the position in `dst` just past the bytes written, or NULL if
 * any of the four characters is not a hexadecimal digit. */
char* be_load_unicode(char *dst, const char *src);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -36,6 +36,27 @@ check(45.1e2, 4510)
|
|||||||
check(45.e2, 4500)
|
check(45.e2, 4500)
|
||||||
check(45.e+2, 4500)
|
check(45.e+2, 4500)
|
||||||
|
|
||||||
|
# unicode encoding from JSON
|
||||||
|
assert(bytes().fromstring("a").tohex() == "61")
|
||||||
|
assert(bytes().fromstring("\uF054").tohex() == "EF8194")
|
||||||
|
assert(bytes().fromstring("\uF054\uF055").tohex() == "EF8194EF8195")
|
||||||
|
assert(bytes().fromstring("a\uF054b").tohex() == "61EF819462")
|
||||||
|
# 1 byte
|
||||||
|
assert(bytes().fromstring("\u0061").tohex() == "61")
|
||||||
|
# 2 bytes
|
||||||
|
assert(bytes().fromstring("\u0088").tohex() == "C288")
|
||||||
|
assert(bytes().fromstring("\u0288").tohex() == "CA88")
|
||||||
|
# 3 bytes
|
||||||
|
assert(bytes().fromstring("\u1288").tohex() == "E18A88")
|
||||||
|
|
||||||
|
assert(bytes().fromstring("\uFFFF").tohex() == "EFBFBF")
|
||||||
|
|
||||||
|
# bad unicode encoding
|
||||||
|
test_source('"\\u"', "incorrect '\\u' encoding")
|
||||||
|
test_source('"\\u1"', "incorrect '\\u' encoding")
|
||||||
|
test_source('"\\u22"', "incorrect '\\u' encoding")
|
||||||
|
test_source('"\\u333"', "incorrect '\\u' encoding")
|
||||||
|
|
||||||
# Ensure pathologically long numbers don't crash the lexer (or cause a buffer overflow)
|
# Ensure pathologically long numbers don't crash the lexer (or cause a buffer overflow)
|
||||||
assert(000000000000000000000000000000000000E0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 == 0.0);
|
assert(000000000000000000000000000000000000E0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 == 0.0);
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user