Berry vulnerability in JSON parsing for unicode (#23603)

This commit is contained in:
s-hadinger 2025-06-27 19:38:31 +02:00 committed by GitHub
parent e9b62811c7
commit decdfc6b51
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 289 additions and 48 deletions

View File

@ -16,6 +16,7 @@ All notable changes to this project will be documented in this file.
### Fixed
- LVGL restore `lv_chart.set_range` removed in LVGL 9.3.0 in favor of `lv_chart.set_axis_range` (#23567)
- Berry vulnerability in JSON parsing for unicode (#23603)
### Removed

View File

@ -259,6 +259,7 @@
#undef BE_USE_SOLIDIFY_MODULE
#define BE_USE_DEBUG_MODULE 1
#define BE_USE_SOLIDIFY_MODULE 1
#define BE_MAPPING_ENABLE_INPUT_VALIDATION 1 // input validation for lv_mapping
#endif // USE_BERRY_DEBUG
/* Macro: BE_EXPLICIT_XXX

View File

@ -10,6 +10,7 @@
#include "be_lexer.h"
#include <string.h>
#include <math.h>
#include <ctype.h>
#if BE_USE_JSON_MODULE
@ -20,6 +21,9 @@
#define INDENT_WIDTH 2
#define INDENT_CHAR ' '
/* Security: Maximum JSON string length to prevent memory exhaustion attacks */
#define MAX_JSON_STRING_LEN (1024 * 1024) /* 1MB limit */
static const char* parser_value(bvm *vm, const char *json);
static void value_dump(bvm *vm, int *indent, int idx, int fmt);
@ -62,21 +66,66 @@ static int is_object(bvm *vm, const char *class, int idx)
return 0;
}
/* Security: fallback definition so this unit is self-contained; the real
 * limit is normally defined once near the top of the file. */
#ifndef MAX_JSON_STRING_LEN
#define MAX_JSON_STRING_LEN (1024 * 1024) /* 1MB limit */
#endif

/* Calculate the UTF-8 buffer size needed to decode the JSON string
 * literal starting at `json` (which must point at the opening '"').
 *
 * On success, returns the number of output bytes required (excluding any
 * NUL terminator) and stores the number of logical source characters in
 * *actual_len.  Returns SIZE_MAX on any error: unterminated string,
 * invalid escape, malformed \uXXXX sequence, unescaped control character,
 * or a string exceeding MAX_JSON_STRING_LEN (memory-exhaustion guard).
 *
 * Each \uXXXX escape is conservatively counted as 3 UTF-8 bytes — the
 * maximum expansion for code points up to U+FFFF.  This is the core of
 * the vulnerability fix: the old json_strlen() counted source characters,
 * so a \uXXXX sequence reserved fewer bytes than be_load_unicode() could
 * later write, overflowing the destination buffer. */
static size_t json_strlen_safe(const char *json, size_t *actual_len)
{
    int ch;
    const char *s = json + 1; /* skip the opening '"' */
    size_t char_count = 0;    /* logical characters seen in the source */
    size_t byte_count = 0;    /* UTF-8 bytes required in the output */
    while ((ch = *s) != '\0' && ch != '"') {
        char_count++;
        if (char_count > MAX_JSON_STRING_LEN) {
            return SIZE_MAX; /* string too long */
        }
        ++s;
        if (ch == '\\') {
            ch = *s++;
            if (ch == '\0') {
                return SIZE_MAX; /* lone '\' at end of input: malformed */
            }
            switch (ch) {
            case '"': case '\\': case '/':
            case 'b': case 'f': case 'n': case 'r': case 't':
                byte_count += 1; /* simple escapes decode to one byte */
                break;
            case 'u':
                /* Unicode can expand to 1-3 UTF-8 bytes;
                 * conservatively reserve 3 bytes for safety */
                byte_count += 3;
                /* verify exactly 4 hex digits follow */
                for (int i = 0; i < 4; i++) {
                    if (!s[i] || !isxdigit((unsigned char)s[i])) {
                        return SIZE_MAX; /* invalid unicode sequence */
                    }
                }
                s += 4; /* skip the 4 hex digits */
                break;
            default:
                return SIZE_MAX; /* invalid escape sequence */
            }
        } else if (ch >= 0 && ch <= 0x1f) {
            /* control characters must be escaped (RFC 7159 section 7) */
            return SIZE_MAX;
        } else {
            byte_count += 1;
        }
        /* re-check after every character to stop early on huge inputs */
        if (byte_count > MAX_JSON_STRING_LEN) {
            return SIZE_MAX;
        }
    }
    if (ch != '"') {
        return SIZE_MAX; /* unterminated string */
    }
    *actual_len = char_count;
    return byte_count;
}
static void json2berry(bvm *vm, const char *class)
@ -117,55 +166,94 @@ static const char* parser_null(bvm *vm, const char *json)
/* Parse a JSON string literal starting at `json` (pointing at the opening
 * '"') and push the decoded Berry string onto the VM stack.
 *
 * Returns a pointer to the character following the closing quote, or NULL
 * on any parse error (in which case nothing is pushed and the temporary
 * buffer is freed — no leak on any path).
 *
 * The destination buffer is sized up-front by json_strlen_safe(), which
 * accounts for \uXXXX escapes expanding to up to 3 UTF-8 bytes; this is
 * the fix for the Unicode buffer-overflow vulnerability. */
static const char* parser_string(bvm *vm, const char *json)
{
    if (*json != '"') {
        return NULL;
    }
    size_t char_len;
    size_t byte_len = json_strlen_safe(json, &char_len);
    if (byte_len == SIZE_MAX) {
        return NULL; /* invalid, unterminated or too long string */
    }
    if (byte_len == 0) {
        /* empty string: avoid a zero-sized allocation */
        be_stack_require(vm, 1 + BE_STACK_FREE_MIN);
        be_pushstring(vm, "");
        return json + 2; /* skip opening and closing quotes */
    }
    /* buffer size was validated by json_strlen_safe(); +1 leaves a NUL slot */
    char *buf = be_malloc(vm, byte_len + 1);
    if (!buf) {
        return NULL; /* out of memory */
    }
    char *dst = buf;
    const char *src = json + 1; /* skip opening quote */
    int ch;
    while ((ch = *src) != '\0' && ch != '"') {
        ++src;
        if (ch == '\\') {
            ch = *src++; /* skip '\' */
            switch (ch) {
            case '"':  *dst++ = '"';  break;
            case '\\': *dst++ = '\\'; break;
            case '/':  *dst++ = '/';  break;
            case 'b':  *dst++ = '\b'; break;
            case 'f':  *dst++ = '\f'; break;
            case 'n':  *dst++ = '\n'; break;
            case 'r':  *dst++ = '\r'; break;
            case 't':  *dst++ = '\t'; break;
            case 'u': { /* decode \uXXXX into 1-3 UTF-8 bytes */
                dst = be_load_unicode(dst, src);
                if (dst == NULL) {
                    be_free(vm, buf, byte_len + 1);
                    return NULL;
                }
                src += 4; /* skip the 4 hex digits */
                break;
            }
            default:
                be_free(vm, buf, byte_len + 1);
                return NULL; /* invalid escape */
            }
        } else if (ch >= 0 && ch <= 0x1f) {
            /* control characters must be escaped
               as per https://www.rfc-editor.org/rfc/rfc7159#section-7 */
            be_free(vm, buf, byte_len + 1);
            return NULL;
        } else {
            *dst++ = (char)ch;
        }
    }
    if (ch != '"') {
        be_free(vm, buf, byte_len + 1);
        return NULL; /* unterminated string */
    }
    /* require the stack to have some free space for the string, since
       parsing deeply nested objects might otherwise crash the VM */
    be_stack_require(vm, 1 + BE_STACK_FREE_MIN);
    be_pushnstring(vm, buf, (size_t)(dst - buf));
    be_free(vm, buf, byte_len + 1);
    return src + 1; /* skip closing quote */
}
static const char* parser_field(bvm *vm, const char *json)

View File

@ -93,3 +93,154 @@ for count : 10..200
end
json.dump(arr)
end
# Security tests for JSON parsing fixes
# Test 1: Unicode expansion buffer overflow protection
# Each \u0800 sequence (6 chars in JSON) becomes 3 UTF-8 bytes
# Old code would allocate only 1 byte per sequence, causing buffer overflow
def test_unicode_expansion()
# Verify that \uXXXX escapes of every UTF-8 width (1, 2 and 3 bytes)
# decode correctly, and that many 3-byte sequences no longer overflow
# the output buffer (the pre-fix allocation was 1 byte per sequence).
# Test single Unicode sequences of different byte lengths
assert_load('"\\u0048"', 'H') # 1 UTF-8 byte (ASCII)
assert_load('"\\u00E9"', 'é') # 2 UTF-8 bytes (Latin)
assert_load('"\\u0800"', 'ࠀ') # 3 UTF-8 bytes (Samaritan)
# Test multiple Unicode sequences that would cause buffer overflow in old code
var many_unicode = '"'
for i: 0..49 # 50 sequences (0-49 inclusive), each \u0800 -> 3 bytes (150 bytes total vs 50 bytes old allocation)
many_unicode += '\\u0800'
end
many_unicode += '"'
var result = json.load('{"test": ' + many_unicode + '}')
assert(result != nil, "Unicode expansion test should succeed")
assert(size(result['test']) == 150, "Unicode expansion should produce 150 UTF-8 bytes") # 50 * 3 bytes
end
# Test 2: Invalid Unicode sequence rejection
def test_invalid_unicode()
# Every malformed \uXXXX escape must be rejected by the parser:
# non-hex characters, mixed valid/invalid hex, incomplete sequence,
# too-short sequence, and an invalid final hex digit.
var bad_sequences = ['"\\uXXXX"', '"\\u12XY"', '"\\u"', '"\\u123"', '"\\u123G"']
for seq: bad_sequences
assert_load_failed(seq)
end
end
# Test 3: Control character validation
def test_control_characters()
# RFC 7159 section 7: control characters (0x00-0x1F) inside a JSON
# string must be escaped; raw occurrences must be rejected, while the
# equivalent \n / \t / \r escapes must decode normally.
# Unescaped control characters (0x00-0x1F) should be rejected
# Note: We need to create JSON strings with actual unescaped control characters
assert_load_failed('{"test": "hello\x0Aworld"}') # Unescaped newline (0x0A)
assert_load_failed('{"test": "hello\x09world"}') # Unescaped tab (0x09)
assert_load_failed('{"test": "hello\x0Dworld"}') # Unescaped carriage return (0x0D)
assert_load_failed('{"test": "hello\x01world"}') # Unescaped control char (0x01)
# Properly escaped control characters should work
var escaped_newline = json.load('{"test": "hello\\nworld"}')
assert(escaped_newline != nil && escaped_newline['test'] == "hello\nworld", "Escaped newline should work")
var escaped_tab = json.load('{"test": "hello\\tworld"}')
assert(escaped_tab != nil && escaped_tab['test'] == "hello\tworld", "Escaped tab should work")
var escaped_cr = json.load('{"test": "hello\\rworld"}')
assert(escaped_cr != nil && escaped_cr['test'] == "hello\rworld", "Escaped carriage return should work")
end
# Test 4: Invalid escape sequence rejection
def test_invalid_escapes()
# Escape characters outside the JSON grammar must fail to parse,
# including a bare backslash truncated at the end of the input.
var bad_escapes = ['"\\q"', '"\\x"', '"\\z"', '"\\"']
for esc: bad_escapes
assert_load_failed(esc)
end
end
# Test 5: String length limits
def test_string_length_limits()
# A 1000-character string is far below the 1MB security cap and must
# parse with its content intact.
var payload = ''
var count = 0
while count < 1000
payload += 'a'
count += 1
end
var parsed = json.load('{"test": "' + payload + '"}')
assert(parsed != nil, "Long string within limits should work")
assert(size(parsed['test']) == 1000, "Long string should have correct length")
end
# Test 6: Mixed Unicode and ASCII (realistic scenario)
def test_mixed_content()
# Realistic payload mixing plain ASCII with 3-byte (\u4E16\u754C -> 世界)
# and 1-byte (\u0048..\u006F -> "Hello") Unicode escapes; this shape
# could trigger the old undersized-buffer overflow.
# Test realistic mixed content that could trigger the vulnerability
var mixed = '{"message": "Hello \\u4E16\\u754C! Welcome to \\u0048\\u0065\\u006C\\u006C\\u006F world."}'
var result = json.load(mixed)
assert(result != nil, "Mixed Unicode/ASCII should work")
assert(result['message'] == "Hello 世界! Welcome to Hello world.", "Mixed content should decode correctly")
end
# Test 7: Edge cases
def test_edge_cases()
# Boundary conditions: empty string, unicode-only string, escape-only
# string, and the maximum 4-hex-digit code point U+FFFF.
# Empty string
var empty_result = json.load('{"empty": ""}')
assert(empty_result != nil && empty_result['empty'] == "", "Empty string should work")
# String with only Unicode
var unicode_result = json.load('{"unicode": "\\u0048\\u0065\\u006C\\u006C\\u006F"}')
assert(unicode_result != nil && unicode_result['unicode'] == "Hello", "Unicode-only string should work")
# String with only escapes
var escapes_result = json.load('{"escapes": "\\n\\t\\r\\\\\\\""}')
assert(escapes_result != nil && escapes_result['escapes'] == "\n\t\r\\\"", "Escape-only string should work")
# Maximum valid Unicode value
var max_unicode_result = json.load('{"max_unicode": "\\uFFFF"}')
assert(max_unicode_result != nil, "Maximum Unicode value should work")
end
# Test 8: Malformed JSON strings
def test_malformed_strings()
# Structurally broken strings must be rejected without crashing:
# missing closing quote, trailing backslash, trailing junk after a
# valid string, and a bad escape followed by otherwise-valid members.
# Unterminated strings
assert_load_failed('{"test": "unterminated')
assert_load_failed('{"test": "unterminated\\')
# Invalid JSON structure with string issues
assert_load_failed('{"test": "valid"x}')
assert_load_failed('{"test": "\\uXXXX", "other": "valid"}')
end
# Test 9: Nested objects with Unicode (stress test)
def test_nested_unicode_stress()
# Exercise buffer allocation/free across multiple nested objects, each
# containing multi-byte Unicode escapes, to shake out memory bugs.
# Create nested structure with Unicode to test memory management
var nested = '{"level0": {"unicode": "\\u0800\\u0801\\u0802", "level1": {"unicode": "\\u0800\\u0801\\u0802", "final": "\\u4E16\\u754C"}}}'
var result = json.load(nested)
assert(result != nil, "Nested Unicode structure should parse successfully")
end
# Test 10: Security regression test
def test_security_regression()
# Direct regression test for the reported overflow: 100 \u0800 escapes
# need 300 output bytes, but the pre-fix allocator reserved only 100.
# This specific pattern would cause buffer overflow in the original code
# \u0800 sequences: 6 chars in JSON -> 3 bytes in UTF-8 (50% expansion)
var attack_pattern = '{"payload": "'
for i: 0..99 # 100 sequences (0-99 inclusive) = 600 chars in JSON, 300 bytes needed, but old code allocated only 100 bytes
attack_pattern += '\\u0800'
end
attack_pattern += '"}'
var result = json.load(attack_pattern)
assert(result != nil, "Security regression test should not crash")
assert(size(result['payload']) == 300, "Should produce exactly 300 UTF-8 bytes") # 100 * 3 bytes
end
# Run all security tests
# Each test raises via assert on failure, so reaching the end of this
# list means the whole security suite passed.
test_unicode_expansion()
test_invalid_unicode()
test_control_characters()
test_invalid_escapes()
test_string_length_limits()
test_mixed_content()
test_edge_cases()
test_malformed_strings()
test_nested_unicode_stress()
test_security_regression()