https://github.com/arendst/Tasmota.git
Berry vulnerability in JSON parsing for unicode (#23603)
parent e9b62811c7 · commit decdfc6b51
@@ -16,6 +16,7 @@ All notable changes to this project will be documented in this file.
 
 ### Fixed
 - LVGL restore `lv_chart.set_range` removed in LVGL 9.3.0 in favor of `lv_chart.set_axis_range` (#23567)
+- Berry vulnerability in JSON parsing for unicode
 
 ### Removed
 
@@ -259,6 +259,7 @@
 #undef BE_USE_SOLIDIFY_MODULE
 #define BE_USE_DEBUG_MODULE 1
 #define BE_USE_SOLIDIFY_MODULE 1
+#define BE_MAPPING_ENABLE_INPUT_VALIDATION 1   // input validation for lv_mapping
 #endif // USE_BERRY_DEBUG
 
 /* Macro: BE_EXPLICIT_XXX
@@ -10,6 +10,7 @@
 #include "be_lexer.h"
 #include <string.h>
 #include <math.h>
+#include <ctype.h>
 
 #if BE_USE_JSON_MODULE
 
@@ -20,6 +21,9 @@
 #define INDENT_WIDTH 2
 #define INDENT_CHAR ' '
 
+/* Security: Maximum JSON string length to prevent memory exhaustion attacks */
+#define MAX_JSON_STRING_LEN (1024 * 1024) /* 1MB limit */
+
 static const char* parser_value(bvm *vm, const char *json);
 static void value_dump(bvm *vm, int *indent, int idx, int fmt);
 
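The 1 MB cap is what bounds memory use against hostile payloads: any computed size above the limit, and the SIZE_MAX error sentinel used by json_strlen_safe further down, are rejected before the allocator is ever asked for a buffer. A minimal standalone sketch of that guard pattern (checked_json_alloc is a hypothetical helper for illustration, not part of the patch, and it uses plain malloc rather than be_malloc):

#include <stdint.h>
#include <stdlib.h>

#define MAX_JSON_STRING_LEN (1024 * 1024)  /* mirrors the 1MB limit above */

/* Hypothetical helper: refuse error-flagged or oversized requests before
 * touching the allocator, so a crafted string cannot exhaust memory. */
static void *checked_json_alloc(size_t byte_len)
{
    if (byte_len == SIZE_MAX || byte_len > MAX_JSON_STRING_LEN) {
        return NULL;                 /* reject instead of attempting a huge allocation */
    }
    return malloc(byte_len + 1);     /* +1 matches the slack byte used later in parser_string */
}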
@@ -62,21 +66,66 @@ static int is_object(bvm *vm, const char *class, int idx)
     return 0;
 }
 
-static int json_strlen(const char *json)
+/* Calculate the actual buffer size needed for JSON string parsing
+ * accounting for Unicode expansion and security limits */
+static size_t json_strlen_safe(const char *json, size_t *actual_len)
 {
     int ch;
     const char *s = json + 1; /* skip '"' */
-    /* get string length "(\\.|[^"])*" */
+    size_t char_count = 0;
+    size_t byte_count = 0;
+
     while ((ch = *s) != '\0' && ch != '"') {
+        char_count++;
+        if (char_count > MAX_JSON_STRING_LEN) {
+            return SIZE_MAX; /* String too long */
+        }
+
         ++s;
         if (ch == '\\') {
             ch = *s++;
             if (ch == '\0') {
-                return -1;
+                return SIZE_MAX; /* Malformed string */
+            }
+
+            switch (ch) {
+                case '"': case '\\': case '/':
+                case 'b': case 'f': case 'n': case 'r': case 't':
+                    byte_count += 1;
+                    break;
+                case 'u':
+                    /* Unicode can expand to 1-3 UTF-8 bytes
+                     * We conservatively assume 3 bytes for safety */
+                    byte_count += 3;
+                    /* Verify we have 4 hex digits following */
+                    for (int i = 0; i < 4; i++) {
+                        if (!s[i] || !isxdigit((unsigned char)s[i])) {
+                            return SIZE_MAX; /* Invalid unicode sequence */
+                        }
+                    }
+                    s += 4; /* Skip the 4 hex digits */
+                    break;
+                default:
+                    return SIZE_MAX; /* Invalid escape sequence */
             }
+        } else if (ch >= 0 && ch <= 0x1f) {
+            return SIZE_MAX; /* Unescaped control character */
+        } else {
+            byte_count += 1;
+        }
+
+        /* Check for potential overflow */
+        if (byte_count > MAX_JSON_STRING_LEN) {
+            return SIZE_MAX;
         }
     }
-    return ch ? cast_int(s - json - 1) : -1;
+
+    if (ch != '"') {
+        return SIZE_MAX; /* Unterminated string */
+    }
+
+    *actual_len = char_count;
+    return byte_count;
 }
 
 static void json2berry(bvm *vm, const char *class)
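The essential change is the sizing rule: every \uXXXX escape is budgeted at 3 output bytes, the UTF-8 worst case for a code point a single escape can express, while simple escapes and ordinary characters count as 1. A standalone sketch (illustration only, not the Berry function, and it skips the hex-digit and control-character validation shown above, so it assumes well-formed input) reproducing the arithmetic the tests further down rely on:

#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Conservative output-size estimate: 3 bytes per \uXXXX escape, 1 byte per
 * simple escape or ordinary character. Mirrors the rule in json_strlen_safe
 * without its validation. */
static size_t worst_case_utf8_bytes(const char *s)
{
    size_t bytes = 0;
    while (*s) {
        if (s[0] == '\\' && s[1] == 'u') {
            bytes += 3;   /* \uXXXX can decode to up to 3 UTF-8 bytes */
            s += 6;       /* backslash, 'u', four hex digits */
        } else if (s[0] == '\\' && s[1] != '\0') {
            bytes += 1;   /* \" \\ \/ \b \f \n \r \t decode to one byte each */
            s += 2;
        } else {
            bytes += 1;
            s += 1;
        }
    }
    return bytes;
}

int main(void)
{
    /* Fifty \u0800 escapes, as built in test_unicode_expansion further down. */
    char payload[50 * 6 + 1] = "";
    for (int i = 0; i < 50; i++) {
        strcat(payload, "\\u0800");
    }
    assert(worst_case_utf8_bytes(payload) == 150);  /* 50 escapes * 3 bytes */
    return 0;
}

The 150-byte result is exactly the decoded size that test_unicode_expansion asserts below.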
@@ -117,55 +166,94 @@ static const char* parser_null(bvm *vm, const char *json)
 
 static const char* parser_string(bvm *vm, const char *json)
 {
-    if (*json == '"') {
-        int len = json_strlen(json++);
-        if (len > -1) {
-            int ch;
-            char *buf, *dst = buf = be_malloc(vm, len);
-            while ((ch = *json) != '\0' && ch != '"') {
-                ++json;
-                if (ch == '\\') {
-                    ch = *json++; /* skip '\' */
-                    switch (ch) {
-                    case '"': *dst++ = '"'; break;
-                    case '\\': *dst++ = '\\'; break;
-                    case '/': *dst++ = '/'; break;
-                    case 'b': *dst++ = '\b'; break;
-                    case 'f': *dst++ = '\f'; break;
-                    case 'n': *dst++ = '\n'; break;
-                    case 'r': *dst++ = '\r'; break;
-                    case 't': *dst++ = '\t'; break;
-                    case 'u': { /* load unicode */
-                        dst = be_load_unicode(dst, json);
-                        if (dst == NULL) {
-                            be_free(vm, buf, len);
-                            return NULL;
-                        }
-                        json += 4;
-                        break;
-                    }
-                    default: be_free(vm, buf, len); return NULL; /* error */
-                    }
-                } else if (ch >= 0 && ch <= 0x1f) {
-                    /* control characters must be escaped
-                       as per https://www.rfc-editor.org/rfc/rfc7159#section-7 */
-                    be_free(vm, buf, len);
-                    return NULL;
-                } else {
-                    *dst++ = (char)ch;
-                }
-            }
-            be_assert(ch == '"');
-            /* require the stack to have some free space for the string,
-               since parsing deeply nested objects might
-               crash the VM due to insufficient stack space. */
-            be_stack_require(vm, 1 + BE_STACK_FREE_MIN);
-            be_pushnstring(vm, buf, cast_int(dst - buf));
-            be_free(vm, buf, len);
-            return json + 1; /* skip '"' */
-        }
-    }
-    return NULL;
+    if (*json != '"') {
+        return NULL;
+    }
+
+    size_t char_len;
+    size_t byte_len = json_strlen_safe(json, &char_len);
+
+    if (byte_len == SIZE_MAX) {
+        return NULL; /* Invalid or too long string */
+    }
+
+    if (byte_len == 0) {
+        /* Empty string */
+        be_stack_require(vm, 1 + BE_STACK_FREE_MIN);
+        be_pushstring(vm, "");
+        return json + 2; /* Skip opening and closing quotes */
+    }
+
+    /* Allocate buffer - size is correctly calculated by json_strlen_safe */
+    char *buf = be_malloc(vm, byte_len + 1);
+    if (!buf) {
+        return NULL; /* Out of memory */
+    }
+
+    char *dst = buf;
+    const char *src = json + 1; /* Skip opening quote */
+    int ch;
+
+    while ((ch = *src) != '\0' && ch != '"') {
+        ++src;
+        if (ch == '\\') {
+            ch = *src++;
+            switch (ch) {
+                case '"':
+                    *dst++ = '"';
+                    break;
+                case '\\':
+                    *dst++ = '\\';
+                    break;
+                case '/':
+                    *dst++ = '/';
+                    break;
+                case 'b':
+                    *dst++ = '\b';
+                    break;
+                case 'f':
+                    *dst++ = '\f';
+                    break;
+                case 'n':
+                    *dst++ = '\n';
+                    break;
+                case 'r':
+                    *dst++ = '\r';
+                    break;
+                case 't':
+                    *dst++ = '\t';
+                    break;
+                case 'u': {
+                    dst = be_load_unicode(dst, src);
+                    if (dst == NULL) {
+                        be_free(vm, buf, byte_len + 1);
+                        return NULL;
+                    }
+                    src += 4;
+                    break;
+                }
+                default:
+                    be_free(vm, buf, byte_len + 1);
+                    return NULL; /* Invalid escape */
+            }
+        } else if (ch >= 0 && ch <= 0x1f) {
+            be_free(vm, buf, byte_len + 1);
+            return NULL; /* Unescaped control character */
+        } else {
+            *dst++ = (char)ch;
+        }
+    }
+
+    if (ch != '"') {
+        be_free(vm, buf, byte_len + 1);
+        return NULL; /* Unterminated string */
+    }
+
+    /* Success - create Berry string */
+    be_stack_require(vm, 1 + BE_STACK_FREE_MIN);
+    be_pushnstring(vm, buf, (size_t)(dst - buf));
+    be_free(vm, buf, byte_len + 1);
+    return src + 1; /* Skip closing quote */
 }
 
 static const char* parser_field(bvm *vm, const char *json)
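parser_string still delegates decoding of the four hex digits to be_load_unicode and only changes how much room dst is guaranteed to have. The diff does not show be_load_unicode itself, so the following is a rough sketch of why one escape can need up to 3 output bytes (standard UTF-8 encoding of a BMP code point, not necessarily how the Berry helper is written):

#include <stddef.h>

/* Encode a code point in the range a single \uXXXX escape can express
 * (U+0000..U+FFFF) as UTF-8. Returns the number of bytes written: 1-3. */
static size_t utf8_encode_bmp(unsigned int cp, char *out)
{
    if (cp < 0x80) {                          /* U+0000..U+007F: 1 byte, e.g. \u0048 'H' */
        out[0] = (char)cp;
        return 1;
    } else if (cp < 0x800) {                  /* U+0080..U+07FF: 2 bytes, e.g. \u00E9 'é' */
        out[0] = (char)(0xC0 | (cp >> 6));
        out[1] = (char)(0x80 | (cp & 0x3F));
        return 2;
    } else {                                  /* U+0800..U+FFFF: 3 bytes, e.g. \u0800 */
        out[0] = (char)(0xE0 | (cp >> 12));
        out[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        out[2] = (char)(0x80 | (cp & 0x3F));
        return 3;
    }
}

So the byte_count += 3 budget in json_strlen_safe is exact for code points at or above U+0800 and over-allocates by at most two bytes otherwise, which is why the tests below can assert exact decoded sizes of 150 and 300 bytes for runs of \u0800.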
@@ -93,3 +93,154 @@ for count : 10..200
   end
   json.dump(arr)
 end
+
+# Security tests for JSON parsing fixes
+
+# Test 1: Unicode expansion buffer overflow protection
+# Each \u0800 sequence (6 chars in JSON) becomes 3 UTF-8 bytes
+# Old code would allocate only 1 byte per sequence, causing buffer overflow
+def test_unicode_expansion()
+  # Test single Unicode sequences of different byte lengths
+  assert_load('"\\u0048"', 'H')   # 1 UTF-8 byte (ASCII)
+  assert_load('"\\u00E9"', 'é')   # 2 UTF-8 bytes (Latin)
+  assert_load('"\\u0800"', 'ࠀ')   # 3 UTF-8 bytes (Samaritan)
+
+  # Test multiple Unicode sequences that would cause buffer overflow in old code
+  var many_unicode = '"'
+  for i: 0..49   # 50 sequences (0-49 inclusive), each \u0800 -> 3 bytes (150 bytes total vs 50 bytes old allocation)
+    many_unicode += '\\u0800'
+  end
+  many_unicode += '"'
+
+  var result = json.load('{"test": ' + many_unicode + '}')
+  assert(result != nil, "Unicode expansion test should succeed")
+  assert(size(result['test']) == 150, "Unicode expansion should produce 150 UTF-8 bytes")   # 50 * 3 bytes
+end
+
+# Test 2: Invalid Unicode sequence rejection
+def test_invalid_unicode()
+  # Invalid hex digits in Unicode sequences should be rejected
+  assert_load_failed('"\\uXXXX"')   # Non-hex characters
+  assert_load_failed('"\\u12XY"')   # Mixed valid/invalid hex
+  assert_load_failed('"\\u"')       # Incomplete sequence
+  assert_load_failed('"\\u123"')    # Too short
+  assert_load_failed('"\\u123G"')   # Invalid hex digit
+end
+
+# Test 3: Control character validation
+def test_control_characters()
+  # Unescaped control characters (0x00-0x1F) should be rejected
+  # Note: We need to create JSON strings with actual unescaped control characters
+  assert_load_failed('{"test": "hello\x0Aworld"}')   # Unescaped newline (0x0A)
+  assert_load_failed('{"test": "hello\x09world"}')   # Unescaped tab (0x09)
+  assert_load_failed('{"test": "hello\x0Dworld"}')   # Unescaped carriage return (0x0D)
+  assert_load_failed('{"test": "hello\x01world"}')   # Unescaped control char (0x01)
+
+  # Properly escaped control characters should work
+  var escaped_newline = json.load('{"test": "hello\\nworld"}')
+  assert(escaped_newline != nil && escaped_newline['test'] == "hello\nworld", "Escaped newline should work")
+
+  var escaped_tab = json.load('{"test": "hello\\tworld"}')
+  assert(escaped_tab != nil && escaped_tab['test'] == "hello\tworld", "Escaped tab should work")
+
+  var escaped_cr = json.load('{"test": "hello\\rworld"}')
+  assert(escaped_cr != nil && escaped_cr['test'] == "hello\rworld", "Escaped carriage return should work")
+end
+
+# Test 4: Invalid escape sequence rejection
+def test_invalid_escapes()
+  # Invalid escape sequences should be rejected
+  assert_load_failed('"\\q"')   # Invalid escape character
+  assert_load_failed('"\\x"')   # Invalid escape character
+  assert_load_failed('"\\z"')   # Invalid escape character
+  assert_load_failed('"\\"')    # Incomplete escape at end
+end
+
+# Test 5: String length limits
+def test_string_length_limits()
+  # Test very long strings (should work up to limit)
+  var long_str = '"'
+  for i: 0..999   # 1000 character string (0-999 inclusive)
+    long_str += 'a'
+  end
+  long_str += '"'
+
+  var result = json.load('{"test": ' + long_str + '}')
+  assert(result != nil, "Long string within limits should work")
+  assert(size(result['test']) == 1000, "Long string should have correct length")
+end
+
+# Test 6: Mixed Unicode and ASCII (realistic scenario)
+def test_mixed_content()
+  # Test realistic mixed content that could trigger the vulnerability
+  var mixed = '{"message": "Hello \\u4E16\\u754C! Welcome to \\u0048\\u0065\\u006C\\u006C\\u006F world."}'
+  var result = json.load(mixed)
+  assert(result != nil, "Mixed Unicode/ASCII should work")
+  assert(result['message'] == "Hello 世界! Welcome to Hello world.", "Mixed content should decode correctly")
+end
+
+# Test 7: Edge cases
+def test_edge_cases()
+  # Empty string
+  var empty_result = json.load('{"empty": ""}')
+  assert(empty_result != nil && empty_result['empty'] == "", "Empty string should work")
+
+  # String with only Unicode
+  var unicode_result = json.load('{"unicode": "\\u0048\\u0065\\u006C\\u006C\\u006F"}')
+  assert(unicode_result != nil && unicode_result['unicode'] == "Hello", "Unicode-only string should work")
+
+  # String with only escapes
+  var escapes_result = json.load('{"escapes": "\\n\\t\\r\\\\\\\""}')
+  assert(escapes_result != nil && escapes_result['escapes'] == "\n\t\r\\\"", "Escape-only string should work")
+
+  # Maximum valid Unicode value
+  var max_unicode_result = json.load('{"max_unicode": "\\uFFFF"}')
+  assert(max_unicode_result != nil, "Maximum Unicode value should work")
+end
+
+# Test 8: Malformed JSON strings
+def test_malformed_strings()
+  # Unterminated strings
+  assert_load_failed('{"test": "unterminated')
+  assert_load_failed('{"test": "unterminated\\')
+
+  # Invalid JSON structure with string issues
+  assert_load_failed('{"test": "valid"x}')
+  assert_load_failed('{"test": "\\uXXXX", "other": "valid"}')
+end
+
+# Test 9: Nested objects with Unicode (stress test)
+def test_nested_unicode_stress()
+  # Create nested structure with Unicode to test memory management
+  var nested = '{"level0": {"unicode": "\\u0800\\u0801\\u0802", "level1": {"unicode": "\\u0800\\u0801\\u0802", "final": "\\u4E16\\u754C"}}}'
+
+  var result = json.load(nested)
+  assert(result != nil, "Nested Unicode structure should parse successfully")
+end
+
+# Test 10: Security regression test
+def test_security_regression()
+  # This specific pattern would cause buffer overflow in the original code
+  # \u0800 sequences: 6 chars in JSON -> 3 bytes in UTF-8 (50% expansion)
+  var attack_pattern = '{"payload": "'
+  for i: 0..99   # 100 sequences (0-99 inclusive) = 600 chars in JSON, 300 bytes needed, but old code allocated only 100 bytes
+    attack_pattern += '\\u0800'
+  end
+  attack_pattern += '"}'
+
+  var result = json.load(attack_pattern)
+  assert(result != nil, "Security regression test should not crash")
+  assert(size(result['payload']) == 300, "Should produce exactly 300 UTF-8 bytes")   # 100 * 3 bytes
+end
+
+# Run all security tests
+test_unicode_expansion()
+test_invalid_unicode()
+test_control_characters()
+test_invalid_escapes()
+test_string_length_limits()
+test_mixed_content()
+test_edge_cases()
+test_malformed_strings()
+test_nested_unicode_stress()
+test_security_regression()