Berry vulnerability in JSON parsing for unicode (#23603)

This commit is contained in:
s-hadinger 2025-06-27 19:38:31 +02:00 committed by GitHub
parent e9b62811c7
commit decdfc6b51
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 289 additions and 48 deletions

View File

@ -16,6 +16,7 @@ All notable changes to this project will be documented in this file.
### Fixed
- LVGL restore `lv_chart.set_range` removed in LVGL 9.3.0 in favor of `lv_chart.set_axis_range` (#23567)
- Berry vulnerability in JSON parsing for unicode (#23603)
### Removed

View File

@ -259,6 +259,7 @@
#undef BE_USE_SOLIDIFY_MODULE
#define BE_USE_DEBUG_MODULE 1
#define BE_USE_SOLIDIFY_MODULE 1
#define BE_MAPPING_ENABLE_INPUT_VALIDATION 1 // input validation for lv_mapping
#endif // USE_BERRY_DEBUG
/* Macro: BE_EXPLICIT_XXX

View File

@ -10,6 +10,7 @@
#include "be_lexer.h"
#include <string.h>
#include <math.h>
#include <ctype.h>
#if BE_USE_JSON_MODULE
@ -20,6 +21,9 @@
#define INDENT_WIDTH 2
#define INDENT_CHAR ' '
/* Security: Maximum JSON string length to prevent memory exhaustion attacks */
#define MAX_JSON_STRING_LEN (1024 * 1024) /* 1MB limit */
static const char* parser_value(bvm *vm, const char *json);
static void value_dump(bvm *vm, int *indent, int idx, int fmt);
@ -62,21 +66,66 @@ static int is_object(bvm *vm, const char *class, int idx)
return 0;
}
/* Security: fallback definition so this unit is self-contained; the real
 * limit is normally defined once near the top of the file. */
#ifndef MAX_JSON_STRING_LEN
#define MAX_JSON_STRING_LEN (1024 * 1024) /* 1MB limit */
#endif

/* Calculate the UTF-8 buffer size needed to decode the JSON string
 * literal starting at `json` (which must point at the opening '"').
 *
 * On success, returns the number of output bytes required (excluding any
 * NUL terminator) and stores the number of logical source characters in
 * *actual_len.  Returns SIZE_MAX on any error: unterminated string,
 * invalid escape, malformed \uXXXX sequence, unescaped control character,
 * or a string exceeding MAX_JSON_STRING_LEN (memory-exhaustion guard).
 *
 * Each \uXXXX escape is conservatively counted as 3 UTF-8 bytes — the
 * maximum expansion for code points up to U+FFFF.  This is the core of
 * the vulnerability fix: the old json_strlen() counted source characters,
 * so a \uXXXX sequence reserved fewer bytes than be_load_unicode() could
 * later write, overflowing the destination buffer. */
static size_t json_strlen_safe(const char *json, size_t *actual_len)
{
    int ch;
    const char *s = json + 1; /* skip the opening '"' */
    size_t char_count = 0;    /* logical characters seen in the source */
    size_t byte_count = 0;    /* UTF-8 bytes required in the output */
    while ((ch = *s) != '\0' && ch != '"') {
        char_count++;
        if (char_count > MAX_JSON_STRING_LEN) {
            return SIZE_MAX; /* string too long */
        }
        ++s;
        if (ch == '\\') {
            ch = *s++;
            if (ch == '\0') {
                return SIZE_MAX; /* lone '\' at end of input: malformed */
            }
            switch (ch) {
            case '"': case '\\': case '/':
            case 'b': case 'f': case 'n': case 'r': case 't':
                byte_count += 1; /* simple escapes decode to one byte */
                break;
            case 'u':
                /* Unicode can expand to 1-3 UTF-8 bytes;
                 * conservatively reserve 3 bytes for safety */
                byte_count += 3;
                /* verify exactly 4 hex digits follow */
                for (int i = 0; i < 4; i++) {
                    if (!s[i] || !isxdigit((unsigned char)s[i])) {
                        return SIZE_MAX; /* invalid unicode sequence */
                    }
                }
                s += 4; /* skip the 4 hex digits */
                break;
            default:
                return SIZE_MAX; /* invalid escape sequence */
            }
        } else if (ch >= 0 && ch <= 0x1f) {
            /* control characters must be escaped (RFC 7159 section 7) */
            return SIZE_MAX;
        } else {
            byte_count += 1;
        }
        /* re-check after every character to stop early on huge inputs */
        if (byte_count > MAX_JSON_STRING_LEN) {
            return SIZE_MAX;
        }
    }
    if (ch != '"') {
        return SIZE_MAX; /* unterminated string */
    }
    *actual_len = char_count;
    return byte_count;
}
static void json2berry(bvm *vm, const char *class)
@ -117,55 +166,94 @@ static const char* parser_null(bvm *vm, const char *json)
/* Parse a JSON string literal starting at `json` (pointing at the opening
 * '"') and push the decoded Berry string onto the VM stack.
 *
 * Returns a pointer to the character following the closing quote, or NULL
 * on any parse error (in which case nothing is pushed and the temporary
 * buffer is freed — no leak on any path).
 *
 * The destination buffer is sized up-front by json_strlen_safe(), which
 * accounts for \uXXXX escapes expanding to up to 3 UTF-8 bytes; this is
 * the fix for the Unicode buffer-overflow vulnerability. */
static const char* parser_string(bvm *vm, const char *json)
{
    if (*json != '"') {
        return NULL;
    }
    size_t char_len;
    size_t byte_len = json_strlen_safe(json, &char_len);
    if (byte_len == SIZE_MAX) {
        return NULL; /* invalid, unterminated or too long string */
    }
    if (byte_len == 0) {
        /* empty string: avoid a zero-sized allocation */
        be_stack_require(vm, 1 + BE_STACK_FREE_MIN);
        be_pushstring(vm, "");
        return json + 2; /* skip opening and closing quotes */
    }
    /* buffer size was validated by json_strlen_safe(); +1 leaves a NUL slot */
    char *buf = be_malloc(vm, byte_len + 1);
    if (!buf) {
        return NULL; /* out of memory */
    }
    char *dst = buf;
    const char *src = json + 1; /* skip opening quote */
    int ch;
    while ((ch = *src) != '\0' && ch != '"') {
        ++src;
        if (ch == '\\') {
            ch = *src++; /* skip '\' */
            switch (ch) {
            case '"':  *dst++ = '"';  break;
            case '\\': *dst++ = '\\'; break;
            case '/':  *dst++ = '/';  break;
            case 'b':  *dst++ = '\b'; break;
            case 'f':  *dst++ = '\f'; break;
            case 'n':  *dst++ = '\n'; break;
            case 'r':  *dst++ = '\r'; break;
            case 't':  *dst++ = '\t'; break;
            case 'u': { /* decode \uXXXX into 1-3 UTF-8 bytes */
                dst = be_load_unicode(dst, src);
                if (dst == NULL) {
                    be_free(vm, buf, byte_len + 1);
                    return NULL;
                }
                src += 4; /* skip the 4 hex digits */
                break;
            }
            default:
                be_free(vm, buf, byte_len + 1);
                return NULL; /* invalid escape */
            }
        } else if (ch >= 0 && ch <= 0x1f) {
            /* control characters must be escaped
               as per https://www.rfc-editor.org/rfc/rfc7159#section-7 */
            be_free(vm, buf, byte_len + 1);
            return NULL;
        } else {
            *dst++ = (char)ch;
        }
    }
    if (ch != '"') {
        be_free(vm, buf, byte_len + 1);
        return NULL; /* unterminated string */
    }
    /* require the stack to have some free space for the string, since
       parsing deeply nested objects might otherwise crash the VM */
    be_stack_require(vm, 1 + BE_STACK_FREE_MIN);
    be_pushnstring(vm, buf, (size_t)(dst - buf));
    be_free(vm, buf, byte_len + 1);
    return src + 1; /* skip closing quote */
}
static const char* parser_field(bvm *vm, const char *json)

View File

@ -93,3 +93,154 @@ for count : 10..200
end
json.dump(arr)
end
# Security tests for JSON parsing fixes
# Test 1: Unicode expansion buffer overflow protection
# Each \u0800 sequence (6 chars in JSON) becomes 3 UTF-8 bytes
# Old code would allocate only 1 byte per sequence, causing buffer overflow
def test_unicode_expansion()
# Verify that \uXXXX escapes of every UTF-8 width (1, 2 and 3 bytes)
# decode correctly, and that many 3-byte sequences no longer overflow
# the output buffer (the pre-fix allocation was 1 byte per sequence).
# Test single Unicode sequences of different byte lengths
assert_load('"\\u0048"', 'H') # 1 UTF-8 byte (ASCII)
assert_load('"\\u00E9"', 'é') # 2 UTF-8 bytes (Latin)
assert_load('"\\u0800"', 'ࠀ') # 3 UTF-8 bytes (Samaritan)
# Test multiple Unicode sequences that would cause buffer overflow in old code
var many_unicode = '"'
for i: 0..49 # 50 sequences (0-49 inclusive), each \u0800 -> 3 bytes (150 bytes total vs 50 bytes old allocation)
many_unicode += '\\u0800'
end
many_unicode += '"'
var result = json.load('{"test": ' + many_unicode + '}')
assert(result != nil, "Unicode expansion test should succeed")
assert(size(result['test']) == 150, "Unicode expansion should produce 150 UTF-8 bytes") # 50 * 3 bytes
end
# Test 2: Invalid Unicode sequence rejection
def test_invalid_unicode()
# Every malformed \uXXXX escape must be rejected by the parser:
# non-hex characters, mixed valid/invalid hex, incomplete sequence,
# too-short sequence, and an invalid final hex digit.
var bad_sequences = ['"\\uXXXX"', '"\\u12XY"', '"\\u"', '"\\u123"', '"\\u123G"']
for seq: bad_sequences
assert_load_failed(seq)
end
end
# Test 3: Control character validation
def test_control_characters()
# RFC 7159 section 7: control characters (0x00-0x1F) inside a JSON
# string must be escaped; raw occurrences must be rejected, while the
# equivalent \n / \t / \r escapes must decode normally.
# Unescaped control characters (0x00-0x1F) should be rejected
# Note: We need to create JSON strings with actual unescaped control characters
assert_load_failed('{"test": "hello\x0Aworld"}') # Unescaped newline (0x0A)
assert_load_failed('{"test": "hello\x09world"}') # Unescaped tab (0x09)
assert_load_failed('{"test": "hello\x0Dworld"}') # Unescaped carriage return (0x0D)
assert_load_failed('{"test": "hello\x01world"}') # Unescaped control char (0x01)
# Properly escaped control characters should work
var escaped_newline = json.load('{"test": "hello\\nworld"}')
assert(escaped_newline != nil && escaped_newline['test'] == "hello\nworld", "Escaped newline should work")
var escaped_tab = json.load('{"test": "hello\\tworld"}')
assert(escaped_tab != nil && escaped_tab['test'] == "hello\tworld", "Escaped tab should work")
var escaped_cr = json.load('{"test": "hello\\rworld"}')
assert(escaped_cr != nil && escaped_cr['test'] == "hello\rworld", "Escaped carriage return should work")
end
# Test 4: Invalid escape sequence rejection
def test_invalid_escapes()
# Escape characters outside the JSON grammar must fail to parse,
# including a bare backslash truncated at the end of the input.
var bad_escapes = ['"\\q"', '"\\x"', '"\\z"', '"\\"']
for esc: bad_escapes
assert_load_failed(esc)
end
end
# Test 5: String length limits
def test_string_length_limits()
# A 1000-character string is far below the 1MB security cap and must
# parse with its content intact.
var payload = ''
var count = 0
while count < 1000
payload += 'a'
count += 1
end
var parsed = json.load('{"test": "' + payload + '"}')
assert(parsed != nil, "Long string within limits should work")
assert(size(parsed['test']) == 1000, "Long string should have correct length")
end
# Test 6: Mixed Unicode and ASCII (realistic scenario)
def test_mixed_content()
# Realistic payload mixing plain ASCII with 3-byte (\u4E16\u754C -> 世界)
# and 1-byte (\u0048..\u006F -> "Hello") Unicode escapes; this shape
# could trigger the old undersized-buffer overflow.
# Test realistic mixed content that could trigger the vulnerability
var mixed = '{"message": "Hello \\u4E16\\u754C! Welcome to \\u0048\\u0065\\u006C\\u006C\\u006F world."}'
var result = json.load(mixed)
assert(result != nil, "Mixed Unicode/ASCII should work")
assert(result['message'] == "Hello 世界! Welcome to Hello world.", "Mixed content should decode correctly")
end
# Test 7: Edge cases
def test_edge_cases()
# Boundary conditions: empty string, unicode-only string, escape-only
# string, and the maximum 4-hex-digit code point U+FFFF.
# Empty string
var empty_result = json.load('{"empty": ""}')
assert(empty_result != nil && empty_result['empty'] == "", "Empty string should work")
# String with only Unicode
var unicode_result = json.load('{"unicode": "\\u0048\\u0065\\u006C\\u006C\\u006F"}')
assert(unicode_result != nil && unicode_result['unicode'] == "Hello", "Unicode-only string should work")
# String with only escapes
var escapes_result = json.load('{"escapes": "\\n\\t\\r\\\\\\\""}')
assert(escapes_result != nil && escapes_result['escapes'] == "\n\t\r\\\"", "Escape-only string should work")
# Maximum valid Unicode value
var max_unicode_result = json.load('{"max_unicode": "\\uFFFF"}')
assert(max_unicode_result != nil, "Maximum Unicode value should work")
end
# Test 8: Malformed JSON strings
def test_malformed_strings()
# Structurally broken strings must be rejected without crashing:
# missing closing quote, trailing backslash, trailing junk after a
# valid string, and a bad escape followed by otherwise-valid members.
# Unterminated strings
assert_load_failed('{"test": "unterminated')
assert_load_failed('{"test": "unterminated\\')
# Invalid JSON structure with string issues
assert_load_failed('{"test": "valid"x}')
assert_load_failed('{"test": "\\uXXXX", "other": "valid"}')
end
# Test 9: Nested objects with Unicode (stress test)
def test_nested_unicode_stress()
# Exercise buffer allocation/free across multiple nested objects, each
# containing multi-byte Unicode escapes, to shake out memory bugs.
# Create nested structure with Unicode to test memory management
var nested = '{"level0": {"unicode": "\\u0800\\u0801\\u0802", "level1": {"unicode": "\\u0800\\u0801\\u0802", "final": "\\u4E16\\u754C"}}}'
var result = json.load(nested)
assert(result != nil, "Nested Unicode structure should parse successfully")
end
# Test 10: Security regression test
def test_security_regression()
# Direct regression test for the reported overflow: 100 \u0800 escapes
# need 300 output bytes, but the pre-fix allocator reserved only 100.
# This specific pattern would cause buffer overflow in the original code
# \u0800 sequences: 6 chars in JSON -> 3 bytes in UTF-8 (50% expansion)
var attack_pattern = '{"payload": "'
for i: 0..99 # 100 sequences (0-99 inclusive) = 600 chars in JSON, 300 bytes needed, but old code allocated only 100 bytes
attack_pattern += '\\u0800'
end
attack_pattern += '"}'
var result = json.load(attack_pattern)
assert(result != nil, "Security regression test should not crash")
assert(size(result['payload']) == 300, "Should produce exactly 300 UTF-8 bytes") # 100 * 3 bytes
end
# Run all security tests
# Each test raises via assert on failure, so reaching the end of this
# list means the whole security suite passed.
test_unicode_expansion()
test_invalid_unicode()
test_control_characters()
test_invalid_escapes()
test_string_length_limits()
test_mixed_content()
test_edge_cases()
test_malformed_strings()
test_nested_unicode_stress()
test_security_regression()