Fix buffer corruption in API message encoding with very verbose logging (#9249)

This commit is contained in:
J. Nick Koston 2025-06-28 15:04:42 -05:00 committed by GitHub
parent 30f61b26ff
commit 13d4823db6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 210 additions and 5 deletions

View File

@ -94,6 +94,19 @@ APIConnection::~APIConnection() {
#endif
}
#ifdef HAS_PROTO_MESSAGE_DUMP
void APIConnection::log_batch_item_(const DeferredBatch::BatchItem &item) {
  // Log a batched message for very-verbose debugging WITHOUT encoding it into
  // the shared buffer. Setting log_only_mode_ makes encode_message_to_buffer
  // short-circuit to log_send_message_ and return immediately, so the buffer
  // that was already sent (or is being sent) is never touched.
  this->log_only_mode_ = true;
  // Invoke the stored creator: it rebuilds the message and routes it through
  // encode_message_to_buffer, which in log-only mode just dumps it to the log.
  // max() as the remaining size means "never reject for being too large".
  item.creator(item.entity, this, std::numeric_limits<uint16_t>::max(), true, item.message_type);
  // Restore normal encode behavior for subsequent messages.
  this->log_only_mode_ = false;
}
#endif
void APIConnection::loop() {
if (this->next_close_) {
// requested a disconnect
@ -249,6 +262,14 @@ void APIConnection::on_disconnect_response(const DisconnectResponse &value) {
// including header and footer overhead. Returns 0 if the message doesn't fit.
uint16_t APIConnection::encode_message_to_buffer(ProtoMessage &msg, uint16_t message_type, APIConnection *conn,
uint32_t remaining_size, bool is_single) {
#ifdef HAS_PROTO_MESSAGE_DUMP
// If in log-only mode, just log and return
if (conn->log_only_mode_) {
conn->log_send_message_(msg.message_name(), msg.dump());
return 1; // Return non-zero to indicate "success" for logging
}
#endif
// Calculate size
uint32_t calculated_size = 0;
msg.calculate_size(calculated_size);
@ -276,11 +297,6 @@ uint16_t APIConnection::encode_message_to_buffer(ProtoMessage &msg, uint16_t mes
// Encode directly into buffer
msg.encode(buffer);
#ifdef HAS_PROTO_MESSAGE_DUMP
// Log the message for VV debugging
conn->log_send_message_(msg.message_name(), msg.dump());
#endif
// Calculate actual encoded size (not including header that was already added)
size_t actual_payload_size = shared_buf.size() - size_before_encode;
@ -1891,6 +1907,15 @@ void APIConnection::process_batch_() {
}
}
#ifdef HAS_PROTO_MESSAGE_DUMP
// Log messages after send attempt for VV debugging
// It's safe to use the buffer for logging at this point regardless of send result
for (size_t i = 0; i < items_processed; i++) {
const auto &item = this->deferred_batch_.items[i];
this->log_batch_item_(item);
}
#endif
// Handle remaining items more efficiently
if (items_processed < this->deferred_batch_.items.size()) {
// Remove processed items from the beginning

View File

@ -470,6 +470,10 @@ class APIConnection : public APIServerConnection {
bool sent_ping_{false};
bool service_call_subscription_{false};
bool next_close_ = false;
#ifdef HAS_PROTO_MESSAGE_DUMP
// When true, encode_message_to_buffer will only log, not encode
bool log_only_mode_{false};
#endif
uint8_t ping_retries_{0};
// 8 bytes used, no padding needed
@ -627,6 +631,10 @@ class APIConnection : public APIServerConnection {
// State for batch buffer allocation
bool batch_first_message_{false};
#ifdef HAS_PROTO_MESSAGE_DUMP
void log_batch_item_(const DeferredBatch::BatchItem &item);
#endif
// Helper function to schedule a deferred message with known message type
bool schedule_message_(EntityBase *entity, MessageCreator creator, uint16_t message_type) {
this->deferred_batch_.add_item(entity, std::move(creator), message_type);

View File

@ -0,0 +1,89 @@
# ESPHome fixture for the VERY_VERBOSE logging integration test.
# Reconstructed with proper YAML indentation (the flattened form was not
# valid ESPHome configuration).
esphome:
  name: vv-logging-test

host:

api:

logger:
  level: VERY_VERBOSE
  # Enable VV logging for API components where the issue occurs
  logs:
    api.connection: VERY_VERBOSE
    api.service: VERY_VERBOSE
    api.proto: VERY_VERBOSE
    sensor: VERY_VERBOSE

# Create many sensors that update frequently to generate API traffic
# This will cause many messages to be batched and sent, triggering the
# code path where VV logging could cause buffer corruption
sensor:
  - platform: template
    name: "Test Sensor 1"
    lambda: 'return millis() / 1000.0;'
    update_interval: 50ms
    unit_of_measurement: "s"
  - platform: template
    name: "Test Sensor 2"
    lambda: 'return (millis() / 1000.0) + 10;'
    update_interval: 50ms
    unit_of_measurement: "s"
  - platform: template
    name: "Test Sensor 3"
    lambda: 'return (millis() / 1000.0) + 20;'
    update_interval: 50ms
    unit_of_measurement: "s"
  - platform: template
    name: "Test Sensor 4"
    lambda: 'return (millis() / 1000.0) + 30;'
    update_interval: 50ms
    unit_of_measurement: "s"
  - platform: template
    name: "Test Sensor 5"
    lambda: 'return (millis() / 1000.0) + 40;'
    update_interval: 50ms
    unit_of_measurement: "s"
  - platform: template
    name: "Test Sensor 6"
    lambda: 'return (millis() / 1000.0) + 50;'
    update_interval: 50ms
    unit_of_measurement: "s"
  - platform: template
    name: "Test Sensor 7"
    lambda: 'return (millis() / 1000.0) + 60;'
    update_interval: 50ms
    unit_of_measurement: "s"
  - platform: template
    name: "Test Sensor 8"
    lambda: 'return (millis() / 1000.0) + 70;'
    update_interval: 50ms
    unit_of_measurement: "s"
  - platform: template
    name: "Test Sensor 9"
    lambda: 'return (millis() / 1000.0) + 80;'
    update_interval: 50ms
    unit_of_measurement: "s"
  - platform: template
    name: "Test Sensor 10"
    lambda: 'return (millis() / 1000.0) + 90;'
    update_interval: 50ms
    unit_of_measurement: "s"

# Add some binary sensors too for variety
binary_sensor:
  - platform: template
    name: "Test Binary 1"
    lambda: 'return (millis() / 1000) % 2 == 0;'
  - platform: template
    name: "Test Binary 2"
    lambda: 'return (millis() / 1000) % 3 == 0;'

View File

@ -0,0 +1,83 @@
"""Integration test for API with VERY_VERBOSE logging to verify no buffer corruption."""
from __future__ import annotations
import asyncio
from typing import Any
from aioesphomeapi import LogLevel
import pytest
from .types import APIClientConnectedFactory, RunCompiledFunction
@pytest.mark.asyncio
async def test_api_vv_logging(
    yaml_config: str,
    run_compiled: RunCompiledFunction,
    api_client_connected: APIClientConnectedFactory,
) -> None:
    """Test that VERY_VERBOSE logging doesn't cause buffer corruption with API messages.

    Subscribes to VV logs (the code path that previously corrupted the shared
    encode buffer) while many template sensors stream state updates, then
    asserts that both log lines and sensor updates keep flowing with no
    error/assert text appearing in the device log.
    """
    # Track that we're receiving VV log messages and sensor updates
    vv_logs_received = 0
    sensor_updates_received = 0
    errors_detected: list[str] = []

    def on_log(msg: Any) -> None:
        """Capture log messages and tally VV lines / error text."""
        nonlocal vv_logs_received
        # msg is a SubscribeLogsResponse object with a 'message' attribute;
        # the message field is always bytes.
        message_text = msg.message.decode("utf-8", errors="replace")
        # Only count VV logs specifically
        if "[VV]" in message_text:
            vv_logs_received += 1
        # Check for assertion or error messages
        if "assert" in message_text.lower() or "error" in message_text.lower():
            errors_detected.append(message_text)

    # Write, compile and run the ESPHome device
    async with run_compiled(yaml_config), api_client_connected() as client:
        # Subscribe to VERY_VERBOSE logs - this enables the code path that
        # could cause corruption
        client.subscribe_logs(on_log, log_level=LogLevel.LOG_LEVEL_VERY_VERBOSE)

        # Wait for device to be ready
        device_info = await client.device_info()
        assert device_info is not None
        assert device_info.name == "vv-logging-test"

        # Subscribe to sensor states
        states = {}

        def on_state(state) -> None:
            """Count every state update and remember the latest per key."""
            nonlocal sensor_updates_received
            sensor_updates_received += 1
            states[state.key] = state

        client.subscribe_states(on_state)

        # List entities to find our test sensors
        entity_info, _ = await client.list_entities_services()

        # Count sensors (template sensors expose unit_of_measurement)
        sensor_count = sum(1 for e in entity_info if hasattr(e, "unit_of_measurement"))
        assert sensor_count >= 10, f"Expected at least 10 sensors, got {sensor_count}"

        # Wait for sensor updates to flow with VV logging active.
        # The sensors update every 50ms, so we should get many updates.
        await asyncio.sleep(0.25)

        # Verify we received both VV logs and sensor updates
        assert vv_logs_received > 0, "Expected to receive VERY_VERBOSE log messages"
        assert sensor_updates_received > 10, (
            f"Expected many sensor updates, got {sensor_updates_received}"
        )

        # Check for any errors
        if errors_detected:
            pytest.fail(f"Errors detected during test: {errors_detected}")
        # The test passes if we didn't hit any assertions or buffer corruption