From 9cc2a04d54cda2cdeb4c386a7d79acde4717fe92 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 8 Jun 2025 17:29:26 -0500 Subject: [PATCH] Implement proper API connection teardown before deep sleep/reboot (#9008) --- esphome/components/api/api_connection.cpp | 3 +- esphome/components/api/api_server.cpp | 38 ++++- esphome/components/api/api_server.h | 2 + .../deep_sleep/deep_sleep_component.cpp | 5 + .../components/esp32_ble_server/ble_server.h | 1 - esphome/core/application.cpp | 150 +++++++++++------- esphome/core/application.h | 15 ++ esphome/core/component.h | 6 + 8 files changed, 156 insertions(+), 64 deletions(-) diff --git a/esphome/components/api/api_connection.cpp b/esphome/components/api/api_connection.cpp index 684ffd8cd7..e6875a5d6d 100644 --- a/esphome/components/api/api_connection.cpp +++ b/esphome/components/api/api_connection.cpp @@ -275,7 +275,8 @@ DisconnectResponse APIConnection::disconnect(const DisconnectRequest &msg) { return resp; } void APIConnection::on_disconnect_response(const DisconnectResponse &value) { - // pass + this->helper_->close(); + this->remove_ = true; } #ifdef USE_BINARY_SENSOR diff --git a/esphome/components/api/api_server.cpp b/esphome/components/api/api_server.cpp index 2d5c507b9d..63b10527d4 100644 --- a/esphome/components/api/api_server.cpp +++ b/esphome/components/api/api_server.cpp @@ -88,6 +88,12 @@ void APIServer::setup() { #ifdef USE_LOGGER if (logger::global_logger != nullptr) { logger::global_logger->add_on_log_callback([this](int level, const char *tag, const char *message) { + if (this->shutting_down_) { + // Don't try to send logs during shutdown + // as it could result in a recursion and + // we would be filling a buffer we are trying to clear + return; + } for (auto &c : this->clients_) { if (!c->remove_) c->try_send_log_message(level, tag, message); @@ -112,8 +118,8 @@ void APIServer::setup() { } void APIServer::loop() { - // Accept new clients only if the socket has incoming connections - if (this->socket_->ready()) { + // Accept new clients only if the socket exists and has incoming connections + if (this->socket_ && this->socket_->ready()) { while (true) { struct sockaddr_storage source_addr; socklen_t addr_len = sizeof(source_addr); @@ -474,10 +480,32 @@ void APIServer::request_time() { bool APIServer::is_connected() const { return !this->clients_.empty(); } void APIServer::on_shutdown() { - for (auto &c : this->clients_) { - c->send_disconnect_request(DisconnectRequest()); + this->shutting_down_ = true; + + // Close the listening socket to prevent new connections + if (this->socket_) { + this->socket_->close(); + this->socket_ = nullptr; } - delay(10); + + // Send disconnect requests to all connected clients + for (auto &c : this->clients_) { + if (!c->send_disconnect_request(DisconnectRequest())) { + // If we can't send the disconnect request, mark for immediate closure + c->next_close_ = true; + } + } +} + +bool APIServer::teardown() { + // If network is disconnected, no point trying to flush buffers + if (!network::is_connected()) { + return true; + } + this->loop(); + + // Return true only when all clients have been torn down + return this->clients_.empty(); } } // namespace api diff --git a/esphome/components/api/api_server.h b/esphome/components/api/api_server.h index a6645b96ce..a8c4a0f009 100644 --- a/esphome/components/api/api_server.h +++ b/esphome/components/api/api_server.h @@ -34,6 +34,7 @@ class APIServer : public Component, public Controller { void loop() override; void dump_config() override; void on_shutdown() override; + bool teardown() override; bool check_password(const std::string &password) const; bool uses_password() const; void set_port(uint16_t port); @@ -136,6 +137,7 @@ class APIServer : public Component, public Controller { } protected: + bool shutting_down_ = false; std::unique_ptr socket_ = nullptr; uint16_t port_{6053}; uint32_t reboot_timeout_{300000}; diff --git a/esphome/components/deep_sleep/deep_sleep_component.cpp b/esphome/components/deep_sleep/deep_sleep_component.cpp index b53dabc92f..38a12e8e90 100644 --- a/esphome/components/deep_sleep/deep_sleep_component.cpp +++ b/esphome/components/deep_sleep/deep_sleep_component.cpp @@ -6,6 +6,8 @@ namespace esphome { namespace deep_sleep { static const char *const TAG = "deep_sleep"; +// 5 seconds for deep sleep to ensure clean disconnect from Home Assistant +static const uint32_t TEARDOWN_TIMEOUT_DEEP_SLEEP_MS = 5000; bool global_has_deep_sleep = false; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) @@ -62,6 +64,9 @@ void DeepSleepComponent::begin_sleep(bool manual) { ESP_LOGI(TAG, "Sleeping for %" PRId64 "us", *this->sleep_duration_); } App.run_safe_shutdown_hooks(); + // It's critical to teardown components cleanly for deep sleep to ensure + // Home Assistant sees a clean disconnect instead of marking the device unavailable + App.teardown_components(TEARDOWN_TIMEOUT_DEEP_SLEEP_MS); this->deep_sleep_(); } diff --git a/esphome/components/esp32_ble_server/ble_server.h b/esphome/components/esp32_ble_server/ble_server.h index 43599438f3..531b52d6b9 100644 --- a/esphome/components/esp32_ble_server/ble_server.h +++ b/esphome/components/esp32_ble_server/ble_server.h @@ -43,7 +43,6 @@ class BLEServer : public Component, float get_setup_priority() const override; bool can_proceed() override; - void teardown(); bool is_running(); void set_manufacturer_data(const std::vector &data) { diff --git a/esphome/core/application.cpp b/esphome/core/application.cpp index 4e62c8af1f..f60015af38 100644 --- a/esphome/core/application.cpp +++ b/esphome/core/application.cpp @@ -126,63 +126,7 @@ void Application::loop() { next_schedule = std::max(next_schedule, delay_time / 2); delay_time = std::min(next_schedule, delay_time); -#ifdef USE_SOCKET_SELECT_SUPPORT - if (!this->socket_fds_.empty()) { - // Use select() with timeout when we have sockets to monitor - - // Update fd_set if socket list has changed - if (this->socket_fds_changed_) { - FD_ZERO(&this->base_read_fds_); - for (int fd : this->socket_fds_) { - if (fd >= 0 && fd < FD_SETSIZE) { - FD_SET(fd, &this->base_read_fds_); - } - } - this->socket_fds_changed_ = false; - } - - // Copy base fd_set before each select - this->read_fds_ = this->base_read_fds_; - - // Convert delay_time (milliseconds) to timeval - struct timeval tv; - tv.tv_sec = delay_time / 1000; - tv.tv_usec = (delay_time - tv.tv_sec * 1000) * 1000; - - // Call select with timeout -#if defined(USE_SOCKET_IMPL_LWIP_SOCKETS) || (defined(USE_ESP32) && defined(USE_SOCKET_IMPL_BSD_SOCKETS)) - // Use lwip_select() on platforms with lwIP - it's faster - // Note: On ESP32 with BSD sockets, select() is already mapped to lwip_select() via macros, - // but we explicitly call lwip_select() for clarity and to ensure we get the optimized version - int ret = lwip_select(this->max_fd_ + 1, &this->read_fds_, nullptr, nullptr, &tv); -#else - // Use standard select() on other platforms (e.g., host/native builds) - int ret = ::select(this->max_fd_ + 1, &this->read_fds_, nullptr, nullptr, &tv); -#endif - - // Process select() result: - // ret < 0: error (except EINTR which is normal) - // ret > 0: socket(s) have data ready - normal and expected - // ret == 0: timeout occurred - normal and expected - if (ret < 0) { - if (errno == EINTR) { - // Interrupted by signal - this is normal, just continue - // No need to delay as some time has already passed - ESP_LOGVV(TAG, "select() interrupted by signal"); - } else { - // Actual error - log and fall back to delay - ESP_LOGW(TAG, "select() failed with errno %d", errno); - delay(delay_time); - } - } - } else { - // No sockets registered, use regular delay - delay(delay_time); - } -#else - // No select support, use regular delay - delay(delay_time); -#endif + this->delay_with_select_(delay_time); } this->last_loop_ = last_op_end_time; @@ -224,6 +168,7 @@ void Application::reboot() { void Application::safe_reboot() { ESP_LOGI(TAG, "Rebooting safely"); run_safe_shutdown_hooks(); + teardown_components(TEARDOWN_TIMEOUT_REBOOT_MS); arch_restart(); } @@ -236,6 +181,49 @@ void Application::run_safe_shutdown_hooks() { } } +void Application::teardown_components(uint32_t timeout_ms) { + uint32_t start_time = millis(); + + // Copy all components in reverse order using reverse iterators + // Reverse order matches the behavior of run_safe_shutdown_hooks() above and ensures + // components are torn down in the opposite order of their setup_priority (which is + // used to sort components during Application::setup()) + std::vector pending_components(this->components_.rbegin(), this->components_.rend()); + + uint32_t now = start_time; + while (!pending_components.empty() && (now - start_time) < timeout_ms) { + // Feed watchdog during teardown to prevent triggering + this->feed_wdt(now); + + // Use iterator to safely erase elements + for (auto it = pending_components.begin(); it != pending_components.end();) { + if ((*it)->teardown()) { + // Component finished teardown, erase it + it = pending_components.erase(it); + } else { + // Component still needs time + ++it; + } + } + + // Give some time for I/O operations if components are still pending + if (!pending_components.empty()) { + this->delay_with_select_(1); + } + + // Update time for next iteration + now = millis(); + } + + if (!pending_components.empty()) { + // Note: At this point, connections are either disconnected or in a bad state, + // so this warning will only appear via serial rather than being transmitted to clients + for (auto *component : pending_components) { + ESP_LOGW(TAG, "%s did not complete teardown within %u ms", component->get_component_source(), timeout_ms); + } + } +} + void Application::calculate_looping_components_() { for (auto *obj : this->components_) { if (obj->has_overridden_loop()) @@ -304,6 +292,54 @@ bool Application::is_socket_ready(int fd) const { } #endif +void Application::delay_with_select_(uint32_t delay_ms) { +#ifdef USE_SOCKET_SELECT_SUPPORT + if (!this->socket_fds_.empty()) { + // Update fd_set if socket list has changed + if (this->socket_fds_changed_) { + FD_ZERO(&this->base_read_fds_); + for (int fd : this->socket_fds_) { + if (fd >= 0 && fd < FD_SETSIZE) { + FD_SET(fd, &this->base_read_fds_); + } + } + this->socket_fds_changed_ = false; + } + + // Copy base fd_set before each select + this->read_fds_ = this->base_read_fds_; + + // Convert delay_ms to timeval + struct timeval tv; + tv.tv_sec = delay_ms / 1000; + tv.tv_usec = (delay_ms - tv.tv_sec * 1000) * 1000; + + // Call select with timeout +#if defined(USE_SOCKET_IMPL_LWIP_SOCKETS) || (defined(USE_ESP32) && defined(USE_SOCKET_IMPL_BSD_SOCKETS)) + int ret = lwip_select(this->max_fd_ + 1, &this->read_fds_, nullptr, nullptr, &tv); +#else + int ret = ::select(this->max_fd_ + 1, &this->read_fds_, nullptr, nullptr, &tv); +#endif + + // Process select() result: + // ret < 0: error (except EINTR which is normal) + // ret > 0: socket(s) have data ready - normal and expected + // ret == 0: timeout occurred - normal and expected + if (ret < 0 && errno != EINTR) { + // Actual error - log and fall back to delay + ESP_LOGW(TAG, "select() failed with errno %d", errno); + delay(delay_ms); + } + } else { + // No sockets registered, use regular delay + delay(delay_ms); + } +#else + // No select support, use regular delay + delay(delay_ms); +#endif +} + Application App; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) } // namespace esphome diff --git a/esphome/core/application.h b/esphome/core/application.h index c6e6d8b78e..90aa175edc 100644 --- a/esphome/core/application.h +++ b/esphome/core/application.h @@ -79,6 +79,12 @@ namespace esphome { +// Teardown timeout constant (in milliseconds) +// For reboots, it's more important to shut down quickly than disconnect cleanly +// since we're not entering deep sleep. The only consequence of not shutting down +// cleanly is a warning in the log. +static const uint32_t TEARDOWN_TIMEOUT_REBOOT_MS = 1000; // 1 second for quick reboot + class Application { public: void pre_setup(const std::string &name, const std::string &friendly_name, const std::string &area, @@ -251,6 +257,12 @@ class Application { void run_safe_shutdown_hooks(); + /** Teardown all components with a timeout. + * + * @param timeout_ms Maximum time to wait for teardown in milliseconds + */ + void teardown_components(uint32_t timeout_ms); + uint32_t get_app_state() const { return this->app_state_; } #ifdef USE_BINARY_SENSOR @@ -493,6 +505,9 @@ class Application { void feed_wdt_arch_(); + /// Perform a delay while also monitoring socket file descriptors for readiness + void delay_with_select_(uint32_t delay_ms); + std::vector components_{}; std::vector looping_components_{}; diff --git a/esphome/core/component.h b/esphome/core/component.h index 7b3e12eb59..a8c6b156b2 100644 --- a/esphome/core/component.h +++ b/esphome/core/component.h @@ -110,6 +110,12 @@ class Component { virtual void on_shutdown() {} virtual void on_safe_shutdown() {} + /** Called during teardown to allow component to gracefully finish operations. + * + * @return true if teardown is complete, false if more time is needed + */ + virtual bool teardown() { return true; } + uint32_t get_component_state() const; /** Mark this component as failed. Any future timeouts/intervals/setup/loop will no longer be called.