From 1bb814b793333bd0fda61f115b69c1270355fedd Mon Sep 17 00:00:00 2001 From: Mike Degatano Date: Thu, 20 Jun 2024 15:50:29 -0400 Subject: [PATCH] Home Assistant watchdog attempts safe mode after max fails (#5124) * Home Assistant watchdog attempts safe mode after max fails * Remove duplicate line * Refactor and logging change from feedback * Update supervisor/misc/tasks.py * Fix log text check in test --------- Co-authored-by: Stefan Agner --- supervisor/misc/tasks.py | 47 +++++++++++++++++++++++++--------------- tests/misc/test_tasks.py | 31 ++++++++++++++++++++------ 2 files changed, 53 insertions(+), 25 deletions(-) diff --git a/supervisor/misc/tasks.py b/supervisor/misc/tasks.py index 772ba1302..685f2bcb7 100644 --- a/supervisor/misc/tasks.py +++ b/supervisor/misc/tasks.py @@ -174,17 +174,6 @@ class Tasks(CoreSysAttributes): self._cache[HASS_WATCHDOG_API_FAILURES] = 0 return - # Give up after 5 reanimation failures in a row. Supervisor cannot fix this issue. - reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0) - if reanimate_fails >= HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS: - if reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS: - _LOGGER.critical( - "Watchdog cannot reanimate Home Assistant Core, failed all %s attempts.", - reanimate_fails, - ) - self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] += 1 - return - # Init cache data api_fails = self._cache.get(HASS_WATCHDOG_API_FAILURES, 0) @@ -195,16 +184,38 @@ class Tasks(CoreSysAttributes): _LOGGER.warning("Watchdog missed an Home Assistant Core API response.") return - _LOGGER.error( - "Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core API!", - HASS_WATCHDOG_MAX_API_ATTEMPTS, - ) + # After 5 reanimation attempts switch to safe mode. If that fails, give up + reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0) + if reanimate_fails > HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS: + return + + if safe_mode := reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS: + _LOGGER.critical( + "Watchdog cannot reanimate Home Assistant Core, failed all %s attempts. Restarting into safe mode", + reanimate_fails, + ) + else: + _LOGGER.error( + "Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core!", + HASS_WATCHDOG_MAX_API_ATTEMPTS, + ) + try: - await self.sys_homeassistant.core.restart() + if safe_mode: + await self.sys_homeassistant.core.rebuild(safe_mode=True) + else: + await self.sys_homeassistant.core.restart() except HomeAssistantError as err: - _LOGGER.error("Home Assistant watchdog reanimation failed!") - if reanimate_fails == 0: + if reanimate_fails == 0 or safe_mode: capture_exception(err) + + if safe_mode: + _LOGGER.critical( + "Safe mode restart failed. Watchdog cannot bring Home Assistant online." + ) + else: + _LOGGER.error("Home Assistant watchdog reanimation failed!") + self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = reanimate_fails + 1 else: self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0 diff --git a/tests/misc/test_tasks.py b/tests/misc/test_tasks.py index c29d2e617..65071d452 100644 --- a/tests/misc/test_tasks.py +++ b/tests/misc/test_tasks.py @@ -46,7 +46,7 @@ async def test_watchdog_homeassistant_api( restart.assert_called_once() assert "Watchdog missed an Home Assistant Core API response." not in caplog.text assert ( - "Watchdog missed 2 Home Assistant Core API responses in a row. Restarting Home Assistant Core API!" + "Watchdog missed 2 Home Assistant Core API responses in a row. Restarting Home Assistant Core!" in caplog.text ) @@ -109,31 +109,48 @@ async def test_watchdog_homeassistant_api_reanimation_limit( HomeAssistantAPI, "check_api_state", return_value=False ), patch.object( HomeAssistantCore, "restart", side_effect=(err := HomeAssistantError()) - ) as restart: + ) as restart, patch.object( + HomeAssistantCore, "rebuild", side_effect=err + ) as rebuild: for _ in range(5): await tasks._watchdog_homeassistant_api() restart.assert_not_called() await tasks._watchdog_homeassistant_api() - restart.assert_called_once() + restart.assert_called_once_with() assert "Home Assistant watchdog reanimation failed!" in caplog.text + rebuild.assert_not_called() restart.reset_mock() capture_exception.assert_called_once_with(err) + # Next time it should try safe mode caplog.clear() await tasks._watchdog_homeassistant_api() + rebuild.assert_not_called() + await tasks._watchdog_homeassistant_api() + + rebuild.assert_called_once_with(safe_mode=True) restart.assert_not_called() - assert "Watchdog missed an Home Assistant Core API response." not in caplog.text - assert "Watchdog found a problem with Home Assistant API!" not in caplog.text assert ( - "Watchdog cannot reanimate Home Assistant Core, failed all 5 attempts." + "Watchdog cannot reanimate Home Assistant Core, failed all 5 attempts. Restarting into safe mode" + in caplog.text + ) + assert ( + "Safe mode restart failed. Watchdog cannot bring Home Assistant online." in caplog.text ) + # After safe mode has failed too, no more restart attempts + rebuild.reset_mock() + caplog.clear() + await tasks._watchdog_homeassistant_api() + assert "Watchdog missed an Home Assistant Core API response." in caplog.text + caplog.clear() await tasks._watchdog_homeassistant_api() - restart.assert_not_called() assert not caplog.text + restart.assert_not_called() + rebuild.assert_not_called()