Home Assistant watchdog attempts safe mode after max fails (#5124)

* Home Assistant watchdog attempts safe mode after max fails

* Remove duplicate line

* Refactor and logging change from feedback

* Update supervisor/misc/tasks.py

* Fix log text check in test

---------

Co-authored-by: Stefan Agner <stefan@agner.ch>
This commit is contained in:
Mike Degatano
2024-06-20 15:50:29 -04:00
committed by GitHub
parent 918fcb7d62
commit 1bb814b793
2 changed files with 53 additions and 25 deletions

View File

@@ -174,17 +174,6 @@ class Tasks(CoreSysAttributes):
self._cache[HASS_WATCHDOG_API_FAILURES] = 0
return
# Give up after 5 reanimation failures in a row. Supervisor cannot fix this issue.
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
if reanimate_fails >= HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
if reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
_LOGGER.critical(
"Watchdog cannot reanimate Home Assistant Core, failed all %s attempts.",
reanimate_fails,
)
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] += 1
return
# Init cache data
api_fails = self._cache.get(HASS_WATCHDOG_API_FAILURES, 0)
@@ -195,16 +184,38 @@ class Tasks(CoreSysAttributes):
_LOGGER.warning("Watchdog missed an Home Assistant Core API response.")
return
_LOGGER.error(
"Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core API!",
HASS_WATCHDOG_MAX_API_ATTEMPTS,
)
# After 5 reanimation attempts switch to safe mode. If that fails, give up
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
if reanimate_fails > HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
return
if safe_mode := reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
_LOGGER.critical(
"Watchdog cannot reanimate Home Assistant Core, failed all %s attempts. Restarting into safe mode",
reanimate_fails,
)
else:
_LOGGER.error(
"Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core!",
HASS_WATCHDOG_MAX_API_ATTEMPTS,
)
try:
await self.sys_homeassistant.core.restart()
if safe_mode:
await self.sys_homeassistant.core.rebuild(safe_mode=True)
else:
await self.sys_homeassistant.core.restart()
except HomeAssistantError as err:
_LOGGER.error("Home Assistant watchdog reanimation failed!")
if reanimate_fails == 0:
if reanimate_fails == 0 or safe_mode:
capture_exception(err)
if safe_mode:
_LOGGER.critical(
"Safe mode restart failed. Watchdog cannot bring Home Assistant online."
)
else:
_LOGGER.error("Home Assistant watchdog reanimation failed!")
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = reanimate_fails + 1
else:
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0