mirror of
https://github.com/home-assistant/supervisor.git
synced 2025-11-15 22:10:12 +00:00
Home Assistant watchdog attempts safe mode after max fails (#5124)
* Home Assistant watchdog attempts safe mode after max fails
* Remove duplicate line
* Refactor and logging change from feedback
* Update supervisor/misc/tasks.py
* Fix log text check in test

---------

Co-authored-by: Stefan Agner <stefan@agner.ch>
This commit is contained in:
@@ -174,17 +174,6 @@ class Tasks(CoreSysAttributes):
|
||||
self._cache[HASS_WATCHDOG_API_FAILURES] = 0
|
||||
return
|
||||
|
||||
# Give up after 5 reanimation failures in a row. Supervisor cannot fix this issue.
|
||||
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
|
||||
if reanimate_fails >= HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
||||
if reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
||||
_LOGGER.critical(
|
||||
"Watchdog cannot reanimate Home Assistant Core, failed all %s attempts.",
|
||||
reanimate_fails,
|
||||
)
|
||||
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] += 1
|
||||
return
|
||||
|
||||
# Init cache data
|
||||
api_fails = self._cache.get(HASS_WATCHDOG_API_FAILURES, 0)
|
||||
|
||||
@@ -195,16 +184,38 @@ class Tasks(CoreSysAttributes):
|
||||
_LOGGER.warning("Watchdog missed an Home Assistant Core API response.")
|
||||
return
|
||||
|
||||
_LOGGER.error(
|
||||
"Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core API!",
|
||||
HASS_WATCHDOG_MAX_API_ATTEMPTS,
|
||||
)
|
||||
# After HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS failed reanimation attempts, switch to safe mode. If that also fails, give up.
|
||||
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
|
||||
if reanimate_fails > HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
||||
return
|
||||
|
||||
if safe_mode := reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
||||
_LOGGER.critical(
|
||||
"Watchdog cannot reanimate Home Assistant Core, failed all %s attempts. Restarting into safe mode",
|
||||
reanimate_fails,
|
||||
)
|
||||
else:
|
||||
_LOGGER.error(
|
||||
"Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core!",
|
||||
HASS_WATCHDOG_MAX_API_ATTEMPTS,
|
||||
)
|
||||
|
||||
try:
|
||||
await self.sys_homeassistant.core.restart()
|
||||
if safe_mode:
|
||||
await self.sys_homeassistant.core.rebuild(safe_mode=True)
|
||||
else:
|
||||
await self.sys_homeassistant.core.restart()
|
||||
except HomeAssistantError as err:
|
||||
_LOGGER.error("Home Assistant watchdog reanimation failed!")
|
||||
if reanimate_fails == 0:
|
||||
if reanimate_fails == 0 or safe_mode:
|
||||
capture_exception(err)
|
||||
|
||||
if safe_mode:
|
||||
_LOGGER.critical(
|
||||
"Safe mode restart failed. Watchdog cannot bring Home Assistant online."
|
||||
)
|
||||
else:
|
||||
_LOGGER.error("Home Assistant watchdog reanimation failed!")
|
||||
|
||||
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = reanimate_fails + 1
|
||||
else:
|
||||
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0
|
||||
|
||||
Reference in New Issue
Block a user