mirror of
https://github.com/home-assistant/supervisor.git
synced 2025-07-12 19:56:29 +00:00
Home Assistant watchdog attempts safe mode after max fails (#5124)
* Home Assistant watchdog attempts safe mode after max fails
* Remove duplicate line
* Refactor and logging change from feedback
* Update supervisor/misc/tasks.py
* Fix log text check in test

---------

Co-authored-by: Stefan Agner <stefan@agner.ch>
This commit is contained in:
parent
918fcb7d62
commit
1bb814b793
@ -174,17 +174,6 @@ class Tasks(CoreSysAttributes):
|
||||
self._cache[HASS_WATCHDOG_API_FAILURES] = 0
|
||||
return
|
||||
|
||||
# Give up after 5 reanimation failures in a row. Supervisor cannot fix this issue.
|
||||
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
|
||||
if reanimate_fails >= HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
||||
if reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
||||
_LOGGER.critical(
|
||||
"Watchdog cannot reanimate Home Assistant Core, failed all %s attempts.",
|
||||
reanimate_fails,
|
||||
)
|
||||
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] += 1
|
||||
return
|
||||
|
||||
# Init cache data
|
||||
api_fails = self._cache.get(HASS_WATCHDOG_API_FAILURES, 0)
|
||||
|
||||
@ -195,16 +184,38 @@ class Tasks(CoreSysAttributes):
|
||||
_LOGGER.warning("Watchdog missed an Home Assistant Core API response.")
|
||||
return
|
||||
|
||||
_LOGGER.error(
|
||||
"Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core API!",
|
||||
HASS_WATCHDOG_MAX_API_ATTEMPTS,
|
||||
)
|
||||
# After 5 reanimation attempts switch to safe mode. If that fails, give up
|
||||
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
|
||||
if reanimate_fails > HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
||||
return
|
||||
|
||||
if safe_mode := reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
||||
_LOGGER.critical(
|
||||
"Watchdog cannot reanimate Home Assistant Core, failed all %s attempts. Restarting into safe mode",
|
||||
reanimate_fails,
|
||||
)
|
||||
else:
|
||||
_LOGGER.error(
|
||||
"Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core!",
|
||||
HASS_WATCHDOG_MAX_API_ATTEMPTS,
|
||||
)
|
||||
|
||||
try:
|
||||
await self.sys_homeassistant.core.restart()
|
||||
if safe_mode:
|
||||
await self.sys_homeassistant.core.rebuild(safe_mode=True)
|
||||
else:
|
||||
await self.sys_homeassistant.core.restart()
|
||||
except HomeAssistantError as err:
|
||||
_LOGGER.error("Home Assistant watchdog reanimation failed!")
|
||||
if reanimate_fails == 0:
|
||||
if reanimate_fails == 0 or safe_mode:
|
||||
capture_exception(err)
|
||||
|
||||
if safe_mode:
|
||||
_LOGGER.critical(
|
||||
"Safe mode restart failed. Watchdog cannot bring Home Assistant online."
|
||||
)
|
||||
else:
|
||||
_LOGGER.error("Home Assistant watchdog reanimation failed!")
|
||||
|
||||
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = reanimate_fails + 1
|
||||
else:
|
||||
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0
|
||||
|
@ -46,7 +46,7 @@ async def test_watchdog_homeassistant_api(
|
||||
restart.assert_called_once()
|
||||
assert "Watchdog missed an Home Assistant Core API response." not in caplog.text
|
||||
assert (
|
||||
"Watchdog missed 2 Home Assistant Core API responses in a row. Restarting Home Assistant Core API!"
|
||||
"Watchdog missed 2 Home Assistant Core API responses in a row. Restarting Home Assistant Core!"
|
||||
in caplog.text
|
||||
)
|
||||
|
||||
@ -109,31 +109,48 @@ async def test_watchdog_homeassistant_api_reanimation_limit(
|
||||
HomeAssistantAPI, "check_api_state", return_value=False
|
||||
), patch.object(
|
||||
HomeAssistantCore, "restart", side_effect=(err := HomeAssistantError())
|
||||
) as restart:
|
||||
) as restart, patch.object(
|
||||
HomeAssistantCore, "rebuild", side_effect=err
|
||||
) as rebuild:
|
||||
for _ in range(5):
|
||||
await tasks._watchdog_homeassistant_api()
|
||||
restart.assert_not_called()
|
||||
|
||||
await tasks._watchdog_homeassistant_api()
|
||||
restart.assert_called_once()
|
||||
restart.assert_called_once_with()
|
||||
assert "Home Assistant watchdog reanimation failed!" in caplog.text
|
||||
|
||||
rebuild.assert_not_called()
|
||||
restart.reset_mock()
|
||||
|
||||
capture_exception.assert_called_once_with(err)
|
||||
|
||||
# Next time it should try safe mode
|
||||
caplog.clear()
|
||||
await tasks._watchdog_homeassistant_api()
|
||||
rebuild.assert_not_called()
|
||||
|
||||
await tasks._watchdog_homeassistant_api()
|
||||
|
||||
rebuild.assert_called_once_with(safe_mode=True)
|
||||
restart.assert_not_called()
|
||||
assert "Watchdog missed an Home Assistant Core API response." not in caplog.text
|
||||
assert "Watchdog found a problem with Home Assistant API!" not in caplog.text
|
||||
assert (
|
||||
"Watchdog cannot reanimate Home Assistant Core, failed all 5 attempts."
|
||||
"Watchdog cannot reanimate Home Assistant Core, failed all 5 attempts. Restarting into safe mode"
|
||||
in caplog.text
|
||||
)
|
||||
assert (
|
||||
"Safe mode restart failed. Watchdog cannot bring Home Assistant online."
|
||||
in caplog.text
|
||||
)
|
||||
|
||||
# After safe mode has failed too, no more restart attempts
|
||||
rebuild.reset_mock()
|
||||
caplog.clear()
|
||||
await tasks._watchdog_homeassistant_api()
|
||||
assert "Watchdog missed an Home Assistant Core API response." in caplog.text
|
||||
|
||||
caplog.clear()
|
||||
await tasks._watchdog_homeassistant_api()
|
||||
restart.assert_not_called()
|
||||
assert not caplog.text
|
||||
restart.assert_not_called()
|
||||
rebuild.assert_not_called()
|
||||
|
Loading…
x
Reference in New Issue
Block a user