mirror of
https://github.com/home-assistant/supervisor.git
synced 2025-07-12 19:56:29 +00:00
Home Assistant watchdog attempts safe mode after max fails (#5124)
* Home Assistant watchdog attempts safe mode after max fails
* Remove duplicate line
* Refactor and logging change from feedback
* Update supervisor/misc/tasks.py
* Fix log text check in test

---------

Co-authored-by: Stefan Agner <stefan@agner.ch>
This commit is contained in:
parent
918fcb7d62
commit
1bb814b793
@ -174,17 +174,6 @@ class Tasks(CoreSysAttributes):
|
|||||||
self._cache[HASS_WATCHDOG_API_FAILURES] = 0
|
self._cache[HASS_WATCHDOG_API_FAILURES] = 0
|
||||||
return
|
return
|
||||||
|
|
||||||
# Give up after 5 reanimation failures in a row. Supervisor cannot fix this issue.
|
|
||||||
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
|
|
||||||
if reanimate_fails >= HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
|
||||||
if reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
|
||||||
_LOGGER.critical(
|
|
||||||
"Watchdog cannot reanimate Home Assistant Core, failed all %s attempts.",
|
|
||||||
reanimate_fails,
|
|
||||||
)
|
|
||||||
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] += 1
|
|
||||||
return
|
|
||||||
|
|
||||||
# Init cache data
|
# Init cache data
|
||||||
api_fails = self._cache.get(HASS_WATCHDOG_API_FAILURES, 0)
|
api_fails = self._cache.get(HASS_WATCHDOG_API_FAILURES, 0)
|
||||||
|
|
||||||
@ -195,16 +184,38 @@ class Tasks(CoreSysAttributes):
|
|||||||
_LOGGER.warning("Watchdog missed an Home Assistant Core API response.")
|
_LOGGER.warning("Watchdog missed an Home Assistant Core API response.")
|
||||||
return
|
return
|
||||||
|
|
||||||
_LOGGER.error(
|
# After 5 reanimation attempts switch to safe mode. If that fails, give up
|
||||||
"Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core API!",
|
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
|
||||||
HASS_WATCHDOG_MAX_API_ATTEMPTS,
|
if reanimate_fails > HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
||||||
)
|
return
|
||||||
|
|
||||||
|
if safe_mode := reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
|
||||||
|
_LOGGER.critical(
|
||||||
|
"Watchdog cannot reanimate Home Assistant Core, failed all %s attempts. Restarting into safe mode",
|
||||||
|
reanimate_fails,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
_LOGGER.error(
|
||||||
|
"Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core!",
|
||||||
|
HASS_WATCHDOG_MAX_API_ATTEMPTS,
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await self.sys_homeassistant.core.restart()
|
if safe_mode:
|
||||||
|
await self.sys_homeassistant.core.rebuild(safe_mode=True)
|
||||||
|
else:
|
||||||
|
await self.sys_homeassistant.core.restart()
|
||||||
except HomeAssistantError as err:
|
except HomeAssistantError as err:
|
||||||
_LOGGER.error("Home Assistant watchdog reanimation failed!")
|
if reanimate_fails == 0 or safe_mode:
|
||||||
if reanimate_fails == 0:
|
|
||||||
capture_exception(err)
|
capture_exception(err)
|
||||||
|
|
||||||
|
if safe_mode:
|
||||||
|
_LOGGER.critical(
|
||||||
|
"Safe mode restart failed. Watchdog cannot bring Home Assistant online."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
_LOGGER.error("Home Assistant watchdog reanimation failed!")
|
||||||
|
|
||||||
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = reanimate_fails + 1
|
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = reanimate_fails + 1
|
||||||
else:
|
else:
|
||||||
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0
|
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0
|
||||||
|
@ -46,7 +46,7 @@ async def test_watchdog_homeassistant_api(
|
|||||||
restart.assert_called_once()
|
restart.assert_called_once()
|
||||||
assert "Watchdog missed an Home Assistant Core API response." not in caplog.text
|
assert "Watchdog missed an Home Assistant Core API response." not in caplog.text
|
||||||
assert (
|
assert (
|
||||||
"Watchdog missed 2 Home Assistant Core API responses in a row. Restarting Home Assistant Core API!"
|
"Watchdog missed 2 Home Assistant Core API responses in a row. Restarting Home Assistant Core!"
|
||||||
in caplog.text
|
in caplog.text
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -109,31 +109,48 @@ async def test_watchdog_homeassistant_api_reanimation_limit(
|
|||||||
HomeAssistantAPI, "check_api_state", return_value=False
|
HomeAssistantAPI, "check_api_state", return_value=False
|
||||||
), patch.object(
|
), patch.object(
|
||||||
HomeAssistantCore, "restart", side_effect=(err := HomeAssistantError())
|
HomeAssistantCore, "restart", side_effect=(err := HomeAssistantError())
|
||||||
) as restart:
|
) as restart, patch.object(
|
||||||
|
HomeAssistantCore, "rebuild", side_effect=err
|
||||||
|
) as rebuild:
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
await tasks._watchdog_homeassistant_api()
|
await tasks._watchdog_homeassistant_api()
|
||||||
restart.assert_not_called()
|
restart.assert_not_called()
|
||||||
|
|
||||||
await tasks._watchdog_homeassistant_api()
|
await tasks._watchdog_homeassistant_api()
|
||||||
restart.assert_called_once()
|
restart.assert_called_once_with()
|
||||||
assert "Home Assistant watchdog reanimation failed!" in caplog.text
|
assert "Home Assistant watchdog reanimation failed!" in caplog.text
|
||||||
|
|
||||||
|
rebuild.assert_not_called()
|
||||||
restart.reset_mock()
|
restart.reset_mock()
|
||||||
|
|
||||||
capture_exception.assert_called_once_with(err)
|
capture_exception.assert_called_once_with(err)
|
||||||
|
|
||||||
|
# Next time it should try safe mode
|
||||||
caplog.clear()
|
caplog.clear()
|
||||||
await tasks._watchdog_homeassistant_api()
|
await tasks._watchdog_homeassistant_api()
|
||||||
|
rebuild.assert_not_called()
|
||||||
|
|
||||||
|
await tasks._watchdog_homeassistant_api()
|
||||||
|
|
||||||
|
rebuild.assert_called_once_with(safe_mode=True)
|
||||||
restart.assert_not_called()
|
restart.assert_not_called()
|
||||||
assert "Watchdog missed an Home Assistant Core API response." not in caplog.text
|
|
||||||
assert "Watchdog found a problem with Home Assistant API!" not in caplog.text
|
|
||||||
assert (
|
assert (
|
||||||
"Watchdog cannot reanimate Home Assistant Core, failed all 5 attempts."
|
"Watchdog cannot reanimate Home Assistant Core, failed all 5 attempts. Restarting into safe mode"
|
||||||
|
in caplog.text
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
"Safe mode restart failed. Watchdog cannot bring Home Assistant online."
|
||||||
in caplog.text
|
in caplog.text
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# After safe mode has failed too, no more restart attempts
|
||||||
|
rebuild.reset_mock()
|
||||||
|
caplog.clear()
|
||||||
|
await tasks._watchdog_homeassistant_api()
|
||||||
|
assert "Watchdog missed an Home Assistant Core API response." in caplog.text
|
||||||
|
|
||||||
caplog.clear()
|
caplog.clear()
|
||||||
await tasks._watchdog_homeassistant_api()
|
await tasks._watchdog_homeassistant_api()
|
||||||
restart.assert_not_called()
|
|
||||||
assert not caplog.text
|
assert not caplog.text
|
||||||
|
restart.assert_not_called()
|
||||||
|
rebuild.assert_not_called()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user