Set max reanimation attempts on HA watchdog (#4784)

This commit is contained in:
Mike Degatano 2023-12-21 10:44:39 -05:00 committed by GitHub
parent 32f21d208f
commit b7ddfba71d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 152 additions and 1 deletions

View File

@ -15,6 +15,8 @@ from ..utils.sentry import capture_exception
_LOGGER: logging.Logger = logging.getLogger(__name__) _LOGGER: logging.Logger = logging.getLogger(__name__)
HASS_WATCHDOG_API = "HASS_WATCHDOG_API" HASS_WATCHDOG_API = "HASS_WATCHDOG_API"
HASS_WATCHDOG_REANIMATE_FAILURES = "HASS_WATCHDOG_REANIMATE_FAILURES"
HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS = 5
RUN_UPDATE_SUPERVISOR = 29100 RUN_UPDATE_SUPERVISOR = 29100
RUN_UPDATE_ADDONS = 57600 RUN_UPDATE_ADDONS = 57600
@ -154,6 +156,18 @@ class Tasks(CoreSysAttributes):
return return
if await self.sys_homeassistant.api.check_api_state(): if await self.sys_homeassistant.api.check_api_state():
# Home Assistant is running properly # Home Assistant is running properly
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0
return
# Give up after 5 reanimation failures in a row. Supervisor cannot fix this issue.
reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
if reanimate_fails >= HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
if reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
_LOGGER.critical(
"Watchdog cannot reanimate Home Assistant, failed all %s attempts.",
reanimate_fails,
)
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] += 1
return return
# Init cache data # Init cache data
@ -171,7 +185,11 @@ class Tasks(CoreSysAttributes):
await self.sys_homeassistant.core.restart() await self.sys_homeassistant.core.restart()
except HomeAssistantError as err: except HomeAssistantError as err:
_LOGGER.error("Home Assistant watchdog reanimation failed!") _LOGGER.error("Home Assistant watchdog reanimation failed!")
if reanimate_fails == 0:
capture_exception(err) capture_exception(err)
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = reanimate_fails + 1
else:
self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0
finally: finally:
self._cache[HASS_WATCHDOG_API] = 0 self._cache[HASS_WATCHDOG_API] = 0

133
tests/misc/test_tasks.py Normal file
View File

@ -0,0 +1,133 @@
"""Test scheduled tasks."""
from unittest.mock import MagicMock, Mock, patch
from awesomeversion import AwesomeVersion
import pytest
from supervisor.coresys import CoreSys
from supervisor.exceptions import HomeAssistantError
from supervisor.homeassistant.api import HomeAssistantAPI
from supervisor.homeassistant.const import LANDINGPAGE
from supervisor.homeassistant.core import HomeAssistantCore
from supervisor.misc.tasks import Tasks
# pylint: disable=protected-access
@pytest.fixture(name="tasks")
async def fixture_tasks(coresys: CoreSys, container: MagicMock) -> Tasks:
    """Yield a task manager built on a watchdog-enabled Home Assistant.

    Before the ``Tasks`` instance is created, the fixture enables the Home
    Assistant watchdog, pins the version to a regular release and marks the
    mocked container as running. Individual tests override one of these
    settings to exercise a specific early-exit path of the watchdog.
    """
    homeassistant = coresys.homeassistant
    homeassistant.watchdog = True
    homeassistant.version = AwesomeVersion("2023.12.0")
    container.status = "running"

    manager = Tasks(coresys)
    yield manager
async def test_watchdog_homeassistant_api(
    tasks: Tasks, caplog: pytest.LogCaptureFixture
):
    """Check the API watchdog restarts Home Assistant on the second miss.

    With the API check patched to always fail, the first watchdog run must
    only log a missed response; the second run must log the problem and call
    ``restart`` exactly once.
    """
    miss_message = "Watchdog miss API response from Home Assistant"
    problem_message = "Watchdog found a problem with Home Assistant API!"

    api_down = patch.object(HomeAssistantAPI, "check_api_state", return_value=False)
    restart_patch = patch.object(HomeAssistantCore, "restart")

    with api_down, restart_patch as restart_mock:
        # First failed check: a miss is recorded, nothing is restarted yet.
        await tasks._watchdog_homeassistant_api()
        restart_mock.assert_not_called()
        assert miss_message in caplog.text
        assert problem_message not in caplog.text

        # Second failed check in a row: the watchdog attempts a restart.
        caplog.clear()
        await tasks._watchdog_homeassistant_api()
        restart_mock.assert_called_once()
        assert miss_message not in caplog.text
        assert problem_message in caplog.text
async def test_watchdog_homeassistant_api_off(tasks: Tasks, coresys: CoreSys):
    """Check the API watchdog never restarts when the watchdog is disabled."""
    coresys.homeassistant.watchdog = False

    api_down = patch.object(HomeAssistantAPI, "check_api_state", return_value=False)
    restart_patch = patch.object(HomeAssistantCore, "restart")

    with api_down, restart_patch as restart_mock:
        # Two runs would normally be enough to trigger a restart.
        await tasks._watchdog_homeassistant_api()
        await tasks._watchdog_homeassistant_api()
        restart_mock.assert_not_called()
async def test_watchdog_homeassistant_api_error_state(tasks: Tasks, coresys: CoreSys):
    """Check the API watchdog never restarts while core is in error state."""
    coresys.homeassistant.core._error_state = True

    api_down = patch.object(HomeAssistantAPI, "check_api_state", return_value=False)
    restart_patch = patch.object(HomeAssistantCore, "restart")

    with api_down, restart_patch as restart_mock:
        # Two runs would normally be enough to trigger a restart.
        await tasks._watchdog_homeassistant_api()
        await tasks._watchdog_homeassistant_api()
        restart_mock.assert_not_called()
async def test_watchdog_homeassistant_api_landing_page(tasks: Tasks, coresys: CoreSys):
    """Check the API watchdog ignores the landing page version."""
    coresys.homeassistant.version = LANDINGPAGE

    api_down = patch.object(HomeAssistantAPI, "check_api_state", return_value=False)
    restart_patch = patch.object(HomeAssistantCore, "restart")

    with api_down, restart_patch as restart_mock:
        # Two runs would normally be enough to trigger a restart.
        await tasks._watchdog_homeassistant_api()
        await tasks._watchdog_homeassistant_api()
        restart_mock.assert_not_called()
async def test_watchdog_homeassistant_api_not_running(
    tasks: Tasks, container: MagicMock
):
    """Check the API watchdog never restarts when the container is stopped."""
    container.status = "stopped"

    api_down = patch.object(HomeAssistantAPI, "check_api_state", return_value=False)
    restart_patch = patch.object(HomeAssistantCore, "restart")

    with api_down, restart_patch as restart_mock:
        # Two runs would normally be enough to trigger a restart.
        await tasks._watchdog_homeassistant_api()
        await tasks._watchdog_homeassistant_api()
        restart_mock.assert_not_called()
async def test_watchdog_homeassistant_api_reanimation_limit(
    tasks: Tasks, caplog: pytest.LogCaptureFixture, capture_exception: Mock
):
    """Check the API watchdog gives up after repeated reanimation failures.

    The API check always fails and every restart raises. Five cycles of
    "miss, then failed restart" are driven; only the first failure may be
    reported through ``capture_exception``. The next watchdog run must log
    the give-up message without restarting, and the run after that must stay
    completely silent.
    """
    miss_message = "Watchdog miss API response from Home Assistant"
    problem_message = "Watchdog found a problem with Home Assistant API!"
    give_up_message = "Watchdog cannot reanimate Home Assistant, failed all 5 attempts."

    api_down = patch.object(HomeAssistantAPI, "check_api_state", return_value=False)
    failing_restart = patch.object(
        HomeAssistantCore, "restart", side_effect=(err := HomeAssistantError())
    )

    with api_down, failing_restart as restart_mock:
        for _cycle in range(5):
            # First run of the cycle only records a missed API response.
            await tasks._watchdog_homeassistant_api()
            restart_mock.assert_not_called()

            # Second run attempts the restart, which raises.
            await tasks._watchdog_homeassistant_api()
            restart_mock.assert_called_once()
            assert "Home Assistant watchdog reanimation failed!" in caplog.text

            restart_mock.reset_mock()

        # Only the first of the five failures is captured.
        capture_exception.assert_called_once_with(err)

        # Limit reached: the watchdog logs that it gives up and does nothing else.
        caplog.clear()
        await tasks._watchdog_homeassistant_api()
        restart_mock.assert_not_called()
        assert miss_message not in caplog.text
        assert problem_message not in caplog.text
        assert give_up_message in caplog.text

        # Further runs are silent so the log is not flooded.
        caplog.clear()
        await tasks._watchdog_homeassistant_api()
        restart_mock.assert_not_called()
        assert not caplog.text