From b7ddfba71d35469c84e99c61075c236966bf23ff Mon Sep 17 00:00:00 2001
From: Mike Degatano
Date: Thu, 21 Dec 2023 10:44:39 -0500
Subject: [PATCH] Set max reanimation attempts on HA watchdog (#4784)

---
 supervisor/misc/tasks.py |  20 +++++-
 tests/misc/test_tasks.py | 133 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100644 tests/misc/test_tasks.py

diff --git a/supervisor/misc/tasks.py b/supervisor/misc/tasks.py
index 4333821c5..50497c580 100644
--- a/supervisor/misc/tasks.py
+++ b/supervisor/misc/tasks.py
@@ -15,6 +15,8 @@ from ..utils.sentry import capture_exception
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
 HASS_WATCHDOG_API = "HASS_WATCHDOG_API"
+HASS_WATCHDOG_REANIMATE_FAILURES = "HASS_WATCHDOG_REANIMATE_FAILURES"
+HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS = 5
 
 RUN_UPDATE_SUPERVISOR = 29100
 RUN_UPDATE_ADDONS = 57600
@@ -154,6 +156,18 @@ class Tasks(CoreSysAttributes):
             return
         if await self.sys_homeassistant.api.check_api_state():
             # Home Assistant is running properly
+            self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0
+            return
+
+        # Give up after 5 reanimation failures in a row. Supervisor cannot fix this issue.
+        reanimate_fails = self._cache.get(HASS_WATCHDOG_REANIMATE_FAILURES, 0)
+        if reanimate_fails >= HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
+            if reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS:
+                _LOGGER.critical(
+                    "Watchdog cannot reanimate Home Assistant, failed all %s attempts.",
+                    reanimate_fails,
+                )
+                self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] += 1
             return
 
         # Init cache data
@@ -171,7 +185,11 @@ class Tasks(CoreSysAttributes):
             await self.sys_homeassistant.core.restart()
         except HomeAssistantError as err:
             _LOGGER.error("Home Assistant watchdog reanimation failed!")
-            capture_exception(err)
+            if reanimate_fails == 0:
+                capture_exception(err)
+            self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = reanimate_fails + 1
+        else:
+            self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0
         finally:
             self._cache[HASS_WATCHDOG_API] = 0
 
diff --git a/tests/misc/test_tasks.py b/tests/misc/test_tasks.py
new file mode 100644
index 000000000..feff83403
--- /dev/null
+++ b/tests/misc/test_tasks.py
@@ -0,0 +1,133 @@
+"""Test scheduled tasks."""
+
+from unittest.mock import MagicMock, Mock, patch
+
+from awesomeversion import AwesomeVersion
+import pytest
+
+from supervisor.coresys import CoreSys
+from supervisor.exceptions import HomeAssistantError
+from supervisor.homeassistant.api import HomeAssistantAPI
+from supervisor.homeassistant.const import LANDINGPAGE
+from supervisor.homeassistant.core import HomeAssistantCore
+from supervisor.misc.tasks import Tasks
+
+# pylint: disable=protected-access
+
+
+@pytest.fixture(name="tasks")
+async def fixture_tasks(coresys: CoreSys, container: MagicMock) -> Tasks:
+    """Return task manager."""
+    coresys.homeassistant.watchdog = True
+    coresys.homeassistant.version = AwesomeVersion("2023.12.0")
+    container.status = "running"
+    yield Tasks(coresys)
+
+
+async def test_watchdog_homeassistant_api(
+    tasks: Tasks, caplog: pytest.LogCaptureFixture
+):
+    """Test watchdog of homeassistant api."""
+    with patch.object(
+        HomeAssistantAPI, "check_api_state", return_value=False
+    ), patch.object(HomeAssistantCore, "restart") as restart:
+        await tasks._watchdog_homeassistant_api()
+
+        restart.assert_not_called()
+        assert "Watchdog miss API response from Home Assistant" in caplog.text
+        assert "Watchdog found a problem with Home Assistant API!" not in caplog.text
+
+        caplog.clear()
+        await tasks._watchdog_homeassistant_api()
+
+        restart.assert_called_once()
+        assert "Watchdog miss API response from Home Assistant" not in caplog.text
+        assert "Watchdog found a problem with Home Assistant API!" in caplog.text
+
+
+async def test_watchdog_homeassistant_api_off(tasks: Tasks, coresys: CoreSys):
+    """Test watchdog of homeassistant api does not run when disabled."""
+    coresys.homeassistant.watchdog = False
+
+    with patch.object(
+        HomeAssistantAPI, "check_api_state", return_value=False
+    ), patch.object(HomeAssistantCore, "restart") as restart:
+        await tasks._watchdog_homeassistant_api()
+        await tasks._watchdog_homeassistant_api()
+        restart.assert_not_called()
+
+
+async def test_watchdog_homeassistant_api_error_state(tasks: Tasks, coresys: CoreSys):
+    """Test watchdog of homeassistant api does not restart when in error state."""
+    coresys.homeassistant.core._error_state = True
+
+    with patch.object(
+        HomeAssistantAPI, "check_api_state", return_value=False
+    ), patch.object(HomeAssistantCore, "restart") as restart:
+        await tasks._watchdog_homeassistant_api()
+        await tasks._watchdog_homeassistant_api()
+        restart.assert_not_called()
+
+
+async def test_watchdog_homeassistant_api_landing_page(tasks: Tasks, coresys: CoreSys):
+    """Test watchdog of homeassistant api does not monitor landing page."""
+    coresys.homeassistant.version = LANDINGPAGE
+
+    with patch.object(
+        HomeAssistantAPI, "check_api_state", return_value=False
+    ), patch.object(HomeAssistantCore, "restart") as restart:
+        await tasks._watchdog_homeassistant_api()
+        await tasks._watchdog_homeassistant_api()
+        restart.assert_not_called()
+
+
+async def test_watchdog_homeassistant_api_not_running(
+    tasks: Tasks, container: MagicMock
+):
+    """Test watchdog of homeassistant api does not monitor when home assistant not running."""
+    container.status = "stopped"
+
+    with patch.object(
+        HomeAssistantAPI, "check_api_state", return_value=False
+    ), patch.object(HomeAssistantCore, "restart") as restart:
+        await tasks._watchdog_homeassistant_api()
+        await tasks._watchdog_homeassistant_api()
+        restart.assert_not_called()
+
+
+async def test_watchdog_homeassistant_api_reanimation_limit(
+    tasks: Tasks, caplog: pytest.LogCaptureFixture, capture_exception: Mock
+):
+    """Test watchdog of homeassistant api stops after max reanimation failures."""
+    with patch.object(
+        HomeAssistantAPI, "check_api_state", return_value=False
+    ), patch.object(
+        HomeAssistantCore, "restart", side_effect=(err := HomeAssistantError())
+    ) as restart:
+        for _ in range(5):
+            await tasks._watchdog_homeassistant_api()
+            restart.assert_not_called()
+
+            await tasks._watchdog_homeassistant_api()
+            restart.assert_called_once()
+            assert "Home Assistant watchdog reanimation failed!" in caplog.text
+
+            restart.reset_mock()
+
+        capture_exception.assert_called_once_with(err)
+
+        caplog.clear()
+        await tasks._watchdog_homeassistant_api()
+
+        restart.assert_not_called()
+        assert "Watchdog miss API response from Home Assistant" not in caplog.text
+        assert "Watchdog found a problem with Home Assistant API!" not in caplog.text
+        assert (
+            "Watchdog cannot reanimate Home Assistant, failed all 5 attempts."
+            in caplog.text
+        )
+
+        caplog.clear()
+        await tasks._watchdog_homeassistant_api()
+        restart.assert_not_called()
+        assert not caplog.text