Use longer timeouts for API checks before trigger a rollback (#4658)

* Don't check if Core is running to trigger rollback

Currently we check for Core API access and that the state is running. If
this is not fulfilled within 5 minutes, we rollback to the previous
version.

It can take quite a while until Home Assistant Core is in state running.
In fact, after going through bootstrap, it can theoretically take
indefinitely (as in there is no timeout from Core side).

So to trigger rollback, rather than check the state to be running, just
check if the API is accessible in this case. This prevents spurious
rollbacks.

* Check Core status with and timeout after a longer time

Instead of checking the Core API just for response, do check the
state. Use a timeout which is long enough to cover all stages and
other timeouts during Core startup.

* Introduce get_api_state and better status messages

* Update supervisor/homeassistant/api.py

Co-authored-by: J. Nick Koston <nick@koston.org>

* Add successful start test

---------

Co-authored-by: J. Nick Koston <nick@koston.org>
This commit is contained in:
Stefan Agner 2023-11-01 21:01:38 +01:00 committed by GitHub
parent a8f818fca5
commit 1e49129197
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 65 additions and 19 deletions

View File

@ -130,14 +130,14 @@ class HomeAssistantAPI(CoreSysAttributes):
"""Return Home Assistant core state."""
return await self._get_json("api/core/state")
async def check_api_state(self) -> bool:
"""Return True if Home Assistant up and running."""
async def get_api_state(self) -> str | None:
"""Return state of Home Assistant Core or None."""
# Skip check on landingpage
if (
self.sys_homeassistant.version is None
or self.sys_homeassistant.version == LANDINGPAGE
):
return False
return None
# Check if port is up
if not await self.sys_run_in_executor(
@ -145,7 +145,7 @@ class HomeAssistantAPI(CoreSysAttributes):
self.sys_homeassistant.ip_address,
self.sys_homeassistant.api_port,
):
return False
return None
# Check if API is up
with suppress(HomeAssistantAPIError):
@ -157,7 +157,13 @@ class HomeAssistantAPI(CoreSysAttributes):
else:
data = await self.get_config()
# Older versions of home assistant does not expose the state
if data and data.get("state", "RUNNING") == "RUNNING":
return True
if data:
return data.get("state", "RUNNING")
return None
async def check_api_state(self) -> bool:
"""Return Home Assistant Core state if up."""
if state := self.get_api_state():
return state == "RUNNING"
return False

View File

@ -44,7 +44,10 @@ from .const import (
_LOGGER: logging.Logger = logging.getLogger(__name__)
SECONDS_BETWEEN_API_CHECKS: Final[int] = 5
STARTUP_API_CHECK_TIMEOUT: Final[timedelta] = timedelta(minutes=5)
# Core Stage 1 and some wiggle room
STARTUP_API_RESPONSE_TIMEOUT: Final[timedelta] = timedelta(minutes=3)
# All stages plus event start timeout and some wiggle rooom
STARTUP_API_CHECK_RUNNING_TIMEOUT: Final[timedelta] = timedelta(minutes=15)
RE_YAML_ERROR = re.compile(r"homeassistant\.util\.yaml")
@ -440,8 +443,9 @@ class HomeAssistantCore(JobGroup):
return
_LOGGER.info("Wait until Home Assistant is ready")
start = datetime.now()
while not (timeout := datetime.now() >= start + STARTUP_API_CHECK_TIMEOUT):
deadline = datetime.now() + STARTUP_API_RESPONSE_TIMEOUT
last_state = None
while not (timeout := datetime.now() >= deadline):
await asyncio.sleep(SECONDS_BETWEEN_API_CHECKS)
# 1: Check if Container is is_running
@ -449,8 +453,18 @@ class HomeAssistantCore(JobGroup):
_LOGGER.error("Home Assistant has crashed!")
break
# 2: Check if API response
if await self.sys_homeassistant.api.check_api_state():
# 2: Check API response
if state := await self.sys_homeassistant.api.get_api_state():
if last_state is None:
# API initially available, move deadline up and check API
# state to be running now
deadline = datetime.now() + STARTUP_API_CHECK_RUNNING_TIMEOUT
if last_state != state:
_LOGGER.info("Home Assistant Core state changed to %s", state)
last_state = state
if state == "RUNNING":
_LOGGER.info("Detect a running Home Assistant instance")
self._error_state = False
return
@ -458,7 +472,7 @@ class HomeAssistantCore(JobGroup):
self._error_state = True
if timeout:
raise HomeAssistantStartupTimeout(
"No API response in 5 minutes, assuming core has had a fatal startup error",
"No Home Assistant Core response, assuming a fatal startup error",
_LOGGER.error,
)
raise HomeAssistantCrashError()

View File

@ -354,6 +354,7 @@ async def coresys(
# WebSocket
coresys_obj.homeassistant.api.check_api_state = AsyncMock(return_value=True)
coresys_obj.homeassistant.api.get_api_state = AsyncMock(return_value="RUNNING")
coresys_obj.homeassistant._websocket._client = AsyncMock(
ha_version=AwesomeVersion("2021.2.4")
)

View File

@ -275,7 +275,7 @@ async def test_api_check_timeout(
"""Test attempts to contact the API timeout."""
container.status = "stopped"
coresys.homeassistant.version = AwesomeVersion("2023.9.0")
coresys.homeassistant.api.check_api_state.return_value = False
coresys.homeassistant.api.get_api_state.return_value = None
async def mock_instance_start(*_):
container.status = "running"
@ -294,8 +294,33 @@ async def test_api_check_timeout(
), pytest.raises(HomeAssistantCrashError):
await coresys.homeassistant.core.start()
assert coresys.homeassistant.api.check_api_state.call_count == 5
assert coresys.homeassistant.api.get_api_state.call_count == 3
assert (
"No API response in 5 minutes, assuming core has had a fatal startup error"
in caplog.text
"No Home Assistant Core response, assuming a fatal startup error" in caplog.text
)
async def test_api_check_success(
coresys: CoreSys, container: MagicMock, caplog: pytest.LogCaptureFixture
):
"""Test attempts to contact the API timeout."""
container.status = "stopped"
coresys.homeassistant.version = AwesomeVersion("2023.9.0")
async def mock_instance_start(*_):
container.status = "running"
with patch.object(
DockerHomeAssistant, "start", new=mock_instance_start
), patch.object(DockerAPI, "container_is_initialized", return_value=True), travel(
datetime(2023, 10, 2, 0, 0, 0), tick=False
) as traveller:
async def mock_sleep(*args):
traveller.shift(timedelta(minutes=1))
with patch("supervisor.homeassistant.core.asyncio.sleep", new=mock_sleep):
await coresys.homeassistant.core.start()
assert coresys.homeassistant.api.get_api_state.call_count == 1
assert "Detect a running Home Assistant instance" in caplog.text