mirror of
https://github.com/home-assistant/supervisor.git
synced 2025-07-10 18:56:30 +00:00
Use longer timeouts for API checks before trigger a rollback (#4658)
* Don't check if Core is running to trigger rollback Currently we check for Core API access and that the state is running. If this is not fulfilled within 5 minutes, we rollback to the previous version. It can take quite a while until Home Assistant Core is in state running. In fact, after going through bootstrap, it can theoretically take indefinitely (as in there is no timeout from Core side). So to trigger rollback, rather than check the state to be running, just check if the API is accessible in this case. This prevents spurious rollbacks. * Check Core status with and timeout after a longer time Instead of checking the Core API just for response, do check the state. Use a timeout which is long enough to cover all stages and other timeouts during Core startup. * Introduce get_api_state and better status messages * Update supervisor/homeassistant/api.py Co-authored-by: J. Nick Koston <nick@koston.org> * Add successful start test --------- Co-authored-by: J. Nick Koston <nick@koston.org>
This commit is contained in:
parent
a8f818fca5
commit
1e49129197
@ -130,14 +130,14 @@ class HomeAssistantAPI(CoreSysAttributes):
|
|||||||
"""Return Home Assistant core state."""
|
"""Return Home Assistant core state."""
|
||||||
return await self._get_json("api/core/state")
|
return await self._get_json("api/core/state")
|
||||||
|
|
||||||
async def check_api_state(self) -> bool:
|
async def get_api_state(self) -> str | None:
|
||||||
"""Return True if Home Assistant up and running."""
|
"""Return state of Home Assistant Core or None."""
|
||||||
# Skip check on landingpage
|
# Skip check on landingpage
|
||||||
if (
|
if (
|
||||||
self.sys_homeassistant.version is None
|
self.sys_homeassistant.version is None
|
||||||
or self.sys_homeassistant.version == LANDINGPAGE
|
or self.sys_homeassistant.version == LANDINGPAGE
|
||||||
):
|
):
|
||||||
return False
|
return None
|
||||||
|
|
||||||
# Check if port is up
|
# Check if port is up
|
||||||
if not await self.sys_run_in_executor(
|
if not await self.sys_run_in_executor(
|
||||||
@ -145,7 +145,7 @@ class HomeAssistantAPI(CoreSysAttributes):
|
|||||||
self.sys_homeassistant.ip_address,
|
self.sys_homeassistant.ip_address,
|
||||||
self.sys_homeassistant.api_port,
|
self.sys_homeassistant.api_port,
|
||||||
):
|
):
|
||||||
return False
|
return None
|
||||||
|
|
||||||
# Check if API is up
|
# Check if API is up
|
||||||
with suppress(HomeAssistantAPIError):
|
with suppress(HomeAssistantAPIError):
|
||||||
@ -157,7 +157,13 @@ class HomeAssistantAPI(CoreSysAttributes):
|
|||||||
else:
|
else:
|
||||||
data = await self.get_config()
|
data = await self.get_config()
|
||||||
# Older versions of home assistant does not expose the state
|
# Older versions of home assistant does not expose the state
|
||||||
if data and data.get("state", "RUNNING") == "RUNNING":
|
if data:
|
||||||
return True
|
return data.get("state", "RUNNING")
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def check_api_state(self) -> bool:
|
||||||
|
"""Return Home Assistant Core state if up."""
|
||||||
|
if state := self.get_api_state():
|
||||||
|
return state == "RUNNING"
|
||||||
return False
|
return False
|
||||||
|
@ -44,7 +44,10 @@ from .const import (
|
|||||||
_LOGGER: logging.Logger = logging.getLogger(__name__)
|
_LOGGER: logging.Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
SECONDS_BETWEEN_API_CHECKS: Final[int] = 5
|
SECONDS_BETWEEN_API_CHECKS: Final[int] = 5
|
||||||
STARTUP_API_CHECK_TIMEOUT: Final[timedelta] = timedelta(minutes=5)
|
# Core Stage 1 and some wiggle room
|
||||||
|
STARTUP_API_RESPONSE_TIMEOUT: Final[timedelta] = timedelta(minutes=3)
|
||||||
|
# All stages plus event start timeout and some wiggle rooom
|
||||||
|
STARTUP_API_CHECK_RUNNING_TIMEOUT: Final[timedelta] = timedelta(minutes=15)
|
||||||
RE_YAML_ERROR = re.compile(r"homeassistant\.util\.yaml")
|
RE_YAML_ERROR = re.compile(r"homeassistant\.util\.yaml")
|
||||||
|
|
||||||
|
|
||||||
@ -440,8 +443,9 @@ class HomeAssistantCore(JobGroup):
|
|||||||
return
|
return
|
||||||
_LOGGER.info("Wait until Home Assistant is ready")
|
_LOGGER.info("Wait until Home Assistant is ready")
|
||||||
|
|
||||||
start = datetime.now()
|
deadline = datetime.now() + STARTUP_API_RESPONSE_TIMEOUT
|
||||||
while not (timeout := datetime.now() >= start + STARTUP_API_CHECK_TIMEOUT):
|
last_state = None
|
||||||
|
while not (timeout := datetime.now() >= deadline):
|
||||||
await asyncio.sleep(SECONDS_BETWEEN_API_CHECKS)
|
await asyncio.sleep(SECONDS_BETWEEN_API_CHECKS)
|
||||||
|
|
||||||
# 1: Check if Container is is_running
|
# 1: Check if Container is is_running
|
||||||
@ -449,16 +453,26 @@ class HomeAssistantCore(JobGroup):
|
|||||||
_LOGGER.error("Home Assistant has crashed!")
|
_LOGGER.error("Home Assistant has crashed!")
|
||||||
break
|
break
|
||||||
|
|
||||||
# 2: Check if API response
|
# 2: Check API response
|
||||||
if await self.sys_homeassistant.api.check_api_state():
|
if state := await self.sys_homeassistant.api.get_api_state():
|
||||||
_LOGGER.info("Detect a running Home Assistant instance")
|
if last_state is None:
|
||||||
self._error_state = False
|
# API initially available, move deadline up and check API
|
||||||
return
|
# state to be running now
|
||||||
|
deadline = datetime.now() + STARTUP_API_CHECK_RUNNING_TIMEOUT
|
||||||
|
|
||||||
|
if last_state != state:
|
||||||
|
_LOGGER.info("Home Assistant Core state changed to %s", state)
|
||||||
|
last_state = state
|
||||||
|
|
||||||
|
if state == "RUNNING":
|
||||||
|
_LOGGER.info("Detect a running Home Assistant instance")
|
||||||
|
self._error_state = False
|
||||||
|
return
|
||||||
|
|
||||||
self._error_state = True
|
self._error_state = True
|
||||||
if timeout:
|
if timeout:
|
||||||
raise HomeAssistantStartupTimeout(
|
raise HomeAssistantStartupTimeout(
|
||||||
"No API response in 5 minutes, assuming core has had a fatal startup error",
|
"No Home Assistant Core response, assuming a fatal startup error",
|
||||||
_LOGGER.error,
|
_LOGGER.error,
|
||||||
)
|
)
|
||||||
raise HomeAssistantCrashError()
|
raise HomeAssistantCrashError()
|
||||||
|
@ -354,6 +354,7 @@ async def coresys(
|
|||||||
|
|
||||||
# WebSocket
|
# WebSocket
|
||||||
coresys_obj.homeassistant.api.check_api_state = AsyncMock(return_value=True)
|
coresys_obj.homeassistant.api.check_api_state = AsyncMock(return_value=True)
|
||||||
|
coresys_obj.homeassistant.api.get_api_state = AsyncMock(return_value="RUNNING")
|
||||||
coresys_obj.homeassistant._websocket._client = AsyncMock(
|
coresys_obj.homeassistant._websocket._client = AsyncMock(
|
||||||
ha_version=AwesomeVersion("2021.2.4")
|
ha_version=AwesomeVersion("2021.2.4")
|
||||||
)
|
)
|
||||||
|
@ -275,7 +275,7 @@ async def test_api_check_timeout(
|
|||||||
"""Test attempts to contact the API timeout."""
|
"""Test attempts to contact the API timeout."""
|
||||||
container.status = "stopped"
|
container.status = "stopped"
|
||||||
coresys.homeassistant.version = AwesomeVersion("2023.9.0")
|
coresys.homeassistant.version = AwesomeVersion("2023.9.0")
|
||||||
coresys.homeassistant.api.check_api_state.return_value = False
|
coresys.homeassistant.api.get_api_state.return_value = None
|
||||||
|
|
||||||
async def mock_instance_start(*_):
|
async def mock_instance_start(*_):
|
||||||
container.status = "running"
|
container.status = "running"
|
||||||
@ -294,8 +294,33 @@ async def test_api_check_timeout(
|
|||||||
), pytest.raises(HomeAssistantCrashError):
|
), pytest.raises(HomeAssistantCrashError):
|
||||||
await coresys.homeassistant.core.start()
|
await coresys.homeassistant.core.start()
|
||||||
|
|
||||||
assert coresys.homeassistant.api.check_api_state.call_count == 5
|
assert coresys.homeassistant.api.get_api_state.call_count == 3
|
||||||
assert (
|
assert (
|
||||||
"No API response in 5 minutes, assuming core has had a fatal startup error"
|
"No Home Assistant Core response, assuming a fatal startup error" in caplog.text
|
||||||
in caplog.text
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def test_api_check_success(
|
||||||
|
coresys: CoreSys, container: MagicMock, caplog: pytest.LogCaptureFixture
|
||||||
|
):
|
||||||
|
"""Test attempts to contact the API timeout."""
|
||||||
|
container.status = "stopped"
|
||||||
|
coresys.homeassistant.version = AwesomeVersion("2023.9.0")
|
||||||
|
|
||||||
|
async def mock_instance_start(*_):
|
||||||
|
container.status = "running"
|
||||||
|
|
||||||
|
with patch.object(
|
||||||
|
DockerHomeAssistant, "start", new=mock_instance_start
|
||||||
|
), patch.object(DockerAPI, "container_is_initialized", return_value=True), travel(
|
||||||
|
datetime(2023, 10, 2, 0, 0, 0), tick=False
|
||||||
|
) as traveller:
|
||||||
|
|
||||||
|
async def mock_sleep(*args):
|
||||||
|
traveller.shift(timedelta(minutes=1))
|
||||||
|
|
||||||
|
with patch("supervisor.homeassistant.core.asyncio.sleep", new=mock_sleep):
|
||||||
|
await coresys.homeassistant.core.start()
|
||||||
|
|
||||||
|
assert coresys.homeassistant.api.get_api_state.call_count == 1
|
||||||
|
assert "Detect a running Home Assistant instance" in caplog.text
|
||||||
|
Loading…
x
Reference in New Issue
Block a user