Improve Supervisor startup error handling (#5918)

Instead of starting a task in the background, synchronously wait
for the Supervisor start sequence to complete. This should be
functionally equivalent, as we would loop forever in the event loop
right afterwards anyway.

The advantage is that we can now catch any exception raised during the
start sequence and log it at critical level, so that it gets reported
to Sentry, if enabled. It also avoids "Task exception was never
retrieved" errors. Reporting errors is especially important since we
can't use the asyncio Sentry integration (see #5729 for details).

Also handle early add-on start errors just like other add-on start
errors (make sure the finally block is executed as well). And finally,
register signal handlers synchronously. There is no real benefit in
registering them asynchronously, and registering them synchronously
avoids a potential race condition.
This commit is contained in:
Stefan Agner 2025-05-29 11:42:28 +02:00 committed by GitHub
parent 4d1a5e2dc2
commit 2a88cb9339
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 19 additions and 8 deletions

View File

@ -66,8 +66,15 @@ if __name__ == "__main__":
_LOGGER.info("Setting up Supervisor")
loop.run_until_complete(coresys.core.setup())
loop.call_soon_threadsafe(loop.create_task, coresys.core.start())
loop.call_soon_threadsafe(bootstrap.reg_signal, loop, coresys)
bootstrap.register_signal_handlers(loop, coresys)
try:
loop.run_until_complete(coresys.core.start())
except Exception as err: # pylint: disable=broad-except
# Supervisor itself is running at this point, just something didn't
# start as expected. Log with traceback to get more insights for
# such cases.
_LOGGER.critical("Supervisor start failed: %s", err, exc_info=True)
try:
_LOGGER.info("Running Supervisor")

View File

@ -1,6 +1,7 @@
"""Bootstrap Supervisor."""
# ruff: noqa: T100
import asyncio
from importlib import import_module
import logging
import os
@ -284,8 +285,8 @@ def check_environment() -> None:
_LOGGER.critical("Can't find Docker socket!")
def reg_signal(loop, coresys: CoreSys) -> None:
"""Register SIGTERM and SIGKILL to stop system."""
def register_signal_handlers(loop: asyncio.BaseEventLoop, coresys: CoreSys) -> None:
"""Register SIGTERM, SIGHUP and SIGKILL to stop the Supervisor."""
try:
loop.add_signal_handler(
signal.SIGTERM, lambda: loop.create_task(coresys.core.stop())

View File

@ -188,7 +188,10 @@ class Core(CoreSysAttributes):
await setup_task
except Exception as err: # pylint: disable=broad-except
_LOGGER.critical(
"Fatal error happening on load Task %s: %s", setup_task, err
"Fatal error happening on load Task %s: %s",
setup_task,
err,
exc_info=True,
)
self.sys_resolution.add_unhealthy_reason(UnhealthyReason.SETUP)
await async_capture_exception(err)
@ -237,10 +240,10 @@ class Core(CoreSysAttributes):
await self.sys_supervisor.update()
return
# Start addon mark as initialize
await self.sys_addons.boot(AddonStartup.INITIALIZE)
try:
# Start addon mark as initialize
await self.sys_addons.boot(AddonStartup.INITIALIZE)
# HomeAssistant is already running, only Supervisor restarted
if await self.sys_hardware.helper.last_boot() == self.sys_config.last_boot:
_LOGGER.info("Detected Supervisor restart")