Watchdog addon on successful but unexpected exit (#3815)

This commit is contained in:
Mike Degatano 2022-08-22 20:29:27 -04:00 committed by GitHub
parent 024b813865
commit f4811a0243
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 123 additions and 126 deletions

View File

@ -112,6 +112,9 @@ class Addon(AddonModel):
super().__init__(coresys, slug) super().__init__(coresys, slug)
self.instance: DockerAddon = DockerAddon(coresys, self) self.instance: DockerAddon = DockerAddon(coresys, self)
self._state: AddonState = AddonState.UNKNOWN self._state: AddonState = AddonState.UNKNOWN
self._manual_stop: bool = (
self.sys_hardware.helper.last_boot == self.sys_config.last_boot
)
@Job( @Job(
name=f"addon_{slug}_restart_after_problem", name=f"addon_{slug}_restart_after_problem",
@ -682,6 +685,7 @@ class Addon(AddonModel):
async def stop(self) -> None: async def stop(self) -> None:
"""Stop add-on.""" """Stop add-on."""
self._manual_stop = True
try: try:
await self.instance.stop() await self.instance.stop()
except DockerError as err: except DockerError as err:
@ -950,6 +954,7 @@ class Addon(AddonModel):
ContainerState.HEALTHY, ContainerState.HEALTHY,
ContainerState.UNHEALTHY, ContainerState.UNHEALTHY,
]: ]:
self._manual_stop = False
self.state = AddonState.STARTED self.state = AddonState.STARTED
elif event.state == ContainerState.STOPPED: elif event.state == ContainerState.STOPPED:
self.state = AddonState.STOPPED self.state = AddonState.STOPPED
@ -958,8 +963,15 @@ class Addon(AddonModel):
async def watchdog_container(self, event: DockerContainerStateEvent) -> None: async def watchdog_container(self, event: DockerContainerStateEvent) -> None:
"""Process state changes in addon container and restart if necessary.""" """Process state changes in addon container and restart if necessary."""
if not (event.name == self.instance.name and self.watchdog): if (
not (event.name == self.instance.name and self.watchdog)
or self._manual_stop
):
return return
if event.state in [ContainerState.FAILED, ContainerState.UNHEALTHY]: if event.state in [
ContainerState.FAILED,
ContainerState.STOPPED,
ContainerState.UNHEALTHY,
]:
await self._restart_after_problem(self, event.state) await self._restart_after_problem(self, event.state)

View File

@ -41,7 +41,7 @@ class HwHelper(CoreSysAttributes):
return bool(self.sys_hardware.filter_devices(subsystem=UdevSubsystem.USB)) return bool(self.sys_hardware.filter_devices(subsystem=UdevSubsystem.USB))
@property @property
def last_boot(self) -> str | None: def last_boot(self) -> datetime | None:
"""Return last boot time.""" """Return last boot time."""
try: try:
stats: str = _PROC_STAT.read_text(encoding="utf-8") stats: str = _PROC_STAT.read_text(encoding="utf-8")

View File

@ -1,6 +1,7 @@
"""Test Home Assistant Add-ons.""" """Test Home Assistant Add-ons."""
import asyncio import asyncio
from datetime import timedelta
from unittest.mock import MagicMock, PropertyMock, patch from unittest.mock import MagicMock, PropertyMock, patch
from docker.errors import DockerException from docker.errors import DockerException
@ -9,13 +10,38 @@ import pytest
from supervisor.addons.addon import Addon from supervisor.addons.addon import Addon
from supervisor.const import AddonState, BusEvent from supervisor.const import AddonState, BusEvent
from supervisor.coresys import CoreSys from supervisor.coresys import CoreSys
from supervisor.docker.addon import DockerAddon
from supervisor.docker.const import ContainerState from supervisor.docker.const import ContainerState
from supervisor.docker.monitor import DockerContainerStateEvent from supervisor.docker.monitor import DockerContainerStateEvent
from supervisor.exceptions import AddonsJobError, AudioUpdateError from supervisor.exceptions import AddonsJobError, AudioUpdateError
from supervisor.store.repository import Repository
from supervisor.utils.dt import utcnow
from ..const import TEST_ADDON_SLUG from ..const import TEST_ADDON_SLUG
def _fire_test_event(coresys: CoreSys, name: str, state: ContainerState):
    """Fire a docker container state change event on the bus for tests.

    Builds a ``DockerContainerStateEvent`` for the container called ``name``
    in the given ``state`` (with a fixed placeholder id and time) and fires it
    on ``coresys.bus`` as ``BusEvent.DOCKER_CONTAINER_STATE_CHANGE``.
    """
    event = DockerContainerStateEvent(name=name, state=state, id="abc123", time=1)
    coresys.bus.fire_event(BusEvent.DOCKER_CONTAINER_STATE_CHANGE, event)
async def mock_current_state(state: ContainerState) -> ContainerState:
    """Return ``state`` unchanged once awaited.

    Calling this yields a coroutine; the tests in this module pass that
    coroutine as the return value of a patched ``current_state``.
    """
    return state
async def mock_stop() -> None:
    """Do nothing once awaited; used as the result of a patched ``stop``."""
    return None
def test_options_merge(coresys: CoreSys, install_addon_ssh: Addon) -> None: def test_options_merge(coresys: CoreSys, install_addon_ssh: Addon) -> None:
"""Test options merge.""" """Test options merge."""
addon = coresys.addons.get(TEST_ADDON_SLUG) addon = coresys.addons.get(TEST_ADDON_SLUG)
@ -71,174 +97,107 @@ def test_options_merge(coresys: CoreSys, install_addon_ssh: Addon) -> None:
async def test_addon_state_listener(coresys: CoreSys, install_addon_ssh: Addon) -> None: async def test_addon_state_listener(coresys: CoreSys, install_addon_ssh: Addon) -> None:
"""Test addon is setting state from docker events.""" """Test addon is setting state from docker events."""
with patch.object(type(install_addon_ssh.instance), "attach"): with patch.object(DockerAddon, "attach"):
await install_addon_ssh.load() await install_addon_ssh.load()
assert install_addon_ssh.state == AddonState.UNKNOWN assert install_addon_ssh.state == AddonState.UNKNOWN
with patch.object(type(install_addon_ssh), "watchdog_container"): with patch.object(Addon, "watchdog_container"):
coresys.bus.fire_event( _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.RUNNING)
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.RUNNING,
id="abc123",
time=1,
),
)
await asyncio.sleep(0) await asyncio.sleep(0)
assert install_addon_ssh.state == AddonState.STARTED assert install_addon_ssh.state == AddonState.STARTED
coresys.bus.fire_event( _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED)
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.STOPPED,
id="abc123",
time=1,
),
)
await asyncio.sleep(0) await asyncio.sleep(0)
assert install_addon_ssh.state == AddonState.STOPPED assert install_addon_ssh.state == AddonState.STOPPED
coresys.bus.fire_event( _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.HEALTHY)
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.HEALTHY,
id="abc123",
time=1,
),
)
await asyncio.sleep(0) await asyncio.sleep(0)
assert install_addon_ssh.state == AddonState.STARTED assert install_addon_ssh.state == AddonState.STARTED
coresys.bus.fire_event( _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.FAILED)
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.FAILED,
id="abc123",
time=1,
),
)
await asyncio.sleep(0) await asyncio.sleep(0)
assert install_addon_ssh.state == AddonState.ERROR assert install_addon_ssh.state == AddonState.ERROR
# Test other addons are ignored # Test other addons are ignored
coresys.bus.fire_event( _fire_test_event(coresys, "addon_local_non_installed", ContainerState.RUNNING)
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name="addon_local_non_installed",
state=ContainerState.RUNNING,
id="abc123",
time=1,
),
)
await asyncio.sleep(0) await asyncio.sleep(0)
assert install_addon_ssh.state == AddonState.ERROR assert install_addon_ssh.state == AddonState.ERROR
async def mock_current_state(state: ContainerState) -> ContainerState:
"""Mock for current state method."""
return state
async def mock_stop() -> None:
"""Mock for stop method."""
async def test_addon_watchdog(coresys: CoreSys, install_addon_ssh: Addon) -> None: async def test_addon_watchdog(coresys: CoreSys, install_addon_ssh: Addon) -> None:
"""Test addon watchdog works correctly.""" """Test addon watchdog works correctly."""
with patch.object(type(install_addon_ssh.instance), "attach"): with patch.object(DockerAddon, "attach"):
await install_addon_ssh.load() await install_addon_ssh.load()
install_addon_ssh.watchdog = True install_addon_ssh.watchdog = True
with patch.object(Addon, "restart") as restart, patch.object( with patch.object(Addon, "restart") as restart, patch.object(
Addon, "start" Addon, "start"
) as start, patch.object( ) as start, patch.object(DockerAddon, "current_state") as current_state:
type(install_addon_ssh.instance), "current_state" # Restart if it becomes unhealthy
) as current_state:
current_state.return_value = mock_current_state(ContainerState.UNHEALTHY) current_state.return_value = mock_current_state(ContainerState.UNHEALTHY)
coresys.bus.fire_event( _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.UNHEALTHY)
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.UNHEALTHY,
id="abc123",
time=1,
),
)
await asyncio.sleep(0) await asyncio.sleep(0)
restart.assert_called_once() restart.assert_called_once()
start.assert_not_called() start.assert_not_called()
restart.reset_mock() restart.reset_mock()
current_state.return_value = mock_current_state(ContainerState.FAILED)
with patch.object( # Rebuild if it failed
type(install_addon_ssh.instance), "stop", return_value=mock_stop() current_state.return_value = mock_current_state(ContainerState.FAILED)
) as stop: with patch.object(DockerAddon, "stop", return_value=mock_stop()) as stop:
coresys.bus.fire_event( _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.FAILED)
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.FAILED,
id="abc123",
time=1,
),
)
await asyncio.sleep(0) await asyncio.sleep(0)
stop.assert_called_once_with(remove_container=True) stop.assert_called_once_with(remove_container=True)
restart.assert_not_called() restart.assert_not_called()
start.assert_called_once() start.assert_called_once()
start.reset_mock() start.reset_mock()
# Do not process event if container state has changed since fired # Do not process event if container state has changed since fired
current_state.return_value = mock_current_state(ContainerState.HEALTHY) current_state.return_value = mock_current_state(ContainerState.HEALTHY)
coresys.bus.fire_event( _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.FAILED)
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.FAILED,
id="abc123",
time=1,
),
)
await asyncio.sleep(0)
restart.assert_not_called()
start.assert_not_called()
# Do not restart when addon stopped normally
coresys.bus.fire_event(
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.STOPPED,
id="abc123",
time=1,
),
)
await asyncio.sleep(0) await asyncio.sleep(0)
restart.assert_not_called() restart.assert_not_called()
start.assert_not_called() start.assert_not_called()
# Other addons ignored # Other addons ignored
coresys.bus.fire_event( current_state.return_value = mock_current_state(ContainerState.UNHEALTHY)
BusEvent.DOCKER_CONTAINER_STATE_CHANGE, _fire_test_event(coresys, "addon_local_non_installed", ContainerState.UNHEALTHY)
DockerContainerStateEvent(
name="addon_local_non_installed",
state=ContainerState.UNHEALTHY,
id="abc123",
time=1,
),
)
await asyncio.sleep(0) await asyncio.sleep(0)
restart.assert_not_called() restart.assert_not_called()
start.assert_not_called() start.assert_not_called()
async def test_watchdog_on_stop(coresys: CoreSys, install_addon_ssh: Addon) -> None:
    """Check the watchdog restarts a stopped add-on only if the stop was not manual."""
    container_name = f"addon_{TEST_ADDON_SLUG}"

    with patch.object(DockerAddon, "attach"):
        await install_addon_ssh.load()

    install_addon_ssh.watchdog = True

    # Patchers are only activated by the ``with`` statement below.
    restart_patch = patch.object(Addon, "restart")
    state_patch = patch.object(
        DockerAddon,
        "current_state",
        return_value=mock_current_state(ContainerState.STOPPED),
    )
    stop_patch = patch.object(DockerAddon, "stop", return_value=mock_stop())

    with restart_patch as restart_mock, state_patch, stop_patch:
        # Stop requested through Addon.stop(): the STOPPED event is ignored
        _fire_test_event(coresys, container_name, ContainerState.RUNNING)
        await asyncio.sleep(0)
        await install_addon_ssh.stop()
        _fire_test_event(coresys, container_name, ContainerState.STOPPED)
        await asyncio.sleep(0)
        restart_mock.assert_not_called()

        # Container runs again, then stops without Addon.stop(): restart expected
        _fire_test_event(coresys, container_name, ContainerState.RUNNING)
        await asyncio.sleep(0)
        _fire_test_event(coresys, container_name, ContainerState.STOPPED)
        await asyncio.sleep(0)
        restart_mock.assert_called_once()
async def test_listener_attached_on_install(coresys: CoreSys, repository): async def test_listener_attached_on_install(coresys: CoreSys, repository):
"""Test events listener attached on addon install.""" """Test events listener attached on addon install."""
container_collection = MagicMock() container_collection = MagicMock()
@ -258,19 +217,45 @@ async def test_listener_attached_on_install(coresys: CoreSys, repository):
): ):
await coresys.addons.install.__wrapped__(coresys.addons, TEST_ADDON_SLUG) await coresys.addons.install.__wrapped__(coresys.addons, TEST_ADDON_SLUG)
coresys.bus.fire_event( _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.RUNNING)
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.RUNNING,
id="abc123",
time=1,
),
)
await asyncio.sleep(0) await asyncio.sleep(0)
assert coresys.addons.get(TEST_ADDON_SLUG).state == AddonState.STARTED assert coresys.addons.get(TEST_ADDON_SLUG).state == AddonState.STARTED
@pytest.mark.parametrize(
    "boot_timedelta,restart_count", [(timedelta(), 0), (timedelta(days=1), 1)]
)
async def test_watchdog_during_attach(
    coresys: CoreSys,
    repository: Repository,
    boot_timedelta: timedelta,
    restart_count: int,
):
    """Check how the boot times seen at add-on creation affect the watchdog.

    The hardware helper's ``last_boot`` is pinned to a fixed time and the
    stored ``coresys.config.last_boot`` is set to that time plus
    ``boot_timedelta``. A fresh ``Addon`` is then created, loaded and sent a
    STOPPED event:

    * stored boot time equal to the reported one -> no restart expected
    * stored boot time one day off -> exactly one restart expected

    NOTE(review): the original summary read "host reboot treated as manual
    stop but not supervisor restart"; confirm that the equal-boot-time case
    is really the one meant to suppress the restart.
    """
    store = coresys.addons.store[TEST_ADDON_SLUG]
    coresys.addons.data.install(store)

    # Patchers are only activated by the ``with`` statement below.
    restart_patch = patch.object(Addon, "restart")
    boot_patch = patch.object(
        type(coresys.hardware.helper),
        "last_boot",
        new=PropertyMock(return_value=utcnow()),
    )
    attach_patch = patch.object(DockerAddon, "attach")
    state_patch = patch.object(
        DockerAddon,
        "current_state",
        return_value=mock_current_state(ContainerState.STOPPED),
    )

    with restart_patch as restart_mock, boot_patch, attach_patch, state_patch:
        # Must be set before the Addon is constructed, which reads both values
        coresys.config.last_boot = coresys.hardware.helper.last_boot + boot_timedelta

        addon = Addon(coresys, store.slug)
        coresys.addons.local[addon.slug] = addon
        addon.watchdog = True

        await addon.load()
        _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED)
        await asyncio.sleep(0)

        assert restart_mock.call_count == restart_count
async def test_install_update_fails_if_out_of_date( async def test_install_update_fails_if_out_of_date(
coresys: CoreSys, install_addon_ssh: Addon coresys: CoreSys, install_addon_ssh: Addon
): ):