mirror of
https://github.com/home-assistant/supervisor.git
synced 2025-07-13 12:16:29 +00:00
Remove race with watchdog during backup, restore and update (#4635)
* Remove race with watchdog during backup, restore and update * Fix pylint issues and test * Stop after image pull during update * Add test for max failed attempts for plugin watchdog
This commit is contained in:
parent
010043f116
commit
37c1c89d44
@ -288,33 +288,40 @@ class AddonManager(CoreSysAttributes):
|
||||
)
|
||||
|
||||
# Update instance
|
||||
last_state: AddonState = addon.state
|
||||
old_image = addon.image
|
||||
# Cache data to prevent races with other updates to global
|
||||
store = store.clone()
|
||||
|
||||
try:
|
||||
await addon.instance.update(store.version, store.image)
|
||||
except DockerError as err:
|
||||
raise AddonsError() from err
|
||||
|
||||
_LOGGER.info("Add-on '%s' successfully updated", slug)
|
||||
self.data.update(store)
|
||||
# Stop the addon if running
|
||||
if (last_state := addon.state) in {AddonState.STARTED, AddonState.STARTUP}:
|
||||
await addon.stop()
|
||||
|
||||
# Cleanup
|
||||
with suppress(DockerError):
|
||||
await addon.instance.cleanup(
|
||||
old_image=old_image, image=store.image, version=store.version
|
||||
try:
|
||||
_LOGGER.info("Add-on '%s' successfully updated", slug)
|
||||
self.data.update(store)
|
||||
|
||||
# Cleanup
|
||||
with suppress(DockerError):
|
||||
await addon.instance.cleanup(
|
||||
old_image=old_image, image=store.image, version=store.version
|
||||
)
|
||||
|
||||
# Setup/Fix AppArmor profile
|
||||
await addon.install_apparmor()
|
||||
|
||||
finally:
|
||||
# restore state. Return awaitable for caller if no exception
|
||||
out = (
|
||||
await addon.start()
|
||||
if last_state in {AddonState.STARTED, AddonState.STARTUP}
|
||||
else None
|
||||
)
|
||||
|
||||
# Setup/Fix AppArmor profile
|
||||
await addon.install_apparmor()
|
||||
|
||||
# restore state
|
||||
return (
|
||||
await addon.start()
|
||||
if last_state in [AddonState.STARTED, AddonState.STARTUP]
|
||||
else None
|
||||
)
|
||||
return out
|
||||
|
||||
@Job(
|
||||
name="addon_manager_rebuild",
|
||||
|
@ -782,10 +782,7 @@ class Addon(AddonModel):
|
||||
|
||||
if self.backup_mode == AddonBackupMode.COLD:
|
||||
_LOGGER.info("Shutdown add-on %s for cold backup", self.slug)
|
||||
try:
|
||||
await self.instance.stop()
|
||||
except DockerError as err:
|
||||
raise AddonsError() from err
|
||||
await self.stop()
|
||||
|
||||
elif self.backup_pre is not None:
|
||||
await self._backup_command(self.backup_pre)
|
||||
@ -933,64 +930,67 @@ class Addon(AddonModel):
|
||||
|
||||
# Stop it first if its running
|
||||
if await self.instance.is_running():
|
||||
with suppress(DockerError):
|
||||
await self.instance.stop()
|
||||
await self.stop()
|
||||
|
||||
# Check version / restore image
|
||||
version = data[ATTR_VERSION]
|
||||
if not await self.instance.exists():
|
||||
_LOGGER.info("Restore/Install of image for addon %s", self.slug)
|
||||
|
||||
image_file = Path(temp, "image.tar")
|
||||
if image_file.is_file():
|
||||
with suppress(DockerError):
|
||||
await self.instance.import_image(image_file)
|
||||
else:
|
||||
with suppress(DockerError):
|
||||
await self.instance.install(version, restore_image)
|
||||
await self.instance.cleanup()
|
||||
elif self.instance.version != version or self.legacy:
|
||||
_LOGGER.info("Restore/Update of image for addon %s", self.slug)
|
||||
with suppress(DockerError):
|
||||
await self.instance.update(version, restore_image)
|
||||
|
||||
# Restore data
|
||||
def _restore_data():
|
||||
"""Restore data."""
|
||||
temp_data = Path(temp, "data")
|
||||
if temp_data.is_dir():
|
||||
shutil.copytree(temp_data, self.path_data, symlinks=True)
|
||||
else:
|
||||
self.path_data.mkdir()
|
||||
|
||||
_LOGGER.info("Restoring data for addon %s", self.slug)
|
||||
if self.path_data.is_dir():
|
||||
await remove_data(self.path_data)
|
||||
try:
|
||||
await self.sys_run_in_executor(_restore_data)
|
||||
except shutil.Error as err:
|
||||
raise AddonsError(
|
||||
f"Can't restore origin data: {err}", _LOGGER.error
|
||||
) from err
|
||||
# Check version / restore image
|
||||
version = data[ATTR_VERSION]
|
||||
if not await self.instance.exists():
|
||||
_LOGGER.info("Restore/Install of image for addon %s", self.slug)
|
||||
|
||||
# Restore AppArmor
|
||||
profile_file = Path(temp, "apparmor.txt")
|
||||
if profile_file.exists():
|
||||
image_file = Path(temp, "image.tar")
|
||||
if image_file.is_file():
|
||||
with suppress(DockerError):
|
||||
await self.instance.import_image(image_file)
|
||||
else:
|
||||
with suppress(DockerError):
|
||||
await self.instance.install(version, restore_image)
|
||||
await self.instance.cleanup()
|
||||
elif self.instance.version != version or self.legacy:
|
||||
_LOGGER.info("Restore/Update of image for addon %s", self.slug)
|
||||
with suppress(DockerError):
|
||||
await self.instance.update(version, restore_image)
|
||||
|
||||
# Restore data
|
||||
def _restore_data():
|
||||
"""Restore data."""
|
||||
temp_data = Path(temp, "data")
|
||||
if temp_data.is_dir():
|
||||
shutil.copytree(temp_data, self.path_data, symlinks=True)
|
||||
else:
|
||||
self.path_data.mkdir()
|
||||
|
||||
_LOGGER.info("Restoring data for addon %s", self.slug)
|
||||
if self.path_data.is_dir():
|
||||
await remove_data(self.path_data)
|
||||
try:
|
||||
await self.sys_host.apparmor.load_profile(self.slug, profile_file)
|
||||
except HostAppArmorError as err:
|
||||
_LOGGER.error(
|
||||
"Can't restore AppArmor profile for add-on %s", self.slug
|
||||
)
|
||||
raise AddonsError() from err
|
||||
await self.sys_run_in_executor(_restore_data)
|
||||
except shutil.Error as err:
|
||||
raise AddonsError(
|
||||
f"Can't restore origin data: {err}", _LOGGER.error
|
||||
) from err
|
||||
|
||||
# Is add-on loaded
|
||||
if not self.loaded:
|
||||
await self.load()
|
||||
# Restore AppArmor
|
||||
profile_file = Path(temp, "apparmor.txt")
|
||||
if profile_file.exists():
|
||||
try:
|
||||
await self.sys_host.apparmor.load_profile(
|
||||
self.slug, profile_file
|
||||
)
|
||||
except HostAppArmorError as err:
|
||||
_LOGGER.error(
|
||||
"Can't restore AppArmor profile for add-on %s", self.slug
|
||||
)
|
||||
raise AddonsError() from err
|
||||
|
||||
# Run add-on
|
||||
if data[ATTR_STATE] == AddonState.STARTED:
|
||||
wait_for_start = await self.start()
|
||||
# Is add-on loaded
|
||||
if not self.loaded:
|
||||
await self.load()
|
||||
|
||||
finally:
|
||||
# Run add-on
|
||||
if data[ATTR_STATE] == AddonState.STARTED:
|
||||
wait_for_start = await self.start()
|
||||
|
||||
_LOGGER.info("Finished restore for add-on %s", self.slug)
|
||||
return wait_for_start
|
||||
|
@ -580,10 +580,6 @@ class DockerAddon(DockerInterface):
|
||||
version, image=image, latest=latest, need_build=self.addon.latest_need_build
|
||||
)
|
||||
|
||||
# Stop container & cleanup
|
||||
with suppress(DockerError):
|
||||
await self.stop()
|
||||
|
||||
@Job(
|
||||
name="docker_addon_install",
|
||||
limit=JobExecutionLimit.GROUP_ONCE,
|
||||
|
@ -105,11 +105,7 @@ class PluginBase(ABC, FileConfiguration, CoreSysAttributes):
|
||||
if not (event.name == self.instance.name):
|
||||
return
|
||||
|
||||
if event.state in [
|
||||
ContainerState.FAILED,
|
||||
ContainerState.STOPPED,
|
||||
ContainerState.UNHEALTHY,
|
||||
]:
|
||||
if event.state in {ContainerState.FAILED, ContainerState.UNHEALTHY}:
|
||||
await self._restart_after_problem(event.state)
|
||||
|
||||
async def _restart_after_problem(self, state: ContainerState):
|
||||
@ -123,10 +119,7 @@ class PluginBase(ABC, FileConfiguration, CoreSysAttributes):
|
||||
state,
|
||||
)
|
||||
try:
|
||||
if state == ContainerState.STOPPED and attempts == 0:
|
||||
await self.start()
|
||||
else:
|
||||
await self.rebuild()
|
||||
await self.rebuild()
|
||||
except PluginError as err:
|
||||
attempts = attempts + 1
|
||||
_LOGGER.error("Watchdog restart of %s plugin failed!", self.slug)
|
||||
|
@ -39,10 +39,6 @@ def _fire_test_event(coresys: CoreSys, name: str, state: ContainerState):
|
||||
)
|
||||
|
||||
|
||||
async def mock_stop() -> None:
|
||||
"""Mock for stop method."""
|
||||
|
||||
|
||||
def test_options_merge(coresys: CoreSys, install_addon_ssh: Addon) -> None:
|
||||
"""Test options merge."""
|
||||
addon = coresys.addons.get(TEST_ADDON_SLUG)
|
||||
@ -148,7 +144,7 @@ async def test_addon_watchdog(coresys: CoreSys, install_addon_ssh: Addon) -> Non
|
||||
|
||||
# Rebuild if it failed
|
||||
current_state.return_value = ContainerState.FAILED
|
||||
with patch.object(DockerAddon, "stop", return_value=mock_stop()) as stop:
|
||||
with patch.object(DockerAddon, "stop") as stop:
|
||||
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.FAILED)
|
||||
await asyncio.sleep(0)
|
||||
stop.assert_called_once_with(remove_container=True)
|
||||
@ -183,7 +179,7 @@ async def test_watchdog_on_stop(coresys: CoreSys, install_addon_ssh: Addon) -> N
|
||||
DockerAddon,
|
||||
"current_state",
|
||||
return_value=ContainerState.STOPPED,
|
||||
), patch.object(DockerAddon, "stop", return_value=mock_stop()):
|
||||
), patch.object(DockerAddon, "stop"):
|
||||
# Do not restart when addon stopped by user
|
||||
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.RUNNING)
|
||||
await asyncio.sleep(0)
|
||||
@ -515,6 +511,42 @@ async def test_backup_cold_mode(
|
||||
assert bool(start_task) is (status == "running")
|
||||
|
||||
|
||||
async def test_backup_cold_mode_with_watchdog(
|
||||
coresys: CoreSys,
|
||||
install_addon_ssh: Addon,
|
||||
container: MagicMock,
|
||||
tmp_supervisor_data,
|
||||
path_extern,
|
||||
):
|
||||
"""Test backing up an addon in cold mode with watchdog active."""
|
||||
container.status = "running"
|
||||
install_addon_ssh.watchdog = True
|
||||
install_addon_ssh.path_data.mkdir()
|
||||
await install_addon_ssh.load()
|
||||
|
||||
# Simulate stop firing the docker event for stopped container like it normally would
|
||||
async def mock_stop(*args, **kwargs):
|
||||
container.status = "stopped"
|
||||
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED)
|
||||
|
||||
# Patching out the normal end of backup process leaves the container in a stopped state
|
||||
# Watchdog should still not try to restart it though, it should remain this way
|
||||
tarfile = SecureTarFile(coresys.config.path_tmp / "test.tar.gz", "w")
|
||||
with patch.object(Addon, "start") as start, patch.object(
|
||||
Addon, "restart"
|
||||
) as restart, patch.object(Addon, "end_backup"), patch.object(
|
||||
DockerAddon, "stop", new=mock_stop
|
||||
), patch.object(
|
||||
AddonModel,
|
||||
"backup_mode",
|
||||
new=PropertyMock(return_value=AddonBackupMode.COLD),
|
||||
):
|
||||
await install_addon_ssh.backup(tarfile)
|
||||
await asyncio.sleep(0)
|
||||
start.assert_not_called()
|
||||
restart.assert_not_called()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("status", ["running", "stopped"])
|
||||
async def test_restore(
|
||||
coresys: CoreSys,
|
||||
@ -561,6 +593,41 @@ async def test_restore_while_running(
|
||||
container.stop.assert_called_once()
|
||||
|
||||
|
||||
async def test_restore_while_running_with_watchdog(
|
||||
coresys: CoreSys,
|
||||
install_addon_ssh: Addon,
|
||||
container: MagicMock,
|
||||
tmp_supervisor_data,
|
||||
path_extern,
|
||||
):
|
||||
"""Test restore of a running addon with watchdog interference."""
|
||||
container.status = "running"
|
||||
coresys.hardware.disk.get_disk_free_space = lambda x: 5000
|
||||
install_addon_ssh.path_data.mkdir()
|
||||
install_addon_ssh.watchdog = True
|
||||
await install_addon_ssh.load()
|
||||
|
||||
# Simulate stop firing the docker event for stopped container like it normally would
|
||||
async def mock_stop(*args, **kwargs):
|
||||
container.status = "stopped"
|
||||
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED)
|
||||
|
||||
# We restore a stopped backup so restore will not restart it
|
||||
# Watchdog will see it stop and should not attempt reanimation either
|
||||
tarfile = SecureTarFile(get_fixture_path("backup_local_ssh_stopped.tar.gz"), "r")
|
||||
with patch.object(Addon, "start") as start, patch.object(
|
||||
Addon, "restart"
|
||||
) as restart, patch.object(DockerAddon, "stop", new=mock_stop), patch.object(
|
||||
CpuArch, "supported", new=PropertyMock(return_value=["aarch64"])
|
||||
), patch.object(
|
||||
Ingress, "update_hass_panel"
|
||||
):
|
||||
await coresys.addons.restore(TEST_ADDON_SLUG, tarfile)
|
||||
await asyncio.sleep(0)
|
||||
start.assert_not_called()
|
||||
restart.assert_not_called()
|
||||
|
||||
|
||||
async def test_start_when_running(
|
||||
coresys: CoreSys,
|
||||
install_addon_ssh: Addon,
|
||||
|
@ -404,3 +404,40 @@ async def test_store_data_changes_during_update(
|
||||
|
||||
assert install_addon_ssh.image == "test_image"
|
||||
assert install_addon_ssh.version == AwesomeVersion("1.1.1")
|
||||
|
||||
|
||||
async def test_watchdog_runs_during_update(
|
||||
coresys: CoreSys, install_addon_ssh: Addon, container: MagicMock
|
||||
):
|
||||
"""Test watchdog running during a long update."""
|
||||
container.status = "running"
|
||||
install_addon_ssh.watchdog = True
|
||||
coresys.store.data.addons["local_ssh"]["image"] = "test_image"
|
||||
coresys.store.data.addons["local_ssh"]["version"] = AwesomeVersion("1.1.1")
|
||||
await install_addon_ssh.load()
|
||||
|
||||
# Simulate stop firing the docker event for stopped container like it normally would
|
||||
async def mock_stop(*args, **kwargs):
|
||||
container.status = "stopped"
|
||||
coresys.bus.fire_event(
|
||||
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
|
||||
DockerContainerStateEvent(
|
||||
name=f"addon_{TEST_ADDON_SLUG}",
|
||||
state=ContainerState.STOPPED,
|
||||
id="abc123",
|
||||
time=1,
|
||||
),
|
||||
)
|
||||
|
||||
# Mock update to just wait and let other tasks run as if it is long running
|
||||
async def mock_update(*args, **kwargs):
|
||||
await asyncio.sleep(0)
|
||||
|
||||
# Start should be called exactly once by update itself. Restart should never be called
|
||||
with patch.object(DockerAddon, "stop", new=mock_stop), patch.object(
|
||||
DockerAddon, "update", new=mock_update
|
||||
), patch.object(Addon, "start") as start, patch.object(Addon, "restart") as restart:
|
||||
await coresys.addons.update("local_ssh")
|
||||
await asyncio.sleep(0)
|
||||
start.assert_called_once()
|
||||
restart.assert_not_called()
|
||||
|
@ -1,6 +1,6 @@
|
||||
"""Test base plugin functionality."""
|
||||
import asyncio
|
||||
from unittest.mock import Mock, PropertyMock, patch
|
||||
from unittest.mock import MagicMock, Mock, PropertyMock, patch
|
||||
|
||||
from awesomeversion import AwesomeVersion
|
||||
import pytest
|
||||
@ -98,7 +98,7 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None:
|
||||
start.assert_not_called()
|
||||
|
||||
rebuild.reset_mock()
|
||||
# Plugins are restarted anytime they stop, not just on failure
|
||||
# Stop should be ignored as it means an update or system shutdown, plugins don't stop otherwise
|
||||
current_state.return_value = ContainerState.STOPPED
|
||||
coresys.bus.fire_event(
|
||||
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
|
||||
@ -111,9 +111,8 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None:
|
||||
)
|
||||
await asyncio.sleep(0)
|
||||
rebuild.assert_not_called()
|
||||
start.assert_called_once()
|
||||
start.assert_not_called()
|
||||
|
||||
start.reset_mock()
|
||||
# Do not process event if container state has changed since fired
|
||||
current_state.return_value = ContainerState.HEALTHY
|
||||
coresys.bus.fire_event(
|
||||
@ -155,41 +154,38 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None:
|
||||
],
|
||||
indirect=["plugin"],
|
||||
)
|
||||
async def test_plugin_watchdog_rebuild_on_failure(
|
||||
coresys: CoreSys, capture_exception: Mock, plugin: PluginBase, error: PluginError
|
||||
async def test_plugin_watchdog_max_failed_attempts(
|
||||
coresys: CoreSys,
|
||||
capture_exception: Mock,
|
||||
plugin: PluginBase,
|
||||
error: PluginError,
|
||||
container: MagicMock,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
"""Test plugin watchdog rebuilds if start fails."""
|
||||
with patch.object(type(plugin.instance), "attach"), patch.object(
|
||||
type(plugin.instance), "is_running", return_value=True
|
||||
):
|
||||
"""Test plugin watchdog gives up after max failed attempts."""
|
||||
with patch.object(type(plugin.instance), "attach"):
|
||||
await plugin.load()
|
||||
|
||||
container.status = "stopped"
|
||||
container.attrs = {"State": {"ExitCode": 1}}
|
||||
with patch("supervisor.plugins.base.WATCHDOG_RETRY_SECONDS", 0), patch.object(
|
||||
type(plugin), "rebuild"
|
||||
) as rebuild, patch.object(
|
||||
type(plugin), "start", side_effect=error
|
||||
) as start, patch.object(
|
||||
type(plugin.instance),
|
||||
"current_state",
|
||||
side_effect=[
|
||||
ContainerState.STOPPED,
|
||||
ContainerState.STOPPED,
|
||||
],
|
||||
):
|
||||
coresys.bus.fire_event(
|
||||
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
|
||||
) as start:
|
||||
await plugin.watchdog_container(
|
||||
DockerContainerStateEvent(
|
||||
name=plugin.instance.name,
|
||||
state=ContainerState.STOPPED,
|
||||
state=ContainerState.FAILED,
|
||||
id="abc123",
|
||||
time=1,
|
||||
),
|
||||
)
|
||||
)
|
||||
await asyncio.sleep(0.1)
|
||||
start.assert_called_once()
|
||||
rebuild.assert_called_once()
|
||||
assert start.call_count == 5
|
||||
|
||||
capture_exception.assert_called_once_with(error)
|
||||
capture_exception.assert_called_with(error)
|
||||
assert (
|
||||
f"Watchdog cannot restart {plugin.slug} plugin, failed all 5 attempts"
|
||||
in caplog.text
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
Loading…
x
Reference in New Issue
Block a user