Remove race with watchdog during backup, restore and update (#4635)

* Remove race with watchdog during backup, restore and update

* Fix pylint issues and test

* Stop after image pull during update

* Add test for max failed attempts for plugin watchdog
This commit is contained in:
Mike Degatano 2023-10-19 22:01:56 -04:00 committed by GitHub
parent 010043f116
commit 37c1c89d44
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 216 additions and 120 deletions

View File

@ -288,33 +288,40 @@ class AddonManager(CoreSysAttributes):
) )
# Update instance # Update instance
last_state: AddonState = addon.state
old_image = addon.image old_image = addon.image
# Cache data to prevent races with other updates to global # Cache data to prevent races with other updates to global
store = store.clone() store = store.clone()
try: try:
await addon.instance.update(store.version, store.image) await addon.instance.update(store.version, store.image)
except DockerError as err: except DockerError as err:
raise AddonsError() from err raise AddonsError() from err
_LOGGER.info("Add-on '%s' successfully updated", slug) # Stop the addon if running
self.data.update(store) if (last_state := addon.state) in {AddonState.STARTED, AddonState.STARTUP}:
await addon.stop()
# Cleanup try:
with suppress(DockerError): _LOGGER.info("Add-on '%s' successfully updated", slug)
await addon.instance.cleanup( self.data.update(store)
old_image=old_image, image=store.image, version=store.version
# Cleanup
with suppress(DockerError):
await addon.instance.cleanup(
old_image=old_image, image=store.image, version=store.version
)
# Setup/Fix AppArmor profile
await addon.install_apparmor()
finally:
# restore state. Return awaitable for caller if no exception
out = (
await addon.start()
if last_state in {AddonState.STARTED, AddonState.STARTUP}
else None
) )
return out
# Setup/Fix AppArmor profile
await addon.install_apparmor()
# restore state
return (
await addon.start()
if last_state in [AddonState.STARTED, AddonState.STARTUP]
else None
)
@Job( @Job(
name="addon_manager_rebuild", name="addon_manager_rebuild",

View File

@ -782,10 +782,7 @@ class Addon(AddonModel):
if self.backup_mode == AddonBackupMode.COLD: if self.backup_mode == AddonBackupMode.COLD:
_LOGGER.info("Shutdown add-on %s for cold backup", self.slug) _LOGGER.info("Shutdown add-on %s for cold backup", self.slug)
try: await self.stop()
await self.instance.stop()
except DockerError as err:
raise AddonsError() from err
elif self.backup_pre is not None: elif self.backup_pre is not None:
await self._backup_command(self.backup_pre) await self._backup_command(self.backup_pre)
@ -933,64 +930,67 @@ class Addon(AddonModel):
# Stop it first if its running # Stop it first if its running
if await self.instance.is_running(): if await self.instance.is_running():
with suppress(DockerError): await self.stop()
await self.instance.stop()
# Check version / restore image
version = data[ATTR_VERSION]
if not await self.instance.exists():
_LOGGER.info("Restore/Install of image for addon %s", self.slug)
image_file = Path(temp, "image.tar")
if image_file.is_file():
with suppress(DockerError):
await self.instance.import_image(image_file)
else:
with suppress(DockerError):
await self.instance.install(version, restore_image)
await self.instance.cleanup()
elif self.instance.version != version or self.legacy:
_LOGGER.info("Restore/Update of image for addon %s", self.slug)
with suppress(DockerError):
await self.instance.update(version, restore_image)
# Restore data
def _restore_data():
"""Restore data."""
temp_data = Path(temp, "data")
if temp_data.is_dir():
shutil.copytree(temp_data, self.path_data, symlinks=True)
else:
self.path_data.mkdir()
_LOGGER.info("Restoring data for addon %s", self.slug)
if self.path_data.is_dir():
await remove_data(self.path_data)
try: try:
await self.sys_run_in_executor(_restore_data) # Check version / restore image
except shutil.Error as err: version = data[ATTR_VERSION]
raise AddonsError( if not await self.instance.exists():
f"Can't restore origin data: {err}", _LOGGER.error _LOGGER.info("Restore/Install of image for addon %s", self.slug)
) from err
# Restore AppArmor image_file = Path(temp, "image.tar")
profile_file = Path(temp, "apparmor.txt") if image_file.is_file():
if profile_file.exists(): with suppress(DockerError):
await self.instance.import_image(image_file)
else:
with suppress(DockerError):
await self.instance.install(version, restore_image)
await self.instance.cleanup()
elif self.instance.version != version or self.legacy:
_LOGGER.info("Restore/Update of image for addon %s", self.slug)
with suppress(DockerError):
await self.instance.update(version, restore_image)
# Restore data
def _restore_data():
"""Restore data."""
temp_data = Path(temp, "data")
if temp_data.is_dir():
shutil.copytree(temp_data, self.path_data, symlinks=True)
else:
self.path_data.mkdir()
_LOGGER.info("Restoring data for addon %s", self.slug)
if self.path_data.is_dir():
await remove_data(self.path_data)
try: try:
await self.sys_host.apparmor.load_profile(self.slug, profile_file) await self.sys_run_in_executor(_restore_data)
except HostAppArmorError as err: except shutil.Error as err:
_LOGGER.error( raise AddonsError(
"Can't restore AppArmor profile for add-on %s", self.slug f"Can't restore origin data: {err}", _LOGGER.error
) ) from err
raise AddonsError() from err
# Is add-on loaded # Restore AppArmor
if not self.loaded: profile_file = Path(temp, "apparmor.txt")
await self.load() if profile_file.exists():
try:
await self.sys_host.apparmor.load_profile(
self.slug, profile_file
)
except HostAppArmorError as err:
_LOGGER.error(
"Can't restore AppArmor profile for add-on %s", self.slug
)
raise AddonsError() from err
# Run add-on # Is add-on loaded
if data[ATTR_STATE] == AddonState.STARTED: if not self.loaded:
wait_for_start = await self.start() await self.load()
finally:
# Run add-on
if data[ATTR_STATE] == AddonState.STARTED:
wait_for_start = await self.start()
_LOGGER.info("Finished restore for add-on %s", self.slug) _LOGGER.info("Finished restore for add-on %s", self.slug)
return wait_for_start return wait_for_start

View File

@ -580,10 +580,6 @@ class DockerAddon(DockerInterface):
version, image=image, latest=latest, need_build=self.addon.latest_need_build version, image=image, latest=latest, need_build=self.addon.latest_need_build
) )
# Stop container & cleanup
with suppress(DockerError):
await self.stop()
@Job( @Job(
name="docker_addon_install", name="docker_addon_install",
limit=JobExecutionLimit.GROUP_ONCE, limit=JobExecutionLimit.GROUP_ONCE,

View File

@ -105,11 +105,7 @@ class PluginBase(ABC, FileConfiguration, CoreSysAttributes):
if not (event.name == self.instance.name): if not (event.name == self.instance.name):
return return
if event.state in [ if event.state in {ContainerState.FAILED, ContainerState.UNHEALTHY}:
ContainerState.FAILED,
ContainerState.STOPPED,
ContainerState.UNHEALTHY,
]:
await self._restart_after_problem(event.state) await self._restart_after_problem(event.state)
async def _restart_after_problem(self, state: ContainerState): async def _restart_after_problem(self, state: ContainerState):
@ -123,10 +119,7 @@ class PluginBase(ABC, FileConfiguration, CoreSysAttributes):
state, state,
) )
try: try:
if state == ContainerState.STOPPED and attempts == 0: await self.rebuild()
await self.start()
else:
await self.rebuild()
except PluginError as err: except PluginError as err:
attempts = attempts + 1 attempts = attempts + 1
_LOGGER.error("Watchdog restart of %s plugin failed!", self.slug) _LOGGER.error("Watchdog restart of %s plugin failed!", self.slug)

View File

@ -39,10 +39,6 @@ def _fire_test_event(coresys: CoreSys, name: str, state: ContainerState):
) )
async def mock_stop() -> None:
"""Mock for stop method."""
def test_options_merge(coresys: CoreSys, install_addon_ssh: Addon) -> None: def test_options_merge(coresys: CoreSys, install_addon_ssh: Addon) -> None:
"""Test options merge.""" """Test options merge."""
addon = coresys.addons.get(TEST_ADDON_SLUG) addon = coresys.addons.get(TEST_ADDON_SLUG)
@ -148,7 +144,7 @@ async def test_addon_watchdog(coresys: CoreSys, install_addon_ssh: Addon) -> Non
# Rebuild if it failed # Rebuild if it failed
current_state.return_value = ContainerState.FAILED current_state.return_value = ContainerState.FAILED
with patch.object(DockerAddon, "stop", return_value=mock_stop()) as stop: with patch.object(DockerAddon, "stop") as stop:
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.FAILED) _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.FAILED)
await asyncio.sleep(0) await asyncio.sleep(0)
stop.assert_called_once_with(remove_container=True) stop.assert_called_once_with(remove_container=True)
@ -183,7 +179,7 @@ async def test_watchdog_on_stop(coresys: CoreSys, install_addon_ssh: Addon) -> N
DockerAddon, DockerAddon,
"current_state", "current_state",
return_value=ContainerState.STOPPED, return_value=ContainerState.STOPPED,
), patch.object(DockerAddon, "stop", return_value=mock_stop()): ), patch.object(DockerAddon, "stop"):
# Do not restart when addon stopped by user # Do not restart when addon stopped by user
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.RUNNING) _fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.RUNNING)
await asyncio.sleep(0) await asyncio.sleep(0)
@ -515,6 +511,42 @@ async def test_backup_cold_mode(
assert bool(start_task) is (status == "running") assert bool(start_task) is (status == "running")
async def test_backup_cold_mode_with_watchdog(
coresys: CoreSys,
install_addon_ssh: Addon,
container: MagicMock,
tmp_supervisor_data,
path_extern,
):
"""Test backing up an addon in cold mode with watchdog active."""
container.status = "running"
install_addon_ssh.watchdog = True
install_addon_ssh.path_data.mkdir()
await install_addon_ssh.load()
# Simulate stop firing the docker event for stopped container like it normally would
async def mock_stop(*args, **kwargs):
container.status = "stopped"
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED)
# Patching out the normal end of backup process leaves the container in a stopped state
# Watchdog should still not try to restart it though, it should remain this way
tarfile = SecureTarFile(coresys.config.path_tmp / "test.tar.gz", "w")
with patch.object(Addon, "start") as start, patch.object(
Addon, "restart"
) as restart, patch.object(Addon, "end_backup"), patch.object(
DockerAddon, "stop", new=mock_stop
), patch.object(
AddonModel,
"backup_mode",
new=PropertyMock(return_value=AddonBackupMode.COLD),
):
await install_addon_ssh.backup(tarfile)
await asyncio.sleep(0)
start.assert_not_called()
restart.assert_not_called()
@pytest.mark.parametrize("status", ["running", "stopped"]) @pytest.mark.parametrize("status", ["running", "stopped"])
async def test_restore( async def test_restore(
coresys: CoreSys, coresys: CoreSys,
@ -561,6 +593,41 @@ async def test_restore_while_running(
container.stop.assert_called_once() container.stop.assert_called_once()
async def test_restore_while_running_with_watchdog(
coresys: CoreSys,
install_addon_ssh: Addon,
container: MagicMock,
tmp_supervisor_data,
path_extern,
):
"""Test restore of a running addon with watchdog interference."""
container.status = "running"
coresys.hardware.disk.get_disk_free_space = lambda x: 5000
install_addon_ssh.path_data.mkdir()
install_addon_ssh.watchdog = True
await install_addon_ssh.load()
# Simulate stop firing the docker event for stopped container like it normally would
async def mock_stop(*args, **kwargs):
container.status = "stopped"
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED)
# We restore a stopped backup so restore will not restart it
# Watchdog will see it stop and should not attempt reanimation either
tarfile = SecureTarFile(get_fixture_path("backup_local_ssh_stopped.tar.gz"), "r")
with patch.object(Addon, "start") as start, patch.object(
Addon, "restart"
) as restart, patch.object(DockerAddon, "stop", new=mock_stop), patch.object(
CpuArch, "supported", new=PropertyMock(return_value=["aarch64"])
), patch.object(
Ingress, "update_hass_panel"
):
await coresys.addons.restore(TEST_ADDON_SLUG, tarfile)
await asyncio.sleep(0)
start.assert_not_called()
restart.assert_not_called()
async def test_start_when_running( async def test_start_when_running(
coresys: CoreSys, coresys: CoreSys,
install_addon_ssh: Addon, install_addon_ssh: Addon,

View File

@ -404,3 +404,40 @@ async def test_store_data_changes_during_update(
assert install_addon_ssh.image == "test_image" assert install_addon_ssh.image == "test_image"
assert install_addon_ssh.version == AwesomeVersion("1.1.1") assert install_addon_ssh.version == AwesomeVersion("1.1.1")
async def test_watchdog_runs_during_update(
coresys: CoreSys, install_addon_ssh: Addon, container: MagicMock
):
"""Test watchdog running during a long update."""
container.status = "running"
install_addon_ssh.watchdog = True
coresys.store.data.addons["local_ssh"]["image"] = "test_image"
coresys.store.data.addons["local_ssh"]["version"] = AwesomeVersion("1.1.1")
await install_addon_ssh.load()
# Simulate stop firing the docker event for stopped container like it normally would
async def mock_stop(*args, **kwargs):
container.status = "stopped"
coresys.bus.fire_event(
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.STOPPED,
id="abc123",
time=1,
),
)
# Mock update to just wait and let other tasks run as if it is long running
async def mock_update(*args, **kwargs):
await asyncio.sleep(0)
# Start should be called exactly once by update itself. Restart should never be called
with patch.object(DockerAddon, "stop", new=mock_stop), patch.object(
DockerAddon, "update", new=mock_update
), patch.object(Addon, "start") as start, patch.object(Addon, "restart") as restart:
await coresys.addons.update("local_ssh")
await asyncio.sleep(0)
start.assert_called_once()
restart.assert_not_called()

View File

@ -1,6 +1,6 @@
"""Test base plugin functionality.""" """Test base plugin functionality."""
import asyncio import asyncio
from unittest.mock import Mock, PropertyMock, patch from unittest.mock import MagicMock, Mock, PropertyMock, patch
from awesomeversion import AwesomeVersion from awesomeversion import AwesomeVersion
import pytest import pytest
@ -98,7 +98,7 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None:
start.assert_not_called() start.assert_not_called()
rebuild.reset_mock() rebuild.reset_mock()
# Plugins are restarted anytime they stop, not just on failure # Stop should be ignored as it means an update or system shutdown, plugins don't stop otherwise
current_state.return_value = ContainerState.STOPPED current_state.return_value = ContainerState.STOPPED
coresys.bus.fire_event( coresys.bus.fire_event(
BusEvent.DOCKER_CONTAINER_STATE_CHANGE, BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
@ -111,9 +111,8 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None:
) )
await asyncio.sleep(0) await asyncio.sleep(0)
rebuild.assert_not_called() rebuild.assert_not_called()
start.assert_called_once() start.assert_not_called()
start.reset_mock()
# Do not process event if container state has changed since fired # Do not process event if container state has changed since fired
current_state.return_value = ContainerState.HEALTHY current_state.return_value = ContainerState.HEALTHY
coresys.bus.fire_event( coresys.bus.fire_event(
@ -155,41 +154,38 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None:
], ],
indirect=["plugin"], indirect=["plugin"],
) )
async def test_plugin_watchdog_rebuild_on_failure( async def test_plugin_watchdog_max_failed_attempts(
coresys: CoreSys, capture_exception: Mock, plugin: PluginBase, error: PluginError coresys: CoreSys,
capture_exception: Mock,
plugin: PluginBase,
error: PluginError,
container: MagicMock,
caplog: pytest.LogCaptureFixture,
) -> None: ) -> None:
"""Test plugin watchdog rebuilds if start fails.""" """Test plugin watchdog gives up after max failed attempts."""
with patch.object(type(plugin.instance), "attach"), patch.object( with patch.object(type(plugin.instance), "attach"):
type(plugin.instance), "is_running", return_value=True
):
await plugin.load() await plugin.load()
container.status = "stopped"
container.attrs = {"State": {"ExitCode": 1}}
with patch("supervisor.plugins.base.WATCHDOG_RETRY_SECONDS", 0), patch.object( with patch("supervisor.plugins.base.WATCHDOG_RETRY_SECONDS", 0), patch.object(
type(plugin), "rebuild"
) as rebuild, patch.object(
type(plugin), "start", side_effect=error type(plugin), "start", side_effect=error
) as start, patch.object( ) as start:
type(plugin.instance), await plugin.watchdog_container(
"current_state",
side_effect=[
ContainerState.STOPPED,
ContainerState.STOPPED,
],
):
coresys.bus.fire_event(
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent( DockerContainerStateEvent(
name=plugin.instance.name, name=plugin.instance.name,
state=ContainerState.STOPPED, state=ContainerState.FAILED,
id="abc123", id="abc123",
time=1, time=1,
), )
) )
await asyncio.sleep(0.1) assert start.call_count == 5
start.assert_called_once()
rebuild.assert_called_once()
capture_exception.assert_called_once_with(error) capture_exception.assert_called_with(error)
assert (
f"Watchdog cannot restart {plugin.slug} plugin, failed all 5 attempts"
in caplog.text
)
@pytest.mark.parametrize( @pytest.mark.parametrize(