Remove race with watchdog during backup, restore and update (#4635)

* Remove race with watchdog during backup, restore and update

* Fix pylint issues and test

* Stop after image pull during update

* Add test for max failed attempts for plugin watchdog
This commit is contained in:
Mike Degatano 2023-10-19 22:01:56 -04:00 committed by GitHub
parent 010043f116
commit 37c1c89d44
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 216 additions and 120 deletions

View File

@ -288,33 +288,40 @@ class AddonManager(CoreSysAttributes):
)
# Update instance
last_state: AddonState = addon.state
old_image = addon.image
# Cache data to prevent races with other updates to global
store = store.clone()
try:
await addon.instance.update(store.version, store.image)
except DockerError as err:
raise AddonsError() from err
_LOGGER.info("Add-on '%s' successfully updated", slug)
self.data.update(store)
# Stop the addon if running
if (last_state := addon.state) in {AddonState.STARTED, AddonState.STARTUP}:
await addon.stop()
# Cleanup
with suppress(DockerError):
await addon.instance.cleanup(
old_image=old_image, image=store.image, version=store.version
try:
_LOGGER.info("Add-on '%s' successfully updated", slug)
self.data.update(store)
# Cleanup
with suppress(DockerError):
await addon.instance.cleanup(
old_image=old_image, image=store.image, version=store.version
)
# Setup/Fix AppArmor profile
await addon.install_apparmor()
finally:
# restore state. Return awaitable for caller if no exception
out = (
await addon.start()
if last_state in {AddonState.STARTED, AddonState.STARTUP}
else None
)
# Setup/Fix AppArmor profile
await addon.install_apparmor()
# restore state
return (
await addon.start()
if last_state in [AddonState.STARTED, AddonState.STARTUP]
else None
)
return out
@Job(
name="addon_manager_rebuild",

View File

@ -782,10 +782,7 @@ class Addon(AddonModel):
if self.backup_mode == AddonBackupMode.COLD:
_LOGGER.info("Shutdown add-on %s for cold backup", self.slug)
try:
await self.instance.stop()
except DockerError as err:
raise AddonsError() from err
await self.stop()
elif self.backup_pre is not None:
await self._backup_command(self.backup_pre)
@ -933,64 +930,67 @@ class Addon(AddonModel):
# Stop it first if its running
if await self.instance.is_running():
with suppress(DockerError):
await self.instance.stop()
await self.stop()
# Check version / restore image
version = data[ATTR_VERSION]
if not await self.instance.exists():
_LOGGER.info("Restore/Install of image for addon %s", self.slug)
image_file = Path(temp, "image.tar")
if image_file.is_file():
with suppress(DockerError):
await self.instance.import_image(image_file)
else:
with suppress(DockerError):
await self.instance.install(version, restore_image)
await self.instance.cleanup()
elif self.instance.version != version or self.legacy:
_LOGGER.info("Restore/Update of image for addon %s", self.slug)
with suppress(DockerError):
await self.instance.update(version, restore_image)
# Restore data
def _restore_data():
"""Restore data."""
temp_data = Path(temp, "data")
if temp_data.is_dir():
shutil.copytree(temp_data, self.path_data, symlinks=True)
else:
self.path_data.mkdir()
_LOGGER.info("Restoring data for addon %s", self.slug)
if self.path_data.is_dir():
await remove_data(self.path_data)
try:
await self.sys_run_in_executor(_restore_data)
except shutil.Error as err:
raise AddonsError(
f"Can't restore origin data: {err}", _LOGGER.error
) from err
# Check version / restore image
version = data[ATTR_VERSION]
if not await self.instance.exists():
_LOGGER.info("Restore/Install of image for addon %s", self.slug)
# Restore AppArmor
profile_file = Path(temp, "apparmor.txt")
if profile_file.exists():
image_file = Path(temp, "image.tar")
if image_file.is_file():
with suppress(DockerError):
await self.instance.import_image(image_file)
else:
with suppress(DockerError):
await self.instance.install(version, restore_image)
await self.instance.cleanup()
elif self.instance.version != version or self.legacy:
_LOGGER.info("Restore/Update of image for addon %s", self.slug)
with suppress(DockerError):
await self.instance.update(version, restore_image)
# Restore data
def _restore_data():
"""Restore data."""
temp_data = Path(temp, "data")
if temp_data.is_dir():
shutil.copytree(temp_data, self.path_data, symlinks=True)
else:
self.path_data.mkdir()
_LOGGER.info("Restoring data for addon %s", self.slug)
if self.path_data.is_dir():
await remove_data(self.path_data)
try:
await self.sys_host.apparmor.load_profile(self.slug, profile_file)
except HostAppArmorError as err:
_LOGGER.error(
"Can't restore AppArmor profile for add-on %s", self.slug
)
raise AddonsError() from err
await self.sys_run_in_executor(_restore_data)
except shutil.Error as err:
raise AddonsError(
f"Can't restore origin data: {err}", _LOGGER.error
) from err
# Is add-on loaded
if not self.loaded:
await self.load()
# Restore AppArmor
profile_file = Path(temp, "apparmor.txt")
if profile_file.exists():
try:
await self.sys_host.apparmor.load_profile(
self.slug, profile_file
)
except HostAppArmorError as err:
_LOGGER.error(
"Can't restore AppArmor profile for add-on %s", self.slug
)
raise AddonsError() from err
# Run add-on
if data[ATTR_STATE] == AddonState.STARTED:
wait_for_start = await self.start()
# Is add-on loaded
if not self.loaded:
await self.load()
finally:
# Run add-on
if data[ATTR_STATE] == AddonState.STARTED:
wait_for_start = await self.start()
_LOGGER.info("Finished restore for add-on %s", self.slug)
return wait_for_start

View File

@ -580,10 +580,6 @@ class DockerAddon(DockerInterface):
version, image=image, latest=latest, need_build=self.addon.latest_need_build
)
# Stop container & cleanup
with suppress(DockerError):
await self.stop()
@Job(
name="docker_addon_install",
limit=JobExecutionLimit.GROUP_ONCE,

View File

@ -105,11 +105,7 @@ class PluginBase(ABC, FileConfiguration, CoreSysAttributes):
if not (event.name == self.instance.name):
return
if event.state in [
ContainerState.FAILED,
ContainerState.STOPPED,
ContainerState.UNHEALTHY,
]:
if event.state in {ContainerState.FAILED, ContainerState.UNHEALTHY}:
await self._restart_after_problem(event.state)
async def _restart_after_problem(self, state: ContainerState):
@ -123,10 +119,7 @@ class PluginBase(ABC, FileConfiguration, CoreSysAttributes):
state,
)
try:
if state == ContainerState.STOPPED and attempts == 0:
await self.start()
else:
await self.rebuild()
await self.rebuild()
except PluginError as err:
attempts = attempts + 1
_LOGGER.error("Watchdog restart of %s plugin failed!", self.slug)

View File

@ -39,10 +39,6 @@ def _fire_test_event(coresys: CoreSys, name: str, state: ContainerState):
)
async def mock_stop() -> None:
"""Mock for stop method."""
def test_options_merge(coresys: CoreSys, install_addon_ssh: Addon) -> None:
"""Test options merge."""
addon = coresys.addons.get(TEST_ADDON_SLUG)
@ -148,7 +144,7 @@ async def test_addon_watchdog(coresys: CoreSys, install_addon_ssh: Addon) -> Non
# Rebuild if it failed
current_state.return_value = ContainerState.FAILED
with patch.object(DockerAddon, "stop", return_value=mock_stop()) as stop:
with patch.object(DockerAddon, "stop") as stop:
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.FAILED)
await asyncio.sleep(0)
stop.assert_called_once_with(remove_container=True)
@ -183,7 +179,7 @@ async def test_watchdog_on_stop(coresys: CoreSys, install_addon_ssh: Addon) -> N
DockerAddon,
"current_state",
return_value=ContainerState.STOPPED,
), patch.object(DockerAddon, "stop", return_value=mock_stop()):
), patch.object(DockerAddon, "stop"):
# Do not restart when addon stopped by user
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.RUNNING)
await asyncio.sleep(0)
@ -515,6 +511,42 @@ async def test_backup_cold_mode(
assert bool(start_task) is (status == "running")
async def test_backup_cold_mode_with_watchdog(
coresys: CoreSys,
install_addon_ssh: Addon,
container: MagicMock,
tmp_supervisor_data,
path_extern,
):
"""Test backing up an addon in cold mode with watchdog active."""
container.status = "running"
install_addon_ssh.watchdog = True
install_addon_ssh.path_data.mkdir()
await install_addon_ssh.load()
# Simulate stop firing the docker event for stopped container like it normally would
async def mock_stop(*args, **kwargs):
container.status = "stopped"
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED)
# Patching out the normal end of backup process leaves the container in a stopped state
# Watchdog should still not try to restart it though, it should remain this way
tarfile = SecureTarFile(coresys.config.path_tmp / "test.tar.gz", "w")
with patch.object(Addon, "start") as start, patch.object(
Addon, "restart"
) as restart, patch.object(Addon, "end_backup"), patch.object(
DockerAddon, "stop", new=mock_stop
), patch.object(
AddonModel,
"backup_mode",
new=PropertyMock(return_value=AddonBackupMode.COLD),
):
await install_addon_ssh.backup(tarfile)
await asyncio.sleep(0)
start.assert_not_called()
restart.assert_not_called()
@pytest.mark.parametrize("status", ["running", "stopped"])
async def test_restore(
coresys: CoreSys,
@ -561,6 +593,41 @@ async def test_restore_while_running(
container.stop.assert_called_once()
async def test_restore_while_running_with_watchdog(
coresys: CoreSys,
install_addon_ssh: Addon,
container: MagicMock,
tmp_supervisor_data,
path_extern,
):
"""Test restore of a running addon with watchdog interference."""
container.status = "running"
coresys.hardware.disk.get_disk_free_space = lambda x: 5000
install_addon_ssh.path_data.mkdir()
install_addon_ssh.watchdog = True
await install_addon_ssh.load()
# Simulate stop firing the docker event for stopped container like it normally would
async def mock_stop(*args, **kwargs):
container.status = "stopped"
_fire_test_event(coresys, f"addon_{TEST_ADDON_SLUG}", ContainerState.STOPPED)
# We restore a stopped backup so restore will not restart it
# Watchdog will see it stop and should not attempt reanimation either
tarfile = SecureTarFile(get_fixture_path("backup_local_ssh_stopped.tar.gz"), "r")
with patch.object(Addon, "start") as start, patch.object(
Addon, "restart"
) as restart, patch.object(DockerAddon, "stop", new=mock_stop), patch.object(
CpuArch, "supported", new=PropertyMock(return_value=["aarch64"])
), patch.object(
Ingress, "update_hass_panel"
):
await coresys.addons.restore(TEST_ADDON_SLUG, tarfile)
await asyncio.sleep(0)
start.assert_not_called()
restart.assert_not_called()
async def test_start_when_running(
coresys: CoreSys,
install_addon_ssh: Addon,

View File

@ -404,3 +404,40 @@ async def test_store_data_changes_during_update(
assert install_addon_ssh.image == "test_image"
assert install_addon_ssh.version == AwesomeVersion("1.1.1")
async def test_watchdog_runs_during_update(
coresys: CoreSys, install_addon_ssh: Addon, container: MagicMock
):
"""Test watchdog running during a long update."""
container.status = "running"
install_addon_ssh.watchdog = True
coresys.store.data.addons["local_ssh"]["image"] = "test_image"
coresys.store.data.addons["local_ssh"]["version"] = AwesomeVersion("1.1.1")
await install_addon_ssh.load()
# Simulate stop firing the docker event for stopped container like it normally would
async def mock_stop(*args, **kwargs):
container.status = "stopped"
coresys.bus.fire_event(
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
DockerContainerStateEvent(
name=f"addon_{TEST_ADDON_SLUG}",
state=ContainerState.STOPPED,
id="abc123",
time=1,
),
)
# Mock update to just wait and let other tasks run as if it is long running
async def mock_update(*args, **kwargs):
await asyncio.sleep(0)
# Start should be called exactly once by update itself. Restart should never be called
with patch.object(DockerAddon, "stop", new=mock_stop), patch.object(
DockerAddon, "update", new=mock_update
), patch.object(Addon, "start") as start, patch.object(Addon, "restart") as restart:
await coresys.addons.update("local_ssh")
await asyncio.sleep(0)
start.assert_called_once()
restart.assert_not_called()

View File

@ -1,6 +1,6 @@
"""Test base plugin functionality."""
import asyncio
from unittest.mock import Mock, PropertyMock, patch
from unittest.mock import MagicMock, Mock, PropertyMock, patch
from awesomeversion import AwesomeVersion
import pytest
@ -98,7 +98,7 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None:
start.assert_not_called()
rebuild.reset_mock()
# Plugins are restarted anytime they stop, not just on failure
# Stop should be ignored as it means an update or system shutdown, plugins don't stop otherwise
current_state.return_value = ContainerState.STOPPED
coresys.bus.fire_event(
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
@ -111,9 +111,8 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None:
)
await asyncio.sleep(0)
rebuild.assert_not_called()
start.assert_called_once()
start.assert_not_called()
start.reset_mock()
# Do not process event if container state has changed since fired
current_state.return_value = ContainerState.HEALTHY
coresys.bus.fire_event(
@ -155,41 +154,38 @@ async def test_plugin_watchdog(coresys: CoreSys, plugin: PluginBase) -> None:
],
indirect=["plugin"],
)
async def test_plugin_watchdog_rebuild_on_failure(
coresys: CoreSys, capture_exception: Mock, plugin: PluginBase, error: PluginError
async def test_plugin_watchdog_max_failed_attempts(
coresys: CoreSys,
capture_exception: Mock,
plugin: PluginBase,
error: PluginError,
container: MagicMock,
caplog: pytest.LogCaptureFixture,
) -> None:
"""Test plugin watchdog rebuilds if start fails."""
with patch.object(type(plugin.instance), "attach"), patch.object(
type(plugin.instance), "is_running", return_value=True
):
"""Test plugin watchdog gives up after max failed attempts."""
with patch.object(type(plugin.instance), "attach"):
await plugin.load()
container.status = "stopped"
container.attrs = {"State": {"ExitCode": 1}}
with patch("supervisor.plugins.base.WATCHDOG_RETRY_SECONDS", 0), patch.object(
type(plugin), "rebuild"
) as rebuild, patch.object(
type(plugin), "start", side_effect=error
) as start, patch.object(
type(plugin.instance),
"current_state",
side_effect=[
ContainerState.STOPPED,
ContainerState.STOPPED,
],
):
coresys.bus.fire_event(
BusEvent.DOCKER_CONTAINER_STATE_CHANGE,
) as start:
await plugin.watchdog_container(
DockerContainerStateEvent(
name=plugin.instance.name,
state=ContainerState.STOPPED,
state=ContainerState.FAILED,
id="abc123",
time=1,
),
)
)
await asyncio.sleep(0.1)
start.assert_called_once()
rebuild.assert_called_once()
assert start.call_count == 5
capture_exception.assert_called_once_with(error)
capture_exception.assert_called_with(error)
assert (
f"Watchdog cannot restart {plugin.slug} plugin, failed all 5 attempts"
in caplog.text
)
@pytest.mark.parametrize(