Support CGroup v2 on Supervised with manual restarts (#5419)

This commit is contained in:
Mike Degatano 2024-12-09 09:09:54 -05:00 committed by GitHub
parent 1f893117cc
commit 829193fe84
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 341 additions and 11 deletions

View File

@ -148,6 +148,9 @@ class Addon(AddonModel):
self._boot_failed_issue = Issue(
IssueType.BOOT_FAIL, ContextType.ADDON, reference=self.slug
)
self._device_access_missing_issue = Issue(
IssueType.DEVICE_ACCESS_MISSING, ContextType.ADDON, reference=self.slug
)
def __repr__(self) -> str:
"""Return internal representation."""
@ -158,6 +161,11 @@ class Addon(AddonModel):
"""Get issue used if start on boot failed."""
return self._boot_failed_issue
@property
def device_access_missing_issue(self) -> Issue:
"""Get issue used if device access is missing and can't be automatically added."""
return self._device_access_missing_issue
@property
def state(self) -> AddonState:
"""Return state of the add-on."""
@ -182,6 +190,13 @@ class Addon(AddonModel):
):
self.sys_resolution.dismiss_issue(self.boot_failed_issue)
# Dismiss device access missing issue if present and we stopped
if (
new_state == AddonState.STOPPED
and self.device_access_missing_issue in self.sys_resolution.issues
):
self.sys_resolution.dismiss_issue(self.device_access_missing_issue)
self.sys_homeassistant.websocket.send_message(
{
ATTR_TYPE: WSType.SUPERVISOR_EVENT,

View File

@ -10,6 +10,7 @@ import os
from pathlib import Path
from typing import TYPE_CHECKING
from attr import evolve
from awesomeversion import AwesomeVersion
import docker
from docker.types import Mount
@ -40,7 +41,7 @@ from ..hardware.const import PolicyGroup
from ..hardware.data import Device
from ..jobs.const import JobCondition, JobExecutionLimit
from ..jobs.decorator import Job
from ..resolution.const import ContextType, IssueType, SuggestionType
from ..resolution.const import CGROUP_V2_VERSION, ContextType, IssueType, SuggestionType
from ..utils.sentry import capture_exception
from .const import (
ENV_TIME,
@ -802,6 +803,13 @@ class DockerAddon(DockerInterface):
await super().stop(remove_container)
# If there is a device access issue and the container is removed, clear it
if (
remove_container
and self.addon.device_access_missing_issue in self.sys_resolution.issues
):
self.sys_resolution.dismiss_issue(self.addon.device_access_missing_issue)
async def _validate_trust(
self, image_id: str, image: str, version: AwesomeVersion
) -> None:
@ -839,6 +847,16 @@ class DockerAddon(DockerInterface):
f"Can't process Hardware Event on {self.name}: {err!s}", _LOGGER.error
) from err
if (
self.sys_docker.info.cgroup == CGROUP_V2_VERSION
and not self.sys_os.available
):
self.sys_resolution.add_issue(
evolve(self.addon.device_access_missing_issue),
suggestions=[SuggestionType.EXECUTE_RESTART],
)
return
permission = self.sys_hardware.policy.get_cgroups_rule(device)
try:
await self.sys_dbus.agent.cgroup.add_devices_allowed(

View File

@ -15,6 +15,9 @@ MINIMUM_FULL_BACKUPS = 2
DNS_CHECK_HOST = "_checkdns.home-assistant.io"
DNS_ERROR_NO_DATA = 1
CGROUP_V1_VERSION = "1"
CGROUP_V2_VERSION = "2"
class ContextType(StrEnum):
"""Place where somethings was happening."""
@ -77,6 +80,7 @@ class IssueType(StrEnum):
CORRUPT_FILESYSTEM = "corrupt_filesystem"
DETACHED_ADDON_MISSING = "detached_addon_missing"
DETACHED_ADDON_REMOVED = "detached_addon_removed"
DEVICE_ACCESS_MISSING = "device_access_missing"
DISABLED_DATA_DISK = "disabled_data_disk"
DNS_LOOP = "dns_loop"
DNS_SERVER_FAILED = "dns_server_failed"
@ -112,6 +116,7 @@ class SuggestionType(StrEnum):
EXECUTE_REMOVE = "execute_remove"
EXECUTE_REPAIR = "execute_repair"
EXECUTE_RESET = "execute_reset"
EXECUTE_RESTART = "execute_restart"
EXECUTE_START = "execute_start"
EXECUTE_STOP = "execute_stop"
EXECUTE_UPDATE = "execute_update"

View File

@ -2,12 +2,9 @@
from ...const import CoreState
from ...coresys import CoreSys
from ..const import UnsupportedReason
from ..const import CGROUP_V1_VERSION, CGROUP_V2_VERSION, UnsupportedReason
from .base import EvaluateBase
CGROUP_V1_VERSION = "1"
CGROUP_V2_VERSION = "2"
def setup(coresys: CoreSys) -> EvaluateBase:
"""Initialize evaluation-setup function."""
@ -20,9 +17,7 @@ class EvaluateCGroupVersion(EvaluateBase):
@property
def expected_versions(self) -> set[str]:
"""Return expected cgroup versions."""
if self.coresys.os.available:
return {CGROUP_V1_VERSION, CGROUP_V2_VERSION}
return {CGROUP_V1_VERSION}
return {CGROUP_V1_VERSION, CGROUP_V2_VERSION}
@property
def reason(self) -> UnsupportedReason:

View File

@ -0,0 +1,60 @@
"""Helpers to fix addon by restarting it."""
import logging
from ...coresys import CoreSys
from ...exceptions import AddonsError, ResolutionFixupError
from ..const import ContextType, IssueType, SuggestionType
from .base import FixupBase
_LOGGER: logging.Logger = logging.getLogger(__name__)
def setup(coresys: CoreSys) -> FixupBase:
"""Check setup function."""
return FixupAddonExecuteRestart(coresys)
class FixupAddonExecuteRestart(FixupBase):
"""Storage class for fixup."""
async def process_fixup(self, reference: str | None = None) -> None:
"""Initialize the fixup class."""
if not (addon := self.sys_addons.get(reference, local_only=True)):
_LOGGER.info("Cannot restart addon %s as it does not exist", reference)
return
# Stop addon
try:
await addon.stop()
except AddonsError as err:
_LOGGER.error("Could not stop %s due to %s", reference, err)
raise ResolutionFixupError() from None
# Start addon
# Removing the container has already fixed the issue and dismissed it
# So any errors on startup are just logged. We won't wait on the startup task either
try:
await addon.start()
except AddonsError as err:
_LOGGER.error("Could not restart %s due to %s", reference, err)
@property
def suggestion(self) -> SuggestionType:
"""Return a SuggestionType enum."""
return SuggestionType.EXECUTE_RESTART
@property
def context(self) -> ContextType:
"""Return a ContextType enum."""
return ContextType.ADDON
@property
def issues(self) -> list[IssueType]:
"""Return a IssueType enum list."""
return [IssueType.DEVICE_ACCESS_MISSING]
@property
def auto(self) -> bool:
"""Return if a fixup can be apply as auto fix."""
return False

View File

@ -1,6 +1,8 @@
"""Test docker addon setup."""
import asyncio
from ipaddress import IPv4Address
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, Mock, PropertyMock, patch
@ -12,12 +14,17 @@ from supervisor.addons import validate as vd
from supervisor.addons.addon import Addon
from supervisor.addons.model import Data
from supervisor.addons.options import AddonOptions
from supervisor.const import BusEvent
from supervisor.coresys import CoreSys
from supervisor.dbus.agent.cgroup import CGroup
from supervisor.docker.addon import DockerAddon
from supervisor.docker.manager import DockerAPI
from supervisor.exceptions import CoreDNSError, DockerNotFound
from supervisor.hardware.data import Device
from supervisor.os.manager import OSManager
from supervisor.plugins.dns import PluginDns
from supervisor.resolution.const import ContextType, IssueType
from supervisor.resolution.data import Issue
from supervisor.resolution.const import ContextType, IssueType, SuggestionType
from supervisor.resolution.data import Issue, Suggestion
from ..common import load_json_fixture
from . import DEV_MOUNT
@ -380,3 +387,113 @@ async def test_addon_stop_delete_host_error(
await docker_addon.stop()
capture_exception.assert_called_once_with(err)
TEST_DEV_PATH = "/dev/ttyAMA0"
TEST_SYSFS_PATH = "/sys/devices/platform/soc/ffe09000.usb/ff500000.usb/xhci-hcd.0.auto/usb1/1-1/1-1.1/1-1.1:1.0/tty/ttyACM0"
TEST_HW_DEVICE = Device(
name="ttyACM0",
path=Path("/dev/ttyAMA0"),
sysfs=Path(
"/sys/devices/platform/soc/ffe09000.usb/ff500000.usb/xhci-hcd.0.auto/usb1/1-1/1-1.1/1-1.1:1.0/tty/ttyACM0"
),
subsystem="tty",
parent=Path(
"/sys/devices/platform/soc/ffe09000.usb/ff500000.usb/xhci-hcd.0.auto/usb1/1-1/1-1.1/1-1.1:1.0"
),
links=[
Path(
"/dev/serial/by-id/usb-Texas_Instruments_TI_CC2531_USB_CDC___0X0123456789ABCDEF-if00"
),
Path("/dev/serial/by-path/platform-xhci-hcd.0.auto-usb-0:1.1:1.0"),
Path("/dev/serial/by-path/platform-xhci-hcd.0.auto-usbv2-0:1.1:1.0"),
],
attributes={},
children=[],
)
@pytest.mark.usefixtures("path_extern")
@pytest.mark.parametrize(
("dev_path", "cgroup", "is_os"),
[
(TEST_DEV_PATH, "1", True),
(TEST_SYSFS_PATH, "1", True),
(TEST_DEV_PATH, "1", False),
(TEST_SYSFS_PATH, "1", False),
(TEST_DEV_PATH, "2", True),
(TEST_SYSFS_PATH, "2", True),
],
)
async def test_addon_new_device(
coresys: CoreSys,
install_addon_ssh: Addon,
container: MagicMock,
docker: DockerAPI,
dev_path: str,
cgroup: str,
is_os: bool,
):
"""Test new device that is listed in static devices."""
coresys.hardware.disk.get_disk_free_space = lambda x: 5000
install_addon_ssh.data["devices"] = [dev_path]
container.id = 123
docker.info.cgroup = cgroup
with (
patch.object(Addon, "write_options"),
patch.object(OSManager, "available", new=PropertyMock(return_value=is_os)),
patch.object(CGroup, "add_devices_allowed") as add_devices,
):
await install_addon_ssh.start()
coresys.bus.fire_event(
BusEvent.HARDWARE_NEW_DEVICE,
TEST_HW_DEVICE,
)
await asyncio.sleep(0.01)
add_devices.assert_called_once_with(123, "c 0:0 rwm")
@pytest.mark.usefixtures("path_extern")
@pytest.mark.parametrize("dev_path", [TEST_DEV_PATH, TEST_SYSFS_PATH])
async def test_addon_new_device_no_haos(
coresys: CoreSys,
install_addon_ssh: Addon,
docker: DockerAPI,
dev_path: str,
):
"""Test new device that is listed in static devices on non HAOS system with CGroup V2."""
coresys.hardware.disk.get_disk_free_space = lambda x: 5000
install_addon_ssh.data["devices"] = [dev_path]
docker.info.cgroup = "2"
with (
patch.object(Addon, "write_options"),
patch.object(OSManager, "available", new=PropertyMock(return_value=False)),
patch.object(CGroup, "add_devices_allowed") as add_devices,
):
await install_addon_ssh.start()
coresys.bus.fire_event(
BusEvent.HARDWARE_NEW_DEVICE,
TEST_HW_DEVICE,
)
await asyncio.sleep(0.01)
add_devices.assert_not_called()
# Issue added with hardware event since access cannot be added dynamically
assert install_addon_ssh.device_access_missing_issue in coresys.resolution.issues
assert (
Suggestion(
SuggestionType.EXECUTE_RESTART, ContextType.ADDON, reference="local_ssh"
)
in coresys.resolution.suggestions
)
# Stopping and removing the container clears it as access granted on next start
await install_addon_ssh.stop()
assert coresys.resolution.issues == []
assert coresys.resolution.suggestions == []

View File

@ -26,7 +26,7 @@ async def test_evaluation(coresys: CoreSys):
coresys.docker.info.cgroup = CGROUP_V2_VERSION
await cgroup_version()
assert cgroup_version.reason in coresys.resolution.unsupported
assert cgroup_version.reason not in coresys.resolution.unsupported
coresys.resolution.unsupported.clear()
coresys.docker.info.cgroup = CGROUP_V1_VERSION

View File

@ -0,0 +1,120 @@
"""Test fixup addon execute restart."""
from unittest.mock import patch
import pytest
from supervisor.addons.addon import Addon
from supervisor.const import AddonState
from supervisor.coresys import CoreSys
from supervisor.docker.addon import DockerAddon
from supervisor.docker.interface import DockerInterface
from supervisor.exceptions import DockerError
from supervisor.resolution.const import ContextType, IssueType, SuggestionType
from supervisor.resolution.data import Issue, Suggestion
from supervisor.resolution.fixups.addon_execute_restart import FixupAddonExecuteRestart
from tests.const import TEST_ADDON_SLUG
DEVICE_ACCESS_MISSING_ISSUE = Issue(
IssueType.DEVICE_ACCESS_MISSING,
ContextType.ADDON,
reference=TEST_ADDON_SLUG,
)
EXECUTE_RESTART_SUGGESTION = Suggestion(
SuggestionType.EXECUTE_RESTART, ContextType.ADDON, reference="local_ssh"
)
@pytest.mark.usefixtures("path_extern")
async def test_fixup(coresys: CoreSys, install_addon_ssh: Addon):
"""Test fixup restarts addon."""
install_addon_ssh.state = AddonState.STARTED
addon_execute_restart = FixupAddonExecuteRestart(coresys)
assert addon_execute_restart.auto is False
async def mock_stop(*args, **kwargs):
install_addon_ssh.state = AddonState.STOPPED
coresys.resolution.add_issue(
DEVICE_ACCESS_MISSING_ISSUE,
suggestions=[SuggestionType.EXECUTE_RESTART],
)
with (
patch.object(DockerInterface, "stop") as stop,
patch.object(DockerAddon, "run") as run,
patch.object(Addon, "_wait_for_startup"),
patch.object(Addon, "write_options"),
):
await addon_execute_restart()
stop.assert_called_once()
run.assert_called_once()
assert not coresys.resolution.issues
assert not coresys.resolution.suggestions
@pytest.mark.usefixtures("path_extern")
async def test_fixup_stop_error(
coresys: CoreSys, install_addon_ssh: Addon, caplog: pytest.LogCaptureFixture
):
"""Test fixup fails on stop addon failure."""
install_addon_ssh.state = AddonState.STARTED
addon_execute_start = FixupAddonExecuteRestart(coresys)
coresys.resolution.add_issue(
DEVICE_ACCESS_MISSING_ISSUE,
suggestions=[SuggestionType.EXECUTE_RESTART],
)
with (
patch.object(DockerInterface, "stop", side_effect=DockerError),
patch.object(DockerAddon, "run") as run,
):
await addon_execute_start()
run.assert_not_called()
assert DEVICE_ACCESS_MISSING_ISSUE in coresys.resolution.issues
assert EXECUTE_RESTART_SUGGESTION in coresys.resolution.suggestions
assert "Could not stop local_ssh" in caplog.text
@pytest.mark.usefixtures("path_extern")
async def test_fixup_start_error(
coresys: CoreSys, install_addon_ssh: Addon, caplog: pytest.LogCaptureFixture
):
"""Test fixup logs a start addon failure."""
install_addon_ssh.state = AddonState.STARTED
addon_execute_start = FixupAddonExecuteRestart(coresys)
coresys.resolution.add_issue(
DEVICE_ACCESS_MISSING_ISSUE,
suggestions=[SuggestionType.EXECUTE_RESTART],
)
with (
patch.object(DockerInterface, "stop") as stop,
patch.object(DockerAddon, "run", side_effect=DockerError),
patch.object(Addon, "write_options"),
):
await addon_execute_start()
stop.assert_called_once()
assert DEVICE_ACCESS_MISSING_ISSUE not in coresys.resolution.issues
assert EXECUTE_RESTART_SUGGESTION not in coresys.resolution.suggestions
assert "Could not restart local_ssh" in caplog.text
async def test_fixup_no_addon(coresys: CoreSys, caplog: pytest.LogCaptureFixture):
"""Test fixup dismisses if addon is missing."""
addon_execute_start = FixupAddonExecuteRestart(coresys)
coresys.resolution.add_issue(
DEVICE_ACCESS_MISSING_ISSUE,
suggestions=[SuggestionType.EXECUTE_RESTART],
)
with patch.object(DockerAddon, "stop") as stop:
await addon_execute_start()
stop.assert_not_called()
assert not coresys.resolution.issues
assert not coresys.resolution.suggestions
assert "Cannot restart addon local_ssh as it does not exist" in caplog.text