Set limits on watchdog retries (#3779)

* Set limits on watchdog retries

* Use relative import
Mike Degatano 2022-08-09 11:44:35 -04:00 committed by GitHub
parent a92058e6fc
commit e62324e43f
15 changed files with 371 additions and 129 deletions
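In outline, the change caps every watchdog restart loop at WATCHDOG_MAX_ATTEMPTS and rate-limits the whole handler through a new THROTTLE_RATE_LIMIT mode on the Job decorator. The following is a minimal, self-contained sketch of the bounded-retry loop shape used throughout the diff below; it uses plain asyncio with made-up names (restart_with_limit, still_broken), not the Supervisor API.

# Illustrative only: a stripped-down version of the bounded retry loop this
# commit gives the watchdogs (restart_with_limit / still_broken are made up).
import asyncio

MAX_ATTEMPTS = 5      # mirrors WATCHDOG_MAX_ATTEMPTS
RETRY_SECONDS = 0.01  # mirrors WATCHDOG_RETRY_SECONDS, shortened for the demo


async def restart_with_limit(restart, still_broken):
    """Keep restarting while the container stays in the bad state, up to a cap."""
    attempts = 0
    while await still_broken():
        try:
            await restart()
        except RuntimeError:
            attempts += 1
        else:
            break

        if attempts >= MAX_ATTEMPTS:
            print(f"giving up after {attempts} failed attempts")
            break

        await asyncio.sleep(RETRY_SECONDS)


async def main():
    async def still_broken() -> bool:
        return True  # pretend the container never recovers

    async def restart() -> None:
        raise RuntimeError("restart failed")  # pretend every restart fails

    await restart_with_limit(restart, still_broken)


asyncio.run(main())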

View File

@@ -58,6 +58,7 @@ from ..docker.stats import DockerStats
 from ..exceptions import (
     AddonConfigurationError,
     AddonsError,
+    AddonsJobError,
     AddonsNotSupportedError,
     ConfigurationFileError,
     DockerError,
@@ -65,10 +66,18 @@ from ..exceptions import (
 )
 from ..hardware.data import Device
 from ..homeassistant.const import WSEvent, WSType
+from ..jobs.const import JobExecutionLimit
+from ..jobs.decorator import Job
 from ..utils import check_port
 from ..utils.apparmor import adjust_profile
 from ..utils.json import read_json_file, write_json_file
-from .const import WATCHDOG_RETRY_SECONDS, AddonBackupMode
+from .const import (
+    WATCHDOG_MAX_ATTEMPTS,
+    WATCHDOG_RETRY_SECONDS,
+    WATCHDOG_THROTTLE_MAX_CALLS,
+    WATCHDOG_THROTTLE_PERIOD,
+    AddonBackupMode,
+)
 from .model import AddonModel, Data
 from .options import AddonOptions
 from .utils import remove_data
@@ -104,6 +113,54 @@ class Addon(AddonModel):
         self.instance: DockerAddon = DockerAddon(coresys, self)
         self._state: AddonState = AddonState.UNKNOWN
 
+        @Job(
+            name=f"addon_{slug}_restart_after_problem",
+            limit=JobExecutionLimit.THROTTLE_RATE_LIMIT,
+            throttle_period=WATCHDOG_THROTTLE_PERIOD,
+            throttle_max_calls=WATCHDOG_THROTTLE_MAX_CALLS,
+            on_condition=AddonsJobError,
+        )
+        async def restart_after_problem(addon: Addon, state: ContainerState):
+            """Restart unhealthy or failed addon."""
+            attempts = 0
+            while await addon.instance.current_state() == state:
+                if not addon.in_progress:
+                    _LOGGER.warning(
+                        "Watchdog found addon %s is %s, restarting...",
+                        addon.name,
+                        state.value,
+                    )
+                    try:
+                        if state == ContainerState.FAILED:
+                            # Ensure failed container is removed before attempting reanimation
+                            if attempts == 0:
+                                with suppress(DockerError):
+                                    await addon.instance.stop(remove_container=True)
+
+                            await addon.start()
+                        else:
+                            await addon.restart()
+                    except AddonsError as err:
+                        attempts = attempts + 1
+                        _LOGGER.error(
+                            "Watchdog restart of addon %s failed!", addon.name
+                        )
+                        addon.sys_capture_exception(err)
+                    else:
+                        break
+
+                if attempts >= WATCHDOG_MAX_ATTEMPTS:
+                    _LOGGER.critical(
+                        "Watchdog cannot restart addon %s, failed all %s attempts",
+                        addon.name,
+                        attempts,
+                    )
+                    break
+
+                await asyncio.sleep(WATCHDOG_RETRY_SECONDS)
+
+        self._restart_after_problem = restart_after_problem
+
     def __repr__(self) -> str:
         """Return internal representation."""
         return f"<Addon: {self.slug}>"
@ -904,40 +961,5 @@ class Addon(AddonModel):
         if not (event.name == self.instance.name and self.watchdog):
             return
 
-        if event.state == ContainerState.UNHEALTHY:
-            while await self.instance.current_state() == event.state:
-                if not self.in_progress:
-                    _LOGGER.warning(
-                        "Watchdog found addon %s is unhealthy, restarting...", self.name
-                    )
-                    try:
-                        await self.restart()
-                    except AddonsError as err:
-                        _LOGGER.error("Watchdog restart of addon %s failed!", self.name)
-                        self.sys_capture_exception(err)
-                    else:
-                        break
-
-                await asyncio.sleep(WATCHDOG_RETRY_SECONDS)
-
-        elif event.state == ContainerState.FAILED:
-            # Ensure failed container is removed before attempting reanimation
-            with suppress(DockerError):
-                await self.instance.stop(remove_container=True)
-
-            while await self.instance.current_state() == event.state:
-                if not self.in_progress:
-                    _LOGGER.warning(
-                        "Watchdog found addon %s failed, restarting...", self.name
-                    )
-                    try:
-                        await self.start()
-                    except AddonsError as err:
-                        _LOGGER.error(
-                            "Watchdog reanimation of addon %s failed!", self.name
-                        )
-                        self.sys_capture_exception(err)
-                    else:
-                        break
-
-                await asyncio.sleep(WATCHDOG_RETRY_SECONDS)
+        if event.state in [ContainerState.FAILED, ContainerState.UNHEALTHY]:
+            await self._restart_after_problem(self, event.state)

View File

@@ -1,4 +1,5 @@
 """Add-on static data."""
+from datetime import timedelta
 from enum import Enum
@@ -12,3 +13,6 @@ class AddonBackupMode(str, Enum):
 ATTR_BACKUP = "backup"
 ATTR_CODENOTARY = "codenotary"
 WATCHDOG_RETRY_SECONDS = 10
+WATCHDOG_MAX_ATTEMPTS = 5
+WATCHDOG_THROTTLE_PERIOD = timedelta(minutes=30)
+WATCHDOG_THROTTLE_MAX_CALLS = 10
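Read together, and assuming the constants are applied as in the add-on watchdog above, these numbers bound the restart loop fairly tightly. The arithmetic below is illustrative only and ignores the time each restart itself takes.

from datetime import timedelta

WATCHDOG_RETRY_SECONDS = 10
WATCHDOG_MAX_ATTEMPTS = 5
WATCHDOG_THROTTLE_PERIOD = timedelta(minutes=30)
WATCHDOG_THROTTLE_MAX_CALLS = 10

# One watchdog invocation retries at most 5 times, ~10 s apart:
retry_time_per_invocation = WATCHDOG_MAX_ATTEMPTS * WATCHDOG_RETRY_SECONDS  # 50 s

# The rate limit then allows at most 10 invocations per 30-minute window,
# so at most 50 failed restart attempts per window before the job raises.
attempts_per_window = WATCHDOG_THROTTLE_MAX_CALLS * WATCHDOG_MAX_ATTEMPTS  # 50

print(retry_time_per_invocation, attempts_per_window)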

View File

@@ -122,6 +122,10 @@ class PluginError(HassioError):
     """Plugin error."""
 
 
+class PluginJobError(PluginError, JobException):
+    """Raise on job error with plugin."""
+
+
 # HaCli
@@ -133,6 +137,10 @@ class CliUpdateError(CliError):
     """Error on update of a HA cli."""
 
 
+class CliJobError(CliError, PluginJobError):
+    """Raise on job error with cli plugin."""
+
+
 # Observer
@@ -144,6 +152,10 @@ class ObserverUpdateError(ObserverError):
     """Error on update of a Observer."""
 
 
+class ObserverJobError(ObserverError, PluginJobError):
+    """Raise on job error with observer plugin."""
+
+
 # Multicast
@@ -155,6 +167,10 @@ class MulticastUpdateError(MulticastError):
     """Error on update of a multicast."""
 
 
+class MulticastJobError(MulticastError, PluginJobError):
+    """Raise on job error with multicast plugin."""
+
+
 # DNS
@@ -166,6 +182,10 @@ class CoreDNSUpdateError(CoreDNSError):
     """Error on update of a CoreDNS."""
 
 
+class CoreDNSJobError(CoreDNSError, PluginJobError):
+    """Raise on job error with dns plugin."""
+
+
 # Audio
@@ -177,6 +197,10 @@ class AudioUpdateError(AudioError):
     """Error on update of a Audio."""
 
 
+class AudioJobError(AudioError, PluginJobError):
+    """Raise on job error with audio plugin."""
+
+
 # Addons

View File

@@ -1,4 +1,5 @@
 """Constants for homeassistant."""
+from datetime import timedelta
 from enum import Enum
 
 from awesomeversion import AwesomeVersion
@@ -7,6 +8,9 @@ from ..const import CoreState
 LANDINGPAGE: AwesomeVersion = AwesomeVersion("landingpage")
 
 WATCHDOG_RETRY_SECONDS = 10
+WATCHDOG_MAX_ATTEMPTS = 5
+WATCHDOG_THROTTLE_PERIOD = timedelta(minutes=30)
+WATCHDOG_THROTTLE_MAX_CALLS = 10
 
 CLOSING_STATES = [
     CoreState.SHUTDOWN,

View File

@@ -13,6 +13,7 @@ from awesomeversion import AwesomeVersion
 from supervisor.const import ATTR_HOMEASSISTANT, BusEvent
 from supervisor.docker.const import ContainerState
 from supervisor.docker.monitor import DockerContainerStateEvent
+from supervisor.jobs.const import JobExecutionLimit
 
 from ..coresys import CoreSys, CoreSysAttributes
 from ..docker.homeassistant import DockerHomeAssistant
@@ -27,7 +28,13 @@ from ..exceptions import (
 from ..jobs.decorator import Job, JobCondition
 from ..resolution.const import ContextType, IssueType
 from ..utils import convert_to_ascii, process_lock
-from .const import LANDINGPAGE, WATCHDOG_RETRY_SECONDS
+from .const import (
+    LANDINGPAGE,
+    WATCHDOG_MAX_ATTEMPTS,
+    WATCHDOG_RETRY_SECONDS,
+    WATCHDOG_THROTTLE_MAX_CALLS,
+    WATCHDOG_THROTTLE_PERIOD,
+)
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -444,46 +451,48 @@ class HomeAssistantCore(CoreSysAttributes):
         if not (event.name == self.instance.name and self.sys_homeassistant.watchdog):
             return
 
-        if event.state == ContainerState.UNHEALTHY:
-            while await self.instance.current_state() == event.state:
-                # Don't interrupt a task in progress or if rollback is handling it
-                if not (self.in_progress or self.error_state):
-                    _LOGGER.warning(
-                        "Watchdog found Home Assistant is unhealthy, restarting..."
-                    )
-                    try:
-                        await self.restart()
-                    except HomeAssistantError as err:
-                        _LOGGER.error("Watchdog restart of Home Assistant failed!")
-                        self.sys_capture_exception(err)
-                    else:
-                        break
-
-                await asyncio.sleep(WATCHDOG_RETRY_SECONDS)
-
-        elif event.state == ContainerState.FAILED:
-            rebuild = False
-            while await self.instance.current_state() == event.state:
-                # Don't interrupt a task in progress or if rollback is handling it
-                if not (self.in_progress or self.error_state):
-                    _LOGGER.warning(
-                        "Watchdog found Home Assistant failed, restarting..."
-                    )
-                    if not rebuild:
-                        try:
-                            await self.start()
-                        except HomeAssistantError as err:
-                            self.sys_capture_exception(err)
-                            rebuild = True
-                        else:
-                            break
-
-                    try:
-                        await self.rebuild()
-                    except HomeAssistantError as err:
-                        _LOGGER.error("Watchdog reanimation of Home Assistant failed!")
-                        self.sys_capture_exception(err)
-                    else:
-                        break
-
-                await asyncio.sleep(WATCHDOG_RETRY_SECONDS)
+        if event.state in [ContainerState.FAILED, ContainerState.UNHEALTHY]:
+            await self._restart_after_problem(event.state)
+
+    @Job(
+        limit=JobExecutionLimit.THROTTLE_RATE_LIMIT,
+        throttle_period=WATCHDOG_THROTTLE_PERIOD,
+        throttle_max_calls=WATCHDOG_THROTTLE_MAX_CALLS,
+    )
+    async def _restart_after_problem(self, state: ContainerState):
+        """Restart unhealthy or failed Home Assistant."""
+        attempts = 0
+        while await self.instance.current_state() == state:
+            # Don't interrupt a task in progress or if rollback is handling it
+            if not (self.in_progress or self.error_state):
+                _LOGGER.warning(
+                    "Watchdog found Home Assistant %s, restarting...", state.value
+                )
+                if state == ContainerState.FAILED and attempts == 0:
+                    try:
+                        await self.start()
+                    except HomeAssistantError as err:
+                        self.sys_capture_exception(err)
+                    else:
+                        break
+
+                try:
+                    if state == ContainerState.FAILED:
+                        await self.rebuild()
+                    else:
+                        await self.restart()
+                except HomeAssistantError as err:
+                    attempts = attempts + 1
+                    _LOGGER.error("Watchdog restart of Home Assistant failed!")
+                    self.sys_capture_exception(err)
+                else:
+                    break
+
+            if attempts >= WATCHDOG_MAX_ATTEMPTS:
+                _LOGGER.critical(
+                    "Watchdog cannot restart Home Assistant, failed all %s attempts",
+                    attempts,
+                )
+                break
+
+            await asyncio.sleep(WATCHDOG_RETRY_SECONDS)

View File

@@ -30,3 +30,4 @@ class JobExecutionLimit(str, Enum):
     ONCE = "once"
     THROTTLE = "throttle"
     THROTTLE_WAIT = "throttle_wait"
+    THROTTLE_RATE_LIMIT = "throttle_rate_limit"

View File

@@ -28,6 +28,7 @@ class Job(CoreSysAttributes):
         on_condition: Optional[JobException] = None,
         limit: Optional[JobExecutionLimit] = None,
         throttle_period: Optional[timedelta] = None,
+        throttle_max_calls: Optional[int] = None,
     ):
         """Initialize the Job class."""
         self.name = name
@ -36,17 +37,30 @@ class Job(CoreSysAttributes):
         self.on_condition = on_condition
         self.limit = limit
         self.throttle_period = throttle_period
+        self.throttle_max_calls = throttle_max_calls
         self._lock: Optional[asyncio.Semaphore] = None
         self._method = None
         self._last_call = datetime.min
+        self._rate_limited_calls: Optional[list[datetime]] = None
 
         # Validate Options
         if (
-            self.limit in (JobExecutionLimit.THROTTLE, JobExecutionLimit.THROTTLE_WAIT)
+            self.limit
+            in (
+                JobExecutionLimit.THROTTLE,
+                JobExecutionLimit.THROTTLE_WAIT,
+                JobExecutionLimit.THROTTLE_RATE_LIMIT,
+            )
             and self.throttle_period is None
         ):
             raise RuntimeError("Using Job without a Throttle period!")
 
+        if self.limit == JobExecutionLimit.THROTTLE_RATE_LIMIT:
+            if self.throttle_max_calls is None:
+                raise RuntimeError("Using rate limit without throttle max calls!")
+
+            self._rate_limited_calls = []
+
     def _post_init(self, args: tuple[Any]) -> None:
         """Runtime init."""
         if self.name is None:
@ -99,10 +113,29 @@ class Job(CoreSysAttributes):
                 if time_since_last_call < self.throttle_period:
                     self._release_exception_limits()
                     return
+            elif self.limit == JobExecutionLimit.THROTTLE_RATE_LIMIT:
+                # Only reprocess array when necessary (at limit)
+                if len(self._rate_limited_calls) >= self.throttle_max_calls:
+                    self._rate_limited_calls = [
+                        call
+                        for call in self._rate_limited_calls
+                        if call > datetime.now() - self.throttle_period
+                    ]
+
+                if len(self._rate_limited_calls) >= self.throttle_max_calls:
+                    on_condition = (
+                        JobException if self.on_condition is None else self.on_condition
+                    )
+                    raise on_condition(
+                        f"Rate limit exceeded, more then {self.throttle_max_calls} calls in {self.throttle_period}",
+                    )
 
             # Execute Job
             try:
                 self._last_call = datetime.now()
+                if self._rate_limited_calls is not None:
+                    self._rate_limited_calls.append(self._last_call)
+
                 return await self._method(*args, **kwargs)
             except HassioError as err:
                 raise err
@ -207,7 +240,10 @@ class Job(CoreSysAttributes):
             return
 
         if self.limit == JobExecutionLimit.ONCE and self._lock.locked():
-            raise self.on_condition("Another job is running")
+            on_condition = (
+                JobException if self.on_condition is None else self.on_condition
+            )
+            raise on_condition("Another job is running")
 
         await self._lock.acquire()
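The new limit is a sliding window over call timestamps: the list is only pruned once it reaches throttle_max_calls, and a call that would still exceed the window raises the job's on_condition (JobException by default). Below is a stand-alone sketch of the same bookkeeping outside the Job class, with illustrative names (SlidingWindowRateLimit is not part of Supervisor).

from datetime import datetime, timedelta


class SlidingWindowRateLimit:
    """Stand-alone illustration of the THROTTLE_RATE_LIMIT bookkeeping."""

    def __init__(self, max_calls: int, period: timedelta) -> None:
        self.max_calls = max_calls
        self.period = period
        self._calls: list[datetime] = []

    def check(self) -> None:
        """Record a call, raising if the window is already full."""
        # Only reprocess the list when it has reached the limit, as the decorator does
        if len(self._calls) >= self.max_calls:
            cutoff = datetime.now() - self.period
            self._calls = [call for call in self._calls if call > cutoff]

        if len(self._calls) >= self.max_calls:
            raise RuntimeError(
                f"Rate limit exceeded, more than {self.max_calls} calls in {self.period}"
            )

        self._calls.append(datetime.now())


limiter = SlidingWindowRateLimit(max_calls=2, period=timedelta(hours=1))
limiter.check()
limiter.check()
try:
    limiter.check()  # the third call inside the window raises
except RuntimeError as err:
    print(err)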

View File

@@ -12,19 +12,28 @@ from typing import Optional
 from awesomeversion import AwesomeVersion
 import jinja2
 
+from supervisor.docker.const import ContainerState
+from supervisor.jobs.const import JobExecutionLimit
+from supervisor.jobs.decorator import Job
+
 from ..const import LogLevel
 from ..coresys import CoreSys
 from ..docker.audio import DockerAudio
 from ..docker.stats import DockerStats
 from ..exceptions import (
     AudioError,
+    AudioJobError,
     AudioUpdateError,
     ConfigurationFileError,
     DockerError,
 )
 from ..utils.json import write_json_file
 from .base import PluginBase
-from .const import FILE_HASSIO_AUDIO
+from .const import (
+    FILE_HASSIO_AUDIO,
+    WATCHDOG_THROTTLE_MAX_CALLS,
+    WATCHDOG_THROTTLE_PERIOD,
+)
 from .validate import SCHEMA_AUDIO_CONFIG
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -199,3 +208,13 @@ class PluginAudio(PluginBase):
             raise AudioError(
                 f"Can't update pulse audio config: {err}", _LOGGER.error
             ) from err
+
+    @Job(
+        limit=JobExecutionLimit.THROTTLE_RATE_LIMIT,
+        throttle_period=WATCHDOG_THROTTLE_PERIOD,
+        throttle_max_calls=WATCHDOG_THROTTLE_MAX_CALLS,
+        on_condition=AudioJobError,
+    )
+    async def _restart_after_problem(self, state: ContainerState):
+        """Restart unhealthy or failed plugin."""
+        return await super()._restart_after_problem(state)

View File

@@ -15,7 +15,7 @@ from ..coresys import CoreSysAttributes
 from ..docker.interface import DockerInterface
 from ..exceptions import DockerError, PluginError
 from ..utils.common import FileConfiguration
-from .const import WATCHDOG_RETRY_SECONDS
+from .const import WATCHDOG_MAX_ATTEMPTS, WATCHDOG_RETRY_SECONDS
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -105,46 +105,41 @@ class PluginBase(ABC, FileConfiguration, CoreSysAttributes):
         if not (event.name == self.instance.name):
             return
 
-        if event.state == ContainerState.UNHEALTHY:
-            while await self.instance.current_state() == event.state:
-                if not self.in_progress:
-                    _LOGGER.warning(
-                        "Watchdog found %s plugin is unhealthy, restarting...",
-                        self.slug,
-                    )
-                    try:
-                        await self.rebuild()
-                    except PluginError as err:
-                        _LOGGER.error(
-                            "Watchdog restart of %s plugin failed!", self.slug
-                        )
-                        self.sys_capture_exception(err)
-                    else:
-                        break
-
-                await asyncio.sleep(WATCHDOG_RETRY_SECONDS)
-
-        elif event.state in [ContainerState.FAILED, ContainerState.STOPPED]:
-            rebuild = event.state == ContainerState.FAILED
-            while await self.instance.current_state() == event.state:
-                if not self.in_progress:
-                    _LOGGER.warning(
-                        "Watchdog found %s plugin %s, restarting...",
-                        self.slug,
-                        event.state.value,
-                    )
-                    try:
-                        if rebuild:
-                            await self.rebuild()
-                        else:
-                            await self.start()
-                    except PluginError as err:
-                        _LOGGER.error(
-                            "Watchdog reanimation of %s plugin failed!", self.slug
-                        )
-                        self.sys_capture_exception(err)
-                        rebuild = True
-                    else:
-                        break
-
-                await asyncio.sleep(WATCHDOG_RETRY_SECONDS)
+        if event.state in [
+            ContainerState.FAILED,
+            ContainerState.STOPPED,
+            ContainerState.UNHEALTHY,
+        ]:
+            await self._restart_after_problem(event.state)
+
+    async def _restart_after_problem(self, state: ContainerState):
+        """Restart unhealthy or failed plugin."""
+        attempts = 0
+        while await self.instance.current_state() == state:
+            if not self.in_progress:
+                _LOGGER.warning(
+                    "Watchdog found %s plugin %s, restarting...",
+                    self.slug,
+                    state.value,
+                )
+                try:
+                    if state == ContainerState.STOPPED and attempts == 0:
+                        await self.start()
+                    else:
+                        await self.rebuild()
+                except PluginError as err:
+                    attempts = attempts + 1
+                    _LOGGER.error("Watchdog restart of %s plugin failed!", self.slug)
+                    self.sys_capture_exception(err)
+                else:
+                    break
+
+            if attempts >= WATCHDOG_MAX_ATTEMPTS:
+                _LOGGER.critical(
+                    "Watchdog cannot restart %s plugin, failed all %s attempts",
+                    self.slug,
+                    attempts,
+                )
+                break
+
+            await asyncio.sleep(WATCHDOG_RETRY_SECONDS)

View File

@@ -13,10 +13,17 @@ from awesomeversion import AwesomeVersion
 from ..const import ATTR_ACCESS_TOKEN
 from ..coresys import CoreSys
 from ..docker.cli import DockerCli
+from ..docker.const import ContainerState
 from ..docker.stats import DockerStats
-from ..exceptions import CliError, CliUpdateError, DockerError
+from ..exceptions import CliError, CliJobError, CliUpdateError, DockerError
+from ..jobs.const import JobExecutionLimit
+from ..jobs.decorator import Job
 from .base import PluginBase
-from .const import FILE_HASSIO_CLI
+from .const import (
+    FILE_HASSIO_CLI,
+    WATCHDOG_THROTTLE_MAX_CALLS,
+    WATCHDOG_THROTTLE_PERIOD,
+)
 from .validate import SCHEMA_CLI_CONFIG
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -136,3 +143,13 @@ class PluginCli(PluginBase):
         except DockerError as err:
             _LOGGER.error("Repair of HA cli failed")
             self.sys_capture_exception(err)
+
+    @Job(
+        limit=JobExecutionLimit.THROTTLE_RATE_LIMIT,
+        throttle_period=WATCHDOG_THROTTLE_PERIOD,
+        throttle_max_calls=WATCHDOG_THROTTLE_MAX_CALLS,
+        on_condition=CliJobError,
+    )
+    async def _restart_after_problem(self, state: ContainerState):
+        """Restart unhealthy or failed plugin."""
+        return await super()._restart_after_problem(state)

View File

@@ -1,4 +1,5 @@
 """Const for plugins."""
+from datetime import timedelta
 from pathlib import Path
 
 from ..const import SUPERVISOR_DATA
@@ -11,3 +12,6 @@ FILE_HASSIO_MULTICAST = Path(SUPERVISOR_DATA, "multicast.json")
 ATTR_FALLBACK = "fallback"
 
 WATCHDOG_RETRY_SECONDS = 10
+WATCHDOG_MAX_ATTEMPTS = 5
+WATCHDOG_THROTTLE_PERIOD = timedelta(minutes=30)
+WATCHDOG_THROTTLE_MAX_CALLS = 10

View File

@@ -17,6 +17,8 @@ import voluptuous as vol
 from supervisor.dbus.const import MulticastProtocolEnabled
 from supervisor.docker.const import ContainerState
 from supervisor.docker.monitor import DockerContainerStateEvent
+from supervisor.jobs.const import JobExecutionLimit
+from supervisor.jobs.decorator import Job
 
 from ..const import ATTR_SERVERS, DNS_SUFFIX, LogLevel
 from ..coresys import CoreSys
@@ -25,6 +27,7 @@ from ..docker.stats import DockerStats
 from ..exceptions import (
     ConfigurationFileError,
     CoreDNSError,
+    CoreDNSJobError,
     CoreDNSUpdateError,
     DockerError,
 )
@@ -32,7 +35,12 @@ from ..resolution.const import ContextType, IssueType, SuggestionType
 from ..utils.json import write_json_file
 from ..validate import dns_url
 from .base import PluginBase
-from .const import ATTR_FALLBACK, FILE_HASSIO_DNS
+from .const import (
+    ATTR_FALLBACK,
+    FILE_HASSIO_DNS,
+    WATCHDOG_THROTTLE_MAX_CALLS,
+    WATCHDOG_THROTTLE_PERIOD,
+)
 from .validate import SCHEMA_DNS_CONFIG
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -250,6 +258,16 @@ class PluginDns(PluginBase):
         return await super().watchdog_container(event)
 
+    @Job(
+        limit=JobExecutionLimit.THROTTLE_RATE_LIMIT,
+        throttle_period=WATCHDOG_THROTTLE_PERIOD,
+        throttle_max_calls=WATCHDOG_THROTTLE_MAX_CALLS,
+        on_condition=CoreDNSJobError,
+    )
+    async def _restart_after_problem(self, state: ContainerState):
+        """Restart unhealthy or failed plugin."""
+        return await super()._restart_after_problem(state)
+
     async def loop_detection(self) -> None:
         """Check if there was a loop found."""
         log = await self.instance.logs()

View File

@@ -9,12 +9,25 @@ from typing import Optional
 from awesomeversion import AwesomeVersion
 
+from supervisor.docker.const import ContainerState
+from supervisor.jobs.const import JobExecutionLimit
+from supervisor.jobs.decorator import Job
+
 from ..coresys import CoreSys
 from ..docker.multicast import DockerMulticast
 from ..docker.stats import DockerStats
-from ..exceptions import DockerError, MulticastError, MulticastUpdateError
+from ..exceptions import (
+    DockerError,
+    MulticastError,
+    MulticastJobError,
+    MulticastUpdateError,
+)
 from .base import PluginBase
-from .const import FILE_HASSIO_MULTICAST
+from .const import (
+    FILE_HASSIO_MULTICAST,
+    WATCHDOG_THROTTLE_MAX_CALLS,
+    WATCHDOG_THROTTLE_PERIOD,
+)
 from .validate import SCHEMA_MULTICAST_CONFIG
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -127,3 +140,13 @@ class PluginMulticast(PluginBase):
         except DockerError as err:
             _LOGGER.error("Repair of Multicast failed")
             self.sys_capture_exception(err)
+
+    @Job(
+        limit=JobExecutionLimit.THROTTLE_RATE_LIMIT,
+        throttle_period=WATCHDOG_THROTTLE_PERIOD,
+        throttle_max_calls=WATCHDOG_THROTTLE_MAX_CALLS,
+        on_condition=MulticastJobError,
+    )
+    async def _restart_after_problem(self, state: ContainerState):
+        """Restart unhealthy or failed plugin."""
+        return await super()._restart_after_problem(state)

View File

@@ -13,11 +13,23 @@ from awesomeversion import AwesomeVersion
 from ..const import ATTR_ACCESS_TOKEN
 from ..coresys import CoreSys
+from ..docker.const import ContainerState
 from ..docker.observer import DockerObserver
 from ..docker.stats import DockerStats
-from ..exceptions import DockerError, ObserverError, ObserverUpdateError
+from ..exceptions import (
+    DockerError,
+    ObserverError,
+    ObserverJobError,
+    ObserverUpdateError,
+)
+from ..jobs.const import JobExecutionLimit
+from ..jobs.decorator import Job
 from .base import PluginBase
-from .const import FILE_HASSIO_OBSERVER
+from .const import (
+    FILE_HASSIO_OBSERVER,
+    WATCHDOG_THROTTLE_MAX_CALLS,
+    WATCHDOG_THROTTLE_PERIOD,
+)
 from .validate import SCHEMA_OBSERVER_CONFIG
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
@@ -137,3 +149,13 @@ class PluginObserver(PluginBase):
         except DockerError as err:
             _LOGGER.error("Repair of HA observer failed")
             self.sys_capture_exception(err)
+
+    @Job(
+        limit=JobExecutionLimit.THROTTLE_RATE_LIMIT,
+        throttle_period=WATCHDOG_THROTTLE_PERIOD,
+        throttle_max_calls=WATCHDOG_THROTTLE_MAX_CALLS,
+        on_condition=ObserverJobError,
+    )
+    async def _restart_after_problem(self, state: ContainerState):
+        """Restart unhealthy or failed plugin."""
+        return await super()._restart_after_problem(state)

View File

@ -2,16 +2,19 @@
 # pylint: disable=protected-access,import-error
 import asyncio
 from datetime import timedelta
+from typing import Optional
 from unittest.mock import PropertyMock, patch
 
 import pytest
+import time_machine
 
 from supervisor.const import CoreState
 from supervisor.coresys import CoreSys
-from supervisor.exceptions import HassioError, JobException
+from supervisor.exceptions import HassioError, JobException, PluginJobError
 from supervisor.jobs.const import JobExecutionLimit
 from supervisor.jobs.decorator import Job, JobCondition
 from supervisor.resolution.const import UnhealthyReason
+from supervisor.utils.dt import utcnow
 
 
 async def test_healthy(coresys: CoreSys):
@ -220,7 +223,7 @@ async def test_ignore_conditions(coresys: CoreSys):
 async def test_exception_conditions(coresys: CoreSys):
-    """Test the ignore conditions decorator."""
+    """Test the on condition decorator."""
 
     class TestClass:
         """Test class."""
@ -247,7 +250,7 @@ async def test_exception_conditions(coresys: CoreSys):
 async def test_exectution_limit_single_wait(
     coresys: CoreSys, loop: asyncio.BaseEventLoop
 ):
-    """Test the ignore conditions decorator."""
+    """Test the single wait job execution limit."""
 
     class TestClass:
         """Test class."""
@ -269,10 +272,10 @@ async def test_exectution_limit_single_wait(
     await asyncio.gather(*[test.execute(0.1), test.execute(0.1), test.execute(0.1)])
 
 
-async def test_exectution_limit_throttle_wait(
+async def test_execution_limit_throttle_wait(
     coresys: CoreSys, loop: asyncio.BaseEventLoop
 ):
-    """Test the ignore conditions decorator."""
+    """Test the throttle wait job execution limit."""
 
     class TestClass:
         """Test class."""
@ -300,6 +303,47 @@ async def test_exectution_limit_throttle_wait(
     assert test.call == 1
 
 
+@pytest.mark.parametrize("error", [None, PluginJobError])
+async def test_execution_limit_throttle_rate_limit(
+    coresys: CoreSys, loop: asyncio.BaseEventLoop, error: Optional[JobException]
+):
+    """Test the throttle wait job execution limit."""
+
+    class TestClass:
+        """Test class."""
+
+        def __init__(self, coresys: CoreSys):
+            """Initialize the test class."""
+            self.coresys = coresys
+            self.run = asyncio.Lock()
+            self.call = 0
+
+        @Job(
+            limit=JobExecutionLimit.THROTTLE_RATE_LIMIT,
+            throttle_period=timedelta(hours=1),
+            throttle_max_calls=2,
+            on_condition=error,
+        )
+        async def execute(self):
+            """Execute the class method."""
+            self.call += 1
+
+    test = TestClass(coresys)
+    await asyncio.gather(*[test.execute(), test.execute()])
+    assert test.call == 2
+
+    with pytest.raises(JobException if error is None else error):
+        await test.execute()
+
+    assert test.call == 2
+
+    with time_machine.travel(utcnow() + timedelta(hours=1)):
+        await test.execute()
+
+    assert test.call == 3
+
+
 async def test_exectution_limit_throttle(coresys: CoreSys, loop: asyncio.BaseEventLoop):
     """Test the ignore conditions decorator."""