From cad14bf46e1eae82fcc01a9101d8423825be95c7 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Thu, 7 Aug 2025 09:40:03 +0200 Subject: [PATCH] Handle disks with non-existing SMART attributes (#6077) Not all disks have all SMART attributes available, e.g. Sentry showed devices with missing "wctemp". In practice, any SMART attribute could be missing. Make sure we handle this gracefully. --- supervisor/dbus/udisks2/nvme_controller.py | 68 ++++++++++--------- supervisor/hardware/disk.py | 7 ++ tests/dbus/udisks2/test_nvme_controller.py | 30 ++++++++ .../udisks2_nvme_controller.py | 5 ++ tests/hardware/test_disk.py | 13 ++++ 5 files changed, 92 insertions(+), 31 deletions(-) diff --git a/supervisor/dbus/udisks2/nvme_controller.py b/supervisor/dbus/udisks2/nvme_controller.py index b58486b9b..52b27111c 100644 --- a/supervisor/dbus/udisks2/nvme_controller.py +++ b/supervisor/dbus/udisks2/nvme_controller.py @@ -34,43 +34,49 @@ class SmartStatus: """Smart status information for NVMe devices. https://storaged.org/doc/udisks2-api/latest/gdbus-org.freedesktop.UDisks2.NVMe.Controller.html#gdbus-method-org-freedesktop-UDisks2-NVMe-Controller.SmartGetAttributes + + All attributes are optional as their presence depends on the specific NVMe drive, + firmware version, and vendor implementation. """ - available_spare: int - spare_threshold: int - percent_used: int - total_data_read: int - total_data_written: int - controller_busy_minutes: int - power_cycles: int - unsafe_shutdowns: int - media_errors: int - number_error_log_entries: int - temperature_sensors: list[int] - warning_composite_temperature: int - critical_composite_temperature: int - warning_temperature_minutes: int - critical_temperature_minutes: int + available_spare: int | None + spare_threshold: int | None + percent_used: int | None + total_data_read: int | None + total_data_written: int | None + controller_busy_minutes: int | None + power_cycles: int | None + unsafe_shutdowns: int | None + media_errors: int | None + number_error_log_entries: int | None + temperature_sensors: list[int] | None + warning_composite_temperature: int | None + critical_composite_temperature: int | None + warning_temperature_minutes: int | None + critical_temperature_minutes: int | None @classmethod def from_smart_get_attributes_resp(cls, resp: dict[str, Any]): - """Convert SmartGetAttributes response dictionary to instance.""" + """Convert SmartGetAttributes response dictionary to instance. + + Safely handles missing attributes as they are vendor/drive dependent. + """ return cls( - available_spare=resp["avail_spare"], - spare_threshold=resp["spare_thresh"], - percent_used=resp["percent_used"], - total_data_read=resp["total_data_read"], - total_data_written=resp["total_data_written"], - controller_busy_minutes=resp["ctrl_busy_time"], - power_cycles=resp["power_cycles"], - unsafe_shutdowns=resp["unsafe_shutdowns"], - media_errors=resp["media_errors"], - number_error_log_entries=resp["num_err_log_entries"], - temperature_sensors=resp["temp_sensors"], - warning_composite_temperature=resp["wctemp"], - critical_composite_temperature=resp["cctemp"], - warning_temperature_minutes=resp["warning_temp_time"], - critical_temperature_minutes=resp["critical_temp_time"], + available_spare=resp.get("avail_spare"), + spare_threshold=resp.get("spare_thresh"), + percent_used=resp.get("percent_used"), + total_data_read=resp.get("total_data_read"), + total_data_written=resp.get("total_data_written"), + controller_busy_minutes=resp.get("ctrl_busy_time"), + power_cycles=resp.get("power_cycles"), + unsafe_shutdowns=resp.get("unsafe_shutdowns"), + media_errors=resp.get("media_errors"), + number_error_log_entries=resp.get("num_err_log_entries"), + temperature_sensors=resp.get("temp_sensors"), + warning_composite_temperature=resp.get("wctemp"), + critical_composite_temperature=resp.get("cctemp"), + warning_temperature_minutes=resp.get("warning_temp_time"), + critical_temperature_minutes=resp.get("critical_temp_time"), ) diff --git a/supervisor/hardware/disk.py b/supervisor/hardware/disk.py index 3b5158c7e..bbe3d31b7 100644 --- a/supervisor/hardware/disk.py +++ b/supervisor/hardware/disk.py @@ -141,6 +141,13 @@ class HwDisk(CoreSysAttributes): ) return None + # Check if percent_used is available (vendor/drive dependent) + if smart_log.percent_used is None: + _LOGGER.debug( + "NVMe controller %s does not provide percent_used attribute", drive.id + ) + return None + # UDisks2 documentation specifies that value can exceed 100 if smart_log.percent_used >= 100: _LOGGER.warning( diff --git a/tests/dbus/udisks2/test_nvme_controller.py b/tests/dbus/udisks2/test_nvme_controller.py index a06f99b61..acdc732ae 100644 --- a/tests/dbus/udisks2/test_nvme_controller.py +++ b/tests/dbus/udisks2/test_nvme_controller.py @@ -70,3 +70,33 @@ async def test_nvme_controller_smart_get_attributes(dbus_session_bus: MessageBus assert smart_log.total_data_written == 27723431936000 assert smart_log.controller_busy_minutes == 2682 assert smart_log.temperature_sensors == [310, 305, 0, 0, 0, 0, 0, 0] + + +async def test_nvme_controller_smart_get_attributes_missing( + nvme_controller_service: NVMeControllerService, dbus_session_bus: MessageBus +): + """Test NVMe Controller smart get attributes with missing vendor-specific attributes.""" + # Simulate a drive that doesn't provide some optional attributes + nvme_controller_service.set_missing_attributes( + ["wctemp", "cctemp", "warning_temp_time", "critical_temp_time", "temp_sensors"] + ) + + controller = UDisks2NVMeController( + "/org/freedesktop/UDisks2/drives/Samsung_SSD_970_EVO_Plus_2TB_S40123456789ABC" + ) + await controller.connect(dbus_session_bus) + + smart_log = await controller.smart_get_attributes() + # Core attributes should still be present + assert smart_log.available_spare == 100 + assert smart_log.percent_used == 1 + assert smart_log.total_data_read == 22890461184000 + assert smart_log.total_data_written == 27723431936000 + assert smart_log.controller_busy_minutes == 2682 + + # Optional attributes should be None + assert smart_log.warning_composite_temperature is None + assert smart_log.critical_composite_temperature is None + assert smart_log.warning_temperature_minutes is None + assert smart_log.critical_temperature_minutes is None + assert smart_log.temperature_sensors is None diff --git a/tests/dbus_service_mocks/udisks2_nvme_controller.py b/tests/dbus_service_mocks/udisks2_nvme_controller.py index 9b99a9bfd..fd703b27c 100644 --- a/tests/dbus_service_mocks/udisks2_nvme_controller.py +++ b/tests/dbus_service_mocks/udisks2_nvme_controller.py @@ -46,6 +46,11 @@ class NVMeController(DBusServiceMock): "critical_temp_time": Variant("i", 0), } + def set_missing_attributes(self, missing_keys: list[str]): + """Remove specified attributes to simulate drives that don't provide them.""" + for key in missing_keys: + self.smart_get_attributes_response.pop(key, None) + @dbus_property(access=PropertyAccess.READ) def State(self) -> "s": """Get State.""" diff --git a/tests/hardware/test_disk.py b/tests/hardware/test_disk.py index 32df09b5c..d4e253bce 100644 --- a/tests/hardware/test_disk.py +++ b/tests/hardware/test_disk.py @@ -177,3 +177,16 @@ async def test_try_get_nvme_life_time( coresys.config.path_supervisor ) assert lifetime == 50 + + +async def test_try_get_nvme_life_time_missing_percent_used( + coresys: CoreSys, nvme_data_disk: NVMeControllerService +): + """Test getting lifetime info from an NVMe when percent_used is missing.""" + # Simulate a drive that doesn't provide percent_used + nvme_data_disk.set_missing_attributes(["percent_used"]) + + lifetime = await coresys.hardware.disk.get_disk_life_time( + coresys.config.path_supervisor + ) + assert lifetime is None