Handle disks with non-existing SMART attributes (#6077)

Not all disks have all SMART attributes available, e.g. Sentry showed
devices with missing "wctemp". In practice, any SMART attribute could
be missing. Make sure we handle this gracefully.
This commit is contained in:
Stefan Agner 2025-08-07 09:40:03 +02:00 committed by GitHub
parent 5d851ad747
commit cad14bf46e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 92 additions and 31 deletions

View File

@ -34,43 +34,49 @@ class SmartStatus:
"""Smart status information for NVMe devices. """Smart status information for NVMe devices.
https://storaged.org/doc/udisks2-api/latest/gdbus-org.freedesktop.UDisks2.NVMe.Controller.html#gdbus-method-org-freedesktop-UDisks2-NVMe-Controller.SmartGetAttributes https://storaged.org/doc/udisks2-api/latest/gdbus-org.freedesktop.UDisks2.NVMe.Controller.html#gdbus-method-org-freedesktop-UDisks2-NVMe-Controller.SmartGetAttributes
All attributes are optional as their presence depends on the specific NVMe drive,
firmware version, and vendor implementation.
""" """
available_spare: int available_spare: int | None
spare_threshold: int spare_threshold: int | None
percent_used: int percent_used: int | None
total_data_read: int total_data_read: int | None
total_data_written: int total_data_written: int | None
controller_busy_minutes: int controller_busy_minutes: int | None
power_cycles: int power_cycles: int | None
unsafe_shutdowns: int unsafe_shutdowns: int | None
media_errors: int media_errors: int | None
number_error_log_entries: int number_error_log_entries: int | None
temperature_sensors: list[int] temperature_sensors: list[int] | None
warning_composite_temperature: int warning_composite_temperature: int | None
critical_composite_temperature: int critical_composite_temperature: int | None
warning_temperature_minutes: int warning_temperature_minutes: int | None
critical_temperature_minutes: int critical_temperature_minutes: int | None
@classmethod @classmethod
def from_smart_get_attributes_resp(cls, resp: dict[str, Any]): def from_smart_get_attributes_resp(cls, resp: dict[str, Any]):
"""Convert SmartGetAttributes response dictionary to instance.""" """Convert SmartGetAttributes response dictionary to instance.
Safely handles missing attributes as they are vendor/drive dependent.
"""
return cls( return cls(
available_spare=resp["avail_spare"], available_spare=resp.get("avail_spare"),
spare_threshold=resp["spare_thresh"], spare_threshold=resp.get("spare_thresh"),
percent_used=resp["percent_used"], percent_used=resp.get("percent_used"),
total_data_read=resp["total_data_read"], total_data_read=resp.get("total_data_read"),
total_data_written=resp["total_data_written"], total_data_written=resp.get("total_data_written"),
controller_busy_minutes=resp["ctrl_busy_time"], controller_busy_minutes=resp.get("ctrl_busy_time"),
power_cycles=resp["power_cycles"], power_cycles=resp.get("power_cycles"),
unsafe_shutdowns=resp["unsafe_shutdowns"], unsafe_shutdowns=resp.get("unsafe_shutdowns"),
media_errors=resp["media_errors"], media_errors=resp.get("media_errors"),
number_error_log_entries=resp["num_err_log_entries"], number_error_log_entries=resp.get("num_err_log_entries"),
temperature_sensors=resp["temp_sensors"], temperature_sensors=resp.get("temp_sensors"),
warning_composite_temperature=resp["wctemp"], warning_composite_temperature=resp.get("wctemp"),
critical_composite_temperature=resp["cctemp"], critical_composite_temperature=resp.get("cctemp"),
warning_temperature_minutes=resp["warning_temp_time"], warning_temperature_minutes=resp.get("warning_temp_time"),
critical_temperature_minutes=resp["critical_temp_time"], critical_temperature_minutes=resp.get("critical_temp_time"),
) )

View File

@ -141,6 +141,13 @@ class HwDisk(CoreSysAttributes):
) )
return None return None
# Check if percent_used is available (vendor/drive dependent)
if smart_log.percent_used is None:
_LOGGER.debug(
"NVMe controller %s does not provide percent_used attribute", drive.id
)
return None
# UDisks2 documentation specifies that value can exceed 100 # UDisks2 documentation specifies that value can exceed 100
if smart_log.percent_used >= 100: if smart_log.percent_used >= 100:
_LOGGER.warning( _LOGGER.warning(

View File

@ -70,3 +70,33 @@ async def test_nvme_controller_smart_get_attributes(dbus_session_bus: MessageBus
assert smart_log.total_data_written == 27723431936000 assert smart_log.total_data_written == 27723431936000
assert smart_log.controller_busy_minutes == 2682 assert smart_log.controller_busy_minutes == 2682
assert smart_log.temperature_sensors == [310, 305, 0, 0, 0, 0, 0, 0] assert smart_log.temperature_sensors == [310, 305, 0, 0, 0, 0, 0, 0]
async def test_nvme_controller_smart_get_attributes_missing(
nvme_controller_service: NVMeControllerService, dbus_session_bus: MessageBus
):
"""Test NVMe Controller smart get attributes with missing vendor-specific attributes."""
# Simulate a drive that doesn't provide some optional attributes
nvme_controller_service.set_missing_attributes(
["wctemp", "cctemp", "warning_temp_time", "critical_temp_time", "temp_sensors"]
)
controller = UDisks2NVMeController(
"/org/freedesktop/UDisks2/drives/Samsung_SSD_970_EVO_Plus_2TB_S40123456789ABC"
)
await controller.connect(dbus_session_bus)
smart_log = await controller.smart_get_attributes()
# Core attributes should still be present
assert smart_log.available_spare == 100
assert smart_log.percent_used == 1
assert smart_log.total_data_read == 22890461184000
assert smart_log.total_data_written == 27723431936000
assert smart_log.controller_busy_minutes == 2682
# Optional attributes should be None
assert smart_log.warning_composite_temperature is None
assert smart_log.critical_composite_temperature is None
assert smart_log.warning_temperature_minutes is None
assert smart_log.critical_temperature_minutes is None
assert smart_log.temperature_sensors is None

View File

@ -46,6 +46,11 @@ class NVMeController(DBusServiceMock):
"critical_temp_time": Variant("i", 0), "critical_temp_time": Variant("i", 0),
} }
def set_missing_attributes(self, missing_keys: list[str]):
"""Remove specified attributes to simulate drives that don't provide them."""
for key in missing_keys:
self.smart_get_attributes_response.pop(key, None)
@dbus_property(access=PropertyAccess.READ) @dbus_property(access=PropertyAccess.READ)
def State(self) -> "s": def State(self) -> "s":
"""Get State.""" """Get State."""

View File

@ -177,3 +177,16 @@ async def test_try_get_nvme_life_time(
coresys.config.path_supervisor coresys.config.path_supervisor
) )
assert lifetime == 50 assert lifetime == 50
async def test_try_get_nvme_life_time_missing_percent_used(
coresys: CoreSys, nvme_data_disk: NVMeControllerService
):
"""Test getting lifetime info from an NVMe when percent_used is missing."""
# Simulate a drive that doesn't provide percent_used
nvme_data_disk.set_missing_attributes(["percent_used"])
lifetime = await coresys.hardware.disk.get_disk_life_time(
coresys.config.path_supervisor
)
assert lifetime is None