Handle disks with non-existing SMART attributes (#6077)

Not all disks have all SMART attributes available, e.g. Sentry showed
devices with missing "wctemp". In practice, any SMART attribute could
be missing. Make sure we handle this gracefully.
This commit is contained in:
Stefan Agner 2025-08-07 09:40:03 +02:00 committed by GitHub
parent 5d851ad747
commit cad14bf46e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 92 additions and 31 deletions

View File

@ -34,43 +34,49 @@ class SmartStatus:
"""Smart status information for NVMe devices.
https://storaged.org/doc/udisks2-api/latest/gdbus-org.freedesktop.UDisks2.NVMe.Controller.html#gdbus-method-org-freedesktop-UDisks2-NVMe-Controller.SmartGetAttributes
All attributes are optional as their presence depends on the specific NVMe drive,
firmware version, and vendor implementation.
"""
available_spare: int
spare_threshold: int
percent_used: int
total_data_read: int
total_data_written: int
controller_busy_minutes: int
power_cycles: int
unsafe_shutdowns: int
media_errors: int
number_error_log_entries: int
temperature_sensors: list[int]
warning_composite_temperature: int
critical_composite_temperature: int
warning_temperature_minutes: int
critical_temperature_minutes: int
available_spare: int | None
spare_threshold: int | None
percent_used: int | None
total_data_read: int | None
total_data_written: int | None
controller_busy_minutes: int | None
power_cycles: int | None
unsafe_shutdowns: int | None
media_errors: int | None
number_error_log_entries: int | None
temperature_sensors: list[int] | None
warning_composite_temperature: int | None
critical_composite_temperature: int | None
warning_temperature_minutes: int | None
critical_temperature_minutes: int | None
@classmethod
def from_smart_get_attributes_resp(cls, resp: dict[str, Any]):
    """Convert SmartGetAttributes response dictionary to instance.

    Safely handles missing attributes as they are vendor/drive dependent:
    a key that is absent from ``resp`` yields ``None`` for the matching
    field instead of raising ``KeyError``.

    Args:
        resp: SmartGetAttributes response, keyed by the attribute names
            used in the lookups below (e.g. ``"avail_spare"``, ``"wctemp"``).

    Returns:
        A new ``cls`` instance built with one keyword argument per field.
    """
    # Use dict.get() throughout: indexing with resp["..."] would abort the
    # whole conversion as soon as a single attribute is not reported.
    return cls(
        available_spare=resp.get("avail_spare"),
        spare_threshold=resp.get("spare_thresh"),
        percent_used=resp.get("percent_used"),
        total_data_read=resp.get("total_data_read"),
        total_data_written=resp.get("total_data_written"),
        controller_busy_minutes=resp.get("ctrl_busy_time"),
        power_cycles=resp.get("power_cycles"),
        unsafe_shutdowns=resp.get("unsafe_shutdowns"),
        media_errors=resp.get("media_errors"),
        number_error_log_entries=resp.get("num_err_log_entries"),
        temperature_sensors=resp.get("temp_sensors"),
        warning_composite_temperature=resp.get("wctemp"),
        critical_composite_temperature=resp.get("cctemp"),
        warning_temperature_minutes=resp.get("warning_temp_time"),
        critical_temperature_minutes=resp.get("critical_temp_time"),
    )

View File

@ -141,6 +141,13 @@ class HwDisk(CoreSysAttributes):
)
return None
# Check if percent_used is available (vendor/drive dependent)
if smart_log.percent_used is None:
_LOGGER.debug(
"NVMe controller %s does not provide percent_used attribute", drive.id
)
return None
# UDisks2 documentation specifies that value can exceed 100
if smart_log.percent_used >= 100:
_LOGGER.warning(

View File

@ -70,3 +70,33 @@ async def test_nvme_controller_smart_get_attributes(dbus_session_bus: MessageBus
assert smart_log.total_data_written == 27723431936000
assert smart_log.controller_busy_minutes == 2682
assert smart_log.temperature_sensors == [310, 305, 0, 0, 0, 0, 0, 0]
async def test_nvme_controller_smart_get_attributes_missing(
    nvme_controller_service: NVMeControllerService, dbus_session_bus: MessageBus
):
    """Test NVMe Controller smart get attributes with missing vendor-specific attributes."""
    # Strip the temperature-related keys from the mocked response so the
    # service behaves like a drive that does not report them.
    absent_keys = [
        "wctemp",
        "cctemp",
        "warning_temp_time",
        "critical_temp_time",
        "temp_sensors",
    ]
    nvme_controller_service.set_missing_attributes(absent_keys)

    object_path = (
        "/org/freedesktop/UDisks2/drives/Samsung_SSD_970_EVO_Plus_2TB_S40123456789ABC"
    )
    nvme = UDisks2NVMeController(object_path)
    await nvme.connect(dbus_session_bus)

    attributes = await nvme.smart_get_attributes()

    # Attributes that were left in the response keep their values.
    assert attributes.available_spare == 100
    assert attributes.percent_used == 1
    assert attributes.total_data_read == 22890461184000
    assert attributes.total_data_written == 27723431936000
    assert attributes.controller_busy_minutes == 2682

    # Every attribute removed above must come back as None.
    assert attributes.warning_composite_temperature is None
    assert attributes.critical_composite_temperature is None
    assert attributes.warning_temperature_minutes is None
    assert attributes.critical_temperature_minutes is None
    assert attributes.temperature_sensors is None

View File

@ -46,6 +46,11 @@ class NVMeController(DBusServiceMock):
"critical_temp_time": Variant("i", 0),
}
def set_missing_attributes(self, missing_keys: list[str]):
    """Drop the given keys from the mocked SmartGetAttributes response.

    Simulates drives that don't provide those attributes. Keys that are
    not present in the response are ignored.
    """
    for name in missing_keys:
        try:
            del self.smart_get_attributes_response[name]
        except KeyError:
            # Already absent; nothing to remove for this key.
            continue
@dbus_property(access=PropertyAccess.READ)
def State(self) -> "s":
"""Get State."""

View File

@ -177,3 +177,16 @@ async def test_try_get_nvme_life_time(
coresys.config.path_supervisor
)
assert lifetime == 50
async def test_try_get_nvme_life_time_missing_percent_used(
    coresys: CoreSys, nvme_data_disk: NVMeControllerService
):
    """Test getting lifetime info from an NVMe when percent_used is missing."""
    # Make the mocked drive stop reporting percent_used.
    nvme_data_disk.set_missing_attributes(["percent_used"])

    disk = coresys.hardware.disk
    result = await disk.get_disk_life_time(coresys.config.path_supervisor)

    # Without percent_used no lifetime value can be reported.
    assert result is None