From ffbd2d79c8e03f4a7fee2c65a28181a0158127ea Mon Sep 17 00:00:00 2001
From: Erik Montnemery <erik@montnemery.com>
Date: Wed, 25 Aug 2021 13:00:35 +0200
Subject: [PATCH] Generate statistics for all sensors with a supported
 state_class (#54882)

* Generate statistics for all sensors

* Fix bugs, add tests

* Address review comments

* Cleanup warnings

* Simplify tests

* Simplify selection of statistics

* Fix tests
---
 .../components/recorder/statistics.py         |  13 ++
 homeassistant/components/sensor/recorder.py   | 127 +++++++-----
 tests/components/sensor/test_recorder.py      | 194 ++++++++++++++++--
 3 files changed, 272 insertions(+), 62 deletions(-)

diff --git a/homeassistant/components/recorder/statistics.py b/homeassistant/components/recorder/statistics.py
index 34112fcc059..06f7851b1a6 100644
--- a/homeassistant/components/recorder/statistics.py
+++ b/homeassistant/components/recorder/statistics.py
@@ -227,6 +227,19 @@ def _get_metadata(
     return metadata
 
 
+def get_metadata(
+    hass: HomeAssistant,
+    statistic_id: str,
+) -> dict[str, str] | None:
+    """Return metadata for a statistic_id."""
+    statistic_ids = [statistic_id]
+    with session_scope(hass=hass) as session:
+        metadata_ids = _get_metadata_ids(hass, session, [statistic_id])
+        if not metadata_ids:
+            return None
+        return _get_metadata(hass, session, statistic_ids, None).get(metadata_ids[0])
+
+
 def _configured_unit(unit: str, units: UnitSystem) -> str:
     """Return the pressure and temperature units configured by the user."""
     if unit == PRESSURE_PA:
diff --git a/homeassistant/components/sensor/recorder.py b/homeassistant/components/sensor/recorder.py
index 6dc91b52c9a..77f91752327 100644
--- a/homeassistant/components/sensor/recorder.py
+++ b/homeassistant/components/sensor/recorder.py
@@ -9,10 +9,8 @@ from typing import Callable
 from homeassistant.components.recorder import history, statistics
 from homeassistant.components.sensor import (
     ATTR_STATE_CLASS,
-    DEVICE_CLASS_BATTERY,
     DEVICE_CLASS_ENERGY,
     DEVICE_CLASS_GAS,
-    DEVICE_CLASS_HUMIDITY,
     DEVICE_CLASS_MONETARY,
     DEVICE_CLASS_PRESSURE,
     DEVICE_CLASS_TEMPERATURE,
@@ -26,7 +24,6 @@ from homeassistant.const import (
     DEVICE_CLASS_POWER,
     ENERGY_KILO_WATT_HOUR,
     ENERGY_WATT_HOUR,
-    PERCENTAGE,
     POWER_KILO_WATT,
     POWER_WATT,
     PRESSURE_BAR,
@@ -51,24 +48,18 @@ from . import ATTR_LAST_RESET, DOMAIN
 
 _LOGGER = logging.getLogger(__name__)
 
-DEVICE_CLASS_OR_UNIT_STATISTICS = {
+DEVICE_CLASS_STATISTICS: dict[str, dict[str, set[str]]] = {
     STATE_CLASS_MEASUREMENT: {
-        DEVICE_CLASS_BATTERY: {"mean", "min", "max"},
-        DEVICE_CLASS_HUMIDITY: {"mean", "min", "max"},
-        DEVICE_CLASS_POWER: {"mean", "min", "max"},
-        DEVICE_CLASS_PRESSURE: {"mean", "min", "max"},
-        DEVICE_CLASS_TEMPERATURE: {"mean", "min", "max"},
-        PERCENTAGE: {"mean", "min", "max"},
         # Deprecated, support will be removed in Home Assistant 2021.11
         DEVICE_CLASS_ENERGY: {"sum"},
         DEVICE_CLASS_GAS: {"sum"},
         DEVICE_CLASS_MONETARY: {"sum"},
     },
-    STATE_CLASS_TOTAL_INCREASING: {
-        DEVICE_CLASS_ENERGY: {"sum"},
-        DEVICE_CLASS_GAS: {"sum"},
-        DEVICE_CLASS_MONETARY: {"sum"},
-    },
+    STATE_CLASS_TOTAL_INCREASING: {},
+}
+DEFAULT_STATISTICS = {
+    STATE_CLASS_MEASUREMENT: {"mean", "min", "max"},
+    STATE_CLASS_TOTAL_INCREASING: {"sum"},
 }
 
 # Normalized units which will be stored in the statistics table
@@ -116,31 +107,20 @@ UNIT_CONVERSIONS: dict[str, dict[str, Callable]] = {
 }
 
 # Keep track of entities for which a warning about unsupported unit has been logged
-WARN_UNSUPPORTED_UNIT = set()
+WARN_UNSUPPORTED_UNIT = "sensor_warn_unsupported_unit"
+WARN_UNSTABLE_UNIT = "sensor_warn_unstable_unit"
 
 
-def _get_entities(hass: HomeAssistant) -> list[tuple[str, str, str]]:
-    """Get (entity_id, state_class, key) of all sensors for which to compile statistics.
-
-    Key is either a device class or a unit and is used to index the
-    DEVICE_CLASS_OR_UNIT_STATISTICS map.
-    """
+def _get_entities(hass: HomeAssistant) -> list[tuple[str, str, str | None]]:
+    """Get (entity_id, state_class, device_class) of all sensors for which to compile statistics."""
     all_sensors = hass.states.all(DOMAIN)
     entity_ids = []
 
     for state in all_sensors:
         if (state_class := state.attributes.get(ATTR_STATE_CLASS)) not in STATE_CLASSES:
             continue
-
-        if (
-            key := state.attributes.get(ATTR_DEVICE_CLASS)
-        ) in DEVICE_CLASS_OR_UNIT_STATISTICS[state_class]:
-            entity_ids.append((state.entity_id, state_class, key))
-
-        if (
-            key := state.attributes.get(ATTR_UNIT_OF_MEASUREMENT)
-        ) in DEVICE_CLASS_OR_UNIT_STATISTICS[state_class]:
-            entity_ids.append((state.entity_id, state_class, key))
+        device_class = state.attributes.get(ATTR_DEVICE_CLASS)
+        entity_ids.append((state.entity_id, state_class, device_class))
 
     return entity_ids
 
@@ -190,18 +170,39 @@ def _time_weighted_average(
     return accumulated / (end - start).total_seconds()
 
 
+def _get_units(fstates: list[tuple[float, State]]) -> set[str | None]:
+    """Return True if all states have the same unit."""
+    return {item[1].attributes.get(ATTR_UNIT_OF_MEASUREMENT) for item in fstates}
+
+
 def _normalize_states(
-    entity_history: list[State], key: str, entity_id: str
+    hass: HomeAssistant,
+    entity_history: list[State],
+    device_class: str | None,
+    entity_id: str,
 ) -> tuple[str | None, list[tuple[float, State]]]:
     """Normalize units."""
     unit = None
 
-    if key not in UNIT_CONVERSIONS:
+    if device_class not in UNIT_CONVERSIONS:
         # We're not normalizing this device class, return the state as they are
         fstates = [
             (float(el.state), el) for el in entity_history if _is_number(el.state)
         ]
         if fstates:
+            all_units = _get_units(fstates)
+            if len(all_units) > 1:
+                if WARN_UNSTABLE_UNIT not in hass.data:
+                    hass.data[WARN_UNSTABLE_UNIT] = set()
+                if entity_id not in hass.data[WARN_UNSTABLE_UNIT]:
+                    hass.data[WARN_UNSTABLE_UNIT].add(entity_id)
+                    _LOGGER.warning(
+                        "The unit of %s is changing, got %s, generation of long term "
+                        "statistics will be suppressed unless the unit is stable",
+                        entity_id,
+                        all_units,
+                    )
+                return None, []
             unit = fstates[0][1].attributes.get(ATTR_UNIT_OF_MEASUREMENT)
         return unit, fstates
 
@@ -215,15 +216,17 @@ def _normalize_states(
         fstate = float(state.state)
         unit = state.attributes.get(ATTR_UNIT_OF_MEASUREMENT)
         # Exclude unsupported units from statistics
-        if unit not in UNIT_CONVERSIONS[key]:
-            if entity_id not in WARN_UNSUPPORTED_UNIT:
-                WARN_UNSUPPORTED_UNIT.add(entity_id)
+        if unit not in UNIT_CONVERSIONS[device_class]:
+            if WARN_UNSUPPORTED_UNIT not in hass.data:
+                hass.data[WARN_UNSUPPORTED_UNIT] = set()
+            if entity_id not in hass.data[WARN_UNSUPPORTED_UNIT]:
+                hass.data[WARN_UNSUPPORTED_UNIT].add(entity_id)
                 _LOGGER.warning("%s has unknown unit %s", entity_id, unit)
             continue
 
-        fstates.append((UNIT_CONVERSIONS[key][unit](fstate), state))
+        fstates.append((UNIT_CONVERSIONS[device_class][unit](fstate), state))
 
-    return DEVICE_CLASS_UNITS[key], fstates
+    return DEVICE_CLASS_UNITS[device_class], fstates
 
 
 def reset_detected(state: float, previous_state: float | None) -> bool:
@@ -247,18 +250,39 @@ def compile_statistics(
         hass, start - datetime.timedelta.resolution, end, [i[0] for i in entities]
     )
 
-    for entity_id, state_class, key in entities:
-        wanted_statistics = DEVICE_CLASS_OR_UNIT_STATISTICS[state_class][key]
+    for entity_id, state_class, device_class in entities:
+        if device_class in DEVICE_CLASS_STATISTICS[state_class]:
+            wanted_statistics = DEVICE_CLASS_STATISTICS[state_class][device_class]
+        else:
+            wanted_statistics = DEFAULT_STATISTICS[state_class]
 
         if entity_id not in history_list:
             continue
 
         entity_history = history_list[entity_id]
-        unit, fstates = _normalize_states(entity_history, key, entity_id)
+        unit, fstates = _normalize_states(hass, entity_history, device_class, entity_id)
 
         if not fstates:
             continue
 
+        # Check metadata
+        if old_metadata := statistics.get_metadata(hass, entity_id):
+            if old_metadata["unit_of_measurement"] != unit:
+                if WARN_UNSTABLE_UNIT not in hass.data:
+                    hass.data[WARN_UNSTABLE_UNIT] = set()
+                if entity_id not in hass.data[WARN_UNSTABLE_UNIT]:
+                    hass.data[WARN_UNSTABLE_UNIT].add(entity_id)
+                    _LOGGER.warning(
+                        "The unit of %s (%s) does not match the unit of already "
+                        "compiled statistics (%s). Generation of long term statistics "
+                        "will be suppressed unless the unit changes back to %s",
+                        entity_id,
+                        unit,
+                        old_metadata["unit_of_measurement"],
+                        unit,
+                    )
+                continue
+
         result[entity_id] = {}
 
         # Set meta data
@@ -370,8 +394,11 @@ def list_statistic_ids(hass: HomeAssistant, statistic_type: str | None = None) -
 
     statistic_ids = {}
 
-    for entity_id, state_class, key in entities:
-        provided_statistics = DEVICE_CLASS_OR_UNIT_STATISTICS[state_class][key]
+    for entity_id, state_class, device_class in entities:
+        if device_class in DEVICE_CLASS_STATISTICS[state_class]:
+            provided_statistics = DEVICE_CLASS_STATISTICS[state_class][device_class]
+        else:
+            provided_statistics = DEFAULT_STATISTICS[state_class]
 
         if statistic_type is not None and statistic_type not in provided_statistics:
             continue
@@ -386,16 +413,20 @@ def list_statistic_ids(hass: HomeAssistant, statistic_type: str | None = None) -
         ):
             continue
 
-        native_unit = state.attributes.get(ATTR_UNIT_OF_MEASUREMENT)
+        metadata = statistics.get_metadata(hass, entity_id)
+        if metadata:
+            native_unit: str | None = metadata["unit_of_measurement"]
+        else:
+            native_unit = state.attributes.get(ATTR_UNIT_OF_MEASUREMENT)
 
-        if key not in UNIT_CONVERSIONS:
+        if device_class not in UNIT_CONVERSIONS:
             statistic_ids[entity_id] = native_unit
             continue
 
-        if native_unit not in UNIT_CONVERSIONS[key]:
+        if native_unit not in UNIT_CONVERSIONS[device_class]:
             continue
 
-        statistics_unit = DEVICE_CLASS_UNITS[key]
+        statistics_unit = DEVICE_CLASS_UNITS[device_class]
         statistic_ids[entity_id] = statistics_unit
 
     return statistic_ids
diff --git a/tests/components/sensor/test_recorder.py b/tests/components/sensor/test_recorder.py
index abeda03ed7b..5b82fcf0506 100644
--- a/tests/components/sensor/test_recorder.py
+++ b/tests/components/sensor/test_recorder.py
@@ -107,24 +107,34 @@ def test_compile_hourly_statistics(
 @pytest.mark.parametrize("attributes", [TEMPERATURE_SENSOR_ATTRIBUTES])
 def test_compile_hourly_statistics_unsupported(hass_recorder, caplog, attributes):
     """Test compiling hourly statistics for unsupported sensor."""
-    attributes = dict(attributes)
     zero = dt_util.utcnow()
     hass = hass_recorder()
     recorder = hass.data[DATA_INSTANCE]
     setup_component(hass, "sensor", {})
     four, states = record_states(hass, zero, "sensor.test1", attributes)
-    if "unit_of_measurement" in attributes:
-        attributes["unit_of_measurement"] = "invalid"
-        _, _states = record_states(hass, zero, "sensor.test2", attributes)
-        states = {**states, **_states}
-        attributes.pop("unit_of_measurement")
-        _, _states = record_states(hass, zero, "sensor.test3", attributes)
-        states = {**states, **_states}
-    attributes["state_class"] = "invalid"
-    _, _states = record_states(hass, zero, "sensor.test4", attributes)
+
+    attributes_tmp = dict(attributes)
+    attributes_tmp["unit_of_measurement"] = "invalid"
+    _, _states = record_states(hass, zero, "sensor.test2", attributes_tmp)
     states = {**states, **_states}
-    attributes.pop("state_class")
-    _, _states = record_states(hass, zero, "sensor.test5", attributes)
+    attributes_tmp.pop("unit_of_measurement")
+    _, _states = record_states(hass, zero, "sensor.test3", attributes_tmp)
+    states = {**states, **_states}
+
+    attributes_tmp = dict(attributes)
+    attributes_tmp["state_class"] = "invalid"
+    _, _states = record_states(hass, zero, "sensor.test4", attributes_tmp)
+    states = {**states, **_states}
+    attributes_tmp.pop("state_class")
+    _, _states = record_states(hass, zero, "sensor.test5", attributes_tmp)
+    states = {**states, **_states}
+
+    attributes_tmp = dict(attributes)
+    attributes_tmp["device_class"] = "invalid"
+    _, _states = record_states(hass, zero, "sensor.test6", attributes_tmp)
+    states = {**states, **_states}
+    attributes_tmp.pop("device_class")
+    _, _states = record_states(hass, zero, "sensor.test7", attributes_tmp)
     states = {**states, **_states}
 
     hist = history.get_significant_states(hass, zero, four)
@@ -134,7 +144,9 @@ def test_compile_hourly_statistics_unsupported(hass_recorder, caplog, attributes
     wait_recording_done(hass)
     statistic_ids = list_statistic_ids(hass)
     assert statistic_ids == [
-        {"statistic_id": "sensor.test1", "unit_of_measurement": "°C"}
+        {"statistic_id": "sensor.test1", "unit_of_measurement": "°C"},
+        {"statistic_id": "sensor.test6", "unit_of_measurement": "°C"},
+        {"statistic_id": "sensor.test7", "unit_of_measurement": "°C"},
     ]
     stats = statistics_during_period(hass, zero)
     assert stats == {
@@ -149,7 +161,31 @@ def test_compile_hourly_statistics_unsupported(hass_recorder, caplog, attributes
                 "state": None,
                 "sum": None,
             }
-        ]
+        ],
+        "sensor.test6": [
+            {
+                "statistic_id": "sensor.test6",
+                "start": process_timestamp_to_utc_isoformat(zero),
+                "mean": approx(16.440677966101696),
+                "min": approx(10.0),
+                "max": approx(30.0),
+                "last_reset": None,
+                "state": None,
+                "sum": None,
+            }
+        ],
+        "sensor.test7": [
+            {
+                "statistic_id": "sensor.test7",
+                "start": process_timestamp_to_utc_isoformat(zero),
+                "mean": approx(16.440677966101696),
+                "min": approx(10.0),
+                "max": approx(30.0),
+                "last_reset": None,
+                "state": None,
+                "sum": None,
+            }
+        ],
     }
     assert "Error while processing event StatisticsTask" not in caplog.text
 
@@ -859,6 +895,136 @@ def test_list_statistic_ids_unsupported(hass_recorder, caplog, _attributes):
     hass.states.set("sensor.test6", 0, attributes=attributes)
 
 
+@pytest.mark.parametrize(
+    "device_class,unit,native_unit,mean,min,max",
+    [
+        (None, None, None, 16.440677, 10, 30),
+        (None, "%", "%", 16.440677, 10, 30),
+        ("battery", "%", "%", 16.440677, 10, 30),
+        ("battery", None, None, 16.440677, 10, 30),
+    ],
+)
+def test_compile_hourly_statistics_changing_units_1(
+    hass_recorder, caplog, device_class, unit, native_unit, mean, min, max
+):
+    """Test compiling hourly statistics where units change from one hour to the next."""
+    zero = dt_util.utcnow()
+    hass = hass_recorder()
+    recorder = hass.data[DATA_INSTANCE]
+    setup_component(hass, "sensor", {})
+    attributes = {
+        "device_class": device_class,
+        "state_class": "measurement",
+        "unit_of_measurement": unit,
+    }
+    four, states = record_states(hass, zero, "sensor.test1", attributes)
+    attributes["unit_of_measurement"] = "cats"
+    four, _states = record_states(
+        hass, zero + timedelta(hours=1), "sensor.test1", attributes
+    )
+    states["sensor.test1"] += _states["sensor.test1"]
+    four, _states = record_states(
+        hass, zero + timedelta(hours=2), "sensor.test1", attributes
+    )
+    states["sensor.test1"] += _states["sensor.test1"]
+    hist = history.get_significant_states(hass, zero, four)
+    assert dict(states) == dict(hist)
+
+    recorder.do_adhoc_statistics(period="hourly", start=zero)
+    wait_recording_done(hass)
+    assert "does not match the unit of already compiled" not in caplog.text
+    statistic_ids = list_statistic_ids(hass)
+    assert statistic_ids == [
+        {"statistic_id": "sensor.test1", "unit_of_measurement": native_unit}
+    ]
+    stats = statistics_during_period(hass, zero)
+    assert stats == {
+        "sensor.test1": [
+            {
+                "statistic_id": "sensor.test1",
+                "start": process_timestamp_to_utc_isoformat(zero),
+                "mean": approx(mean),
+                "min": approx(min),
+                "max": approx(max),
+                "last_reset": None,
+                "state": None,
+                "sum": None,
+            }
+        ]
+    }
+
+    recorder.do_adhoc_statistics(period="hourly", start=zero + timedelta(hours=2))
+    wait_recording_done(hass)
+    assert (
+        "The unit of sensor.test1 (cats) does not match the unit of already compiled "
+        f"statistics ({native_unit})" in caplog.text
+    )
+    statistic_ids = list_statistic_ids(hass)
+    assert statistic_ids == [
+        {"statistic_id": "sensor.test1", "unit_of_measurement": native_unit}
+    ]
+    stats = statistics_during_period(hass, zero)
+    assert stats == {
+        "sensor.test1": [
+            {
+                "statistic_id": "sensor.test1",
+                "start": process_timestamp_to_utc_isoformat(zero),
+                "mean": approx(mean),
+                "min": approx(min),
+                "max": approx(max),
+                "last_reset": None,
+                "state": None,
+                "sum": None,
+            }
+        ]
+    }
+    assert "Error while processing event StatisticsTask" not in caplog.text
+
+
+@pytest.mark.parametrize(
+    "device_class,unit,native_unit,mean,min,max",
+    [
+        (None, None, None, 16.440677, 10, 30),
+        (None, "%", "%", 16.440677, 10, 30),
+        ("battery", "%", "%", 16.440677, 10, 30),
+        ("battery", None, None, 16.440677, 10, 30),
+    ],
+)
+def test_compile_hourly_statistics_changing_units_2(
+    hass_recorder, caplog, device_class, unit, native_unit, mean, min, max
+):
+    """Test compiling hourly statistics where units change during an hour."""
+    zero = dt_util.utcnow()
+    hass = hass_recorder()
+    recorder = hass.data[DATA_INSTANCE]
+    setup_component(hass, "sensor", {})
+    attributes = {
+        "device_class": device_class,
+        "state_class": "measurement",
+        "unit_of_measurement": unit,
+    }
+    four, states = record_states(hass, zero, "sensor.test1", attributes)
+    attributes["unit_of_measurement"] = "cats"
+    four, _states = record_states(
+        hass, zero + timedelta(hours=1), "sensor.test1", attributes
+    )
+    states["sensor.test1"] += _states["sensor.test1"]
+    hist = history.get_significant_states(hass, zero, four)
+    assert dict(states) == dict(hist)
+
+    recorder.do_adhoc_statistics(period="hourly", start=zero + timedelta(minutes=30))
+    wait_recording_done(hass)
+    assert "The unit of sensor.test1 is changing" in caplog.text
+    statistic_ids = list_statistic_ids(hass)
+    assert statistic_ids == [
+        {"statistic_id": "sensor.test1", "unit_of_measurement": "cats"}
+    ]
+    stats = statistics_during_period(hass, zero)
+    assert stats == {}
+
+    assert "Error while processing event StatisticsTask" not in caplog.text
+
+
 def record_states(hass, zero, entity_id, attributes):
     """Record some test states.