Fix performance of logbook entity and devices queries with large MySQL databases (#72898)

This commit is contained in:
J. Nick Koston 2022-06-02 11:54:06 -10:00 committed by GitHub
parent b97d346df7
commit 9fbde245d0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 93 additions and 44 deletions

View File

@ -12,9 +12,11 @@ from sqlalchemy.sql.selectable import Select
from homeassistant.components.proximity import DOMAIN as PROXIMITY_DOMAIN from homeassistant.components.proximity import DOMAIN as PROXIMITY_DOMAIN
from homeassistant.components.recorder.models import ( from homeassistant.components.recorder.models import (
EVENTS_CONTEXT_ID_INDEX,
OLD_FORMAT_ATTRS_JSON, OLD_FORMAT_ATTRS_JSON,
OLD_STATE, OLD_STATE,
SHARED_ATTRS_JSON, SHARED_ATTRS_JSON,
STATES_CONTEXT_ID_INDEX,
EventData, EventData,
Events, Events,
StateAttributes, StateAttributes,
@ -121,9 +123,7 @@ def select_events_context_only() -> Select:
By marking them as context_only we know they are only for By marking them as context_only we know they are only for
linking context ids and we can avoid processing them. linking context ids and we can avoid processing them.
""" """
return select(*EVENT_ROWS_NO_STATES, CONTEXT_ONLY).outerjoin( return select(*EVENT_ROWS_NO_STATES, CONTEXT_ONLY)
EventData, (Events.data_id == EventData.data_id)
)
def select_states_context_only() -> Select: def select_states_context_only() -> Select:
@ -252,3 +252,17 @@ def _not_uom_attributes_matcher() -> ClauseList:
return ~StateAttributes.shared_attrs.like( return ~StateAttributes.shared_attrs.like(
UNIT_OF_MEASUREMENT_JSON_LIKE UNIT_OF_MEASUREMENT_JSON_LIKE
) | ~States.attributes.like(UNIT_OF_MEASUREMENT_JSON_LIKE) ) | ~States.attributes.like(UNIT_OF_MEASUREMENT_JSON_LIKE)
def apply_states_context_hints(query: Query) -> Query:
    """Add a MySQL index hint for context_id lookups against States.

    The hint names STATES_CONTEXT_ID_INDEX and is registered with
    dialect_name="mysql", so it is scoped to the MySQL dialect only.
    Returns the query produced by with_hint().
    """
    # Build the hint text separately so the with_hint() call stays short.
    index_hint = f"FORCE INDEX ({STATES_CONTEXT_ID_INDEX})"
    return query.with_hint(States, index_hint, dialect_name="mysql")
def apply_events_context_hints(query: Query) -> Query:
    """Add a MySQL index hint for context_id lookups against Events.

    The hint names EVENTS_CONTEXT_ID_INDEX and is registered with
    dialect_name="mysql", so it is scoped to the MySQL dialect only.
    Returns the query produced by with_hint().
    """
    # Build the hint text separately so the with_hint() call stays short.
    index_hint = f"FORCE INDEX ({EVENTS_CONTEXT_ID_INDEX})"
    return query.with_hint(Events, index_hint, dialect_name="mysql")

View File

@ -4,15 +4,22 @@ from __future__ import annotations
from collections.abc import Iterable from collections.abc import Iterable
from datetime import datetime as dt from datetime import datetime as dt
from sqlalchemy import lambda_stmt, select, union_all from sqlalchemy import lambda_stmt, select
from sqlalchemy.orm import Query from sqlalchemy.orm import Query
from sqlalchemy.sql.elements import ClauseList from sqlalchemy.sql.elements import ClauseList
from sqlalchemy.sql.lambdas import StatementLambdaElement from sqlalchemy.sql.lambdas import StatementLambdaElement
from sqlalchemy.sql.selectable import CTE, CompoundSelect from sqlalchemy.sql.selectable import CTE, CompoundSelect
from homeassistant.components.recorder.models import DEVICE_ID_IN_EVENT, Events, States from homeassistant.components.recorder.models import (
DEVICE_ID_IN_EVENT,
EventData,
Events,
States,
)
from .common import ( from .common import (
apply_events_context_hints,
apply_states_context_hints,
select_events_context_id_subquery, select_events_context_id_subquery,
select_events_context_only, select_events_context_only,
select_events_without_states, select_events_without_states,
@ -27,13 +34,10 @@ def _select_device_id_context_ids_sub_query(
json_quotable_device_ids: list[str], json_quotable_device_ids: list[str],
) -> CompoundSelect: ) -> CompoundSelect:
"""Generate a subquery to find context ids for multiple devices.""" """Generate a subquery to find context ids for multiple devices."""
return select( inner = select_events_context_id_subquery(start_day, end_day, event_types).where(
union_all(
select_events_context_id_subquery(start_day, end_day, event_types).where(
apply_event_device_id_matchers(json_quotable_device_ids) apply_event_device_id_matchers(json_quotable_device_ids)
),
).c.context_id
) )
return select(inner.c.context_id).group_by(inner.c.context_id)
def _apply_devices_context_union( def _apply_devices_context_union(
@ -51,8 +55,16 @@ def _apply_devices_context_union(
json_quotable_device_ids, json_quotable_device_ids,
).cte() ).cte()
return query.union_all( return query.union_all(
select_events_context_only().where(Events.context_id.in_(devices_cte.select())), apply_events_context_hints(
select_states_context_only().where(States.context_id.in_(devices_cte.select())), select_events_context_only()
.select_from(devices_cte)
.outerjoin(Events, devices_cte.c.context_id == Events.context_id)
).outerjoin(EventData, (Events.data_id == EventData.data_id)),
apply_states_context_hints(
select_states_context_only()
.select_from(devices_cte)
.outerjoin(States, devices_cte.c.context_id == States.context_id)
),
) )

View File

@ -14,11 +14,14 @@ from homeassistant.components.recorder.models import (
ENTITY_ID_IN_EVENT, ENTITY_ID_IN_EVENT,
ENTITY_ID_LAST_UPDATED_INDEX, ENTITY_ID_LAST_UPDATED_INDEX,
OLD_ENTITY_ID_IN_EVENT, OLD_ENTITY_ID_IN_EVENT,
EventData,
Events, Events,
States, States,
) )
from .common import ( from .common import (
apply_events_context_hints,
apply_states_context_hints,
apply_states_filters, apply_states_filters,
select_events_context_id_subquery, select_events_context_id_subquery,
select_events_context_only, select_events_context_only,
@ -36,16 +39,15 @@ def _select_entities_context_ids_sub_query(
json_quotable_entity_ids: list[str], json_quotable_entity_ids: list[str],
) -> CompoundSelect: ) -> CompoundSelect:
"""Generate a subquery to find context ids for multiple entities.""" """Generate a subquery to find context ids for multiple entities."""
return select( union = union_all(
union_all(
select_events_context_id_subquery(start_day, end_day, event_types).where( select_events_context_id_subquery(start_day, end_day, event_types).where(
apply_event_entity_id_matchers(json_quotable_entity_ids) apply_event_entity_id_matchers(json_quotable_entity_ids)
), ),
apply_entities_hints(select(States.context_id)) apply_entities_hints(select(States.context_id))
.filter((States.last_updated > start_day) & (States.last_updated < end_day)) .filter((States.last_updated > start_day) & (States.last_updated < end_day))
.where(States.entity_id.in_(entity_ids)), .where(States.entity_id.in_(entity_ids)),
).c.context_id
) )
return select(union.c.context_id).group_by(union.c.context_id)
def _apply_entities_context_union( def _apply_entities_context_union(
@ -64,14 +66,23 @@ def _apply_entities_context_union(
entity_ids, entity_ids,
json_quotable_entity_ids, json_quotable_entity_ids,
).cte() ).cte()
# We used to optimize this to exclude rows we already have in the union
# with a States.entity_id.not_in(entity_ids) filter, but that made the
# query much slower on MySQL. Since those rows are filtered away in the
# Python code anyway (they will have context_only set on them), the
# impact is minimal.
return query.union_all( return query.union_all(
states_query_for_entity_ids(start_day, end_day, entity_ids), states_query_for_entity_ids(start_day, end_day, entity_ids),
select_events_context_only().where( apply_events_context_hints(
Events.context_id.in_(entities_cte.select()) select_events_context_only()
), .select_from(entities_cte)
.outerjoin(Events, entities_cte.c.context_id == Events.context_id)
).outerjoin(EventData, (Events.data_id == EventData.data_id)),
apply_states_context_hints(
select_states_context_only() select_states_context_only()
.where(States.entity_id.not_in(entity_ids)) .select_from(entities_cte)
.where(States.context_id.in_(entities_cte.select())), .outerjoin(States, entities_cte.c.context_id == States.context_id)
),
) )

View File

@ -10,9 +10,11 @@ from sqlalchemy.orm import Query
from sqlalchemy.sql.lambdas import StatementLambdaElement from sqlalchemy.sql.lambdas import StatementLambdaElement
from sqlalchemy.sql.selectable import CTE, CompoundSelect from sqlalchemy.sql.selectable import CTE, CompoundSelect
from homeassistant.components.recorder.models import Events, States from homeassistant.components.recorder.models import EventData, Events, States
from .common import ( from .common import (
apply_events_context_hints,
apply_states_context_hints,
select_events_context_id_subquery, select_events_context_id_subquery,
select_events_context_only, select_events_context_only,
select_events_without_states, select_events_without_states,
@ -35,8 +37,7 @@ def _select_entities_device_id_context_ids_sub_query(
json_quotable_device_ids: list[str], json_quotable_device_ids: list[str],
) -> CompoundSelect: ) -> CompoundSelect:
"""Generate a subquery to find context ids for multiple entities and multiple devices.""" """Generate a subquery to find context ids for multiple entities and multiple devices."""
return select( union = union_all(
union_all(
select_events_context_id_subquery(start_day, end_day, event_types).where( select_events_context_id_subquery(start_day, end_day, event_types).where(
_apply_event_entity_id_device_id_matchers( _apply_event_entity_id_device_id_matchers(
json_quotable_entity_ids, json_quotable_device_ids json_quotable_entity_ids, json_quotable_device_ids
@ -45,8 +46,8 @@ def _select_entities_device_id_context_ids_sub_query(
apply_entities_hints(select(States.context_id)) apply_entities_hints(select(States.context_id))
.filter((States.last_updated > start_day) & (States.last_updated < end_day)) .filter((States.last_updated > start_day) & (States.last_updated < end_day))
.where(States.entity_id.in_(entity_ids)), .where(States.entity_id.in_(entity_ids)),
).c.context_id
) )
return select(union.c.context_id).group_by(union.c.context_id)
def _apply_entities_devices_context_union( def _apply_entities_devices_context_union(
@ -66,14 +67,23 @@ def _apply_entities_devices_context_union(
json_quotable_entity_ids, json_quotable_entity_ids,
json_quotable_device_ids, json_quotable_device_ids,
).cte() ).cte()
# We used to optimize this to exclude rows we already have in the union
# with a States.entity_id.not_in(entity_ids) filter, but that made the
# query much slower on MySQL. Since those rows are filtered away in the
# Python code anyway (they will have context_only set on them), the
# impact is minimal.
return query.union_all( return query.union_all(
states_query_for_entity_ids(start_day, end_day, entity_ids), states_query_for_entity_ids(start_day, end_day, entity_ids),
select_events_context_only().where( apply_events_context_hints(
Events.context_id.in_(devices_entities_cte.select()) select_events_context_only()
), .select_from(devices_entities_cte)
.outerjoin(Events, devices_entities_cte.c.context_id == Events.context_id)
).outerjoin(EventData, (Events.data_id == EventData.data_id)),
apply_states_context_hints(
select_states_context_only() select_states_context_only()
.where(States.entity_id.not_in(entity_ids)) .select_from(devices_entities_cte)
.where(States.context_id.in_(devices_entities_cte.select())), .outerjoin(States, devices_entities_cte.c.context_id == States.context_id)
),
) )

View File

@ -93,6 +93,8 @@ TABLES_TO_CHECK = [
LAST_UPDATED_INDEX = "ix_states_last_updated" LAST_UPDATED_INDEX = "ix_states_last_updated"
ENTITY_ID_LAST_UPDATED_INDEX = "ix_states_entity_id_last_updated" ENTITY_ID_LAST_UPDATED_INDEX = "ix_states_entity_id_last_updated"
EVENTS_CONTEXT_ID_INDEX = "ix_events_context_id"
STATES_CONTEXT_ID_INDEX = "ix_states_context_id"
EMPTY_JSON_OBJECT = "{}" EMPTY_JSON_OBJECT = "{}"