From 1a82b353e0d102eea6ccd57fae2bc153d2004f77 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sat, 29 Apr 2023 20:17:09 -0500 Subject: [PATCH] Auto repair incorrect collation on MySQL schema (#92270) * Auto repair incorrect collation on MySQL schema As we do more union queries in 2023.5.x if there is a mismatch between collations on tables, they will fail with an error that is hard for the user to figure out how to fix `Error executing query: (MySQLdb.OperationalError) (1271, "Illegal mix of collations for operation UNION")` This was reported in the #beta channel and by PM from others so the problem is not isolated to a single user https://discord.com/channels/330944238910963714/427516175237382144/1100908739910963272 * test with ascii since older maraidb versions may not work otherwise * Revert "test with ascii since older maraidb versions may not work otherwise" This reverts commit 787fda1aefcd8418a28a8a8f430e7e7232218ef8.t * older version need to check collation_server because the collation is not reflected if its the default --- .../recorder/auto_repairs/events/schema.py | 9 ++- .../recorder/auto_repairs/schema.py | 60 ++++++++++++++++- .../recorder/auto_repairs/states/schema.py | 3 + .../auto_repairs/statistics/schema.py | 3 + .../auto_repairs/events/test_schema.py | 29 +++++++++ .../auto_repairs/states/test_schema.py | 29 +++++++++ .../auto_repairs/statistics/test_schema.py | 30 +++++++++ .../recorder/auto_repairs/test_schema.py | 64 +++++++++++++++++++ 8 files changed, 224 insertions(+), 3 deletions(-) diff --git a/homeassistant/components/recorder/auto_repairs/events/schema.py b/homeassistant/components/recorder/auto_repairs/events/schema.py index e32cbd4df7f..3cc2e74f95b 100644 --- a/homeassistant/components/recorder/auto_repairs/events/schema.py +++ b/homeassistant/components/recorder/auto_repairs/events/schema.py @@ -8,6 +8,7 @@ from ..schema import ( correct_db_schema_precision, correct_db_schema_utf8, validate_db_schema_precision, + validate_table_schema_has_correct_collation, validate_table_schema_supports_utf8, ) @@ -17,9 +18,12 @@ if TYPE_CHECKING: def validate_db_schema(instance: Recorder) -> set[str]: """Do some basic checks for common schema errors caused by manual migration.""" - return validate_table_schema_supports_utf8( + schema_errors = validate_table_schema_supports_utf8( instance, EventData, (EventData.shared_data,) ) | validate_db_schema_precision(instance, Events) + for table in (Events, EventData): + schema_errors |= validate_table_schema_has_correct_collation(instance, table) + return schema_errors def correct_db_schema( @@ -27,5 +31,6 @@ def correct_db_schema( schema_errors: set[str], ) -> None: """Correct issues detected by validate_db_schema.""" - correct_db_schema_utf8(instance, EventData, schema_errors) + for table in (Events, EventData): + correct_db_schema_utf8(instance, table, schema_errors) correct_db_schema_precision(instance, Events, schema_errors) diff --git a/homeassistant/components/recorder/auto_repairs/schema.py b/homeassistant/components/recorder/auto_repairs/schema.py index ec05eafd140..aa036f33999 100644 --- a/homeassistant/components/recorder/auto_repairs/schema.py +++ b/homeassistant/components/recorder/auto_repairs/schema.py @@ -5,6 +5,7 @@ from collections.abc import Iterable, Mapping import logging from typing import TYPE_CHECKING +from sqlalchemy import MetaData from sqlalchemy.exc import OperationalError from sqlalchemy.orm import DeclarativeBase from sqlalchemy.orm.attributes import InstrumentedAttribute @@ -60,6 +61,60 @@ def validate_table_schema_supports_utf8( return schema_errors +def validate_table_schema_has_correct_collation( + instance: Recorder, + table_object: type[DeclarativeBase], +) -> set[str]: + """Verify the table has the correct collation.""" + schema_errors: set[str] = set() + # Lack of full utf8 support is only an issue for MySQL / MariaDB + if instance.dialect_name != SupportedDialect.MYSQL: + return schema_errors + + try: + schema_errors = _validate_table_schema_has_correct_collation( + instance, table_object + ) + except Exception as exc: # pylint: disable=broad-except + _LOGGER.exception("Error when validating DB schema: %s", exc) + + _log_schema_errors(table_object, schema_errors) + return schema_errors + + +def _validate_table_schema_has_correct_collation( + instance: Recorder, + table_object: type[DeclarativeBase], +) -> set[str]: + """Ensure the table has the correct collation to avoid union errors with mixed collations.""" + schema_errors: set[str] = set() + # Mark the session as read_only to ensure that the test data is not committed + # to the database and we always rollback when the scope is exited + with session_scope(session=instance.get_session(), read_only=True) as session: + table = table_object.__tablename__ + metadata_obj = MetaData() + connection = session.connection() + metadata_obj.reflect(bind=connection) + dialect_kwargs = metadata_obj.tables[table].dialect_kwargs + # Check if the table has a collation set, if its not set than its + # using the server default collation for the database + + collate = ( + dialect_kwargs.get("mysql_collate") + or dialect_kwargs.get( + "mariadb_collate" + ) # pylint: disable-next=protected-access + or connection.dialect._fetch_setting(connection, "collation_server") # type: ignore[attr-defined] + ) + if collate and collate != "utf8mb4_unicode_ci": + _LOGGER.debug( + "Database %s collation is not utf8mb4_unicode_ci", + table, + ) + schema_errors.add(f"{table}.utf8mb4_unicode_ci") + return schema_errors + + def _validate_table_schema_supports_utf8( instance: Recorder, table_object: type[DeclarativeBase], @@ -184,7 +239,10 @@ def correct_db_schema_utf8( ) -> None: """Correct utf8 issues detected by validate_db_schema.""" table_name = table_object.__tablename__ - if f"{table_name}.4-byte UTF-8" in schema_errors: + if ( + f"{table_name}.4-byte UTF-8" in schema_errors + or f"{table_name}.utf8mb4_unicode_ci" in schema_errors + ): from ..migration import ( # pylint: disable=import-outside-toplevel _correct_table_character_set_and_collation, ) diff --git a/homeassistant/components/recorder/auto_repairs/states/schema.py b/homeassistant/components/recorder/auto_repairs/states/schema.py index 258e15cbb52..3c0daef452d 100644 --- a/homeassistant/components/recorder/auto_repairs/states/schema.py +++ b/homeassistant/components/recorder/auto_repairs/states/schema.py @@ -8,6 +8,7 @@ from ..schema import ( correct_db_schema_precision, correct_db_schema_utf8, validate_db_schema_precision, + validate_table_schema_has_correct_collation, validate_table_schema_supports_utf8, ) @@ -26,6 +27,8 @@ def validate_db_schema(instance: Recorder) -> set[str]: for table, columns in TABLE_UTF8_COLUMNS.items(): schema_errors |= validate_table_schema_supports_utf8(instance, table, columns) schema_errors |= validate_db_schema_precision(instance, States) + for table in (States, StateAttributes): + schema_errors |= validate_table_schema_has_correct_collation(instance, table) return schema_errors diff --git a/homeassistant/components/recorder/auto_repairs/statistics/schema.py b/homeassistant/components/recorder/auto_repairs/statistics/schema.py index 9b4687cb72d..607935bd6ff 100644 --- a/homeassistant/components/recorder/auto_repairs/statistics/schema.py +++ b/homeassistant/components/recorder/auto_repairs/statistics/schema.py @@ -9,6 +9,7 @@ from ..schema import ( correct_db_schema_precision, correct_db_schema_utf8, validate_db_schema_precision, + validate_table_schema_has_correct_collation, validate_table_schema_supports_utf8, ) @@ -26,6 +27,7 @@ def validate_db_schema(instance: Recorder) -> set[str]: ) for table in (Statistics, StatisticsShortTerm): schema_errors |= validate_db_schema_precision(instance, table) + schema_errors |= validate_table_schema_has_correct_collation(instance, table) if schema_errors: _LOGGER.debug( "Detected statistics schema errors: %s", ", ".join(sorted(schema_errors)) @@ -41,3 +43,4 @@ def correct_db_schema( correct_db_schema_utf8(instance, StatisticsMeta, schema_errors) for table in (Statistics, StatisticsShortTerm): correct_db_schema_precision(instance, table, schema_errors) + correct_db_schema_utf8(instance, table, schema_errors) diff --git a/tests/components/recorder/auto_repairs/events/test_schema.py b/tests/components/recorder/auto_repairs/events/test_schema.py index b19ff4ca503..1fd5d769c7c 100644 --- a/tests/components/recorder/auto_repairs/events/test_schema.py +++ b/tests/components/recorder/auto_repairs/events/test_schema.py @@ -74,3 +74,32 @@ async def test_validate_db_schema_fix_utf8_issue_event_data( "Updating character set and collation of table event_data to utf8mb4" in caplog.text ) + + +@pytest.mark.parametrize("enable_schema_validation", [True]) +async def test_validate_db_schema_fix_collation_issue( + async_setup_recorder_instance: RecorderInstanceGenerator, + hass: HomeAssistant, + caplog: pytest.LogCaptureFixture, +) -> None: + """Test validating DB schema with MySQL. + + Note: The test uses SQLite, the purpose is only to exercise the code. + """ + with patch( + "homeassistant.components.recorder.core.Recorder.dialect_name", "mysql" + ), patch( + "homeassistant.components.recorder.auto_repairs.schema._validate_table_schema_has_correct_collation", + return_value={"events.utf8mb4_unicode_ci"}, + ): + await async_setup_recorder_instance(hass) + await async_wait_recording_done(hass) + + assert "Schema validation failed" not in caplog.text + assert ( + "Database is about to correct DB schema errors: events.utf8mb4_unicode_ci" + in caplog.text + ) + assert ( + "Updating character set and collation of table events to utf8mb4" in caplog.text + ) diff --git a/tests/components/recorder/auto_repairs/states/test_schema.py b/tests/components/recorder/auto_repairs/states/test_schema.py index 2e37001582e..9b90489d7c0 100644 --- a/tests/components/recorder/auto_repairs/states/test_schema.py +++ b/tests/components/recorder/auto_repairs/states/test_schema.py @@ -104,3 +104,32 @@ async def test_validate_db_schema_fix_utf8_issue_state_attributes( "Updating character set and collation of table state_attributes to utf8mb4" in caplog.text ) + + +@pytest.mark.parametrize("enable_schema_validation", [True]) +async def test_validate_db_schema_fix_collation_issue( + async_setup_recorder_instance: RecorderInstanceGenerator, + hass: HomeAssistant, + caplog: pytest.LogCaptureFixture, +) -> None: + """Test validating DB schema with MySQL. + + Note: The test uses SQLite, the purpose is only to exercise the code. + """ + with patch( + "homeassistant.components.recorder.core.Recorder.dialect_name", "mysql" + ), patch( + "homeassistant.components.recorder.auto_repairs.schema._validate_table_schema_has_correct_collation", + return_value={"states.utf8mb4_unicode_ci"}, + ): + await async_setup_recorder_instance(hass) + await async_wait_recording_done(hass) + + assert "Schema validation failed" not in caplog.text + assert ( + "Database is about to correct DB schema errors: states.utf8mb4_unicode_ci" + in caplog.text + ) + assert ( + "Updating character set and collation of table states to utf8mb4" in caplog.text + ) diff --git a/tests/components/recorder/auto_repairs/statistics/test_schema.py b/tests/components/recorder/auto_repairs/statistics/test_schema.py index dfe036355aa..10d1ed00b5b 100644 --- a/tests/components/recorder/auto_repairs/statistics/test_schema.py +++ b/tests/components/recorder/auto_repairs/statistics/test_schema.py @@ -83,3 +83,33 @@ async def test_validate_db_schema_fix_float_issue( "sum DOUBLE PRECISION", ] modify_columns_mock.assert_called_once_with(ANY, ANY, table, modification) + + +@pytest.mark.parametrize("enable_schema_validation", [True]) +async def test_validate_db_schema_fix_collation_issue( + async_setup_recorder_instance: RecorderInstanceGenerator, + hass: HomeAssistant, + caplog: pytest.LogCaptureFixture, +) -> None: + """Test validating DB schema with MySQL. + + Note: The test uses SQLite, the purpose is only to exercise the code. + """ + with patch( + "homeassistant.components.recorder.core.Recorder.dialect_name", "mysql" + ), patch( + "homeassistant.components.recorder.auto_repairs.schema._validate_table_schema_has_correct_collation", + return_value={"statistics.utf8mb4_unicode_ci"}, + ): + await async_setup_recorder_instance(hass) + await async_wait_recording_done(hass) + + assert "Schema validation failed" not in caplog.text + assert ( + "Database is about to correct DB schema errors: statistics.utf8mb4_unicode_ci" + in caplog.text + ) + assert ( + "Updating character set and collation of table statistics to utf8mb4" + in caplog.text + ) diff --git a/tests/components/recorder/auto_repairs/test_schema.py b/tests/components/recorder/auto_repairs/test_schema.py index 510f46f98a2..ad2c33bfb88 100644 --- a/tests/components/recorder/auto_repairs/test_schema.py +++ b/tests/components/recorder/auto_repairs/test_schema.py @@ -10,6 +10,7 @@ from homeassistant.components.recorder.auto_repairs.schema import ( correct_db_schema_precision, correct_db_schema_utf8, validate_db_schema_precision, + validate_table_schema_has_correct_collation, validate_table_schema_supports_utf8, ) from homeassistant.components.recorder.db_schema import States @@ -106,6 +107,69 @@ async def test_validate_db_schema_fix_utf8_issue_with_broken_schema( assert schema_errors == set() +async def test_validate_db_schema_fix_incorrect_collation( + async_setup_recorder_instance: RecorderInstanceGenerator, + hass: HomeAssistant, + recorder_db_url: str, + caplog: pytest.LogCaptureFixture, +) -> None: + """Test validating DB schema with MySQL when the collation is incorrect.""" + if not recorder_db_url.startswith("mysql://"): + # This problem only happens on MySQL + return + await async_setup_recorder_instance(hass) + await async_wait_recording_done(hass) + instance = get_instance(hass) + session_maker = instance.get_session + + def _break_states_schema(): + with session_scope(session=session_maker()) as session: + session.execute( + text( + "ALTER TABLE states CHARACTER SET utf8mb3 COLLATE utf8_general_ci, " + "LOCK=EXCLUSIVE;" + ) + ) + + await instance.async_add_executor_job(_break_states_schema) + schema_errors = await instance.async_add_executor_job( + validate_table_schema_has_correct_collation, instance, States + ) + assert schema_errors == {"states.utf8mb4_unicode_ci"} + + # Now repair the schema + await instance.async_add_executor_job( + correct_db_schema_utf8, instance, States, schema_errors + ) + + # Now validate the schema again + schema_errors = await instance.async_add_executor_job( + validate_table_schema_has_correct_collation, instance, States + ) + assert schema_errors == set() + + +async def test_validate_db_schema_precision_correct_collation( + async_setup_recorder_instance: RecorderInstanceGenerator, + hass: HomeAssistant, + recorder_db_url: str, + caplog: pytest.LogCaptureFixture, +) -> None: + """Test validating DB schema when the schema is correct with the correct collation.""" + if not recorder_db_url.startswith("mysql://"): + # This problem only happens on MySQL + return + await async_setup_recorder_instance(hass) + await async_wait_recording_done(hass) + instance = get_instance(hass) + schema_errors = await instance.async_add_executor_job( + validate_table_schema_has_correct_collation, + instance, + States, + ) + assert schema_errors == set() + + async def test_validate_db_schema_fix_utf8_issue_with_broken_schema_unrepairable( async_setup_recorder_instance: RecorderInstanceGenerator, hass: HomeAssistant,