Auto repair incorrect collation on MySQL schema (#92270)

* Auto repair incorrect collation on MySQL schema

As we do more union queries in 2023.5.x, queries will fail if there is
a mismatch between collations on tables, and the resulting error
is hard for the user to figure out how to fix:

`Error executing query: (MySQLdb.OperationalError) (1271, "Illegal mix of collations for operation UNION")`

This was reported in the #beta channel and by PM from others
so the problem is not isolated to a single user

https://discord.com/channels/330944238910963714/427516175237382144/1100908739910963272

* test with ascii since older MariaDB versions may not work otherwise

* Revert "test with ascii since older MariaDB versions may not work otherwise"

This reverts commit 787fda1aefcd8418a28a8a8f430e7e7232218ef8.

* older versions need to check collation_server because the collation is not reflected if it's the default
This commit is contained in:
J. Nick Koston 2023-04-29 20:17:09 -05:00 committed by GitHub
parent 3a5a9a90b2
commit 1a82b353e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 224 additions and 3 deletions

View File

@ -8,6 +8,7 @@ from ..schema import (
correct_db_schema_precision,
correct_db_schema_utf8,
validate_db_schema_precision,
validate_table_schema_has_correct_collation,
validate_table_schema_supports_utf8,
)
@ -17,9 +18,12 @@ if TYPE_CHECKING:
def validate_db_schema(instance: Recorder) -> set[str]:
"""Do some basic checks for common schema errors caused by manual migration."""
return validate_table_schema_supports_utf8(
schema_errors = validate_table_schema_supports_utf8(
instance, EventData, (EventData.shared_data,)
) | validate_db_schema_precision(instance, Events)
for table in (Events, EventData):
schema_errors |= validate_table_schema_has_correct_collation(instance, table)
return schema_errors
def correct_db_schema(
@ -27,5 +31,6 @@ def correct_db_schema(
schema_errors: set[str],
) -> None:
"""Correct issues detected by validate_db_schema."""
correct_db_schema_utf8(instance, EventData, schema_errors)
for table in (Events, EventData):
correct_db_schema_utf8(instance, table, schema_errors)
correct_db_schema_precision(instance, Events, schema_errors)

View File

@ -5,6 +5,7 @@ from collections.abc import Iterable, Mapping
import logging
from typing import TYPE_CHECKING
from sqlalchemy import MetaData
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm.attributes import InstrumentedAttribute
@ -60,6 +61,60 @@ def validate_table_schema_supports_utf8(
return schema_errors
def validate_table_schema_has_correct_collation(
    instance: Recorder,
    table_object: type[DeclarativeBase],
) -> set[str]:
    """Check a table's collation and return the detected schema errors.

    The check only runs when the recorder's dialect is MySQL; for any other
    dialect an empty set is returned without touching the database.

    Any exception raised by the underlying check is logged and swallowed,
    in which case the returned set is empty. The result is always passed
    to ``_log_schema_errors`` before being returned.
    """
    # Collation mismatches are only checked for the MySQL dialect.
    if instance.dialect_name != SupportedDialect.MYSQL:
        return set()

    found_errors: set[str] = set()
    try:
        found_errors = _validate_table_schema_has_correct_collation(
            instance, table_object
        )
    except Exception as exc:  # pylint: disable=broad-except
        # Best effort: a failed validation must not abort the caller.
        _LOGGER.exception("Error when validating DB schema: %s", exc)

    _log_schema_errors(table_object, found_errors)
    return found_errors
def _validate_table_schema_has_correct_collation(
    instance: Recorder,
    table_object: type[DeclarativeBase],
) -> set[str]:
    """Report the table if its collation is not utf8mb4_unicode_ci.

    Mixed collations between tables cause union queries to fail, so a
    table whose collation differs is flagged with the error key
    ``<table>.utf8mb4_unicode_ci``. An empty set means nothing was flagged.
    """
    expected_collation = "utf8mb4_unicode_ci"
    found_errors: set[str] = set()
    # The session is opened read_only so that nothing is committed and the
    # scope is always rolled back when it exits.
    with session_scope(session=instance.get_session(), read_only=True) as session:
        table = table_object.__tablename__
        connection = session.connection()
        reflected = MetaData()
        reflected.reflect(bind=connection)
        table_options = reflected.tables[table].dialect_kwargs
        # Prefer the collation reflected on the table itself (the MySQL or
        # MariaDB flavored option).
        collate = table_options.get("mysql_collate") or table_options.get(
            "mariadb_collate"
        )
        if not collate:
            # No collation was reflected for the table, so fall back to the
            # server's collation_server setting.
            # pylint: disable-next=protected-access
            collate = connection.dialect._fetch_setting(connection, "collation_server")  # type: ignore[attr-defined]
        if collate and collate != expected_collation:
            _LOGGER.debug(
                "Database %s collation is not utf8mb4_unicode_ci",
                table,
            )
            found_errors.add(f"{table}.utf8mb4_unicode_ci")
    return found_errors
def _validate_table_schema_supports_utf8(
instance: Recorder,
table_object: type[DeclarativeBase],
@ -184,7 +239,10 @@ def correct_db_schema_utf8(
) -> None:
"""Correct utf8 issues detected by validate_db_schema."""
table_name = table_object.__tablename__
if f"{table_name}.4-byte UTF-8" in schema_errors:
if (
f"{table_name}.4-byte UTF-8" in schema_errors
or f"{table_name}.utf8mb4_unicode_ci" in schema_errors
):
from ..migration import ( # pylint: disable=import-outside-toplevel
_correct_table_character_set_and_collation,
)

View File

@ -8,6 +8,7 @@ from ..schema import (
correct_db_schema_precision,
correct_db_schema_utf8,
validate_db_schema_precision,
validate_table_schema_has_correct_collation,
validate_table_schema_supports_utf8,
)
@ -26,6 +27,8 @@ def validate_db_schema(instance: Recorder) -> set[str]:
for table, columns in TABLE_UTF8_COLUMNS.items():
schema_errors |= validate_table_schema_supports_utf8(instance, table, columns)
schema_errors |= validate_db_schema_precision(instance, States)
for table in (States, StateAttributes):
schema_errors |= validate_table_schema_has_correct_collation(instance, table)
return schema_errors

View File

@ -9,6 +9,7 @@ from ..schema import (
correct_db_schema_precision,
correct_db_schema_utf8,
validate_db_schema_precision,
validate_table_schema_has_correct_collation,
validate_table_schema_supports_utf8,
)
@ -26,6 +27,7 @@ def validate_db_schema(instance: Recorder) -> set[str]:
)
for table in (Statistics, StatisticsShortTerm):
schema_errors |= validate_db_schema_precision(instance, table)
schema_errors |= validate_table_schema_has_correct_collation(instance, table)
if schema_errors:
_LOGGER.debug(
"Detected statistics schema errors: %s", ", ".join(sorted(schema_errors))
@ -41,3 +43,4 @@ def correct_db_schema(
correct_db_schema_utf8(instance, StatisticsMeta, schema_errors)
for table in (Statistics, StatisticsShortTerm):
correct_db_schema_precision(instance, table, schema_errors)
correct_db_schema_utf8(instance, table, schema_errors)

View File

@ -74,3 +74,32 @@ async def test_validate_db_schema_fix_utf8_issue_event_data(
"Updating character set and collation of table event_data to utf8mb4"
in caplog.text
)
@pytest.mark.parametrize("enable_schema_validation", [True])
async def test_validate_db_schema_fix_collation_issue(
    async_setup_recorder_instance: RecorderInstanceGenerator,
    hass: HomeAssistant,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test that a collation error reported for the events table is repaired.

    Note: The test uses SQLite, the purpose is only to exercise the code.
    The dialect name is patched to "mysql" and the collation check is
    patched to report an error for the events table.
    """
    # Pretend the recorder is running against MySQL.
    dialect_patch = patch(
        "homeassistant.components.recorder.core.Recorder.dialect_name", "mysql"
    )
    # Force the collation check to flag the events table.
    collation_patch = patch(
        "homeassistant.components.recorder.auto_repairs.schema._validate_table_schema_has_correct_collation",
        return_value={"events.utf8mb4_unicode_ci"},
    )
    with dialect_patch, collation_patch:
        await async_setup_recorder_instance(hass)
        await async_wait_recording_done(hass)

    logged = caplog.text
    assert "Schema validation failed" not in logged
    assert (
        "Database is about to correct DB schema errors: events.utf8mb4_unicode_ci"
        in logged
    )
    assert "Updating character set and collation of table events to utf8mb4" in logged

View File

@ -104,3 +104,32 @@ async def test_validate_db_schema_fix_utf8_issue_state_attributes(
"Updating character set and collation of table state_attributes to utf8mb4"
in caplog.text
)
@pytest.mark.parametrize("enable_schema_validation", [True])
async def test_validate_db_schema_fix_collation_issue(
    async_setup_recorder_instance: RecorderInstanceGenerator,
    hass: HomeAssistant,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test that a collation error reported for the states table is repaired.

    Note: The test uses SQLite, the purpose is only to exercise the code.
    The dialect name is patched to "mysql" and the collation check is
    patched to report an error for the states table.
    """
    # Pretend the recorder is running against MySQL.
    dialect_patch = patch(
        "homeassistant.components.recorder.core.Recorder.dialect_name", "mysql"
    )
    # Force the collation check to flag the states table.
    collation_patch = patch(
        "homeassistant.components.recorder.auto_repairs.schema._validate_table_schema_has_correct_collation",
        return_value={"states.utf8mb4_unicode_ci"},
    )
    with dialect_patch, collation_patch:
        await async_setup_recorder_instance(hass)
        await async_wait_recording_done(hass)

    logged = caplog.text
    assert "Schema validation failed" not in logged
    assert (
        "Database is about to correct DB schema errors: states.utf8mb4_unicode_ci"
        in logged
    )
    assert "Updating character set and collation of table states to utf8mb4" in logged

View File

@ -83,3 +83,33 @@ async def test_validate_db_schema_fix_float_issue(
"sum DOUBLE PRECISION",
]
modify_columns_mock.assert_called_once_with(ANY, ANY, table, modification)
@pytest.mark.parametrize("enable_schema_validation", [True])
async def test_validate_db_schema_fix_collation_issue(
    async_setup_recorder_instance: RecorderInstanceGenerator,
    hass: HomeAssistant,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test that a collation error reported for the statistics table is repaired.

    Note: The test uses SQLite, the purpose is only to exercise the code.
    The dialect name is patched to "mysql" and the collation check is
    patched to report an error for the statistics table.
    """
    # Pretend the recorder is running against MySQL.
    dialect_patch = patch(
        "homeassistant.components.recorder.core.Recorder.dialect_name", "mysql"
    )
    # Force the collation check to flag the statistics table.
    collation_patch = patch(
        "homeassistant.components.recorder.auto_repairs.schema._validate_table_schema_has_correct_collation",
        return_value={"statistics.utf8mb4_unicode_ci"},
    )
    with dialect_patch, collation_patch:
        await async_setup_recorder_instance(hass)
        await async_wait_recording_done(hass)

    logged = caplog.text
    assert "Schema validation failed" not in logged
    assert (
        "Database is about to correct DB schema errors: statistics.utf8mb4_unicode_ci"
        in logged
    )
    assert (
        "Updating character set and collation of table statistics to utf8mb4"
        in logged
    )

View File

@ -10,6 +10,7 @@ from homeassistant.components.recorder.auto_repairs.schema import (
correct_db_schema_precision,
correct_db_schema_utf8,
validate_db_schema_precision,
validate_table_schema_has_correct_collation,
validate_table_schema_supports_utf8,
)
from homeassistant.components.recorder.db_schema import States
@ -106,6 +107,69 @@ async def test_validate_db_schema_fix_utf8_issue_with_broken_schema(
assert schema_errors == set()
async def test_validate_db_schema_fix_incorrect_collation(
    async_setup_recorder_instance: RecorderInstanceGenerator,
    hass: HomeAssistant,
    recorder_db_url: str,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test detecting and repairing an incorrect collation on MySQL.

    The states table is switched to utf8mb3 / utf8_general_ci, the
    collation check is expected to flag it, and after the repair the check
    is expected to come back clean. The test body is a no-op unless the
    recorder database URL starts with "mysql://".
    """
    if not recorder_db_url.startswith("mysql://"):
        # This problem only happens on MySQL
        return

    await async_setup_recorder_instance(hass)
    await async_wait_recording_done(hass)
    instance = get_instance(hass)

    def _break_states_schema():
        # Give the states table a collation that the check must reject.
        with session_scope(session=instance.get_session()) as session:
            session.execute(
                text(
                    "ALTER TABLE states CHARACTER SET utf8mb3 COLLATE utf8_general_ci, "
                    "LOCK=EXCLUSIVE;"
                )
            )

    async def _collation_errors() -> set[str]:
        # Run the collation check for the states table in the executor.
        return await instance.async_add_executor_job(
            validate_table_schema_has_correct_collation, instance, States
        )

    await instance.async_add_executor_job(_break_states_schema)
    detected = await _collation_errors()
    assert detected == {"states.utf8mb4_unicode_ci"}

    # Now repair the schema
    await instance.async_add_executor_job(
        correct_db_schema_utf8, instance, States, detected
    )

    # Now validate the schema again
    remaining = await _collation_errors()
    assert remaining == set()
async def test_validate_db_schema_precision_correct_collation(
    async_setup_recorder_instance: RecorderInstanceGenerator,
    hass: HomeAssistant,
    recorder_db_url: str,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test that an untouched schema reports no collation errors.

    The recorder is set up normally and the collation check is run against
    the states table; no errors are expected. The test body is a no-op
    unless the recorder database URL starts with "mysql://".
    """
    if not recorder_db_url.startswith("mysql://"):
        # This problem only happens on MySQL
        return

    await async_setup_recorder_instance(hass)
    await async_wait_recording_done(hass)

    recorder = get_instance(hass)
    detected = await recorder.async_add_executor_job(
        validate_table_schema_has_correct_collation,
        recorder,
        States,
    )
    assert detected == set()
async def test_validate_db_schema_fix_utf8_issue_with_broken_schema_unrepairable(
async_setup_recorder_instance: RecorderInstanceGenerator,
hass: HomeAssistant,