Create an issue when database backups fail because the system runs out of resources (#109020)

This commit is contained in:
J. Nick Koston 2024-01-30 10:23:58 -10:00 committed by GitHub
parent 6174aa4e59
commit a22244707b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 48 additions and 1 deletions

View File

@ -119,6 +119,7 @@ from .tasks import (
WaitTask,
)
from .util import (
async_create_backup_failure_issue,
build_mysqldb_conv,
dburl_to_path,
end_incomplete_runs,
@ -1006,9 +1007,11 @@ class Recorder(threading.Thread):
def _async_set_database_locked(task: DatabaseLockTask) -> None:
task.database_locked.set()
local_start_time = dt_util.now()
hass = self.hass
with write_lock_db_sqlite(self):
# Notify that lock is being held, wait until database can be used again.
self.hass.add_job(_async_set_database_locked, task)
hass.add_job(_async_set_database_locked, task)
while not task.database_unlock.wait(timeout=DB_LOCK_QUEUE_CHECK_TIMEOUT):
if self._reached_max_backlog_percentage(90):
_LOGGER.warning(
@ -1020,6 +1023,9 @@ class Recorder(threading.Thread):
self.backlog,
)
task.queue_overflow = True
hass.add_job(
async_create_backup_failure_issue, self.hass, local_start_time
)
break
_LOGGER.info(
"Database queue backlog reached %d entries during backup",

View File

@ -12,6 +12,10 @@
"maria_db_range_index_regression": {
"title": "Update MariaDB to {min_version} or later resolve a significant performance issue",
"description": "Older versions of MariaDB suffer from a significant performance regression when retrieving history data or purging the database. Update to MariaDB version {min_version} or later and restart Home Assistant. If you are using the MariaDB core add-on, make sure to update it to the latest version."
},
"backup_failed_out_of_resources": {
"title": "Database backup failed due to lack of resources",
"description": "The database backup stated at {start_time} failed due to lack of resources. The backup cannot be trusted and must be restarted. This can happen if the database is too large or if the system is under heavy load. Consider upgrading the system hardware or reducing the size of the database by decreasing the number of history days to keep or creating a filter."
}
},
"services": {

View File

@ -470,6 +470,24 @@ def _async_create_mariadb_range_index_regression_issue(
)
@callback
def async_create_backup_failure_issue(
    hass: HomeAssistant,
    local_start_time: datetime,
) -> None:
    """Register an issue for a backup that failed for lack of resources.

    The issue is created under ``DOMAIN`` as non-fixable with critical
    severity.  Only the time of day taken from ``local_start_time``
    (formatted as ``HH:MM:SS``) is handed to the translation, as the
    ``start_time`` placeholder.
    """
    # The same string serves as both the issue id and the translation key.
    issue_id = "backup_failed_out_of_resources"
    placeholders = {"start_time": local_start_time.strftime("%H:%M:%S")}
    ir.async_create_issue(
        hass,
        DOMAIN,
        issue_id,
        is_fixable=False,
        severity=ir.IssueSeverity.CRITICAL,
        learn_more_url="https://www.home-assistant.io/integrations/recorder",
        translation_key=issue_id,
        translation_placeholders=placeholders,
    )
def setup_connection_for_dialect(
instance: Recorder,
dialect_name: str,

View File

@ -73,6 +73,7 @@ from homeassistant.const import (
)
from homeassistant.core import Context, CoreState, Event, HomeAssistant, callback
from homeassistant.helpers import entity_registry as er, recorder as recorder_helper
from homeassistant.helpers.issue_registry import async_get as async_get_issue_registry
from homeassistant.setup import async_setup_component, setup_component
from homeassistant.util import dt as dt_util
from homeassistant.util.json import json_loads
@ -1832,6 +1833,15 @@ async def test_database_lock_and_overflow(
assert "Database queue backlog reached more than" in caplog.text
assert not instance.unlock_database()
registry = async_get_issue_registry(hass)
issue = registry.async_get_issue(DOMAIN, "backup_failed_out_of_resources")
assert issue is not None
assert "start_time" in issue.translation_placeholders
start_time = issue.translation_placeholders["start_time"]
assert start_time is not None
# Should be in H:M:S format
assert start_time.count(":") == 2
async def test_database_lock_and_overflow_checks_available_memory(
async_setup_recorder_instance: RecorderInstanceGenerator,
@ -1910,6 +1920,15 @@ async def test_database_lock_and_overflow_checks_available_memory(
db_events = await instance.async_add_executor_job(_get_db_events)
assert len(db_events) >= 2
registry = async_get_issue_registry(hass)
issue = registry.async_get_issue(DOMAIN, "backup_failed_out_of_resources")
assert issue is not None
assert "start_time" in issue.translation_placeholders
start_time = issue.translation_placeholders["start_time"]
assert start_time is not None
# Should be in H:M:S format
assert start_time.count(":") == 2
async def test_database_lock_timeout(
recorder_mock: Recorder, hass: HomeAssistant, recorder_db_url: str