From 01e66d6fb2671fcbfba296a020e11e497a82bb50 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Wed, 19 Jul 2023 02:23:12 -0500 Subject: [PATCH] Improve handling of unrecoverable storage corruption (#96712) * Improve handling of unrecoverable storage corruption fixes #96574 If something in storage gets corrupted core can boot loop or if its integration specific, the integration will fail to start. We now complainly loudly in the log, move away the corrupt data and start fresh to allow startup to proceed so the user can get to the UI and restore from backup without having to attach a console (or otherwise login to the OS and manually modify files). * test for corruption * ensure OSError is still fatal * one more case * create an issue for corrupt storage * fix key * persist * feedback * feedback * better to give the full path * tweaks * grammar * add time * feedback * adjust * try to get issue_domain from storage key * coverage * tweak wording some more --- .../components/homeassistant/strings.json | 11 ++ homeassistant/helpers/storage.py | 73 +++++++- tests/helpers/test_storage.py | 159 +++++++++++++++++- 3 files changed, 236 insertions(+), 7 deletions(-) diff --git a/homeassistant/components/homeassistant/strings.json b/homeassistant/components/homeassistant/strings.json index 791b1a21929..5404ee4af64 100644 --- a/homeassistant/components/homeassistant/strings.json +++ b/homeassistant/components/homeassistant/strings.json @@ -27,6 +27,17 @@ "no_platform_setup": { "title": "Unused YAML configuration for the {platform} integration", "description": "It's not possible to configure {platform} {domain} by adding `{platform_key}` to the {domain} configuration. Please check the documentation for more information on how to set up this integration.\n\nTo resolve this:\n1. Remove `{platform_key}` occurences from the `{domain}:` configuration in your YAML configuration file.\n2. Restart Home Assistant.\n\nExample that should be removed:\n{yaml_example}\n" + }, + "storage_corruption": { + "title": "Storage corruption detected for `{storage_key}`", + "fix_flow": { + "step": { + "confirm": { + "title": "[%key:component::homeassistant::issues::storage_corruption::title%]", + "description": "The `{storage_key}` storage could not be parsed and has been renamed to `{corrupt_path}` to allow Home Assistant to continue.\n\nA default `{storage_key}` may have been created automatically.\n\nIf you made manual edits to the storage file, fix any syntax errors in `{corrupt_path}`, restore the file to the original path `{original_path}`, and restart Home Assistant. Otherwise, restore the system from a backup.\n\nClick SUBMIT below to confirm you have repaired the file or restored from a backup.\n\nThe exact error was: {error}" + } + } + } } }, "system_health": { diff --git a/homeassistant/helpers/storage.py b/homeassistant/helpers/storage.py index 128a36e3e14..dd394c84f91 100644 --- a/homeassistant/helpers/storage.py +++ b/homeassistant/helpers/storage.py @@ -6,15 +6,24 @@ from collections.abc import Callable, Mapping, Sequence from contextlib import suppress from copy import deepcopy import inspect -from json import JSONEncoder +from json import JSONDecodeError, JSONEncoder import logging import os from typing import Any, Generic, TypeVar from homeassistant.const import EVENT_HOMEASSISTANT_FINAL_WRITE -from homeassistant.core import CALLBACK_TYPE, CoreState, Event, HomeAssistant, callback +from homeassistant.core import ( + CALLBACK_TYPE, + DOMAIN as HOMEASSISTANT_DOMAIN, + CoreState, + Event, + HomeAssistant, + callback, +) +from homeassistant.exceptions import HomeAssistantError from homeassistant.loader import MAX_LOAD_CONCURRENTLY, bind_hass from homeassistant.util import json as json_util +import homeassistant.util.dt as dt_util from homeassistant.util.file import WriteError from . import json as json_helper @@ -146,9 +155,63 @@ class Store(Generic[_T]): # and we don't want that to mess with what we're trying to store. data = deepcopy(data) else: - data = await self.hass.async_add_executor_job( - json_util.load_json, self.path - ) + try: + data = await self.hass.async_add_executor_job( + json_util.load_json, self.path + ) + except HomeAssistantError as err: + if isinstance(err.__cause__, JSONDecodeError): + # If we have a JSONDecodeError, it means the file is corrupt. + # We can't recover from this, so we'll log an error, rename the file and + # return None so that we can start with a clean slate which will + # allow startup to continue so they can restore from a backup. + isotime = dt_util.utcnow().isoformat() + corrupt_postfix = f".corrupt.{isotime}" + corrupt_path = f"{self.path}{corrupt_postfix}" + await self.hass.async_add_executor_job( + os.rename, self.path, corrupt_path + ) + storage_key = self.key + _LOGGER.error( + "Unrecoverable error decoding storage %s at %s; " + "This may indicate an unclean shutdown, invalid syntax " + "from manual edits, or disk corruption; " + "The corrupt file has been saved as %s; " + "It is recommended to restore from backup: %s", + storage_key, + self.path, + corrupt_path, + err, + ) + from .issue_registry import ( # pylint: disable=import-outside-toplevel + IssueSeverity, + async_create_issue, + ) + + issue_domain = HOMEASSISTANT_DOMAIN + if ( + domain := (storage_key.partition(".")[0]) + ) and domain in self.hass.config.components: + issue_domain = domain + + async_create_issue( + self.hass, + HOMEASSISTANT_DOMAIN, + f"storage_corruption_{storage_key}_{isotime}", + is_fixable=True, + issue_domain=issue_domain, + translation_key="storage_corruption", + is_persistent=True, + severity=IssueSeverity.CRITICAL, + translation_placeholders={ + "storage_key": storage_key, + "original_path": self.path, + "corrupt_path": corrupt_path, + "error": str(err), + }, + ) + return None + raise if data == {}: return None diff --git a/tests/helpers/test_storage.py b/tests/helpers/test_storage.py index 76dfbdbeb46..81953c7d785 100644 --- a/tests/helpers/test_storage.py +++ b/tests/helpers/test_storage.py @@ -2,6 +2,7 @@ import asyncio from datetime import timedelta import json +import os from typing import Any, NamedTuple from unittest.mock import Mock, patch @@ -12,8 +13,9 @@ from homeassistant.const import ( EVENT_HOMEASSISTANT_FINAL_WRITE, EVENT_HOMEASSISTANT_STOP, ) -from homeassistant.core import CoreState, HomeAssistant -from homeassistant.helpers import storage +from homeassistant.core import DOMAIN as HOMEASSISTANT_DOMAIN, CoreState, HomeAssistant +from homeassistant.exceptions import HomeAssistantError +from homeassistant.helpers import issue_registry as ir, storage from homeassistant.util import dt as dt_util from homeassistant.util.color import RGBColor @@ -548,3 +550,156 @@ async def test_saving_load_round_trip(tmpdir: py.path.local) -> None: } await hass.async_stop(force=True) + + +async def test_loading_corrupt_core_file( + tmpdir: py.path.local, caplog: pytest.LogCaptureFixture +) -> None: + """Test we handle unrecoverable corruption in a core file.""" + loop = asyncio.get_running_loop() + hass = await async_test_home_assistant(loop) + + tmp_storage = await hass.async_add_executor_job(tmpdir.mkdir, "temp_storage") + hass.config.config_dir = tmp_storage + + storage_key = "core.anything" + store = storage.Store( + hass, MOCK_VERSION_2, storage_key, minor_version=MOCK_MINOR_VERSION_1 + ) + await store.async_save({"hello": "world"}) + storage_path = os.path.join(tmp_storage, ".storage") + store_file = os.path.join(storage_path, store.key) + + data = await store.async_load() + assert data == {"hello": "world"} + + def _corrupt_store(): + with open(store_file, "w") as f: + f.write("corrupt") + + await hass.async_add_executor_job(_corrupt_store) + + data = await store.async_load() + assert data is None + assert "Unrecoverable error decoding storage" in caplog.text + + issue_registry = ir.async_get(hass) + found_issue = None + issue_entry = None + for (domain, issue), entry in issue_registry.issues.items(): + if domain == HOMEASSISTANT_DOMAIN and issue.startswith( + f"storage_corruption_{storage_key}_" + ): + found_issue = issue + issue_entry = entry + break + + assert found_issue is not None + assert issue_entry is not None + assert issue_entry.is_fixable is True + assert issue_entry.translation_placeholders["storage_key"] == storage_key + assert issue_entry.issue_domain == HOMEASSISTANT_DOMAIN + assert ( + issue_entry.translation_placeholders["error"] + == "unexpected character: line 1 column 1 (char 0)" + ) + + files = await hass.async_add_executor_job( + os.listdir, os.path.join(tmp_storage, ".storage") + ) + assert ".corrupt" in files[0] + + await hass.async_stop(force=True) + + +async def test_loading_corrupt_file_known_domain( + tmpdir: py.path.local, caplog: pytest.LogCaptureFixture +) -> None: + """Test we handle unrecoverable corruption for a known domain.""" + loop = asyncio.get_running_loop() + hass = await async_test_home_assistant(loop) + hass.config.components.add("testdomain") + storage_key = "testdomain.testkey" + + tmp_storage = await hass.async_add_executor_job(tmpdir.mkdir, "temp_storage") + hass.config.config_dir = tmp_storage + + store = storage.Store( + hass, MOCK_VERSION_2, storage_key, minor_version=MOCK_MINOR_VERSION_1 + ) + await store.async_save({"hello": "world"}) + storage_path = os.path.join(tmp_storage, ".storage") + store_file = os.path.join(storage_path, store.key) + + data = await store.async_load() + assert data == {"hello": "world"} + + def _corrupt_store(): + with open(store_file, "w") as f: + f.write('{"valid":"json"}..with..corrupt') + + await hass.async_add_executor_job(_corrupt_store) + + data = await store.async_load() + assert data is None + assert "Unrecoverable error decoding storage" in caplog.text + + issue_registry = ir.async_get(hass) + found_issue = None + issue_entry = None + for (domain, issue), entry in issue_registry.issues.items(): + if domain == HOMEASSISTANT_DOMAIN and issue.startswith( + f"storage_corruption_{storage_key}_" + ): + found_issue = issue + issue_entry = entry + break + + assert found_issue is not None + assert issue_entry is not None + assert issue_entry.is_fixable is True + assert issue_entry.translation_placeholders["storage_key"] == storage_key + assert issue_entry.issue_domain == "testdomain" + assert ( + issue_entry.translation_placeholders["error"] + == "unexpected content after document: line 1 column 17 (char 16)" + ) + + files = await hass.async_add_executor_job( + os.listdir, os.path.join(tmp_storage, ".storage") + ) + assert ".corrupt" in files[0] + + await hass.async_stop(force=True) + + +async def test_os_error_is_fatal(tmpdir: py.path.local) -> None: + """Test OSError during load is fatal.""" + loop = asyncio.get_running_loop() + hass = await async_test_home_assistant(loop) + + tmp_storage = await hass.async_add_executor_job(tmpdir.mkdir, "temp_storage") + hass.config.config_dir = tmp_storage + + store = storage.Store( + hass, MOCK_VERSION_2, MOCK_KEY, minor_version=MOCK_MINOR_VERSION_1 + ) + await store.async_save({"hello": "world"}) + + with pytest.raises(OSError), patch( + "homeassistant.helpers.storage.json_util.load_json", side_effect=OSError + ): + await store.async_load() + + base_os_error = OSError() + base_os_error.errno = 30 + home_assistant_error = HomeAssistantError() + home_assistant_error.__cause__ = base_os_error + + with pytest.raises(HomeAssistantError), patch( + "homeassistant.helpers.storage.json_util.load_json", + side_effect=home_assistant_error, + ): + await store.async_load() + + await hass.async_stop(force=True)