From 3f42911af4291c2e5d85806e394f02a5f62bc733 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Mon, 21 Jul 2025 10:33:23 -0500 Subject: [PATCH] Add streaming to cloud TTS (#148925) --- homeassistant/components/cloud/tts.py | 35 +++- tests/components/cloud/test_tts.py | 244 ++++++++++++++++++++------ 2 files changed, 218 insertions(+), 61 deletions(-) diff --git a/homeassistant/components/cloud/tts.py b/homeassistant/components/cloud/tts.py index 85ca599fa87..179f467922f 100644 --- a/homeassistant/components/cloud/tts.py +++ b/homeassistant/components/cloud/tts.py @@ -17,6 +17,8 @@ from homeassistant.components.tts import ( PLATFORM_SCHEMA as TTS_PLATFORM_SCHEMA, Provider, TextToSpeechEntity, + TTSAudioRequest, + TTSAudioResponse, TtsAudioType, Voice, ) @@ -332,7 +334,7 @@ class CloudTTSEntity(TextToSpeechEntity): def default_options(self) -> dict[str, str]: """Return a dict include default options.""" return { - ATTR_AUDIO_OUTPUT: AudioOutput.MP3, + ATTR_AUDIO_OUTPUT: AudioOutput.MP3.value, } @property @@ -433,6 +435,29 @@ class CloudTTSEntity(TextToSpeechEntity): return (options[ATTR_AUDIO_OUTPUT], data) + async def async_stream_tts_audio( + self, request: TTSAudioRequest + ) -> TTSAudioResponse: + """Generate speech from an incoming message.""" + data_gen = self.cloud.voice.process_tts_stream( + text_stream=request.message_gen, + **_prepare_voice_args( + hass=self.hass, + language=request.language, + voice=request.options.get( + ATTR_VOICE, + ( + self._voice + if request.language == self._language + else DEFAULT_VOICES[request.language] + ), + ), + gender=request.options.get(ATTR_GENDER), + ), + ) + + return TTSAudioResponse(AudioOutput.WAV.value, data_gen) + class CloudProvider(Provider): """Home Assistant Cloud speech API provider.""" @@ -526,9 +551,11 @@ class CloudProvider(Provider): language=language, voice=options.get( ATTR_VOICE, - self._voice - if language == self._language - else DEFAULT_VOICES[language], + ( + self._voice + if language == self._language + else DEFAULT_VOICES[language] + ), ), gender=options.get(ATTR_GENDER), ), diff --git a/tests/components/cloud/test_tts.py b/tests/components/cloud/test_tts.py index c920fdac264..44430f9c39a 100644 --- a/tests/components/cloud/test_tts.py +++ b/tests/components/cloud/test_tts.py @@ -1,10 +1,12 @@ """Tests for cloud tts.""" -from collections.abc import AsyncGenerator, Callable, Coroutine +from collections.abc import AsyncGenerator, AsyncIterable, Callable, Coroutine from copy import deepcopy from http import HTTPStatus +import io from typing import Any from unittest.mock import AsyncMock, MagicMock, patch +import wave from hass_nabucasa.voice import VoiceError, VoiceTokenError from hass_nabucasa.voice_data import TTS_VOICES @@ -239,6 +241,12 @@ async def test_get_tts_audio( side_effect=mock_process_tts_side_effect, ) cloud.voice.process_tts = mock_process_tts + + mock_process_tts_stream = _make_stream_mock("There is someone at the door.") + if mock_process_tts_side_effect: + mock_process_tts_stream.side_effect = mock_process_tts_side_effect + cloud.voice.process_tts_stream = mock_process_tts_stream + assert await async_setup_component(hass, "homeassistant", {}) assert await async_setup_component(hass, DOMAIN, {DOMAIN: {}}) await hass.async_block_till_done() @@ -262,13 +270,27 @@ async def test_get_tts_audio( } await hass.async_block_till_done() - assert mock_process_tts.call_count == 1 - assert mock_process_tts.call_args is not None - assert mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." - assert mock_process_tts.call_args.kwargs["language"] == "en-US" - assert mock_process_tts.call_args.kwargs["gender"] is None - assert mock_process_tts.call_args.kwargs["voice"] == "JennyNeural" - assert mock_process_tts.call_args.kwargs["output"] == "mp3" + # Force streaming + await client.get(response["path"]) + + if data.get("engine_id", "").startswith("tts."): + # Streaming + assert mock_process_tts_stream.call_count == 1 + assert mock_process_tts_stream.call_args is not None + assert mock_process_tts_stream.call_args.kwargs["language"] == "en-US" + assert mock_process_tts_stream.call_args.kwargs["gender"] is None + assert mock_process_tts_stream.call_args.kwargs["voice"] == "JennyNeural" + else: + # Non-streaming + assert mock_process_tts.call_count == 1 + assert mock_process_tts.call_args is not None + assert ( + mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." + ) + assert mock_process_tts.call_args.kwargs["language"] == "en-US" + assert mock_process_tts.call_args.kwargs["gender"] is None + assert mock_process_tts.call_args.kwargs["voice"] == "JennyNeural" + assert mock_process_tts.call_args.kwargs["output"] == "mp3" @pytest.mark.parametrize( @@ -321,10 +343,10 @@ async def test_get_tts_audio_logged_out( @pytest.mark.parametrize( - ("mock_process_tts_return_value", "mock_process_tts_side_effect"), + ("mock_process_tts_side_effect"), [ - (b"", None), - (None, VoiceError("Boom!")), + (None,), + (VoiceError("Boom!"),), ], ) async def test_tts_entity( @@ -332,15 +354,13 @@ async def test_tts_entity( hass_client: ClientSessionGenerator, entity_registry: EntityRegistry, cloud: MagicMock, - mock_process_tts_return_value: bytes | None, mock_process_tts_side_effect: Exception | None, ) -> None: """Test text-to-speech entity.""" - mock_process_tts = AsyncMock( - return_value=mock_process_tts_return_value, - side_effect=mock_process_tts_side_effect, - ) - cloud.voice.process_tts = mock_process_tts + mock_process_tts_stream = _make_stream_mock("There is someone at the door.") + if mock_process_tts_side_effect: + mock_process_tts_stream.side_effect = mock_process_tts_side_effect + cloud.voice.process_tts_stream = mock_process_tts_stream assert await async_setup_component(hass, "homeassistant", {}) assert await async_setup_component(hass, DOMAIN, {DOMAIN: {}}) await hass.async_block_till_done() @@ -372,13 +392,14 @@ async def test_tts_entity( } await hass.async_block_till_done() - assert mock_process_tts.call_count == 1 - assert mock_process_tts.call_args is not None - assert mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." - assert mock_process_tts.call_args.kwargs["language"] == "en-US" - assert mock_process_tts.call_args.kwargs["gender"] is None - assert mock_process_tts.call_args.kwargs["voice"] == "JennyNeural" - assert mock_process_tts.call_args.kwargs["output"] == "mp3" + # Force streaming + await client.get(response["path"]) + + assert mock_process_tts_stream.call_count == 1 + assert mock_process_tts_stream.call_args is not None + assert mock_process_tts_stream.call_args.kwargs["language"] == "en-US" + assert mock_process_tts_stream.call_args.kwargs["gender"] is None + assert mock_process_tts_stream.call_args.kwargs["voice"] == "JennyNeural" state = hass.states.get(entity_id) assert state @@ -482,6 +503,8 @@ async def test_deprecated_voice( return_value=b"", ) cloud.voice.process_tts = mock_process_tts + mock_process_tts_stream = _make_stream_mock("There is someone at the door.") + cloud.voice.process_tts_stream = mock_process_tts_stream assert await async_setup_component(hass, DOMAIN, {DOMAIN: {}}) await hass.async_block_till_done() @@ -509,18 +532,34 @@ async def test_deprecated_voice( } await hass.async_block_till_done() - assert mock_process_tts.call_count == 1 - assert mock_process_tts.call_args is not None - assert mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." - assert mock_process_tts.call_args.kwargs["language"] == language - assert mock_process_tts.call_args.kwargs["gender"] is None - assert mock_process_tts.call_args.kwargs["voice"] == replacement_voice - assert mock_process_tts.call_args.kwargs["output"] == "mp3" + # Force streaming + await client.get(response["path"]) + + if data.get("engine_id", "").startswith("tts."): + # Streaming + assert mock_process_tts_stream.call_count == 1 + assert mock_process_tts_stream.call_args is not None + assert mock_process_tts_stream.call_args.kwargs["language"] == language + assert mock_process_tts_stream.call_args.kwargs["gender"] is None + assert mock_process_tts_stream.call_args.kwargs["voice"] == replacement_voice + else: + # Non-streaming + assert mock_process_tts.call_count == 1 + assert mock_process_tts.call_args is not None + assert ( + mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." + ) + assert mock_process_tts.call_args.kwargs["language"] == language + assert mock_process_tts.call_args.kwargs["gender"] is None + assert mock_process_tts.call_args.kwargs["voice"] == replacement_voice + assert mock_process_tts.call_args.kwargs["output"] == "mp3" + issue = issue_registry.async_get_issue( "cloud", f"deprecated_voice_{replacement_voice}" ) assert issue is None mock_process_tts.reset_mock() + mock_process_tts_stream.reset_mock() # Test with deprecated voice. data["options"] = {"voice": deprecated_voice} @@ -538,15 +577,30 @@ async def test_deprecated_voice( } await hass.async_block_till_done() + # Force streaming + await client.get(response["path"]) + issue_id = f"deprecated_voice_{deprecated_voice}" - assert mock_process_tts.call_count == 1 - assert mock_process_tts.call_args is not None - assert mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." - assert mock_process_tts.call_args.kwargs["language"] == language - assert mock_process_tts.call_args.kwargs["gender"] is None - assert mock_process_tts.call_args.kwargs["voice"] == replacement_voice - assert mock_process_tts.call_args.kwargs["output"] == "mp3" + if data.get("engine_id", "").startswith("tts."): + # Streaming + assert mock_process_tts_stream.call_count == 1 + assert mock_process_tts_stream.call_args is not None + assert mock_process_tts_stream.call_args.kwargs["language"] == language + assert mock_process_tts_stream.call_args.kwargs["gender"] is None + assert mock_process_tts_stream.call_args.kwargs["voice"] == replacement_voice + else: + # Non-streaming + assert mock_process_tts.call_count == 1 + assert mock_process_tts.call_args is not None + assert ( + mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." + ) + assert mock_process_tts.call_args.kwargs["language"] == language + assert mock_process_tts.call_args.kwargs["gender"] is None + assert mock_process_tts.call_args.kwargs["voice"] == replacement_voice + assert mock_process_tts.call_args.kwargs["output"] == "mp3" + issue = issue_registry.async_get_issue("cloud", issue_id) assert issue is not None assert issue.breaks_in_ha_version == "2024.8.0" @@ -623,6 +677,8 @@ async def test_deprecated_gender( return_value=b"", ) cloud.voice.process_tts = mock_process_tts + mock_process_tts_stream = _make_stream_mock("There is someone at the door.") + cloud.voice.process_tts_stream = mock_process_tts_stream assert await async_setup_component(hass, DOMAIN, {DOMAIN: {}}) await hass.async_block_till_done() @@ -649,15 +705,30 @@ async def test_deprecated_gender( } await hass.async_block_till_done() - assert mock_process_tts.call_count == 1 - assert mock_process_tts.call_args is not None - assert mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." - assert mock_process_tts.call_args.kwargs["language"] == language - assert mock_process_tts.call_args.kwargs["voice"] == "XiaoxiaoNeural" - assert mock_process_tts.call_args.kwargs["output"] == "mp3" + # Force streaming + await client.get(response["path"]) + + if data.get("engine_id", "").startswith("tts."): + # Streaming + assert mock_process_tts_stream.call_count == 1 + assert mock_process_tts_stream.call_args is not None + assert mock_process_tts_stream.call_args.kwargs["language"] == language + assert mock_process_tts_stream.call_args.kwargs["voice"] == "XiaoxiaoNeural" + else: + # Non-streaming + assert mock_process_tts.call_count == 1 + assert mock_process_tts.call_args is not None + assert ( + mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." + ) + assert mock_process_tts.call_args.kwargs["language"] == language + assert mock_process_tts.call_args.kwargs["voice"] == "XiaoxiaoNeural" + assert mock_process_tts.call_args.kwargs["output"] == "mp3" + issue = issue_registry.async_get_issue("cloud", "deprecated_gender") assert issue is None mock_process_tts.reset_mock() + mock_process_tts_stream.reset_mock() # Test with deprecated gender option. data["options"] = {"gender": gender_option} @@ -675,15 +746,30 @@ async def test_deprecated_gender( } await hass.async_block_till_done() + # Force streaming + await client.get(response["path"]) + issue_id = "deprecated_gender" - assert mock_process_tts.call_count == 1 - assert mock_process_tts.call_args is not None - assert mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." - assert mock_process_tts.call_args.kwargs["language"] == language - assert mock_process_tts.call_args.kwargs["gender"] == gender_option - assert mock_process_tts.call_args.kwargs["voice"] == "XiaoxiaoNeural" - assert mock_process_tts.call_args.kwargs["output"] == "mp3" + if data.get("engine_id", "").startswith("tts."): + # Streaming + assert mock_process_tts_stream.call_count == 1 + assert mock_process_tts_stream.call_args is not None + assert mock_process_tts_stream.call_args.kwargs["language"] == language + assert mock_process_tts_stream.call_args.kwargs["gender"] == gender_option + assert mock_process_tts_stream.call_args.kwargs["voice"] == "XiaoxiaoNeural" + else: + # Non-streaming + assert mock_process_tts.call_count == 1 + assert mock_process_tts.call_args is not None + assert ( + mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." + ) + assert mock_process_tts.call_args.kwargs["language"] == language + assert mock_process_tts.call_args.kwargs["gender"] == gender_option + assert mock_process_tts.call_args.kwargs["voice"] == "XiaoxiaoNeural" + assert mock_process_tts.call_args.kwargs["output"] == "mp3" + issue = issue_registry.async_get_issue("cloud", issue_id) assert issue is not None assert issue.breaks_in_ha_version == "2024.10.0" @@ -772,6 +858,8 @@ async def test_tts_services( calls = async_mock_service(hass, DOMAIN_MP, SERVICE_PLAY_MEDIA) mock_process_tts = AsyncMock(return_value=b"") cloud.voice.process_tts = mock_process_tts + mock_process_tts_stream = _make_stream_mock("There is someone at the door.") + cloud.voice.process_tts_stream = mock_process_tts_stream assert await async_setup_component(hass, DOMAIN, {DOMAIN: {}}) await hass.async_block_till_done() @@ -793,9 +881,51 @@ async def test_tts_services( assert response.status == HTTPStatus.OK await hass.async_block_till_done() - assert mock_process_tts.call_count == 1 - assert mock_process_tts.call_args is not None - assert mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." - assert mock_process_tts.call_args.kwargs["language"] == service_data[ATTR_LANGUAGE] - assert mock_process_tts.call_args.kwargs["voice"] == "GadisNeural" - assert mock_process_tts.call_args.kwargs["output"] == "mp3" + if service_data.get("entity_id", "").startswith("tts."): + # Streaming + assert mock_process_tts_stream.call_count == 1 + assert mock_process_tts_stream.call_args is not None + assert ( + mock_process_tts_stream.call_args.kwargs["language"] + == service_data[ATTR_LANGUAGE] + ) + assert mock_process_tts_stream.call_args.kwargs["voice"] == "GadisNeural" + else: + # Non-streaming + assert mock_process_tts.call_count == 1 + assert mock_process_tts.call_args is not None + assert ( + mock_process_tts.call_args.kwargs["text"] == "There is someone at the door." + ) + assert ( + mock_process_tts.call_args.kwargs["language"] == service_data[ATTR_LANGUAGE] + ) + assert mock_process_tts.call_args.kwargs["voice"] == "GadisNeural" + assert mock_process_tts.call_args.kwargs["output"] == "mp3" + + +def _make_stream_mock(expected_text: str) -> MagicMock: + """Create a mock TTS stream generator with just a WAV header.""" + with io.BytesIO() as wav_io: + wav_writer: wave.Wave_write = wave.open(wav_io, "wb") + with wav_writer: + wav_writer.setframerate(24000) + wav_writer.setsampwidth(2) + wav_writer.setnchannels(1) + + wav_io.seek(0) + wav_bytes = wav_io.getvalue() + + process_tts_stream = MagicMock() + + async def fake_process_tts_stream(*, text_stream: AsyncIterable[str], **kwargs): + # Verify text + actual_text = "".join([text_chunk async for text_chunk in text_stream]) + assert actual_text == expected_text + + # WAV header + yield wav_bytes + + process_tts_stream.side_effect = fake_process_tts_stream + + return process_tts_stream