From 124931b2eebc0dde424962d70dea74afe81ac254 Mon Sep 17 00:00:00 2001
From: Paulus Schoutsen
Date: Mon, 14 Jul 2025 20:23:43 +0200
Subject: [PATCH] TTS to always stream when available (#148695)

Co-authored-by: Michael Hansen
---
 homeassistant/components/tts/__init__.py | 16 +++-
 .../snapshots/test_pipeline.ambr          |  2 +-
 .../assist_pipeline/test_pipeline.py      |  6 +-
 tests/components/tts/test_init.py         |  2 +-
 .../wyoming/snapshots/test_tts.ambr       | 80 +++++++++++++++++++
 tests/components/wyoming/test_tts.py      | 10 ++-
 6 files changed, 107 insertions(+), 9 deletions(-)

diff --git a/homeassistant/components/tts/__init__.py b/homeassistant/components/tts/__init__.py
index c8e6e0f67fb..cf9099448df 100644
--- a/homeassistant/components/tts/__init__.py
+++ b/homeassistant/components/tts/__init__.py
@@ -382,7 +382,7 @@ async def _async_convert_audio(
         assert process.stderr
         stderr_data = await process.stderr.read()
         _LOGGER.error(stderr_data.decode())
-        raise RuntimeError(
+        raise HomeAssistantError(
             f"Unexpected error while running ffmpeg with arguments: {command}. "
             "See log for details."
         )
@@ -976,7 +976,7 @@ class SpeechManager:
         if engine_instance.name is None or engine_instance.name is UNDEFINED:
             raise HomeAssistantError("TTS engine name is not set.")
 
-        if isinstance(engine_instance, Provider) or isinstance(message_or_stream, str):
+        if isinstance(engine_instance, Provider):
             if isinstance(message_or_stream, str):
                 message = message_or_stream
             else:
@@ -996,8 +996,18 @@ class SpeechManager:
             data_gen = make_data_generator(data)
 
         else:
+            if isinstance(message_or_stream, str):
+
+                async def gen_stream() -> AsyncGenerator[str]:
+                    yield message_or_stream
+
+                stream = gen_stream()
+
+            else:
+                stream = message_or_stream
+
             tts_result = await engine_instance.internal_async_stream_tts_audio(
-                TTSAudioRequest(language, options, message_or_stream)
+                TTSAudioRequest(language, options, stream)
             )
             extension = tts_result.extension
             data_gen = tts_result.data_gen
diff --git a/tests/components/assist_pipeline/snapshots/test_pipeline.ambr b/tests/components/assist_pipeline/snapshots/test_pipeline.ambr
index 7f760d069e6..95415ddb902 100644
--- a/tests/components/assist_pipeline/snapshots/test_pipeline.ambr
+++ b/tests/components/assist_pipeline/snapshots/test_pipeline.ambr
@@ -1,5 +1,5 @@
 # serializer version: 1
-# name: test_chat_log_tts_streaming[to_stream_deltas0-0-]
+# name: test_chat_log_tts_streaming[to_stream_deltas0-1-hello, how are you?]
   list([
     dict({
       'data': dict({
diff --git a/tests/components/assist_pipeline/test_pipeline.py b/tests/components/assist_pipeline/test_pipeline.py
index 3a4895440dc..5bc7b86c38c 100644
--- a/tests/components/assist_pipeline/test_pipeline.py
+++ b/tests/components/assist_pipeline/test_pipeline.py
@@ -1550,9 +1550,9 @@ async def test_pipeline_language_used_instead_of_conversation_language(
                     "?",
                 ],
             ),
-            # We are not streaming, so 0 chunks via streaming method
-            0,
-            "",
+            # We always stream when possible, so 1 chunk via streaming method
+            1,
+            "hello, how are you?",
         ),
         # Size above STREAM_RESPONSE_CHUNKS
         (
diff --git a/tests/components/tts/test_init.py b/tests/components/tts/test_init.py
index 22fb10209b0..db42da5de0e 100644
--- a/tests/components/tts/test_init.py
+++ b/tests/components/tts/test_init.py
@@ -1835,7 +1835,7 @@ async def test_async_convert_audio_error(hass: HomeAssistant) -> None:
     async def bad_data_gen():
         yield bytes(0)
 
-    with pytest.raises(RuntimeError):
+    with pytest.raises(HomeAssistantError):
         # Simulate a bad WAV file
         async for _chunk in tts._async_convert_audio(
             hass, "wav", bad_data_gen(), "mp3"
diff --git a/tests/components/wyoming/snapshots/test_tts.ambr b/tests/components/wyoming/snapshots/test_tts.ambr
index 53cc02eaacf..67c9b24160c 100644
--- a/tests/components/wyoming/snapshots/test_tts.ambr
+++ b/tests/components/wyoming/snapshots/test_tts.ambr
@@ -1,6 +1,19 @@
 # serializer version: 1
 # name: test_get_tts_audio
   list([
+    dict({
+      'data': dict({
+      }),
+      'payload': None,
+      'type': 'synthesize-start',
+    }),
+    dict({
+      'data': dict({
+        'text': 'Hello world',
+      }),
+      'payload': None,
+      'type': 'synthesize-chunk',
+    }),
     dict({
       'data': dict({
         'text': 'Hello world',
@@ -8,10 +21,29 @@
       'payload': None,
       'type': 'synthesize',
     }),
+    dict({
+      'data': dict({
+      }),
+      'payload': None,
+      'type': 'synthesize-stop',
+    }),
   ])
 # ---
 # name: test_get_tts_audio_different_formats
   list([
+    dict({
+      'data': dict({
+      }),
+      'payload': None,
+      'type': 'synthesize-start',
+    }),
+    dict({
+      'data': dict({
+        'text': 'Hello world',
+      }),
+      'payload': None,
+      'type': 'synthesize-chunk',
+    }),
     dict({
       'data': dict({
         'text': 'Hello world',
@@ -19,10 +51,29 @@
       'payload': None,
       'type': 'synthesize',
     }),
+    dict({
+      'data': dict({
+      }),
+      'payload': None,
+      'type': 'synthesize-stop',
+    }),
   ])
 # ---
 # name: test_get_tts_audio_different_formats.1
   list([
+    dict({
+      'data': dict({
+      }),
+      'payload': None,
+      'type': 'synthesize-start',
+    }),
+    dict({
+      'data': dict({
+        'text': 'Hello world',
+      }),
+      'payload': None,
+      'type': 'synthesize-chunk',
+    }),
     dict({
       'data': dict({
         'text': 'Hello world',
@@ -30,6 +81,12 @@
       'payload': None,
       'type': 'synthesize',
     }),
+    dict({
+      'data': dict({
+      }),
+      'payload': None,
+      'type': 'synthesize-stop',
+    }),
   ])
 # ---
 # name: test_get_tts_audio_streaming
@@ -71,6 +128,23 @@
 # ---
 # name: test_voice_speaker
   list([
+    dict({
+      'data': dict({
+        'voice': dict({
+          'name': 'voice1',
+          'speaker': 'speaker1',
+        }),
+      }),
+      'payload': None,
+      'type': 'synthesize-start',
+    }),
+    dict({
+      'data': dict({
+        'text': 'Hello world',
+      }),
+      'payload': None,
+      'type': 'synthesize-chunk',
+    }),
     dict({
       'data': dict({
         'text': 'Hello world',
@@ -82,5 +156,11 @@
       'payload': None,
       'type': 'synthesize',
     }),
+    dict({
+      'data': dict({
+      }),
+      'payload': None,
+      'type': 'synthesize-stop',
+    }),
   ])
 # ---
diff --git a/tests/components/wyoming/test_tts.py b/tests/components/wyoming/test_tts.py
index 3374328f411..efcf464eebb 100644
--- a/tests/components/wyoming/test_tts.py
+++ b/tests/components/wyoming/test_tts.py
@@ -52,6 +52,7 @@ async def test_get_tts_audio(
 
     # Verify audio
     audio_events = [
+        AudioStart(rate=16000, width=2, channels=1).event(),
         AudioChunk(audio=audio, rate=16000, width=2, channels=1).event(),
         AudioStop().event(),
     ]
@@ -77,7 +78,10 @@ async def test_get_tts_audio(
         assert wav_file.getframerate() == 16000
         assert wav_file.getsampwidth() == 2
         assert wav_file.getnchannels() == 1
-        assert wav_file.readframes(wav_file.getnframes()) == audio
+
+    # nframes = 0 due to streaming
+    assert len(data) == len(audio) + 44  # WAVE header is 44 bytes
+    assert data[44:] == audio
 
     assert mock_client.written == snapshot
 
@@ -88,6 +92,7 @@ async def test_get_tts_audio_different_formats(
     """Test changing preferred audio format."""
     audio = bytes(16000 * 2 * 1)  # one second
     audio_events = [
+        AudioStart(rate=16000, width=2, channels=1).event(),
         AudioChunk(audio=audio, rate=16000, width=2, channels=1).event(),
         AudioStop().event(),
     ]
@@ -123,6 +128,7 @@ async def test_get_tts_audio_different_formats(
 
     # MP3 is the default
     audio_events = [
+        AudioStart(rate=16000, width=2, channels=1).event(),
         AudioChunk(audio=audio, rate=16000, width=2, channels=1).event(),
         AudioStop().event(),
     ]
@@ -167,6 +173,7 @@ async def test_get_tts_audio_audio_oserror(
     """Test get audio and error raising."""
     audio = bytes(100)
     audio_events = [
+        AudioStart(rate=16000, width=2, channels=1).event(),
         AudioChunk(audio=audio, rate=16000, width=2, channels=1).event(),
         AudioStop().event(),
     ]
@@ -197,6 +204,7 @@ async def test_voice_speaker(
     """Test using a different voice and speaker.""" 
     audio = bytes(100)
     audio_events = [
+        AudioStart(rate=16000, width=2, channels=1).event(),
         AudioChunk(audio=audio, rate=16000, width=2, channels=1).event(),
         AudioStop().event(),
     ]
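For readers skimming the diff, the core of the change is the wrapping added to SpeechManager above: a plain text message is turned into a single-chunk async generator so the streaming TTS path can be used whenever the engine supports it. The following is a minimal standalone sketch of that pattern only; the helper name as_stream and the demo are illustrative and not part of Home Assistant.

from __future__ import annotations

import asyncio
from collections.abc import AsyncGenerator


def as_stream(message_or_stream: str | AsyncGenerator[str]) -> AsyncGenerator[str]:
    """Return an async generator of text chunks for either input shape."""
    if isinstance(message_or_stream, str):

        async def gen_stream() -> AsyncGenerator[str]:
            # A plain message becomes a stream with exactly one chunk.
            yield message_or_stream

        return gen_stream()

    # An existing stream is passed through untouched.
    return message_or_stream


async def _demo() -> None:
    chunks = [chunk async for chunk in as_stream("hello, how are you?")]
    assert chunks == ["hello, how are you?"]


asyncio.run(_demo())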