diff --git a/homeassistant/components/esphome/assist_satellite.py b/homeassistant/components/esphome/assist_satellite.py index a129a7723dd..9d92b5fcb92 100644 --- a/homeassistant/components/esphome/assist_satellite.py +++ b/homeassistant/components/esphome/assist_satellite.py @@ -310,12 +310,13 @@ class EsphomeAssistSatellite( self.entry_data.api_version ) ) - if feature_flags & VoiceAssistantFeature.SPEAKER: - media_id = tts_output["media_id"] + if feature_flags & VoiceAssistantFeature.SPEAKER and ( + stream := tts.async_get_stream(self.hass, tts_output["token"]) + ): self._tts_streaming_task = ( self.config_entry.async_create_background_task( self.hass, - self._stream_tts_audio(media_id), + self._stream_tts_audio(stream), "esphome_voice_assistant_tts", ) ) @@ -564,7 +565,7 @@ class EsphomeAssistSatellite( async def _stream_tts_audio( self, - media_id: str, + tts_result: tts.ResultStream, sample_rate: int = 16000, sample_width: int = 2, sample_channels: int = 1, @@ -579,15 +580,14 @@ class EsphomeAssistSatellite( if not self._is_running: return - extension, data = await tts.async_get_media_source_audio( - self.hass, - media_id, - ) - - if extension != "wav": - _LOGGER.error("Only WAV audio can be streamed, got %s", extension) + if tts_result.extension != "wav": + _LOGGER.error( + "Only WAV audio can be streamed, got %s", tts_result.extension + ) return + data = b"".join([chunk async for chunk in tts_result.async_stream_result()]) + with io.BytesIO(data) as wav_io, wave.open(wav_io, "rb") as wav_file: if ( (wav_file.getframerate() != sample_rate) diff --git a/tests/components/esphome/test_assist_satellite.py b/tests/components/esphome/test_assist_satellite.py index 5f433a6c0ed..2254d24c9ac 100644 --- a/tests/components/esphome/test_assist_satellite.py +++ b/tests/components/esphome/test_assist_satellite.py @@ -58,6 +58,7 @@ from homeassistant.helpers import ( intent as intent_helper, ) from homeassistant.helpers.entity_component import EntityComponent +from homeassistant.helpers.network import get_url from .conftest import MockESPHomeDevice @@ -133,8 +134,6 @@ async def test_pipeline_api_audio( ) -> None: """Test a complete pipeline run with API audio (over the TCP connection).""" conversation_id = "test-conversation-id" - media_url = "http://test.url" - media_id = "test-media-id" mock_device: MockESPHomeDevice = await mock_esphome_device( mock_client=mock_client, @@ -328,15 +327,22 @@ async def test_pipeline_api_audio( assert satellite.state == AssistSatelliteState.RESPONDING # Should return mock_wav audio + mock_tts_result_stream = MockResultStream(hass, "wav", mock_wav) event_callback( PipelineEvent( type=PipelineEventType.TTS_END, - data={"tts_output": {"url": media_url, "media_id": media_id}}, + data={ + "tts_output": { + "media_id": "test-media-id", + "url": mock_tts_result_stream.url, + "token": mock_tts_result_stream.token, + } + }, ) ) assert mock_client.send_voice_assistant_event.call_args_list[-1].args == ( VoiceAssistantEventType.VOICE_ASSISTANT_TTS_END, - {"url": media_url}, + {"url": get_url(hass) + mock_tts_result_stream.url}, ) event_callback(PipelineEvent(type=PipelineEventType.RUN_END)) @@ -355,12 +361,6 @@ async def test_pipeline_api_audio( original_handle_pipeline_finished() pipeline_finished.set() - async def async_get_media_source_audio( - hass: HomeAssistant, - media_source_id: str, - ) -> tuple[str, bytes]: - return ("wav", mock_wav) - tts_finished = asyncio.Event() original_tts_response_finished = satellite.tts_response_finished @@ -373,10 +373,6 @@ async def test_pipeline_api_audio( "homeassistant.components.assist_satellite.entity.async_pipeline_from_audio_stream", new=async_pipeline_from_audio_stream, ), - patch( - "homeassistant.components.tts.async_get_media_source_audio", - new=async_get_media_source_audio, - ), patch.object(satellite, "handle_pipeline_finished", handle_pipeline_finished), patch.object(satellite, "_stream_tts_audio", _stream_tts_audio), patch.object(satellite, "tts_response_finished", tts_response_finished), @@ -434,8 +430,6 @@ async def test_pipeline_udp_audio( mainly focused on the UDP server. """ conversation_id = "test-conversation-id" - media_url = "http://test.url" - media_id = "test-media-id" mock_device: MockESPHomeDevice = await mock_esphome_device( mock_client=mock_client, @@ -522,10 +516,17 @@ async def test_pipeline_udp_audio( ) # Should return mock_wav audio + mock_tts_result_stream = MockResultStream(hass, "wav", mock_wav) event_callback( PipelineEvent( type=PipelineEventType.TTS_END, - data={"tts_output": {"url": media_url, "media_id": media_id}}, + data={ + "tts_output": { + "media_id": "test-media-id", + "url": mock_tts_result_stream.url, + "token": mock_tts_result_stream.token, + } + }, ) ) @@ -538,12 +539,6 @@ async def test_pipeline_udp_audio( original_handle_pipeline_finished() pipeline_finished.set() - async def async_get_media_source_audio( - hass: HomeAssistant, - media_source_id: str, - ) -> tuple[str, bytes]: - return ("wav", mock_wav) - tts_finished = asyncio.Event() original_tts_response_finished = satellite.tts_response_finished @@ -567,10 +562,6 @@ async def test_pipeline_udp_audio( "homeassistant.components.assist_satellite.entity.async_pipeline_from_audio_stream", new=async_pipeline_from_audio_stream, ), - patch( - "homeassistant.components.tts.async_get_media_source_audio", - new=async_get_media_source_audio, - ), patch.object(satellite, "handle_pipeline_finished", handle_pipeline_finished), patch.object(satellite, "tts_response_finished", tts_response_finished), ): @@ -652,8 +643,6 @@ async def test_pipeline_media_player( mainly focused on tts_response_finished getting automatically called. """ conversation_id = "test-conversation-id" - media_url = "http://test.url" - media_id = "test-media-id" mock_device: MockESPHomeDevice = await mock_esphome_device( mock_client=mock_client, @@ -733,10 +722,17 @@ async def test_pipeline_media_player( ) # Should return mock_wav audio + mock_tts_result_stream = MockResultStream(hass, "wav", mock_wav) event_callback( PipelineEvent( type=PipelineEventType.TTS_END, - data={"tts_output": {"url": media_url, "media_id": media_id}}, + data={ + "tts_output": { + "media_id": "test-media-id", + "url": mock_tts_result_stream.url, + "token": mock_tts_result_stream.token, + } + }, ) ) @@ -749,12 +745,6 @@ async def test_pipeline_media_player( original_handle_pipeline_finished() pipeline_finished.set() - async def async_get_media_source_audio( - hass: HomeAssistant, - media_source_id: str, - ) -> tuple[str, bytes]: - return ("wav", mock_wav) - tts_finished = asyncio.Event() original_tts_response_finished = satellite.tts_response_finished @@ -767,10 +757,6 @@ async def test_pipeline_media_player( "homeassistant.components.assist_satellite.entity.async_pipeline_from_audio_stream", new=async_pipeline_from_audio_stream, ), - patch( - "homeassistant.components.tts.async_get_media_source_audio", - new=async_get_media_source_audio, - ), patch.object(satellite, "handle_pipeline_finished", handle_pipeline_finished), patch.object(satellite, "tts_response_finished", tts_response_finished), ): @@ -944,80 +930,63 @@ async def test_streaming_tts_errors( # Should not stream if not running satellite._is_running = False - await satellite._stream_tts_audio("test-media-id") + await satellite._stream_tts_audio(MockResultStream(hass, "wav", mock_wav)) mock_client.send_voice_assistant_audio.assert_not_called() satellite._is_running = True # Should only stream WAV - async def get_mp3( - hass: HomeAssistant, - media_source_id: str, - ) -> tuple[str, bytes]: - return ("mp3", b"") - - with patch( - "homeassistant.components.tts.async_get_media_source_audio", new=get_mp3 - ): - await satellite._stream_tts_audio("test-media-id") - mock_client.send_voice_assistant_audio.assert_not_called() + await satellite._stream_tts_audio(MockResultStream(hass, "mp3", b"")) + mock_client.send_voice_assistant_audio.assert_not_called() # Needs to be the correct sample rate, etc. - async def get_bad_wav( - hass: HomeAssistant, - media_source_id: str, - ) -> tuple[str, bytes]: - with io.BytesIO() as wav_io: - with wave.open(wav_io, "wb") as wav_file: - wav_file.setframerate(48000) - wav_file.setsampwidth(2) - wav_file.setnchannels(1) - wav_file.writeframes(b"test-wav") + with io.BytesIO() as wav_io: + with wave.open(wav_io, "wb") as wav_file: + wav_file.setframerate(48000) + wav_file.setsampwidth(2) + wav_file.setnchannels(1) + wav_file.writeframes(b"test-wav") - return ("wav", wav_io.getvalue()) + mock_tts_result_stream = MockResultStream(hass, "wav", wav_io.getvalue()) - with patch( - "homeassistant.components.tts.async_get_media_source_audio", new=get_bad_wav - ): - await satellite._stream_tts_audio("test-media-id") - mock_client.send_voice_assistant_audio.assert_not_called() + await satellite._stream_tts_audio(mock_tts_result_stream) + mock_client.send_voice_assistant_audio.assert_not_called() # Check that TTS_STREAM_* events still get sent after cancel media_fetched = asyncio.Event() - async def get_slow_wav( - hass: HomeAssistant, - media_source_id: str, - ) -> tuple[str, bytes]: + mock_tts_result_stream = MockResultStream(hass, "wav", b"") + + async def async_stream_result_slowly(): media_fetched.set() await asyncio.sleep(1) - return ("wav", mock_wav) + yield mock_wav + + mock_tts_result_stream.async_stream_result = async_stream_result_slowly mock_client.send_voice_assistant_event.reset_mock() - with patch( - "homeassistant.components.tts.async_get_media_source_audio", new=get_slow_wav - ): - task = asyncio.create_task(satellite._stream_tts_audio("test-media-id")) - async with asyncio.timeout(1): - # Wait for media to be fetched - await media_fetched.wait() - # Cancel task - task.cancel() - await task + task = asyncio.create_task(satellite._stream_tts_audio(mock_tts_result_stream)) + async with asyncio.timeout(1): + # Wait for media to be fetched + await media_fetched.wait() - # No audio should have gone out - mock_client.send_voice_assistant_audio.assert_not_called() - assert len(mock_client.send_voice_assistant_event.call_args_list) == 2 + # Cancel task + task.cancel() + await task - # The TTS_STREAM_* events should have gone out - assert mock_client.send_voice_assistant_event.call_args_list[-2].args == ( - VoiceAssistantEventType.VOICE_ASSISTANT_TTS_STREAM_START, - {}, - ) - assert mock_client.send_voice_assistant_event.call_args_list[-1].args == ( - VoiceAssistantEventType.VOICE_ASSISTANT_TTS_STREAM_END, - {}, - ) + # No audio should have gone out + mock_client.send_voice_assistant_audio.assert_not_called() + assert len(mock_client.send_voice_assistant_event.call_args_list) == 2 + + # The TTS_STREAM_* events should have gone out + assert mock_client.send_voice_assistant_event.call_args_list[-2].args == ( + VoiceAssistantEventType.VOICE_ASSISTANT_TTS_STREAM_START, + {}, + ) + assert mock_client.send_voice_assistant_event.call_args_list[-1].args == ( + VoiceAssistantEventType.VOICE_ASSISTANT_TTS_STREAM_END, + {}, + ) async def test_tts_format_from_media_player(