From 3dd641816035d8176197b9d5df03563db158d87c Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Mon, 16 Sep 2024 03:10:07 -0500 Subject: [PATCH] Use sample bytes in ESPHome media format (#126016) --- .../components/esphome/assist_satellite.py | 19 ++++- .../components/esphome/ffmpeg_proxy.py | 13 +++- .../components/esphome/media_player.py | 19 ++++- .../esphome/test_assist_satellite.py | 69 +++++++++++++++++++ tests/components/esphome/test_media_player.py | 22 ++++-- 5 files changed, 131 insertions(+), 11 deletions(-) diff --git a/homeassistant/components/esphome/assist_satellite.py b/homeassistant/components/esphome/assist_satellite.py index 08dd2ac0774..7ce46fab64b 100644 --- a/homeassistant/components/esphome/assist_satellite.py +++ b/homeassistant/components/esphome/assist_satellite.py @@ -402,10 +402,23 @@ class EsphomeAssistSatellite( if supported_format.purpose == MediaPlayerFormatPurpose.ANNOUNCEMENT: self._attr_tts_options = { tts.ATTR_PREFERRED_FORMAT: supported_format.format, - tts.ATTR_PREFERRED_SAMPLE_RATE: supported_format.sample_rate, - tts.ATTR_PREFERRED_SAMPLE_CHANNELS: supported_format.num_channels, - tts.ATTR_PREFERRED_SAMPLE_BYTES: 2, } + + if supported_format.sample_rate > 0: + self._attr_tts_options[tts.ATTR_PREFERRED_SAMPLE_RATE] = ( + supported_format.sample_rate + ) + + if supported_format.num_channels > 0: + self._attr_tts_options[tts.ATTR_PREFERRED_SAMPLE_CHANNELS] = ( + supported_format.num_channels + ) + + if supported_format.sample_bytes > 0: + self._attr_tts_options[tts.ATTR_PREFERRED_SAMPLE_BYTES] = ( + supported_format.sample_bytes + ) + break async def _stream_tts_audio( diff --git a/homeassistant/components/esphome/ffmpeg_proxy.py b/homeassistant/components/esphome/ffmpeg_proxy.py index d2f538bfbd5..1649c628be9 100644 --- a/homeassistant/components/esphome/ffmpeg_proxy.py +++ b/homeassistant/components/esphome/ffmpeg_proxy.py @@ -26,11 +26,12 @@ def async_create_proxy_url( media_format: str, rate: int | None = None, channels: int
| None = None, + width: int | None = None, ) -> str: """Create a one-time use proxy URL that automatically converts the media.""" data: FFmpegProxyData = hass.data[DATA_FFMPEG_PROXY] return data.async_create_proxy_url( - device_id, media_url, media_format, rate, channels + device_id, media_url, media_format, rate, channels, width ) @@ -50,6 +51,9 @@ class FFmpegConversionInfo: channels: int | None """Target number of channels (None to keep source channels).""" + width: int | None + """Target sample width in bytes (None to keep source width).""" + @dataclass class FFmpegProxyData: @@ -70,11 +74,12 @@ class FFmpegProxyData: media_format: str, rate: int | None, channels: int | None, + width: int | None, ) -> str: """Create a one-time use proxy URL that automatically converts the media.""" convert_id = secrets.token_urlsafe(16) self.conversions[device_id][convert_id] = FFmpegConversionInfo( - media_url, media_format, rate, channels + media_url, media_format, rate, channels, width ) _LOGGER.debug("Media URL allowed by proxy: %s", media_url) @@ -136,6 +141,10 @@ class FFmpegConvertResponse(web.StreamResponse): # Number of channels command_args.extend(["-ac", str(self.convert_info.channels)]) + if self.convert_info.width == 2: + # 16-bit samples + command_args.extend(["-sample_fmt", "s16"]) + # Output to stdout command_args.append("pipe:") diff --git a/homeassistant/components/esphome/media_player.py b/homeassistant/components/esphome/media_player.py index d742029bcef..3930b71d106 100644 --- a/homeassistant/components/esphome/media_player.py +++ b/homeassistant/components/esphome/media_player.py @@ -170,13 +170,28 @@ class EsphomeMediaPlayer( _LOGGER.debug("Proxying media url %s with format %s", url, format_to_use) device_id = self.device_entry.id media_format = format_to_use.format + + # 0 = None + rate: int | None = None + channels: int | None = None + width: int | None = None + if format_to_use.sample_rate > 0: + rate = format_to_use.sample_rate + + if 
format_to_use.num_channels > 0: + channels = format_to_use.num_channels + + if format_to_use.sample_bytes > 0: + width = format_to_use.sample_bytes + proxy_url = async_create_proxy_url( self.hass, device_id, url, media_format=media_format, - rate=format_to_use.sample_rate, - channels=format_to_use.num_channels, + rate=rate, + channels=channels, + width=width, ) # Resolve URL diff --git a/tests/components/esphome/test_assist_satellite.py b/tests/components/esphome/test_assist_satellite.py index 928ef38d250..f9a431e19d8 100644 --- a/tests/components/esphome/test_assist_satellite.py +++ b/tests/components/esphome/test_assist_satellite.py @@ -1006,6 +1006,7 @@ async def test_tts_format_from_media_player( sample_rate=48000, num_channels=2, purpose=MediaPlayerFormatPurpose.DEFAULT, + sample_bytes=2, ), # This is the format that should be used for tts MediaPlayerSupportedFormat( @@ -1013,6 +1014,7 @@ async def test_tts_format_from_media_player( sample_rate=22050, num_channels=1, purpose=MediaPlayerFormatPurpose.ANNOUNCEMENT, + sample_bytes=2, ), ], ) @@ -1050,6 +1052,73 @@ async def test_tts_format_from_media_player( } +async def test_tts_minimal_format_from_media_player( + hass: HomeAssistant, + mock_client: APIClient, + mock_esphome_device: Callable[ + [APIClient, list[EntityInfo], list[UserService], list[EntityState]], + Awaitable[MockESPHomeDevice], + ], +) -> None: + """Test text-to-speech format when media player only specifies the codec.""" + mock_device: MockESPHomeDevice = await mock_esphome_device( + mock_client=mock_client, + entity_info=[ + MediaPlayerInfo( + object_id="mymedia_player", + key=1, + name="my media_player", + unique_id="my_media_player", + supports_pause=True, + supported_formats=[ + MediaPlayerSupportedFormat( + format="flac", + sample_rate=48000, + num_channels=2, + purpose=MediaPlayerFormatPurpose.DEFAULT, + sample_bytes=2, + ), + # This is the format that should be used for tts + MediaPlayerSupportedFormat( + format="mp3", + sample_rate=0, # 
source rate + num_channels=0, # source channels + purpose=MediaPlayerFormatPurpose.ANNOUNCEMENT, + sample_bytes=0, # source width + ), + ], + ) + ], + user_service=[], + states=[], + device_info={ + "voice_assistant_feature_flags": VoiceAssistantFeature.VOICE_ASSISTANT + }, + ) + await hass.async_block_till_done() + + satellite = get_satellite_entity(hass, mock_device.device_info.mac_address) + assert satellite is not None + + with patch( + "homeassistant.components.assist_satellite.entity.async_pipeline_from_audio_stream", + ) as mock_pipeline_from_audio_stream: + await satellite.handle_pipeline_start( + conversation_id="", + flags=0, + audio_settings=VoiceAssistantAudioSettings(), + wake_word_phrase=None, + ) + + mock_pipeline_from_audio_stream.assert_called_once() + kwargs = mock_pipeline_from_audio_stream.call_args_list[0].kwargs + + # Should be ANNOUNCEMENT format from media player + assert kwargs.get("tts_audio_output") == { + tts.ATTR_PREFERRED_FORMAT: "mp3", + } + + async def test_announce_supported_features( hass: HomeAssistant, mock_client: APIClient, diff --git a/tests/components/esphome/test_media_player.py b/tests/components/esphome/test_media_player.py index e859324b394..799666fc66e 100644 --- a/tests/components/esphome/test_media_player.py +++ b/tests/components/esphome/test_media_player.py @@ -310,15 +310,17 @@ async def test_media_player_proxy( supported_formats=[ MediaPlayerSupportedFormat( format="flac", - sample_rate=48000, - num_channels=2, + sample_rate=0, # source rate + num_channels=0, # source channels purpose=MediaPlayerFormatPurpose.DEFAULT, + sample_bytes=0, # source width ), MediaPlayerSupportedFormat( format="wav", sample_rate=16000, num_channels=1, purpose=MediaPlayerFormatPurpose.ANNOUNCEMENT, + sample_bytes=2, ), MediaPlayerSupportedFormat( format="mp3", @@ -369,7 +371,13 @@ async def test_media_player_proxy( mock_async_create_proxy_url.assert_called_once() device_id = mock_async_create_proxy_url.call_args[0][1] 
mock_async_create_proxy_url.assert_called_once_with( - hass, device_id, media_url, media_format="flac", rate=48000, channels=2 + hass, + device_id, + media_url, + media_format="flac", + rate=None, + channels=None, + width=None, ) media_args = mock_client.media_player_command.call_args.kwargs @@ -395,7 +403,13 @@ async def test_media_player_proxy( mock_async_create_proxy_url.assert_called_once() device_id = mock_async_create_proxy_url.call_args[0][1] mock_async_create_proxy_url.assert_called_once_with( - hass, device_id, media_url, media_format="wav", rate=16000, channels=1 + hass, + device_id, + media_url, + media_format="wav", + rate=16000, + channels=1, + width=2, ) media_args = mock_client.media_player_command.call_args.kwargs