// Patch summary (reviewer note; the diff payload that follows is unchanged).
//
// Replaces the two VoiceAssistant booleans
// media_player_wait_for_announcement_start_ / media_player_wait_for_announcement_end_
// with a single MediaPlayerResponseState value (IDLE, URL_SENT, PLAYING, FINISHED).
// The new enum and member are declared under USE_MEDIA_PLAYER.
//
// voice_assistant.cpp
//  * setup(): when media_player_ is set, registers a media player state callback.
//    It moves the tracker URL_SENT -> PLAYING when the player's state is
//    MEDIA_PLAYER_STATE_ANNOUNCING, and PLAYING -> FINISHED on any other player state.
//  * loop(): resets the tracker to IDLE (when media_player_ is set) just before the
//    start-request message is sent. `playing` is now derived from the tracker being
//    PLAYING instead of from the player's state. A FINISHED tracker is reset to IDLE,
//    the "playing" timeout is cancelled and the state is set to RESPONSE_FINISHED.
//  * request_stop(): AWAITING_RESPONSE now breaks after signal_stop_() instead of
//    falling through. STREAMING_RESPONSE additionally calls signal_stop_() while
//    started_streaming_tts_ is still set.
//  * on_event(): for "tts_start_streaming" == "1" with a non-empty tts_response_url_,
//    marks the tracker URL_SENT before the media call, starts the playback timeout and
//    sets the state to STREAMING_RESPONSE. The deferred block that fires
//    tts_end_trigger_ marks URL_SENT before its media call and then clears
//    started_streaming_tts_. Afterwards set_state_() is only called when new_state
//    differs from state_.
//  * on_announce(): marks the tracker URL_SENT before issuing the media player calls.
//
// voice_assistant.h
//  * Adds enum class MediaPlayerResponseState and the member
//    media_player_response_state_ (initialised to IDLE); removes the two booleans.
//
// NOTE(review): the line structure of this patch looks collapsed (hunk lines joined by
// spaces), presumably an extraction artifact. Confirm against the original commit
// before applying.
diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp index 9cf7d10936..647bbc7653 100644 --- a/esphome/components/voice_assistant/voice_assistant.cpp +++ b/esphome/components/voice_assistant/voice_assistant.cpp @@ -35,6 +35,27 @@ void VoiceAssistant::setup() { temp_ring_buffer->write((void *) data.data(), data.size()); } }); + +#ifdef USE_MEDIA_PLAYER + if (this->media_player_ != nullptr) { + this->media_player_->add_on_state_callback([this]() { + switch (this->media_player_->state) { + case media_player::MediaPlayerState::MEDIA_PLAYER_STATE_ANNOUNCING: + if (this->media_player_response_state_ == MediaPlayerResponseState::URL_SENT) { + // State changed to announcing after receiving the url + this->media_player_response_state_ = MediaPlayerResponseState::PLAYING; + } + break; + default: + if (this->media_player_response_state_ == MediaPlayerResponseState::PLAYING) { + // No longer announcing the TTS response + this->media_player_response_state_ = MediaPlayerResponseState::FINISHED; + } + break; + } + }); + } +#endif } float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; } @@ -223,6 +244,13 @@ void VoiceAssistant::loop() { msg.wake_word_phrase = this->wake_word_; this->wake_word_ = ""; + // Reset media player state tracking +#ifdef USE_MEDIA_PLAYER + if (this->media_player_ != nullptr) { + this->media_player_response_state_ = MediaPlayerResponseState::IDLE; + } +#endif + if (this->api_client_ == nullptr || !this->api_client_->send_message(msg)) { ESP_LOGW(TAG, "Could not request start"); this->error_trigger_->trigger("not-connected", "Could not request start"); @@ -314,17 +342,10 @@ void VoiceAssistant::loop() { #endif #ifdef USE_MEDIA_PLAYER if (this->media_player_ != nullptr) { - playing = (this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_ANNOUNCING); + playing = (this->media_player_response_state_ == 
MediaPlayerResponseState::PLAYING); - if (playing && this->media_player_wait_for_announcement_start_) { - // Announcement has started playing, wait for it to finish - this->media_player_wait_for_announcement_start_ = false; - this->media_player_wait_for_announcement_end_ = true; - } - - if (!playing && this->media_player_wait_for_announcement_end_) { - // Announcement has finished playing - this->media_player_wait_for_announcement_end_ = false; + if (this->media_player_response_state_ == MediaPlayerResponseState::FINISHED) { + this->media_player_response_state_ = MediaPlayerResponseState::IDLE; this->cancel_timeout("playing"); ESP_LOGD(TAG, "Announcement finished playing"); this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED); @@ -555,7 +576,7 @@ void VoiceAssistant::request_stop() { break; case State::AWAITING_RESPONSE: this->signal_stop_(); - // Fallthrough intended to stop a streaming TTS announcement that has potentially started + break; case State::STREAMING_RESPONSE: #ifdef USE_MEDIA_PLAYER // Stop any ongoing media player announcement @@ -565,6 +586,10 @@ void VoiceAssistant::request_stop() { .set_announcement(true) .perform(); } + if (this->started_streaming_tts_) { + // Haven't reached the TTS_END stage, so send the stop signal to HA. 
+ this->signal_stop_(); + } #endif break; case State::RESPONSE_FINISHED: @@ -648,13 +673,16 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { if (this->media_player_ != nullptr) { for (const auto &arg : msg.data) { if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) { + this->media_player_response_state_ = MediaPlayerResponseState::URL_SENT; + this->media_player_->make_call().set_media_url(this->tts_response_url_).set_announcement(true).perform(); - this->media_player_wait_for_announcement_start_ = true; - this->media_player_wait_for_announcement_end_ = false; this->started_streaming_tts_ = true; + this->start_playback_timeout_(); + tts_url_for_trigger = this->tts_response_url_; this->tts_response_url_.clear(); // Reset streaming URL + this->set_state_(State::STREAMING_RESPONSE, State::STREAMING_RESPONSE); } } } @@ -713,18 +741,22 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { this->defer([this, url]() { #ifdef USE_MEDIA_PLAYER if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) { + this->media_player_response_state_ = MediaPlayerResponseState::URL_SENT; + this->media_player_->make_call().set_media_url(url).set_announcement(true).perform(); - this->media_player_wait_for_announcement_start_ = true; - this->media_player_wait_for_announcement_end_ = false; - // Start the playback timeout, as the media player state isn't immediately updated this->start_playback_timeout_(); } + this->started_streaming_tts_ = false; // Helps indicate reaching the TTS_END stage #endif this->tts_end_trigger_->trigger(url); }); State new_state = this->local_output_ ? State::STREAMING_RESPONSE : State::IDLE; - this->set_state_(new_state, new_state); + if (new_state != this->state_) { + // Don't needlessly change the state. The intent progress stage may have already changed the state to streaming + // response. 
+ this->set_state_(new_state, new_state); + } break; } case api::enums::VOICE_ASSISTANT_RUN_END: { @@ -875,6 +907,9 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg) #ifdef USE_MEDIA_PLAYER if (this->media_player_ != nullptr) { this->tts_start_trigger_->trigger(msg.text); + + this->media_player_response_state_ = MediaPlayerResponseState::URL_SENT; + if (!msg.preannounce_media_id.empty()) { this->media_player_->make_call().set_media_url(msg.preannounce_media_id).set_announcement(true).perform(); } @@ -886,9 +921,6 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg) .perform(); this->continue_conversation_ = msg.start_conversation; - this->media_player_wait_for_announcement_start_ = true; - this->media_player_wait_for_announcement_end_ = false; - // Start the playback timeout, as the media player state isn't immediately updated this->start_playback_timeout_(); if (this->continuous_) { diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h index 2424ea6052..95f77dbf09 100644 --- a/esphome/components/voice_assistant/voice_assistant.h +++ b/esphome/components/voice_assistant/voice_assistant.h @@ -90,6 +90,15 @@ struct Configuration { uint32_t max_active_wake_words; }; +#ifdef USE_MEDIA_PLAYER +enum class MediaPlayerResponseState { + IDLE, + URL_SENT, + PLAYING, + FINISHED, +}; +#endif + class VoiceAssistant : public Component { public: VoiceAssistant(); @@ -272,8 +281,8 @@ class VoiceAssistant : public Component { media_player::MediaPlayer *media_player_{nullptr}; std::string tts_response_url_{""}; bool started_streaming_tts_{false}; - bool media_player_wait_for_announcement_start_{false}; - bool media_player_wait_for_announcement_end_{false}; + + MediaPlayerResponseState media_player_response_state_{MediaPlayerResponseState::IDLE}; #endif bool local_output_{false};