From 9f3f4ead4f71a5af9860a7527ac5d96bd3d3d24e Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Thu, 26 Jun 2025 20:18:51 +0100 Subject: [PATCH] [voice_assistant] Support streaming TTS responses and fixes crash for long responses (#9224) --- CODEOWNERS | 2 +- .../components/voice_assistant/__init__.py | 13 ++++++- .../voice_assistant/voice_assistant.cpp | 38 ++++++++++++++++++- .../voice_assistant/voice_assistant.h | 4 ++ 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index 66ea80f8d6..a0812c9cd6 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -490,7 +490,7 @@ esphome/components/vbus/* @ssieb esphome/components/veml3235/* @kbx81 esphome/components/veml7700/* @latonita esphome/components/version/* @esphome/core -esphome/components/voice_assistant/* @jesserockz +esphome/components/voice_assistant/* @jesserockz @kahrendt esphome/components/wake_on_lan/* @clydebarrow @willwill2will54 esphome/components/watchdog/* @oarcher esphome/components/waveshare_epaper/* @clydebarrow diff --git a/esphome/components/voice_assistant/__init__.py b/esphome/components/voice_assistant/__init__.py index b9309ab422..59c7ec8383 100644 --- a/esphome/components/voice_assistant/__init__.py +++ b/esphome/components/voice_assistant/__init__.py @@ -17,10 +17,11 @@ from esphome.const import ( AUTO_LOAD = ["socket"] DEPENDENCIES = ["api", "microphone"] -CODEOWNERS = ["@jesserockz"] +CODEOWNERS = ["@jesserockz", "@kahrendt"] CONF_ON_END = "on_end" CONF_ON_INTENT_END = "on_intent_end" +CONF_ON_INTENT_PROGRESS = "on_intent_progress" CONF_ON_INTENT_START = "on_intent_start" CONF_ON_LISTENING = "on_listening" CONF_ON_START = "on_start" @@ -136,6 +137,9 @@ CONFIG_SCHEMA = cv.All( cv.Optional(CONF_ON_INTENT_START): automation.validate_automation( single=True ), + cv.Optional(CONF_ON_INTENT_PROGRESS): automation.validate_automation( + single=True + ), cv.Optional(CONF_ON_INTENT_END): automation.validate_automation( single=True ), @@ -282,6 +286,13 @@ async def to_code(config): config[CONF_ON_INTENT_START], ) + if CONF_ON_INTENT_PROGRESS in config: + await automation.build_automation( + var.get_intent_progress_trigger(), + [(cg.std_string, "x")], + config[CONF_ON_INTENT_PROGRESS], + ) + if CONF_ON_INTENT_END in config: await automation.build_automation( var.get_intent_end_trigger(), diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp index a692a7556e..879d9492f0 100644 --- a/esphome/components/voice_assistant/voice_assistant.cpp +++ b/esphome/components/voice_assistant/voice_assistant.cpp @@ -555,7 +555,7 @@ void VoiceAssistant::request_stop() { break; case State::AWAITING_RESPONSE: this->signal_stop_(); - break; + // Fallthrough intended to stop a streaming TTS announcement that has potentially started case State::STREAMING_RESPONSE: #ifdef USE_MEDIA_PLAYER // Stop any ongoing media player announcement @@ -599,6 +599,14 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { switch (msg.event_type) { case api::enums::VOICE_ASSISTANT_RUN_START: ESP_LOGD(TAG, "Assist Pipeline running"); +#ifdef USE_MEDIA_PLAYER + this->started_streaming_tts_ = false; + for (auto arg : msg.data) { + if (arg.name == "url") { + this->tts_response_url_ = std::move(arg.value); + } + } +#endif this->defer([this]() { this->start_trigger_->trigger(); }); break; case api::enums::VOICE_ASSISTANT_WAKE_WORD_START: @@ -622,6 +630,8 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { if (text.empty()) { ESP_LOGW(TAG, "No text in STT_END event"); return; + } else if (text.length() > 500) { + text = text.substr(0, 497) + "..."; } ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str()); this->defer([this, text]() { this->stt_end_trigger_->trigger(text); }); @@ -631,6 +641,27 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { ESP_LOGD(TAG, "Intent started"); this->defer([this]() { this->intent_start_trigger_->trigger(); }); break; + case api::enums::VOICE_ASSISTANT_INTENT_PROGRESS: { + ESP_LOGD(TAG, "Intent progress"); + std::string tts_url_for_trigger = ""; +#ifdef USE_MEDIA_PLAYER + if (this->media_player_ != nullptr) { + for (const auto &arg : msg.data) { + if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) { + this->media_player_->make_call().set_media_url(this->tts_response_url_).set_announcement(true).perform(); + + this->media_player_wait_for_announcement_start_ = true; + this->media_player_wait_for_announcement_end_ = false; + this->started_streaming_tts_ = true; + tts_url_for_trigger = this->tts_response_url_; + this->tts_response_url_.clear(); // Reset streaming URL + } + } + } +#endif + this->defer([this, tts_url_for_trigger]() { this->intent_progress_trigger_->trigger(tts_url_for_trigger); }); + break; + } case api::enums::VOICE_ASSISTANT_INTENT_END: { for (auto arg : msg.data) { if (arg.name == "conversation_id") { @@ -653,6 +684,9 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { ESP_LOGW(TAG, "No text in TTS_START event"); return; } + if (text.length() > 500) { + text = text.substr(0, 497) + "..."; + } ESP_LOGD(TAG, "Response: \"%s\"", text.c_str()); this->defer([this, text]() { this->tts_start_trigger_->trigger(text); @@ -678,7 +712,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str()); this->defer([this, url]() { #ifdef USE_MEDIA_PLAYER - if (this->media_player_ != nullptr) { + if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) { this->media_player_->make_call().set_media_url(url).set_announcement(true).perform(); this->media_player_wait_for_announcement_start_ = true; diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h index 865731522f..2424ea6052 100644 --- a/esphome/components/voice_assistant/voice_assistant.h +++ b/esphome/components/voice_assistant/voice_assistant.h @@ -177,6 +177,7 @@ class VoiceAssistant : public Component { Trigger<> *get_intent_end_trigger() const { return this->intent_end_trigger_; } Trigger<> *get_intent_start_trigger() const { return this->intent_start_trigger_; } + Trigger *get_intent_progress_trigger() const { return this->intent_progress_trigger_; } Trigger<> *get_listening_trigger() const { return this->listening_trigger_; } Trigger<> *get_end_trigger() const { return this->end_trigger_; } Trigger<> *get_start_trigger() const { return this->start_trigger_; } @@ -233,6 +234,7 @@ class VoiceAssistant : public Component { Trigger<> *tts_stream_start_trigger_ = new Trigger<>(); Trigger<> *tts_stream_end_trigger_ = new Trigger<>(); #endif + Trigger *intent_progress_trigger_ = new Trigger(); Trigger<> *wake_word_detected_trigger_ = new Trigger<>(); Trigger *stt_end_trigger_ = new Trigger(); Trigger *tts_end_trigger_ = new Trigger(); @@ -268,6 +270,8 @@ class VoiceAssistant : public Component { #endif #ifdef USE_MEDIA_PLAYER media_player::MediaPlayer *media_player_{nullptr}; + std::string tts_response_url_{""}; + bool started_streaming_tts_{false}; bool media_player_wait_for_announcement_start_{false}; bool media_player_wait_for_announcement_end_{false}; #endif