From 9f3f4ead4f71a5af9860a7527ac5d96bd3d3d24e Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kahrendt@gmail.com>
Date: Thu, 26 Jun 2025 20:18:51 +0100
Subject: [PATCH] [voice_assistant] Support streaming TTS responses and fix
 crash for long responses (#9224)

---
 CODEOWNERS                                    |  2 +-
 .../components/voice_assistant/__init__.py    | 13 ++++++-
 .../voice_assistant/voice_assistant.cpp       | 38 ++++++++++++++++++-
 .../voice_assistant/voice_assistant.h         |  4 ++
 4 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index 66ea80f8d6..a0812c9cd6 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -490,7 +490,7 @@ esphome/components/vbus/* @ssieb
 esphome/components/veml3235/* @kbx81
 esphome/components/veml7700/* @latonita
 esphome/components/version/* @esphome/core
-esphome/components/voice_assistant/* @jesserockz
+esphome/components/voice_assistant/* @jesserockz @kahrendt
 esphome/components/wake_on_lan/* @clydebarrow @willwill2will54
 esphome/components/watchdog/* @oarcher
 esphome/components/waveshare_epaper/* @clydebarrow
diff --git a/esphome/components/voice_assistant/__init__.py b/esphome/components/voice_assistant/__init__.py
index b9309ab422..59c7ec8383 100644
--- a/esphome/components/voice_assistant/__init__.py
+++ b/esphome/components/voice_assistant/__init__.py
@@ -17,10 +17,11 @@ from esphome.const import (
 AUTO_LOAD = ["socket"]
 DEPENDENCIES = ["api", "microphone"]
 
-CODEOWNERS = ["@jesserockz"]
+CODEOWNERS = ["@jesserockz", "@kahrendt"]
 
 CONF_ON_END = "on_end"
 CONF_ON_INTENT_END = "on_intent_end"
+CONF_ON_INTENT_PROGRESS = "on_intent_progress"
 CONF_ON_INTENT_START = "on_intent_start"
 CONF_ON_LISTENING = "on_listening"
 CONF_ON_START = "on_start"
@@ -136,6 +137,9 @@ CONFIG_SCHEMA = cv.All(
             cv.Optional(CONF_ON_INTENT_START): automation.validate_automation(
                 single=True
             ),
+            cv.Optional(CONF_ON_INTENT_PROGRESS): automation.validate_automation(
+                single=True
+            ),
             cv.Optional(CONF_ON_INTENT_END): automation.validate_automation(
                 single=True
             ),
@@ -282,6 +286,13 @@ async def to_code(config):
             config[CONF_ON_INTENT_START],
         )
 
+    if CONF_ON_INTENT_PROGRESS in config:
+        await automation.build_automation(
+            var.get_intent_progress_trigger(),
+            [(cg.std_string, "x")],
+            config[CONF_ON_INTENT_PROGRESS],
+        )
+
     if CONF_ON_INTENT_END in config:
         await automation.build_automation(
             var.get_intent_end_trigger(),
diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp
index a692a7556e..879d9492f0 100644
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -555,7 +555,7 @@ void VoiceAssistant::request_stop() {
       break;
     case State::AWAITING_RESPONSE:
       this->signal_stop_();
-      break;
+      // Fallthrough intended to stop a streaming TTS announcement that has potentially started
     case State::STREAMING_RESPONSE:
 #ifdef USE_MEDIA_PLAYER
       // Stop any ongoing media player announcement
@@ -599,6 +599,14 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
   switch (msg.event_type) {
     case api::enums::VOICE_ASSISTANT_RUN_START:
       ESP_LOGD(TAG, "Assist Pipeline running");
+#ifdef USE_MEDIA_PLAYER
+      this->started_streaming_tts_ = false;
+      for (auto arg : msg.data) {
+        if (arg.name == "url") {
+          this->tts_response_url_ = std::move(arg.value);
+        }
+      }
+#endif
       this->defer([this]() { this->start_trigger_->trigger(); });
       break;
     case api::enums::VOICE_ASSISTANT_WAKE_WORD_START:
@@ -622,6 +630,8 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
       if (text.empty()) {
         ESP_LOGW(TAG, "No text in STT_END event");
         return;
+      } else if (text.length() > 500) {
+        text = text.substr(0, 497) + "...";
       }
       ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
       this->defer([this, text]() { this->stt_end_trigger_->trigger(text); });
@@ -631,6 +641,27 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
       ESP_LOGD(TAG, "Intent started");
       this->defer([this]() { this->intent_start_trigger_->trigger(); });
       break;
+    case api::enums::VOICE_ASSISTANT_INTENT_PROGRESS: {
+      ESP_LOGD(TAG, "Intent progress");
+      std::string tts_url_for_trigger = "";
+#ifdef USE_MEDIA_PLAYER
+      if (this->media_player_ != nullptr) {
+        for (const auto &arg : msg.data) {
+          if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) {
+            this->media_player_->make_call().set_media_url(this->tts_response_url_).set_announcement(true).perform();
+
+            this->media_player_wait_for_announcement_start_ = true;
+            this->media_player_wait_for_announcement_end_ = false;
+            this->started_streaming_tts_ = true;
+            tts_url_for_trigger = this->tts_response_url_;
+            this->tts_response_url_.clear();  // Reset streaming URL
+          }
+        }
+      }
+#endif
+      this->defer([this, tts_url_for_trigger]() { this->intent_progress_trigger_->trigger(tts_url_for_trigger); });
+      break;
+    }
     case api::enums::VOICE_ASSISTANT_INTENT_END: {
       for (auto arg : msg.data) {
         if (arg.name == "conversation_id") {
@@ -653,6 +684,9 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
         ESP_LOGW(TAG, "No text in TTS_START event");
         return;
       }
+      if (text.length() > 500) {
+        text = text.substr(0, 497) + "...";
+      }
       ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
       this->defer([this, text]() {
         this->tts_start_trigger_->trigger(text);
@@ -678,7 +712,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
       ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
       this->defer([this, url]() {
 #ifdef USE_MEDIA_PLAYER
-        if (this->media_player_ != nullptr) {
+        if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) {
           this->media_player_->make_call().set_media_url(url).set_announcement(true).perform();
 
           this->media_player_wait_for_announcement_start_ = true;
diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h
index 865731522f..2424ea6052 100644
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@@ -177,6 +177,7 @@ class VoiceAssistant : public Component {
 
   Trigger<> *get_intent_end_trigger() const { return this->intent_end_trigger_; }
   Trigger<> *get_intent_start_trigger() const { return this->intent_start_trigger_; }
+  Trigger<std::string> *get_intent_progress_trigger() const { return this->intent_progress_trigger_; }
   Trigger<> *get_listening_trigger() const { return this->listening_trigger_; }
   Trigger<> *get_end_trigger() const { return this->end_trigger_; }
   Trigger<> *get_start_trigger() const { return this->start_trigger_; }
@@ -233,6 +234,7 @@ class VoiceAssistant : public Component {
   Trigger<> *tts_stream_start_trigger_ = new Trigger<>();
   Trigger<> *tts_stream_end_trigger_ = new Trigger<>();
 #endif
+  Trigger<std::string> *intent_progress_trigger_ = new Trigger<std::string>();
   Trigger<> *wake_word_detected_trigger_ = new Trigger<>();
   Trigger<std::string> *stt_end_trigger_ = new Trigger<std::string>();
   Trigger<std::string> *tts_end_trigger_ = new Trigger<std::string>();
@@ -268,6 +270,8 @@ class VoiceAssistant : public Component {
 #endif
 #ifdef USE_MEDIA_PLAYER
   media_player::MediaPlayer *media_player_{nullptr};
+  std::string tts_response_url_{""};
+  bool started_streaming_tts_{false};
   bool media_player_wait_for_announcement_start_{false};
   bool media_player_wait_for_announcement_end_{false};
 #endif