From c7f597bc753deef9c49d0edb95f7e4c79e5f170d Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kevin.ahrendt@openhomefoundation.org>
Date: Thu, 1 May 2025 06:11:09 -0500
Subject: [PATCH] [voice_assistant] voice assistant can configure enabled wake
 words (#8657)

---
 .../components/voice_assistant/__init__.py    | 16 +++--
 .../voice_assistant/voice_assistant.cpp       | 53 ++++++++++++++
 .../voice_assistant/voice_assistant.h         | 20 ++++--
 .../voice_assistant/common-idf.yaml           | 69 +++++++++++++++++++
 .../voice_assistant/test.esp32-c3-idf.yaml    |  2 +-
 .../voice_assistant/test.esp32-idf.yaml       |  2 +-
 6 files changed, 150 insertions(+), 12 deletions(-)
 create mode 100644 tests/components/voice_assistant/common-idf.yaml

diff --git a/esphome/components/voice_assistant/__init__.py b/esphome/components/voice_assistant/__init__.py
index ca0b6da742..b9309ab422 100644
--- a/esphome/components/voice_assistant/__init__.py
+++ b/esphome/components/voice_assistant/__init__.py
@@ -1,7 +1,7 @@
 from esphome import automation
 from esphome.automation import register_action, register_condition
 import esphome.codegen as cg
-from esphome.components import media_player, microphone, speaker
+from esphome.components import media_player, micro_wake_word, microphone, speaker
 import esphome.config_validation as cv
 from esphome.const import (
     CONF_ID,
@@ -41,6 +41,7 @@ CONF_AUTO_GAIN = "auto_gain"
 CONF_NOISE_SUPPRESSION_LEVEL = "noise_suppression_level"
 CONF_VOLUME_MULTIPLIER = "volume_multiplier"
 
+CONF_MICRO_WAKE_WORD = "micro_wake_word"
 CONF_WAKE_WORD = "wake_word"
 
 CONF_CONVERSATION_TIMEOUT = "conversation_timeout"
@@ -96,11 +97,12 @@ CONFIG_SCHEMA = cv.All(
                 min_channels=1,
                 max_channels=1,
             ),
-            cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker),
             cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(
                 media_player.MediaPlayer
             ),
+            cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker),
             cv.Optional(CONF_USE_WAKE_WORD, default=False): cv.boolean,
+            cv.Optional(CONF_MICRO_WAKE_WORD): cv.use_id(micro_wake_word.MicroWakeWord),
             cv.Optional(CONF_VAD_THRESHOLD): cv.invalid(
                 "VAD threshold is no longer supported, as it requires the deprecated esp_adf external component. Use an i2s_audio microphone/speaker instead. Additionally, you may need to configure the audio_adc and audio_dac components depending on your hardware."
             ),
@@ -191,14 +193,18 @@ async def to_code(config):
     mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
     cg.add(var.set_microphone_source(mic_source))
 
-    if CONF_SPEAKER in config:
-        spkr = await cg.get_variable(config[CONF_SPEAKER])
-        cg.add(var.set_speaker(spkr))
+    if CONF_MICRO_WAKE_WORD in config:
+        mww = await cg.get_variable(config[CONF_MICRO_WAKE_WORD])
+        cg.add(var.set_micro_wake_word(mww))
 
     if CONF_MEDIA_PLAYER in config:
         mp = await cg.get_variable(config[CONF_MEDIA_PLAYER])
         cg.add(var.set_media_player(mp))
 
+    if CONF_SPEAKER in config:
+        spkr = await cg.get_variable(config[CONF_SPEAKER])
+        cg.add(var.set_speaker(spkr))
+
     cg.add(var.set_use_wake_word(config[CONF_USE_WAKE_WORD]))
 
     if (vad_threshold := config.get(CONF_VAD_THRESHOLD)) is not None:
diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp
index 37b97239c8..d35717ef91 100644
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -869,6 +869,59 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg)
 #endif
 }
 
+// Applies a requested wake word selection: disable() is called on every microWakeWord model, then enable() on each
+// model whose id appears in active_wake_words. Does nothing without USE_MICRO_WAKE_WORD or an attached component.
+void VoiceAssistant::on_set_configuration(const std::vector<std::string> &active_wake_words) {
+#ifdef USE_MICRO_WAKE_WORD
+  if (this->micro_wake_word_) {
+    // Disable everything first so models absent from the request end up off, then enable the requested ids
+    for (auto &model : this->micro_wake_word_->get_wake_words()) {
+      model->disable();
+    }
+    for (const auto &ww_id : active_wake_words) {
+      for (auto &model : this->micro_wake_word_->get_wake_words()) {
+        if (model->get_id() == ww_id) {
+          model->enable();
+          ESP_LOGD(TAG, "Enabled wake word: %s (id=%s)", model->get_wake_word().c_str(), model->get_id().c_str());
+        }
+      }
+    }
+  }
+#endif
+}
+
+// Rebuilds this->config_ from the attached microWakeWord component (if any) and returns a reference to it.
+// The reference aliases member state, so its contents are overwritten by the next call.
+const Configuration &VoiceAssistant::get_configuration() {
+  this->config_.available_wake_words.clear();
+  this->config_.active_wake_words.clear();
+  // Default for "no microWakeWord"; overridden below when the component is compiled in and attached
+  this->config_.max_active_wake_words = 0;
+
+#ifdef USE_MICRO_WAKE_WORD
+  if (this->micro_wake_word_) {
+    this->config_.max_active_wake_words = 1;
+
+    for (auto &model : this->micro_wake_word_->get_wake_words()) {
+      if (model->is_enabled()) {
+        this->config_.active_wake_words.push_back(model->get_id());
+      }
+
+      // Every model is reported as available, whether or not it is currently enabled
+      WakeWord wake_word;
+      wake_word.id = model->get_id();
+      wake_word.wake_word = model->get_wake_word();
+      for (const auto &lang : model->get_trained_languages()) {
+        wake_word.trained_languages.push_back(lang);
+      }
+      this->config_.available_wake_words.push_back(std::move(wake_word));
+    }
+  }
+#endif
+
+  return this->config_;
+}
+
 VoiceAssistant *global_voice_assistant = nullptr;  // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
 
 }  // namespace voice_assistant
diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h
index 7122d69527..865731522f 100644
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@@ -12,12 +12,15 @@
 #include "esphome/components/api/api_connection.h"
 #include "esphome/components/api/api_pb2.h"
 #include "esphome/components/microphone/microphone_source.h"
-#ifdef USE_SPEAKER
-#include "esphome/components/speaker/speaker.h"
-#endif
 #ifdef USE_MEDIA_PLAYER
 #include "esphome/components/media_player/media_player.h"
 #endif
+#ifdef USE_MICRO_WAKE_WORD
+#include "esphome/components/micro_wake_word/micro_wake_word.h"
+#endif
+#ifdef USE_SPEAKER
+#include "esphome/components/speaker/speaker.h"
+#endif
 #include "esphome/components/socket/socket.h"
 
 #include <unordered_map>
@@ -99,6 +102,9 @@ class VoiceAssistant : public Component {
   void failed_to_start();
 
   void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; }
+#ifdef USE_MICRO_WAKE_WORD
+  void set_micro_wake_word(micro_wake_word::MicroWakeWord *mww) { this->micro_wake_word_ = mww; }
+#endif
 #ifdef USE_SPEAKER
   void set_speaker(speaker::Speaker *speaker) {
     this->speaker_ = speaker;
@@ -152,8 +158,8 @@ class VoiceAssistant : public Component {
   void on_audio(const api::VoiceAssistantAudio &msg);
   void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg);
   void on_announce(const api::VoiceAssistantAnnounceRequest &msg);
-  void on_set_configuration(const std::vector<std::string> &active_wake_words){};
-  const Configuration &get_configuration() { return this->config_; };
+  void on_set_configuration(const std::vector<std::string> &active_wake_words);
+  const Configuration &get_configuration();
 
   bool is_running() const { return this->state_ != State::IDLE; }
   void set_continuous(bool continuous) { this->continuous_ = continuous; }
@@ -295,6 +301,10 @@ class VoiceAssistant : public Component {
   bool start_udp_socket_();
 
   Configuration config_{};
+
+#ifdef USE_MICRO_WAKE_WORD
+  micro_wake_word::MicroWakeWord *micro_wake_word_{nullptr};
+#endif
 };
 
 template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
diff --git a/tests/components/voice_assistant/common-idf.yaml b/tests/components/voice_assistant/common-idf.yaml
new file mode 100644
index 0000000000..b1d249d5b4
--- /dev/null
+++ b/tests/components/voice_assistant/common-idf.yaml
@@ -0,0 +1,69 @@
+esphome:
+  on_boot:
+    then:  # NOTE(review): presumably listed so the test build covers all three voice_assistant actions
+      - voice_assistant.start
+      - voice_assistant.start_continuous
+      - voice_assistant.stop
+
+wifi:
+  ssid: MySSID
+  password: password1
+
+api:
+
+i2s_audio:
+  i2s_lrclk_pin: ${i2s_lrclk_pin}
+  i2s_bclk_pin: ${i2s_bclk_pin}
+  i2s_mclk_pin: ${i2s_mclk_pin}
+
+micro_wake_word:
+  id: mww_id  # referenced by the voice_assistant `micro_wake_word` option further down
+  on_wake_word_detected:
+    - voice_assistant.start:
+        wake_word: !lambda return wake_word;
+  models:
+    - model: okay_nabu
+
+microphone:
+  - platform: i2s_audio
+    id: mic_id_external
+    i2s_din_pin: ${i2s_din_pin}
+    adc_type: external
+    pdm: false
+
+speaker:
+  - platform: i2s_audio
+    id: speaker_id
+    dac_type: external
+    i2s_dout_pin: ${i2s_dout_pin}
+
+voice_assistant:
+  microphone:
+    microphone: mic_id_external  # id of the i2s_audio microphone declared above
+    gain_factor: 4
+    channels: 0
+  speaker: speaker_id  # id of the i2s_audio speaker declared above
+  micro_wake_word: mww_id  # passes the micro_wake_word component to set_micro_wake_word() in the generated code
+  conversation_timeout: 60s
+  on_listening:
+    - logger.log: "Voice assistant microphone listening"
+  on_start:
+    - logger.log: "Voice assistant started"
+  on_stt_end:
+    - logger.log:
+        format: "Voice assistant STT ended with result %s"
+        args: [x.c_str()]
+  on_tts_start:
+    - logger.log:
+        format: "Voice assistant TTS started with text %s"
+        args: [x.c_str()]
+  on_tts_end:
+    - logger.log:
+        format: "Voice assistant TTS ended with url %s"
+        args: [x.c_str()]
+  on_end:
+    - logger.log: "Voice assistant ended"
+  on_error:
+    - logger.log:
+        format: "Voice assistant error - code %s, message: %s"
+        args: [code.c_str(), message.c_str()]
diff --git a/tests/components/voice_assistant/test.esp32-c3-idf.yaml b/tests/components/voice_assistant/test.esp32-c3-idf.yaml
index f596d927cb..46745e4308 100644
--- a/tests/components/voice_assistant/test.esp32-c3-idf.yaml
+++ b/tests/components/voice_assistant/test.esp32-c3-idf.yaml
@@ -5,4 +5,4 @@ substitutions:
   i2s_din_pin: GPIO3
   i2s_dout_pin: GPIO2
 
-<<: !include common.yaml
+<<: !include common-idf.yaml
diff --git a/tests/components/voice_assistant/test.esp32-idf.yaml b/tests/components/voice_assistant/test.esp32-idf.yaml
index f6e553f9dc..0fe5d347be 100644
--- a/tests/components/voice_assistant/test.esp32-idf.yaml
+++ b/tests/components/voice_assistant/test.esp32-idf.yaml
@@ -5,4 +5,4 @@ substitutions:
   i2s_din_pin: GPIO13
   i2s_dout_pin: GPIO12
 
-<<: !include common.yaml
+<<: !include common-idf.yaml