[voice_assistant] voice assistant can configure enabled wake words (#8657)

2025-07-28 14:16:40 +00:00 · 2025-05-01 06:11:09 -05:00 · 2025-05-01 06:11:09 -05:00 · c7f597bc75
commit c7f597bc75
parent e215fafebe
6 changed files with 150 additions and 12 deletions
--- a/esphome/components/voice_assistant/__init__.py
+++ b/esphome/components/voice_assistant/__init__.py
@@ -1,7 +1,7 @@
 from esphome import automation
 from esphome.automation import register_action, register_condition
 import esphome.codegen as cg
-from esphome.components import media_player, microphone, speaker
+from esphome.components import media_player, micro_wake_word, microphone, speaker
 import esphome.config_validation as cv
 from esphome.const import (
    CONF_ID,
@@ -41,6 +41,7 @@ CONF_AUTO_GAIN = "auto_gain"
 CONF_NOISE_SUPPRESSION_LEVEL = "noise_suppression_level"
 CONF_VOLUME_MULTIPLIER = "volume_multiplier"

+CONF_MICRO_WAKE_WORD = "micro_wake_word"
 CONF_WAKE_WORD = "wake_word"

 CONF_CONVERSATION_TIMEOUT = "conversation_timeout"
@@ -96,11 +97,12 @@ CONFIG_SCHEMA = cv.All(
                min_channels=1,
                max_channels=1,
            ),
-            cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker),
            cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(
                media_player.MediaPlayer
            ),
+            cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker),
            cv.Optional(CONF_USE_WAKE_WORD, default=False): cv.boolean,
+            cv.Optional(CONF_MICRO_WAKE_WORD): cv.use_id(micro_wake_word.MicroWakeWord),
            cv.Optional(CONF_VAD_THRESHOLD): cv.invalid(
                "VAD threshold is no longer supported, as it requires the deprecated esp_adf external component. Use an i2s_audio microphone/speaker instead. Additionally, you may need to configure the audio_adc and audio_dac components depending on your hardware."
            ),
@@ -191,14 +193,18 @@ async def to_code(config):
    mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
    cg.add(var.set_microphone_source(mic_source))

-    if CONF_SPEAKER in config:
-        spkr = await cg.get_variable(config[CONF_SPEAKER])
-        cg.add(var.set_speaker(spkr))
+    if CONF_MICRO_WAKE_WORD in config:
+        mww = await cg.get_variable(config[CONF_MICRO_WAKE_WORD])
+        cg.add(var.set_micro_wake_word(mww))

    if CONF_MEDIA_PLAYER in config:
        mp = await cg.get_variable(config[CONF_MEDIA_PLAYER])
        cg.add(var.set_media_player(mp))

+    if CONF_SPEAKER in config:
+        spkr = await cg.get_variable(config[CONF_SPEAKER])
+        cg.add(var.set_speaker(spkr))
+
    cg.add(var.set_use_wake_word(config[CONF_USE_WAKE_WORD]))

    if (vad_threshold := config.get(CONF_VAD_THRESHOLD)) is not None:
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -869,6 +869,59 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg)
 #endif
 }

+// Enables only the models whose id is in active_wake_words; does nothing if no microWakeWord component is set.
+void VoiceAssistant::on_set_configuration(const std::vector<std::string> &active_wake_words) {
+#ifdef USE_MICRO_WAKE_WORD
+  if (this->micro_wake_word_) {
+    // Disable all wake words first so models absent from the request end up disabled
+    for (auto &model : this->micro_wake_word_->get_wake_words()) {
+      model->disable();
+    }
+    // Enable only active wake words; ids are bound by const reference to avoid a std::string copy per iteration
+    for (const auto &ww_id : active_wake_words) {
+      for (auto &model : this->micro_wake_word_->get_wake_words()) {
+        if (model->get_id() == ww_id) {
+          model->enable();
+          ESP_LOGD(TAG, "Enabled wake word: %s (id=%s)", model->get_wake_word().c_str(), model->get_id().c_str());
+        }
+      }
+    }
+  }
+#endif
+}
+
+// Rebuilds config_ from the attached microWakeWord component (if any) and returns a reference to it.
+const Configuration &VoiceAssistant::get_configuration() {
+  auto &cfg = this->config_;
+  cfg.available_wake_words.clear();
+  cfg.active_wake_words.clear();
+  // No microWakeWord (compiled out or not set): the limit stays at 0 and both lists stay empty
+  cfg.max_active_wake_words = 0;
+
+#ifdef USE_MICRO_WAKE_WORD
+  if (this->micro_wake_word_ != nullptr) {
+    cfg.max_active_wake_words = 1;
+
+    for (auto &ww_model : this->micro_wake_word_->get_wake_words()) {
+      // Only enabled models are reported as active; every model is reported as available
+      if (ww_model->is_enabled()) {
+        cfg.active_wake_words.push_back(ww_model->get_id());
+      }
+
+      WakeWord entry;
+      entry.id = ww_model->get_id();
+      entry.wake_word = ww_model->get_wake_word();
+      for (const auto &language : ww_model->get_trained_languages()) {
+        entry.trained_languages.push_back(language);
+      }
+      cfg.available_wake_words.push_back(std::move(entry));
+    }
+  }
+#endif
+
+  return cfg;
+}
+
 VoiceAssistant *global_voice_assistant = nullptr;  // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)

 }  // namespace voice_assistant
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@@ -12,12 +12,15 @@
 #include "esphome/components/api/api_connection.h"
 #include "esphome/components/api/api_pb2.h"
 #include "esphome/components/microphone/microphone_source.h"
-#ifdef USE_SPEAKER
-#include "esphome/components/speaker/speaker.h"
-#endif
 #ifdef USE_MEDIA_PLAYER
 #include "esphome/components/media_player/media_player.h"
 #endif
+#ifdef USE_MICRO_WAKE_WORD
+#include "esphome/components/micro_wake_word/micro_wake_word.h"
+#endif
+#ifdef USE_SPEAKER
+#include "esphome/components/speaker/speaker.h"
+#endif
 #include "esphome/components/socket/socket.h"

 #include <unordered_map>
@@ -99,6 +102,9 @@ class VoiceAssistant : public Component {
  void failed_to_start();

  void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; }
+#ifdef USE_MICRO_WAKE_WORD
+  void set_micro_wake_word(micro_wake_word::MicroWakeWord *mww) { this->micro_wake_word_ = mww; }
+#endif
 #ifdef USE_SPEAKER
  void set_speaker(speaker::Speaker *speaker) {
    this->speaker_ = speaker;
@@ -152,8 +158,8 @@ class VoiceAssistant : public Component {
  void on_audio(const api::VoiceAssistantAudio &msg);
  void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg);
  void on_announce(const api::VoiceAssistantAnnounceRequest &msg);
-  void on_set_configuration(const std::vector<std::string> &active_wake_words){};
-  const Configuration &get_configuration() { return this->config_; };
+  void on_set_configuration(const std::vector<std::string> &active_wake_words);
+  const Configuration &get_configuration();

  bool is_running() const { return this->state_ != State::IDLE; }
  void set_continuous(bool continuous) { this->continuous_ = continuous; }
@@ -295,6 +301,10 @@ class VoiceAssistant : public Component {
  bool start_udp_socket_();

  Configuration config_{};
+
+#ifdef USE_MICRO_WAKE_WORD
+  micro_wake_word::MicroWakeWord *micro_wake_word_{nullptr};
+#endif
 };

 template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
--- a/tests/components/voice_assistant/common-idf.yaml
+++ b/tests/components/voice_assistant/common-idf.yaml
@@ -0,0 +1,69 @@
+esphome:
+  on_boot:
+    then:
+      - voice_assistant.start
+      - voice_assistant.start_continuous
+      - voice_assistant.stop
+
+wifi:
+  ssid: MySSID
+  password: password1
+
+api:
+
+i2s_audio:
+  i2s_lrclk_pin: ${i2s_lrclk_pin}
+  i2s_bclk_pin: ${i2s_bclk_pin}
+  i2s_mclk_pin: ${i2s_mclk_pin}
+
+micro_wake_word:
+  id: mww_id
+  on_wake_word_detected:
+    - voice_assistant.start:
+        wake_word: !lambda return wake_word;
+  models:
+    - model: okay_nabu
+
+microphone:
+  - platform: i2s_audio
+    id: mic_id_external
+    i2s_din_pin: ${i2s_din_pin}
+    adc_type: external
+    pdm: false
+
+speaker:
+  - platform: i2s_audio
+    id: speaker_id
+    dac_type: external
+    i2s_dout_pin: ${i2s_dout_pin}
+
+voice_assistant:
+  microphone:
+    microphone: mic_id_external
+    gain_factor: 4
+    channels: 0
+  speaker: speaker_id
+  micro_wake_word: mww_id
+  conversation_timeout: 60s
+  on_listening:
+    - logger.log: "Voice assistant microphone listening"
+  on_start:
+    - logger.log: "Voice assistant started"
+  on_stt_end:
+    - logger.log:
+        format: "Voice assistant STT ended with result %s"
+        args: [x.c_str()]
+  on_tts_start:
+    - logger.log:
+        format: "Voice assistant TTS started with text %s"
+        args: [x.c_str()]
+  on_tts_end:
+    - logger.log:
+        format: "Voice assistant TTS ended with url %s"
+        args: [x.c_str()]
+  on_end:
+    - logger.log: "Voice assistant ended"
+  on_error:
+    - logger.log:
+        format: "Voice assistant error - code %s, message: %s"
+        args: [code.c_str(), message.c_str()]
--- a/tests/components/voice_assistant/test.esp32-c3-idf.yaml
+++ b/tests/components/voice_assistant/test.esp32-c3-idf.yaml
@@ -5,4 +5,4 @@ substitutions:
  i2s_din_pin: GPIO3
  i2s_dout_pin: GPIO2

-<<: !include common.yaml
+<<: !include common-idf.yaml
--- a/tests/components/voice_assistant/test.esp32-idf.yaml
+++ b/tests/components/voice_assistant/test.esp32-idf.yaml
@@ -5,4 +5,4 @@ substitutions:
  i2s_din_pin: GPIO13
  i2s_dout_pin: GPIO12

-<<: !include common.yaml
+<<: !include common-idf.yaml