[voice_assistant] voice assistant can configure enabled wake words (#8657)

2025-07-28 14:16:40 +00:00 · 2025-05-01 06:11:09 -05:00 · 2025-05-01 06:11:09 -05:00 · c7f597bc75
commit c7f597bc75
parent e215fafebe
6 changed files with 150 additions and 12 deletions
--- a/esphome/components/voice_assistant/init.py
+++ b/esphome/components/voice_assistant/init.py
@ -1,7 +1,7 @@
 from esphome import automation
 from esphome.automation import register_action, register_condition
 import esphome.codegen as cg
-from esphome.components import media_player, microphone, speaker
+from esphome.components import media_player, micro_wake_word, microphone, speaker
 import esphome.config_validation as cv
 from esphome.const import (
    CONF_ID,
@ -41,6 +41,7 @@ CONF_AUTO_GAIN = "auto_gain"
 CONF_NOISE_SUPPRESSION_LEVEL = "noise_suppression_level"
 CONF_VOLUME_MULTIPLIER = "volume_multiplier"
 CONF_MICRO_WAKE_WORD = "micro_wake_word"
 CONF_WAKE_WORD = "wake_word"
 CONF_CONVERSATION_TIMEOUT = "conversation_timeout"
@ -96,11 +97,12 @@ CONFIG_SCHEMA = cv.All(
                min_channels=1,
                max_channels=1,
            ),
            cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker),
            cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(
                media_player.MediaPlayer
            ),
            cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker),
            cv.Optional(CONF_USE_WAKE_WORD, default=False): cv.boolean,
            cv.Optional(CONF_MICRO_WAKE_WORD): cv.use_id(micro_wake_word.MicroWakeWord),
            cv.Optional(CONF_VAD_THRESHOLD): cv.invalid(
                "VAD threshold is no longer supported, as it requires the deprecated esp_adf external component. Use an i2s_audio microphone/speaker instead. Additionally, you may need to configure the audio_adc and audio_dac components depending on your hardware."
            ),
@ -191,14 +193,18 @@ async def to_code(config):
    mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
    cg.add(var.set_microphone_source(mic_source))
-    if CONF_SPEAKER in config:
+    if CONF_MICRO_WAKE_WORD in config:
-        spkr = await cg.get_variable(config[CONF_SPEAKER])
+        mww = await cg.get_variable(config[CONF_MICRO_WAKE_WORD])
-        cg.add(var.set_speaker(spkr))
+        cg.add(var.set_micro_wake_word(mww))
    if CONF_MEDIA_PLAYER in config:
        mp = await cg.get_variable(config[CONF_MEDIA_PLAYER])
        cg.add(var.set_media_player(mp))
    if CONF_SPEAKER in config:
        spkr = await cg.get_variable(config[CONF_SPEAKER])
        cg.add(var.set_speaker(spkr))
    cg.add(var.set_use_wake_word(config[CONF_USE_WAKE_WORD]))
    if (vad_threshold := config.get(CONF_VAD_THRESHOLD)) is not None:
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@ -869,6 +869,59 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg)
 #endif
 }
 /// Applies the list of active wake words to the attached micro_wake_word component.
 ///
 /// Every model reported by the component is disabled first; afterwards each model whose id
 /// equals an entry of @p active_wake_words is enabled and logged. Ids that match no model are
 /// ignored. When USE_MICRO_WAKE_WORD is not defined, or no micro_wake_word component has been
 /// set, the call does nothing.
 ///
 /// @param active_wake_words Ids of the wake word models that should be enabled.
 void VoiceAssistant::on_set_configuration(const std::vector<std::string> &active_wake_words) {
 #ifdef USE_MICRO_WAKE_WORD
  if (this->micro_wake_word_) {
    // Disable all wake words first
    for (auto &model : this->micro_wake_word_->get_wake_words()) {
      model->disable();
    }
    // Enable only active wake words.
    // Bind each id by const reference: iterating by value copies the std::string every pass.
    for (const auto &ww_id : active_wake_words) {
      for (auto &model : this->micro_wake_word_->get_wake_words()) {
        if (model->get_id() == ww_id) {
          model->enable();
          ESP_LOGD(TAG, "Enabled wake word: %s (id=%s)", model->get_wake_word().c_str(), model->get_id().c_str());
        }
      }
    }
  }
 #endif
 }
 /// Rebuilds the cached configuration and returns a reference to it.
 ///
 /// Both wake word lists are cleared on every call. With a micro_wake_word component attached
 /// (and USE_MICRO_WAKE_WORD defined), each of its models is added to available_wake_words, the
 /// ids of the enabled models are added to active_wake_words, and max_active_wake_words is 1.
 /// Otherwise the lists stay empty and max_active_wake_words is 0.
 ///
 /// @return Reference to the config_ member, refreshed by this call.
 const Configuration &VoiceAssistant::get_configuration() {
  auto &cfg = this->config_;
  cfg.available_wake_words.clear();
  cfg.active_wake_words.clear();
  // No microWakeWord unless the block below finds one attached
  cfg.max_active_wake_words = 0;
 #ifdef USE_MICRO_WAKE_WORD
  if (this->micro_wake_word_ != nullptr) {
    cfg.max_active_wake_words = 1;
    for (auto &mww_model : this->micro_wake_word_->get_wake_words()) {
      if (mww_model->is_enabled()) {
        cfg.active_wake_words.push_back(mww_model->get_id());
      }
      // Describe the model (id, phrase, trained languages) for the available list.
      WakeWord entry;
      entry.id = mww_model->get_id();
      entry.wake_word = mww_model->get_wake_word();
      for (const auto &language : mww_model->get_trained_languages()) {
        entry.trained_languages.push_back(language);
      }
      cfg.available_wake_words.push_back(std::move(entry));
    }
  }
 #endif
  return cfg;
 }
 VoiceAssistant *global_voice_assistant = nullptr;  // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
 }  // namespace voice_assistant
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@ -12,12 +12,15 @@
 #include "esphome/components/api/api_connection.h"
 #include "esphome/components/api/api_pb2.h"
 #include "esphome/components/microphone/microphone_source.h"
 #ifdef USE_SPEAKER
 #include "esphome/components/speaker/speaker.h"
 #endif
 #ifdef USE_MEDIA_PLAYER
 #include "esphome/components/media_player/media_player.h"
 #endif
 #ifdef USE_MICRO_WAKE_WORD
 #include "esphome/components/micro_wake_word/micro_wake_word.h"
 #endif
 #ifdef USE_SPEAKER
 #include "esphome/components/speaker/speaker.h"
 #endif
 #include "esphome/components/socket/socket.h"
 #include <unordered_map>
@ -99,6 +102,9 @@ class VoiceAssistant : public Component {
  void failed_to_start();
  /// Stores the given microphone source pointer in mic_source_.
  void set_microphone_source(microphone::MicrophoneSource *mic_source) {
    this->mic_source_ = mic_source;
  }
 #ifdef USE_MICRO_WAKE_WORD
  /// Stores the given micro_wake_word component pointer in micro_wake_word_.
  /// Only available when USE_MICRO_WAKE_WORD is defined.
  void set_micro_wake_word(micro_wake_word::MicroWakeWord *mww) {
    this->micro_wake_word_ = mww;
  }
 #endif
 #ifdef USE_SPEAKER
  void set_speaker(speaker::Speaker *speaker) {
    this->speaker_ = speaker;
@ -152,8 +158,8 @@ class VoiceAssistant : public Component {
  void on_audio(const api::VoiceAssistantAudio &msg);
  void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg);
  void on_announce(const api::VoiceAssistantAnnounceRequest &msg);
-  void on_set_configuration(const std::vector<std::string> &active_wake_words){};
+  void on_set_configuration(const std::vector<std::string> &active_wake_words);
-  const Configuration &get_configuration() { return this->config_; };
+  const Configuration &get_configuration();
  /// Returns true whenever the current state is anything other than State::IDLE.
  bool is_running() const {
    return this->state_ != State::IDLE;
  }
  /// Stores the given flag in continuous_.
  void set_continuous(bool continuous) {
    this->continuous_ = continuous;
  }
@ -295,6 +301,10 @@ class VoiceAssistant : public Component {
  bool start_udp_socket_();
  Configuration config_{};
 #ifdef USE_MICRO_WAKE_WORD
  micro_wake_word::MicroWakeWord *micro_wake_word_{nullptr};
 #endif
 };
 template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
--- a/tests/components/voice_assistant/common-idf.yaml
+++ b/tests/components/voice_assistant/common-idf.yaml
@ -0,0 +1,69 @@
 # Shared voice_assistant test configuration that also attaches a micro_wake_word component.
 # Included by the *-idf test files via `<<: !include common-idf.yaml`.
 esphome:
  # Run the start, start_continuous and stop actions once at boot.
  on_boot:
    then:
      - voice_assistant.start
      - voice_assistant.start_continuous
      - voice_assistant.stop
 wifi:
  ssid: MySSID
  password: password1
 api:
 # The ${...} pin values are substitutions; presumably each including test file defines them.
 i2s_audio:
  i2s_lrclk_pin: ${i2s_lrclk_pin}
  i2s_bclk_pin: ${i2s_bclk_pin}
  i2s_mclk_pin: ${i2s_mclk_pin}
 # Wake word component referenced by voice_assistant below through its id.
 micro_wake_word:
  id: mww_id
  # On detection, start the voice assistant and pass along the detected wake word.
  on_wake_word_detected:
    - voice_assistant.start:
        wake_word: !lambda return wake_word;
  models:
    - model: okay_nabu
 microphone:
  - platform: i2s_audio
    id: mic_id_external
    i2s_din_pin: ${i2s_din_pin}
    adc_type: external
    pdm: false
 speaker:
  - platform: i2s_audio
    id: speaker_id
    dac_type: external
    i2s_dout_pin: ${i2s_dout_pin}
 voice_assistant:
  microphone:
    microphone: mic_id_external
    gain_factor: 4
    channels: 0
  speaker: speaker_id
  # Link to the micro_wake_word instance declared above (the option under test).
  micro_wake_word: mww_id
  conversation_timeout: 60s
  # Each trigger below only writes a log line.
  on_listening:
    - logger.log: "Voice assistant microphone listening"
  on_start:
    - logger.log: "Voice assistant started"
  on_stt_end:
    - logger.log:
        format: "Voice assistant STT ended with result %s"
        args: [x.c_str()]
  on_tts_start:
    - logger.log:
        format: "Voice assistant TTS started with text %s"
        args: [x.c_str()]
  on_tts_end:
    - logger.log:
        format: "Voice assistant TTS ended with url %s"
        args: [x.c_str()]
  on_end:
    - logger.log: "Voice assistant ended"
  on_error:
    - logger.log:
        format: "Voice assistant error - code %s, message: %s"
        args: [code.c_str(), message.c_str()]
--- a/tests/components/voice_assistant/test.esp32-c3-idf.yaml
+++ b/tests/components/voice_assistant/test.esp32-c3-idf.yaml
@ -5,4 +5,4 @@ substitutions:
  i2s_din_pin: GPIO3
  i2s_dout_pin: GPIO2
-<<: !include common.yaml
+<<: !include common-idf.yaml
--- a/tests/components/voice_assistant/test.esp32-idf.yaml
+++ b/tests/components/voice_assistant/test.esp32-idf.yaml
@ -5,4 +5,4 @@ substitutions:
  i2s_din_pin: GPIO13
  i2s_dout_pin: GPIO12
-<<: !include common.yaml
+<<: !include common-idf.yaml