[micro_wake_word] Use microphone callback and avoid unnecessary allocation attempts (#8626)

2025-07-28 14:16:40 +00:00 · 2025-04-27 18:23:25 -05:00 · 2025-04-27 18:23:25 -05:00 · ee646d7324
commit ee646d7324
parent e557bca420
2 changed files with 49 additions and 52 deletions
--- a/esphome/components/micro_wake_word/micro_wake_word.cpp
+++ b/esphome/components/micro_wake_word/micro_wake_word.cpp
@ -61,6 +61,29 @@ void MicroWakeWord::dump_config() {
 void MicroWakeWord::setup() {
  ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");
  // Register a callback that copies each chunk of microphone samples into the ring buffer.
  // Chunks are ignored unless the component is in the DETECTING_WAKE_WORD state.
  this->microphone_->add_data_callback([this](const std::vector<int16_t> &data) {
    if (this->state_ != State::DETECTING_WAKE_WORD) {
      return;
    }
    // Hold a second reference for the duration of the copy.
    std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
    if (this->ring_buffer_.use_count() == 2) {
      // mWW still owns the ring buffer and temp_ring_buffer does as well, proceed to copy audio into ring buffer
      const size_t bytes_incoming = data.size() * sizeof(int16_t);
      const size_t bytes_free = temp_ring_buffer->free();
      if (bytes_free < bytes_incoming) {
        // Report both quantities in bytes; cast explicitly so the arguments match the %u specifiers.
        ESP_LOGW(
            TAG,
            "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%u, incoming bytes=%u). "
            "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
            static_cast<unsigned>(bytes_free), static_cast<unsigned>(bytes_incoming));
        temp_ring_buffer->reset();
      }
      temp_ring_buffer->write((void *) data.data(), bytes_incoming);
    }
  });
  if (!this->register_streaming_ops_(this->streaming_op_resolver_)) {
    this->mark_failed();
    return;
@ -107,7 +130,6 @@ void MicroWakeWord::loop() {
      ESP_LOGD(TAG, "Starting Microphone");
      this->microphone_->start();
      this->set_state_(State::STARTING_MICROPHONE);
      this->high_freq_.start();
      break;
    case State::STARTING_MICROPHONE:
      if (this->microphone_->is_running()) {
@ -115,21 +137,19 @@ void MicroWakeWord::loop() {
      }
      break;
    case State::DETECTING_WAKE_WORD:
-      while (!this->has_enough_samples_()) {
+      while (this->has_enough_samples_()) {
-        this->read_microphone_();
+        this->update_model_probabilities_();
-      }
+        if (this->detect_wake_words_()) {
-      this->update_model_probabilities_();
+          ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str());
-      if (this->detect_wake_words_()) {
+          this->detected_ = true;
-        ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str());
+          this->set_state_(State::STOP_MICROPHONE);
-        this->detected_ = true;
+        }
        this->set_state_(State::STOP_MICROPHONE);
      }
      break;
    case State::STOP_MICROPHONE:
      ESP_LOGD(TAG, "Stopping Microphone");
      this->microphone_->stop();
      this->set_state_(State::STOPPING_MICROPHONE);
      this->high_freq_.stop();
      this->unload_models_();
      this->deallocate_buffers_();
      break;
@ -157,6 +177,11 @@ void MicroWakeWord::start() {
    return;
  }
  if (this->state_ != State::IDLE) {
    ESP_LOGW(TAG, "Wake word is already running");
    return;
  }
  if (!this->load_models_() || !this->allocate_buffers_()) {
    ESP_LOGE(TAG, "Failed to load the wake word model(s) or allocate buffers");
    this->status_set_error();
@ -169,11 +194,6 @@ void MicroWakeWord::start() {
    return;
  }
  if (this->state_ != State::IDLE) {
    ESP_LOGW(TAG, "Wake word is already running");
    return;
  }
  this->reset_states_();
  this->set_state_(State::START_MICROPHONE);
 }
@ -196,26 +216,6 @@ void MicroWakeWord::set_state_(State state) {
  this->state_ = state;
 }
 /// Reads up to INPUT_BUFFER_SIZE int16 samples from the microphone into input_buffer_ and
 /// forwards them to the ring buffer. If the ring buffer has less free space than the number of
 /// bytes read, a warning is logged and the ring buffer is reset before writing.
 /// @return 0 if the microphone returned no data, otherwise the value returned by the ring buffer write.
 size_t MicroWakeWord::read_microphone_() {
  const size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
  if (bytes_read == 0) {
    return 0;
  }
  const size_t bytes_free = this->ring_buffer_->free();
  if (bytes_free < bytes_read) {
    // Cast explicitly so the arguments match the %u specifiers (size_t is not int).
    ESP_LOGW(TAG,
             "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%u, incoming bytes=%u). "
             "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
             static_cast<unsigned>(bytes_free), static_cast<unsigned>(bytes_read));
    this->ring_buffer_->reset();
  }
  return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
 }
 bool MicroWakeWord::allocate_buffers_() {
  ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
@ -235,9 +235,9 @@ bool MicroWakeWord::allocate_buffers_() {
    }
  }
-  if (this->ring_buffer_ == nullptr) {
+  if (this->ring_buffer_.use_count() == 0) {
    this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
-    if (this->ring_buffer_ == nullptr) {
+    if (this->ring_buffer_.use_count() == 0) {
      ESP_LOGE(TAG, "Could not allocate ring buffer");
      return false;
    }
@ -248,10 +248,17 @@ bool MicroWakeWord::allocate_buffers_() {
 // Releases input_buffer_ and preprocessor_audio_buffer_ through an ExternalRAMAllocator<int16_t>,
 // clears both pointers, and resets ring_buffer_.
 // Lines prefixed with '-' / '+' below are the before / after sides of this diff hunk: the change
 // wraps each deallocation in a null check instead of deallocating unconditionally.
 void MicroWakeWord::deallocate_buffers_() {
  ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
-  audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
+  if (this->input_buffer_ != nullptr) {
-  this->input_buffer_ = nullptr;
+    audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
-  audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_());
+    this->input_buffer_ = nullptr;
-  this->preprocessor_audio_buffer_ = nullptr;
+  }
  if (this->preprocessor_audio_buffer_ != nullptr) {
    audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_());
    this->preprocessor_audio_buffer_ = nullptr;
  }
  // Drop this object's reference to the ring buffer.
  this->ring_buffer_.reset();
 }
 bool MicroWakeWord::load_models_() {
--- a/esphome/components/micro_wake_word/micro_wake_word.h
+++ b/esphome/components/micro_wake_word/micro_wake_word.h
@ -62,9 +62,8 @@ class MicroWakeWord : public Component {
  microphone::Microphone *microphone_{nullptr};
  Trigger<std::string> *wake_word_detected_trigger_ = new Trigger<std::string>();
  State state_{State::IDLE};
  HighFrequencyLoopRequester high_freq_;
-  std::unique_ptr<RingBuffer> ring_buffer_;
+  std::shared_ptr<RingBuffer> ring_buffer_;
  std::vector<WakeWordModel> wake_word_models_;
@ -98,15 +97,6 @@ class MicroWakeWord : public Component {
  /// @return True if enough samples, false otherwise.
  bool has_enough_samples_();
  /** Reads audio from microphone into the ring buffer
   *
   * Audio data (16 kHz with int16 samples) is read into the input_buffer_.
   * Verifies the ring buffer has enough space for all audio data. If not, it logs
   * a warning and resets the ring buffer entirely.
   * @return Number of bytes written to the ring buffer
   */
  size_t read_microphone_();
  /// @brief Allocates memory for input_buffer_, preprocessor_audio_buffer_, and ring_buffer_
  /// @return True if successful, false otherwise
  bool allocate_buffers_();