From ee646d73247713fe45c539c8e22bf617ee96f4dd Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kevin.ahrendt@openhomefoundation.org>
Date: Sun, 27 Apr 2025 18:23:25 -0500
Subject: [PATCH] [micro_wake_word] Use microphone callback and avoid
 unnecessary allocation attempts (#8626)

---
 .../micro_wake_word/micro_wake_word.cpp       | 89 ++++++++++---------
 .../micro_wake_word/micro_wake_word.h         | 12 +--
 2 files changed, 49 insertions(+), 52 deletions(-)
diff --git a/esphome/components/micro_wake_word/micro_wake_word.cpp b/esphome/components/micro_wake_word/micro_wake_word.cpp
index b58c7ec434..533aa9fb75 100644
--- a/esphome/components/micro_wake_word/micro_wake_word.cpp
+++ b/esphome/components/micro_wake_word/micro_wake_word.cpp
@@ -61,6 +61,29 @@ void MicroWakeWord::dump_config() {
 void MicroWakeWord::setup() {
   ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");
 
+  this->microphone_->add_data_callback([this](const std::vector<int16_t> &data) {
+    if (this->state_ != State::DETECTING_WAKE_WORD) {
+      return;
+    }
+    std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
+    if (this->ring_buffer_.use_count() == 2) {
+      // mWW still owns the ring buffer and temp_ring_buffer does as well, proceed to copy audio into ring buffer
+
+      size_t bytes_free = temp_ring_buffer->free();
+
+      if (bytes_free < data.size() * sizeof(int16_t)) {
+        ESP_LOGW(
+            TAG,
+            "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
+            "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
+            bytes_free, data.size());
+
+        temp_ring_buffer->reset();
+      }
+      temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t));
+    }
+  });
+
   if (!this->register_streaming_ops_(this->streaming_op_resolver_)) {
     this->mark_failed();
     return;
@@ -107,7 +130,6 @@ void MicroWakeWord::loop() {
       ESP_LOGD(TAG, "Starting Microphone");
       this->microphone_->start();
       this->set_state_(State::STARTING_MICROPHONE);
-      this->high_freq_.start();
       break;
     case State::STARTING_MICROPHONE:
       if (this->microphone_->is_running()) {
@@ -115,21 +137,19 @@ void MicroWakeWord::loop() {
       }
       break;
     case State::DETECTING_WAKE_WORD:
-      while (!this->has_enough_samples_()) {
-        this->read_microphone_();
-      }
-      this->update_model_probabilities_();
-      if (this->detect_wake_words_()) {
-        ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str());
-        this->detected_ = true;
-        this->set_state_(State::STOP_MICROPHONE);
+      while (this->has_enough_samples_()) {
+        this->update_model_probabilities_();
+        if (this->detect_wake_words_()) {
+          ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str());
+          this->detected_ = true;
+          this->set_state_(State::STOP_MICROPHONE);
+        }
       }
       break;
     case State::STOP_MICROPHONE:
       ESP_LOGD(TAG, "Stopping Microphone");
       this->microphone_->stop();
       this->set_state_(State::STOPPING_MICROPHONE);
-      this->high_freq_.stop();
       this->unload_models_();
       this->deallocate_buffers_();
       break;
@@ -157,6 +177,11 @@ void MicroWakeWord::start() {
     return;
   }
 
+  if (this->state_ != State::IDLE) {
+    ESP_LOGW(TAG, "Wake word is already running");
+    return;
+  }
+
   if (!this->load_models_() || !this->allocate_buffers_()) {
     ESP_LOGE(TAG, "Failed to load the wake word model(s) or allocate buffers");
     this->status_set_error();
@@ -169,11 +194,6 @@ void MicroWakeWord::start() {
     return;
   }
 
-  if (this->state_ != State::IDLE) {
-    ESP_LOGW(TAG, "Wake word is already running");
-    return;
-  }
-
   this->reset_states_();
   this->set_state_(State::START_MICROPHONE);
 }
@@ -196,26 +216,6 @@ void MicroWakeWord::set_state_(State state) {
   this->state_ = state;
 }
 
-size_t MicroWakeWord::read_microphone_() {
-  size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
-  if (bytes_read == 0) {
-    return 0;
-  }
-
-  size_t bytes_free = this->ring_buffer_->free();
-
-  if (bytes_free < bytes_read) {
-    ESP_LOGW(TAG,
-             "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
-             "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
-             bytes_free, bytes_read);
-
-    this->ring_buffer_->reset();
-  }
-
-  return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
-}
-
 bool MicroWakeWord::allocate_buffers_() {
   ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
 
@@ -235,9 +235,9 @@ bool MicroWakeWord::allocate_buffers_() {
     }
   }
 
-  if (this->ring_buffer_ == nullptr) {
+  if (this->ring_buffer_.use_count() == 0) {
     this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
-    if (this->ring_buffer_ == nullptr) {
+    if (this->ring_buffer_.use_count() == 0) {
       ESP_LOGE(TAG, "Could not allocate ring buffer");
       return false;
     }
@@ -248,10 +248,17 @@ bool MicroWakeWord::allocate_buffers_() {
 
 void MicroWakeWord::deallocate_buffers_() {
   ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
-  audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
-  this->input_buffer_ = nullptr;
-  audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_());
-  this->preprocessor_audio_buffer_ = nullptr;
+  if (this->input_buffer_ != nullptr) {
+    audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
+    this->input_buffer_ = nullptr;
+  }
+
+  if (this->preprocessor_audio_buffer_ != nullptr) {
+    audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_());
+    this->preprocessor_audio_buffer_ = nullptr;
+  }
+
+  this->ring_buffer_.reset();
 }
 
 bool MicroWakeWord::load_models_() {
diff --git a/esphome/components/micro_wake_word/micro_wake_word.h b/esphome/components/micro_wake_word/micro_wake_word.h
index 0c805b75fc..443911b1e4 100644
--- a/esphome/components/micro_wake_word/micro_wake_word.h
+++ b/esphome/components/micro_wake_word/micro_wake_word.h
@@ -62,9 +62,8 @@ class MicroWakeWord : public Component {
   microphone::Microphone *microphone_{nullptr};
   Trigger<std::string> *wake_word_detected_trigger_ = new Trigger<std::string>();
   State state_{State::IDLE};
-  HighFrequencyLoopRequester high_freq_;
 
-  std::unique_ptr<RingBuffer> ring_buffer_;
+  std::shared_ptr<RingBuffer> ring_buffer_;
 
   std::vector<WakeWordModel> wake_word_models_;
 
@@ -98,15 +97,6 @@ class MicroWakeWord : public Component {
   /// @return True if enough samples, false otherwise.
   bool has_enough_samples_();
 
-  /** Reads audio from microphone into the ring buffer
-   *
-   * Audio data (16000 kHz with int16 samples) is read into the input_buffer_.
-   * Verifies the ring buffer has enough space for all audio data. If not, it logs
-   * a warning and resets the ring buffer entirely.
-   * @return Number of bytes written to the ring buffer
-   */
-  size_t read_microphone_();
-
   /// @brief Allocates memory for input_buffer_, preprocessor_audio_buffer_, and ring_buffer_
   /// @return True if successful, false otherwise
   bool allocate_buffers_();