[micro_wake_word] Use microphone callback and avoid unnecessary allocation attempts (#8626)

2025-07-28 14:16:40 +00:00 · 2025-04-27 18:23:25 -05:00 · 2025-04-27 18:23:25 -05:00 · ee646d7324
commit ee646d7324
parent e557bca420
2 changed files with 49 additions and 52 deletions
--- a/esphome/components/micro_wake_word/micro_wake_word.cpp
+++ b/esphome/components/micro_wake_word/micro_wake_word.cpp
@ -61,6 +61,29 @@ void MicroWakeWord::dump_config() {
 void MicroWakeWord::setup() {
  ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");

+  this->microphone_->add_data_callback([this](const std::vector<int16_t> &data) {
+    if (this->state_ != State::DETECTING_WAKE_WORD) {
+      return;
+    }
+    std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
+    if (this->ring_buffer_.use_count() == 2) {
+      // mWW still owns the ring buffer and temp_ring_buffer does as well, proceed to copy audio into ring buffer
+
+      size_t bytes_free = temp_ring_buffer->free();
+
+      if (bytes_free < data.size() * sizeof(int16_t)) {
+        ESP_LOGW(
+            TAG,
+            "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
+            "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
+            bytes_free, data.size());
+
+        temp_ring_buffer->reset();
+      }
+      temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t));
+    }
+  });
+
  if (!this->register_streaming_ops_(this->streaming_op_resolver_)) {
    this->mark_failed();
    return;
@ -107,7 +130,6 @@ void MicroWakeWord::loop() {
      ESP_LOGD(TAG, "Starting Microphone");
      this->microphone_->start();
      this->set_state_(State::STARTING_MICROPHONE);
-      this->high_freq_.start();
      break;
    case State::STARTING_MICROPHONE:
      if (this->microphone_->is_running()) {
@ -115,21 +137,19 @@ void MicroWakeWord::loop() {
      }
      break;
    case State::DETECTING_WAKE_WORD:
-      while (!this->has_enough_samples_()) {
-        this->read_microphone_();
-      }
+      while (this->has_enough_samples_()) {
        this->update_model_probabilities_();
        if (this->detect_wake_words_()) {
          ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str());
          this->detected_ = true;
          this->set_state_(State::STOP_MICROPHONE);
        }
+      }
      break;
    case State::STOP_MICROPHONE:
      ESP_LOGD(TAG, "Stopping Microphone");
      this->microphone_->stop();
      this->set_state_(State::STOPPING_MICROPHONE);
-      this->high_freq_.stop();
      this->unload_models_();
      this->deallocate_buffers_();
      break;
@ -157,6 +177,11 @@ void MicroWakeWord::start() {
    return;
  }

+  if (this->state_ != State::IDLE) {
+    ESP_LOGW(TAG, "Wake word is already running");
+    return;
+  }
+
  if (!this->load_models_() || !this->allocate_buffers_()) {
    ESP_LOGE(TAG, "Failed to load the wake word model(s) or allocate buffers");
    this->status_set_error();
@ -169,11 +194,6 @@ void MicroWakeWord::start() {
    return;
  }

-  if (this->state_ != State::IDLE) {
-    ESP_LOGW(TAG, "Wake word is already running");
-    return;
-  }
-
  this->reset_states_();
  this->set_state_(State::START_MICROPHONE);
 }
@ -196,26 +216,6 @@ void MicroWakeWord::set_state_(State state) {
  this->state_ = state;
 }

-size_t MicroWakeWord::read_microphone_() {
-  size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
-  if (bytes_read == 0) {
-    return 0;
-  }
-
-  size_t bytes_free = this->ring_buffer_->free();
-
-  if (bytes_free < bytes_read) {
-    ESP_LOGW(TAG,
-             "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
-             "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
-             bytes_free, bytes_read);
-
-    this->ring_buffer_->reset();
-  }
-
-  return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
-}
-
 bool MicroWakeWord::allocate_buffers_() {
  ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);

@ -235,9 +235,9 @@ bool MicroWakeWord::allocate_buffers_() {
    }
  }

-  if (this->ring_buffer_ == nullptr) {
+  if (this->ring_buffer_.use_count() == 0) {
    this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
-    if (this->ring_buffer_ == nullptr) {
+    if (this->ring_buffer_.use_count() == 0) {
      ESP_LOGE(TAG, "Could not allocate ring buffer");
      return false;
    }
@ -248,12 +248,19 @@ bool MicroWakeWord::allocate_buffers_() {

 void MicroWakeWord::deallocate_buffers_() {
  ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
+  if (this->input_buffer_ != nullptr) {
    audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
    this->input_buffer_ = nullptr;
+  }
+
+  if (this->preprocessor_audio_buffer_ != nullptr) {
    audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_());
    this->preprocessor_audio_buffer_ = nullptr;
  }

+  this->ring_buffer_.reset();
+}
+
 bool MicroWakeWord::load_models_() {
  // Setup preprocesor feature generator
  if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_, AUDIO_SAMPLE_FREQUENCY)) {
--- a/esphome/components/micro_wake_word/micro_wake_word.h
+++ b/esphome/components/micro_wake_word/micro_wake_word.h
@ -62,9 +62,8 @@ class MicroWakeWord : public Component {
  microphone::Microphone *microphone_{nullptr};
  Trigger<std::string> *wake_word_detected_trigger_ = new Trigger<std::string>();
  State state_{State::IDLE};
-  HighFrequencyLoopRequester high_freq_;

-  std::unique_ptr<RingBuffer> ring_buffer_;
+  std::shared_ptr<RingBuffer> ring_buffer_;

  std::vector<WakeWordModel> wake_word_models_;

@ -98,15 +97,6 @@ class MicroWakeWord : public Component {
  /// @return True if enough samples, false otherwise.
  bool has_enough_samples_();

-  /** Reads audio from microphone into the ring buffer
-   *
-   * Audio data (16000 kHz with int16 samples) is read into the input_buffer_.
-   * Verifies the ring buffer has enough space for all audio data. If not, it logs
-   * a warning and resets the ring buffer entirely.
-   * @return Number of bytes written to the ring buffer
-   */
-  size_t read_microphone_();
-
  /// @brief Allocates memory for input_buffer_, preprocessor_audio_buffer_, and ring_buffer_
  /// @return True if successful, false otherwise
  bool allocate_buffers_();