From ee646d73247713fe45c539c8e22bf617ee96f4dd Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Sun, 27 Apr 2025 18:23:25 -0500 Subject: [PATCH] [micro_wake_word] Use microphone callback and avoid unnecessary allocation attempts (#8626) --- .../micro_wake_word/micro_wake_word.cpp | 89 ++++++++++--------- .../micro_wake_word/micro_wake_word.h | 12 +-- 2 files changed, 49 insertions(+), 52 deletions(-) diff --git a/esphome/components/micro_wake_word/micro_wake_word.cpp b/esphome/components/micro_wake_word/micro_wake_word.cpp index b58c7ec434..533aa9fb75 100644 --- a/esphome/components/micro_wake_word/micro_wake_word.cpp +++ b/esphome/components/micro_wake_word/micro_wake_word.cpp @@ -61,6 +61,29 @@ void MicroWakeWord::dump_config() { void MicroWakeWord::setup() { ESP_LOGCONFIG(TAG, "Setting up microWakeWord..."); + this->microphone_->add_data_callback([this](const std::vector &data) { + if (this->state_ != State::DETECTING_WAKE_WORD) { + return; + } + std::shared_ptr temp_ring_buffer = this->ring_buffer_; + if (this->ring_buffer_.use_count() == 2) { + // mWW still owns the ring buffer and temp_ring_buffer does as well, proceed to copy audio into ring buffer + + size_t bytes_free = temp_ring_buffer->free(); + + if (bytes_free < data.size() * sizeof(int16_t)) { + ESP_LOGW( + TAG, + "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). " + "Resetting the ring buffer. Wake word detection accuracy will be reduced.", + bytes_free, data.size()); + + temp_ring_buffer->reset(); + } + temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t)); + } + }); + if (!this->register_streaming_ops_(this->streaming_op_resolver_)) { this->mark_failed(); return; @@ -107,7 +130,6 @@ void MicroWakeWord::loop() { ESP_LOGD(TAG, "Starting Microphone"); this->microphone_->start(); this->set_state_(State::STARTING_MICROPHONE); - this->high_freq_.start(); break; case State::STARTING_MICROPHONE: if (this->microphone_->is_running()) { @@ -115,21 +137,19 @@ void MicroWakeWord::loop() { } break; case State::DETECTING_WAKE_WORD: - while (!this->has_enough_samples_()) { - this->read_microphone_(); - } - this->update_model_probabilities_(); - if (this->detect_wake_words_()) { - ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str()); - this->detected_ = true; - this->set_state_(State::STOP_MICROPHONE); + while (this->has_enough_samples_()) { + this->update_model_probabilities_(); + if (this->detect_wake_words_()) { + ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str()); + this->detected_ = true; + this->set_state_(State::STOP_MICROPHONE); + } } break; case State::STOP_MICROPHONE: ESP_LOGD(TAG, "Stopping Microphone"); this->microphone_->stop(); this->set_state_(State::STOPPING_MICROPHONE); - this->high_freq_.stop(); this->unload_models_(); this->deallocate_buffers_(); break; @@ -157,6 +177,11 @@ void MicroWakeWord::start() { return; } + if (this->state_ != State::IDLE) { + ESP_LOGW(TAG, "Wake word is already running"); + return; + } + if (!this->load_models_() || !this->allocate_buffers_()) { ESP_LOGE(TAG, "Failed to load the wake word model(s) or allocate buffers"); this->status_set_error(); @@ -169,11 +194,6 @@ void MicroWakeWord::start() { return; } - if (this->state_ != State::IDLE) { - ESP_LOGW(TAG, "Wake word is already running"); - return; - } - this->reset_states_(); this->set_state_(State::START_MICROPHONE); } @@ -196,26 +216,6 @@ void MicroWakeWord::set_state_(State state) { this->state_ = state; } -size_t MicroWakeWord::read_microphone_() { - size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t)); - if (bytes_read == 0) { - return 0; - } - - size_t bytes_free = this->ring_buffer_->free(); - - if (bytes_free < bytes_read) { - ESP_LOGW(TAG, - "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). " - "Resetting the ring buffer. Wake word detection accuracy will be reduced.", - bytes_free, bytes_read); - - this->ring_buffer_->reset(); - } - - return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read); -} - bool MicroWakeWord::allocate_buffers_() { ExternalRAMAllocator audio_samples_allocator(ExternalRAMAllocator::ALLOW_FAILURE); @@ -235,9 +235,9 @@ bool MicroWakeWord::allocate_buffers_() { } } - if (this->ring_buffer_ == nullptr) { + if (this->ring_buffer_.use_count() == 0) { this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t)); - if (this->ring_buffer_ == nullptr) { + if (this->ring_buffer_.use_count() == 0) { ESP_LOGE(TAG, "Could not allocate ring buffer"); return false; } @@ -248,10 +248,17 @@ bool MicroWakeWord::allocate_buffers_() { void MicroWakeWord::deallocate_buffers_() { ExternalRAMAllocator audio_samples_allocator(ExternalRAMAllocator::ALLOW_FAILURE); - audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t)); - this->input_buffer_ = nullptr; - audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_()); - this->preprocessor_audio_buffer_ = nullptr; + if (this->input_buffer_ != nullptr) { + audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t)); + this->input_buffer_ = nullptr; + } + + if (this->preprocessor_audio_buffer_ != nullptr) { + audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_()); + this->preprocessor_audio_buffer_ = nullptr; + } + + this->ring_buffer_.reset(); } bool MicroWakeWord::load_models_() { diff --git a/esphome/components/micro_wake_word/micro_wake_word.h b/esphome/components/micro_wake_word/micro_wake_word.h index 0c805b75fc..443911b1e4 100644 --- a/esphome/components/micro_wake_word/micro_wake_word.h +++ b/esphome/components/micro_wake_word/micro_wake_word.h @@ -62,9 +62,8 @@ class MicroWakeWord : public Component { microphone::Microphone *microphone_{nullptr}; Trigger *wake_word_detected_trigger_ = new Trigger(); State state_{State::IDLE}; - HighFrequencyLoopRequester high_freq_; - std::unique_ptr ring_buffer_; + std::shared_ptr ring_buffer_; std::vector wake_word_models_; @@ -98,15 +97,6 @@ class MicroWakeWord : public Component { /// @return True if enough samples, false otherwise. bool has_enough_samples_(); - /** Reads audio from microphone into the ring buffer - * - * Audio data (16000 kHz with int16 samples) is read into the input_buffer_. - * Verifies the ring buffer has enough space for all audio data. If not, it logs - * a warning and resets the ring buffer entirely. - * @return Number of bytes written to the ring buffer - */ - size_t read_microphone_(); - /// @brief Allocates memory for input_buffer_, preprocessor_audio_buffer_, and ring_buffer_ /// @return True if successful, false otherwise bool allocate_buffers_();