From 39b119e9ccb32e9a03a2292c4363ca86037b25a3 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Tue, 6 May 2025 16:48:56 -0500 Subject: [PATCH] [micro_wake_word] Experimental cutoff adjustments and uses mic sample rate (#8702) --- .../micro_wake_word/micro_wake_word.cpp | 18 ++++++++++-------- .../micro_wake_word/micro_wake_word.h | 2 -- .../micro_wake_word/preprocessor_settings.h | 2 -- .../micro_wake_word/streaming_model.cpp | 10 ++++++---- .../micro_wake_word/streaming_model.h | 19 +++++++++++++------ 5 files changed, 29 insertions(+), 22 deletions(-) diff --git a/esphome/components/micro_wake_word/micro_wake_word.cpp b/esphome/components/micro_wake_word/micro_wake_word.cpp index 46ca328730..a44348fdc9 100644 --- a/esphome/components/micro_wake_word/micro_wake_word.cpp +++ b/esphome/components/micro_wake_word/micro_wake_word.cpp @@ -22,8 +22,6 @@ static const ssize_t DETECTION_QUEUE_LENGTH = 5; static const size_t DATA_TIMEOUT_MS = 50; static const uint32_t RING_BUFFER_DURATION_MS = 120; -static const uint32_t RING_BUFFER_SAMPLES = RING_BUFFER_DURATION_MS * (AUDIO_SAMPLE_FREQUENCY / 1000); -static const size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t); static const uint32_t INFERENCE_TASK_STACK_SIZE = 3072; static const UBaseType_t INFERENCE_TASK_PRIORITY = 3; @@ -141,13 +139,15 @@ void MicroWakeWord::inference_task(void *params) { xEventGroupSetBits(this_mww->event_group_, EventGroupBits::TASK_STARTING); { // Ensures any C++ objects fall out of scope to deallocate before deleting the task - const size_t new_samples_to_read = this_mww->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000); + + const size_t new_bytes_to_process = + this_mww->microphone_source_->get_audio_stream_info().ms_to_bytes(this_mww->features_step_size_); std::unique_ptr audio_buffer; int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE]; if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) { // Allocate audio transfer buffer - audio_buffer = audio::AudioSourceTransferBuffer::create(new_samples_to_read * sizeof(int16_t)); + audio_buffer = audio::AudioSourceTransferBuffer::create(new_bytes_to_process); if (audio_buffer == nullptr) { xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY); @@ -156,7 +156,8 @@ void MicroWakeWord::inference_task(void *params) { if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) { // Allocate ring buffer - std::shared_ptr temp_ring_buffer = RingBuffer::create(RING_BUFFER_SIZE); + std::shared_ptr temp_ring_buffer = RingBuffer::create( + this_mww->microphone_source_->get_audio_stream_info().ms_to_bytes(RING_BUFFER_DURATION_MS)); if (temp_ring_buffer.use_count() == 0) { xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY); } @@ -171,13 +172,13 @@ void MicroWakeWord::inference_task(void *params) { while (!(xEventGroupGetBits(this_mww->event_group_) & COMMAND_STOP)) { audio_buffer->transfer_data_from_source(pdMS_TO_TICKS(DATA_TIMEOUT_MS)); - if (audio_buffer->available() < new_samples_to_read * sizeof(int16_t)) { + if (audio_buffer->available() < new_bytes_to_process) { // Insufficient data to generate new spectrogram features, read more next iteration continue; } // Generate new spectrogram features - size_t processed_samples = this_mww->generate_features_( + uint32_t processed_samples = this_mww->generate_features_( (int16_t *) audio_buffer->get_buffer_start(), audio_buffer->available() / sizeof(int16_t), features_buffer); audio_buffer->decrease_buffer_length(processed_samples * sizeof(int16_t)); @@ -297,7 +298,8 @@ void MicroWakeWord::loop() { if ((this->inference_task_handle_ == nullptr) && !this->status_has_error()) { // Setup preprocesor feature generator. If done in the task, it would lock the task to its initial core, as it // uses floating point operations. - if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_, AUDIO_SAMPLE_FREQUENCY)) { + if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_, + this->microphone_source_->get_audio_stream_info().get_sample_rate())) { this->status_momentary_error( "Failed to allocate buffers for spectrogram feature processor, attempting again in 1 second", 1000); return; diff --git a/esphome/components/micro_wake_word/micro_wake_word.h b/esphome/components/micro_wake_word/micro_wake_word.h index 626b8bffb8..d46c40e48b 100644 --- a/esphome/components/micro_wake_word/micro_wake_word.h +++ b/esphome/components/micro_wake_word/micro_wake_word.h @@ -121,8 +121,6 @@ class MicroWakeWord : public Component { /// @param audio_features (int8_t *) Buffer containing new spectrogram features /// @return True if successful, false if any errors were encountered bool update_model_probabilities_(const int8_t audio_features[PREPROCESSOR_FEATURE_SIZE]); - - inline uint16_t new_samples_to_get_() { return (this->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000)); } }; } // namespace micro_wake_word diff --git a/esphome/components/micro_wake_word/preprocessor_settings.h b/esphome/components/micro_wake_word/preprocessor_settings.h index 025e21c5f7..3de21de92e 100644 --- a/esphome/components/micro_wake_word/preprocessor_settings.h +++ b/esphome/components/micro_wake_word/preprocessor_settings.h @@ -15,8 +15,6 @@ namespace micro_wake_word { static const uint8_t PREPROCESSOR_FEATURE_SIZE = 40; // Duration of each slice used as input into the preprocessor static const uint8_t FEATURE_DURATION_MS = 30; -// Audio sample frequency in hertz -static const uint16_t AUDIO_SAMPLE_FREQUENCY = 16000; static const float FILTERBANK_LOWER_BAND_LIMIT = 125.0; static const float FILTERBANK_UPPER_BAND_LIMIT = 7500.0; diff --git a/esphome/components/micro_wake_word/streaming_model.cpp b/esphome/components/micro_wake_word/streaming_model.cpp index 6512c0f569..ce3d8c2e4c 100644 --- a/esphome/components/micro_wake_word/streaming_model.cpp +++ b/esphome/components/micro_wake_word/streaming_model.cpp @@ -159,12 +159,13 @@ void StreamingModel::reset_probabilities() { this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION; } -WakeWordModel::WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t probability_cutoff, +WakeWordModel::WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size, bool default_enabled, bool internal_only) { this->id_ = id; this->model_start_ = model_start; - this->probability_cutoff_ = probability_cutoff; + this->default_probability_cutoff_ = default_probability_cutoff; + this->probability_cutoff_ = default_probability_cutoff; this->sliding_window_size_ = sliding_window_average_size; this->recent_streaming_probabilities_.resize(sliding_window_average_size, 0); this->wake_word_ = wake_word; @@ -222,10 +223,11 @@ DetectionEvent WakeWordModel::determine_detected() { return detection_event; } -VADModel::VADModel(const uint8_t *model_start, uint8_t probability_cutoff, size_t sliding_window_size, +VADModel::VADModel(const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_size, size_t tensor_arena_size) { this->model_start_ = model_start; - this->probability_cutoff_ = probability_cutoff; + this->default_probability_cutoff_ = default_probability_cutoff; + this->probability_cutoff_ = default_probability_cutoff; this->sliding_window_size_ = sliding_window_size; this->recent_streaming_probabilities_.resize(sliding_window_size, 0); this->tensor_arena_size_ = tensor_arena_size; diff --git a/esphome/components/micro_wake_word/streaming_model.h b/esphome/components/micro_wake_word/streaming_model.h index 5bd1cf356a..b7b22b9700 100644 --- a/esphome/components/micro_wake_word/streaming_model.h +++ b/esphome/components/micro_wake_word/streaming_model.h @@ -50,9 +50,14 @@ class StreamingModel { virtual void disable() { this->enabled_ = false; } /// @brief Return true if the model is enabled. - bool is_enabled() { return this->enabled_; } + bool is_enabled() const { return this->enabled_; } - bool get_unprocessed_probability_status() { return this->unprocessed_probability_status_; } + bool get_unprocessed_probability_status() const { return this->unprocessed_probability_status_; } + + // Quantized probability cutoffs mapping 0.0 - 1.0 to 0 - 255 + uint8_t get_default_probability_cutoff() const { return this->default_probability_cutoff_; } + uint8_t get_probability_cutoff() const { return this->probability_cutoff_; } + void set_probability_cutoff(uint8_t probability_cutoff) { this->probability_cutoff_ = probability_cutoff; } protected: /// @brief Allocates tensor and variable arenas and sets up the model interpreter @@ -69,8 +74,10 @@ class StreamingModel { uint8_t current_stride_step_{0}; int16_t ignore_windows_{-MIN_SLICES_BEFORE_DETECTION}; - uint8_t probability_cutoff_; // Quantized probability cutoff mapping 0.0 - 1.0 to 0 - 255 + uint8_t default_probability_cutoff_; + uint8_t probability_cutoff_; size_t sliding_window_size_; + size_t last_n_index_{0}; size_t tensor_arena_size_; std::vector recent_streaming_probabilities_; @@ -88,14 +95,14 @@ class WakeWordModel final : public StreamingModel { /// @brief Constructs a wake word model object /// @param id (std::string) identifier for this model /// @param model_start (const uint8_t *) pointer to the start of the model's TFLite FlatBuffer - /// @param probability_cutoff (uint8_t) probability cutoff for acceping the wake word has been said + /// @param default_probability_cutoff (uint8_t) probability cutoff for acceping the wake word has been said /// @param sliding_window_average_size (size_t) the length of the sliding window computing the mean rolling /// probability /// @param wake_word (std::string) Friendly name of the wake word /// @param tensor_arena_size (size_t) Size in bytes for allocating the tensor arena /// @param default_enabled (bool) If true, it will be enabled by default on first boot /// @param internal_only (bool) If true, the model will not be exposed to HomeAssistant as an available model - WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t probability_cutoff, + WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size, bool default_enabled, bool internal_only); @@ -132,7 +139,7 @@ class WakeWordModel final : public StreamingModel { class VADModel final : public StreamingModel { public: - VADModel(const uint8_t *model_start, uint8_t probability_cutoff, size_t sliding_window_size, + VADModel(const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_size, size_t tensor_arena_size); void log_model_config() override;