[micro_wake_word] Experimental cutoff adjustments and uses mic sample rate (#8702)

Author: Kevin Ahrendt, 2025-05-06 16:48:56 -05:00 (committed by GitHub)
parent 4d43caf6c1
commit 39b119e9cc
5 changed files with 29 additions and 22 deletions


@@ -22,8 +22,6 @@ static const ssize_t DETECTION_QUEUE_LENGTH = 5;
static const size_t DATA_TIMEOUT_MS = 50;
static const uint32_t RING_BUFFER_DURATION_MS = 120;
-static const uint32_t RING_BUFFER_SAMPLES = RING_BUFFER_DURATION_MS * (AUDIO_SAMPLE_FREQUENCY / 1000);
-static const size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t);
static const uint32_t INFERENCE_TASK_STACK_SIZE = 3072;
static const UBaseType_t INFERENCE_TASK_PRIORITY = 3;
@@ -141,13 +139,15 @@ void MicroWakeWord::inference_task(void *params) {
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::TASK_STARTING);
{ // Ensures any C++ objects fall out of scope to deallocate before deleting the task
-const size_t new_samples_to_read = this_mww->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000);
+const size_t new_bytes_to_process =
+    this_mww->microphone_source_->get_audio_stream_info().ms_to_bytes(this_mww->features_step_size_);
std::unique_ptr<audio::AudioSourceTransferBuffer> audio_buffer;
int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE];
if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) {
// Allocate audio transfer buffer
-audio_buffer = audio::AudioSourceTransferBuffer::create(new_samples_to_read * sizeof(int16_t));
+audio_buffer = audio::AudioSourceTransferBuffer::create(new_bytes_to_process);
if (audio_buffer == nullptr) {
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY);
@@ -156,7 +156,8 @@ void MicroWakeWord::inference_task(void *params) {
if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) {
// Allocate ring buffer
-std::shared_ptr<RingBuffer> temp_ring_buffer = RingBuffer::create(RING_BUFFER_SIZE);
+std::shared_ptr<RingBuffer> temp_ring_buffer = RingBuffer::create(
+    this_mww->microphone_source_->get_audio_stream_info().ms_to_bytes(RING_BUFFER_DURATION_MS));
if (temp_ring_buffer.use_count() == 0) {
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY);
}
@@ -171,13 +172,13 @@ void MicroWakeWord::inference_task(void *params) {
while (!(xEventGroupGetBits(this_mww->event_group_) & COMMAND_STOP)) {
audio_buffer->transfer_data_from_source(pdMS_TO_TICKS(DATA_TIMEOUT_MS));
-if (audio_buffer->available() < new_samples_to_read * sizeof(int16_t)) {
+if (audio_buffer->available() < new_bytes_to_process) {
// Insufficient data to generate new spectrogram features, read more next iteration
continue;
}
// Generate new spectrogram features
-size_t processed_samples = this_mww->generate_features_(
+uint32_t processed_samples = this_mww->generate_features_(
(int16_t *) audio_buffer->get_buffer_start(), audio_buffer->available() / sizeof(int16_t), features_buffer);
audio_buffer->decrease_buffer_length(processed_samples * sizeof(int16_t));
@@ -297,7 +298,8 @@ void MicroWakeWord::loop() {
if ((this->inference_task_handle_ == nullptr) && !this->status_has_error()) {
// Setup preprocessor feature generator. If done in the task, it would lock the task to its initial core, as it
// uses floating point operations.
-if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_, AUDIO_SAMPLE_FREQUENCY)) {
+if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_,
+    this->microphone_source_->get_audio_stream_info().get_sample_rate())) {
this->status_momentary_error(
"Failed to allocate buffers for spectrogram feature processor, attempting again in 1 second", 1000);
return;
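
The hunks above replace arithmetic that hard-coded a 16 kHz, 16-bit assumption (AUDIO_SAMPLE_FREQUENCY, RING_BUFFER_SAMPLES, RING_BUFFER_SIZE) with calls that ask the microphone source's stream info to do the conversion. Below is a minimal sketch of what a duration-to-bytes helper of that shape computes; the struct and field names are illustrative stand-ins, not the actual audio::AudioStreamInfo API.

    #include <cstdint>

    // Illustrative stand-in for the stream parameters a microphone source reports;
    // the field names are assumptions made for this sketch only.
    struct StreamInfoSketch {
      uint32_t sample_rate;     // samples per second, no longer assumed to be 16000
      uint8_t channels;         // 1 for a mono wake-word microphone
      uint8_t bits_per_sample;  // 16 for int16_t samples

      // Duration-to-size conversion of the kind ms_to_bytes() performs:
      // milliseconds * samples per millisecond * bytes per audio frame.
      uint32_t ms_to_bytes(uint32_t ms) const {
        const uint32_t bytes_per_frame = (bits_per_sample / 8) * channels;
        return ms * (sample_rate / 1000) * bytes_per_frame;
      }
    };

For a 16 kHz, mono, 16-bit stream, ms_to_bytes(RING_BUFFER_DURATION_MS) is 120 * 16 * 2 = 3840 bytes, the same value the removed RING_BUFFER_SIZE constant produced; a microphone configured at a different sample rate now sizes the ring buffer and transfer buffer to match.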


@@ -121,8 +121,6 @@ class MicroWakeWord : public Component {
/// @param audio_features (int8_t *) Buffer containing new spectrogram features
/// @return True if successful, false if any errors were encountered
bool update_model_probabilities_(const int8_t audio_features[PREPROCESSOR_FEATURE_SIZE]);
-inline uint16_t new_samples_to_get_() { return (this->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000)); }
};
} // namespace micro_wake_word


@@ -15,8 +15,6 @@ namespace micro_wake_word {
static const uint8_t PREPROCESSOR_FEATURE_SIZE = 40;
// Duration of each slice used as input into the preprocessor
static const uint8_t FEATURE_DURATION_MS = 30;
-// Audio sample frequency in hertz
-static const uint16_t AUDIO_SAMPLE_FREQUENCY = 16000;
static const float FILTERBANK_LOWER_BAND_LIMIT = 125.0;
static const float FILTERBANK_UPPER_BAND_LIMIT = 7500.0;


@@ -159,12 +159,13 @@ void StreamingModel::reset_probabilities() {
this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION;
}
-WakeWordModel::WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t probability_cutoff,
+WakeWordModel::WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t default_probability_cutoff,
size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size,
bool default_enabled, bool internal_only) {
this->id_ = id;
this->model_start_ = model_start;
-this->probability_cutoff_ = probability_cutoff;
+this->default_probability_cutoff_ = default_probability_cutoff;
+this->probability_cutoff_ = default_probability_cutoff;
this->sliding_window_size_ = sliding_window_average_size;
this->recent_streaming_probabilities_.resize(sliding_window_average_size, 0);
this->wake_word_ = wake_word;
@@ -222,10 +223,11 @@ DetectionEvent WakeWordModel::determine_detected() {
return detection_event;
}
-VADModel::VADModel(const uint8_t *model_start, uint8_t probability_cutoff, size_t sliding_window_size,
+VADModel::VADModel(const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_size,
size_t tensor_arena_size) {
this->model_start_ = model_start;
-this->probability_cutoff_ = probability_cutoff;
+this->default_probability_cutoff_ = default_probability_cutoff;
+this->probability_cutoff_ = default_probability_cutoff;
this->sliding_window_size_ = sliding_window_size;
this->recent_streaming_probabilities_.resize(sliding_window_size, 0);
this->tensor_arena_size_ = tensor_arena_size;
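
Both constructors now receive the cutoff already quantized, keep it as default_probability_cutoff_, and seed the mutable probability_cutoff_ from it. Below is a sketch of the 0.0 - 1.0 to 0 - 255 mapping described in streaming_model.h; the rounding and clamping choices here are assumptions about how a configured float cutoff would be quantized, not necessarily what the code generation layer does.

    #include <cmath>
    #include <cstdint>

    // Map a configured probability cutoff in [0.0, 1.0] onto the uint8_t range the
    // model constructors expect, clamping out-of-range input.
    uint8_t quantize_cutoff(float cutoff) {
      if (cutoff <= 0.0f)
        return 0;
      if (cutoff >= 1.0f)
        return 255;
      return static_cast<uint8_t>(std::lround(cutoff * 255.0f));
    }

For example, a configured cutoff of 0.97 quantizes to 247; dividing the stored uint8_t by 255.0f recovers the approximate float value when reporting the cutoff back.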


@@ -50,9 +50,14 @@ class StreamingModel {
virtual void disable() { this->enabled_ = false; }
/// @brief Return true if the model is enabled.
-bool is_enabled() { return this->enabled_; }
+bool is_enabled() const { return this->enabled_; }
-bool get_unprocessed_probability_status() { return this->unprocessed_probability_status_; }
+bool get_unprocessed_probability_status() const { return this->unprocessed_probability_status_; }
+// Quantized probability cutoffs mapping 0.0 - 1.0 to 0 - 255
+uint8_t get_default_probability_cutoff() const { return this->default_probability_cutoff_; }
+uint8_t get_probability_cutoff() const { return this->probability_cutoff_; }
+void set_probability_cutoff(uint8_t probability_cutoff) { this->probability_cutoff_ = probability_cutoff; }
protected:
/// @brief Allocates tensor and variable arenas and sets up the model interpreter
@@ -69,8 +74,10 @@ class StreamingModel {
uint8_t current_stride_step_{0};
int16_t ignore_windows_{-MIN_SLICES_BEFORE_DETECTION};
-uint8_t probability_cutoff_; // Quantized probability cutoff mapping 0.0 - 1.0 to 0 - 255
+uint8_t default_probability_cutoff_;
+uint8_t probability_cutoff_;
size_t sliding_window_size_;
size_t last_n_index_{0};
size_t tensor_arena_size_;
std::vector<uint8_t> recent_streaming_probabilities_;
@@ -88,14 +95,14 @@ class WakeWordModel final : public StreamingModel {
/// @brief Constructs a wake word model object
/// @param id (std::string) identifier for this model
/// @param model_start (const uint8_t *) pointer to the start of the model's TFLite FlatBuffer
-/// @param probability_cutoff (uint8_t) probability cutoff for accepting the wake word has been said
+/// @param default_probability_cutoff (uint8_t) probability cutoff for accepting the wake word has been said
/// @param sliding_window_average_size (size_t) the length of the sliding window computing the mean rolling
/// probability
/// @param wake_word (std::string) Friendly name of the wake word
/// @param tensor_arena_size (size_t) Size in bytes for allocating the tensor arena
/// @param default_enabled (bool) If true, it will be enabled by default on first boot
/// @param internal_only (bool) If true, the model will not be exposed to HomeAssistant as an available model
-WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t probability_cutoff,
+WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t default_probability_cutoff,
size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size,
bool default_enabled, bool internal_only);
@@ -132,7 +139,7 @@ class WakeWordModel final : public StreamingModel {
class VADModel final : public StreamingModel {
public:
-VADModel(const uint8_t *model_start, uint8_t probability_cutoff, size_t sliding_window_size,
+VADModel(const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_size,
size_t tensor_arena_size);
void log_model_config() override;
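
The new accessors make the cutoff adjustable at runtime while the stored default remains available to fall back to. Below is a usage sketch against the StreamingModel interface shown above; the 10% tightening factor and the two free functions are purely illustrative, and namespace qualifiers are omitted for brevity.

    #include <cstdint>

    // Assumes streaming_model.h is available and a StreamingModel reference has
    // been obtained elsewhere (e.g. from the micro_wake_word component).

    // Raise the cutoff above its compiled-in default, clamped to the quantized
    // uint8_t range, to reduce false accepts while experimenting.
    void tighten_cutoff(StreamingModel &model) {
      uint32_t raised = (static_cast<uint32_t>(model.get_default_probability_cutoff()) * 110u) / 100u;
      model.set_probability_cutoff(static_cast<uint8_t>(raised > 255u ? 255u : raised));
    }

    // Undo any experimental adjustment by restoring the default cutoff.
    void restore_cutoff(StreamingModel &model) {
      model.set_probability_cutoff(model.get_default_probability_cutoff());
    }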