mirror of
https://github.com/esphome/esphome.git
synced 2025-07-28 06:06:33 +00:00
[micro_wake_word] Experimental cutoff adjustments and uses mic sample rate (#8702)
This commit is contained in:
parent
4d43caf6c1
commit
39b119e9cc
@ -22,8 +22,6 @@ static const ssize_t DETECTION_QUEUE_LENGTH = 5;
|
||||
static const size_t DATA_TIMEOUT_MS = 50;
|
||||
|
||||
static const uint32_t RING_BUFFER_DURATION_MS = 120;
|
||||
static const uint32_t RING_BUFFER_SAMPLES = RING_BUFFER_DURATION_MS * (AUDIO_SAMPLE_FREQUENCY / 1000);
|
||||
static const size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t);
|
||||
|
||||
static const uint32_t INFERENCE_TASK_STACK_SIZE = 3072;
|
||||
static const UBaseType_t INFERENCE_TASK_PRIORITY = 3;
|
||||
@ -141,13 +139,15 @@ void MicroWakeWord::inference_task(void *params) {
|
||||
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::TASK_STARTING);
|
||||
|
||||
{ // Ensures any C++ objects fall out of scope to deallocate before deleting the task
|
||||
const size_t new_samples_to_read = this_mww->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000);
|
||||
|
||||
const size_t new_bytes_to_process =
|
||||
this_mww->microphone_source_->get_audio_stream_info().ms_to_bytes(this_mww->features_step_size_);
|
||||
std::unique_ptr<audio::AudioSourceTransferBuffer> audio_buffer;
|
||||
int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE];
|
||||
|
||||
if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) {
|
||||
// Allocate audio transfer buffer
|
||||
audio_buffer = audio::AudioSourceTransferBuffer::create(new_samples_to_read * sizeof(int16_t));
|
||||
audio_buffer = audio::AudioSourceTransferBuffer::create(new_bytes_to_process);
|
||||
|
||||
if (audio_buffer == nullptr) {
|
||||
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY);
|
||||
@ -156,7 +156,8 @@ void MicroWakeWord::inference_task(void *params) {
|
||||
|
||||
if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) {
|
||||
// Allocate ring buffer
|
||||
std::shared_ptr<RingBuffer> temp_ring_buffer = RingBuffer::create(RING_BUFFER_SIZE);
|
||||
std::shared_ptr<RingBuffer> temp_ring_buffer = RingBuffer::create(
|
||||
this_mww->microphone_source_->get_audio_stream_info().ms_to_bytes(RING_BUFFER_DURATION_MS));
|
||||
if (temp_ring_buffer.use_count() == 0) {
|
||||
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY);
|
||||
}
|
||||
@ -171,13 +172,13 @@ void MicroWakeWord::inference_task(void *params) {
|
||||
while (!(xEventGroupGetBits(this_mww->event_group_) & COMMAND_STOP)) {
|
||||
audio_buffer->transfer_data_from_source(pdMS_TO_TICKS(DATA_TIMEOUT_MS));
|
||||
|
||||
if (audio_buffer->available() < new_samples_to_read * sizeof(int16_t)) {
|
||||
if (audio_buffer->available() < new_bytes_to_process) {
|
||||
// Insufficient data to generate new spectrogram features, read more next iteration
|
||||
continue;
|
||||
}
|
||||
|
||||
// Generate new spectrogram features
|
||||
size_t processed_samples = this_mww->generate_features_(
|
||||
uint32_t processed_samples = this_mww->generate_features_(
|
||||
(int16_t *) audio_buffer->get_buffer_start(), audio_buffer->available() / sizeof(int16_t), features_buffer);
|
||||
audio_buffer->decrease_buffer_length(processed_samples * sizeof(int16_t));
|
||||
|
||||
@ -297,7 +298,8 @@ void MicroWakeWord::loop() {
|
||||
if ((this->inference_task_handle_ == nullptr) && !this->status_has_error()) {
|
||||
// Set up preprocessor feature generator. If done in the task, it would lock the task to its initial core, as it
|
||||
// uses floating point operations.
|
||||
if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_, AUDIO_SAMPLE_FREQUENCY)) {
|
||||
if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_,
|
||||
this->microphone_source_->get_audio_stream_info().get_sample_rate())) {
|
||||
this->status_momentary_error(
|
||||
"Failed to allocate buffers for spectrogram feature processor, attempting again in 1 second", 1000);
|
||||
return;
|
||||
|
@ -121,8 +121,6 @@ class MicroWakeWord : public Component {
|
||||
/// @param audio_features (int8_t *) Buffer containing new spectrogram features
|
||||
/// @return True if successful, false if any errors were encountered
|
||||
bool update_model_probabilities_(const int8_t audio_features[PREPROCESSOR_FEATURE_SIZE]);
|
||||
|
||||
inline uint16_t new_samples_to_get_() { return (this->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000)); }
|
||||
};
|
||||
|
||||
} // namespace micro_wake_word
|
||||
|
@ -15,8 +15,6 @@ namespace micro_wake_word {
|
||||
static const uint8_t PREPROCESSOR_FEATURE_SIZE = 40;
|
||||
// Duration of each slice used as input into the preprocessor
|
||||
static const uint8_t FEATURE_DURATION_MS = 30;
|
||||
// Audio sample frequency in hertz
|
||||
static const uint16_t AUDIO_SAMPLE_FREQUENCY = 16000;
|
||||
|
||||
static const float FILTERBANK_LOWER_BAND_LIMIT = 125.0;
|
||||
static const float FILTERBANK_UPPER_BAND_LIMIT = 7500.0;
|
||||
|
@ -159,12 +159,13 @@ void StreamingModel::reset_probabilities() {
|
||||
this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION;
|
||||
}
|
||||
|
||||
WakeWordModel::WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t probability_cutoff,
|
||||
WakeWordModel::WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t default_probability_cutoff,
|
||||
size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size,
|
||||
bool default_enabled, bool internal_only) {
|
||||
this->id_ = id;
|
||||
this->model_start_ = model_start;
|
||||
this->probability_cutoff_ = probability_cutoff;
|
||||
this->default_probability_cutoff_ = default_probability_cutoff;
|
||||
this->probability_cutoff_ = default_probability_cutoff;
|
||||
this->sliding_window_size_ = sliding_window_average_size;
|
||||
this->recent_streaming_probabilities_.resize(sliding_window_average_size, 0);
|
||||
this->wake_word_ = wake_word;
|
||||
@ -222,10 +223,11 @@ DetectionEvent WakeWordModel::determine_detected() {
|
||||
return detection_event;
|
||||
}
|
||||
|
||||
VADModel::VADModel(const uint8_t *model_start, uint8_t probability_cutoff, size_t sliding_window_size,
|
||||
VADModel::VADModel(const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_size,
|
||||
size_t tensor_arena_size) {
|
||||
this->model_start_ = model_start;
|
||||
this->probability_cutoff_ = probability_cutoff;
|
||||
this->default_probability_cutoff_ = default_probability_cutoff;
|
||||
this->probability_cutoff_ = default_probability_cutoff;
|
||||
this->sliding_window_size_ = sliding_window_size;
|
||||
this->recent_streaming_probabilities_.resize(sliding_window_size, 0);
|
||||
this->tensor_arena_size_ = tensor_arena_size;
|
||||
|
@ -50,9 +50,14 @@ class StreamingModel {
|
||||
virtual void disable() { this->enabled_ = false; }
|
||||
|
||||
/// @brief Return true if the model is enabled.
|
||||
bool is_enabled() { return this->enabled_; }
|
||||
bool is_enabled() const { return this->enabled_; }
|
||||
|
||||
bool get_unprocessed_probability_status() { return this->unprocessed_probability_status_; }
|
||||
bool get_unprocessed_probability_status() const { return this->unprocessed_probability_status_; }
|
||||
|
||||
// Quantized probability cutoffs mapping 0.0 - 1.0 to 0 - 255
|
||||
uint8_t get_default_probability_cutoff() const { return this->default_probability_cutoff_; }
|
||||
uint8_t get_probability_cutoff() const { return this->probability_cutoff_; }
|
||||
void set_probability_cutoff(uint8_t probability_cutoff) { this->probability_cutoff_ = probability_cutoff; }
|
||||
|
||||
protected:
|
||||
/// @brief Allocates tensor and variable arenas and sets up the model interpreter
|
||||
@ -69,8 +74,10 @@ class StreamingModel {
|
||||
uint8_t current_stride_step_{0};
|
||||
int16_t ignore_windows_{-MIN_SLICES_BEFORE_DETECTION};
|
||||
|
||||
uint8_t probability_cutoff_; // Quantized probability cutoff mapping 0.0 - 1.0 to 0 - 255
|
||||
uint8_t default_probability_cutoff_;
|
||||
uint8_t probability_cutoff_;
|
||||
size_t sliding_window_size_;
|
||||
|
||||
size_t last_n_index_{0};
|
||||
size_t tensor_arena_size_;
|
||||
std::vector<uint8_t> recent_streaming_probabilities_;
|
||||
@ -88,14 +95,14 @@ class WakeWordModel final : public StreamingModel {
|
||||
/// @brief Constructs a wake word model object
|
||||
/// @param id (std::string) identifier for this model
|
||||
/// @param model_start (const uint8_t *) pointer to the start of the model's TFLite FlatBuffer
|
||||
/// @param probability_cutoff (uint8_t) probability cutoff for accepting that the wake word has been said
|
||||
/// @param default_probability_cutoff (uint8_t) probability cutoff for accepting that the wake word has been said
|
||||
/// @param sliding_window_average_size (size_t) the length of the sliding window computing the mean rolling
|
||||
/// probability
|
||||
/// @param wake_word (std::string) Friendly name of the wake word
|
||||
/// @param tensor_arena_size (size_t) Size in bytes for allocating the tensor arena
|
||||
/// @param default_enabled (bool) If true, it will be enabled by default on first boot
|
||||
/// @param internal_only (bool) If true, the model will not be exposed to HomeAssistant as an available model
|
||||
WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t probability_cutoff,
|
||||
WakeWordModel(const std::string &id, const uint8_t *model_start, uint8_t default_probability_cutoff,
|
||||
size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size,
|
||||
bool default_enabled, bool internal_only);
|
||||
|
||||
@ -132,7 +139,7 @@ class WakeWordModel final : public StreamingModel {
|
||||
|
||||
class VADModel final : public StreamingModel {
|
||||
public:
|
||||
VADModel(const uint8_t *model_start, uint8_t probability_cutoff, size_t sliding_window_size,
|
||||
VADModel(const uint8_t *model_start, uint8_t default_probability_cutoff, size_t sliding_window_size,
|
||||
size_t tensor_arena_size);
|
||||
|
||||
void log_model_config() override;
|
||||
|
Loading…
x
Reference in New Issue
Block a user