From 9f629dcaa245053d313f9db26c778ca33c27541c Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Tue, 29 Apr 2025 17:27:03 -0500 Subject: [PATCH] [i2s_audio, microphone, micro_wake_word, voice_assistant] Use microphone source to process incoming audio (#8645) Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com> --- .../i2s_audio/microphone/__init__.py | 45 ++++++++-- .../microphone/i2s_audio_microphone.cpp | 87 +++++++++---------- .../microphone/i2s_audio_microphone.h | 4 +- .../components/micro_wake_word/__init__.py | 27 +++++- .../micro_wake_word/micro_wake_word.cpp | 14 +-- .../micro_wake_word/micro_wake_word.h | 8 +- esphome/components/microphone/__init__.py | 8 +- esphome/components/microphone/automation.h | 4 +- esphome/components/microphone/microphone.h | 5 +- .../microphone/microphone_source.cpp | 4 +- .../components/voice_assistant/__init__.py | 26 +++++- .../voice_assistant/voice_assistant.cpp | 20 ++--- .../voice_assistant/voice_assistant.h | 6 +- tests/components/micro_wake_word/common.yaml | 1 + tests/components/voice_assistant/common.yaml | 5 +- 15 files changed, 166 insertions(+), 98 deletions(-) diff --git a/esphome/components/i2s_audio/microphone/__init__.py b/esphome/components/i2s_audio/microphone/__init__.py index 4950a25751..06eb29986d 100644 --- a/esphome/components/i2s_audio/microphone/__init__.py +++ b/esphome/components/i2s_audio/microphone/__init__.py @@ -1,13 +1,20 @@ from esphome import pins import esphome.codegen as cg -from esphome.components import esp32, microphone +from esphome.components import audio, esp32, microphone from esphome.components.adc import ESP32_VARIANT_ADC1_PIN_TO_CHANNEL, validate_adc_pin import esphome.config_validation as cv -from esphome.const import CONF_ID, CONF_NUMBER +from esphome.const import ( + CONF_BITS_PER_SAMPLE, + CONF_CHANNEL, + CONF_ID, + CONF_NUM_CHANNELS, + CONF_NUMBER, + CONF_SAMPLE_RATE, +) from .. import ( - CONF_CHANNEL, CONF_I2S_DIN_PIN, + CONF_LEFT, CONF_MONO, CONF_RIGHT, I2SAudioIn, @@ -32,7 +39,7 @@ INTERNAL_ADC_VARIANTS = [esp32.const.VARIANT_ESP32] PDM_VARIANTS = [esp32.const.VARIANT_ESP32, esp32.const.VARIANT_ESP32S3] -def validate_esp32_variant(config): +def _validate_esp32_variant(config): variant = esp32.get_esp32_variant() if config[CONF_ADC_TYPE] == "external": if config[CONF_PDM]: @@ -46,12 +53,34 @@ def validate_esp32_variant(config): raise NotImplementedError -def validate_channel(config): +def _validate_channel(config): if config[CONF_CHANNEL] == CONF_MONO: raise cv.Invalid(f"I2S microphone does not support {CONF_MONO}.") return config +def _set_num_channels_from_config(config): + if config[CONF_CHANNEL] in (CONF_LEFT, CONF_RIGHT): + config[CONF_NUM_CHANNELS] = 1 + else: + config[CONF_NUM_CHANNELS] = 2 + + return config + + +def _set_stream_limits(config): + audio.set_stream_limits( + min_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE), + max_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE), + min_channels=config.get(CONF_NUM_CHANNELS), + max_channels=config.get(CONF_NUM_CHANNELS), + min_sample_rate=config.get(CONF_SAMPLE_RATE), + max_sample_rate=config.get(CONF_SAMPLE_RATE), + )(config) + + return config + + BASE_SCHEMA = microphone.MICROPHONE_SCHEMA.extend( i2s_audio_component_schema( I2SAudioMicrophone, @@ -79,8 +108,10 @@ CONFIG_SCHEMA = cv.All( }, key=CONF_ADC_TYPE, ), - validate_esp32_variant, - validate_channel, + _validate_esp32_variant, + _validate_channel, + _set_num_channels_from_config, + _set_stream_limits, ) diff --git a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp index 3ab3c88142..78a7f92c2f 100644 --- a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp +++ b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp @@ -56,6 +56,35 @@ void I2SAudioMicrophone::start_() { } esp_err_t err; + uint8_t channel_count = 1; +#ifdef USE_I2S_LEGACY + uint8_t bits_per_sample = this->bits_per_sample_; + + if (this->channel_ == I2S_CHANNEL_FMT_RIGHT_LEFT) { + channel_count = 2; + } +#else + if (this->slot_bit_width_ == I2S_SLOT_BIT_WIDTH_AUTO) { + this->slot_bit_width_ = I2S_SLOT_BIT_WIDTH_16BIT; + } + uint8_t bits_per_sample = this->slot_bit_width_; + + if (this->slot_mode_ == I2S_SLOT_MODE_STEREO) { + channel_count = 2; + } +#endif + +#ifdef USE_ESP32_VARIANT_ESP32 + // ESP32 reads audio aligned to a multiple of 2 bytes. For example, if configured for 24 bits per sample, then it will + // produce 32 bits per sample, where the actual data is in the most significant bits. Other ESP32 variants produce 24 + // bits per sample in this situation. + if (bits_per_sample < 16) { + bits_per_sample = 16; + } else if ((bits_per_sample > 16) && (bits_per_sample <= 32)) { + bits_per_sample = 32; + } +#endif + #ifdef USE_I2S_LEGACY i2s_driver_config_t config = { .mode = (i2s_mode_t) (this->i2s_mode_ | I2S_MODE_RX), @@ -144,6 +173,8 @@ void I2SAudioMicrophone::start_() { i2s_std_gpio_config_t pin_config = this->parent_->get_pin_config(); #if SOC_I2S_SUPPORTS_PDM_RX if (this->pdm_) { + bits_per_sample = 16; // PDM mics are always 16 bits per sample with the IDF 5 driver + i2s_pdm_rx_clk_config_t clk_cfg = { .sample_rate_hz = this->sample_rate_, .clk_src = clk_src, @@ -187,13 +218,8 @@ void I2SAudioMicrophone::start_() { .clk_src = clk_src, .mclk_multiple = I2S_MCLK_MULTIPLE_256, }; - i2s_data_bit_width_t data_bit_width; - if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_8BIT) { - data_bit_width = I2S_DATA_BIT_WIDTH_16BIT; - } else { - data_bit_width = I2S_DATA_BIT_WIDTH_8BIT; - } - i2s_std_slot_config_t std_slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(data_bit_width, this->slot_mode_); + i2s_std_slot_config_t std_slot_cfg = + I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) this->slot_bit_width_, this->slot_mode_); std_slot_cfg.slot_bit_width = this->slot_bit_width_; std_slot_cfg.slot_mask = this->std_slot_mask_; @@ -222,6 +248,8 @@ void I2SAudioMicrophone::start_() { } #endif + this->audio_stream_info_ = audio::AudioStreamInfo(bits_per_sample, channel_count, this->sample_rate_); + this->state_ = microphone::STATE_RUNNING; this->high_freq_.start(); this->status_clear_error(); @@ -284,7 +312,7 @@ void I2SAudioMicrophone::stop_() { this->status_clear_error(); } -size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wait) { +size_t I2SAudioMicrophone::read_(uint8_t *buf, size_t len, TickType_t ticks_to_wait) { size_t bytes_read = 0; #ifdef USE_I2S_LEGACY esp_err_t err = i2s_read(this->parent_->get_port(), buf, len, &bytes_read, ticks_to_wait); @@ -303,38 +331,7 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wa return 0; } this->status_clear_warning(); - // ESP-IDF I2S implementation right-extends 8-bit data to 16 bits, - // and 24-bit data to 32 bits. -#ifdef USE_I2S_LEGACY - switch (this->bits_per_sample_) { - case I2S_BITS_PER_SAMPLE_8BIT: - case I2S_BITS_PER_SAMPLE_16BIT: - return bytes_read; - case I2S_BITS_PER_SAMPLE_24BIT: - case I2S_BITS_PER_SAMPLE_32BIT: { - size_t samples_read = bytes_read / sizeof(int32_t); - for (size_t i = 0; i < samples_read; i++) { - int32_t temp = reinterpret_cast(buf)[i] >> 14; - buf[i] = clamp(temp, INT16_MIN, INT16_MAX); - } - return samples_read * sizeof(int16_t); - } - default: - ESP_LOGE(TAG, "Unsupported bits per sample: %d", this->bits_per_sample_); - return 0; - } -#else -#ifndef USE_ESP32_VARIANT_ESP32 - // For newer ESP32 variants 8 bit data needs to be extended to 16 bit. - if (this->slot_bit_width_ == I2S_SLOT_BIT_WIDTH_8BIT) { - size_t samples_read = bytes_read / sizeof(int8_t); - for (size_t i = samples_read - 1; i >= 0; i--) { - int16_t temp = static_cast(reinterpret_cast(buf)[i]) << 8; - buf[i] = temp; - } - return samples_read * sizeof(int16_t); - } -#else +#if defined(USE_ESP32_VARIANT_ESP32) and not defined(USE_I2S_LEGACY) // For ESP32 8/16 bit standard mono mode samples need to be switched. if (this->slot_mode_ == I2S_SLOT_MODE_MONO && this->slot_bit_width_ <= 16 && !this->pdm_) { size_t samples_read = bytes_read / sizeof(int16_t); @@ -346,14 +343,14 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wa } #endif return bytes_read; -#endif } void I2SAudioMicrophone::read_() { - std::vector samples; - samples.resize(BUFFER_SIZE); - size_t bytes_read = this->read(samples.data(), BUFFER_SIZE * sizeof(int16_t), 0); - samples.resize(bytes_read / sizeof(int16_t)); + std::vector samples; + const size_t bytes_to_read = this->audio_stream_info_.ms_to_bytes(32); + samples.resize(bytes_to_read); + size_t bytes_read = this->read_(samples.data(), bytes_to_read, 0); + samples.resize(bytes_read); this->data_callbacks_.call(samples); } diff --git a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.h b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.h index 2dbacb447e..072d312e0f 100644 --- a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.h +++ b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.h @@ -25,9 +25,6 @@ class I2SAudioMicrophone : public I2SAudioIn, public microphone::Microphone, pub void set_pdm(bool pdm) { this->pdm_ = pdm; } - size_t read(int16_t *buf, size_t len, TickType_t ticks_to_wait); - size_t read(int16_t *buf, size_t len) override { return this->read(buf, len, pdMS_TO_TICKS(100)); } - #ifdef USE_I2S_LEGACY #if SOC_I2S_SUPPORTS_ADC void set_adc_channel(adc1_channel_t channel) { @@ -41,6 +38,7 @@ class I2SAudioMicrophone : public I2SAudioIn, public microphone::Microphone, pub void start_(); void stop_(); void read_(); + size_t read_(uint8_t *buf, size_t len, TickType_t ticks_to_wait); #ifdef USE_I2S_LEGACY int8_t din_pin_{I2S_PIN_NO_CHANGE}; diff --git a/esphome/components/micro_wake_word/__init__.py b/esphome/components/micro_wake_word/__init__.py index 0862406e46..9d5caca937 100644 --- a/esphome/components/micro_wake_word/__init__.py +++ b/esphome/components/micro_wake_word/__init__.py @@ -328,7 +328,14 @@ CONFIG_SCHEMA = cv.All( cv.Schema( { cv.GenerateID(): cv.declare_id(MicroWakeWord), - cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone), + cv.Optional( + CONF_MICROPHONE, default={} + ): microphone.microphone_source_schema( + min_bits_per_sample=16, + max_bits_per_sample=16, + min_channels=1, + max_channels=1, + ), cv.Required(CONF_MODELS): cv.ensure_list( cv.maybe_simple_value(MODEL_SCHEMA, key=CONF_MODEL) ), @@ -404,15 +411,27 @@ def _feature_step_size_validate(config): raise cv.Invalid("Cannot load models with different features step sizes.") -FINAL_VALIDATE_SCHEMA = _feature_step_size_validate +FINAL_VALIDATE_SCHEMA = cv.All( + cv.Schema( + { + cv.Required( + CONF_MICROPHONE + ): microphone.final_validate_microphone_source_schema( + "micro_wake_word", sample_rate=16000 + ), + }, + extra=cv.ALLOW_EXTRA, + ), + _feature_step_size_validate, +) async def to_code(config): var = cg.new_Pvariable(config[CONF_ID]) await cg.register_component(var, config) - mic = await cg.get_variable(config[CONF_MICROPHONE]) - cg.add(var.set_microphone(mic)) + mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE]) + cg.add(var.set_microphone_source(mic_source)) esp32.add_idf_component( name="esp-tflite-micro", diff --git a/esphome/components/micro_wake_word/micro_wake_word.cpp b/esphome/components/micro_wake_word/micro_wake_word.cpp index 533aa9fb75..dd1a8be378 100644 --- a/esphome/components/micro_wake_word/micro_wake_word.cpp +++ b/esphome/components/micro_wake_word/micro_wake_word.cpp @@ -61,7 +61,7 @@ void MicroWakeWord::dump_config() { void MicroWakeWord::setup() { ESP_LOGCONFIG(TAG, "Setting up microWakeWord..."); - this->microphone_->add_data_callback([this](const std::vector &data) { + this->microphone_source_->add_data_callback([this](const std::vector &data) { if (this->state_ != State::DETECTING_WAKE_WORD) { return; } @@ -71,7 +71,7 @@ void MicroWakeWord::setup() { size_t bytes_free = temp_ring_buffer->free(); - if (bytes_free < data.size() * sizeof(int16_t)) { + if (bytes_free < data.size()) { ESP_LOGW( TAG, "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). " @@ -80,7 +80,7 @@ void MicroWakeWord::setup() { temp_ring_buffer->reset(); } - temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t)); + temp_ring_buffer->write((void *) data.data(), data.size()); } }); @@ -128,11 +128,11 @@ void MicroWakeWord::loop() { break; case State::START_MICROPHONE: ESP_LOGD(TAG, "Starting Microphone"); - this->microphone_->start(); + this->microphone_source_->start(); this->set_state_(State::STARTING_MICROPHONE); break; case State::STARTING_MICROPHONE: - if (this->microphone_->is_running()) { + if (this->microphone_source_->is_running()) { this->set_state_(State::DETECTING_WAKE_WORD); } break; @@ -148,13 +148,13 @@ void MicroWakeWord::loop() { break; case State::STOP_MICROPHONE: ESP_LOGD(TAG, "Stopping Microphone"); - this->microphone_->stop(); + this->microphone_source_->stop(); this->set_state_(State::STOPPING_MICROPHONE); this->unload_models_(); this->deallocate_buffers_(); break; case State::STOPPING_MICROPHONE: - if (this->microphone_->is_stopped()) { + if (this->microphone_source_->is_stopped()) { this->set_state_(State::IDLE); if (this->detected_) { this->wake_word_detected_trigger_->trigger(this->detected_wake_word_); diff --git a/esphome/components/micro_wake_word/micro_wake_word.h b/esphome/components/micro_wake_word/micro_wake_word.h index 443911b1e4..b06d35ca1f 100644 --- a/esphome/components/micro_wake_word/micro_wake_word.h +++ b/esphome/components/micro_wake_word/micro_wake_word.h @@ -9,7 +9,7 @@ #include "esphome/core/component.h" #include "esphome/core/ring_buffer.h" -#include "esphome/components/microphone/microphone.h" +#include "esphome/components/microphone/microphone_source.h" #include @@ -46,7 +46,9 @@ class MicroWakeWord : public Component { void set_features_step_size(uint8_t step_size) { this->features_step_size_ = step_size; } - void set_microphone(microphone::Microphone *microphone) { this->microphone_ = microphone; } + void set_microphone_source(microphone::MicrophoneSource *microphone_source) { + this->microphone_source_ = microphone_source; + } Trigger *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; } @@ -59,7 +61,7 @@ class MicroWakeWord : public Component { #endif protected: - microphone::Microphone *microphone_{nullptr}; + microphone::MicrophoneSource *microphone_source_{nullptr}; Trigger *wake_word_detected_trigger_ = new Trigger(); State state_{State::IDLE}; diff --git a/esphome/components/microphone/__init__.py b/esphome/components/microphone/__init__.py index b9d24bc4a7..dcae513578 100644 --- a/esphome/components/microphone/__init__.py +++ b/esphome/components/microphone/__init__.py @@ -36,7 +36,7 @@ StopCaptureAction = microphone_ns.class_( DataTrigger = microphone_ns.class_( "DataTrigger", - automation.Trigger.template(cg.std_vector.template(cg.int16).operator("ref")), + automation.Trigger.template(cg.std_vector.template(cg.uint8).operator("ref")), ) IsCapturingCondition = microphone_ns.class_( @@ -98,10 +98,11 @@ def microphone_source_schema( return config return cv.All( - cv.maybe_simple_value( + automation.maybe_conf( + CONF_MICROPHONE, { cv.GenerateID(CONF_ID): cv.declare_id(MicrophoneSource), - cv.Required(CONF_MICROPHONE): cv.use_id(Microphone), + cv.GenerateID(CONF_MICROPHONE): cv.use_id(Microphone), cv.Optional(CONF_BITS_PER_SAMPLE, default=16): cv.int_range( min_bits_per_sample, max_bits_per_sample ), @@ -112,7 +113,6 @@ def microphone_source_schema( ), cv.Optional(CONF_GAIN_FACTOR, default="1"): cv.int_range(1, 64), }, - key=CONF_MICROPHONE, ), ) diff --git a/esphome/components/microphone/automation.h b/esphome/components/microphone/automation.h index 29c0ec5df2..324699c0af 100644 --- a/esphome/components/microphone/automation.h +++ b/esphome/components/microphone/automation.h @@ -16,10 +16,10 @@ template class StopCaptureAction : public Action, public void play(Ts... x) override { this->parent_->stop(); } }; -class DataTrigger : public Trigger &> { +class DataTrigger : public Trigger &> { public: explicit DataTrigger(Microphone *mic) { - mic->add_data_callback([this](const std::vector &data) { this->trigger(data); }); + mic->add_data_callback([this](const std::vector &data) { this->trigger(data); }); } }; diff --git a/esphome/components/microphone/microphone.h b/esphome/components/microphone/microphone.h index 58552aa34a..cef8d0f4c3 100644 --- a/esphome/components/microphone/microphone.h +++ b/esphome/components/microphone/microphone.h @@ -22,10 +22,9 @@ class Microphone { public: virtual void start() = 0; virtual void stop() = 0; - void add_data_callback(std::function &)> &&data_callback) { + void add_data_callback(std::function &)> &&data_callback) { this->data_callbacks_.add(std::move(data_callback)); } - virtual size_t read(int16_t *buf, size_t len) = 0; bool is_running() const { return this->state_ == STATE_RUNNING; } bool is_stopped() const { return this->state_ == STATE_STOPPED; } @@ -37,7 +36,7 @@ class Microphone { audio::AudioStreamInfo audio_stream_info_; - CallbackManager &)> data_callbacks_{}; + CallbackManager &)> data_callbacks_{}; }; } // namespace microphone diff --git a/esphome/components/microphone/microphone_source.cpp b/esphome/components/microphone/microphone_source.cpp index 7e397348b9..dcd3b31622 100644 --- a/esphome/components/microphone/microphone_source.cpp +++ b/esphome/components/microphone/microphone_source.cpp @@ -10,9 +10,7 @@ void MicrophoneSource::add_data_callback(std::functionprocess_audio_(data)); } }; - // Future PR will uncomment this! It requires changing the callback vector to an uint8_t in every component using a - // mic callback. - // this->mic_->add_data_callback(std::move(filtered_callback)); + this->mic_->add_data_callback(std::move(filtered_callback)); } void MicrophoneSource::start() { diff --git a/esphome/components/voice_assistant/__init__.py b/esphome/components/voice_assistant/__init__.py index e8cdca94b8..ca0b6da742 100644 --- a/esphome/components/voice_assistant/__init__.py +++ b/esphome/components/voice_assistant/__init__.py @@ -88,7 +88,14 @@ CONFIG_SCHEMA = cv.All( cv.Schema( { cv.GenerateID(): cv.declare_id(VoiceAssistant), - cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone), + cv.Optional( + CONF_MICROPHONE, default={} + ): microphone.microphone_source_schema( + min_bits_per_sample=16, + max_bits_per_sample=16, + min_channels=1, + max_channels=1, + ), cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker), cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id( media_player.MediaPlayer @@ -163,13 +170,26 @@ CONFIG_SCHEMA = cv.All( tts_stream_validate, ) +FINAL_VALIDATE_SCHEMA = cv.All( + cv.Schema( + { + cv.Optional( + CONF_MICROPHONE + ): microphone.final_validate_microphone_source_schema( + "voice_assistant", sample_rate=16000 + ), + }, + extra=cv.ALLOW_EXTRA, + ), +) + async def to_code(config): var = cg.new_Pvariable(config[CONF_ID]) await cg.register_component(var, config) - mic = await cg.get_variable(config[CONF_MICROPHONE]) - cg.add(var.set_microphone(mic)) + mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE]) + cg.add(var.set_microphone_source(mic_source)) if CONF_SPEAKER in config: spkr = await cg.get_variable(config[CONF_SPEAKER]) diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp index c62767d7d5..37b97239c8 100644 --- a/esphome/components/voice_assistant/voice_assistant.cpp +++ b/esphome/components/voice_assistant/voice_assistant.cpp @@ -29,10 +29,10 @@ static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE; VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; } void VoiceAssistant::setup() { - this->mic_->add_data_callback([this](const std::vector &data) { + this->mic_source_->add_data_callback([this](const std::vector &data) { std::shared_ptr temp_ring_buffer = this->ring_buffer_; if (this->ring_buffer_.use_count() > 1) { - temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t)); + temp_ring_buffer->write((void *) data.data(), data.size()); } }); } @@ -162,7 +162,7 @@ void VoiceAssistant::reset_conversation_id() { void VoiceAssistant::loop() { if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE && this->state_ != State::STOPPING_MICROPHONE) { - if (this->mic_->is_running() || this->state_ == State::STARTING_MICROPHONE) { + if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) { this->set_state_(State::STOP_MICROPHONE, State::IDLE); } else { this->set_state_(State::IDLE, State::IDLE); @@ -193,12 +193,12 @@ void VoiceAssistant::loop() { } this->clear_buffers_(); - this->mic_->start(); + this->mic_source_->start(); this->set_state_(State::STARTING_MICROPHONE); break; } case State::STARTING_MICROPHONE: { - if (this->mic_->is_running()) { + if (this->mic_source_->is_running()) { this->set_state_(this->desired_state_); } break; @@ -262,8 +262,8 @@ void VoiceAssistant::loop() { break; } case State::STOP_MICROPHONE: { - if (this->mic_->is_running()) { - this->mic_->stop(); + if (this->mic_source_->is_running()) { + this->mic_source_->stop(); this->set_state_(State::STOPPING_MICROPHONE); } else { this->set_state_(this->desired_state_); @@ -271,7 +271,7 @@ void VoiceAssistant::loop() { break; } case State::STOPPING_MICROPHONE: { - if (this->mic_->is_stopped()) { + if (this->mic_source_->is_stopped()) { this->set_state_(this->desired_state_); } break; @@ -478,7 +478,7 @@ void VoiceAssistant::start_streaming() { ESP_LOGD(TAG, "Client started, streaming microphone"); this->audio_mode_ = AUDIO_MODE_API; - if (this->mic_->is_running()) { + if (this->mic_source_->is_running()) { this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE); } else { this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE); @@ -508,7 +508,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por return; } - if (this->mic_->is_running()) { + if (this->mic_source_->is_running()) { this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE); } else { this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE); diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h index cb57a6b05d..7122d69527 100644 --- a/esphome/components/voice_assistant/voice_assistant.h +++ b/esphome/components/voice_assistant/voice_assistant.h @@ -11,7 +11,7 @@ #include "esphome/components/api/api_connection.h" #include "esphome/components/api/api_pb2.h" -#include "esphome/components/microphone/microphone.h" +#include "esphome/components/microphone/microphone_source.h" #ifdef USE_SPEAKER #include "esphome/components/speaker/speaker.h" #endif @@ -98,7 +98,7 @@ class VoiceAssistant : public Component { void start_streaming(struct sockaddr_storage *addr, uint16_t port); void failed_to_start(); - void set_microphone(microphone::Microphone *mic) { this->mic_ = mic; } + void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; } #ifdef USE_SPEAKER void set_speaker(speaker::Speaker *speaker) { this->speaker_ = speaker; @@ -249,7 +249,7 @@ class VoiceAssistant : public Component { bool has_timers_{false}; bool timer_tick_running_{false}; - microphone::Microphone *mic_{nullptr}; + microphone::MicrophoneSource *mic_source_{nullptr}; #ifdef USE_SPEAKER void write_speaker_(); speaker::Speaker *speaker_{nullptr}; diff --git a/tests/components/micro_wake_word/common.yaml b/tests/components/micro_wake_word/common.yaml index c5422baa67..b5507397f8 100644 --- a/tests/components/micro_wake_word/common.yaml +++ b/tests/components/micro_wake_word/common.yaml @@ -11,6 +11,7 @@ microphone: bits_per_sample: 16bit micro_wake_word: + microphone: echo_microphone on_wake_word_detected: - logger.log: "Wake word detected" models: diff --git a/tests/components/voice_assistant/common.yaml b/tests/components/voice_assistant/common.yaml index e7374941f7..f248154b7e 100644 --- a/tests/components/voice_assistant/common.yaml +++ b/tests/components/voice_assistant/common.yaml @@ -30,7 +30,10 @@ speaker: i2s_dout_pin: ${i2s_dout_pin} voice_assistant: - microphone: mic_id_external + microphone: + microphone: mic_id_external + gain_factor: 4 + channels: 0 speaker: speaker_id conversation_timeout: 60s on_listening: