[i2s_audio, microphone, micro_wake_word, voice_assistant] Use microphone source to process incoming audio (#8645)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
Kevin Ahrendt 2025-04-29 17:27:03 -05:00 committed by GitHub
parent 0fe6c65ba3
commit 9f629dcaa2
15 changed files with 166 additions and 98 deletions

View File

@@ -1,13 +1,20 @@
 from esphome import pins
 import esphome.codegen as cg
-from esphome.components import esp32, microphone
+from esphome.components import audio, esp32, microphone
 from esphome.components.adc import ESP32_VARIANT_ADC1_PIN_TO_CHANNEL, validate_adc_pin
 import esphome.config_validation as cv
-from esphome.const import CONF_ID, CONF_NUMBER
+from esphome.const import (
+    CONF_BITS_PER_SAMPLE,
+    CONF_CHANNEL,
+    CONF_ID,
+    CONF_NUM_CHANNELS,
+    CONF_NUMBER,
+    CONF_SAMPLE_RATE,
+)
 from .. import (
-    CONF_CHANNEL,
     CONF_I2S_DIN_PIN,
+    CONF_LEFT,
     CONF_MONO,
     CONF_RIGHT,
     I2SAudioIn,
@@ -32,7 +39,7 @@ INTERNAL_ADC_VARIANTS = [esp32.const.VARIANT_ESP32]
 PDM_VARIANTS = [esp32.const.VARIANT_ESP32, esp32.const.VARIANT_ESP32S3]
-def validate_esp32_variant(config):
+def _validate_esp32_variant(config):
     variant = esp32.get_esp32_variant()
     if config[CONF_ADC_TYPE] == "external":
         if config[CONF_PDM]:
@@ -46,12 +53,34 @@ def validate_esp32_variant(config):
     raise NotImplementedError
-def validate_channel(config):
+def _validate_channel(config):
     if config[CONF_CHANNEL] == CONF_MONO:
         raise cv.Invalid(f"I2S microphone does not support {CONF_MONO}.")
     return config
+def _set_num_channels_from_config(config):
+    if config[CONF_CHANNEL] in (CONF_LEFT, CONF_RIGHT):
+        config[CONF_NUM_CHANNELS] = 1
+    else:
+        config[CONF_NUM_CHANNELS] = 2
+    return config
+def _set_stream_limits(config):
+    audio.set_stream_limits(
+        min_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
+        max_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
+        min_channels=config.get(CONF_NUM_CHANNELS),
+        max_channels=config.get(CONF_NUM_CHANNELS),
+        min_sample_rate=config.get(CONF_SAMPLE_RATE),
+        max_sample_rate=config.get(CONF_SAMPLE_RATE),
+    )(config)
+    return config
 BASE_SCHEMA = microphone.MICROPHONE_SCHEMA.extend(
     i2s_audio_component_schema(
         I2SAudioMicrophone,
@@ -79,8 +108,10 @@ CONFIG_SCHEMA = cv.All(
         },
         key=CONF_ADC_TYPE,
     ),
-    validate_esp32_variant,
-    validate_channel,
+    _validate_esp32_variant,
+    _validate_channel,
+    _set_num_channels_from_config,
+    _set_stream_limits,
 )
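Note: the two new config steps derive the advertised stream format from the user's microphone settings. A minimal sketch of an i2s_audio microphone YAML whose channel, bits_per_sample, and sample_rate would feed _set_num_channels_from_config and _set_stream_limits (the ID and pin below are placeholders, not part of this commit):

microphone:
  - platform: i2s_audio
    id: external_mic          # placeholder ID
    adc_type: external
    i2s_din_pin: GPIO23       # placeholder pin
    pdm: false
    channel: left             # left or right -> num_channels = 1; otherwise 2
    bits_per_sample: 16bit
    sample_rate: 16000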

View File

@@ -56,6 +56,35 @@ void I2SAudioMicrophone::start_() {
   }
   esp_err_t err;
+  uint8_t channel_count = 1;
+#ifdef USE_I2S_LEGACY
+  uint8_t bits_per_sample = this->bits_per_sample_;
+  if (this->channel_ == I2S_CHANNEL_FMT_RIGHT_LEFT) {
+    channel_count = 2;
+  }
+#else
+  if (this->slot_bit_width_ == I2S_SLOT_BIT_WIDTH_AUTO) {
+    this->slot_bit_width_ = I2S_SLOT_BIT_WIDTH_16BIT;
+  }
+  uint8_t bits_per_sample = this->slot_bit_width_;
+  if (this->slot_mode_ == I2S_SLOT_MODE_STEREO) {
+    channel_count = 2;
+  }
+#endif
+#ifdef USE_ESP32_VARIANT_ESP32
+  // ESP32 reads audio aligned to a multiple of 2 bytes. For example, if configured for 24 bits per sample, then it will
+  // produce 32 bits per sample, where the actual data is in the most significant bits. Other ESP32 variants produce 24
+  // bits per sample in this situation.
+  if (bits_per_sample < 16) {
+    bits_per_sample = 16;
+  } else if ((bits_per_sample > 16) && (bits_per_sample <= 32)) {
+    bits_per_sample = 32;
+  }
+#endif
 #ifdef USE_I2S_LEGACY
   i2s_driver_config_t config = {
       .mode = (i2s_mode_t) (this->i2s_mode_ | I2S_MODE_RX),
@@ -144,6 +173,8 @@ void I2SAudioMicrophone::start_() {
   i2s_std_gpio_config_t pin_config = this->parent_->get_pin_config();
 #if SOC_I2S_SUPPORTS_PDM_RX
   if (this->pdm_) {
+    bits_per_sample = 16;  // PDM mics are always 16 bits per sample with the IDF 5 driver
     i2s_pdm_rx_clk_config_t clk_cfg = {
         .sample_rate_hz = this->sample_rate_,
         .clk_src = clk_src,
@@ -187,13 +218,8 @@ void I2SAudioMicrophone::start_() {
         .clk_src = clk_src,
         .mclk_multiple = I2S_MCLK_MULTIPLE_256,
     };
-    i2s_data_bit_width_t data_bit_width;
-    if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_8BIT) {
-      data_bit_width = I2S_DATA_BIT_WIDTH_16BIT;
-    } else {
-      data_bit_width = I2S_DATA_BIT_WIDTH_8BIT;
-    }
-    i2s_std_slot_config_t std_slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(data_bit_width, this->slot_mode_);
+    i2s_std_slot_config_t std_slot_cfg =
+        I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) this->slot_bit_width_, this->slot_mode_);
     std_slot_cfg.slot_bit_width = this->slot_bit_width_;
     std_slot_cfg.slot_mask = this->std_slot_mask_;
@@ -222,6 +248,8 @@ void I2SAudioMicrophone::start_() {
   }
 #endif
+  this->audio_stream_info_ = audio::AudioStreamInfo(bits_per_sample, channel_count, this->sample_rate_);
   this->state_ = microphone::STATE_RUNNING;
   this->high_freq_.start();
   this->status_clear_error();
@@ -284,7 +312,7 @@ void I2SAudioMicrophone::stop_() {
   this->status_clear_error();
 }
-size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wait) {
+size_t I2SAudioMicrophone::read_(uint8_t *buf, size_t len, TickType_t ticks_to_wait) {
   size_t bytes_read = 0;
 #ifdef USE_I2S_LEGACY
   esp_err_t err = i2s_read(this->parent_->get_port(), buf, len, &bytes_read, ticks_to_wait);
@@ -303,38 +331,7 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wa
     return 0;
   }
   this->status_clear_warning();
-  // ESP-IDF I2S implementation right-extends 8-bit data to 16 bits,
-  // and 24-bit data to 32 bits.
-#ifdef USE_I2S_LEGACY
-  switch (this->bits_per_sample_) {
-    case I2S_BITS_PER_SAMPLE_8BIT:
-    case I2S_BITS_PER_SAMPLE_16BIT:
-      return bytes_read;
-    case I2S_BITS_PER_SAMPLE_24BIT:
-    case I2S_BITS_PER_SAMPLE_32BIT: {
-      size_t samples_read = bytes_read / sizeof(int32_t);
-      for (size_t i = 0; i < samples_read; i++) {
-        int32_t temp = reinterpret_cast<int32_t *>(buf)[i] >> 14;
-        buf[i] = clamp<int16_t>(temp, INT16_MIN, INT16_MAX);
-      }
-      return samples_read * sizeof(int16_t);
-    }
-    default:
-      ESP_LOGE(TAG, "Unsupported bits per sample: %d", this->bits_per_sample_);
-      return 0;
-  }
-#else
-#ifndef USE_ESP32_VARIANT_ESP32
-  // For newer ESP32 variants 8 bit data needs to be extended to 16 bit.
-  if (this->slot_bit_width_ == I2S_SLOT_BIT_WIDTH_8BIT) {
-    size_t samples_read = bytes_read / sizeof(int8_t);
-    for (size_t i = samples_read - 1; i >= 0; i--) {
-      int16_t temp = static_cast<int16_t>(reinterpret_cast<int8_t *>(buf)[i]) << 8;
-      buf[i] = temp;
-    }
-    return samples_read * sizeof(int16_t);
-  }
-#else
+#if defined(USE_ESP32_VARIANT_ESP32) and not defined(USE_I2S_LEGACY)
   // For ESP32 8/16 bit standard mono mode samples need to be switched.
   if (this->slot_mode_ == I2S_SLOT_MODE_MONO && this->slot_bit_width_ <= 16 && !this->pdm_) {
     size_t samples_read = bytes_read / sizeof(int16_t);
@@ -346,14 +343,14 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wa
   }
 #endif
   return bytes_read;
-#endif
 }
 void I2SAudioMicrophone::read_() {
-  std::vector<int16_t> samples;
-  samples.resize(BUFFER_SIZE);
-  size_t bytes_read = this->read(samples.data(), BUFFER_SIZE * sizeof(int16_t), 0);
-  samples.resize(bytes_read / sizeof(int16_t));
+  std::vector<uint8_t> samples;
+  const size_t bytes_to_read = this->audio_stream_info_.ms_to_bytes(32);
+  samples.resize(bytes_to_read);
+  size_t bytes_read = this->read_(samples.data(), bytes_to_read, 0);
+  samples.resize(bytes_read);
   this->data_callbacks_.call(samples);
 }
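Sizing note: read_() now requests 32 ms of audio per pass via audio_stream_info_.ms_to_bytes(32). As a worked example, assuming the 16 kHz, 16-bit, mono format that the voice components in this commit require, that is 16000 samples/s x 0.032 s x 2 bytes x 1 channel = 1024 bytes per read.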

View File

@@ -25,9 +25,6 @@ class I2SAudioMicrophone : public I2SAudioIn, public microphone::Microphone, pub
   void set_pdm(bool pdm) { this->pdm_ = pdm; }
-  size_t read(int16_t *buf, size_t len, TickType_t ticks_to_wait);
-  size_t read(int16_t *buf, size_t len) override { return this->read(buf, len, pdMS_TO_TICKS(100)); }
 #ifdef USE_I2S_LEGACY
 #if SOC_I2S_SUPPORTS_ADC
   void set_adc_channel(adc1_channel_t channel) {
@@ -41,6 +38,7 @@ class I2SAudioMicrophone : public I2SAudioIn, public microphone::Microphone, pub
   void start_();
   void stop_();
   void read_();
+  size_t read_(uint8_t *buf, size_t len, TickType_t ticks_to_wait);
 #ifdef USE_I2S_LEGACY
   int8_t din_pin_{I2S_PIN_NO_CHANGE};

View File

@@ -328,7 +328,14 @@ CONFIG_SCHEMA = cv.All(
     cv.Schema(
         {
             cv.GenerateID(): cv.declare_id(MicroWakeWord),
-            cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone),
+            cv.Optional(
+                CONF_MICROPHONE, default={}
+            ): microphone.microphone_source_schema(
+                min_bits_per_sample=16,
+                max_bits_per_sample=16,
+                min_channels=1,
+                max_channels=1,
+            ),
             cv.Required(CONF_MODELS): cv.ensure_list(
                 cv.maybe_simple_value(MODEL_SCHEMA, key=CONF_MODEL)
             ),
@@ -404,15 +411,27 @@ def _feature_step_size_validate(config):
            raise cv.Invalid("Cannot load models with different features step sizes.")
-FINAL_VALIDATE_SCHEMA = _feature_step_size_validate
+FINAL_VALIDATE_SCHEMA = cv.All(
+    cv.Schema(
+        {
+            cv.Required(
+                CONF_MICROPHONE
+            ): microphone.final_validate_microphone_source_schema(
+                "micro_wake_word", sample_rate=16000
+            ),
+        },
+        extra=cv.ALLOW_EXTRA,
+    ),
+    _feature_step_size_validate,
+)
 async def to_code(config):
     var = cg.new_Pvariable(config[CONF_ID])
     await cg.register_component(var, config)
-    mic = await cg.get_variable(config[CONF_MICROPHONE])
-    cg.add(var.set_microphone(mic))
+    mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
+    cg.add(var.set_microphone_source(mic_source))
     esp32.add_idf_component(
         name="esp-tflite-micro",

View File

@@ -61,7 +61,7 @@ void MicroWakeWord::dump_config() {
 void MicroWakeWord::setup() {
   ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");
-  this->microphone_->add_data_callback([this](const std::vector<int16_t> &data) {
+  this->microphone_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
     if (this->state_ != State::DETECTING_WAKE_WORD) {
       return;
     }
@@ -71,7 +71,7 @@ void MicroWakeWord::setup() {
       size_t bytes_free = temp_ring_buffer->free();
-      if (bytes_free < data.size() * sizeof(int16_t)) {
+      if (bytes_free < data.size()) {
         ESP_LOGW(
             TAG,
             "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
@@ -80,7 +80,7 @@ void MicroWakeWord::setup() {
         temp_ring_buffer->reset();
       }
-      temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t));
+      temp_ring_buffer->write((void *) data.data(), data.size());
     }
   });
@@ -128,11 +128,11 @@ void MicroWakeWord::loop() {
      break;
    case State::START_MICROPHONE:
      ESP_LOGD(TAG, "Starting Microphone");
-      this->microphone_->start();
+      this->microphone_source_->start();
      this->set_state_(State::STARTING_MICROPHONE);
      break;
    case State::STARTING_MICROPHONE:
-      if (this->microphone_->is_running()) {
+      if (this->microphone_source_->is_running()) {
        this->set_state_(State::DETECTING_WAKE_WORD);
      }
      break;
@@ -148,13 +148,13 @@ void MicroWakeWord::loop() {
      break;
    case State::STOP_MICROPHONE:
      ESP_LOGD(TAG, "Stopping Microphone");
-      this->microphone_->stop();
+      this->microphone_source_->stop();
      this->set_state_(State::STOPPING_MICROPHONE);
      this->unload_models_();
      this->deallocate_buffers_();
      break;
    case State::STOPPING_MICROPHONE:
-      if (this->microphone_->is_stopped()) {
+      if (this->microphone_source_->is_stopped()) {
        this->set_state_(State::IDLE);
        if (this->detected_) {
          this->wake_word_detected_trigger_->trigger(this->detected_wake_word_);

View File

@@ -9,7 +9,7 @@
 #include "esphome/core/component.h"
 #include "esphome/core/ring_buffer.h"
-#include "esphome/components/microphone/microphone.h"
+#include "esphome/components/microphone/microphone_source.h"
 #include <frontend_util.h>
@@ -46,7 +46,9 @@ class MicroWakeWord : public Component {
   void set_features_step_size(uint8_t step_size) { this->features_step_size_ = step_size; }
-  void set_microphone(microphone::Microphone *microphone) { this->microphone_ = microphone; }
+  void set_microphone_source(microphone::MicrophoneSource *microphone_source) {
+    this->microphone_source_ = microphone_source;
+  }
   Trigger<std::string> *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; }
@@ -59,7 +61,7 @@ class MicroWakeWord : public Component {
 #endif
  protected:
-  microphone::Microphone *microphone_{nullptr};
+  microphone::MicrophoneSource *microphone_source_{nullptr};
   Trigger<std::string> *wake_word_detected_trigger_ = new Trigger<std::string>();
   State state_{State::IDLE};

View File

@@ -36,7 +36,7 @@ StopCaptureAction = microphone_ns.class_(
 DataTrigger = microphone_ns.class_(
     "DataTrigger",
-    automation.Trigger.template(cg.std_vector.template(cg.int16).operator("ref")),
+    automation.Trigger.template(cg.std_vector.template(cg.uint8).operator("ref")),
 )
 IsCapturingCondition = microphone_ns.class_(
@@ -98,10 +98,11 @@ def microphone_source_schema(
         return config
     return cv.All(
-        cv.maybe_simple_value(
+        automation.maybe_conf(
+            CONF_MICROPHONE,
             {
                 cv.GenerateID(CONF_ID): cv.declare_id(MicrophoneSource),
-                cv.Required(CONF_MICROPHONE): cv.use_id(Microphone),
+                cv.GenerateID(CONF_MICROPHONE): cv.use_id(Microphone),
                 cv.Optional(CONF_BITS_PER_SAMPLE, default=16): cv.int_range(
                     min_bits_per_sample, max_bits_per_sample
                 ),
@@ -112,7 +113,6 @@ def microphone_source_schema(
                 ),
                 cv.Optional(CONF_GAIN_FACTOR, default="1"): cv.int_range(1, 64),
             },
-            key=CONF_MICROPHONE,
         ),
     )
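With automation.maybe_conf, a consumer's microphone: option should accept either a bare microphone ID or a full block. A hedged YAML sketch of both forms (IDs are placeholders):

# shorthand: only the source microphone ID, other options keep their defaults
microphone: external_mic

# expanded form
microphone:
  microphone: external_mic
  bits_per_sample: 16
  channels: 0
  gain_factor: 4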

View File

@@ -16,10 +16,10 @@ template<typename... Ts> class StopCaptureAction : public Action<Ts...>, public
   void play(Ts... x) override { this->parent_->stop(); }
 };
-class DataTrigger : public Trigger<const std::vector<int16_t> &> {
+class DataTrigger : public Trigger<const std::vector<uint8_t> &> {
  public:
   explicit DataTrigger(Microphone *mic) {
-    mic->add_data_callback([this](const std::vector<int16_t> &data) { this->trigger(data); });
+    mic->add_data_callback([this](const std::vector<uint8_t> &data) { this->trigger(data); });
   }
 };

View File

@@ -22,10 +22,9 @@ class Microphone {
  public:
   virtual void start() = 0;
   virtual void stop() = 0;
-  void add_data_callback(std::function<void(const std::vector<int16_t> &)> &&data_callback) {
+  void add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback) {
     this->data_callbacks_.add(std::move(data_callback));
   }
-  virtual size_t read(int16_t *buf, size_t len) = 0;
   bool is_running() const { return this->state_ == STATE_RUNNING; }
   bool is_stopped() const { return this->state_ == STATE_STOPPED; }
@@ -37,7 +36,7 @@ class Microphone {
   audio::AudioStreamInfo audio_stream_info_;
-  CallbackManager<void(const std::vector<int16_t> &)> data_callbacks_{};
+  CallbackManager<void(const std::vector<uint8_t> &)> data_callbacks_{};
 };
 }  // namespace microphone

View File

@@ -10,9 +10,7 @@ void MicrophoneSource::add_data_callback(std::function<void(const std::vector<ui
       data_callback(this->process_audio_(data));
     }
   };
-  // Future PR will uncomment this! It requires changing the callback vector to an uint8_t in every component using a
-  // mic callback.
-  // this->mic_->add_data_callback(std::move(filtered_callback));
+  this->mic_->add_data_callback(std::move(filtered_callback));
 }
 void MicrophoneSource::start() {

View File

@@ -88,7 +88,14 @@ CONFIG_SCHEMA = cv.All(
     cv.Schema(
         {
            cv.GenerateID(): cv.declare_id(VoiceAssistant),
-            cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone),
+            cv.Optional(
+                CONF_MICROPHONE, default={}
+            ): microphone.microphone_source_schema(
+                min_bits_per_sample=16,
+                max_bits_per_sample=16,
+                min_channels=1,
+                max_channels=1,
+            ),
            cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker),
            cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(
                media_player.MediaPlayer
@@ -163,13 +170,26 @@ CONFIG_SCHEMA = cv.All(
     tts_stream_validate,
 )
+FINAL_VALIDATE_SCHEMA = cv.All(
+    cv.Schema(
+        {
+            cv.Optional(
+                CONF_MICROPHONE
+            ): microphone.final_validate_microphone_source_schema(
+                "voice_assistant", sample_rate=16000
+            ),
+        },
+        extra=cv.ALLOW_EXTRA,
+    ),
+)
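Because the option above defaults to {} and the source microphone ID is resolved with cv.GenerateID inside microphone_source_schema, a config with a single microphone can presumably omit the block entirely; a minimal sketch (the speaker ID is a placeholder):

voice_assistant:
  speaker: speaker_id
  # microphone: is optional; when omitted, the only microphone in the config
  # is used and final validation still enforces a 16 kHz source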
 async def to_code(config):
     var = cg.new_Pvariable(config[CONF_ID])
     await cg.register_component(var, config)
-    mic = await cg.get_variable(config[CONF_MICROPHONE])
-    cg.add(var.set_microphone(mic))
+    mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
+    cg.add(var.set_microphone_source(mic_source))
     if CONF_SPEAKER in config:
         spkr = await cg.get_variable(config[CONF_SPEAKER])

View File

@@ -29,10 +29,10 @@ static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
 VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; }
 void VoiceAssistant::setup() {
-  this->mic_->add_data_callback([this](const std::vector<int16_t> &data) {
+  this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
     std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
     if (this->ring_buffer_.use_count() > 1) {
-      temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t));
+      temp_ring_buffer->write((void *) data.data(), data.size());
     }
   });
 }
@@ -162,7 +162,7 @@ void VoiceAssistant::reset_conversation_id() {
 void VoiceAssistant::loop() {
   if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
       this->state_ != State::STOPPING_MICROPHONE) {
-    if (this->mic_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
+    if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
      this->set_state_(State::STOP_MICROPHONE, State::IDLE);
    } else {
      this->set_state_(State::IDLE, State::IDLE);
@@ -193,12 +193,12 @@ void VoiceAssistant::loop() {
      }
      this->clear_buffers_();
-      this->mic_->start();
+      this->mic_source_->start();
      this->set_state_(State::STARTING_MICROPHONE);
      break;
    }
    case State::STARTING_MICROPHONE: {
-      if (this->mic_->is_running()) {
+      if (this->mic_source_->is_running()) {
        this->set_state_(this->desired_state_);
      }
      break;
@@ -262,8 +262,8 @@ void VoiceAssistant::loop() {
      break;
    }
    case State::STOP_MICROPHONE: {
-      if (this->mic_->is_running()) {
-        this->mic_->stop();
+      if (this->mic_source_->is_running()) {
+        this->mic_source_->stop();
        this->set_state_(State::STOPPING_MICROPHONE);
      } else {
        this->set_state_(this->desired_state_);
@@ -271,7 +271,7 @@ void VoiceAssistant::loop() {
      break;
    }
    case State::STOPPING_MICROPHONE: {
-      if (this->mic_->is_stopped()) {
+      if (this->mic_source_->is_stopped()) {
        this->set_state_(this->desired_state_);
      }
      break;
@@ -478,7 +478,7 @@ void VoiceAssistant::start_streaming() {
   ESP_LOGD(TAG, "Client started, streaming microphone");
   this->audio_mode_ = AUDIO_MODE_API;
-  if (this->mic_->is_running()) {
+  if (this->mic_source_->is_running()) {
    this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
  } else {
    this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
@@ -508,7 +508,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
    return;
  }
-  if (this->mic_->is_running()) {
+  if (this->mic_source_->is_running()) {
    this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
  } else {
    this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);

View File

@@ -11,7 +11,7 @@
 #include "esphome/components/api/api_connection.h"
 #include "esphome/components/api/api_pb2.h"
-#include "esphome/components/microphone/microphone.h"
+#include "esphome/components/microphone/microphone_source.h"
 #ifdef USE_SPEAKER
 #include "esphome/components/speaker/speaker.h"
 #endif
@@ -98,7 +98,7 @@ class VoiceAssistant : public Component {
   void start_streaming(struct sockaddr_storage *addr, uint16_t port);
   void failed_to_start();
-  void set_microphone(microphone::Microphone *mic) { this->mic_ = mic; }
+  void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; }
 #ifdef USE_SPEAKER
   void set_speaker(speaker::Speaker *speaker) {
     this->speaker_ = speaker;
@@ -249,7 +249,7 @@ class VoiceAssistant : public Component {
   bool has_timers_{false};
   bool timer_tick_running_{false};
-  microphone::Microphone *mic_{nullptr};
+  microphone::MicrophoneSource *mic_source_{nullptr};
 #ifdef USE_SPEAKER
   void write_speaker_();
   speaker::Speaker *speaker_{nullptr};

View File

@@ -11,6 +11,7 @@ microphone:
     bits_per_sample: 16bit
 micro_wake_word:
+  microphone: echo_microphone
   on_wake_word_detected:
     - logger.log: "Wake word detected"
   models:

View File

@@ -30,7 +30,10 @@ speaker:
     i2s_dout_pin: ${i2s_dout_pin}
 voice_assistant:
-  microphone: mic_id_external
+  microphone:
+    microphone: mic_id_external
+    gain_factor: 4
+    channels: 0
   speaker: speaker_id
   conversation_timeout: 60s
   on_listening: