mirror of
https://github.com/esphome/esphome.git
synced 2025-07-29 06:36:45 +00:00
[i2s_audio, microphone, micro_wake_word, voice_assistant] Use microphone source to process incoming audio (#8645)
Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
This commit is contained in:
parent
0fe6c65ba3
commit
9f629dcaa2
@ -1,13 +1,20 @@
|
|||||||
from esphome import pins
|
from esphome import pins
|
||||||
import esphome.codegen as cg
|
import esphome.codegen as cg
|
||||||
from esphome.components import esp32, microphone
|
from esphome.components import audio, esp32, microphone
|
||||||
from esphome.components.adc import ESP32_VARIANT_ADC1_PIN_TO_CHANNEL, validate_adc_pin
|
from esphome.components.adc import ESP32_VARIANT_ADC1_PIN_TO_CHANNEL, validate_adc_pin
|
||||||
import esphome.config_validation as cv
|
import esphome.config_validation as cv
|
||||||
from esphome.const import CONF_ID, CONF_NUMBER
|
from esphome.const import (
|
||||||
|
CONF_BITS_PER_SAMPLE,
|
||||||
|
CONF_CHANNEL,
|
||||||
|
CONF_ID,
|
||||||
|
CONF_NUM_CHANNELS,
|
||||||
|
CONF_NUMBER,
|
||||||
|
CONF_SAMPLE_RATE,
|
||||||
|
)
|
||||||
|
|
||||||
from .. import (
|
from .. import (
|
||||||
CONF_CHANNEL,
|
|
||||||
CONF_I2S_DIN_PIN,
|
CONF_I2S_DIN_PIN,
|
||||||
|
CONF_LEFT,
|
||||||
CONF_MONO,
|
CONF_MONO,
|
||||||
CONF_RIGHT,
|
CONF_RIGHT,
|
||||||
I2SAudioIn,
|
I2SAudioIn,
|
||||||
@ -32,7 +39,7 @@ INTERNAL_ADC_VARIANTS = [esp32.const.VARIANT_ESP32]
|
|||||||
PDM_VARIANTS = [esp32.const.VARIANT_ESP32, esp32.const.VARIANT_ESP32S3]
|
PDM_VARIANTS = [esp32.const.VARIANT_ESP32, esp32.const.VARIANT_ESP32S3]
|
||||||
|
|
||||||
|
|
||||||
def validate_esp32_variant(config):
|
def _validate_esp32_variant(config):
|
||||||
variant = esp32.get_esp32_variant()
|
variant = esp32.get_esp32_variant()
|
||||||
if config[CONF_ADC_TYPE] == "external":
|
if config[CONF_ADC_TYPE] == "external":
|
||||||
if config[CONF_PDM]:
|
if config[CONF_PDM]:
|
||||||
@ -46,12 +53,34 @@ def validate_esp32_variant(config):
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
def validate_channel(config):
|
def _validate_channel(config):
|
||||||
if config[CONF_CHANNEL] == CONF_MONO:
|
if config[CONF_CHANNEL] == CONF_MONO:
|
||||||
raise cv.Invalid(f"I2S microphone does not support {CONF_MONO}.")
|
raise cv.Invalid(f"I2S microphone does not support {CONF_MONO}.")
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
def _set_num_channels_from_config(config):
|
||||||
|
if config[CONF_CHANNEL] in (CONF_LEFT, CONF_RIGHT):
|
||||||
|
config[CONF_NUM_CHANNELS] = 1
|
||||||
|
else:
|
||||||
|
config[CONF_NUM_CHANNELS] = 2
|
||||||
|
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
def _set_stream_limits(config):
|
||||||
|
audio.set_stream_limits(
|
||||||
|
min_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
|
||||||
|
max_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
|
||||||
|
min_channels=config.get(CONF_NUM_CHANNELS),
|
||||||
|
max_channels=config.get(CONF_NUM_CHANNELS),
|
||||||
|
min_sample_rate=config.get(CONF_SAMPLE_RATE),
|
||||||
|
max_sample_rate=config.get(CONF_SAMPLE_RATE),
|
||||||
|
)(config)
|
||||||
|
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
BASE_SCHEMA = microphone.MICROPHONE_SCHEMA.extend(
|
BASE_SCHEMA = microphone.MICROPHONE_SCHEMA.extend(
|
||||||
i2s_audio_component_schema(
|
i2s_audio_component_schema(
|
||||||
I2SAudioMicrophone,
|
I2SAudioMicrophone,
|
||||||
@ -79,8 +108,10 @@ CONFIG_SCHEMA = cv.All(
|
|||||||
},
|
},
|
||||||
key=CONF_ADC_TYPE,
|
key=CONF_ADC_TYPE,
|
||||||
),
|
),
|
||||||
validate_esp32_variant,
|
_validate_esp32_variant,
|
||||||
validate_channel,
|
_validate_channel,
|
||||||
|
_set_num_channels_from_config,
|
||||||
|
_set_stream_limits,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -56,6 +56,35 @@ void I2SAudioMicrophone::start_() {
|
|||||||
}
|
}
|
||||||
esp_err_t err;
|
esp_err_t err;
|
||||||
|
|
||||||
|
uint8_t channel_count = 1;
|
||||||
|
#ifdef USE_I2S_LEGACY
|
||||||
|
uint8_t bits_per_sample = this->bits_per_sample_;
|
||||||
|
|
||||||
|
if (this->channel_ == I2S_CHANNEL_FMT_RIGHT_LEFT) {
|
||||||
|
channel_count = 2;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if (this->slot_bit_width_ == I2S_SLOT_BIT_WIDTH_AUTO) {
|
||||||
|
this->slot_bit_width_ = I2S_SLOT_BIT_WIDTH_16BIT;
|
||||||
|
}
|
||||||
|
uint8_t bits_per_sample = this->slot_bit_width_;
|
||||||
|
|
||||||
|
if (this->slot_mode_ == I2S_SLOT_MODE_STEREO) {
|
||||||
|
channel_count = 2;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef USE_ESP32_VARIANT_ESP32
|
||||||
|
// ESP32 reads audio aligned to a multiple of 2 bytes. For example, if configured for 24 bits per sample, then it will
|
||||||
|
// produce 32 bits per sample, where the actual data is in the most significant bits. Other ESP32 variants produce 24
|
||||||
|
// bits per sample in this situation.
|
||||||
|
if (bits_per_sample < 16) {
|
||||||
|
bits_per_sample = 16;
|
||||||
|
} else if ((bits_per_sample > 16) && (bits_per_sample <= 32)) {
|
||||||
|
bits_per_sample = 32;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef USE_I2S_LEGACY
|
#ifdef USE_I2S_LEGACY
|
||||||
i2s_driver_config_t config = {
|
i2s_driver_config_t config = {
|
||||||
.mode = (i2s_mode_t) (this->i2s_mode_ | I2S_MODE_RX),
|
.mode = (i2s_mode_t) (this->i2s_mode_ | I2S_MODE_RX),
|
||||||
@ -144,6 +173,8 @@ void I2SAudioMicrophone::start_() {
|
|||||||
i2s_std_gpio_config_t pin_config = this->parent_->get_pin_config();
|
i2s_std_gpio_config_t pin_config = this->parent_->get_pin_config();
|
||||||
#if SOC_I2S_SUPPORTS_PDM_RX
|
#if SOC_I2S_SUPPORTS_PDM_RX
|
||||||
if (this->pdm_) {
|
if (this->pdm_) {
|
||||||
|
bits_per_sample = 16; // PDM mics are always 16 bits per sample with the IDF 5 driver
|
||||||
|
|
||||||
i2s_pdm_rx_clk_config_t clk_cfg = {
|
i2s_pdm_rx_clk_config_t clk_cfg = {
|
||||||
.sample_rate_hz = this->sample_rate_,
|
.sample_rate_hz = this->sample_rate_,
|
||||||
.clk_src = clk_src,
|
.clk_src = clk_src,
|
||||||
@ -187,13 +218,8 @@ void I2SAudioMicrophone::start_() {
|
|||||||
.clk_src = clk_src,
|
.clk_src = clk_src,
|
||||||
.mclk_multiple = I2S_MCLK_MULTIPLE_256,
|
.mclk_multiple = I2S_MCLK_MULTIPLE_256,
|
||||||
};
|
};
|
||||||
i2s_data_bit_width_t data_bit_width;
|
i2s_std_slot_config_t std_slot_cfg =
|
||||||
if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_8BIT) {
|
I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) this->slot_bit_width_, this->slot_mode_);
|
||||||
data_bit_width = I2S_DATA_BIT_WIDTH_16BIT;
|
|
||||||
} else {
|
|
||||||
data_bit_width = I2S_DATA_BIT_WIDTH_8BIT;
|
|
||||||
}
|
|
||||||
i2s_std_slot_config_t std_slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(data_bit_width, this->slot_mode_);
|
|
||||||
std_slot_cfg.slot_bit_width = this->slot_bit_width_;
|
std_slot_cfg.slot_bit_width = this->slot_bit_width_;
|
||||||
std_slot_cfg.slot_mask = this->std_slot_mask_;
|
std_slot_cfg.slot_mask = this->std_slot_mask_;
|
||||||
|
|
||||||
@ -222,6 +248,8 @@ void I2SAudioMicrophone::start_() {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
this->audio_stream_info_ = audio::AudioStreamInfo(bits_per_sample, channel_count, this->sample_rate_);
|
||||||
|
|
||||||
this->state_ = microphone::STATE_RUNNING;
|
this->state_ = microphone::STATE_RUNNING;
|
||||||
this->high_freq_.start();
|
this->high_freq_.start();
|
||||||
this->status_clear_error();
|
this->status_clear_error();
|
||||||
@ -284,7 +312,7 @@ void I2SAudioMicrophone::stop_() {
|
|||||||
this->status_clear_error();
|
this->status_clear_error();
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wait) {
|
size_t I2SAudioMicrophone::read_(uint8_t *buf, size_t len, TickType_t ticks_to_wait) {
|
||||||
size_t bytes_read = 0;
|
size_t bytes_read = 0;
|
||||||
#ifdef USE_I2S_LEGACY
|
#ifdef USE_I2S_LEGACY
|
||||||
esp_err_t err = i2s_read(this->parent_->get_port(), buf, len, &bytes_read, ticks_to_wait);
|
esp_err_t err = i2s_read(this->parent_->get_port(), buf, len, &bytes_read, ticks_to_wait);
|
||||||
@ -303,38 +331,7 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wa
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
this->status_clear_warning();
|
this->status_clear_warning();
|
||||||
// ESP-IDF I2S implementation right-extends 8-bit data to 16 bits,
|
#if defined(USE_ESP32_VARIANT_ESP32) and not defined(USE_I2S_LEGACY)
|
||||||
// and 24-bit data to 32 bits.
|
|
||||||
#ifdef USE_I2S_LEGACY
|
|
||||||
switch (this->bits_per_sample_) {
|
|
||||||
case I2S_BITS_PER_SAMPLE_8BIT:
|
|
||||||
case I2S_BITS_PER_SAMPLE_16BIT:
|
|
||||||
return bytes_read;
|
|
||||||
case I2S_BITS_PER_SAMPLE_24BIT:
|
|
||||||
case I2S_BITS_PER_SAMPLE_32BIT: {
|
|
||||||
size_t samples_read = bytes_read / sizeof(int32_t);
|
|
||||||
for (size_t i = 0; i < samples_read; i++) {
|
|
||||||
int32_t temp = reinterpret_cast<int32_t *>(buf)[i] >> 14;
|
|
||||||
buf[i] = clamp<int16_t>(temp, INT16_MIN, INT16_MAX);
|
|
||||||
}
|
|
||||||
return samples_read * sizeof(int16_t);
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
ESP_LOGE(TAG, "Unsupported bits per sample: %d", this->bits_per_sample_);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
#ifndef USE_ESP32_VARIANT_ESP32
|
|
||||||
// For newer ESP32 variants 8 bit data needs to be extended to 16 bit.
|
|
||||||
if (this->slot_bit_width_ == I2S_SLOT_BIT_WIDTH_8BIT) {
|
|
||||||
size_t samples_read = bytes_read / sizeof(int8_t);
|
|
||||||
for (size_t i = samples_read - 1; i >= 0; i--) {
|
|
||||||
int16_t temp = static_cast<int16_t>(reinterpret_cast<int8_t *>(buf)[i]) << 8;
|
|
||||||
buf[i] = temp;
|
|
||||||
}
|
|
||||||
return samples_read * sizeof(int16_t);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
// For ESP32 8/16 bit standard mono mode samples need to be switched.
|
// For ESP32 8/16 bit standard mono mode samples need to be switched.
|
||||||
if (this->slot_mode_ == I2S_SLOT_MODE_MONO && this->slot_bit_width_ <= 16 && !this->pdm_) {
|
if (this->slot_mode_ == I2S_SLOT_MODE_MONO && this->slot_bit_width_ <= 16 && !this->pdm_) {
|
||||||
size_t samples_read = bytes_read / sizeof(int16_t);
|
size_t samples_read = bytes_read / sizeof(int16_t);
|
||||||
@ -346,14 +343,14 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wa
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
return bytes_read;
|
return bytes_read;
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void I2SAudioMicrophone::read_() {
|
void I2SAudioMicrophone::read_() {
|
||||||
std::vector<int16_t> samples;
|
std::vector<uint8_t> samples;
|
||||||
samples.resize(BUFFER_SIZE);
|
const size_t bytes_to_read = this->audio_stream_info_.ms_to_bytes(32);
|
||||||
size_t bytes_read = this->read(samples.data(), BUFFER_SIZE * sizeof(int16_t), 0);
|
samples.resize(bytes_to_read);
|
||||||
samples.resize(bytes_read / sizeof(int16_t));
|
size_t bytes_read = this->read_(samples.data(), bytes_to_read, 0);
|
||||||
|
samples.resize(bytes_read);
|
||||||
this->data_callbacks_.call(samples);
|
this->data_callbacks_.call(samples);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,9 +25,6 @@ class I2SAudioMicrophone : public I2SAudioIn, public microphone::Microphone, pub
|
|||||||
|
|
||||||
void set_pdm(bool pdm) { this->pdm_ = pdm; }
|
void set_pdm(bool pdm) { this->pdm_ = pdm; }
|
||||||
|
|
||||||
size_t read(int16_t *buf, size_t len, TickType_t ticks_to_wait);
|
|
||||||
size_t read(int16_t *buf, size_t len) override { return this->read(buf, len, pdMS_TO_TICKS(100)); }
|
|
||||||
|
|
||||||
#ifdef USE_I2S_LEGACY
|
#ifdef USE_I2S_LEGACY
|
||||||
#if SOC_I2S_SUPPORTS_ADC
|
#if SOC_I2S_SUPPORTS_ADC
|
||||||
void set_adc_channel(adc1_channel_t channel) {
|
void set_adc_channel(adc1_channel_t channel) {
|
||||||
@ -41,6 +38,7 @@ class I2SAudioMicrophone : public I2SAudioIn, public microphone::Microphone, pub
|
|||||||
void start_();
|
void start_();
|
||||||
void stop_();
|
void stop_();
|
||||||
void read_();
|
void read_();
|
||||||
|
size_t read_(uint8_t *buf, size_t len, TickType_t ticks_to_wait);
|
||||||
|
|
||||||
#ifdef USE_I2S_LEGACY
|
#ifdef USE_I2S_LEGACY
|
||||||
int8_t din_pin_{I2S_PIN_NO_CHANGE};
|
int8_t din_pin_{I2S_PIN_NO_CHANGE};
|
||||||
|
@ -328,7 +328,14 @@ CONFIG_SCHEMA = cv.All(
|
|||||||
cv.Schema(
|
cv.Schema(
|
||||||
{
|
{
|
||||||
cv.GenerateID(): cv.declare_id(MicroWakeWord),
|
cv.GenerateID(): cv.declare_id(MicroWakeWord),
|
||||||
cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone),
|
cv.Optional(
|
||||||
|
CONF_MICROPHONE, default={}
|
||||||
|
): microphone.microphone_source_schema(
|
||||||
|
min_bits_per_sample=16,
|
||||||
|
max_bits_per_sample=16,
|
||||||
|
min_channels=1,
|
||||||
|
max_channels=1,
|
||||||
|
),
|
||||||
cv.Required(CONF_MODELS): cv.ensure_list(
|
cv.Required(CONF_MODELS): cv.ensure_list(
|
||||||
cv.maybe_simple_value(MODEL_SCHEMA, key=CONF_MODEL)
|
cv.maybe_simple_value(MODEL_SCHEMA, key=CONF_MODEL)
|
||||||
),
|
),
|
||||||
@ -404,15 +411,27 @@ def _feature_step_size_validate(config):
|
|||||||
raise cv.Invalid("Cannot load models with different features step sizes.")
|
raise cv.Invalid("Cannot load models with different features step sizes.")
|
||||||
|
|
||||||
|
|
||||||
FINAL_VALIDATE_SCHEMA = _feature_step_size_validate
|
FINAL_VALIDATE_SCHEMA = cv.All(
|
||||||
|
cv.Schema(
|
||||||
|
{
|
||||||
|
cv.Required(
|
||||||
|
CONF_MICROPHONE
|
||||||
|
): microphone.final_validate_microphone_source_schema(
|
||||||
|
"micro_wake_word", sample_rate=16000
|
||||||
|
),
|
||||||
|
},
|
||||||
|
extra=cv.ALLOW_EXTRA,
|
||||||
|
),
|
||||||
|
_feature_step_size_validate,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def to_code(config):
|
async def to_code(config):
|
||||||
var = cg.new_Pvariable(config[CONF_ID])
|
var = cg.new_Pvariable(config[CONF_ID])
|
||||||
await cg.register_component(var, config)
|
await cg.register_component(var, config)
|
||||||
|
|
||||||
mic = await cg.get_variable(config[CONF_MICROPHONE])
|
mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
|
||||||
cg.add(var.set_microphone(mic))
|
cg.add(var.set_microphone_source(mic_source))
|
||||||
|
|
||||||
esp32.add_idf_component(
|
esp32.add_idf_component(
|
||||||
name="esp-tflite-micro",
|
name="esp-tflite-micro",
|
||||||
|
@ -61,7 +61,7 @@ void MicroWakeWord::dump_config() {
|
|||||||
void MicroWakeWord::setup() {
|
void MicroWakeWord::setup() {
|
||||||
ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");
|
ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");
|
||||||
|
|
||||||
this->microphone_->add_data_callback([this](const std::vector<int16_t> &data) {
|
this->microphone_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
|
||||||
if (this->state_ != State::DETECTING_WAKE_WORD) {
|
if (this->state_ != State::DETECTING_WAKE_WORD) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -71,7 +71,7 @@ void MicroWakeWord::setup() {
|
|||||||
|
|
||||||
size_t bytes_free = temp_ring_buffer->free();
|
size_t bytes_free = temp_ring_buffer->free();
|
||||||
|
|
||||||
if (bytes_free < data.size() * sizeof(int16_t)) {
|
if (bytes_free < data.size()) {
|
||||||
ESP_LOGW(
|
ESP_LOGW(
|
||||||
TAG,
|
TAG,
|
||||||
"Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
|
"Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
|
||||||
@ -80,7 +80,7 @@ void MicroWakeWord::setup() {
|
|||||||
|
|
||||||
temp_ring_buffer->reset();
|
temp_ring_buffer->reset();
|
||||||
}
|
}
|
||||||
temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t));
|
temp_ring_buffer->write((void *) data.data(), data.size());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -128,11 +128,11 @@ void MicroWakeWord::loop() {
|
|||||||
break;
|
break;
|
||||||
case State::START_MICROPHONE:
|
case State::START_MICROPHONE:
|
||||||
ESP_LOGD(TAG, "Starting Microphone");
|
ESP_LOGD(TAG, "Starting Microphone");
|
||||||
this->microphone_->start();
|
this->microphone_source_->start();
|
||||||
this->set_state_(State::STARTING_MICROPHONE);
|
this->set_state_(State::STARTING_MICROPHONE);
|
||||||
break;
|
break;
|
||||||
case State::STARTING_MICROPHONE:
|
case State::STARTING_MICROPHONE:
|
||||||
if (this->microphone_->is_running()) {
|
if (this->microphone_source_->is_running()) {
|
||||||
this->set_state_(State::DETECTING_WAKE_WORD);
|
this->set_state_(State::DETECTING_WAKE_WORD);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -148,13 +148,13 @@ void MicroWakeWord::loop() {
|
|||||||
break;
|
break;
|
||||||
case State::STOP_MICROPHONE:
|
case State::STOP_MICROPHONE:
|
||||||
ESP_LOGD(TAG, "Stopping Microphone");
|
ESP_LOGD(TAG, "Stopping Microphone");
|
||||||
this->microphone_->stop();
|
this->microphone_source_->stop();
|
||||||
this->set_state_(State::STOPPING_MICROPHONE);
|
this->set_state_(State::STOPPING_MICROPHONE);
|
||||||
this->unload_models_();
|
this->unload_models_();
|
||||||
this->deallocate_buffers_();
|
this->deallocate_buffers_();
|
||||||
break;
|
break;
|
||||||
case State::STOPPING_MICROPHONE:
|
case State::STOPPING_MICROPHONE:
|
||||||
if (this->microphone_->is_stopped()) {
|
if (this->microphone_source_->is_stopped()) {
|
||||||
this->set_state_(State::IDLE);
|
this->set_state_(State::IDLE);
|
||||||
if (this->detected_) {
|
if (this->detected_) {
|
||||||
this->wake_word_detected_trigger_->trigger(this->detected_wake_word_);
|
this->wake_word_detected_trigger_->trigger(this->detected_wake_word_);
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
#include "esphome/core/component.h"
|
#include "esphome/core/component.h"
|
||||||
#include "esphome/core/ring_buffer.h"
|
#include "esphome/core/ring_buffer.h"
|
||||||
|
|
||||||
#include "esphome/components/microphone/microphone.h"
|
#include "esphome/components/microphone/microphone_source.h"
|
||||||
|
|
||||||
#include <frontend_util.h>
|
#include <frontend_util.h>
|
||||||
|
|
||||||
@ -46,7 +46,9 @@ class MicroWakeWord : public Component {
|
|||||||
|
|
||||||
void set_features_step_size(uint8_t step_size) { this->features_step_size_ = step_size; }
|
void set_features_step_size(uint8_t step_size) { this->features_step_size_ = step_size; }
|
||||||
|
|
||||||
void set_microphone(microphone::Microphone *microphone) { this->microphone_ = microphone; }
|
void set_microphone_source(microphone::MicrophoneSource *microphone_source) {
|
||||||
|
this->microphone_source_ = microphone_source;
|
||||||
|
}
|
||||||
|
|
||||||
Trigger<std::string> *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; }
|
Trigger<std::string> *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; }
|
||||||
|
|
||||||
@ -59,7 +61,7 @@ class MicroWakeWord : public Component {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
microphone::Microphone *microphone_{nullptr};
|
microphone::MicrophoneSource *microphone_source_{nullptr};
|
||||||
Trigger<std::string> *wake_word_detected_trigger_ = new Trigger<std::string>();
|
Trigger<std::string> *wake_word_detected_trigger_ = new Trigger<std::string>();
|
||||||
State state_{State::IDLE};
|
State state_{State::IDLE};
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ StopCaptureAction = microphone_ns.class_(
|
|||||||
|
|
||||||
DataTrigger = microphone_ns.class_(
|
DataTrigger = microphone_ns.class_(
|
||||||
"DataTrigger",
|
"DataTrigger",
|
||||||
automation.Trigger.template(cg.std_vector.template(cg.int16).operator("ref")),
|
automation.Trigger.template(cg.std_vector.template(cg.uint8).operator("ref")),
|
||||||
)
|
)
|
||||||
|
|
||||||
IsCapturingCondition = microphone_ns.class_(
|
IsCapturingCondition = microphone_ns.class_(
|
||||||
@ -98,10 +98,11 @@ def microphone_source_schema(
|
|||||||
return config
|
return config
|
||||||
|
|
||||||
return cv.All(
|
return cv.All(
|
||||||
cv.maybe_simple_value(
|
automation.maybe_conf(
|
||||||
|
CONF_MICROPHONE,
|
||||||
{
|
{
|
||||||
cv.GenerateID(CONF_ID): cv.declare_id(MicrophoneSource),
|
cv.GenerateID(CONF_ID): cv.declare_id(MicrophoneSource),
|
||||||
cv.Required(CONF_MICROPHONE): cv.use_id(Microphone),
|
cv.GenerateID(CONF_MICROPHONE): cv.use_id(Microphone),
|
||||||
cv.Optional(CONF_BITS_PER_SAMPLE, default=16): cv.int_range(
|
cv.Optional(CONF_BITS_PER_SAMPLE, default=16): cv.int_range(
|
||||||
min_bits_per_sample, max_bits_per_sample
|
min_bits_per_sample, max_bits_per_sample
|
||||||
),
|
),
|
||||||
@ -112,7 +113,6 @@ def microphone_source_schema(
|
|||||||
),
|
),
|
||||||
cv.Optional(CONF_GAIN_FACTOR, default="1"): cv.int_range(1, 64),
|
cv.Optional(CONF_GAIN_FACTOR, default="1"): cv.int_range(1, 64),
|
||||||
},
|
},
|
||||||
key=CONF_MICROPHONE,
|
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -16,10 +16,10 @@ template<typename... Ts> class StopCaptureAction : public Action<Ts...>, public
|
|||||||
void play(Ts... x) override { this->parent_->stop(); }
|
void play(Ts... x) override { this->parent_->stop(); }
|
||||||
};
|
};
|
||||||
|
|
||||||
class DataTrigger : public Trigger<const std::vector<int16_t> &> {
|
class DataTrigger : public Trigger<const std::vector<uint8_t> &> {
|
||||||
public:
|
public:
|
||||||
explicit DataTrigger(Microphone *mic) {
|
explicit DataTrigger(Microphone *mic) {
|
||||||
mic->add_data_callback([this](const std::vector<int16_t> &data) { this->trigger(data); });
|
mic->add_data_callback([this](const std::vector<uint8_t> &data) { this->trigger(data); });
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -22,10 +22,9 @@ class Microphone {
|
|||||||
public:
|
public:
|
||||||
virtual void start() = 0;
|
virtual void start() = 0;
|
||||||
virtual void stop() = 0;
|
virtual void stop() = 0;
|
||||||
void add_data_callback(std::function<void(const std::vector<int16_t> &)> &&data_callback) {
|
void add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback) {
|
||||||
this->data_callbacks_.add(std::move(data_callback));
|
this->data_callbacks_.add(std::move(data_callback));
|
||||||
}
|
}
|
||||||
virtual size_t read(int16_t *buf, size_t len) = 0;
|
|
||||||
|
|
||||||
bool is_running() const { return this->state_ == STATE_RUNNING; }
|
bool is_running() const { return this->state_ == STATE_RUNNING; }
|
||||||
bool is_stopped() const { return this->state_ == STATE_STOPPED; }
|
bool is_stopped() const { return this->state_ == STATE_STOPPED; }
|
||||||
@ -37,7 +36,7 @@ class Microphone {
|
|||||||
|
|
||||||
audio::AudioStreamInfo audio_stream_info_;
|
audio::AudioStreamInfo audio_stream_info_;
|
||||||
|
|
||||||
CallbackManager<void(const std::vector<int16_t> &)> data_callbacks_{};
|
CallbackManager<void(const std::vector<uint8_t> &)> data_callbacks_{};
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace microphone
|
} // namespace microphone
|
||||||
|
@ -10,9 +10,7 @@ void MicrophoneSource::add_data_callback(std::function<void(const std::vector<ui
|
|||||||
data_callback(this->process_audio_(data));
|
data_callback(this->process_audio_(data));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
// Future PR will uncomment this! It requires changing the callback vector to an uint8_t in every component using a
|
this->mic_->add_data_callback(std::move(filtered_callback));
|
||||||
// mic callback.
|
|
||||||
// this->mic_->add_data_callback(std::move(filtered_callback));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void MicrophoneSource::start() {
|
void MicrophoneSource::start() {
|
||||||
|
@ -88,7 +88,14 @@ CONFIG_SCHEMA = cv.All(
|
|||||||
cv.Schema(
|
cv.Schema(
|
||||||
{
|
{
|
||||||
cv.GenerateID(): cv.declare_id(VoiceAssistant),
|
cv.GenerateID(): cv.declare_id(VoiceAssistant),
|
||||||
cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone),
|
cv.Optional(
|
||||||
|
CONF_MICROPHONE, default={}
|
||||||
|
): microphone.microphone_source_schema(
|
||||||
|
min_bits_per_sample=16,
|
||||||
|
max_bits_per_sample=16,
|
||||||
|
min_channels=1,
|
||||||
|
max_channels=1,
|
||||||
|
),
|
||||||
cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker),
|
cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker),
|
||||||
cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(
|
cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(
|
||||||
media_player.MediaPlayer
|
media_player.MediaPlayer
|
||||||
@ -163,13 +170,26 @@ CONFIG_SCHEMA = cv.All(
|
|||||||
tts_stream_validate,
|
tts_stream_validate,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
FINAL_VALIDATE_SCHEMA = cv.All(
|
||||||
|
cv.Schema(
|
||||||
|
{
|
||||||
|
cv.Optional(
|
||||||
|
CONF_MICROPHONE
|
||||||
|
): microphone.final_validate_microphone_source_schema(
|
||||||
|
"voice_assistant", sample_rate=16000
|
||||||
|
),
|
||||||
|
},
|
||||||
|
extra=cv.ALLOW_EXTRA,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def to_code(config):
|
async def to_code(config):
|
||||||
var = cg.new_Pvariable(config[CONF_ID])
|
var = cg.new_Pvariable(config[CONF_ID])
|
||||||
await cg.register_component(var, config)
|
await cg.register_component(var, config)
|
||||||
|
|
||||||
mic = await cg.get_variable(config[CONF_MICROPHONE])
|
mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
|
||||||
cg.add(var.set_microphone(mic))
|
cg.add(var.set_microphone_source(mic_source))
|
||||||
|
|
||||||
if CONF_SPEAKER in config:
|
if CONF_SPEAKER in config:
|
||||||
spkr = await cg.get_variable(config[CONF_SPEAKER])
|
spkr = await cg.get_variable(config[CONF_SPEAKER])
|
||||||
|
@ -29,10 +29,10 @@ static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
|
|||||||
VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; }
|
VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; }
|
||||||
|
|
||||||
void VoiceAssistant::setup() {
|
void VoiceAssistant::setup() {
|
||||||
this->mic_->add_data_callback([this](const std::vector<int16_t> &data) {
|
this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
|
||||||
std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
|
std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
|
||||||
if (this->ring_buffer_.use_count() > 1) {
|
if (this->ring_buffer_.use_count() > 1) {
|
||||||
temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t));
|
temp_ring_buffer->write((void *) data.data(), data.size());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -162,7 +162,7 @@ void VoiceAssistant::reset_conversation_id() {
|
|||||||
void VoiceAssistant::loop() {
|
void VoiceAssistant::loop() {
|
||||||
if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
|
if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
|
||||||
this->state_ != State::STOPPING_MICROPHONE) {
|
this->state_ != State::STOPPING_MICROPHONE) {
|
||||||
if (this->mic_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
|
if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
|
||||||
this->set_state_(State::STOP_MICROPHONE, State::IDLE);
|
this->set_state_(State::STOP_MICROPHONE, State::IDLE);
|
||||||
} else {
|
} else {
|
||||||
this->set_state_(State::IDLE, State::IDLE);
|
this->set_state_(State::IDLE, State::IDLE);
|
||||||
@ -193,12 +193,12 @@ void VoiceAssistant::loop() {
|
|||||||
}
|
}
|
||||||
this->clear_buffers_();
|
this->clear_buffers_();
|
||||||
|
|
||||||
this->mic_->start();
|
this->mic_source_->start();
|
||||||
this->set_state_(State::STARTING_MICROPHONE);
|
this->set_state_(State::STARTING_MICROPHONE);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case State::STARTING_MICROPHONE: {
|
case State::STARTING_MICROPHONE: {
|
||||||
if (this->mic_->is_running()) {
|
if (this->mic_source_->is_running()) {
|
||||||
this->set_state_(this->desired_state_);
|
this->set_state_(this->desired_state_);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -262,8 +262,8 @@ void VoiceAssistant::loop() {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case State::STOP_MICROPHONE: {
|
case State::STOP_MICROPHONE: {
|
||||||
if (this->mic_->is_running()) {
|
if (this->mic_source_->is_running()) {
|
||||||
this->mic_->stop();
|
this->mic_source_->stop();
|
||||||
this->set_state_(State::STOPPING_MICROPHONE);
|
this->set_state_(State::STOPPING_MICROPHONE);
|
||||||
} else {
|
} else {
|
||||||
this->set_state_(this->desired_state_);
|
this->set_state_(this->desired_state_);
|
||||||
@ -271,7 +271,7 @@ void VoiceAssistant::loop() {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case State::STOPPING_MICROPHONE: {
|
case State::STOPPING_MICROPHONE: {
|
||||||
if (this->mic_->is_stopped()) {
|
if (this->mic_source_->is_stopped()) {
|
||||||
this->set_state_(this->desired_state_);
|
this->set_state_(this->desired_state_);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -478,7 +478,7 @@ void VoiceAssistant::start_streaming() {
|
|||||||
ESP_LOGD(TAG, "Client started, streaming microphone");
|
ESP_LOGD(TAG, "Client started, streaming microphone");
|
||||||
this->audio_mode_ = AUDIO_MODE_API;
|
this->audio_mode_ = AUDIO_MODE_API;
|
||||||
|
|
||||||
if (this->mic_->is_running()) {
|
if (this->mic_source_->is_running()) {
|
||||||
this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
|
this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
|
||||||
} else {
|
} else {
|
||||||
this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
|
this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
|
||||||
@ -508,7 +508,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this->mic_->is_running()) {
|
if (this->mic_source_->is_running()) {
|
||||||
this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
|
this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
|
||||||
} else {
|
} else {
|
||||||
this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
|
this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
|
|
||||||
#include "esphome/components/api/api_connection.h"
|
#include "esphome/components/api/api_connection.h"
|
||||||
#include "esphome/components/api/api_pb2.h"
|
#include "esphome/components/api/api_pb2.h"
|
||||||
#include "esphome/components/microphone/microphone.h"
|
#include "esphome/components/microphone/microphone_source.h"
|
||||||
#ifdef USE_SPEAKER
|
#ifdef USE_SPEAKER
|
||||||
#include "esphome/components/speaker/speaker.h"
|
#include "esphome/components/speaker/speaker.h"
|
||||||
#endif
|
#endif
|
||||||
@ -98,7 +98,7 @@ class VoiceAssistant : public Component {
|
|||||||
void start_streaming(struct sockaddr_storage *addr, uint16_t port);
|
void start_streaming(struct sockaddr_storage *addr, uint16_t port);
|
||||||
void failed_to_start();
|
void failed_to_start();
|
||||||
|
|
||||||
void set_microphone(microphone::Microphone *mic) { this->mic_ = mic; }
|
void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; }
|
||||||
#ifdef USE_SPEAKER
|
#ifdef USE_SPEAKER
|
||||||
void set_speaker(speaker::Speaker *speaker) {
|
void set_speaker(speaker::Speaker *speaker) {
|
||||||
this->speaker_ = speaker;
|
this->speaker_ = speaker;
|
||||||
@ -249,7 +249,7 @@ class VoiceAssistant : public Component {
|
|||||||
bool has_timers_{false};
|
bool has_timers_{false};
|
||||||
bool timer_tick_running_{false};
|
bool timer_tick_running_{false};
|
||||||
|
|
||||||
microphone::Microphone *mic_{nullptr};
|
microphone::MicrophoneSource *mic_source_{nullptr};
|
||||||
#ifdef USE_SPEAKER
|
#ifdef USE_SPEAKER
|
||||||
void write_speaker_();
|
void write_speaker_();
|
||||||
speaker::Speaker *speaker_{nullptr};
|
speaker::Speaker *speaker_{nullptr};
|
||||||
|
@ -11,6 +11,7 @@ microphone:
|
|||||||
bits_per_sample: 16bit
|
bits_per_sample: 16bit
|
||||||
|
|
||||||
micro_wake_word:
|
micro_wake_word:
|
||||||
|
microphone: echo_microphone
|
||||||
on_wake_word_detected:
|
on_wake_word_detected:
|
||||||
- logger.log: "Wake word detected"
|
- logger.log: "Wake word detected"
|
||||||
models:
|
models:
|
||||||
|
@ -30,7 +30,10 @@ speaker:
|
|||||||
i2s_dout_pin: ${i2s_dout_pin}
|
i2s_dout_pin: ${i2s_dout_pin}
|
||||||
|
|
||||||
voice_assistant:
|
voice_assistant:
|
||||||
|
microphone:
|
||||||
microphone: mic_id_external
|
microphone: mic_id_external
|
||||||
|
gain_factor: 4
|
||||||
|
channels: 0
|
||||||
speaker: speaker_id
|
speaker: speaker_id
|
||||||
conversation_timeout: 60s
|
conversation_timeout: 60s
|
||||||
on_listening:
|
on_listening:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user