[voice_assistant] Use mic callback and remove esp_adf code (#8627)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
This commit is contained in:
Kevin Ahrendt 2025-04-27 19:15:28 -05:00 committed by GitHub
parent c9d1476ae0
commit e49252ca3d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 22 additions and 129 deletions

View File

@ -94,8 +94,8 @@ CONFIG_SCHEMA = cv.All(
media_player.MediaPlayer
),
cv.Optional(CONF_USE_WAKE_WORD, default=False): cv.boolean,
cv.Optional(CONF_VAD_THRESHOLD): cv.All(
cv.requires_component("esp_adf"), cv.only_with_esp_idf, cv.uint8_t
cv.Optional(CONF_VAD_THRESHOLD): cv.invalid(
"VAD threshold is no longer supported, as it requires the deprecated esp_adf external component. Use an i2s_audio microphone/speaker instead. Additionally, you may need to configure the audio_adc and audio_dac components depending on your hardware."
),
cv.Optional(CONF_NOISE_SUPPRESSION_LEVEL, default=0): cv.int_range(0, 4),
cv.Optional(CONF_AUTO_GAIN, default="0dBFS"): cv.All(

View File

@ -18,14 +18,25 @@ static const char *const TAG = "voice_assistant";
#endif
// NOTE(review): this listing interleaves lines from two revisions, which is why SEND_BUFFER_SIZE
// appears twice below with different operand names; both definitions evaluate to the same value.

// Audio sample rate in Hz (16 kHz).
static const size_t SAMPLE_RATE_HZ = 16000;
// Number of int16_t samples covering 32 ms of audio at SAMPLE_RATE_HZ.
static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000; // 32 ms * 16 kHz / 1000 ms
// Number of samples covering 512 ms of audio at SAMPLE_RATE_HZ; multiplied by sizeof(int16_t)
// where the ring buffer is created.
static const size_t BUFFER_SIZE = 512 * SAMPLE_RATE_HZ / 1000;
// Size in bytes of one send chunk (sample count * sizeof(int16_t)).
static const size_t SEND_BUFFER_SIZE = INPUT_BUFFER_SIZE * sizeof(int16_t);
// Number of samples covering 512 ms of audio at SAMPLE_RATE_HZ.
static const size_t RING_BUFFER_SAMPLES = 512 * SAMPLE_RATE_HZ / 1000; // 512 ms * 16 kHz / 1000 ms
// Ring buffer capacity in bytes; passed to RingBuffer::create().
static const size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t);
// Number of samples covering 32 ms of audio at SAMPLE_RATE_HZ.
static const size_t SEND_BUFFER_SAMPLES = 32 * SAMPLE_RATE_HZ / 1000; // 32 ms * 16 kHz / 1000 ms
// Size in bytes of one send chunk; the streaming loop reads the ring buffer in chunks of this size.
static const size_t SEND_BUFFER_SIZE = SEND_BUFFER_SAMPLES * sizeof(int16_t);
// NOTE(review): presumably the size in bytes of one received audio packet - confirm against its users.
static const size_t RECEIVE_SIZE = 1024;
// Speaker buffer capacity: room for 16 chunks of RECEIVE_SIZE.
static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
// Constructs the component and publishes a pointer to it through global_voice_assistant.
VoiceAssistant::VoiceAssistant() {
  global_voice_assistant = this;
}
void VoiceAssistant::setup() {
this->mic_->add_data_callback([this](const std::vector<int16_t> &data) {
std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
if (this->ring_buffer_.use_count() > 1) {
temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t));
}
});
}
// Reports this component's setup priority: setup_priority::AFTER_CONNECTION.
float VoiceAssistant::get_setup_priority() const {
  return setup_priority::AFTER_CONNECTION;
}
bool VoiceAssistant::start_udp_socket_() {
@ -83,21 +94,8 @@ bool VoiceAssistant::allocate_buffers_() {
}
#endif
if (this->input_buffer_ == nullptr) {
ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
if (this->input_buffer_ == nullptr) {
ESP_LOGW(TAG, "Could not allocate input buffer");
return false;
}
}
#ifdef USE_ESP_ADF
this->vad_instance_ = vad_create(VAD_MODE_4);
#endif
if (this->ring_buffer_.use_count() == 0) {
this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
this->ring_buffer_ = RingBuffer::create(RING_BUFFER_SIZE);
if (this->ring_buffer_.use_count() == 0) {
ESP_LOGE(TAG, "Could not allocate ring buffer");
return false;
@ -121,10 +119,6 @@ void VoiceAssistant::clear_buffers_() {
memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
}
if (this->input_buffer_ != nullptr) {
memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
}
if (this->ring_buffer_ != nullptr) {
this->ring_buffer_->reset();
}
@ -151,19 +145,6 @@ void VoiceAssistant::deallocate_buffers_() {
this->ring_buffer_.reset();
}
#ifdef USE_ESP_ADF
if (this->vad_instance_ != nullptr) {
vad_destroy(this->vad_instance_);
this->vad_instance_ = nullptr;
}
#endif
if (this->input_buffer_ != nullptr) {
ExternalRAMAllocator<int16_t> input_deallocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
this->input_buffer_ = nullptr;
}
#ifdef USE_SPEAKER
if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
ExternalRAMAllocator<uint8_t> speaker_deallocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
@ -178,22 +159,6 @@ void VoiceAssistant::reset_conversation_id() {
ESP_LOGD(TAG, "reset conversation ID");
}
// Reads up to INPUT_BUFFER_SIZE samples from the microphone into input_buffer_ and forwards
// whatever was read to the ring buffer.
//
// Returns the number of bytes read; 0 when the microphone is not running or delivered no data.
int VoiceAssistant::read_microphone_() {
  if (!this->mic_->is_running()) {
    ESP_LOGD(TAG, "microphone not running");
    return 0;
  }

  // Read audio into the input buffer.
  const size_t input_capacity_bytes = INPUT_BUFFER_SIZE * sizeof(int16_t);
  const size_t bytes_read = this->mic_->read(this->input_buffer_, input_capacity_bytes);
  if (bytes_read == 0) {
    // Nothing was delivered: zero the input buffer.
    memset(this->input_buffer_, 0, input_capacity_bytes);
    return 0;
  }

  // Write the freshly read audio into the ring buffer.
  this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
  return bytes_read;
}
void VoiceAssistant::loop() {
if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
this->state_ != State::STOPPING_MICROPHONE) {
@ -211,16 +176,8 @@ void VoiceAssistant::loop() {
case State::IDLE: {
if (this->continuous_ && this->desired_state_ == State::IDLE) {
this->idle_trigger_->trigger();
#ifdef USE_ESP_ADF
if (this->use_wake_word_) {
this->set_state_(State::START_MICROPHONE, State::WAIT_FOR_VAD);
} else
#endif
{
this->set_state_(State::START_MICROPHONE, State::START_PIPELINE);
}
} else {
this->high_freq_.stop();
this->deallocate_buffers_();
}
break;
@ -237,7 +194,6 @@ void VoiceAssistant::loop() {
this->clear_buffers_();
this->mic_->start();
this->high_freq_.start();
this->set_state_(State::STARTING_MICROPHONE);
break;
}
@ -247,39 +203,7 @@ void VoiceAssistant::loop() {
}
break;
}
#ifdef USE_ESP_ADF
case State::WAIT_FOR_VAD: {
this->read_microphone_();
ESP_LOGD(TAG, "Waiting for speech...");
this->set_state_(State::WAITING_FOR_VAD);
break;
}
case State::WAITING_FOR_VAD: {
size_t bytes_read = this->read_microphone_();
if (bytes_read > 0) {
vad_state_t vad_state =
vad_process(this->vad_instance_, this->input_buffer_, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
if (vad_state == VAD_SPEECH) {
if (this->vad_counter_ < this->vad_threshold_) {
this->vad_counter_++;
} else {
ESP_LOGD(TAG, "VAD detected speech");
this->set_state_(State::START_PIPELINE, State::STREAMING_MICROPHONE);
// Reset for next time
this->vad_counter_ = 0;
}
} else {
if (this->vad_counter_ > 0) {
this->vad_counter_--;
}
}
}
break;
}
#endif
case State::START_PIPELINE: {
this->read_microphone_();
ESP_LOGD(TAG, "Requesting start...");
uint32_t flags = 0;
if (!this->continue_conversation_ && this->use_wake_word_)
@ -312,11 +236,9 @@ void VoiceAssistant::loop() {
break;
}
case State::STARTING_PIPELINE: {
this->read_microphone_();
break; // State changed when udp server port received
}
case State::STREAMING_MICROPHONE: {
this->read_microphone_();
size_t available = this->ring_buffer_->available();
while (available >= SEND_BUFFER_SIZE) {
size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
@ -603,15 +525,9 @@ void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
if (this->state_ == State::IDLE) {
this->continuous_ = continuous;
this->silence_detection_ = silence_detection;
#ifdef USE_ESP_ADF
if (this->use_wake_word_) {
this->set_state_(State::START_MICROPHONE, State::WAIT_FOR_VAD);
} else
#endif
{
this->set_state_(State::START_MICROPHONE, State::START_PIPELINE);
}
}
}
void VoiceAssistant::request_stop() {
@ -785,16 +701,8 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
this->set_state_(State::IDLE, State::IDLE);
} else if (this->state_ == State::STREAMING_MICROPHONE) {
this->ring_buffer_->reset();
#ifdef USE_ESP_ADF
if (this->use_wake_word_) {
// No need to stop the microphone since we didn't use the speaker
this->set_state_(State::WAIT_FOR_VAD, State::WAITING_FOR_VAD);
} else
#endif
{
this->set_state_(State::IDLE, State::IDLE);
}
}
this->defer([this]() { this->end_trigger_->trigger(); });
break;
}

View File

@ -20,10 +20,6 @@
#endif
#include "esphome/components/socket/socket.h"
#ifdef USE_ESP_ADF
#include <esp_vad.h>
#endif
#include <unordered_map>
#include <vector>
@ -96,6 +92,7 @@ class VoiceAssistant : public Component {
VoiceAssistant();
void loop() override;
void setup() override;
float get_setup_priority() const override;
void start_streaming();
void start_streaming(struct sockaddr_storage *addr, uint16_t port);
@ -163,9 +160,6 @@ class VoiceAssistant : public Component {
bool is_continuous() const { return this->continuous_; }
void set_use_wake_word(bool use_wake_word) { this->use_wake_word_ = use_wake_word; }
#ifdef USE_ESP_ADF
void set_vad_threshold(uint8_t vad_threshold) { this->vad_threshold_ = vad_threshold; }
#endif
void set_noise_suppression_level(uint8_t noise_suppression_level) {
this->noise_suppression_level_ = noise_suppression_level;
@ -214,7 +208,6 @@ class VoiceAssistant : public Component {
void clear_buffers_();
void deallocate_buffers_();
int read_microphone_();
void set_state_(State state);
void set_state_(State state, State desired_state);
void signal_stop_();
@ -279,13 +272,6 @@ class VoiceAssistant : public Component {
std::string wake_word_{""};
HighFrequencyLoopRequester high_freq_;
#ifdef USE_ESP_ADF
vad_handle_t vad_instance_;
uint8_t vad_threshold_{5};
uint8_t vad_counter_{0};
#endif
std::shared_ptr<RingBuffer> ring_buffer_;
bool use_wake_word_;
@ -295,7 +281,6 @@ class VoiceAssistant : public Component {
uint32_t conversation_timeout_;
uint8_t *send_buffer_{nullptr};
int16_t *input_buffer_{nullptr};
bool continuous_{false};
bool silence_detection_;