From e49252ca3d0e6734adebb4fb8d30e9ea2becf47a Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Sun, 27 Apr 2025 19:15:28 -0500 Subject: [PATCH] [voice_assistant] Use mic callback and remove esp_adf code (#8627) Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com> --- .../components/voice_assistant/__init__.py | 4 +- .../voice_assistant/voice_assistant.cpp | 130 +++--------------- .../voice_assistant/voice_assistant.h | 17 +-- 3 files changed, 22 insertions(+), 129 deletions(-) diff --git a/esphome/components/voice_assistant/__init__.py b/esphome/components/voice_assistant/__init__.py index a4fb572208..e8cdca94b8 100644 --- a/esphome/components/voice_assistant/__init__.py +++ b/esphome/components/voice_assistant/__init__.py @@ -94,8 +94,8 @@ CONFIG_SCHEMA = cv.All( media_player.MediaPlayer ), cv.Optional(CONF_USE_WAKE_WORD, default=False): cv.boolean, - cv.Optional(CONF_VAD_THRESHOLD): cv.All( - cv.requires_component("esp_adf"), cv.only_with_esp_idf, cv.uint8_t + cv.Optional(CONF_VAD_THRESHOLD): cv.invalid( + "VAD threshold is no longer supported, as it requires the deprecated esp_adf external component. Use an i2s_audio microphone/speaker instead. Additionally, you may need to configure the audio_adc and audio_dac components depending on your hardware." 
), cv.Optional(CONF_NOISE_SUPPRESSION_LEVEL, default=0): cv.int_range(0, 4), cv.Optional(CONF_AUTO_GAIN, default="0dBFS"): cv.All( diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp index a38ae2d12b..c62767d7d5 100644 --- a/esphome/components/voice_assistant/voice_assistant.cpp +++ b/esphome/components/voice_assistant/voice_assistant.cpp @@ -18,14 +18,25 @@ static const char *const TAG = "voice_assistant"; #endif static const size_t SAMPLE_RATE_HZ = 16000; -static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms -static const size_t BUFFER_SIZE = 512 * SAMPLE_RATE_HZ / 1000; -static const size_t SEND_BUFFER_SIZE = INPUT_BUFFER_SIZE * sizeof(int16_t); + +static const size_t RING_BUFFER_SAMPLES = 512 * SAMPLE_RATE_HZ / 1000; // 512 ms * 16 kHz/ 1000 ms +static const size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t); +static const size_t SEND_BUFFER_SAMPLES = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms +static const size_t SEND_BUFFER_SIZE = SEND_BUFFER_SAMPLES * sizeof(int16_t); static const size_t RECEIVE_SIZE = 1024; static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE; VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; } +void VoiceAssistant::setup() { + this->mic_->add_data_callback([this](const std::vector<int16_t> &data) { + std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_; + if (this->ring_buffer_.use_count() > 1) { + temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t)); + } + }); +} + float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; } bool VoiceAssistant::start_udp_socket_() { @@ -83,21 +94,8 @@ bool VoiceAssistant::allocate_buffers_() { } #endif - if (this->input_buffer_ == nullptr) { - ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE); - this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE); - if (this->input_buffer_ 
== nullptr) { - ESP_LOGW(TAG, "Could not allocate input buffer"); - return false; - } - } - -#ifdef USE_ESP_ADF - this->vad_instance_ = vad_create(VAD_MODE_4); -#endif - if (this->ring_buffer_.use_count() == 0) { - this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t)); + this->ring_buffer_ = RingBuffer::create(RING_BUFFER_SIZE); if (this->ring_buffer_.use_count() == 0) { ESP_LOGE(TAG, "Could not allocate ring buffer"); return false; @@ -121,10 +119,6 @@ void VoiceAssistant::clear_buffers_() { memset(this->send_buffer_, 0, SEND_BUFFER_SIZE); } - if (this->input_buffer_ != nullptr) { - memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t)); - } - if (this->ring_buffer_ != nullptr) { this->ring_buffer_->reset(); } @@ -151,19 +145,6 @@ void VoiceAssistant::deallocate_buffers_() { this->ring_buffer_.reset(); } -#ifdef USE_ESP_ADF - if (this->vad_instance_ != nullptr) { - vad_destroy(this->vad_instance_); - this->vad_instance_ = nullptr; - } -#endif - - if (this->input_buffer_ != nullptr) { - ExternalRAMAllocator input_deallocator(ExternalRAMAllocator::ALLOW_FAILURE); - input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE); - this->input_buffer_ = nullptr; - } - #ifdef USE_SPEAKER if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) { ExternalRAMAllocator speaker_deallocator(ExternalRAMAllocator::ALLOW_FAILURE); @@ -178,22 +159,6 @@ void VoiceAssistant::reset_conversation_id() { ESP_LOGD(TAG, "reset conversation ID"); } -int VoiceAssistant::read_microphone_() { - size_t bytes_read = 0; - if (this->mic_->is_running()) { // Read audio into input buffer - bytes_read = this->mic_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t)); - if (bytes_read == 0) { - memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t)); - return 0; - } - // Write audio into ring buffer - this->ring_buffer_->write((void *) this->input_buffer_, bytes_read); - } else { - ESP_LOGD(TAG, "microphone not running"); - } 
- return bytes_read; -} - void VoiceAssistant::loop() { if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE && this->state_ != State::STOPPING_MICROPHONE) { @@ -211,16 +176,8 @@ void VoiceAssistant::loop() { case State::IDLE: { if (this->continuous_ && this->desired_state_ == State::IDLE) { this->idle_trigger_->trigger(); -#ifdef USE_ESP_ADF - if (this->use_wake_word_) { - this->set_state_(State::START_MICROPHONE, State::WAIT_FOR_VAD); - } else -#endif - { - this->set_state_(State::START_MICROPHONE, State::START_PIPELINE); - } + this->set_state_(State::START_MICROPHONE, State::START_PIPELINE); } else { - this->high_freq_.stop(); this->deallocate_buffers_(); } break; @@ -237,7 +194,6 @@ void VoiceAssistant::loop() { this->clear_buffers_(); this->mic_->start(); - this->high_freq_.start(); this->set_state_(State::STARTING_MICROPHONE); break; } @@ -247,39 +203,7 @@ void VoiceAssistant::loop() { } break; } -#ifdef USE_ESP_ADF - case State::WAIT_FOR_VAD: { - this->read_microphone_(); - ESP_LOGD(TAG, "Waiting for speech..."); - this->set_state_(State::WAITING_FOR_VAD); - break; - } - case State::WAITING_FOR_VAD: { - size_t bytes_read = this->read_microphone_(); - if (bytes_read > 0) { - vad_state_t vad_state = - vad_process(this->vad_instance_, this->input_buffer_, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); - if (vad_state == VAD_SPEECH) { - if (this->vad_counter_ < this->vad_threshold_) { - this->vad_counter_++; - } else { - ESP_LOGD(TAG, "VAD detected speech"); - this->set_state_(State::START_PIPELINE, State::STREAMING_MICROPHONE); - - // Reset for next time - this->vad_counter_ = 0; - } - } else { - if (this->vad_counter_ > 0) { - this->vad_counter_--; - } - } - } - break; - } -#endif case State::START_PIPELINE: { - this->read_microphone_(); ESP_LOGD(TAG, "Requesting start..."); uint32_t flags = 0; if (!this->continue_conversation_ && this->use_wake_word_) @@ -312,11 +236,9 @@ void VoiceAssistant::loop() { break; } case 
State::STARTING_PIPELINE: { - this->read_microphone_(); break; // State changed when udp server port received } case State::STREAMING_MICROPHONE: { - this->read_microphone_(); size_t available = this->ring_buffer_->available(); while (available >= SEND_BUFFER_SIZE) { size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0); @@ -603,14 +525,8 @@ void VoiceAssistant::request_start(bool continuous, bool silence_detection) { if (this->state_ == State::IDLE) { this->continuous_ = continuous; this->silence_detection_ = silence_detection; -#ifdef USE_ESP_ADF - if (this->use_wake_word_) { - this->set_state_(State::START_MICROPHONE, State::WAIT_FOR_VAD); - } else -#endif - { - this->set_state_(State::START_MICROPHONE, State::START_PIPELINE); - } + + this->set_state_(State::START_MICROPHONE, State::START_PIPELINE); } } @@ -785,15 +701,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { this->set_state_(State::IDLE, State::IDLE); } else if (this->state_ == State::STREAMING_MICROPHONE) { this->ring_buffer_->reset(); -#ifdef USE_ESP_ADF - if (this->use_wake_word_) { - // No need to stop the microphone since we didn't use the speaker - this->set_state_(State::WAIT_FOR_VAD, State::WAITING_FOR_VAD); - } else -#endif - { - this->set_state_(State::IDLE, State::IDLE); - } + this->set_state_(State::IDLE, State::IDLE); } this->defer([this]() { this->end_trigger_->trigger(); }); break; diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h index 66531fcd94..cb57a6b05d 100644 --- a/esphome/components/voice_assistant/voice_assistant.h +++ b/esphome/components/voice_assistant/voice_assistant.h @@ -20,10 +20,6 @@ #endif #include "esphome/components/socket/socket.h" -#ifdef USE_ESP_ADF -#include -#endif - #include #include @@ -96,6 +92,7 @@ class VoiceAssistant : public Component { VoiceAssistant(); void loop() override; + void setup() override; float 
get_setup_priority() const override; void start_streaming(); void start_streaming(struct sockaddr_storage *addr, uint16_t port); @@ -163,9 +160,6 @@ class VoiceAssistant : public Component { bool is_continuous() const { return this->continuous_; } void set_use_wake_word(bool use_wake_word) { this->use_wake_word_ = use_wake_word; } -#ifdef USE_ESP_ADF - void set_vad_threshold(uint8_t vad_threshold) { this->vad_threshold_ = vad_threshold; } -#endif void set_noise_suppression_level(uint8_t noise_suppression_level) { this->noise_suppression_level_ = noise_suppression_level; @@ -214,7 +208,6 @@ class VoiceAssistant : public Component { void clear_buffers_(); void deallocate_buffers_(); - int read_microphone_(); void set_state_(State state); void set_state_(State state, State desired_state); void signal_stop_(); @@ -279,13 +272,6 @@ class VoiceAssistant : public Component { std::string wake_word_{""}; - HighFrequencyLoopRequester high_freq_; - -#ifdef USE_ESP_ADF - vad_handle_t vad_instance_; - uint8_t vad_threshold_{5}; - uint8_t vad_counter_{0}; -#endif std::shared_ptr<RingBuffer> ring_buffer_; bool use_wake_word_; @@ -295,7 +281,6 @@ class VoiceAssistant : public Component { uint32_t conversation_timeout_; uint8_t *send_buffer_{nullptr}; - int16_t *input_buffer_{nullptr}; bool continuous_{false}; bool silence_detection_;