mirror of
https://github.com/esphome/esphome.git
synced 2025-07-28 14:16:40 +00:00
[voice_assistant] Use mic callback and remove esp_adf code (#8627)
Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
This commit is contained in:
parent
c9d1476ae0
commit
e49252ca3d
@ -94,8 +94,8 @@ CONFIG_SCHEMA = cv.All(
|
|||||||
media_player.MediaPlayer
|
media_player.MediaPlayer
|
||||||
),
|
),
|
||||||
cv.Optional(CONF_USE_WAKE_WORD, default=False): cv.boolean,
|
cv.Optional(CONF_USE_WAKE_WORD, default=False): cv.boolean,
|
||||||
cv.Optional(CONF_VAD_THRESHOLD): cv.All(
|
cv.Optional(CONF_VAD_THRESHOLD): cv.invalid(
|
||||||
cv.requires_component("esp_adf"), cv.only_with_esp_idf, cv.uint8_t
|
"VAD threshold is no longer supported, as it requires the deprecated esp_adf external component. Use an i2s_audio microphone/speaker instead. Additionally, you may need to configure the audio_adc and audio_dac components depending on your hardware."
|
||||||
),
|
),
|
||||||
cv.Optional(CONF_NOISE_SUPPRESSION_LEVEL, default=0): cv.int_range(0, 4),
|
cv.Optional(CONF_NOISE_SUPPRESSION_LEVEL, default=0): cv.int_range(0, 4),
|
||||||
cv.Optional(CONF_AUTO_GAIN, default="0dBFS"): cv.All(
|
cv.Optional(CONF_AUTO_GAIN, default="0dBFS"): cv.All(
|
||||||
|
@ -18,14 +18,25 @@ static const char *const TAG = "voice_assistant";
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
static const size_t SAMPLE_RATE_HZ = 16000;
|
static const size_t SAMPLE_RATE_HZ = 16000;
|
||||||
static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms
|
|
||||||
static const size_t BUFFER_SIZE = 512 * SAMPLE_RATE_HZ / 1000;
|
static const size_t RING_BUFFER_SAMPLES = 512 * SAMPLE_RATE_HZ / 1000; // 512 ms * 16 kHz/ 1000 ms
|
||||||
static const size_t SEND_BUFFER_SIZE = INPUT_BUFFER_SIZE * sizeof(int16_t);
|
static const size_t RING_BUFFER_SIZE = RING_BUFFER_SAMPLES * sizeof(int16_t);
|
||||||
|
static const size_t SEND_BUFFER_SAMPLES = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms
|
||||||
|
static const size_t SEND_BUFFER_SIZE = SEND_BUFFER_SAMPLES * sizeof(int16_t);
|
||||||
static const size_t RECEIVE_SIZE = 1024;
|
static const size_t RECEIVE_SIZE = 1024;
|
||||||
static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
|
static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
|
||||||
|
|
||||||
VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; }
|
VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; }
|
||||||
|
|
||||||
|
void VoiceAssistant::setup() {
|
||||||
|
this->mic_->add_data_callback([this](const std::vector<int16_t> &data) {
|
||||||
|
std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_;
|
||||||
|
if (this->ring_buffer_.use_count() > 1) {
|
||||||
|
temp_ring_buffer->write((void *) data.data(), data.size() * sizeof(int16_t));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }
|
float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }
|
||||||
|
|
||||||
bool VoiceAssistant::start_udp_socket_() {
|
bool VoiceAssistant::start_udp_socket_() {
|
||||||
@ -83,21 +94,8 @@ bool VoiceAssistant::allocate_buffers_() {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (this->input_buffer_ == nullptr) {
|
|
||||||
ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
|
|
||||||
this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
|
|
||||||
if (this->input_buffer_ == nullptr) {
|
|
||||||
ESP_LOGW(TAG, "Could not allocate input buffer");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef USE_ESP_ADF
|
|
||||||
this->vad_instance_ = vad_create(VAD_MODE_4);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (this->ring_buffer_.use_count() == 0) {
|
if (this->ring_buffer_.use_count() == 0) {
|
||||||
this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
|
this->ring_buffer_ = RingBuffer::create(RING_BUFFER_SIZE);
|
||||||
if (this->ring_buffer_.use_count() == 0) {
|
if (this->ring_buffer_.use_count() == 0) {
|
||||||
ESP_LOGE(TAG, "Could not allocate ring buffer");
|
ESP_LOGE(TAG, "Could not allocate ring buffer");
|
||||||
return false;
|
return false;
|
||||||
@ -121,10 +119,6 @@ void VoiceAssistant::clear_buffers_() {
|
|||||||
memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
|
memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this->input_buffer_ != nullptr) {
|
|
||||||
memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this->ring_buffer_ != nullptr) {
|
if (this->ring_buffer_ != nullptr) {
|
||||||
this->ring_buffer_->reset();
|
this->ring_buffer_->reset();
|
||||||
}
|
}
|
||||||
@ -151,19 +145,6 @@ void VoiceAssistant::deallocate_buffers_() {
|
|||||||
this->ring_buffer_.reset();
|
this->ring_buffer_.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_ESP_ADF
|
|
||||||
if (this->vad_instance_ != nullptr) {
|
|
||||||
vad_destroy(this->vad_instance_);
|
|
||||||
this->vad_instance_ = nullptr;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (this->input_buffer_ != nullptr) {
|
|
||||||
ExternalRAMAllocator<int16_t> input_deallocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
|
|
||||||
input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
|
|
||||||
this->input_buffer_ = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef USE_SPEAKER
|
#ifdef USE_SPEAKER
|
||||||
if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
|
if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
|
||||||
ExternalRAMAllocator<uint8_t> speaker_deallocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
|
ExternalRAMAllocator<uint8_t> speaker_deallocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
|
||||||
@ -178,22 +159,6 @@ void VoiceAssistant::reset_conversation_id() {
|
|||||||
ESP_LOGD(TAG, "reset conversation ID");
|
ESP_LOGD(TAG, "reset conversation ID");
|
||||||
}
|
}
|
||||||
|
|
||||||
int VoiceAssistant::read_microphone_() {
|
|
||||||
size_t bytes_read = 0;
|
|
||||||
if (this->mic_->is_running()) { // Read audio into input buffer
|
|
||||||
bytes_read = this->mic_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
|
|
||||||
if (bytes_read == 0) {
|
|
||||||
memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
// Write audio into ring buffer
|
|
||||||
this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
|
|
||||||
} else {
|
|
||||||
ESP_LOGD(TAG, "microphone not running");
|
|
||||||
}
|
|
||||||
return bytes_read;
|
|
||||||
}
|
|
||||||
|
|
||||||
void VoiceAssistant::loop() {
|
void VoiceAssistant::loop() {
|
||||||
if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
|
if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
|
||||||
this->state_ != State::STOPPING_MICROPHONE) {
|
this->state_ != State::STOPPING_MICROPHONE) {
|
||||||
@ -211,16 +176,8 @@ void VoiceAssistant::loop() {
|
|||||||
case State::IDLE: {
|
case State::IDLE: {
|
||||||
if (this->continuous_ && this->desired_state_ == State::IDLE) {
|
if (this->continuous_ && this->desired_state_ == State::IDLE) {
|
||||||
this->idle_trigger_->trigger();
|
this->idle_trigger_->trigger();
|
||||||
#ifdef USE_ESP_ADF
|
this->set_state_(State::START_MICROPHONE, State::START_PIPELINE);
|
||||||
if (this->use_wake_word_) {
|
|
||||||
this->set_state_(State::START_MICROPHONE, State::WAIT_FOR_VAD);
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
this->set_state_(State::START_MICROPHONE, State::START_PIPELINE);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
this->high_freq_.stop();
|
|
||||||
this->deallocate_buffers_();
|
this->deallocate_buffers_();
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -237,7 +194,6 @@ void VoiceAssistant::loop() {
|
|||||||
this->clear_buffers_();
|
this->clear_buffers_();
|
||||||
|
|
||||||
this->mic_->start();
|
this->mic_->start();
|
||||||
this->high_freq_.start();
|
|
||||||
this->set_state_(State::STARTING_MICROPHONE);
|
this->set_state_(State::STARTING_MICROPHONE);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -247,39 +203,7 @@ void VoiceAssistant::loop() {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#ifdef USE_ESP_ADF
|
|
||||||
case State::WAIT_FOR_VAD: {
|
|
||||||
this->read_microphone_();
|
|
||||||
ESP_LOGD(TAG, "Waiting for speech...");
|
|
||||||
this->set_state_(State::WAITING_FOR_VAD);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case State::WAITING_FOR_VAD: {
|
|
||||||
size_t bytes_read = this->read_microphone_();
|
|
||||||
if (bytes_read > 0) {
|
|
||||||
vad_state_t vad_state =
|
|
||||||
vad_process(this->vad_instance_, this->input_buffer_, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
|
|
||||||
if (vad_state == VAD_SPEECH) {
|
|
||||||
if (this->vad_counter_ < this->vad_threshold_) {
|
|
||||||
this->vad_counter_++;
|
|
||||||
} else {
|
|
||||||
ESP_LOGD(TAG, "VAD detected speech");
|
|
||||||
this->set_state_(State::START_PIPELINE, State::STREAMING_MICROPHONE);
|
|
||||||
|
|
||||||
// Reset for next time
|
|
||||||
this->vad_counter_ = 0;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (this->vad_counter_ > 0) {
|
|
||||||
this->vad_counter_--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
case State::START_PIPELINE: {
|
case State::START_PIPELINE: {
|
||||||
this->read_microphone_();
|
|
||||||
ESP_LOGD(TAG, "Requesting start...");
|
ESP_LOGD(TAG, "Requesting start...");
|
||||||
uint32_t flags = 0;
|
uint32_t flags = 0;
|
||||||
if (!this->continue_conversation_ && this->use_wake_word_)
|
if (!this->continue_conversation_ && this->use_wake_word_)
|
||||||
@ -312,11 +236,9 @@ void VoiceAssistant::loop() {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case State::STARTING_PIPELINE: {
|
case State::STARTING_PIPELINE: {
|
||||||
this->read_microphone_();
|
|
||||||
break; // State changed when udp server port received
|
break; // State changed when udp server port received
|
||||||
}
|
}
|
||||||
case State::STREAMING_MICROPHONE: {
|
case State::STREAMING_MICROPHONE: {
|
||||||
this->read_microphone_();
|
|
||||||
size_t available = this->ring_buffer_->available();
|
size_t available = this->ring_buffer_->available();
|
||||||
while (available >= SEND_BUFFER_SIZE) {
|
while (available >= SEND_BUFFER_SIZE) {
|
||||||
size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
|
size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
|
||||||
@ -603,14 +525,8 @@ void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
|
|||||||
if (this->state_ == State::IDLE) {
|
if (this->state_ == State::IDLE) {
|
||||||
this->continuous_ = continuous;
|
this->continuous_ = continuous;
|
||||||
this->silence_detection_ = silence_detection;
|
this->silence_detection_ = silence_detection;
|
||||||
#ifdef USE_ESP_ADF
|
|
||||||
if (this->use_wake_word_) {
|
this->set_state_(State::START_MICROPHONE, State::START_PIPELINE);
|
||||||
this->set_state_(State::START_MICROPHONE, State::WAIT_FOR_VAD);
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
this->set_state_(State::START_MICROPHONE, State::START_PIPELINE);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -785,15 +701,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
|
|||||||
this->set_state_(State::IDLE, State::IDLE);
|
this->set_state_(State::IDLE, State::IDLE);
|
||||||
} else if (this->state_ == State::STREAMING_MICROPHONE) {
|
} else if (this->state_ == State::STREAMING_MICROPHONE) {
|
||||||
this->ring_buffer_->reset();
|
this->ring_buffer_->reset();
|
||||||
#ifdef USE_ESP_ADF
|
this->set_state_(State::IDLE, State::IDLE);
|
||||||
if (this->use_wake_word_) {
|
|
||||||
// No need to stop the microphone since we didn't use the speaker
|
|
||||||
this->set_state_(State::WAIT_FOR_VAD, State::WAITING_FOR_VAD);
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
this->set_state_(State::IDLE, State::IDLE);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
this->defer([this]() { this->end_trigger_->trigger(); });
|
this->defer([this]() { this->end_trigger_->trigger(); });
|
||||||
break;
|
break;
|
||||||
|
@ -20,10 +20,6 @@
|
|||||||
#endif
|
#endif
|
||||||
#include "esphome/components/socket/socket.h"
|
#include "esphome/components/socket/socket.h"
|
||||||
|
|
||||||
#ifdef USE_ESP_ADF
|
|
||||||
#include <esp_vad.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@ -96,6 +92,7 @@ class VoiceAssistant : public Component {
|
|||||||
VoiceAssistant();
|
VoiceAssistant();
|
||||||
|
|
||||||
void loop() override;
|
void loop() override;
|
||||||
|
void setup() override;
|
||||||
float get_setup_priority() const override;
|
float get_setup_priority() const override;
|
||||||
void start_streaming();
|
void start_streaming();
|
||||||
void start_streaming(struct sockaddr_storage *addr, uint16_t port);
|
void start_streaming(struct sockaddr_storage *addr, uint16_t port);
|
||||||
@ -163,9 +160,6 @@ class VoiceAssistant : public Component {
|
|||||||
bool is_continuous() const { return this->continuous_; }
|
bool is_continuous() const { return this->continuous_; }
|
||||||
|
|
||||||
void set_use_wake_word(bool use_wake_word) { this->use_wake_word_ = use_wake_word; }
|
void set_use_wake_word(bool use_wake_word) { this->use_wake_word_ = use_wake_word; }
|
||||||
#ifdef USE_ESP_ADF
|
|
||||||
void set_vad_threshold(uint8_t vad_threshold) { this->vad_threshold_ = vad_threshold; }
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void set_noise_suppression_level(uint8_t noise_suppression_level) {
|
void set_noise_suppression_level(uint8_t noise_suppression_level) {
|
||||||
this->noise_suppression_level_ = noise_suppression_level;
|
this->noise_suppression_level_ = noise_suppression_level;
|
||||||
@ -214,7 +208,6 @@ class VoiceAssistant : public Component {
|
|||||||
void clear_buffers_();
|
void clear_buffers_();
|
||||||
void deallocate_buffers_();
|
void deallocate_buffers_();
|
||||||
|
|
||||||
int read_microphone_();
|
|
||||||
void set_state_(State state);
|
void set_state_(State state);
|
||||||
void set_state_(State state, State desired_state);
|
void set_state_(State state, State desired_state);
|
||||||
void signal_stop_();
|
void signal_stop_();
|
||||||
@ -279,13 +272,6 @@ class VoiceAssistant : public Component {
|
|||||||
|
|
||||||
std::string wake_word_{""};
|
std::string wake_word_{""};
|
||||||
|
|
||||||
HighFrequencyLoopRequester high_freq_;
|
|
||||||
|
|
||||||
#ifdef USE_ESP_ADF
|
|
||||||
vad_handle_t vad_instance_;
|
|
||||||
uint8_t vad_threshold_{5};
|
|
||||||
uint8_t vad_counter_{0};
|
|
||||||
#endif
|
|
||||||
std::shared_ptr<RingBuffer> ring_buffer_;
|
std::shared_ptr<RingBuffer> ring_buffer_;
|
||||||
|
|
||||||
bool use_wake_word_;
|
bool use_wake_word_;
|
||||||
@ -295,7 +281,6 @@ class VoiceAssistant : public Component {
|
|||||||
uint32_t conversation_timeout_;
|
uint32_t conversation_timeout_;
|
||||||
|
|
||||||
uint8_t *send_buffer_{nullptr};
|
uint8_t *send_buffer_{nullptr};
|
||||||
int16_t *input_buffer_{nullptr};
|
|
||||||
|
|
||||||
bool continuous_{false};
|
bool continuous_{false};
|
||||||
bool silence_detection_;
|
bool silence_detection_;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user