[voice_assistant] support start/continue conversation and deallocate buffers (#8610)

This commit is contained in:
Kevin Ahrendt 2025-04-27 18:30:21 -05:00 committed by GitHub
parent ee646d7324
commit c9d1476ae0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 104 additions and 33 deletions

View File

@ -72,12 +72,8 @@ bool VoiceAssistant::start_udp_socket_() {
}
bool VoiceAssistant::allocate_buffers_() {
if (this->send_buffer_ != nullptr) {
return true; // Already allocated
}
#ifdef USE_SPEAKER
if (this->speaker_ != nullptr) {
if ((this->speaker_ != nullptr) && (this->speaker_buffer_ == nullptr)) {
ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
if (this->speaker_buffer_ == nullptr) {
@ -87,28 +83,34 @@ bool VoiceAssistant::allocate_buffers_() {
}
#endif
ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
if (this->input_buffer_ == nullptr) {
ESP_LOGW(TAG, "Could not allocate input buffer");
return false;
ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
if (this->input_buffer_ == nullptr) {
ESP_LOGW(TAG, "Could not allocate input buffer");
return false;
}
}
#ifdef USE_ESP_ADF
this->vad_instance_ = vad_create(VAD_MODE_4);
#endif
this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
if (this->ring_buffer_ == nullptr) {
ESP_LOGW(TAG, "Could not allocate ring buffer");
return false;
if (this->ring_buffer_.use_count() == 0) {
this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
if (this->ring_buffer_.use_count() == 0) {
ESP_LOGE(TAG, "Could not allocate ring buffer");
return false;
}
}
ExternalRAMAllocator<uint8_t> send_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
if (send_buffer_ == nullptr) {
ESP_LOGW(TAG, "Could not allocate send buffer");
return false;
if (this->send_buffer_ == nullptr) {
ExternalRAMAllocator<uint8_t> send_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
if (send_buffer_ == nullptr) {
ESP_LOGW(TAG, "Could not allocate send buffer");
return false;
}
}
return true;
@ -139,13 +141,14 @@ void VoiceAssistant::clear_buffers_() {
}
void VoiceAssistant::deallocate_buffers_() {
ExternalRAMAllocator<uint8_t> send_deallocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
this->send_buffer_ = nullptr;
if (this->send_buffer_ != nullptr) {
ExternalRAMAllocator<uint8_t> send_deallocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
this->send_buffer_ = nullptr;
}
if (this->ring_buffer_ != nullptr) {
if (this->ring_buffer_.use_count() > 0) {
this->ring_buffer_.reset();
this->ring_buffer_ = nullptr;
}
#ifdef USE_ESP_ADF
@ -155,9 +158,11 @@ void VoiceAssistant::deallocate_buffers_() {
}
#endif
ExternalRAMAllocator<int16_t> input_deallocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
this->input_buffer_ = nullptr;
if (this->input_buffer_ != nullptr) {
ExternalRAMAllocator<int16_t> input_deallocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
this->input_buffer_ = nullptr;
}
#ifdef USE_SPEAKER
if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
@ -216,6 +221,7 @@ void VoiceAssistant::loop() {
}
} else {
this->high_freq_.stop();
this->deallocate_buffers_();
}
break;
}
@ -276,7 +282,7 @@ void VoiceAssistant::loop() {
this->read_microphone_();
ESP_LOGD(TAG, "Requesting start...");
uint32_t flags = 0;
if (this->use_wake_word_)
if (!this->continue_conversation_ && this->use_wake_word_)
flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD;
if (this->silence_detection_)
flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_VAD;
@ -387,6 +393,25 @@ void VoiceAssistant::loop() {
#ifdef USE_MEDIA_PLAYER
if (this->media_player_ != nullptr) {
playing = (this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_ANNOUNCING);
if (playing && this->media_player_wait_for_announcement_start_) {
// Announcement has started playing, wait for it to finish
this->media_player_wait_for_announcement_start_ = false;
this->media_player_wait_for_announcement_end_ = true;
}
if (!playing && this->media_player_wait_for_announcement_end_) {
// Announcement has finished playing
this->media_player_wait_for_announcement_end_ = false;
this->cancel_timeout("playing");
ESP_LOGD(TAG, "Announcement finished playing");
this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
api::VoiceAssistantAnnounceFinished msg;
msg.success = true;
this->api_client_->send_voice_assistant_announce_finished(msg);
break;
}
}
#endif
if (playing) {
@ -417,7 +442,11 @@ void VoiceAssistant::loop() {
this->tts_stream_end_trigger_->trigger();
}
#endif
this->set_state_(State::IDLE, State::IDLE);
if (this->continue_conversation_) {
this->set_state_(State::START_MICROPHONE, State::START_PIPELINE);
} else {
this->set_state_(State::IDLE, State::IDLE);
}
break;
}
default:
@ -587,6 +616,7 @@ void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
void VoiceAssistant::request_stop() {
this->continuous_ = false;
this->continue_conversation_ = false;
switch (this->state_) {
case State::IDLE:
@ -611,6 +641,16 @@ void VoiceAssistant::request_stop() {
this->signal_stop_();
break;
case State::STREAMING_RESPONSE:
#ifdef USE_MEDIA_PLAYER
// Stop any ongoing media player announcement
if (this->media_player_ != nullptr) {
this->media_player_->make_call()
.set_command(media_player::MEDIA_PLAYER_COMMAND_STOP)
.set_announcement(true)
.perform();
}
#endif
break;
case State::RESPONSE_FINISHED:
break; // Let the incoming audio stream finish then it will go to idle.
}
@ -628,9 +668,9 @@ void VoiceAssistant::signal_stop_() {
}
void VoiceAssistant::start_playback_timeout_() {
this->set_timeout("playing", 100, [this]() {
this->set_timeout("playing", 2000, [this]() {
this->cancel_timeout("speaker-timeout");
this->set_state_(State::IDLE, State::IDLE);
this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
api::VoiceAssistantAnnounceFinished msg;
msg.success = true;
@ -679,6 +719,8 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
for (auto arg : msg.data) {
if (arg.name == "conversation_id") {
this->conversation_id_ = std::move(arg.value);
} else if (arg.name == "continue_conversation") {
this->continue_conversation_ = (arg.value == "1");
}
}
this->defer([this]() { this->intent_end_trigger_->trigger(); });
@ -722,6 +764,9 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
#ifdef USE_MEDIA_PLAYER
if (this->media_player_ != nullptr) {
this->media_player_->make_call().set_media_url(url).set_announcement(true).perform();
this->media_player_wait_for_announcement_start_ = true;
this->media_player_wait_for_announcement_end_ = false;
// Start the playback timeout, as the media player state isn't immediately updated
this->start_playback_timeout_();
}
@ -888,8 +933,28 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg)
#ifdef USE_MEDIA_PLAYER
if (this->media_player_ != nullptr) {
this->tts_start_trigger_->trigger(msg.text);
this->media_player_->make_call().set_media_url(msg.media_id).set_announcement(true).perform();
this->set_state_(State::STREAMING_RESPONSE, State::STREAMING_RESPONSE);
if (!msg.preannounce_media_id.empty()) {
this->media_player_->make_call().set_media_url(msg.preannounce_media_id).set_announcement(true).perform();
}
// Enqueueing a URL with an empty playlist will still play the file immediately
this->media_player_->make_call()
.set_command(media_player::MEDIA_PLAYER_COMMAND_ENQUEUE)
.set_media_url(msg.media_id)
.set_announcement(true)
.perform();
this->continue_conversation_ = msg.start_conversation;
this->media_player_wait_for_announcement_start_ = true;
this->media_player_wait_for_announcement_end_ = false;
// Start the playback timeout, as the media player state isn't immediately updated
this->start_playback_timeout_();
if (this->continuous_) {
this->set_state_(State::STOP_MICROPHONE, State::STREAMING_RESPONSE);
} else {
this->set_state_(State::STREAMING_RESPONSE, State::STREAMING_RESPONSE);
}
this->tts_end_trigger_->trigger(msg.media_id);
this->end_trigger_->trigger();
}

View File

@ -41,6 +41,7 @@ enum VoiceAssistantFeature : uint32_t {
FEATURE_API_AUDIO = 1 << 2,
FEATURE_TIMERS = 1 << 3,
FEATURE_ANNOUNCE = 1 << 4,
FEATURE_START_CONVERSATION = 1 << 5,
};
enum class State {
@ -140,6 +141,7 @@ class VoiceAssistant : public Component {
#ifdef USE_MEDIA_PLAYER
if (this->media_player_ != nullptr) {
flags |= VoiceAssistantFeature::FEATURE_ANNOUNCE;
flags |= VoiceAssistantFeature::FEATURE_START_CONVERSATION;
}
#endif
@ -267,6 +269,8 @@ class VoiceAssistant : public Component {
#endif
#ifdef USE_MEDIA_PLAYER
media_player::MediaPlayer *media_player_{nullptr};
bool media_player_wait_for_announcement_start_{false};
bool media_player_wait_for_announcement_end_{false};
#endif
bool local_output_{false};
@ -282,7 +286,7 @@ class VoiceAssistant : public Component {
uint8_t vad_threshold_{5};
uint8_t vad_counter_{0};
#endif
std::unique_ptr<RingBuffer> ring_buffer_;
std::shared_ptr<RingBuffer> ring_buffer_;
bool use_wake_word_;
uint8_t noise_suppression_level_;
@ -296,6 +300,8 @@ class VoiceAssistant : public Component {
bool continuous_{false};
bool silence_detection_;
bool continue_conversation_{false};
State state_{State::IDLE};
State desired_state_{State::IDLE};