mirror of
https://github.com/esphome/esphome.git
synced 2025-07-28 22:26:36 +00:00
[voice_assisant] support start/continue conversation and deallocate buffers (#8610)
This commit is contained in:
parent
ee646d7324
commit
c9d1476ae0
@ -72,12 +72,8 @@ bool VoiceAssistant::start_udp_socket_() {
|
||||
}
|
||||
|
||||
bool VoiceAssistant::allocate_buffers_() {
|
||||
if (this->send_buffer_ != nullptr) {
|
||||
return true; // Already allocated
|
||||
}
|
||||
|
||||
#ifdef USE_SPEAKER
|
||||
if (this->speaker_ != nullptr) {
|
||||
if ((this->speaker_ != nullptr) && (this->speaker_buffer_ == nullptr)) {
|
||||
ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
|
||||
this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
|
||||
if (this->speaker_buffer_ == nullptr) {
|
||||
@ -87,28 +83,34 @@ bool VoiceAssistant::allocate_buffers_() {
|
||||
}
|
||||
#endif
|
||||
|
||||
ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
|
||||
this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
|
||||
if (this->input_buffer_ == nullptr) {
|
||||
ESP_LOGW(TAG, "Could not allocate input buffer");
|
||||
return false;
|
||||
ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
|
||||
this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
|
||||
if (this->input_buffer_ == nullptr) {
|
||||
ESP_LOGW(TAG, "Could not allocate input buffer");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef USE_ESP_ADF
|
||||
this->vad_instance_ = vad_create(VAD_MODE_4);
|
||||
#endif
|
||||
|
||||
this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
|
||||
if (this->ring_buffer_ == nullptr) {
|
||||
ESP_LOGW(TAG, "Could not allocate ring buffer");
|
||||
return false;
|
||||
if (this->ring_buffer_.use_count() == 0) {
|
||||
this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
|
||||
if (this->ring_buffer_.use_count() == 0) {
|
||||
ESP_LOGE(TAG, "Could not allocate ring buffer");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
ExternalRAMAllocator<uint8_t> send_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
|
||||
this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
|
||||
if (send_buffer_ == nullptr) {
|
||||
ESP_LOGW(TAG, "Could not allocate send buffer");
|
||||
return false;
|
||||
if (this->send_buffer_ == nullptr) {
|
||||
ExternalRAMAllocator<uint8_t> send_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
|
||||
this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
|
||||
if (send_buffer_ == nullptr) {
|
||||
ESP_LOGW(TAG, "Could not allocate send buffer");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -139,13 +141,14 @@ void VoiceAssistant::clear_buffers_() {
|
||||
}
|
||||
|
||||
void VoiceAssistant::deallocate_buffers_() {
|
||||
ExternalRAMAllocator<uint8_t> send_deallocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
|
||||
send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
|
||||
this->send_buffer_ = nullptr;
|
||||
if (this->send_buffer_ != nullptr) {
|
||||
ExternalRAMAllocator<uint8_t> send_deallocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
|
||||
send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
|
||||
this->send_buffer_ = nullptr;
|
||||
}
|
||||
|
||||
if (this->ring_buffer_ != nullptr) {
|
||||
if (this->ring_buffer_.use_count() > 0) {
|
||||
this->ring_buffer_.reset();
|
||||
this->ring_buffer_ = nullptr;
|
||||
}
|
||||
|
||||
#ifdef USE_ESP_ADF
|
||||
@ -155,9 +158,11 @@ void VoiceAssistant::deallocate_buffers_() {
|
||||
}
|
||||
#endif
|
||||
|
||||
ExternalRAMAllocator<int16_t> input_deallocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
|
||||
input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
|
||||
this->input_buffer_ = nullptr;
|
||||
if (this->input_buffer_ != nullptr) {
|
||||
ExternalRAMAllocator<int16_t> input_deallocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
|
||||
input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
|
||||
this->input_buffer_ = nullptr;
|
||||
}
|
||||
|
||||
#ifdef USE_SPEAKER
|
||||
if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
|
||||
@ -216,6 +221,7 @@ void VoiceAssistant::loop() {
|
||||
}
|
||||
} else {
|
||||
this->high_freq_.stop();
|
||||
this->deallocate_buffers_();
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -276,7 +282,7 @@ void VoiceAssistant::loop() {
|
||||
this->read_microphone_();
|
||||
ESP_LOGD(TAG, "Requesting start...");
|
||||
uint32_t flags = 0;
|
||||
if (this->use_wake_word_)
|
||||
if (!this->continue_conversation_ && this->use_wake_word_)
|
||||
flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD;
|
||||
if (this->silence_detection_)
|
||||
flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_VAD;
|
||||
@ -387,6 +393,25 @@ void VoiceAssistant::loop() {
|
||||
#ifdef USE_MEDIA_PLAYER
|
||||
if (this->media_player_ != nullptr) {
|
||||
playing = (this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_ANNOUNCING);
|
||||
|
||||
if (playing && this->media_player_wait_for_announcement_start_) {
|
||||
// Announcement has started playing, wait for it to finish
|
||||
this->media_player_wait_for_announcement_start_ = false;
|
||||
this->media_player_wait_for_announcement_end_ = true;
|
||||
}
|
||||
|
||||
if (!playing && this->media_player_wait_for_announcement_end_) {
|
||||
// Announcement has finished playing
|
||||
this->media_player_wait_for_announcement_end_ = false;
|
||||
this->cancel_timeout("playing");
|
||||
ESP_LOGD(TAG, "Announcement finished playing");
|
||||
this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
|
||||
|
||||
api::VoiceAssistantAnnounceFinished msg;
|
||||
msg.success = true;
|
||||
this->api_client_->send_voice_assistant_announce_finished(msg);
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (playing) {
|
||||
@ -417,7 +442,11 @@ void VoiceAssistant::loop() {
|
||||
this->tts_stream_end_trigger_->trigger();
|
||||
}
|
||||
#endif
|
||||
this->set_state_(State::IDLE, State::IDLE);
|
||||
if (this->continue_conversation_) {
|
||||
this->set_state_(State::START_MICROPHONE, State::START_PIPELINE);
|
||||
} else {
|
||||
this->set_state_(State::IDLE, State::IDLE);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@ -587,6 +616,7 @@ void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
|
||||
|
||||
void VoiceAssistant::request_stop() {
|
||||
this->continuous_ = false;
|
||||
this->continue_conversation_ = false;
|
||||
|
||||
switch (this->state_) {
|
||||
case State::IDLE:
|
||||
@ -611,6 +641,16 @@ void VoiceAssistant::request_stop() {
|
||||
this->signal_stop_();
|
||||
break;
|
||||
case State::STREAMING_RESPONSE:
|
||||
#ifdef USE_MEDIA_PLAYER
|
||||
// Stop any ongoing media player announcement
|
||||
if (this->media_player_ != nullptr) {
|
||||
this->media_player_->make_call()
|
||||
.set_command(media_player::MEDIA_PLAYER_COMMAND_STOP)
|
||||
.set_announcement(true)
|
||||
.perform();
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
case State::RESPONSE_FINISHED:
|
||||
break; // Let the incoming audio stream finish then it will go to idle.
|
||||
}
|
||||
@ -628,9 +668,9 @@ void VoiceAssistant::signal_stop_() {
|
||||
}
|
||||
|
||||
void VoiceAssistant::start_playback_timeout_() {
|
||||
this->set_timeout("playing", 100, [this]() {
|
||||
this->set_timeout("playing", 2000, [this]() {
|
||||
this->cancel_timeout("speaker-timeout");
|
||||
this->set_state_(State::IDLE, State::IDLE);
|
||||
this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
|
||||
|
||||
api::VoiceAssistantAnnounceFinished msg;
|
||||
msg.success = true;
|
||||
@ -679,6 +719,8 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
|
||||
for (auto arg : msg.data) {
|
||||
if (arg.name == "conversation_id") {
|
||||
this->conversation_id_ = std::move(arg.value);
|
||||
} else if (arg.name == "continue_conversation") {
|
||||
this->continue_conversation_ = (arg.value == "1");
|
||||
}
|
||||
}
|
||||
this->defer([this]() { this->intent_end_trigger_->trigger(); });
|
||||
@ -722,6 +764,9 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
|
||||
#ifdef USE_MEDIA_PLAYER
|
||||
if (this->media_player_ != nullptr) {
|
||||
this->media_player_->make_call().set_media_url(url).set_announcement(true).perform();
|
||||
|
||||
this->media_player_wait_for_announcement_start_ = true;
|
||||
this->media_player_wait_for_announcement_end_ = false;
|
||||
// Start the playback timeout, as the media player state isn't immediately updated
|
||||
this->start_playback_timeout_();
|
||||
}
|
||||
@ -888,8 +933,28 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg)
|
||||
#ifdef USE_MEDIA_PLAYER
|
||||
if (this->media_player_ != nullptr) {
|
||||
this->tts_start_trigger_->trigger(msg.text);
|
||||
this->media_player_->make_call().set_media_url(msg.media_id).set_announcement(true).perform();
|
||||
this->set_state_(State::STREAMING_RESPONSE, State::STREAMING_RESPONSE);
|
||||
if (!msg.preannounce_media_id.empty()) {
|
||||
this->media_player_->make_call().set_media_url(msg.preannounce_media_id).set_announcement(true).perform();
|
||||
}
|
||||
// Enqueueing a URL with an empty playlist will still play the file immediately
|
||||
this->media_player_->make_call()
|
||||
.set_command(media_player::MEDIA_PLAYER_COMMAND_ENQUEUE)
|
||||
.set_media_url(msg.media_id)
|
||||
.set_announcement(true)
|
||||
.perform();
|
||||
this->continue_conversation_ = msg.start_conversation;
|
||||
|
||||
this->media_player_wait_for_announcement_start_ = true;
|
||||
this->media_player_wait_for_announcement_end_ = false;
|
||||
// Start the playback timeout, as the media player state isn't immediately updated
|
||||
this->start_playback_timeout_();
|
||||
|
||||
if (this->continuous_) {
|
||||
this->set_state_(State::STOP_MICROPHONE, State::STREAMING_RESPONSE);
|
||||
} else {
|
||||
this->set_state_(State::STREAMING_RESPONSE, State::STREAMING_RESPONSE);
|
||||
}
|
||||
|
||||
this->tts_end_trigger_->trigger(msg.media_id);
|
||||
this->end_trigger_->trigger();
|
||||
}
|
||||
|
@ -41,6 +41,7 @@ enum VoiceAssistantFeature : uint32_t {
|
||||
FEATURE_API_AUDIO = 1 << 2,
|
||||
FEATURE_TIMERS = 1 << 3,
|
||||
FEATURE_ANNOUNCE = 1 << 4,
|
||||
FEATURE_START_CONVERSATION = 1 << 5,
|
||||
};
|
||||
|
||||
enum class State {
|
||||
@ -140,6 +141,7 @@ class VoiceAssistant : public Component {
|
||||
#ifdef USE_MEDIA_PLAYER
|
||||
if (this->media_player_ != nullptr) {
|
||||
flags |= VoiceAssistantFeature::FEATURE_ANNOUNCE;
|
||||
flags |= VoiceAssistantFeature::FEATURE_START_CONVERSATION;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -267,6 +269,8 @@ class VoiceAssistant : public Component {
|
||||
#endif
|
||||
#ifdef USE_MEDIA_PLAYER
|
||||
media_player::MediaPlayer *media_player_{nullptr};
|
||||
bool media_player_wait_for_announcement_start_{false};
|
||||
bool media_player_wait_for_announcement_end_{false};
|
||||
#endif
|
||||
|
||||
bool local_output_{false};
|
||||
@ -282,7 +286,7 @@ class VoiceAssistant : public Component {
|
||||
uint8_t vad_threshold_{5};
|
||||
uint8_t vad_counter_{0};
|
||||
#endif
|
||||
std::unique_ptr<RingBuffer> ring_buffer_;
|
||||
std::shared_ptr<RingBuffer> ring_buffer_;
|
||||
|
||||
bool use_wake_word_;
|
||||
uint8_t noise_suppression_level_;
|
||||
@ -296,6 +300,8 @@ class VoiceAssistant : public Component {
|
||||
bool continuous_{false};
|
||||
bool silence_detection_;
|
||||
|
||||
bool continue_conversation_{false};
|
||||
|
||||
State state_{State::IDLE};
|
||||
State desired_state_{State::IDLE};
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user