mirror of
https://github.com/esphome/esphome.git
synced 2025-07-28 14:16:40 +00:00
[voice_assistant] Support streaming TTS responses and fixes crash for long responses (#9224)
This commit is contained in:
parent
068aa0ff1e
commit
9f3f4ead4f
@ -490,7 +490,7 @@ esphome/components/vbus/* @ssieb
|
|||||||
esphome/components/veml3235/* @kbx81
|
esphome/components/veml3235/* @kbx81
|
||||||
esphome/components/veml7700/* @latonita
|
esphome/components/veml7700/* @latonita
|
||||||
esphome/components/version/* @esphome/core
|
esphome/components/version/* @esphome/core
|
||||||
esphome/components/voice_assistant/* @jesserockz
|
esphome/components/voice_assistant/* @jesserockz @kahrendt
|
||||||
esphome/components/wake_on_lan/* @clydebarrow @willwill2will54
|
esphome/components/wake_on_lan/* @clydebarrow @willwill2will54
|
||||||
esphome/components/watchdog/* @oarcher
|
esphome/components/watchdog/* @oarcher
|
||||||
esphome/components/waveshare_epaper/* @clydebarrow
|
esphome/components/waveshare_epaper/* @clydebarrow
|
||||||
|
@ -17,10 +17,11 @@ from esphome.const import (
|
|||||||
AUTO_LOAD = ["socket"]
|
AUTO_LOAD = ["socket"]
|
||||||
DEPENDENCIES = ["api", "microphone"]
|
DEPENDENCIES = ["api", "microphone"]
|
||||||
|
|
||||||
CODEOWNERS = ["@jesserockz"]
|
CODEOWNERS = ["@jesserockz", "@kahrendt"]
|
||||||
|
|
||||||
CONF_ON_END = "on_end"
|
CONF_ON_END = "on_end"
|
||||||
CONF_ON_INTENT_END = "on_intent_end"
|
CONF_ON_INTENT_END = "on_intent_end"
|
||||||
|
CONF_ON_INTENT_PROGRESS = "on_intent_progress"
|
||||||
CONF_ON_INTENT_START = "on_intent_start"
|
CONF_ON_INTENT_START = "on_intent_start"
|
||||||
CONF_ON_LISTENING = "on_listening"
|
CONF_ON_LISTENING = "on_listening"
|
||||||
CONF_ON_START = "on_start"
|
CONF_ON_START = "on_start"
|
||||||
@ -136,6 +137,9 @@ CONFIG_SCHEMA = cv.All(
|
|||||||
cv.Optional(CONF_ON_INTENT_START): automation.validate_automation(
|
cv.Optional(CONF_ON_INTENT_START): automation.validate_automation(
|
||||||
single=True
|
single=True
|
||||||
),
|
),
|
||||||
|
cv.Optional(CONF_ON_INTENT_PROGRESS): automation.validate_automation(
|
||||||
|
single=True
|
||||||
|
),
|
||||||
cv.Optional(CONF_ON_INTENT_END): automation.validate_automation(
|
cv.Optional(CONF_ON_INTENT_END): automation.validate_automation(
|
||||||
single=True
|
single=True
|
||||||
),
|
),
|
||||||
@ -282,6 +286,13 @@ async def to_code(config):
|
|||||||
config[CONF_ON_INTENT_START],
|
config[CONF_ON_INTENT_START],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if CONF_ON_INTENT_PROGRESS in config:
|
||||||
|
await automation.build_automation(
|
||||||
|
var.get_intent_progress_trigger(),
|
||||||
|
[(cg.std_string, "x")],
|
||||||
|
config[CONF_ON_INTENT_PROGRESS],
|
||||||
|
)
|
||||||
|
|
||||||
if CONF_ON_INTENT_END in config:
|
if CONF_ON_INTENT_END in config:
|
||||||
await automation.build_automation(
|
await automation.build_automation(
|
||||||
var.get_intent_end_trigger(),
|
var.get_intent_end_trigger(),
|
||||||
|
@ -555,7 +555,7 @@ void VoiceAssistant::request_stop() {
|
|||||||
break;
|
break;
|
||||||
case State::AWAITING_RESPONSE:
|
case State::AWAITING_RESPONSE:
|
||||||
this->signal_stop_();
|
this->signal_stop_();
|
||||||
break;
|
// Fallthrough intended to stop a streaming TTS announcement that has potentially started
|
||||||
case State::STREAMING_RESPONSE:
|
case State::STREAMING_RESPONSE:
|
||||||
#ifdef USE_MEDIA_PLAYER
|
#ifdef USE_MEDIA_PLAYER
|
||||||
// Stop any ongoing media player announcement
|
// Stop any ongoing media player announcement
|
||||||
@ -599,6 +599,14 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
|
|||||||
switch (msg.event_type) {
|
switch (msg.event_type) {
|
||||||
case api::enums::VOICE_ASSISTANT_RUN_START:
|
case api::enums::VOICE_ASSISTANT_RUN_START:
|
||||||
ESP_LOGD(TAG, "Assist Pipeline running");
|
ESP_LOGD(TAG, "Assist Pipeline running");
|
||||||
|
#ifdef USE_MEDIA_PLAYER
|
||||||
|
this->started_streaming_tts_ = false;
|
||||||
|
for (auto arg : msg.data) {
|
||||||
|
if (arg.name == "url") {
|
||||||
|
this->tts_response_url_ = std::move(arg.value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
this->defer([this]() { this->start_trigger_->trigger(); });
|
this->defer([this]() { this->start_trigger_->trigger(); });
|
||||||
break;
|
break;
|
||||||
case api::enums::VOICE_ASSISTANT_WAKE_WORD_START:
|
case api::enums::VOICE_ASSISTANT_WAKE_WORD_START:
|
||||||
@ -622,6 +630,8 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
|
|||||||
if (text.empty()) {
|
if (text.empty()) {
|
||||||
ESP_LOGW(TAG, "No text in STT_END event");
|
ESP_LOGW(TAG, "No text in STT_END event");
|
||||||
return;
|
return;
|
||||||
|
} else if (text.length() > 500) {
|
||||||
|
text = text.substr(0, 497) + "...";
|
||||||
}
|
}
|
||||||
ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
|
ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
|
||||||
this->defer([this, text]() { this->stt_end_trigger_->trigger(text); });
|
this->defer([this, text]() { this->stt_end_trigger_->trigger(text); });
|
||||||
@ -631,6 +641,27 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
|
|||||||
ESP_LOGD(TAG, "Intent started");
|
ESP_LOGD(TAG, "Intent started");
|
||||||
this->defer([this]() { this->intent_start_trigger_->trigger(); });
|
this->defer([this]() { this->intent_start_trigger_->trigger(); });
|
||||||
break;
|
break;
|
||||||
|
case api::enums::VOICE_ASSISTANT_INTENT_PROGRESS: {
|
||||||
|
ESP_LOGD(TAG, "Intent progress");
|
||||||
|
std::string tts_url_for_trigger = "";
|
||||||
|
#ifdef USE_MEDIA_PLAYER
|
||||||
|
if (this->media_player_ != nullptr) {
|
||||||
|
for (const auto &arg : msg.data) {
|
||||||
|
if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) {
|
||||||
|
this->media_player_->make_call().set_media_url(this->tts_response_url_).set_announcement(true).perform();
|
||||||
|
|
||||||
|
this->media_player_wait_for_announcement_start_ = true;
|
||||||
|
this->media_player_wait_for_announcement_end_ = false;
|
||||||
|
this->started_streaming_tts_ = true;
|
||||||
|
tts_url_for_trigger = this->tts_response_url_;
|
||||||
|
this->tts_response_url_.clear(); // Reset streaming URL
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
this->defer([this, tts_url_for_trigger]() { this->intent_progress_trigger_->trigger(tts_url_for_trigger); });
|
||||||
|
break;
|
||||||
|
}
|
||||||
case api::enums::VOICE_ASSISTANT_INTENT_END: {
|
case api::enums::VOICE_ASSISTANT_INTENT_END: {
|
||||||
for (auto arg : msg.data) {
|
for (auto arg : msg.data) {
|
||||||
if (arg.name == "conversation_id") {
|
if (arg.name == "conversation_id") {
|
||||||
@ -653,6 +684,9 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
|
|||||||
ESP_LOGW(TAG, "No text in TTS_START event");
|
ESP_LOGW(TAG, "No text in TTS_START event");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (text.length() > 500) {
|
||||||
|
text = text.substr(0, 497) + "...";
|
||||||
|
}
|
||||||
ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
|
ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
|
||||||
this->defer([this, text]() {
|
this->defer([this, text]() {
|
||||||
this->tts_start_trigger_->trigger(text);
|
this->tts_start_trigger_->trigger(text);
|
||||||
@ -678,7 +712,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
|
|||||||
ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
|
ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
|
||||||
this->defer([this, url]() {
|
this->defer([this, url]() {
|
||||||
#ifdef USE_MEDIA_PLAYER
|
#ifdef USE_MEDIA_PLAYER
|
||||||
if (this->media_player_ != nullptr) {
|
if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) {
|
||||||
this->media_player_->make_call().set_media_url(url).set_announcement(true).perform();
|
this->media_player_->make_call().set_media_url(url).set_announcement(true).perform();
|
||||||
|
|
||||||
this->media_player_wait_for_announcement_start_ = true;
|
this->media_player_wait_for_announcement_start_ = true;
|
||||||
|
@ -177,6 +177,7 @@ class VoiceAssistant : public Component {
|
|||||||
|
|
||||||
Trigger<> *get_intent_end_trigger() const { return this->intent_end_trigger_; }
|
Trigger<> *get_intent_end_trigger() const { return this->intent_end_trigger_; }
|
||||||
Trigger<> *get_intent_start_trigger() const { return this->intent_start_trigger_; }
|
Trigger<> *get_intent_start_trigger() const { return this->intent_start_trigger_; }
|
||||||
|
Trigger<std::string> *get_intent_progress_trigger() const { return this->intent_progress_trigger_; }
|
||||||
Trigger<> *get_listening_trigger() const { return this->listening_trigger_; }
|
Trigger<> *get_listening_trigger() const { return this->listening_trigger_; }
|
||||||
Trigger<> *get_end_trigger() const { return this->end_trigger_; }
|
Trigger<> *get_end_trigger() const { return this->end_trigger_; }
|
||||||
Trigger<> *get_start_trigger() const { return this->start_trigger_; }
|
Trigger<> *get_start_trigger() const { return this->start_trigger_; }
|
||||||
@ -233,6 +234,7 @@ class VoiceAssistant : public Component {
|
|||||||
Trigger<> *tts_stream_start_trigger_ = new Trigger<>();
|
Trigger<> *tts_stream_start_trigger_ = new Trigger<>();
|
||||||
Trigger<> *tts_stream_end_trigger_ = new Trigger<>();
|
Trigger<> *tts_stream_end_trigger_ = new Trigger<>();
|
||||||
#endif
|
#endif
|
||||||
|
Trigger<std::string> *intent_progress_trigger_ = new Trigger<std::string>();
|
||||||
Trigger<> *wake_word_detected_trigger_ = new Trigger<>();
|
Trigger<> *wake_word_detected_trigger_ = new Trigger<>();
|
||||||
Trigger<std::string> *stt_end_trigger_ = new Trigger<std::string>();
|
Trigger<std::string> *stt_end_trigger_ = new Trigger<std::string>();
|
||||||
Trigger<std::string> *tts_end_trigger_ = new Trigger<std::string>();
|
Trigger<std::string> *tts_end_trigger_ = new Trigger<std::string>();
|
||||||
@ -268,6 +270,8 @@ class VoiceAssistant : public Component {
|
|||||||
#endif
|
#endif
|
||||||
#ifdef USE_MEDIA_PLAYER
|
#ifdef USE_MEDIA_PLAYER
|
||||||
media_player::MediaPlayer *media_player_{nullptr};
|
media_player::MediaPlayer *media_player_{nullptr};
|
||||||
|
std::string tts_response_url_{""};
|
||||||
|
bool started_streaming_tts_{false};
|
||||||
bool media_player_wait_for_announcement_start_{false};
|
bool media_player_wait_for_announcement_start_{false};
|
||||||
bool media_player_wait_for_announcement_end_{false};
|
bool media_player_wait_for_announcement_end_{false};
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user