mirror of
				https://github.com/esphome/esphome.git
				synced 2025-10-30 22:53:59 +00:00 
			
		
		
		
	[voice_assistant] Support streaming TTS responses and fix crash for long responses (#9224)
This commit is contained in:
		| @@ -491,7 +491,7 @@ esphome/components/vbus/* @ssieb | |||||||
| esphome/components/veml3235/* @kbx81 | esphome/components/veml3235/* @kbx81 | ||||||
| esphome/components/veml7700/* @latonita | esphome/components/veml7700/* @latonita | ||||||
| esphome/components/version/* @esphome/core | esphome/components/version/* @esphome/core | ||||||
| esphome/components/voice_assistant/* @jesserockz | esphome/components/voice_assistant/* @jesserockz @kahrendt | ||||||
| esphome/components/wake_on_lan/* @clydebarrow @willwill2will54 | esphome/components/wake_on_lan/* @clydebarrow @willwill2will54 | ||||||
| esphome/components/watchdog/* @oarcher | esphome/components/watchdog/* @oarcher | ||||||
| esphome/components/waveshare_epaper/* @clydebarrow | esphome/components/waveshare_epaper/* @clydebarrow | ||||||
|   | |||||||
| @@ -17,10 +17,11 @@ from esphome.const import ( | |||||||
| AUTO_LOAD = ["socket"] | AUTO_LOAD = ["socket"] | ||||||
| DEPENDENCIES = ["api", "microphone"] | DEPENDENCIES = ["api", "microphone"] | ||||||
|  |  | ||||||
| CODEOWNERS = ["@jesserockz"] | CODEOWNERS = ["@jesserockz", "@kahrendt"] | ||||||
|  |  | ||||||
| CONF_ON_END = "on_end" | CONF_ON_END = "on_end" | ||||||
| CONF_ON_INTENT_END = "on_intent_end" | CONF_ON_INTENT_END = "on_intent_end" | ||||||
|  | CONF_ON_INTENT_PROGRESS = "on_intent_progress" | ||||||
| CONF_ON_INTENT_START = "on_intent_start" | CONF_ON_INTENT_START = "on_intent_start" | ||||||
| CONF_ON_LISTENING = "on_listening" | CONF_ON_LISTENING = "on_listening" | ||||||
| CONF_ON_START = "on_start" | CONF_ON_START = "on_start" | ||||||
| @@ -136,6 +137,9 @@ CONFIG_SCHEMA = cv.All( | |||||||
|             cv.Optional(CONF_ON_INTENT_START): automation.validate_automation( |             cv.Optional(CONF_ON_INTENT_START): automation.validate_automation( | ||||||
|                 single=True |                 single=True | ||||||
|             ), |             ), | ||||||
|  |             cv.Optional(CONF_ON_INTENT_PROGRESS): automation.validate_automation( | ||||||
|  |                 single=True | ||||||
|  |             ), | ||||||
|             cv.Optional(CONF_ON_INTENT_END): automation.validate_automation( |             cv.Optional(CONF_ON_INTENT_END): automation.validate_automation( | ||||||
|                 single=True |                 single=True | ||||||
|             ), |             ), | ||||||
| @@ -282,6 +286,13 @@ async def to_code(config): | |||||||
|             config[CONF_ON_INTENT_START], |             config[CONF_ON_INTENT_START], | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |     if CONF_ON_INTENT_PROGRESS in config: | ||||||
|  |         await automation.build_automation( | ||||||
|  |             var.get_intent_progress_trigger(), | ||||||
|  |             [(cg.std_string, "x")], | ||||||
|  |             config[CONF_ON_INTENT_PROGRESS], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|     if CONF_ON_INTENT_END in config: |     if CONF_ON_INTENT_END in config: | ||||||
|         await automation.build_automation( |         await automation.build_automation( | ||||||
|             var.get_intent_end_trigger(), |             var.get_intent_end_trigger(), | ||||||
|   | |||||||
| @@ -555,7 +555,7 @@ void VoiceAssistant::request_stop() { | |||||||
|       break; |       break; | ||||||
|     case State::AWAITING_RESPONSE: |     case State::AWAITING_RESPONSE: | ||||||
|       this->signal_stop_(); |       this->signal_stop_(); | ||||||
|       break; |       // Fallthrough intended to stop a streaming TTS announcement that has potentially started | ||||||
|     case State::STREAMING_RESPONSE: |     case State::STREAMING_RESPONSE: | ||||||
| #ifdef USE_MEDIA_PLAYER | #ifdef USE_MEDIA_PLAYER | ||||||
|       // Stop any ongoing media player announcement |       // Stop any ongoing media player announcement | ||||||
| @@ -599,6 +599,14 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|   switch (msg.event_type) { |   switch (msg.event_type) { | ||||||
|     case api::enums::VOICE_ASSISTANT_RUN_START: |     case api::enums::VOICE_ASSISTANT_RUN_START: | ||||||
|       ESP_LOGD(TAG, "Assist Pipeline running"); |       ESP_LOGD(TAG, "Assist Pipeline running"); | ||||||
|  | #ifdef USE_MEDIA_PLAYER | ||||||
|  |       this->started_streaming_tts_ = false; | ||||||
|  |       for (auto arg : msg.data) { | ||||||
|  |         if (arg.name == "url") { | ||||||
|  |           this->tts_response_url_ = std::move(arg.value); | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|       this->defer([this]() { this->start_trigger_->trigger(); }); |       this->defer([this]() { this->start_trigger_->trigger(); }); | ||||||
|       break; |       break; | ||||||
|     case api::enums::VOICE_ASSISTANT_WAKE_WORD_START: |     case api::enums::VOICE_ASSISTANT_WAKE_WORD_START: | ||||||
| @@ -622,6 +630,8 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|       if (text.empty()) { |       if (text.empty()) { | ||||||
|         ESP_LOGW(TAG, "No text in STT_END event"); |         ESP_LOGW(TAG, "No text in STT_END event"); | ||||||
|         return; |         return; | ||||||
|  |       } else if (text.length() > 500) { | ||||||
|  |         text = text.substr(0, 497) + "..."; | ||||||
|       } |       } | ||||||
|       ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str()); |       ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str()); | ||||||
|       this->defer([this, text]() { this->stt_end_trigger_->trigger(text); }); |       this->defer([this, text]() { this->stt_end_trigger_->trigger(text); }); | ||||||
| @@ -631,6 +641,27 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|       ESP_LOGD(TAG, "Intent started"); |       ESP_LOGD(TAG, "Intent started"); | ||||||
|       this->defer([this]() { this->intent_start_trigger_->trigger(); }); |       this->defer([this]() { this->intent_start_trigger_->trigger(); }); | ||||||
|       break; |       break; | ||||||
|  |     case api::enums::VOICE_ASSISTANT_INTENT_PROGRESS: { | ||||||
|  |       ESP_LOGD(TAG, "Intent progress"); | ||||||
|  |       std::string tts_url_for_trigger = ""; | ||||||
|  | #ifdef USE_MEDIA_PLAYER | ||||||
|  |       if (this->media_player_ != nullptr) { | ||||||
|  |         for (const auto &arg : msg.data) { | ||||||
|  |           if ((arg.name == "tts_start_streaming") && (arg.value == "1") && !this->tts_response_url_.empty()) { | ||||||
|  |             this->media_player_->make_call().set_media_url(this->tts_response_url_).set_announcement(true).perform(); | ||||||
|  |  | ||||||
|  |             this->media_player_wait_for_announcement_start_ = true; | ||||||
|  |             this->media_player_wait_for_announcement_end_ = false; | ||||||
|  |             this->started_streaming_tts_ = true; | ||||||
|  |             tts_url_for_trigger = this->tts_response_url_; | ||||||
|  |             this->tts_response_url_.clear();  // Reset streaming URL | ||||||
|  |           } | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|  |       this->defer([this, tts_url_for_trigger]() { this->intent_progress_trigger_->trigger(tts_url_for_trigger); }); | ||||||
|  |       break; | ||||||
|  |     } | ||||||
|     case api::enums::VOICE_ASSISTANT_INTENT_END: { |     case api::enums::VOICE_ASSISTANT_INTENT_END: { | ||||||
|       for (auto arg : msg.data) { |       for (auto arg : msg.data) { | ||||||
|         if (arg.name == "conversation_id") { |         if (arg.name == "conversation_id") { | ||||||
| @@ -653,6 +684,9 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|         ESP_LOGW(TAG, "No text in TTS_START event"); |         ESP_LOGW(TAG, "No text in TTS_START event"); | ||||||
|         return; |         return; | ||||||
|       } |       } | ||||||
|  |       if (text.length() > 500) { | ||||||
|  |         text = text.substr(0, 497) + "..."; | ||||||
|  |       } | ||||||
|       ESP_LOGD(TAG, "Response: \"%s\"", text.c_str()); |       ESP_LOGD(TAG, "Response: \"%s\"", text.c_str()); | ||||||
|       this->defer([this, text]() { |       this->defer([this, text]() { | ||||||
|         this->tts_start_trigger_->trigger(text); |         this->tts_start_trigger_->trigger(text); | ||||||
| @@ -678,7 +712,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|       ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str()); |       ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str()); | ||||||
|       this->defer([this, url]() { |       this->defer([this, url]() { | ||||||
| #ifdef USE_MEDIA_PLAYER | #ifdef USE_MEDIA_PLAYER | ||||||
|         if (this->media_player_ != nullptr) { |         if ((this->media_player_ != nullptr) && (!this->started_streaming_tts_)) { | ||||||
|           this->media_player_->make_call().set_media_url(url).set_announcement(true).perform(); |           this->media_player_->make_call().set_media_url(url).set_announcement(true).perform(); | ||||||
|  |  | ||||||
|           this->media_player_wait_for_announcement_start_ = true; |           this->media_player_wait_for_announcement_start_ = true; | ||||||
|   | |||||||
| @@ -177,6 +177,7 @@ class VoiceAssistant : public Component { | |||||||
|  |  | ||||||
|   Trigger<> *get_intent_end_trigger() const { return this->intent_end_trigger_; } |   Trigger<> *get_intent_end_trigger() const { return this->intent_end_trigger_; } | ||||||
|   Trigger<> *get_intent_start_trigger() const { return this->intent_start_trigger_; } |   Trigger<> *get_intent_start_trigger() const { return this->intent_start_trigger_; } | ||||||
|  |   Trigger<std::string> *get_intent_progress_trigger() const { return this->intent_progress_trigger_; } | ||||||
|   Trigger<> *get_listening_trigger() const { return this->listening_trigger_; } |   Trigger<> *get_listening_trigger() const { return this->listening_trigger_; } | ||||||
|   Trigger<> *get_end_trigger() const { return this->end_trigger_; } |   Trigger<> *get_end_trigger() const { return this->end_trigger_; } | ||||||
|   Trigger<> *get_start_trigger() const { return this->start_trigger_; } |   Trigger<> *get_start_trigger() const { return this->start_trigger_; } | ||||||
| @@ -233,6 +234,7 @@ class VoiceAssistant : public Component { | |||||||
|   Trigger<> *tts_stream_start_trigger_ = new Trigger<>(); |   Trigger<> *tts_stream_start_trigger_ = new Trigger<>(); | ||||||
|   Trigger<> *tts_stream_end_trigger_ = new Trigger<>(); |   Trigger<> *tts_stream_end_trigger_ = new Trigger<>(); | ||||||
| #endif | #endif | ||||||
|  |   Trigger<std::string> *intent_progress_trigger_ = new Trigger<std::string>(); | ||||||
|   Trigger<> *wake_word_detected_trigger_ = new Trigger<>(); |   Trigger<> *wake_word_detected_trigger_ = new Trigger<>(); | ||||||
|   Trigger<std::string> *stt_end_trigger_ = new Trigger<std::string>(); |   Trigger<std::string> *stt_end_trigger_ = new Trigger<std::string>(); | ||||||
|   Trigger<std::string> *tts_end_trigger_ = new Trigger<std::string>(); |   Trigger<std::string> *tts_end_trigger_ = new Trigger<std::string>(); | ||||||
| @@ -268,6 +270,8 @@ class VoiceAssistant : public Component { | |||||||
| #endif | #endif | ||||||
| #ifdef USE_MEDIA_PLAYER | #ifdef USE_MEDIA_PLAYER | ||||||
|   media_player::MediaPlayer *media_player_{nullptr}; |   media_player::MediaPlayer *media_player_{nullptr}; | ||||||
|  |   std::string tts_response_url_{""}; | ||||||
|  |   bool started_streaming_tts_{false}; | ||||||
|   bool media_player_wait_for_announcement_start_{false}; |   bool media_player_wait_for_announcement_start_{false}; | ||||||
|   bool media_player_wait_for_announcement_end_{false}; |   bool media_player_wait_for_announcement_end_{false}; | ||||||
| #endif | #endif | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user