mirror of
				https://github.com/esphome/esphome.git
				synced 2025-10-31 07:03:55 +00:00 
			
		
		
		
	Add more VA triggers (#5762)
This commit is contained in:
		| @@ -18,20 +18,25 @@ DEPENDENCIES = ["api", "microphone"] | |||||||
|  |  | ||||||
| CODEOWNERS = ["@jesserockz"] | CODEOWNERS = ["@jesserockz"] | ||||||
|  |  | ||||||
| CONF_SILENCE_DETECTION = "silence_detection" |  | ||||||
| CONF_ON_LISTENING = "on_listening" |  | ||||||
| CONF_ON_START = "on_start" |  | ||||||
| CONF_ON_WAKE_WORD_DETECTED = "on_wake_word_detected" |  | ||||||
| CONF_ON_STT_END = "on_stt_end" |  | ||||||
| CONF_ON_TTS_START = "on_tts_start" |  | ||||||
| CONF_ON_TTS_END = "on_tts_end" |  | ||||||
| CONF_ON_END = "on_end" | CONF_ON_END = "on_end" | ||||||
| CONF_ON_ERROR = "on_error" | CONF_ON_ERROR = "on_error" | ||||||
|  | CONF_ON_INTENT_END = "on_intent_end" | ||||||
|  | CONF_ON_INTENT_START = "on_intent_start" | ||||||
|  | CONF_ON_LISTENING = "on_listening" | ||||||
|  | CONF_ON_START = "on_start" | ||||||
|  | CONF_ON_STT_END = "on_stt_end" | ||||||
|  | CONF_ON_STT_VAD_END = "on_stt_vad_end" | ||||||
|  | CONF_ON_STT_VAD_START = "on_stt_vad_start" | ||||||
|  | CONF_ON_TTS_END = "on_tts_end" | ||||||
|  | CONF_ON_TTS_START = "on_tts_start" | ||||||
|  | CONF_ON_WAKE_WORD_DETECTED = "on_wake_word_detected" | ||||||
|  |  | ||||||
|  | CONF_SILENCE_DETECTION = "silence_detection" | ||||||
| CONF_USE_WAKE_WORD = "use_wake_word" | CONF_USE_WAKE_WORD = "use_wake_word" | ||||||
| CONF_VAD_THRESHOLD = "vad_threshold" | CONF_VAD_THRESHOLD = "vad_threshold" | ||||||
|  |  | ||||||
| CONF_NOISE_SUPPRESSION_LEVEL = "noise_suppression_level" |  | ||||||
| CONF_AUTO_GAIN = "auto_gain" | CONF_AUTO_GAIN = "auto_gain" | ||||||
|  | CONF_NOISE_SUPPRESSION_LEVEL = "noise_suppression_level" | ||||||
| CONF_VOLUME_MULTIPLIER = "volume_multiplier" | CONF_VOLUME_MULTIPLIER = "volume_multiplier" | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -88,6 +93,18 @@ CONFIG_SCHEMA = cv.All( | |||||||
|             cv.Optional(CONF_ON_CLIENT_DISCONNECTED): automation.validate_automation( |             cv.Optional(CONF_ON_CLIENT_DISCONNECTED): automation.validate_automation( | ||||||
|                 single=True |                 single=True | ||||||
|             ), |             ), | ||||||
|  |             cv.Optional(CONF_ON_INTENT_START): automation.validate_automation( | ||||||
|  |                 single=True | ||||||
|  |             ), | ||||||
|  |             cv.Optional(CONF_ON_INTENT_END): automation.validate_automation( | ||||||
|  |                 single=True | ||||||
|  |             ), | ||||||
|  |             cv.Optional(CONF_ON_STT_VAD_START): automation.validate_automation( | ||||||
|  |                 single=True | ||||||
|  |             ), | ||||||
|  |             cv.Optional(CONF_ON_STT_VAD_END): automation.validate_automation( | ||||||
|  |                 single=True | ||||||
|  |             ), | ||||||
|         } |         } | ||||||
|     ).extend(cv.COMPONENT_SCHEMA), |     ).extend(cv.COMPONENT_SCHEMA), | ||||||
| ) | ) | ||||||
| @@ -177,6 +194,34 @@ async def to_code(config): | |||||||
|             config[CONF_ON_CLIENT_DISCONNECTED], |             config[CONF_ON_CLIENT_DISCONNECTED], | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |     if CONF_ON_INTENT_START in config: | ||||||
|  |         await automation.build_automation( | ||||||
|  |             var.get_intent_start_trigger(), | ||||||
|  |             [], | ||||||
|  |             config[CONF_ON_INTENT_START], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     if CONF_ON_INTENT_END in config: | ||||||
|  |         await automation.build_automation( | ||||||
|  |             var.get_intent_end_trigger(), | ||||||
|  |             [], | ||||||
|  |             config[CONF_ON_INTENT_END], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     if CONF_ON_STT_VAD_START in config: | ||||||
|  |         await automation.build_automation( | ||||||
|  |             var.get_stt_vad_start_trigger(), | ||||||
|  |             [], | ||||||
|  |             config[CONF_ON_STT_VAD_START], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     if CONF_ON_STT_VAD_END in config: | ||||||
|  |         await automation.build_automation( | ||||||
|  |             var.get_stt_vad_end_trigger(), | ||||||
|  |             [], | ||||||
|  |             config[CONF_ON_STT_VAD_END], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|     cg.add_define("USE_VOICE_ASSISTANT") |     cg.add_define("USE_VOICE_ASSISTANT") | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -31,7 +31,7 @@ void VoiceAssistant::setup() { | |||||||
|  |  | ||||||
|   this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); |   this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); | ||||||
|   if (socket_ == nullptr) { |   if (socket_ == nullptr) { | ||||||
|     ESP_LOGW(TAG, "Could not create socket."); |     ESP_LOGW(TAG, "Could not create socket"); | ||||||
|     this->mark_failed(); |     this->mark_failed(); | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
| @@ -69,7 +69,7 @@ void VoiceAssistant::setup() { | |||||||
|     ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); |     ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); | ||||||
|     this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE); |     this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE); | ||||||
|     if (this->speaker_buffer_ == nullptr) { |     if (this->speaker_buffer_ == nullptr) { | ||||||
|       ESP_LOGW(TAG, "Could not allocate speaker buffer."); |       ESP_LOGW(TAG, "Could not allocate speaker buffer"); | ||||||
|       this->mark_failed(); |       this->mark_failed(); | ||||||
|       return; |       return; | ||||||
|     } |     } | ||||||
| @@ -79,7 +79,7 @@ void VoiceAssistant::setup() { | |||||||
|   ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE); |   ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE); | ||||||
|   this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE); |   this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE); | ||||||
|   if (this->input_buffer_ == nullptr) { |   if (this->input_buffer_ == nullptr) { | ||||||
|     ESP_LOGW(TAG, "Could not allocate input buffer."); |     ESP_LOGW(TAG, "Could not allocate input buffer"); | ||||||
|     this->mark_failed(); |     this->mark_failed(); | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
| @@ -89,7 +89,7 @@ void VoiceAssistant::setup() { | |||||||
|  |  | ||||||
|   this->ring_buffer_ = rb_create(BUFFER_SIZE, sizeof(int16_t)); |   this->ring_buffer_ = rb_create(BUFFER_SIZE, sizeof(int16_t)); | ||||||
|   if (this->ring_buffer_ == nullptr) { |   if (this->ring_buffer_ == nullptr) { | ||||||
|     ESP_LOGW(TAG, "Could not allocate ring buffer."); |     ESP_LOGW(TAG, "Could not allocate ring buffer"); | ||||||
|     this->mark_failed(); |     this->mark_failed(); | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
| @@ -98,7 +98,7 @@ void VoiceAssistant::setup() { | |||||||
|   ExternalRAMAllocator<uint8_t> send_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); |   ExternalRAMAllocator<uint8_t> send_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); | ||||||
|   this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE); |   this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE); | ||||||
|   if (send_buffer_ == nullptr) { |   if (send_buffer_ == nullptr) { | ||||||
|     ESP_LOGW(TAG, "Could not allocate send buffer."); |     ESP_LOGW(TAG, "Could not allocate send buffer"); | ||||||
|     this->mark_failed(); |     this->mark_failed(); | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
| @@ -221,8 +221,8 @@ void VoiceAssistant::loop() { | |||||||
|       msg.audio_settings = audio_settings; |       msg.audio_settings = audio_settings; | ||||||
|  |  | ||||||
|       if (this->api_client_ == nullptr || !this->api_client_->send_voice_assistant_request(msg)) { |       if (this->api_client_ == nullptr || !this->api_client_->send_voice_assistant_request(msg)) { | ||||||
|         ESP_LOGW(TAG, "Could not request start."); |         ESP_LOGW(TAG, "Could not request start"); | ||||||
|         this->error_trigger_->trigger("not-connected", "Could not request start."); |         this->error_trigger_->trigger("not-connected", "Could not request start"); | ||||||
|         this->continuous_ = false; |         this->continuous_ = false; | ||||||
|         this->set_state_(State::IDLE, State::IDLE); |         this->set_state_(State::IDLE, State::IDLE); | ||||||
|         break; |         break; | ||||||
| @@ -280,7 +280,7 @@ void VoiceAssistant::loop() { | |||||||
|             this->speaker_buffer_size_ += len; |             this->speaker_buffer_size_ += len; | ||||||
|           } |           } | ||||||
|         } else { |         } else { | ||||||
|           ESP_LOGW(TAG, "Receive buffer full."); |           ESP_LOGW(TAG, "Receive buffer full"); | ||||||
|         } |         } | ||||||
|         if (this->speaker_buffer_size_ > 0) { |         if (this->speaker_buffer_size_ > 0) { | ||||||
|           size_t written = this->speaker_->play(this->speaker_buffer_, this->speaker_buffer_size_); |           size_t written = this->speaker_->play(this->speaker_buffer_, this->speaker_buffer_size_); | ||||||
| @@ -290,7 +290,7 @@ void VoiceAssistant::loop() { | |||||||
|             this->speaker_buffer_index_ -= written; |             this->speaker_buffer_index_ -= written; | ||||||
|             this->set_timeout("speaker-timeout", 2000, [this]() { this->speaker_->stop(); }); |             this->set_timeout("speaker-timeout", 2000, [this]() { this->speaker_->stop(); }); | ||||||
|           } else { |           } else { | ||||||
|             ESP_LOGW(TAG, "Speaker buffer full."); |             ESP_LOGW(TAG, "Speaker buffer full"); | ||||||
|           } |           } | ||||||
|         } |         } | ||||||
|         if (this->wait_for_stream_end_) { |         if (this->wait_for_stream_end_) { | ||||||
| @@ -513,7 +513,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|     case api::enums::VOICE_ASSISTANT_STT_START: |     case api::enums::VOICE_ASSISTANT_STT_START: | ||||||
|       ESP_LOGD(TAG, "STT Started"); |       ESP_LOGD(TAG, "STT started"); | ||||||
|       this->listening_trigger_->trigger(); |       this->listening_trigger_->trigger(); | ||||||
|       break; |       break; | ||||||
|     case api::enums::VOICE_ASSISTANT_STT_END: { |     case api::enums::VOICE_ASSISTANT_STT_END: { | ||||||
| @@ -525,19 +525,24 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|         } |         } | ||||||
|       } |       } | ||||||
|       if (text.empty()) { |       if (text.empty()) { | ||||||
|         ESP_LOGW(TAG, "No text in STT_END event."); |         ESP_LOGW(TAG, "No text in STT_END event"); | ||||||
|         return; |         return; | ||||||
|       } |       } | ||||||
|       ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str()); |       ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str()); | ||||||
|       this->stt_end_trigger_->trigger(text); |       this->stt_end_trigger_->trigger(text); | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|  |     case api::enums::VOICE_ASSISTANT_INTENT_START: | ||||||
|  |       ESP_LOGD(TAG, "Intent started"); | ||||||
|  |       this->intent_start_trigger_->trigger(); | ||||||
|  |       break; | ||||||
|     case api::enums::VOICE_ASSISTANT_INTENT_END: { |     case api::enums::VOICE_ASSISTANT_INTENT_END: { | ||||||
|       for (auto arg : msg.data) { |       for (auto arg : msg.data) { | ||||||
|         if (arg.name == "conversation_id") { |         if (arg.name == "conversation_id") { | ||||||
|           this->conversation_id_ = std::move(arg.value); |           this->conversation_id_ = std::move(arg.value); | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
|  |       this->intent_end_trigger_->trigger(); | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|     case api::enums::VOICE_ASSISTANT_TTS_START: { |     case api::enums::VOICE_ASSISTANT_TTS_START: { | ||||||
| @@ -548,7 +553,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|         } |         } | ||||||
|       } |       } | ||||||
|       if (text.empty()) { |       if (text.empty()) { | ||||||
|         ESP_LOGW(TAG, "No text in TTS_START event."); |         ESP_LOGW(TAG, "No text in TTS_START event"); | ||||||
|         return; |         return; | ||||||
|       } |       } | ||||||
|       ESP_LOGD(TAG, "Response: \"%s\"", text.c_str()); |       ESP_LOGD(TAG, "Response: \"%s\"", text.c_str()); | ||||||
| @@ -566,7 +571,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|         } |         } | ||||||
|       } |       } | ||||||
|       if (url.empty()) { |       if (url.empty()) { | ||||||
|         ESP_LOGW(TAG, "No url in TTS_END event."); |         ESP_LOGW(TAG, "No url in TTS_END event"); | ||||||
|         return; |         return; | ||||||
|       } |       } | ||||||
|       ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str()); |       ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str()); | ||||||
| @@ -634,6 +639,14 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|       this->set_state_(State::RESPONSE_FINISHED, State::IDLE); |       this->set_state_(State::RESPONSE_FINISHED, State::IDLE); | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|  |     case api::enums::VOICE_ASSISTANT_STT_VAD_START: | ||||||
|  |       ESP_LOGD(TAG, "Starting STT by VAD"); | ||||||
|  |       this->stt_vad_start_trigger_->trigger(); | ||||||
|  |       break; | ||||||
|  |     case api::enums::VOICE_ASSISTANT_STT_VAD_END: | ||||||
|  |       ESP_LOGD(TAG, "STT by VAD end"); | ||||||
|  |       this->stt_vad_end_trigger_->trigger(); | ||||||
|  |       break; | ||||||
|     default: |     default: | ||||||
|       ESP_LOGD(TAG, "Unhandled event type: %d", msg.event_type); |       ESP_LOGD(TAG, "Unhandled event type: %d", msg.event_type); | ||||||
|       break; |       break; | ||||||
|   | |||||||
| @@ -100,13 +100,17 @@ class VoiceAssistant : public Component { | |||||||
|   void set_auto_gain(uint8_t auto_gain) { this->auto_gain_ = auto_gain; } |   void set_auto_gain(uint8_t auto_gain) { this->auto_gain_ = auto_gain; } | ||||||
|   void set_volume_multiplier(float volume_multiplier) { this->volume_multiplier_ = volume_multiplier; } |   void set_volume_multiplier(float volume_multiplier) { this->volume_multiplier_ = volume_multiplier; } | ||||||
|  |  | ||||||
|  |   Trigger<> *get_intent_end_trigger() const { return this->intent_end_trigger_; } | ||||||
|  |   Trigger<> *get_intent_start_trigger() const { return this->intent_start_trigger_; } | ||||||
|   Trigger<> *get_listening_trigger() const { return this->listening_trigger_; } |   Trigger<> *get_listening_trigger() const { return this->listening_trigger_; } | ||||||
|  |   Trigger<> *get_end_trigger() const { return this->end_trigger_; } | ||||||
|   Trigger<> *get_start_trigger() const { return this->start_trigger_; } |   Trigger<> *get_start_trigger() const { return this->start_trigger_; } | ||||||
|  |   Trigger<> *get_stt_vad_end_trigger() const { return this->stt_vad_end_trigger_; } | ||||||
|  |   Trigger<> *get_stt_vad_start_trigger() const { return this->stt_vad_start_trigger_; } | ||||||
|   Trigger<> *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; } |   Trigger<> *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; } | ||||||
|   Trigger<std::string> *get_stt_end_trigger() const { return this->stt_end_trigger_; } |   Trigger<std::string> *get_stt_end_trigger() const { return this->stt_end_trigger_; } | ||||||
|   Trigger<std::string> *get_tts_start_trigger() const { return this->tts_start_trigger_; } |  | ||||||
|   Trigger<std::string> *get_tts_end_trigger() const { return this->tts_end_trigger_; } |   Trigger<std::string> *get_tts_end_trigger() const { return this->tts_end_trigger_; } | ||||||
|   Trigger<> *get_end_trigger() const { return this->end_trigger_; } |   Trigger<std::string> *get_tts_start_trigger() const { return this->tts_start_trigger_; } | ||||||
|   Trigger<std::string, std::string> *get_error_trigger() const { return this->error_trigger_; } |   Trigger<std::string, std::string> *get_error_trigger() const { return this->error_trigger_; } | ||||||
|  |  | ||||||
|   Trigger<> *get_client_connected_trigger() const { return this->client_connected_trigger_; } |   Trigger<> *get_client_connected_trigger() const { return this->client_connected_trigger_; } | ||||||
| @@ -124,13 +128,17 @@ class VoiceAssistant : public Component { | |||||||
|   std::unique_ptr<socket::Socket> socket_ = nullptr; |   std::unique_ptr<socket::Socket> socket_ = nullptr; | ||||||
|   struct sockaddr_storage dest_addr_; |   struct sockaddr_storage dest_addr_; | ||||||
|  |  | ||||||
|  |   Trigger<> *intent_end_trigger_ = new Trigger<>(); | ||||||
|  |   Trigger<> *intent_start_trigger_ = new Trigger<>(); | ||||||
|   Trigger<> *listening_trigger_ = new Trigger<>(); |   Trigger<> *listening_trigger_ = new Trigger<>(); | ||||||
|  |   Trigger<> *end_trigger_ = new Trigger<>(); | ||||||
|   Trigger<> *start_trigger_ = new Trigger<>(); |   Trigger<> *start_trigger_ = new Trigger<>(); | ||||||
|  |   Trigger<> *stt_vad_start_trigger_ = new Trigger<>(); | ||||||
|  |   Trigger<> *stt_vad_end_trigger_ = new Trigger<>(); | ||||||
|   Trigger<> *wake_word_detected_trigger_ = new Trigger<>(); |   Trigger<> *wake_word_detected_trigger_ = new Trigger<>(); | ||||||
|   Trigger<std::string> *stt_end_trigger_ = new Trigger<std::string>(); |   Trigger<std::string> *stt_end_trigger_ = new Trigger<std::string>(); | ||||||
|   Trigger<std::string> *tts_start_trigger_ = new Trigger<std::string>(); |  | ||||||
|   Trigger<std::string> *tts_end_trigger_ = new Trigger<std::string>(); |   Trigger<std::string> *tts_end_trigger_ = new Trigger<std::string>(); | ||||||
|   Trigger<> *end_trigger_ = new Trigger<>(); |   Trigger<std::string> *tts_start_trigger_ = new Trigger<std::string>(); | ||||||
|   Trigger<std::string, std::string> *error_trigger_ = new Trigger<std::string, std::string>(); |   Trigger<std::string, std::string> *error_trigger_ = new Trigger<std::string, std::string>(); | ||||||
|  |  | ||||||
|   Trigger<> *client_connected_trigger_ = new Trigger<>(); |   Trigger<> *client_connected_trigger_ = new Trigger<>(); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user