mirror of
				https://github.com/esphome/esphome.git
				synced 2025-10-31 23:21:54 +00:00 
			
		
		
		
	[voice_assistant] support start/continue conversation and deallocate buffers (#8610)
This commit is contained in:
		| @@ -72,12 +72,8 @@ bool VoiceAssistant::start_udp_socket_() { | |||||||
| } | } | ||||||
|  |  | ||||||
| bool VoiceAssistant::allocate_buffers_() { | bool VoiceAssistant::allocate_buffers_() { | ||||||
|   if (this->send_buffer_ != nullptr) { |  | ||||||
|     return true;  // Already allocated |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| #ifdef USE_SPEAKER | #ifdef USE_SPEAKER | ||||||
|   if (this->speaker_ != nullptr) { |   if ((this->speaker_ != nullptr) && (this->speaker_buffer_ == nullptr)) { | ||||||
|     ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); |     ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); | ||||||
|     this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE); |     this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE); | ||||||
|     if (this->speaker_buffer_ == nullptr) { |     if (this->speaker_buffer_ == nullptr) { | ||||||
| @@ -87,29 +83,35 @@ bool VoiceAssistant::allocate_buffers_() { | |||||||
|   } |   } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |   if (this->input_buffer_ == nullptr) { | ||||||
|     ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE); |     ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE); | ||||||
|     this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE); |     this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE); | ||||||
|     if (this->input_buffer_ == nullptr) { |     if (this->input_buffer_ == nullptr) { | ||||||
|       ESP_LOGW(TAG, "Could not allocate input buffer"); |       ESP_LOGW(TAG, "Could not allocate input buffer"); | ||||||
|       return false; |       return false; | ||||||
|     } |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
| #ifdef USE_ESP_ADF | #ifdef USE_ESP_ADF | ||||||
|   this->vad_instance_ = vad_create(VAD_MODE_4); |   this->vad_instance_ = vad_create(VAD_MODE_4); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |   if (this->ring_buffer_.use_count() == 0) { | ||||||
|     this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t)); |     this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t)); | ||||||
|   if (this->ring_buffer_ == nullptr) { |     if (this->ring_buffer_.use_count() == 0) { | ||||||
|     ESP_LOGW(TAG, "Could not allocate ring buffer"); |       ESP_LOGE(TAG, "Could not allocate ring buffer"); | ||||||
|       return false; |       return false; | ||||||
|     } |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   if (this->send_buffer_ == nullptr) { | ||||||
|     ExternalRAMAllocator<uint8_t> send_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); |     ExternalRAMAllocator<uint8_t> send_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); | ||||||
|     this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE); |     this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE); | ||||||
|     if (send_buffer_ == nullptr) { |     if (send_buffer_ == nullptr) { | ||||||
|       ESP_LOGW(TAG, "Could not allocate send buffer"); |       ESP_LOGW(TAG, "Could not allocate send buffer"); | ||||||
|       return false; |       return false; | ||||||
|     } |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|   return true; |   return true; | ||||||
| } | } | ||||||
| @@ -139,13 +141,14 @@ void VoiceAssistant::clear_buffers_() { | |||||||
| } | } | ||||||
|  |  | ||||||
| void VoiceAssistant::deallocate_buffers_() { | void VoiceAssistant::deallocate_buffers_() { | ||||||
|  |   if (this->send_buffer_ != nullptr) { | ||||||
|     ExternalRAMAllocator<uint8_t> send_deallocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); |     ExternalRAMAllocator<uint8_t> send_deallocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); | ||||||
|     send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE); |     send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE); | ||||||
|     this->send_buffer_ = nullptr; |     this->send_buffer_ = nullptr; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   if (this->ring_buffer_ != nullptr) { |   if (this->ring_buffer_.use_count() > 0) { | ||||||
|     this->ring_buffer_.reset(); |     this->ring_buffer_.reset(); | ||||||
|     this->ring_buffer_ = nullptr; |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
| #ifdef USE_ESP_ADF | #ifdef USE_ESP_ADF | ||||||
| @@ -155,9 +158,11 @@ void VoiceAssistant::deallocate_buffers_() { | |||||||
|   } |   } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |   if (this->input_buffer_ != nullptr) { | ||||||
|     ExternalRAMAllocator<int16_t> input_deallocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE); |     ExternalRAMAllocator<int16_t> input_deallocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE); | ||||||
|     input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE); |     input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE); | ||||||
|     this->input_buffer_ = nullptr; |     this->input_buffer_ = nullptr; | ||||||
|  |   } | ||||||
|  |  | ||||||
| #ifdef USE_SPEAKER | #ifdef USE_SPEAKER | ||||||
|   if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) { |   if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) { | ||||||
| @@ -216,6 +221,7 @@ void VoiceAssistant::loop() { | |||||||
|         } |         } | ||||||
|       } else { |       } else { | ||||||
|         this->high_freq_.stop(); |         this->high_freq_.stop(); | ||||||
|  |         this->deallocate_buffers_(); | ||||||
|       } |       } | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
| @@ -276,7 +282,7 @@ void VoiceAssistant::loop() { | |||||||
|       this->read_microphone_(); |       this->read_microphone_(); | ||||||
|       ESP_LOGD(TAG, "Requesting start..."); |       ESP_LOGD(TAG, "Requesting start..."); | ||||||
|       uint32_t flags = 0; |       uint32_t flags = 0; | ||||||
|       if (this->use_wake_word_) |       if (!this->continue_conversation_ && this->use_wake_word_) | ||||||
|         flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD; |         flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD; | ||||||
|       if (this->silence_detection_) |       if (this->silence_detection_) | ||||||
|         flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_VAD; |         flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_VAD; | ||||||
| @@ -387,6 +393,25 @@ void VoiceAssistant::loop() { | |||||||
| #ifdef USE_MEDIA_PLAYER | #ifdef USE_MEDIA_PLAYER | ||||||
|       if (this->media_player_ != nullptr) { |       if (this->media_player_ != nullptr) { | ||||||
|         playing = (this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_ANNOUNCING); |         playing = (this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_ANNOUNCING); | ||||||
|  |  | ||||||
|  |         if (playing && this->media_player_wait_for_announcement_start_) { | ||||||
|  |           // Announcement has started playing, wait for it to finish | ||||||
|  |           this->media_player_wait_for_announcement_start_ = false; | ||||||
|  |           this->media_player_wait_for_announcement_end_ = true; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if (!playing && this->media_player_wait_for_announcement_end_) { | ||||||
|  |           // Announcement has finished playing | ||||||
|  |           this->media_player_wait_for_announcement_end_ = false; | ||||||
|  |           this->cancel_timeout("playing"); | ||||||
|  |           ESP_LOGD(TAG, "Announcement finished playing"); | ||||||
|  |           this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED); | ||||||
|  |  | ||||||
|  |           api::VoiceAssistantAnnounceFinished msg; | ||||||
|  |           msg.success = true; | ||||||
|  |           this->api_client_->send_voice_assistant_announce_finished(msg); | ||||||
|  |           break; | ||||||
|  |         } | ||||||
|       } |       } | ||||||
| #endif | #endif | ||||||
|       if (playing) { |       if (playing) { | ||||||
| @@ -417,7 +442,11 @@ void VoiceAssistant::loop() { | |||||||
|         this->tts_stream_end_trigger_->trigger(); |         this->tts_stream_end_trigger_->trigger(); | ||||||
|       } |       } | ||||||
| #endif | #endif | ||||||
|  |       if (this->continue_conversation_) { | ||||||
|  |         this->set_state_(State::START_MICROPHONE, State::START_PIPELINE); | ||||||
|  |       } else { | ||||||
|         this->set_state_(State::IDLE, State::IDLE); |         this->set_state_(State::IDLE, State::IDLE); | ||||||
|  |       } | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|     default: |     default: | ||||||
| @@ -587,6 +616,7 @@ void VoiceAssistant::request_start(bool continuous, bool silence_detection) { | |||||||
|  |  | ||||||
| void VoiceAssistant::request_stop() { | void VoiceAssistant::request_stop() { | ||||||
|   this->continuous_ = false; |   this->continuous_ = false; | ||||||
|  |   this->continue_conversation_ = false; | ||||||
|  |  | ||||||
|   switch (this->state_) { |   switch (this->state_) { | ||||||
|     case State::IDLE: |     case State::IDLE: | ||||||
| @@ -611,6 +641,16 @@ void VoiceAssistant::request_stop() { | |||||||
|       this->signal_stop_(); |       this->signal_stop_(); | ||||||
|       break; |       break; | ||||||
|     case State::STREAMING_RESPONSE: |     case State::STREAMING_RESPONSE: | ||||||
|  | #ifdef USE_MEDIA_PLAYER | ||||||
|  |       // Stop any ongoing media player announcement | ||||||
|  |       if (this->media_player_ != nullptr) { | ||||||
|  |         this->media_player_->make_call() | ||||||
|  |             .set_command(media_player::MEDIA_PLAYER_COMMAND_STOP) | ||||||
|  |             .set_announcement(true) | ||||||
|  |             .perform(); | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|  |       break; | ||||||
|     case State::RESPONSE_FINISHED: |     case State::RESPONSE_FINISHED: | ||||||
|       break;  // Let the incoming audio stream finish then it will go to idle. |       break;  // Let the incoming audio stream finish then it will go to idle. | ||||||
|   } |   } | ||||||
| @@ -628,9 +668,9 @@ void VoiceAssistant::signal_stop_() { | |||||||
| } | } | ||||||
|  |  | ||||||
| void VoiceAssistant::start_playback_timeout_() { | void VoiceAssistant::start_playback_timeout_() { | ||||||
|   this->set_timeout("playing", 100, [this]() { |   this->set_timeout("playing", 2000, [this]() { | ||||||
|     this->cancel_timeout("speaker-timeout"); |     this->cancel_timeout("speaker-timeout"); | ||||||
|     this->set_state_(State::IDLE, State::IDLE); |     this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED); | ||||||
|  |  | ||||||
|     api::VoiceAssistantAnnounceFinished msg; |     api::VoiceAssistantAnnounceFinished msg; | ||||||
|     msg.success = true; |     msg.success = true; | ||||||
| @@ -679,6 +719,8 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|       for (auto arg : msg.data) { |       for (auto arg : msg.data) { | ||||||
|         if (arg.name == "conversation_id") { |         if (arg.name == "conversation_id") { | ||||||
|           this->conversation_id_ = std::move(arg.value); |           this->conversation_id_ = std::move(arg.value); | ||||||
|  |         } else if (arg.name == "continue_conversation") { | ||||||
|  |           this->continue_conversation_ = (arg.value == "1"); | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
|       this->defer([this]() { this->intent_end_trigger_->trigger(); }); |       this->defer([this]() { this->intent_end_trigger_->trigger(); }); | ||||||
| @@ -722,6 +764,9 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
| #ifdef USE_MEDIA_PLAYER | #ifdef USE_MEDIA_PLAYER | ||||||
|         if (this->media_player_ != nullptr) { |         if (this->media_player_ != nullptr) { | ||||||
|           this->media_player_->make_call().set_media_url(url).set_announcement(true).perform(); |           this->media_player_->make_call().set_media_url(url).set_announcement(true).perform(); | ||||||
|  |  | ||||||
|  |           this->media_player_wait_for_announcement_start_ = true; | ||||||
|  |           this->media_player_wait_for_announcement_end_ = false; | ||||||
|           // Start the playback timeout, as the media player state isn't immediately updated |           // Start the playback timeout, as the media player state isn't immediately updated | ||||||
|           this->start_playback_timeout_(); |           this->start_playback_timeout_(); | ||||||
|         } |         } | ||||||
| @@ -888,8 +933,28 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg) | |||||||
| #ifdef USE_MEDIA_PLAYER | #ifdef USE_MEDIA_PLAYER | ||||||
|   if (this->media_player_ != nullptr) { |   if (this->media_player_ != nullptr) { | ||||||
|     this->tts_start_trigger_->trigger(msg.text); |     this->tts_start_trigger_->trigger(msg.text); | ||||||
|     this->media_player_->make_call().set_media_url(msg.media_id).set_announcement(true).perform(); |     if (!msg.preannounce_media_id.empty()) { | ||||||
|  |       this->media_player_->make_call().set_media_url(msg.preannounce_media_id).set_announcement(true).perform(); | ||||||
|  |     } | ||||||
|  |     // Enqueueing a URL with an empty playlist will still play the file immediately | ||||||
|  |     this->media_player_->make_call() | ||||||
|  |         .set_command(media_player::MEDIA_PLAYER_COMMAND_ENQUEUE) | ||||||
|  |         .set_media_url(msg.media_id) | ||||||
|  |         .set_announcement(true) | ||||||
|  |         .perform(); | ||||||
|  |     this->continue_conversation_ = msg.start_conversation; | ||||||
|  |  | ||||||
|  |     this->media_player_wait_for_announcement_start_ = true; | ||||||
|  |     this->media_player_wait_for_announcement_end_ = false; | ||||||
|  |     // Start the playback timeout, as the media player state isn't immediately updated | ||||||
|  |     this->start_playback_timeout_(); | ||||||
|  |  | ||||||
|  |     if (this->continuous_) { | ||||||
|  |       this->set_state_(State::STOP_MICROPHONE, State::STREAMING_RESPONSE); | ||||||
|  |     } else { | ||||||
|       this->set_state_(State::STREAMING_RESPONSE, State::STREAMING_RESPONSE); |       this->set_state_(State::STREAMING_RESPONSE, State::STREAMING_RESPONSE); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     this->tts_end_trigger_->trigger(msg.media_id); |     this->tts_end_trigger_->trigger(msg.media_id); | ||||||
|     this->end_trigger_->trigger(); |     this->end_trigger_->trigger(); | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -41,6 +41,7 @@ enum VoiceAssistantFeature : uint32_t { | |||||||
|   FEATURE_API_AUDIO = 1 << 2, |   FEATURE_API_AUDIO = 1 << 2, | ||||||
|   FEATURE_TIMERS = 1 << 3, |   FEATURE_TIMERS = 1 << 3, | ||||||
|   FEATURE_ANNOUNCE = 1 << 4, |   FEATURE_ANNOUNCE = 1 << 4, | ||||||
|  |   FEATURE_START_CONVERSATION = 1 << 5, | ||||||
| }; | }; | ||||||
|  |  | ||||||
| enum class State { | enum class State { | ||||||
| @@ -140,6 +141,7 @@ class VoiceAssistant : public Component { | |||||||
| #ifdef USE_MEDIA_PLAYER | #ifdef USE_MEDIA_PLAYER | ||||||
|     if (this->media_player_ != nullptr) { |     if (this->media_player_ != nullptr) { | ||||||
|       flags |= VoiceAssistantFeature::FEATURE_ANNOUNCE; |       flags |= VoiceAssistantFeature::FEATURE_ANNOUNCE; | ||||||
|  |       flags |= VoiceAssistantFeature::FEATURE_START_CONVERSATION; | ||||||
|     } |     } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| @@ -267,6 +269,8 @@ class VoiceAssistant : public Component { | |||||||
| #endif | #endif | ||||||
| #ifdef USE_MEDIA_PLAYER | #ifdef USE_MEDIA_PLAYER | ||||||
|   media_player::MediaPlayer *media_player_{nullptr}; |   media_player::MediaPlayer *media_player_{nullptr}; | ||||||
|  |   bool media_player_wait_for_announcement_start_{false}; | ||||||
|  |   bool media_player_wait_for_announcement_end_{false}; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|   bool local_output_{false}; |   bool local_output_{false}; | ||||||
| @@ -282,7 +286,7 @@ class VoiceAssistant : public Component { | |||||||
|   uint8_t vad_threshold_{5}; |   uint8_t vad_threshold_{5}; | ||||||
|   uint8_t vad_counter_{0}; |   uint8_t vad_counter_{0}; | ||||||
| #endif | #endif | ||||||
|   std::unique_ptr<RingBuffer> ring_buffer_; |   std::shared_ptr<RingBuffer> ring_buffer_; | ||||||
|  |  | ||||||
|   bool use_wake_word_; |   bool use_wake_word_; | ||||||
|   uint8_t noise_suppression_level_; |   uint8_t noise_suppression_level_; | ||||||
| @@ -296,6 +300,8 @@ class VoiceAssistant : public Component { | |||||||
|   bool continuous_{false}; |   bool continuous_{false}; | ||||||
|   bool silence_detection_; |   bool silence_detection_; | ||||||
|  |  | ||||||
|  |   bool continue_conversation_{false}; | ||||||
|  |  | ||||||
|   State state_{State::IDLE}; |   State state_{State::IDLE}; | ||||||
|   State desired_state_{State::IDLE}; |   State desired_state_{State::IDLE}; | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user