[voice_assisant] support start/continue conversation and deallocate buffers (#8610)

2025-10-31 07:03:55 +00:00 · 2025-04-27 18:30:21 -05:00
parent ee646d7324
commit c9d1476ae0
2 changed files with 104 additions and 33 deletions
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -72,12 +72,8 @@ bool VoiceAssistant::start_udp_socket_() {
 }

 bool VoiceAssistant::allocate_buffers_() {
-  if (this->send_buffer_ != nullptr) {
-    return true;  // Already allocated
-  }
-
 #ifdef USE_SPEAKER
-  if (this->speaker_ != nullptr) {
+  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ == nullptr)) {
    ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
    this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
    if (this->speaker_buffer_ == nullptr) {
@@ -87,29 +83,35 @@ bool VoiceAssistant::allocate_buffers_() {
  }
 #endif

+  if (this->input_buffer_ == nullptr) {
    ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
    this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
    if (this->input_buffer_ == nullptr) {
      ESP_LOGW(TAG, "Could not allocate input buffer");
      return false;
    }
+  }

 #ifdef USE_ESP_ADF
  this->vad_instance_ = vad_create(VAD_MODE_4);
 #endif

+  if (this->ring_buffer_.use_count() == 0) {
    this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
-  if (this->ring_buffer_ == nullptr) {
-    ESP_LOGW(TAG, "Could not allocate ring buffer");
+    if (this->ring_buffer_.use_count() == 0) {
+      ESP_LOGE(TAG, "Could not allocate ring buffer");
      return false;
    }
+  }

+  if (this->send_buffer_ == nullptr) {
    ExternalRAMAllocator<uint8_t> send_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
    this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
    if (send_buffer_ == nullptr) {
      ESP_LOGW(TAG, "Could not allocate send buffer");
      return false;
    }
+  }

  return true;
 }
@@ -139,13 +141,14 @@ void VoiceAssistant::clear_buffers_() {
 }

 void VoiceAssistant::deallocate_buffers_() {
+  if (this->send_buffer_ != nullptr) {
    ExternalRAMAllocator<uint8_t> send_deallocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
    send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
    this->send_buffer_ = nullptr;
+  }

-  if (this->ring_buffer_ != nullptr) {
+  if (this->ring_buffer_.use_count() > 0) {
    this->ring_buffer_.reset();
-    this->ring_buffer_ = nullptr;
  }

 #ifdef USE_ESP_ADF
@@ -155,9 +158,11 @@ void VoiceAssistant::deallocate_buffers_() {
  }
 #endif

+  if (this->input_buffer_ != nullptr) {
    ExternalRAMAllocator<int16_t> input_deallocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
    input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
    this->input_buffer_ = nullptr;
+  }

 #ifdef USE_SPEAKER
  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
@@ -216,6 +221,7 @@ void VoiceAssistant::loop() {
        }
      } else {
        this->high_freq_.stop();
+        this->deallocate_buffers_();
      }
      break;
    }
@@ -276,7 +282,7 @@ void VoiceAssistant::loop() {
      this->read_microphone_();
      ESP_LOGD(TAG, "Requesting start...");
      uint32_t flags = 0;
-      if (this->use_wake_word_)
+      if (!this->continue_conversation_ && this->use_wake_word_)
        flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD;
      if (this->silence_detection_)
        flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_VAD;
@@ -387,6 +393,25 @@ void VoiceAssistant::loop() {
 #ifdef USE_MEDIA_PLAYER
      if (this->media_player_ != nullptr) {
        playing = (this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_ANNOUNCING);
+
+        if (playing && this->media_player_wait_for_announcement_start_) {
+          // Announcement has started playing, wait for it to finish
+          this->media_player_wait_for_announcement_start_ = false;
+          this->media_player_wait_for_announcement_end_ = true;
+        }
+
+        if (!playing && this->media_player_wait_for_announcement_end_) {
+          // Announcement has finished playing
+          this->media_player_wait_for_announcement_end_ = false;
+          this->cancel_timeout("playing");
+          ESP_LOGD(TAG, "Announcement finished playing");
+          this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
+
+          api::VoiceAssistantAnnounceFinished msg;
+          msg.success = true;
+          this->api_client_->send_voice_assistant_announce_finished(msg);
+          break;
+        }
      }
 #endif
      if (playing) {
@@ -417,7 +442,11 @@ void VoiceAssistant::loop() {
        this->tts_stream_end_trigger_->trigger();
      }
 #endif
+      if (this->continue_conversation_) {
+        this->set_state_(State::START_MICROPHONE, State::START_PIPELINE);
+      } else {
        this->set_state_(State::IDLE, State::IDLE);
+      }
      break;
    }
    default:
@@ -587,6 +616,7 @@ void VoiceAssistant::request_start(bool continuous, bool silence_detection) {

 void VoiceAssistant::request_stop() {
  this->continuous_ = false;
+  this->continue_conversation_ = false;

  switch (this->state_) {
    case State::IDLE:
@@ -611,6 +641,16 @@ void VoiceAssistant::request_stop() {
      this->signal_stop_();
      break;
    case State::STREAMING_RESPONSE:
+#ifdef USE_MEDIA_PLAYER
+      // Stop any ongoing media player announcement
+      if (this->media_player_ != nullptr) {
+        this->media_player_->make_call()
+            .set_command(media_player::MEDIA_PLAYER_COMMAND_STOP)
+            .set_announcement(true)
+            .perform();
+      }
+#endif
+      break;
    case State::RESPONSE_FINISHED:
      break;  // Let the incoming audio stream finish then it will go to idle.
  }
@@ -628,9 +668,9 @@ void VoiceAssistant::signal_stop_() {
 }

 void VoiceAssistant::start_playback_timeout_() {
-  this->set_timeout("playing", 100, [this]() {
+  this->set_timeout("playing", 2000, [this]() {
    this->cancel_timeout("speaker-timeout");
-    this->set_state_(State::IDLE, State::IDLE);
+    this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);

    api::VoiceAssistantAnnounceFinished msg;
    msg.success = true;
@@ -679,6 +719,8 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
      for (auto arg : msg.data) {
        if (arg.name == "conversation_id") {
          this->conversation_id_ = std::move(arg.value);
+        } else if (arg.name == "continue_conversation") {
+          this->continue_conversation_ = (arg.value == "1");
        }
      }
      this->defer([this]() { this->intent_end_trigger_->trigger(); });
@@ -722,6 +764,9 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
 #ifdef USE_MEDIA_PLAYER
        if (this->media_player_ != nullptr) {
          this->media_player_->make_call().set_media_url(url).set_announcement(true).perform();
+
+          this->media_player_wait_for_announcement_start_ = true;
+          this->media_player_wait_for_announcement_end_ = false;
          // Start the playback timeout, as the media player state isn't immediately updated
          this->start_playback_timeout_();
        }
@@ -888,8 +933,28 @@ void VoiceAssistant::on_announce(const api::VoiceAssistantAnnounceRequest &msg)
 #ifdef USE_MEDIA_PLAYER
  if (this->media_player_ != nullptr) {
    this->tts_start_trigger_->trigger(msg.text);
-    this->media_player_->make_call().set_media_url(msg.media_id).set_announcement(true).perform();
+    if (!msg.preannounce_media_id.empty()) {
+      this->media_player_->make_call().set_media_url(msg.preannounce_media_id).set_announcement(true).perform();
+    }
+    // Enqueueing a URL with an empty playlist will still play the file immediately
+    this->media_player_->make_call()
+        .set_command(media_player::MEDIA_PLAYER_COMMAND_ENQUEUE)
+        .set_media_url(msg.media_id)
+        .set_announcement(true)
+        .perform();
+    this->continue_conversation_ = msg.start_conversation;
+
+    this->media_player_wait_for_announcement_start_ = true;
+    this->media_player_wait_for_announcement_end_ = false;
+    // Start the playback timeout, as the media player state isn't immediately updated
+    this->start_playback_timeout_();
+
+    if (this->continuous_) {
+      this->set_state_(State::STOP_MICROPHONE, State::STREAMING_RESPONSE);
+    } else {
      this->set_state_(State::STREAMING_RESPONSE, State::STREAMING_RESPONSE);
+    }
+
    this->tts_end_trigger_->trigger(msg.media_id);
    this->end_trigger_->trigger();
  }
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@@ -41,6 +41,7 @@ enum VoiceAssistantFeature : uint32_t {
  FEATURE_API_AUDIO = 1 << 2,
  FEATURE_TIMERS = 1 << 3,
  FEATURE_ANNOUNCE = 1 << 4,
+  FEATURE_START_CONVERSATION = 1 << 5,
 };

 enum class State {
@@ -140,6 +141,7 @@ class VoiceAssistant : public Component {
 #ifdef USE_MEDIA_PLAYER
    if (this->media_player_ != nullptr) {
      flags |= VoiceAssistantFeature::FEATURE_ANNOUNCE;
+      flags |= VoiceAssistantFeature::FEATURE_START_CONVERSATION;
    }
 #endif

@@ -267,6 +269,8 @@ class VoiceAssistant : public Component {
 #endif
 #ifdef USE_MEDIA_PLAYER
  media_player::MediaPlayer *media_player_{nullptr};
+  bool media_player_wait_for_announcement_start_{false};
+  bool media_player_wait_for_announcement_end_{false};
 #endif

  bool local_output_{false};
@@ -282,7 +286,7 @@ class VoiceAssistant : public Component {
  uint8_t vad_threshold_{5};
  uint8_t vad_counter_{0};
 #endif
-  std::unique_ptr<RingBuffer> ring_buffer_;
+  std::shared_ptr<RingBuffer> ring_buffer_;

  bool use_wake_word_;
  uint8_t noise_suppression_level_;
@@ -296,6 +300,8 @@ class VoiceAssistant : public Component {
  bool continuous_{false};
  bool silence_detection_;

+  bool continue_conversation_{false};
+
  State state_{State::IDLE};
  State desired_state_{State::IDLE};