From 51259888bf1a3fb640bb299f8dfd4b9a5ff48fe2 Mon Sep 17 00:00:00 2001
From: "J. Nick Koston" <nick@koston.org>
Date: Fri, 2 Jan 2026 14:10:21 -1000
Subject: [PATCH] [voice_assistant] Use zero-copy buffer access for audio data
 (#12656)

---
 esphome/components/api/api.proto                  |  2 +-
 esphome/components/api/api_pb2.cpp                | 10 ++++++----
 esphome/components/api/api_pb2.h                  | 11 +++--------
 esphome/components/api/api_pb2_dump.cpp           |  6 +-----
 .../voice_assistant/voice_assistant.cpp           | 15 ++++++++-------
 5 files changed, 19 insertions(+), 25 deletions(-)
diff --git a/esphome/components/api/api.proto b/esphome/components/api/api.proto
index b5aaec430c..43b721c2d5 100644
--- a/esphome/components/api/api.proto
+++ b/esphome/components/api/api.proto
@@ -1937,7 +1937,7 @@ message VoiceAssistantAudio {
   option (source) = SOURCE_BOTH;
   option (ifdef) = "USE_VOICE_ASSISTANT";
 
-  bytes data = 1;
+  bytes data = 1 [(pointer_to_buffer) = true];
   bool end = 2;
 }
 
diff --git a/esphome/components/api/api_pb2.cpp b/esphome/components/api/api_pb2.cpp
index 1147cd986e..698e08f9b3 100644
--- a/esphome/components/api/api_pb2.cpp
+++ b/esphome/components/api/api_pb2.cpp
@@ -2527,20 +2527,22 @@ bool VoiceAssistantAudio::decode_varint(uint32_t field_id, ProtoVarInt value) {
 }
 bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited value) {
   switch (field_id) {
-    case 1:
-      this->data = value.as_string();
+    case 1: {
+      this->data = value.data();
+      this->data_len = value.size();
       break;
+    }
     default:
       return false;
   }
   return true;
 }
 void VoiceAssistantAudio::encode(ProtoWriteBuffer buffer) const {
-  buffer.encode_bytes(1, this->data_ptr_, this->data_len_);
+  buffer.encode_bytes(1, this->data, this->data_len);
   buffer.encode_bool(2, this->end);
 }
 void VoiceAssistantAudio::calculate_size(ProtoSize &size) const {
-  size.add_length(1, this->data_len_);
+  size.add_length(1, this->data_len);
   size.add_bool(1, this->end);
 }
 bool VoiceAssistantTimerEventResponse::decode_varint(uint32_t field_id, ProtoVarInt value) {
diff --git a/esphome/components/api/api_pb2.h b/esphome/components/api/api_pb2.h
index 4b14697181..6275b4c211 100644
--- a/esphome/components/api/api_pb2.h
+++ b/esphome/components/api/api_pb2.h
@@ -2521,17 +2521,12 @@ class VoiceAssistantEventResponse final : public ProtoDecodableMessage {
 class VoiceAssistantAudio final : public ProtoDecodableMessage {
  public:
   static constexpr uint8_t MESSAGE_TYPE = 106;
-  static constexpr uint8_t ESTIMATED_SIZE = 11;
+  static constexpr uint8_t ESTIMATED_SIZE = 21;
 #ifdef HAS_PROTO_MESSAGE_DUMP
   const char *message_name() const override { return "voice_assistant_audio"; }
 #endif
-  std::string data{};
-  const uint8_t *data_ptr_{nullptr};
-  size_t data_len_{0};
-  void set_data(const uint8_t *data, size_t len) {
-    this->data_ptr_ = data;
-    this->data_len_ = len;
-  }
+  const uint8_t *data{nullptr};
+  uint16_t data_len{0};
   bool end{false};
   void encode(ProtoWriteBuffer buffer) const override;
   void calculate_size(ProtoSize &size) const override;
diff --git a/esphome/components/api/api_pb2_dump.cpp b/esphome/components/api/api_pb2_dump.cpp
index 12df109a3d..1ec6645b3f 100644
--- a/esphome/components/api/api_pb2_dump.cpp
+++ b/esphome/components/api/api_pb2_dump.cpp
@@ -1978,11 +1978,7 @@ void VoiceAssistantEventResponse::dump_to(std::string &out) const {
 void VoiceAssistantAudio::dump_to(std::string &out) const {
   MessageDumpHelper helper(out, "VoiceAssistantAudio");
   out.append("  data: ");
-  if (this->data_ptr_ != nullptr) {
-    out.append(format_hex_pretty(this->data_ptr_, this->data_len_));
-  } else {
-    out.append(format_hex_pretty(reinterpret_cast<const uint8_t *>(this->data.data()), this->data.size()));
-  }
+  out.append(format_hex_pretty(this->data, this->data_len));
   out.append("\n");
   dump_field(out, "end", this->end);
 }
diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp
index 9bb5393be2..8101d210b3 100644
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -272,7 +272,8 @@ void VoiceAssistant::loop() {
         size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
         if (this->audio_mode_ == AUDIO_MODE_API) {
           api::VoiceAssistantAudio msg;
-          msg.set_data(this->send_buffer_, read_bytes);
+          msg.data = this->send_buffer_;
+          msg.data_len = read_bytes;
           this->api_client_->send_message(msg, api::VoiceAssistantAudio::MESSAGE_TYPE);
         } else {
           if (!this->udp_socket_running_) {
@@ -841,12 +842,12 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
 void VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg) {
 #ifdef USE_SPEAKER  // We should never get to this function if there is no speaker anyway
   if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
-    if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) {
-      memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length());
-      this->speaker_buffer_index_ += msg.data.length();
-      this->speaker_buffer_size_ += msg.data.length();
-      this->speaker_bytes_received_ += msg.data.length();
-      ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data.length());
+    if (this->speaker_buffer_index_ + msg.data_len < SPEAKER_BUFFER_SIZE) {
+      memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data, msg.data_len);
+      this->speaker_buffer_index_ += msg.data_len;
+      this->speaker_buffer_size_ += msg.data_len;
+      this->speaker_bytes_received_ += msg.data_len;
+      ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data_len);
     } else {
       ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
     }