[voice_assistant] Use zero-copy buffer access for audio data (#12656)

2026-02-08 08:41:59 +00:00 · 2026-01-02 14:10:21 -10:00
parent 0b7ff09657
commit 51259888bf
5 changed files with 19 additions and 25 deletions
--- a/esphome/components/api/api.proto
+++ b/esphome/components/api/api.proto
@@ -1937,7 +1937,7 @@ message VoiceAssistantAudio {
  option (source) = SOURCE_BOTH;
  option (ifdef) = "USE_VOICE_ASSISTANT";

-  bytes data = 1;
+  bytes data = 1 [(pointer_to_buffer) = true];
  bool end = 2;
 }

--- a/esphome/components/api/api_pb2.cpp
+++ b/esphome/components/api/api_pb2.cpp
@@ -2527,20 +2527,22 @@ bool VoiceAssistantAudio::decode_varint(uint32_t field_id, ProtoVarInt value) {
 }
 bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited value) {
  switch (field_id) {
-    case 1:
-      this->data = value.as_string();
+    case 1: {
+      this->data = value.data();
+      this->data_len = value.size();
      break;
+    }
    default:
      return false;
  }
  return true;
 }
 void VoiceAssistantAudio::encode(ProtoWriteBuffer buffer) const {
-  buffer.encode_bytes(1, this->data_ptr_, this->data_len_);
+  buffer.encode_bytes(1, this->data, this->data_len);
  buffer.encode_bool(2, this->end);
 }
 void VoiceAssistantAudio::calculate_size(ProtoSize &size) const {
-  size.add_length(1, this->data_len_);
+  size.add_length(1, this->data_len);
  size.add_bool(1, this->end);
 }
 bool VoiceAssistantTimerEventResponse::decode_varint(uint32_t field_id, ProtoVarInt value) {
--- a/esphome/components/api/api_pb2.h
+++ b/esphome/components/api/api_pb2.h
@@ -2521,17 +2521,12 @@ class VoiceAssistantEventResponse final : public ProtoDecodableMessage {
 class VoiceAssistantAudio final : public ProtoDecodableMessage {
 public:
  static constexpr uint8_t MESSAGE_TYPE = 106;
-  static constexpr uint8_t ESTIMATED_SIZE = 11;
+  static constexpr uint8_t ESTIMATED_SIZE = 21;
 #ifdef HAS_PROTO_MESSAGE_DUMP
  const char *message_name() const override { return "voice_assistant_audio"; }
 #endif
-  std::string data{};
-  const uint8_t *data_ptr_{nullptr};
-  size_t data_len_{0};
-  void set_data(const uint8_t *data, size_t len) {
-    this->data_ptr_ = data;
-    this->data_len_ = len;
-  }
+  const uint8_t *data{nullptr};
+  uint16_t data_len{0};
  bool end{false};
  void encode(ProtoWriteBuffer buffer) const override;
  void calculate_size(ProtoSize &size) const override;
--- a/esphome/components/api/api_pb2_dump.cpp
+++ b/esphome/components/api/api_pb2_dump.cpp
@@ -1978,11 +1978,7 @@ void VoiceAssistantEventResponse::dump_to(std::string &out) const {
 void VoiceAssistantAudio::dump_to(std::string &out) const {
  MessageDumpHelper helper(out, "VoiceAssistantAudio");
  out.append("  data: ");
-  if (this->data_ptr_ != nullptr) {
-    out.append(format_hex_pretty(this->data_ptr_, this->data_len_));
-  } else {
-    out.append(format_hex_pretty(reinterpret_cast<const uint8_t *>(this->data.data()), this->data.size()));
-  }
+  out.append(format_hex_pretty(this->data, this->data_len));
  out.append("\n");
  dump_field(out, "end", this->end);
 }
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -272,7 +272,8 @@ void VoiceAssistant::loop() {
        size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
        if (this->audio_mode_ == AUDIO_MODE_API) {
          api::VoiceAssistantAudio msg;
-          msg.set_data(this->send_buffer_, read_bytes);
+          msg.data = this->send_buffer_;
+          msg.data_len = read_bytes;
          this->api_client_->send_message(msg, api::VoiceAssistantAudio::MESSAGE_TYPE);
        } else {
          if (!this->udp_socket_running_) {
@@ -841,12 +842,12 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
 void VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg) {
 #ifdef USE_SPEAKER  // We should never get to this function if there is no speaker anyway
  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
-    if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) {
-      memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length());
-      this->speaker_buffer_index_ += msg.data.length();
-      this->speaker_buffer_size_ += msg.data.length();
-      this->speaker_bytes_received_ += msg.data.length();
-      ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data.length());
+    if (this->speaker_buffer_index_ + msg.data_len < SPEAKER_BUFFER_SIZE) {
+      memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data, msg.data_len);
+      this->speaker_buffer_index_ += msg.data_len;
+      this->speaker_buffer_size_ += msg.data_len;
+      this->speaker_bytes_received_ += msg.data_len;
+      ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data_len);
    } else {
      ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
    }