From 51259888bf1a3fb640bb299f8dfd4b9a5ff48fe2 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Fri, 2 Jan 2026 14:10:21 -1000 Subject: [PATCH] [voice_assistant] Use zero-copy buffer access for audio data (#12656) --- esphome/components/api/api.proto | 2 +- esphome/components/api/api_pb2.cpp | 10 ++++++---- esphome/components/api/api_pb2.h | 11 +++-------- esphome/components/api/api_pb2_dump.cpp | 6 +----- .../voice_assistant/voice_assistant.cpp | 15 ++++++++------- 5 files changed, 19 insertions(+), 25 deletions(-) diff --git a/esphome/components/api/api.proto b/esphome/components/api/api.proto index b5aaec430c..43b721c2d5 100644 --- a/esphome/components/api/api.proto +++ b/esphome/components/api/api.proto @@ -1937,7 +1937,7 @@ message VoiceAssistantAudio { option (source) = SOURCE_BOTH; option (ifdef) = "USE_VOICE_ASSISTANT"; - bytes data = 1; + bytes data = 1 [(pointer_to_buffer) = true]; bool end = 2; } diff --git a/esphome/components/api/api_pb2.cpp b/esphome/components/api/api_pb2.cpp index 1147cd986e..698e08f9b3 100644 --- a/esphome/components/api/api_pb2.cpp +++ b/esphome/components/api/api_pb2.cpp @@ -2527,20 +2527,22 @@ bool VoiceAssistantAudio::decode_varint(uint32_t field_id, ProtoVarInt value) { } bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited value) { switch (field_id) { - case 1: - this->data = value.as_string(); + case 1: { + this->data = value.data(); + this->data_len = value.size(); break; + } default: return false; } return true; } void VoiceAssistantAudio::encode(ProtoWriteBuffer buffer) const { - buffer.encode_bytes(1, this->data_ptr_, this->data_len_); + buffer.encode_bytes(1, this->data, this->data_len); buffer.encode_bool(2, this->end); } void VoiceAssistantAudio::calculate_size(ProtoSize &size) const { - size.add_length(1, this->data_len_); + size.add_length(1, this->data_len); size.add_bool(1, this->end); } bool VoiceAssistantTimerEventResponse::decode_varint(uint32_t field_id, ProtoVarInt value) { diff --git a/esphome/components/api/api_pb2.h b/esphome/components/api/api_pb2.h index 4b14697181..6275b4c211 100644 --- a/esphome/components/api/api_pb2.h +++ b/esphome/components/api/api_pb2.h @@ -2521,17 +2521,12 @@ class VoiceAssistantEventResponse final : public ProtoDecodableMessage { class VoiceAssistantAudio final : public ProtoDecodableMessage { public: static constexpr uint8_t MESSAGE_TYPE = 106; - static constexpr uint8_t ESTIMATED_SIZE = 11; + static constexpr uint8_t ESTIMATED_SIZE = 21; #ifdef HAS_PROTO_MESSAGE_DUMP const char *message_name() const override { return "voice_assistant_audio"; } #endif - std::string data{}; - const uint8_t *data_ptr_{nullptr}; - size_t data_len_{0}; - void set_data(const uint8_t *data, size_t len) { - this->data_ptr_ = data; - this->data_len_ = len; - } + const uint8_t *data{nullptr}; + uint16_t data_len{0}; bool end{false}; void encode(ProtoWriteBuffer buffer) const override; void calculate_size(ProtoSize &size) const override; diff --git a/esphome/components/api/api_pb2_dump.cpp b/esphome/components/api/api_pb2_dump.cpp index 12df109a3d..1ec6645b3f 100644 --- a/esphome/components/api/api_pb2_dump.cpp +++ b/esphome/components/api/api_pb2_dump.cpp @@ -1978,11 +1978,7 @@ void VoiceAssistantEventResponse::dump_to(std::string &out) const { void VoiceAssistantAudio::dump_to(std::string &out) const { MessageDumpHelper helper(out, "VoiceAssistantAudio"); out.append(" data: "); - if (this->data_ptr_ != nullptr) { - out.append(format_hex_pretty(this->data_ptr_, this->data_len_)); - } else { - out.append(format_hex_pretty(reinterpret_cast(this->data.data()), this->data.size())); - } + out.append(format_hex_pretty(this->data, this->data_len)); out.append("\n"); dump_field(out, "end", this->end); } diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp index 9bb5393be2..8101d210b3 100644 --- a/esphome/components/voice_assistant/voice_assistant.cpp +++ b/esphome/components/voice_assistant/voice_assistant.cpp @@ -272,7 +272,8 @@ void VoiceAssistant::loop() { size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0); if (this->audio_mode_ == AUDIO_MODE_API) { api::VoiceAssistantAudio msg; - msg.set_data(this->send_buffer_, read_bytes); + msg.data = this->send_buffer_; + msg.data_len = read_bytes; this->api_client_->send_message(msg, api::VoiceAssistantAudio::MESSAGE_TYPE); } else { if (!this->udp_socket_running_) { @@ -841,12 +842,12 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { void VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg) { #ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) { - if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) { - memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length()); - this->speaker_buffer_index_ += msg.data.length(); - this->speaker_buffer_size_ += msg.data.length(); - this->speaker_bytes_received_ += msg.data.length(); - ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data.length()); + if (this->speaker_buffer_index_ + msg.data_len < SPEAKER_BUFFER_SIZE) { + memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data, msg.data_len); + this->speaker_buffer_index_ += msg.data_len; + this->speaker_buffer_size_ += msg.data_len; + this->speaker_bytes_received_ += msg.data_len; + ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data_len); } else { ESP_LOGE(TAG, "Cannot receive audio, buffer is full"); }