From 266c2ef337fa5e683c6576180c6f162f743addac Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kevin.ahrendt@nabucasa.com>
Date: Wed, 12 Mar 2025 14:18:31 -0500
Subject: [PATCH] [audio, mixer] Memory and CPU performance improvements
 (#8387)

---
 esphome/components/audio/__init__.py          |  2 +-
 esphome/components/audio/audio_decoder.cpp    | 54 +++++++++++++++----
 esphome/components/audio/audio_reader.cpp     |  4 +-
 esphome/components/audio/audio_resampler.cpp  |  6 ++-
 .../audio/audio_transfer_buffer.cpp           | 28 ++++++----
 .../components/audio/audio_transfer_buffer.h  |  9 +++-
 .../mixer/speaker/mixer_speaker.cpp           |  3 +-
 platformio.ini                                |  4 +-
 8 files changed, 80 insertions(+), 30 deletions(-)

diff --git a/esphome/components/audio/__init__.py b/esphome/components/audio/__init__.py
index 31d3c39ffa..c5ef781060 100644
--- a/esphome/components/audio/__init__.py
+++ b/esphome/components/audio/__init__.py
@@ -118,4 +118,4 @@ def final_validate_audio_schema(
 
 
 async def to_code(config):
-    cg.add_library("esphome/esp-audio-libs", "1.1.1")
+    cg.add_library("esphome/esp-audio-libs", "1.1.2")
diff --git a/esphome/components/audio/audio_decoder.cpp b/esphome/components/audio/audio_decoder.cpp
index ab358ad805..60489d7d78 100644
--- a/esphome/components/audio/audio_decoder.cpp
+++ b/esphome/components/audio/audio_decoder.cpp
@@ -66,19 +66,30 @@ esp_err_t AudioDecoder::start(AudioFileType audio_file_type) {
     case AudioFileType::FLAC:
       this->flac_decoder_ = make_unique<esp_audio_libs::flac::FLACDecoder>();
       this->free_buffer_required_ =
-          this->output_transfer_buffer_->capacity();  // We'll revise this after reading the header
+          this->output_transfer_buffer_->capacity();  // Adjusted and reallocated after reading the header
       break;
 #endif
 #ifdef USE_AUDIO_MP3_SUPPORT
     case AudioFileType::MP3:
       this->mp3_decoder_ = esp_audio_libs::helix_decoder::MP3InitDecoder();
+
+      // An MP3 frame decodes to at most 1152 samples per channel (MPEG-1 Layer III), so size for the worst case
       this->free_buffer_required_ = 1152 * sizeof(int16_t) * 2;  // samples * size per sample * channels
+
+      // Always reallocate the output transfer buffer to the smallest necessary size
+      this->output_transfer_buffer_->reallocate(this->free_buffer_required_);
       break;
 #endif
     case AudioFileType::WAV:
       this->wav_decoder_ = make_unique<esp_audio_libs::wav_decoder::WAVDecoder>();
       this->wav_decoder_->reset();
+
+      // Processing WAVs doesn't require a specific output buffer size, as the data is already in PCM format.
+      // Thus, we only grow the buffer if it is below a small minimum; we never shrink it to a minimum size.
       this->free_buffer_required_ = 1024;
+      if (this->output_transfer_buffer_->capacity() < this->free_buffer_required_) {
+        this->output_transfer_buffer_->reallocate(this->free_buffer_required_);
+      }
       break;
     case AudioFileType::NONE:
     default:
@@ -116,10 +127,18 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {
 
   uint32_t decoding_start = millis();
 
+  bool first_loop_iteration = true;
+
+  size_t bytes_processed = 0;
+  size_t bytes_available_before_processing = 0;
+
   while (state == FileDecoderState::MORE_TO_PROCESS) {
     // Transfer decoded out
     if (!this->pause_output_) {
-      size_t bytes_written = this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+      // Never shift the data in the output transfer buffer to avoid unnecessary, slow data moves
+      size_t bytes_written =
+          this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);
+
       if (this->audio_stream_info_.has_value()) {
         this->accumulated_frames_written_ += this->audio_stream_info_.value().bytes_to_frames(bytes_written);
         this->playback_ms_ +=
@@ -138,12 +157,24 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {
 
     // Decode more audio
 
-    size_t bytes_read = this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+    // Only shift data on the first loop iteration to avoid unnecessary, slow moves
+    size_t bytes_read = this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS),
+                                                                                first_loop_iteration);
 
-    if ((this->potentially_failed_count_ > 0) && (bytes_read == 0)) {
+    if (!first_loop_iteration && (this->input_transfer_buffer_->available() < bytes_processed)) {
+      // Less data is available than what was processed in the last iteration, so don't attempt to decode.
+      // This keeps the decoder from repeatedly trying to decode an incomplete frame. The transfer buffer
+      // will shift the remaining data to the start and copy more from the source the next time the decode function is
+      // called.
+      break;
+    }
+
+    bytes_available_before_processing = this->input_transfer_buffer_->available();
+
+    if ((this->potentially_failed_count_ > 10) && (bytes_read == 0)) {
       // Failed to decode in last attempt and there is no new data
 
-      if (this->input_transfer_buffer_->free() == 0) {
+      if ((this->input_transfer_buffer_->free() == 0) && first_loop_iteration) {
         // The input buffer is full. Since it previously failed on the exact same data, we can never recover
         state = FileDecoderState::FAILED;
       } else {
@@ -175,6 +206,9 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {
       }
     }
 
+    first_loop_iteration = false;
+    bytes_processed = bytes_available_before_processing - this->input_transfer_buffer_->available();
+
     if (state == FileDecoderState::POTENTIALLY_FAILED) {
       ++this->potentially_failed_count_;
     } else if (state == FileDecoderState::END_OF_FILE) {
@@ -207,13 +241,11 @@ FileDecoderState AudioDecoder::decode_flac_() {
     size_t bytes_consumed = this->flac_decoder_->get_bytes_index();
     this->input_transfer_buffer_->decrease_buffer_length(bytes_consumed);
 
+    // Reallocate the output transfer buffer to the smallest necessary size
     this->free_buffer_required_ = flac_decoder_->get_output_buffer_size_bytes();
-    if (this->output_transfer_buffer_->capacity() < this->free_buffer_required_) {
-      // Output buffer is not big enough
-      if (!this->output_transfer_buffer_->reallocate(this->free_buffer_required_)) {
-        // Couldn't reallocate output buffer
-        return FileDecoderState::FAILED;
-      }
+    if (!this->output_transfer_buffer_->reallocate(this->free_buffer_required_)) {
+      // Couldn't reallocate output buffer
+      return FileDecoderState::FAILED;
     }
 
     this->audio_stream_info_ =
diff --git a/esphome/components/audio/audio_reader.cpp b/esphome/components/audio/audio_reader.cpp
index 90b73a1f46..b82c6db9ee 100644
--- a/esphome/components/audio/audio_reader.cpp
+++ b/esphome/components/audio/audio_reader.cpp
@@ -259,14 +259,14 @@ AudioReaderState AudioReader::file_read_() {
 }
 
 AudioReaderState AudioReader::http_read_() {
-  this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+  this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);
 
   if (esp_http_client_is_complete_data_received(this->client_)) {
     if (this->output_transfer_buffer_->available() == 0) {
       this->cleanup_connection_();
       return AudioReaderState::FINISHED;
     }
-  } else {
+  } else if (this->output_transfer_buffer_->free() > 0) {
     size_t bytes_to_read = this->output_transfer_buffer_->free();
     int received_len =
         esp_http_client_read(this->client_, (char *) this->output_transfer_buffer_->get_buffer_end(), bytes_to_read);
diff --git a/esphome/components/audio/audio_resampler.cpp b/esphome/components/audio/audio_resampler.cpp
index 05e9ff6ca1..a7621225a1 100644
--- a/esphome/components/audio/audio_resampler.cpp
+++ b/esphome/components/audio/audio_resampler.cpp
@@ -93,8 +93,9 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_d
   }
 
   if (!this->pause_output_) {
-    // Move audio data to the sink
-    this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+    // Move audio data to the sink without shifting the data in the output transfer buffer to avoid unnecessary, slow
+    // data moves
+    this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);
   } else {
     // If paused, block to avoid wasting CPU resources
     delay(READ_WRITE_TIMEOUT_MS);
@@ -115,6 +116,7 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_d
 
   if ((this->input_stream_info_.get_sample_rate() != this->output_stream_info_.get_sample_rate()) ||
       (this->input_stream_info_.get_bits_per_sample() != this->output_stream_info_.get_bits_per_sample())) {
+    // Adjust gain by -3 dB to avoid clipping due to the resampling process
     esp_audio_libs::resampler::ResamplerResults results =
         this->resampler_->resample(this->input_transfer_buffer_->get_buffer_start(),
                                    this->output_transfer_buffer_->get_buffer_end(), frames_available, frames_free, -3);
diff --git a/esphome/components/audio/audio_transfer_buffer.cpp b/esphome/components/audio/audio_transfer_buffer.cpp
index 9b6067aac4..1566884c3d 100644
--- a/esphome/components/audio/audio_transfer_buffer.cpp
+++ b/esphome/components/audio/audio_transfer_buffer.cpp
@@ -33,12 +33,17 @@ size_t AudioTransferBuffer::free() const {
   if (this->buffer_size_ == 0) {
     return 0;
   }
-  return this->buffer_size_ - (this->buffer_length_ - (this->data_start_ - this->buffer_));
+  return this->buffer_size_ - (this->buffer_length_ + (this->data_start_ - this->buffer_));
 }
 
 void AudioTransferBuffer::decrease_buffer_length(size_t bytes) {
   this->buffer_length_ -= bytes;
-  this->data_start_ += bytes;
+  if (this->buffer_length_ > 0) {
+    this->data_start_ += bytes;
+  } else {
+    // All the data in the buffer has been consumed, reset the start pointer
+    this->data_start_ = this->buffer_;
+  }
 }
 
 void AudioTransferBuffer::increase_buffer_length(size_t bytes) { this->buffer_length_ += bytes; }
@@ -71,7 +76,7 @@ bool AudioTransferBuffer::has_buffered_data() const {
 
 bool AudioTransferBuffer::reallocate(size_t new_buffer_size) {
   if (this->buffer_length_ > 0) {
-    // Already has data in the buffer, fail
+    // Buffer currently has data, so reallocation is impossible
     return false;
   }
   this->deallocate_buffer_();
@@ -106,12 +111,14 @@ void AudioTransferBuffer::deallocate_buffer_() {
   this->buffer_length_ = 0;
 }
 
-size_t AudioSourceTransferBuffer::transfer_data_from_source(TickType_t ticks_to_wait) {
-  // Shift data in buffer to start
-  if (this->buffer_length_ > 0) {
-    memmove(this->buffer_, this->data_start_, this->buffer_length_);
+size_t AudioSourceTransferBuffer::transfer_data_from_source(TickType_t ticks_to_wait, bool pre_shift) {
+  if (pre_shift) {
+    // Shift data in buffer to start
+    if (this->buffer_length_ > 0) {
+      memmove(this->buffer_, this->data_start_, this->buffer_length_);
+    }
+    this->data_start_ = this->buffer_;
   }
-  this->data_start_ = this->buffer_;
 
   size_t bytes_to_read = this->free();
   size_t bytes_read = 0;
@@ -125,7 +132,7 @@ size_t AudioSourceTransferBuffer::transfer_data_from_source(TickType_t ticks_to_
   return bytes_read;
 }
 
-size_t AudioSinkTransferBuffer::transfer_data_to_sink(TickType_t ticks_to_wait) {
+size_t AudioSinkTransferBuffer::transfer_data_to_sink(TickType_t ticks_to_wait, bool post_shift) {
   size_t bytes_written = 0;
   if (this->available()) {
 #ifdef USE_SPEAKER
@@ -139,11 +146,14 @@ size_t AudioSinkTransferBuffer::transfer_data_to_sink(TickType_t ticks_to_wait)
     }
 
     this->decrease_buffer_length(bytes_written);
+  }
 
+  if (post_shift) {
     // Shift unwritten data to the start of the buffer
     memmove(this->buffer_, this->data_start_, this->buffer_length_);
     this->data_start_ = this->buffer_;
   }
+
   return bytes_written;
 }
 
diff --git a/esphome/components/audio/audio_transfer_buffer.h b/esphome/components/audio/audio_transfer_buffer.h
index 4e461db56d..edb484e7d2 100644
--- a/esphome/components/audio/audio_transfer_buffer.h
+++ b/esphome/components/audio/audio_transfer_buffer.h
@@ -60,6 +60,7 @@ class AudioTransferBuffer {
 
  protected:
   /// @brief Allocates the transfer buffer in external memory, if available.
+  /// @param buffer_size The number of bytes to allocate
   /// @return True is successful, false otherwise.
   bool allocate_buffer_(size_t buffer_size);
 
@@ -89,8 +90,10 @@ class AudioSinkTransferBuffer : public AudioTransferBuffer {
 
   /// @brief Writes any available data in the transfer buffer to the sink.
   /// @param ticks_to_wait FreeRTOS ticks to block while waiting for the sink to have enough space
+  /// @param post_shift If true, all remaining data is moved to the start of the buffer after transferring to the sink.
+  ///                   Defaults to true.
   /// @return Number of bytes written
-  size_t transfer_data_to_sink(TickType_t ticks_to_wait);
+  size_t transfer_data_to_sink(TickType_t ticks_to_wait, bool post_shift = true);
 
   /// @brief Adds a ring buffer as the transfer buffer's sink.
   /// @param ring_buffer weak_ptr to the allocated ring buffer
@@ -125,8 +128,10 @@ class AudioSourceTransferBuffer : public AudioTransferBuffer {
 
   /// @brief Reads any available data from the sink into the transfer buffer.
   /// @param ticks_to_wait FreeRTOS ticks to block while waiting for the source to have enough data
+  /// @param pre_shift If true, any unconsumed data is moved to the start of the buffer before transferring from the
+  ///                  source. Defaults to true.
   /// @return Number of bytes read
-  size_t transfer_data_from_source(TickType_t ticks_to_wait);
+  size_t transfer_data_from_source(TickType_t ticks_to_wait, bool pre_shift = true);
 
   /// @brief Adds a ring buffer as the transfer buffer's source.
   /// @param ring_buffer weak_ptr to the allocated ring buffer
diff --git a/esphome/components/mixer/speaker/mixer_speaker.cpp b/esphome/components/mixer/speaker/mixer_speaker.cpp
index 60cff95eb2..d9231154a3 100644
--- a/esphome/components/mixer/speaker/mixer_speaker.cpp
+++ b/esphome/components/mixer/speaker/mixer_speaker.cpp
@@ -490,7 +490,8 @@ void MixerSpeaker::audio_mixer_task(void *params) {
       break;
     }
 
-    output_transfer_buffer->transfer_data_to_sink(pdMS_TO_TICKS(TASK_DELAY_MS));
+    // Never shift the data in the output transfer buffer to avoid unnecessary, slow data moves
+    output_transfer_buffer->transfer_data_to_sink(pdMS_TO_TICKS(TASK_DELAY_MS), false);
 
     const uint32_t output_frames_free =
         this_mixer->audio_stream_info_.value().bytes_to_frames(output_transfer_buffer->free());
diff --git a/platformio.ini b/platformio.ini
index fab7fda659..a2c2a74ac0 100644
--- a/platformio.ini
+++ b/platformio.ini
@@ -128,7 +128,7 @@ lib_deps =
     DNSServer                            ; captive_portal (Arduino built-in)
     esphome/ESP32-audioI2S@2.0.7         ; i2s_audio
     droscy/esp_wireguard@0.4.2           ; wireguard
-    esphome/esp-audio-libs@1.1.1         ; audio
+    esphome/esp-audio-libs@1.1.2         ; audio
 
 build_flags =
     ${common:arduino.build_flags}
@@ -149,7 +149,7 @@ lib_deps =
     ${common:idf.lib_deps}
     droscy/esp_wireguard@0.4.2              ; wireguard
     kahrendt/ESPMicroSpeechFeatures@1.1.0   ; micro_wake_word
-    esphome/esp-audio-libs@1.1.1            ; audio
+    esphome/esp-audio-libs@1.1.2            ; audio
 build_flags =
     ${common:idf.build_flags}
     -Wno-nonnull-compare