[audio, mixer] Memory and CPU performance improvements (#8387)

2025-04-12 22:00:30 +01:00 · 2025-03-12 14:18:31 -05:00 · 2025-03-12 14:18:31 -05:00 · 266c2ef337
commit 266c2ef337
parent 35199c9b96
8 changed files with 80 additions and 30 deletions
--- a/esphome/components/audio/init.py
+++ b/esphome/components/audio/init.py
@ -118,4 +118,4 @@ def final_validate_audio_schema(


 async def to_code(config):
-    cg.add_library("esphome/esp-audio-libs", "1.1.1")
+    cg.add_library("esphome/esp-audio-libs", "1.1.2")
--- a/esphome/components/audio/audio_decoder.cpp
+++ b/esphome/components/audio/audio_decoder.cpp
@ -66,19 +66,30 @@ esp_err_t AudioDecoder::start(AudioFileType audio_file_type) {
    case AudioFileType::FLAC:
      this->flac_decoder_ = make_unique<esp_audio_libs::flac::FLACDecoder>();
      this->free_buffer_required_ =
-          this->output_transfer_buffer_->capacity();  // We'll revise this after reading the header
+          this->output_transfer_buffer_->capacity();  // Adjusted and reallocated after reading the header
      break;
 #endif
 #ifdef USE_AUDIO_MP3_SUPPORT
    case AudioFileType::MP3:
      this->mp3_decoder_ = esp_audio_libs::helix_decoder::MP3InitDecoder();
+
+      // MP3 always has 1152 samples per chunk
      this->free_buffer_required_ = 1152 * sizeof(int16_t) * 2;  // samples * size per sample * channels
+
+      // Always reallocate the output transfer buffer to the smallest necessary size
+      this->output_transfer_buffer_->reallocate(this->free_buffer_required_);
      break;
 #endif
    case AudioFileType::WAV:
      this->wav_decoder_ = make_unique<esp_audio_libs::wav_decoder::WAVDecoder>();
      this->wav_decoder_->reset();
+
+      // Decoding WAVs doesn't require a specific output buffer size, as the audio is already in PCM format.
+      // Thus, the buffer is only reallocated if it is too small; it is never shrunk to a minimum size.
      this->free_buffer_required_ = 1024;
+      if (this->output_transfer_buffer_->capacity() < this->free_buffer_required_) {
+        this->output_transfer_buffer_->reallocate(this->free_buffer_required_);
+      }
      break;
    case AudioFileType::NONE:
    default:
@ -116,10 +127,18 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {

  uint32_t decoding_start = millis();

+  bool first_loop_iteration = true;
+
+  size_t bytes_processed = 0;
+  size_t bytes_available_before_processing = 0;
+
  while (state == FileDecoderState::MORE_TO_PROCESS) {
    // Transfer decoded out
    if (!this->pause_output_) {
-      size_t bytes_written = this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+      // Never shift the data in the output transfer buffer to avoid unnecessary, slow data moves
+      size_t bytes_written =
+          this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);
+
      if (this->audio_stream_info_.has_value()) {
        this->accumulated_frames_written_ += this->audio_stream_info_.value().bytes_to_frames(bytes_written);
        this->playback_ms_ +=
@ -138,12 +157,24 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {

    // Decode more audio

-    size_t bytes_read = this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+    // Only shift data on the first loop iteration to avoid unnecessary, slow moves
+    size_t bytes_read = this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS),
+                                                                                first_loop_iteration);

-    if ((this->potentially_failed_count_ > 0) && (bytes_read == 0)) {
+    if (!first_loop_iteration && (this->input_transfer_buffer_->available() < bytes_processed)) {
+      // Less data is available than was processed in the last iteration, so don't attempt to decode.
+      // This keeps the decoder from repeatedly trying to decode an incomplete frame. The transfer buffer
+      // will shift the remaining data to the start and copy more from the source the next time the decode function is
+      // called.
+      break;
+    }
+
+    bytes_available_before_processing = this->input_transfer_buffer_->available();
+
+    if ((this->potentially_failed_count_ > 10) && (bytes_read == 0)) {
      // Failed to decode in last attempt and there is no new data

-      if (this->input_transfer_buffer_->free() == 0) {
+      if ((this->input_transfer_buffer_->free() == 0) && first_loop_iteration) {
        // The input buffer is full. Since it previously failed on the exact same data, we can never recover
        state = FileDecoderState::FAILED;
      } else {
@ -175,6 +206,9 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {
      }
    }

+    first_loop_iteration = false;
+    bytes_processed = bytes_available_before_processing - this->input_transfer_buffer_->available();
+
    if (state == FileDecoderState::POTENTIALLY_FAILED) {
      ++this->potentially_failed_count_;
    } else if (state == FileDecoderState::END_OF_FILE) {
@ -207,13 +241,11 @@ FileDecoderState AudioDecoder::decode_flac_() {
    size_t bytes_consumed = this->flac_decoder_->get_bytes_index();
    this->input_transfer_buffer_->decrease_buffer_length(bytes_consumed);

+    // Reallocate the output transfer buffer to the smallest necessary size
    this->free_buffer_required_ = flac_decoder_->get_output_buffer_size_bytes();
-    if (this->output_transfer_buffer_->capacity() < this->free_buffer_required_) {
-      // Output buffer is not big enough
-      if (!this->output_transfer_buffer_->reallocate(this->free_buffer_required_)) {
-        // Couldn't reallocate output buffer
-        return FileDecoderState::FAILED;
-      }
+    if (!this->output_transfer_buffer_->reallocate(this->free_buffer_required_)) {
+      // Couldn't reallocate output buffer
+      return FileDecoderState::FAILED;
    }

    this->audio_stream_info_ =
--- a/esphome/components/audio/audio_reader.cpp
+++ b/esphome/components/audio/audio_reader.cpp
@ -259,14 +259,14 @@ AudioReaderState AudioReader::file_read_() {
 }

 AudioReaderState AudioReader::http_read_() {
-  this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+  this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);

  if (esp_http_client_is_complete_data_received(this->client_)) {
    if (this->output_transfer_buffer_->available() == 0) {
      this->cleanup_connection_();
      return AudioReaderState::FINISHED;
    }
-  } else {
+  } else if (this->output_transfer_buffer_->free() > 0) {
    size_t bytes_to_read = this->output_transfer_buffer_->free();
    int received_len =
        esp_http_client_read(this->client_, (char *) this->output_transfer_buffer_->get_buffer_end(), bytes_to_read);
--- a/esphome/components/audio/audio_resampler.cpp
+++ b/esphome/components/audio/audio_resampler.cpp
@ -93,8 +93,9 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_d
  }

  if (!this->pause_output_) {
-    // Move audio data to the sink
-    this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+    // Move audio data to the sink without shifting the data in the output transfer buffer to avoid unnecessary, slow
+    // data moves
+    this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);
  } else {
    // If paused, block to avoid wasting CPU resources
    delay(READ_WRITE_TIMEOUT_MS);
@ -115,6 +116,7 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_d

  if ((this->input_stream_info_.get_sample_rate() != this->output_stream_info_.get_sample_rate()) ||
      (this->input_stream_info_.get_bits_per_sample() != this->output_stream_info_.get_bits_per_sample())) {
+    // Adjust gain by -3 dB to avoid clipping due to the resampling process
    esp_audio_libs::resampler::ResamplerResults results =
        this->resampler_->resample(this->input_transfer_buffer_->get_buffer_start(),
                                   this->output_transfer_buffer_->get_buffer_end(), frames_available, frames_free, -3);
--- a/esphome/components/audio/audio_transfer_buffer.cpp
+++ b/esphome/components/audio/audio_transfer_buffer.cpp
@ -33,12 +33,17 @@ size_t AudioTransferBuffer::free() const {
  if (this->buffer_size_ == 0) {
    return 0;
  }
-  return this->buffer_size_ - (this->buffer_length_ - (this->data_start_ - this->buffer_));
+  return this->buffer_size_ - (this->buffer_length_ + (this->data_start_ - this->buffer_));
 }

 void AudioTransferBuffer::decrease_buffer_length(size_t bytes) {
  this->buffer_length_ -= bytes;
-  this->data_start_ += bytes;
+  if (this->buffer_length_ > 0) {
+    this->data_start_ += bytes;
+  } else {
+    // All the data in the buffer has been consumed, reset the start pointer
+    this->data_start_ = this->buffer_;
+  }
 }

 void AudioTransferBuffer::increase_buffer_length(size_t bytes) { this->buffer_length_ += bytes; }
@ -71,7 +76,7 @@ bool AudioTransferBuffer::has_buffered_data() const {

 bool AudioTransferBuffer::reallocate(size_t new_buffer_size) {
  if (this->buffer_length_ > 0) {
-    // Already has data in the buffer, fail
+    // Buffer currently has data, so reallocation is impossible
    return false;
  }
  this->deallocate_buffer_();
@ -106,12 +111,14 @@ void AudioTransferBuffer::deallocate_buffer_() {
  this->buffer_length_ = 0;
 }

-size_t AudioSourceTransferBuffer::transfer_data_from_source(TickType_t ticks_to_wait) {
-  // Shift data in buffer to start
-  if (this->buffer_length_ > 0) {
-    memmove(this->buffer_, this->data_start_, this->buffer_length_);
+size_t AudioSourceTransferBuffer::transfer_data_from_source(TickType_t ticks_to_wait, bool pre_shift) {
+  if (pre_shift) {
+    // Shift data in buffer to start
+    if (this->buffer_length_ > 0) {
+      memmove(this->buffer_, this->data_start_, this->buffer_length_);
+    }
+    this->data_start_ = this->buffer_;
  }
-  this->data_start_ = this->buffer_;

  size_t bytes_to_read = this->free();
  size_t bytes_read = 0;
@ -125,7 +132,7 @@ size_t AudioSourceTransferBuffer::transfer_data_from_source(TickType_t ticks_to_
  return bytes_read;
 }

-size_t AudioSinkTransferBuffer::transfer_data_to_sink(TickType_t ticks_to_wait) {
+size_t AudioSinkTransferBuffer::transfer_data_to_sink(TickType_t ticks_to_wait, bool post_shift) {
  size_t bytes_written = 0;
  if (this->available()) {
 #ifdef USE_SPEAKER
@ -139,11 +146,14 @@ size_t AudioSinkTransferBuffer::transfer_data_to_sink(TickType_t ticks_to_wait)
    }

    this->decrease_buffer_length(bytes_written);
+  }

+  if (post_shift) {
    // Shift unwritten data to the start of the buffer
    memmove(this->buffer_, this->data_start_, this->buffer_length_);
    this->data_start_ = this->buffer_;
  }
+
  return bytes_written;
 }

--- a/esphome/components/audio/audio_transfer_buffer.h
+++ b/esphome/components/audio/audio_transfer_buffer.h
@ -60,6 +60,7 @@ class AudioTransferBuffer {

 protected:
  /// @brief Allocates the transfer buffer in external memory, if available.
+  /// @param buffer_size The number of bytes to allocate
  /// @return True if successful, false otherwise.
  bool allocate_buffer_(size_t buffer_size);

@ -89,8 +90,10 @@ class AudioSinkTransferBuffer : public AudioTransferBuffer {

  /// @brief Writes any available data in the transfer buffer to the sink.
  /// @param ticks_to_wait FreeRTOS ticks to block while waiting for the sink to have enough space
+  /// @param post_shift If true, all remaining data is moved to the start of the buffer after transferring to the sink.
+  ///                   Defaults to true.
  /// @return Number of bytes written
-  size_t transfer_data_to_sink(TickType_t ticks_to_wait);
+  size_t transfer_data_to_sink(TickType_t ticks_to_wait, bool post_shift = true);

  /// @brief Adds a ring buffer as the transfer buffer's sink.
  /// @param ring_buffer weak_ptr to the allocated ring buffer
@ -125,8 +128,10 @@ class AudioSourceTransferBuffer : public AudioTransferBuffer {

  /// @brief Reads any available data from the source into the transfer buffer.
  /// @param ticks_to_wait FreeRTOS ticks to block while waiting for the source to have enough data
+  /// @param pre_shift If true, any unconsumed data is moved to the start of the buffer before transferring from the
+  ///                  source. Defaults to true.
  /// @return Number of bytes read
-  size_t transfer_data_from_source(TickType_t ticks_to_wait);
+  size_t transfer_data_from_source(TickType_t ticks_to_wait, bool pre_shift = true);

  /// @brief Adds a ring buffer as the transfer buffer's source.
  /// @param ring_buffer weak_ptr to the allocated ring buffer
--- a/esphome/components/mixer/speaker/mixer_speaker.cpp
+++ b/esphome/components/mixer/speaker/mixer_speaker.cpp
@ -490,7 +490,8 @@ void MixerSpeaker::audio_mixer_task(void *params) {
      break;
    }

-    output_transfer_buffer->transfer_data_to_sink(pdMS_TO_TICKS(TASK_DELAY_MS));
+    // Never shift the data in the output transfer buffer to avoid unnecessary, slow data moves
+    output_transfer_buffer->transfer_data_to_sink(pdMS_TO_TICKS(TASK_DELAY_MS), false);

    const uint32_t output_frames_free =
        this_mixer->audio_stream_info_.value().bytes_to_frames(output_transfer_buffer->free());
--- a/platformio.ini
+++ b/platformio.ini
@ -128,7 +128,7 @@ lib_deps =
    DNSServer                            ; captive_portal (Arduino built-in)
    esphome/ESP32-audioI2S@2.0.7         ; i2s_audio
    droscy/esp_wireguard@0.4.2           ; wireguard
-    esphome/esp-audio-libs@1.1.1         ; audio
+    esphome/esp-audio-libs@1.1.2         ; audio

 build_flags =
    ${common:arduino.build_flags}
@ -149,7 +149,7 @@ lib_deps =
    ${common:idf.lib_deps}
    droscy/esp_wireguard@0.4.2              ; wireguard
    kahrendt/ESPMicroSpeechFeatures@1.1.0   ; micro_wake_word
-    esphome/esp-audio-libs@1.1.1            ; audio
+    esphome/esp-audio-libs@1.1.2            ; audio
 build_flags =
    ${common:idf.build_flags}
    -Wno-nonnull-compare