From c8bbc2e84c2eb53c58ec43f0e69a2a4501f8afab Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kevin.ahrendt@nabucasa.com>
Date: Mon, 3 Feb 2025 16:34:20 -0600
Subject: [PATCH 1/3] [audio] Media Player Components PR4 (#8166)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
---
 esphome/components/audio/audio_reader.cpp | 308 ++++++++++++++++++++++
 esphome/components/audio/audio_reader.h   |  85 ++++++
 2 files changed, 393 insertions(+)
 create mode 100644 esphome/components/audio/audio_reader.cpp
 create mode 100644 esphome/components/audio/audio_reader.h
diff --git a/esphome/components/audio/audio_reader.cpp b/esphome/components/audio/audio_reader.cpp
new file mode 100644
index 0000000000..b93e4e74ea
--- /dev/null
+++ b/esphome/components/audio/audio_reader.cpp
@@ -0,0 +1,308 @@
+#include "audio_reader.h"
+
+#ifdef USE_ESP_IDF
+
+#include "esphome/core/defines.h"
+#include "esphome/core/hal.h"
+#include "esphome/core/helpers.h"
+
+#if CONFIG_MBEDTLS_CERTIFICATE_BUNDLE
+#include "esp_crt_bundle.h"
+#endif
+
+namespace esphome {
+namespace audio {
+
+static const uint32_t READ_WRITE_TIMEOUT_MS = 20;  // Timeout for sink transfers; also the delay after an empty read
+
+// The number of times the http read times out with no data before throwing an error
+static const uint32_t ERROR_COUNT_NO_DATA_READ_TIMEOUT = 100;
+
+static const size_t HTTP_STREAM_BUFFER_SIZE = 2048;  // Passed to the http client as its buffer_size
+
+static const uint8_t MAX_REDIRECTION = 5;  // Maximum number of redirects start() follows manually
+
+// Some common HTTP status codes - borrowed from http_request component accessed 20241224
+enum HttpStatus {
+  HTTP_STATUS_OK = 200,  // Lowest status code start() accepts
+  HTTP_STATUS_NO_CONTENT = 204,
+  HTTP_STATUS_PARTIAL_CONTENT = 206,
+
+  /* 3xx - Redirection */
+  HTTP_STATUS_MULTIPLE_CHOICES = 300,
+  HTTP_STATUS_MOVED_PERMANENTLY = 301,
+  HTTP_STATUS_FOUND = 302,
+  HTTP_STATUS_SEE_OTHER = 303,
+  HTTP_STATUS_NOT_MODIFIED = 304,
+  HTTP_STATUS_TEMPORARY_REDIRECT = 307,
+  HTTP_STATUS_PERMANENT_REDIRECT = 308,  // Highest status code start() accepts
+
+  /* 4xx - Client Error */
+  HTTP_STATUS_BAD_REQUEST = 400,
+  HTTP_STATUS_UNAUTHORIZED = 401,
+  HTTP_STATUS_FORBIDDEN = 403,
+  HTTP_STATUS_NOT_FOUND = 404,
+  HTTP_STATUS_METHOD_NOT_ALLOWED = 405,
+  HTTP_STATUS_NOT_ACCEPTABLE = 406,
+  HTTP_STATUS_LENGTH_REQUIRED = 411,
+
+  /* 5xx - Server Error */
+  HTTP_STATUS_INTERNAL_ERROR = 500
+};
+
+AudioReader::~AudioReader() { this->cleanup_connection_(); }  // Releases the http client if one is still open
+
+esp_err_t AudioReader::add_sink(const std::weak_ptr<RingBuffer> &output_ring_buffer) {
+  if (this->current_audio_file_ != nullptr) {
+    // A transfer buffer isn't necessary for a local file; file_read_() writes directly to the ring buffer
+    this->file_ring_buffer_ = output_ring_buffer.lock();
+    // An expired weak_ptr locks to nullptr, which file_read_() would dereference, so report it here instead
+    return (this->file_ring_buffer_ != nullptr) ? ESP_OK : ESP_ERR_INVALID_STATE;
+  }
+
+  if (this->output_transfer_buffer_ != nullptr) {
+    this->output_transfer_buffer_->set_sink(output_ring_buffer);
+    return ESP_OK;
+  }
+  return ESP_ERR_INVALID_STATE;  // No local file is set and no transfer buffer has been allocated
+}
+
+esp_err_t AudioReader::start(AudioFile *audio_file, AudioFileType &file_type) {
+  file_type = AudioFileType::NONE;  // Value the caller sees if the file is rejected below
+  if (audio_file == nullptr) {
+    return ESP_ERR_INVALID_ARG;  // Guards the dereferences below against a missing file
+  }
+  this->current_audio_file_ = audio_file;
+  this->file_current_ = audio_file->data;  // Read position starts at the first byte of the file data
+  file_type = audio_file->file_type;
+  return ESP_OK;
+}
+
+esp_err_t AudioReader::start(const std::string &uri, AudioFileType &file_type) {
+  file_type = AudioFileType::NONE;
+  this->audio_file_type_ = AudioFileType::NONE;  // Forget any Content-Type result from a previous start()
+  this->cleanup_connection_();
+
+  if (uri.empty()) {
+    return ESP_ERR_INVALID_ARG;
+  }
+
+  esp_http_client_config_t client_config = {};
+  // http_event_handler receives this reader through user_data so it can record the Content-Type
+  client_config.url = uri.c_str();
+  client_config.cert_pem = nullptr;
+  client_config.disable_auto_redirect = false;
+  client_config.max_redirection_count = 10;
+  client_config.event_handler = http_event_handler;
+  client_config.user_data = this;
+  client_config.buffer_size = HTTP_STREAM_BUFFER_SIZE;
+  client_config.keep_alive_enable = true;
+  client_config.timeout_ms = 5000;  // Shouldn't trigger watchdog resets if caller runs in a task
+
+#if CONFIG_MBEDTLS_CERTIFICATE_BUNDLE
+  if (uri.find("https:") != std::string::npos) {
+    client_config.crt_bundle_attach = esp_crt_bundle_attach;
+  }
+#endif
+
+  this->client_ = esp_http_client_init(&client_config);
+
+  if (this->client_ == nullptr) {
+    return ESP_FAIL;
+  }
+
+  esp_err_t err = esp_http_client_open(this->client_, 0);
+
+  if (err != ESP_OK) {
+    this->cleanup_connection_();
+    return err;
+  }
+
+  int64_t header_length = esp_http_client_fetch_headers(this->client_);
+  if (header_length < 0) {
+    this->cleanup_connection_();
+    return ESP_FAIL;
+  }
+
+  int status_code = esp_http_client_get_status_code(this->client_);
+
+  if ((status_code < HTTP_STATUS_OK) || (status_code > HTTP_STATUS_PERMANENT_REDIRECT)) {
+    this->cleanup_connection_();
+    return ESP_FAIL;
+  }
+
+  ssize_t redirect_count = 0;
+  // Follow redirects manually, validating each new response, at most MAX_REDIRECTION times
+  while ((esp_http_client_set_redirection(this->client_) == ESP_OK) && (redirect_count < MAX_REDIRECTION)) {
+    err = esp_http_client_open(this->client_, 0);
+    if (err != ESP_OK) {
+      this->cleanup_connection_();
+      return ESP_FAIL;
+    }
+
+    header_length = esp_http_client_fetch_headers(this->client_);
+    if (header_length < 0) {
+      this->cleanup_connection_();
+      return ESP_FAIL;
+    }
+
+    status_code = esp_http_client_get_status_code(this->client_);
+
+    if ((status_code < HTTP_STATUS_OK) || (status_code > HTTP_STATUS_PERMANENT_REDIRECT)) {
+      this->cleanup_connection_();
+      return ESP_FAIL;
+    }
+
+    ++redirect_count;
+  }
+
+  if (this->audio_file_type_ == AudioFileType::NONE) {
+    // Failed to determine the file type from the header, fallback to using the url
+    char url[500];
+    err = esp_http_client_get_url(this->client_, url, 500);
+    if (err != ESP_OK) {
+      this->cleanup_connection_();
+      return err;
+    }
+
+    std::string url_string = str_lower_case(url);
+
+    if (str_endswith(url_string, ".wav")) {
+      file_type = AudioFileType::WAV;
+    }
+#ifdef USE_AUDIO_MP3_SUPPORT
+    else if (str_endswith(url_string, ".mp3")) {
+      file_type = AudioFileType::MP3;
+    }
+#endif
+#ifdef USE_AUDIO_FLAC_SUPPORT
+    else if (str_endswith(url_string, ".flac")) {
+      file_type = AudioFileType::FLAC;
+    }
+#endif
+    else {
+      file_type = AudioFileType::NONE;
+      this->cleanup_connection_();
+      return ESP_ERR_NOT_SUPPORTED;
+    }
+  } else {
+    file_type = this->audio_file_type_;
+  }
+
+  this->no_data_read_count_ = 0;
+  this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(this->buffer_size_);
+  if (this->output_transfer_buffer_ == nullptr) {
+    this->cleanup_connection_();  // Otherwise read() would use the open client with no transfer buffer
+    return ESP_ERR_NO_MEM;
+  }
+
+  return ESP_OK;
+}
+
+AudioReaderState AudioReader::read() {
+  // An open http client takes priority over a local file
+  if (this->client_ != nullptr) {
+    return this->http_read_();
+  }
+  // Otherwise read from the local file if one was started; with no source at all, report failure
+  const bool has_local_file = this->current_audio_file_ != nullptr;
+  return has_local_file ? this->file_read_() : AudioReaderState::FAILED;
+}
+
+AudioFileType AudioReader::get_audio_type(const char *content_type) {
+  const auto equals_ignoring_case = [content_type](const char *mime) { return strcasecmp(content_type, mime) == 0; };
+#ifdef USE_AUDIO_MP3_SUPPORT
+  if (equals_ignoring_case("mp3") || equals_ignoring_case("audio/mp3") || equals_ignoring_case("audio/mpeg")) {
+    return AudioFileType::MP3;
+  }
+#endif
+  if (equals_ignoring_case("audio/wav")) {
+    return AudioFileType::WAV;
+  }
+#ifdef USE_AUDIO_FLAC_SUPPORT
+  if (equals_ignoring_case("audio/flac") || equals_ignoring_case("audio/x-flac")) {
+    return AudioFileType::FLAC;
+  }
+#endif
+  return AudioFileType::NONE;  // The whole Content-Type value matched none of the types compiled in
+}
+
+esp_err_t AudioReader::http_event_handler(esp_http_client_event_t *evt) {
+  // Based on https://github.com/maroc81/WeatherLily/tree/main/main/net accessed 20241224
+  // Only header events are of interest; every other event is acknowledged and ignored
+  if (evt->event_id != HTTP_EVENT_ON_HEADER) {
+    return ESP_OK;
+  }
+
+  // start() registers the reader instance as the client's user_data
+  auto *reader = static_cast<AudioReader *>(evt->user_data);
+  const bool is_content_type = strcasecmp(evt->header_key, "Content-Type") == 0;
+  if (is_content_type) {
+    reader->audio_file_type_ = get_audio_type(evt->header_value);
+  }
+  return ESP_OK;
+}
+
+AudioReaderState AudioReader::file_read_() {
+  size_t remaining_bytes = this->current_audio_file_->length - (this->file_current_ - this->current_audio_file_->data);
+  if (remaining_bytes == 0) {
+    return AudioReaderState::FINISHED;  // Every byte of the file has been handed to the ring buffer
+  }
+  if (this->file_ring_buffer_ == nullptr) {
+    return AudioReaderState::FAILED;  // No sink was added, so there is nowhere to write the file data
+  }
+  this->file_current_ += this->file_ring_buffer_->write_without_replacement(this->file_current_, remaining_bytes,
+                                                                            pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+  return AudioReaderState::READING;  // Possibly a partial write; the read position advanced by what was accepted
+}
+
+AudioReaderState AudioReader::http_read_() {
+  // Push already-buffered data toward the sink before fetching more
+  this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+  if (esp_http_client_is_complete_data_received(this->client_)) {
+    // The whole response has arrived, so the read is finished once the transfer buffer has drained
+    if (this->output_transfer_buffer_->available() != 0) {
+      return AudioReaderState::READING;
+    }
+    this->cleanup_connection_();
+    return AudioReaderState::FINISHED;
+  }
+
+  const size_t free_space = this->output_transfer_buffer_->free();
+  char *write_position = reinterpret_cast<char *>(this->output_transfer_buffer_->get_buffer_end());
+  const int bytes_received = esp_http_client_read(this->client_, write_position, free_space);
+  if (bytes_received < 0) {
+    this->cleanup_connection_();
+    return AudioReaderState::FAILED;  // HTTP read error
+  }
+  if (bytes_received > 0) {
+    this->output_transfer_buffer_->increase_buffer_length(bytes_received);
+    this->no_data_read_count_ = 0;
+    return AudioReaderState::READING;
+  }
+  if (free_space == 0) {
+    return AudioReaderState::READING;  // Nothing was requested, so this doesn't count as a timeout
+  }
+  // Read timed out
+  ++this->no_data_read_count_;
+  if (this->no_data_read_count_ >= ERROR_COUNT_NO_DATA_READ_TIMEOUT) {
+    // Timed out with no data read too many times, so the http read has failed
+    this->cleanup_connection_();
+    return AudioReaderState::FAILED;
+  }
+  delay(READ_WRITE_TIMEOUT_MS);
+  return AudioReaderState::READING;
+}
+
+void AudioReader::cleanup_connection_() {
+  if (this->client_ != nullptr) {  // Nothing to do when no http client exists
+    esp_http_client_close(this->client_);
+    esp_http_client_cleanup(this->client_);
+    this->client_ = nullptr;  // read() uses this to tell whether an http source is active
+  }
+}
+
+}  // namespace audio
+}  // namespace esphome
+
+#endif
diff --git a/esphome/components/audio/audio_reader.h b/esphome/components/audio/audio_reader.h
new file mode 100644
index 0000000000..90113e6dda
--- /dev/null
+++ b/esphome/components/audio/audio_reader.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#ifdef USE_ESP_IDF
+
+#include "audio.h"
+#include "audio_transfer_buffer.h"
+
+#include "esphome/core/ring_buffer.h"
+
+#include "esp_err.h"
+
+#include <esp_http_client.h>
+
+namespace esphome {
+namespace audio {
+
+enum class AudioReaderState : uint8_t {
+  READING = 0,  // The source isn't exhausted yet; more data may still be read and transferred
+  FINISHED,     // All data has been read and transferred
+  FAILED,       // Encountered an error, or read() was called with no source started
+};
+
+class AudioReader {
+  /*
+   * @brief Class that facilitates reading a raw audio file.
+   * Files can be read from flash (stored in an AudioFile struct) or from an http source.
+   * The file data is sent to a ring buffer sink.
+   */
+ public:
+  /// @brief Constructs an AudioReader object.
+  /// The transfer buffer isn't allocated here, but only if necessary (an http source) in the start function.
+  /// @param buffer_size Transfer buffer size in bytes.
+  AudioReader(size_t buffer_size) : buffer_size_(buffer_size) {}
+  ~AudioReader();
+
+  /// @brief Adds a sink ring buffer for audio data. Call after start(), which determines how the sink is used.
+  /// @param output_ring_buffer weak_ptr to the sink ring buffer (locked into a shared_ptr for a local file source)
+  /// @return  ESP_OK if successful, ESP_ERR_INVALID_STATE otherwise
+  esp_err_t add_sink(const std::weak_ptr<RingBuffer> &output_ring_buffer);
+
+  /// @brief Starts reading an audio file from an http source. The transfer buffer is allocated here.
+  /// @param uri Web url to the http file.
+  /// @param file_type AudioFileType variable passed-by-reference indicating the type of file being read.
+  /// @return ESP_OK if successful, an ESP_ERR* code otherwise.
+  esp_err_t start(const std::string &uri, AudioFileType &file_type);
+
+  /// @brief Starts reading an audio file from flash. No transfer buffer is allocated.
+  /// @param audio_file AudioFile struct containing the file.
+  /// @param file_type AudioFileType variable passed-by-reference indicating the type of file being read.
+  /// @return ESP_OK if successful
+  esp_err_t start(AudioFile *audio_file, AudioFileType &file_type);
+
+  /// @brief Reads new file data from the source and sends to the ring buffer sink.
+  /// @return AudioReaderState
+  AudioReaderState read();
+
+ protected:
+  /// @brief Monitors the http client events to attempt determining the file type from the Content-Type header
+  static esp_err_t http_event_handler(esp_http_client_event_t *evt);
+
+  /// @brief Determines the audio file type from the http header's Content-Type key
+  /// @param content_type string with the Content-Type key
+  /// @return AudioFileType of the url, if it can be determined. If not, return AudioFileType::NONE.
+  static AudioFileType get_audio_type(const char *content_type);
+
+  AudioReaderState file_read_();  // read() implementation for a local file source
+  AudioReaderState http_read_();  // read() implementation for an http source
+
+  std::shared_ptr<RingBuffer> file_ring_buffer_;                     // Sink written to directly for a local file
+  std::unique_ptr<AudioSinkTransferBuffer> output_transfer_buffer_;  // Staging buffer, allocated for an http source
+  void cleanup_connection_();                                        // Closes and frees the http client, if any
+
+  size_t buffer_size_;
+  uint32_t no_data_read_count_{0};  // Consecutive http reads that returned no data
+
+  esp_http_client_handle_t client_{nullptr};
+
+  AudioFile *current_audio_file_{nullptr};
+  AudioFileType audio_file_type_{AudioFileType::NONE};  // Set by http_event_handler from the Content-Type header
+  const uint8_t *file_current_{nullptr};                // Next unread byte of the local file
+};
+}  // namespace audio
+}  // namespace esphome
+
+#endif

From b8f9eaecd85fc2b9e6b3086f030b6fe76d94c656 Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kevin.ahrendt@nabucasa.com>
Date: Mon, 3 Feb 2025 17:47:50 -0600
Subject: [PATCH 2/3] [audio] Media Player Components PR5 (#8167)

---
 esphome/components/audio/audio_decoder.cpp | 362 +++++++++++++++++++++
 esphome/components/audio/audio_decoder.h   | 135 ++++++++
 2 files changed, 497 insertions(+)
 create mode 100644 esphome/components/audio/audio_decoder.cpp
 create mode 100644 esphome/components/audio/audio_decoder.h

diff --git a/esphome/components/audio/audio_decoder.cpp b/esphome/components/audio/audio_decoder.cpp
new file mode 100644
index 0000000000..b249f1381d
--- /dev/null
+++ b/esphome/components/audio/audio_decoder.cpp
@@ -0,0 +1,362 @@
+#include "audio_decoder.h"
+
+#ifdef USE_ESP32
+
+#include "esphome/core/hal.h"
+
+namespace esphome {
+namespace audio {
+
+static const uint32_t DECODING_TIMEOUT_MS = 50;    // The decode function will yield after this duration
+static const uint32_t READ_WRITE_TIMEOUT_MS = 20;  // Timeout for transferring audio data
+
+static const uint32_t MAX_POTENTIALLY_FAILED_COUNT = 10;  // decode() gives up once the failure count exceeds this
+
+AudioDecoder::AudioDecoder(size_t input_buffer_size, size_t output_buffer_size)
+    : input_transfer_buffer_(AudioSourceTransferBuffer::create(input_buffer_size)),
+      output_transfer_buffer_(AudioSinkTransferBuffer::create(output_buffer_size)) {
+}  // A nullptr buffer is reported later: start() and the add_* methods return ESP_ERR_NO_MEM
+
+AudioDecoder::~AudioDecoder() {
+#ifdef USE_AUDIO_MP3_SUPPORT
+  if (this->audio_file_type_ == AudioFileType::MP3) {  // start() only creates the MP3 decoder for this file type
+    esp_audio_libs::helix_decoder::MP3FreeDecoder(this->mp3_decoder_);  // Raw handle, so it is released by hand
+  }
+#endif
+}
+
+esp_err_t AudioDecoder::add_source(std::weak_ptr<RingBuffer> &input_ring_buffer) {
+  if (this->input_transfer_buffer_ == nullptr) {
+    return ESP_ERR_NO_MEM;  // The input transfer buffer doesn't exist
+  }
+  this->input_transfer_buffer_->set_source(input_ring_buffer);
+  return ESP_OK;
+}
+
+esp_err_t AudioDecoder::add_sink(std::weak_ptr<RingBuffer> &output_ring_buffer) {
+  if (this->output_transfer_buffer_ == nullptr) {
+    return ESP_ERR_NO_MEM;  // The output transfer buffer doesn't exist
+  }
+  this->output_transfer_buffer_->set_sink(output_ring_buffer);
+  return ESP_OK;
+}
+
+#ifdef USE_SPEAKER
+esp_err_t AudioDecoder::add_sink(speaker::Speaker *speaker) {
+  if (this->output_transfer_buffer_ == nullptr) {
+    return ESP_ERR_NO_MEM;  // The output transfer buffer doesn't exist
+  }
+  this->output_transfer_buffer_->set_sink(speaker);
+  return ESP_OK;
+}
+#endif
+
+esp_err_t AudioDecoder::start(AudioFileType audio_file_type) {
+  if ((this->input_transfer_buffer_ == nullptr) || (this->output_transfer_buffer_ == nullptr)) {
+    return ESP_ERR_NO_MEM;
+  }
+
+  this->audio_file_type_ = audio_file_type;
+  this->potentially_failed_count_ = 0;
+  this->end_of_file_ = false;
+  switch (this->audio_file_type_) {
+#ifdef USE_AUDIO_FLAC_SUPPORT
+    case AudioFileType::FLAC:
+      this->flac_decoder_ = make_unique<esp_audio_libs::flac::FLACDecoder>();
+      this->free_buffer_required_ =
+          this->output_transfer_buffer_->capacity();  // We'll revise this after reading the header
+      break;
+#endif
+#ifdef USE_AUDIO_MP3_SUPPORT
+    case AudioFileType::MP3:
+      this->mp3_decoder_ = esp_audio_libs::helix_decoder::MP3InitDecoder();
+      if (this->mp3_decoder_ == nullptr) {
+        return ESP_ERR_NO_MEM;  // Report the failed decoder allocation now instead of on the first decode()
+      }
+      this->free_buffer_required_ = 1152 * sizeof(int16_t) * 2;  // samples * size per sample * channels
+      break;
+#endif
+    case AudioFileType::WAV:
+      this->wav_decoder_ = make_unique<esp_audio_libs::wav_decoder::WAVDecoder>();
+      this->wav_decoder_->reset();
+      this->free_buffer_required_ = 1024;
+      break;
+    case AudioFileType::NONE:
+    default:
+      return ESP_ERR_NOT_SUPPORTED;  // Includes formats whose support wasn't compiled in
+  }
+
+  return ESP_OK;
+}
+
+AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {
+  if (stop_gracefully) {
+    if (this->output_transfer_buffer_->available() == 0) {
+      if (this->end_of_file_) {
+        // The file decoder indicates it reached the end of file
+        return AudioDecoderState::FINISHED;
+      }
+
+      if (!this->input_transfer_buffer_->has_buffered_data()) {
+        // If all the internal buffers are empty, the decoding is done
+        return AudioDecoderState::FINISHED;
+      }
+    }
+  }
+  // Give up once too many decode attempts have potentially failed without a successful one in between
+  if (this->potentially_failed_count_ > MAX_POTENTIALLY_FAILED_COUNT) {
+    if (stop_gracefully) {
+      // No more new data is going to come in, so decoding is done
+      return AudioDecoderState::FINISHED;
+    }
+    return AudioDecoderState::FAILED;
+  }
+
+  FileDecoderState state = FileDecoderState::MORE_TO_PROCESS;
+  // Start time used to limit how long this call keeps decoding (see DECODING_TIMEOUT_MS)
+  uint32_t decoding_start = millis();
+
+  while (state == FileDecoderState::MORE_TO_PROCESS) {
+    // Transfer decoded audio out to the sink
+    if (!this->pause_output_) {
+      size_t bytes_written = this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+      if (this->audio_stream_info_.has_value()) {
+        this->accumulated_frames_written_ += this->audio_stream_info_.value().bytes_to_frames(bytes_written);
+        this->playback_ms_ +=
+            this->audio_stream_info_.value().frames_to_milliseconds_with_remainder(&this->accumulated_frames_written_);
+      }
+    } else {
+      // If paused, block to avoid wasting CPU resources
+      delay(READ_WRITE_TIMEOUT_MS);
+    }
+
+    // Verify there is enough space to store more decoded audio and that the function hasn't been running too long
+    if ((this->output_transfer_buffer_->free() < this->free_buffer_required_) ||
+        (millis() - decoding_start > DECODING_TIMEOUT_MS)) {
+      return AudioDecoderState::DECODING;
+    }
+
+    // Decode more audio
+
+    size_t bytes_read = this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+
+    if ((this->potentially_failed_count_ > 0) && (bytes_read == 0)) {
+      // Failed to decode in last attempt and there is no new data
+
+      if (this->input_transfer_buffer_->free() == 0) {
+        // The input buffer is full. Since it previously failed on the exact same data, we can never recover
+        state = FileDecoderState::FAILED;
+      } else {
+        // Attempt to get more data next time
+        state = FileDecoderState::IDLE;
+      }
+    } else if (this->input_transfer_buffer_->available() == 0) {
+      // No data to decode, attempt to get more data next time
+      state = FileDecoderState::IDLE;
+    } else {
+      switch (this->audio_file_type_) {
+#ifdef USE_AUDIO_FLAC_SUPPORT
+        case AudioFileType::FLAC:
+          state = this->decode_flac_();
+          break;
+#endif
+#ifdef USE_AUDIO_MP3_SUPPORT
+        case AudioFileType::MP3:
+          state = this->decode_mp3_();
+          break;
+#endif
+        case AudioFileType::WAV:
+          state = this->decode_wav_();
+          break;
+        case AudioFileType::NONE:
+        default:
+          state = FileDecoderState::IDLE;
+          break;
+      }
+    }
+
+    if (state == FileDecoderState::POTENTIALLY_FAILED) {
+      ++this->potentially_failed_count_;
+    } else if (state == FileDecoderState::END_OF_FILE) {
+      this->end_of_file_ = true;
+    } else if (state == FileDecoderState::FAILED) {
+      return AudioDecoderState::FAILED;
+    } else if (state == FileDecoderState::MORE_TO_PROCESS) {
+      this->potentially_failed_count_ = 0;  // A successful decode clears the failure count
+    }
+  }
+  return AudioDecoderState::DECODING;  // Loop ended on IDLE, POTENTIALLY_FAILED, or END_OF_FILE (no hard failure)
+}
+
+#ifdef USE_AUDIO_FLAC_SUPPORT
+FileDecoderState AudioDecoder::decode_flac_() {
+  if (!this->audio_stream_info_.has_value()) {
+    // Header hasn't been read
+    auto result = this->flac_decoder_->read_header(this->input_transfer_buffer_->get_buffer_start(),
+                                                   this->input_transfer_buffer_->available());
+
+    if (result == esp_audio_libs::flac::FLAC_DECODER_HEADER_OUT_OF_DATA) {
+      return FileDecoderState::POTENTIALLY_FAILED;  // Header is incomplete so far; retried on a later call
+    }
+
+    if (result != esp_audio_libs::flac::FLAC_DECODER_SUCCESS) {
+      // Couldn't read FLAC header
+      return FileDecoderState::FAILED;
+    }
+    // Remove the header bytes the decoder reports having processed from the input buffer
+    size_t bytes_consumed = this->flac_decoder_->get_bytes_index();
+    this->input_transfer_buffer_->decrease_buffer_length(bytes_consumed);
+    // Now that the header is known, require as much free output space as the decoder asks for
+    this->free_buffer_required_ = flac_decoder_->get_output_buffer_size_bytes();
+    if (this->output_transfer_buffer_->capacity() < this->free_buffer_required_) {
+      // Output buffer is not big enough
+      if (!this->output_transfer_buffer_->reallocate(this->free_buffer_required_)) {
+        // Couldn't reallocate output buffer
+        return FileDecoderState::FAILED;
+      }
+    }
+
+    this->audio_stream_info_ =
+        audio::AudioStreamInfo(this->flac_decoder_->get_sample_depth(), this->flac_decoder_->get_num_channels(),
+                               this->flac_decoder_->get_sample_rate());
+
+    return FileDecoderState::MORE_TO_PROCESS;
+  }
+  // Header already parsed: decode one frame directly into the free end of the output transfer buffer
+  uint32_t output_samples = 0;
+  auto result = this->flac_decoder_->decode_frame(
+      this->input_transfer_buffer_->get_buffer_start(), this->input_transfer_buffer_->available(),
+      reinterpret_cast<int16_t *>(this->output_transfer_buffer_->get_buffer_end()), &output_samples);
+
+  if (result == esp_audio_libs::flac::FLAC_DECODER_ERROR_OUT_OF_DATA) {
+    // Not an issue, just needs more data that we'll get next time.
+    return FileDecoderState::POTENTIALLY_FAILED;
+  }
+  // The processed bytes are removed from the input buffer even if the frame turns out to be bad
+  size_t bytes_consumed = this->flac_decoder_->get_bytes_index();
+  this->input_transfer_buffer_->decrease_buffer_length(bytes_consumed);
+  // NOTE(review): treats every result ordered after ERROR_OUT_OF_DATA as a frame error — confirm against the enum
+  if (result > esp_audio_libs::flac::FLAC_DECODER_ERROR_OUT_OF_DATA) {
+    // Corrupted frame, don't retry with current buffer content, wait for new sync
+    return FileDecoderState::POTENTIALLY_FAILED;
+  }
+
+  // We have successfully decoded some input data and have new output data
+  this->output_transfer_buffer_->increase_buffer_length(
+      this->audio_stream_info_.value().samples_to_bytes(output_samples));
+
+  if (result == esp_audio_libs::flac::FLAC_DECODER_NO_MORE_FRAMES) {
+    return FileDecoderState::END_OF_FILE;
+  }
+
+  return FileDecoderState::MORE_TO_PROCESS;
+}
+#endif
+
+#ifdef USE_AUDIO_MP3_SUPPORT
+FileDecoderState AudioDecoder::decode_mp3_() {
+  // Look for the next sync word
+  int buffer_length = (int) this->input_transfer_buffer_->available();
+  int32_t offset =
+      esp_audio_libs::helix_decoder::MP3FindSyncWord(this->input_transfer_buffer_->get_buffer_start(), buffer_length);
+
+  if (offset < 0) {
+    // New data may have the sync word
+    this->input_transfer_buffer_->decrease_buffer_length(buffer_length);
+    return FileDecoderState::POTENTIALLY_FAILED;
+  }
+
+  // Advance read pointer to match the offset for the syncword
+  this->input_transfer_buffer_->decrease_buffer_length(offset);
+  uint8_t *buffer_start = this->input_transfer_buffer_->get_buffer_start();
+
+  buffer_length = (int) this->input_transfer_buffer_->available();
+  int err = esp_audio_libs::helix_decoder::MP3Decode(this->mp3_decoder_, &buffer_start, &buffer_length,
+                                                     (int16_t *) this->output_transfer_buffer_->get_buffer_end(), 0);
+  // buffer_length was passed by pointer; whatever it dropped by is counted as consumed input
+  size_t consumed = this->input_transfer_buffer_->available() - buffer_length;
+  this->input_transfer_buffer_->decrease_buffer_length(consumed);
+
+  if (err) {
+    switch (err) {
+      case esp_audio_libs::helix_decoder::ERR_MP3_OUT_OF_MEMORY:
+        return FileDecoderState::FAILED;
+        break;
+      case esp_audio_libs::helix_decoder::ERR_MP3_NULL_POINTER:
+        return FileDecoderState::FAILED;
+        break;
+      default:
+        // Most errors are recoverable by moving on to the next frame, so mark as potentially failed
+        return FileDecoderState::POTENTIALLY_FAILED;
+        break;
+    }
+  } else {
+    esp_audio_libs::helix_decoder::MP3FrameInfo mp3_frame_info;
+    esp_audio_libs::helix_decoder::MP3GetLastFrameInfo(this->mp3_decoder_, &mp3_frame_info);
+    if (mp3_frame_info.outputSamps > 0) {
+      int bytes_per_sample = (mp3_frame_info.bitsPerSample / 8);
+      this->output_transfer_buffer_->increase_buffer_length(mp3_frame_info.outputSamps * bytes_per_sample);
+      // The stream format is taken from the first frame that produced output samples
+      if (!this->audio_stream_info_.has_value()) {
+        this->audio_stream_info_ =
+            audio::AudioStreamInfo(mp3_frame_info.bitsPerSample, mp3_frame_info.nChans, mp3_frame_info.samprate);
+      }
+    }
+  }
+
+  return FileDecoderState::MORE_TO_PROCESS;
+}
+#endif
+
+FileDecoderState AudioDecoder::decode_wav_() {
+  if (!this->audio_stream_info_.has_value()) {
+    // Header hasn't been processed
+
+    esp_audio_libs::wav_decoder::WAVDecoderResult result = this->wav_decoder_->decode_header(
+        this->input_transfer_buffer_->get_buffer_start(), this->input_transfer_buffer_->available());
+
+    if (result == esp_audio_libs::wav_decoder::WAV_DECODER_SUCCESS_IN_DATA) {
+      this->input_transfer_buffer_->decrease_buffer_length(this->wav_decoder_->bytes_processed());
+
+      this->audio_stream_info_ = audio::AudioStreamInfo(
+          this->wav_decoder_->bits_per_sample(), this->wav_decoder_->num_channels(), this->wav_decoder_->sample_rate());
+      // A reported chunk size of 0 is treated as an unknown length, so copying is never cut off by a byte count
+      this->wav_bytes_left_ = this->wav_decoder_->chunk_bytes_left();
+      this->wav_has_known_end_ = (this->wav_bytes_left_ > 0);
+      return FileDecoderState::MORE_TO_PROCESS;
+    } else if (result == esp_audio_libs::wav_decoder::WAV_DECODER_WARNING_INCOMPLETE_DATA) {
+      // Available data didn't have the full header
+      return FileDecoderState::POTENTIALLY_FAILED;
+    } else {
+      return FileDecoderState::FAILED;
+    }
+  } else {
+    if (!this->wav_has_known_end_ || (this->wav_bytes_left_ > 0)) {
+      size_t bytes_to_copy = this->input_transfer_buffer_->available();
+
+      if (this->wav_has_known_end_) {
+        bytes_to_copy = std::min(bytes_to_copy, this->wav_bytes_left_);
+      }
+      // Never copy more than the output transfer buffer can currently hold
+      bytes_to_copy = std::min(bytes_to_copy, this->output_transfer_buffer_->free());
+      // The audio data is copied through unchanged; no sample conversion happens here
+      if (bytes_to_copy > 0) {
+        std::memcpy(this->output_transfer_buffer_->get_buffer_end(), this->input_transfer_buffer_->get_buffer_start(),
+                    bytes_to_copy);
+        this->input_transfer_buffer_->decrease_buffer_length(bytes_to_copy);
+        this->output_transfer_buffer_->increase_buffer_length(bytes_to_copy);
+        if (this->wav_has_known_end_) {
+          this->wav_bytes_left_ -= bytes_to_copy;
+        }
+      }
+      return FileDecoderState::IDLE;  // Ends decode()'s loop for this call
+    }
+  }
+
+  return FileDecoderState::END_OF_FILE;  // Reached only when the length is known and every byte has been copied
+}
+
+}  // namespace audio
+}  // namespace esphome
+
+#endif
diff --git a/esphome/components/audio/audio_decoder.h b/esphome/components/audio/audio_decoder.h
new file mode 100644
index 0000000000..2ca1d623fe
--- /dev/null
+++ b/esphome/components/audio/audio_decoder.h
@@ -0,0 +1,135 @@
+#pragma once
+
+#ifdef USE_ESP32
+
+#include "audio.h"
+#include "audio_transfer_buffer.h"
+
+#include "esphome/core/defines.h"
+#include "esphome/core/helpers.h"
+#include "esphome/core/ring_buffer.h"
+
+#ifdef USE_SPEAKER
+#include "esphome/components/speaker/speaker.h"
+#endif
+
+#include "esp_err.h"
+
+// esp-audio-libs
+#ifdef USE_AUDIO_FLAC_SUPPORT
+#include <flac_decoder.h>
+#endif
+#ifdef USE_AUDIO_MP3_SUPPORT
+#include <mp3_decoder.h>
+#endif
+#include <wav_decoder.h>
+
+namespace esphome {
+namespace audio {
+
+enum class AudioDecoderState : uint8_t {
+  DECODING = 0,  // Decoding isn't finished; more data may still be decoded and transferred
+  FINISHED,      // All file data has been decoded and transferred
+  FAILED,        // Encountered an error
+};
+
+// Only used within the AudioDecoder class; conveys the state of the particular file type decoder
+enum class FileDecoderState : uint8_t {
+  MORE_TO_PROCESS,     // Successfully read a file chunk and more data is available to decode
+  IDLE,                // Not enough data to decode, waiting for more to be transferred
+  POTENTIALLY_FAILED,  // Decoder encountered a potentially recoverable error if more file data is available
+  FAILED,              // Decoder encountered an unrecoverable error
+  END_OF_FILE,         // The specific file decoder knows it's the end of the file
+};
+
+class AudioDecoder {
+  /*
+   * @brief Class that facilitates decoding an audio file.
+   * The audio file is read from a ring buffer source, decoded, and sent to an audio sink (ring buffer or speaker
+   * component).
+   * Supports wav, flac, and mp3 formats.
+   */
+ public:
+  /// @brief Allocates the input and output transfer buffers
+  /// @param input_buffer_size Size of the input transfer buffer in bytes.
+  /// @param output_buffer_size Size of the output transfer buffer in bytes.
+  AudioDecoder(size_t input_buffer_size, size_t output_buffer_size);
+
+  /// @brief Deallocates the MP3 decoder (the flac and wav decoders are deallocated automatically)
+  ~AudioDecoder();
+
+  /// @brief Adds a source ring buffer for raw file data. Takes ownership of the ring buffer in a shared_ptr.
+  /// @param input_ring_buffer weak_ptr of a shared_ptr of the source ring buffer to transfer ownership
+  /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
+  esp_err_t add_source(std::weak_ptr<RingBuffer> &input_ring_buffer);
+
+  /// @brief Adds a sink ring buffer for decoded audio. Takes ownership of the ring buffer in a shared_ptr.
+  /// @param output_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership
+  /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
+  esp_err_t add_sink(std::weak_ptr<RingBuffer> &output_ring_buffer);
+
+#ifdef USE_SPEAKER
+  /// @brief Adds a sink speaker for decoded audio.
+  /// @param speaker pointer to speaker component
+  /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
+  esp_err_t add_sink(speaker::Speaker *speaker);
+#endif
+
+  /// @brief Sets up decoding the file
+  /// @param audio_file_type AudioFileType of the file
+  /// @return ESP_OK if successful, ESP_ERR_NO_MEM if an allocation failed, or ESP_ERR_NOT_SUPPORTED if
+  /// the format isn't supported.
+  esp_err_t start(AudioFileType audio_file_type);
+
+  /// @brief Decodes audio from the ring buffer source and writes to the sink.
+  /// @param stop_gracefully If true, it indicates the file source is finished. The decoder will decode all the
+  /// remaining data and then finish.
+  /// @return AudioDecoderState
+  AudioDecoderState decode(bool stop_gracefully);
+
+  /// @brief Gets the audio stream information, if it has been decoded from the file's header
+  /// @return optional<AudioStreamInfo> with the audio information. If not available yet, returns no value.
+  const optional<audio::AudioStreamInfo> &get_audio_stream_info() const { return this->audio_stream_info_; }
+
+  /// @brief Returns the duration of audio (in milliseconds) decoded and sent to the sink
+  /// @return Duration of decoded audio in milliseconds
+  uint32_t get_playback_ms() const { return this->playback_ms_; }
+
+  /// @brief Pauses sending decoded audio to the sink. If paused, it will continue to process internal buffers.
+  /// @param pause_state If true, audio data is not sent to the sink.
+  void set_pause_output_state(bool pause_state) { this->pause_output_ = pause_state; }
+
+ protected:
+  std::unique_ptr<esp_audio_libs::wav_decoder::WAVDecoder> wav_decoder_;
+#ifdef USE_AUDIO_FLAC_SUPPORT
+  FileDecoderState decode_flac_();
+  std::unique_ptr<esp_audio_libs::flac::FLACDecoder> flac_decoder_;
+#endif
+#ifdef USE_AUDIO_MP3_SUPPORT
+  FileDecoderState decode_mp3_();
+  esp_audio_libs::helix_decoder::HMP3Decoder mp3_decoder_{nullptr};  // Raw handle; freed in the destructor
+#endif
+  FileDecoderState decode_wav_();
+
+  std::unique_ptr<AudioSourceTransferBuffer> input_transfer_buffer_;
+  std::unique_ptr<AudioSinkTransferBuffer> output_transfer_buffer_;
+
+  AudioFileType audio_file_type_{AudioFileType::NONE};
+  optional<AudioStreamInfo> audio_stream_info_{};  // Empty until a file decoder has parsed the stream format
+
+  size_t free_buffer_required_{0};  // Free output space decode() requires before decoding more
+  size_t wav_bytes_left_{0};
+
+  uint32_t potentially_failed_count_{0};  // POTENTIALLY_FAILED results since the last successful decode
+  bool end_of_file_{false};
+  bool wav_has_known_end_{false};
+
+  bool pause_output_{false};
+
+  uint32_t accumulated_frames_written_{0};
+  uint32_t playback_ms_{0};
+};
+}  // namespace audio
+}  // namespace esphome
+
+#endif

From 6b55df36c7e21997d69c4688ee1627e545596087 Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kevin.ahrendt@nabucasa.com>
Date: Mon, 3 Feb 2025 20:58:35 -0600
Subject: [PATCH 3/3] [audio] Media Player Components PR6 (#8168)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
---
 esphome/components/audio/audio_resampler.cpp | 159 +++++++++++++++++++
 esphome/components/audio/audio_resampler.h   | 100 ++++++++++++
 2 files changed, 259 insertions(+)
 create mode 100644 esphome/components/audio/audio_resampler.cpp
 create mode 100644 esphome/components/audio/audio_resampler.h

diff --git a/esphome/components/audio/audio_resampler.cpp b/esphome/components/audio/audio_resampler.cpp
new file mode 100644
index 0000000000..05e9ff6ca1
--- /dev/null
+++ b/esphome/components/audio/audio_resampler.cpp
@@ -0,0 +1,159 @@
+#include "audio_resampler.h"
+
+#ifdef USE_ESP32
+
+#include "esphome/core/hal.h"
+
+namespace esphome {
+namespace audio {
+
+static const uint32_t READ_WRITE_TIMEOUT_MS = 20;  // Timeout for sink/source transfers and the delay while paused
+
+AudioResampler::AudioResampler(size_t input_buffer_size, size_t output_buffer_size)
+    : input_transfer_buffer_(AudioSourceTransferBuffer::create(input_buffer_size)),
+      output_transfer_buffer_(AudioSinkTransferBuffer::create(output_buffer_size)),
+      input_buffer_size_(input_buffer_size),
+      output_buffer_size_(output_buffer_size) {}  // Allocation results are checked later, e.g. by start()
+
+esp_err_t AudioResampler::add_source(std::weak_ptr<RingBuffer> &input_ring_buffer) {
+  if (this->input_transfer_buffer_ == nullptr) {
+    return ESP_ERR_NO_MEM;  // The input transfer buffer is missing, so there is nothing to attach the source to
+  }
+  this->input_transfer_buffer_->set_source(input_ring_buffer);
+  return ESP_OK;
+}
+
+esp_err_t AudioResampler::add_sink(std::weak_ptr<RingBuffer> &output_ring_buffer) {
+  if (this->output_transfer_buffer_ == nullptr) {
+    return ESP_ERR_NO_MEM;  // The output transfer buffer is missing, so there is nothing to attach the sink to
+  }
+  this->output_transfer_buffer_->set_sink(output_ring_buffer);
+  return ESP_OK;
+}
+
+#ifdef USE_SPEAKER
+esp_err_t AudioResampler::add_sink(speaker::Speaker *speaker) {
+  if (this->output_transfer_buffer_ == nullptr) {
+    return ESP_ERR_NO_MEM;  // The output transfer buffer is missing, so there is nothing to attach the speaker to
+  }
+  this->output_transfer_buffer_->set_sink(speaker);
+  return ESP_OK;
+}
+#endif
+
+esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info,
+                                uint16_t number_of_taps, uint16_t number_of_filters) {
+  this->input_stream_info_ = input_stream_info;
+  this->output_stream_info_ = output_stream_info;
+
+  if (!this->input_transfer_buffer_ || !this->output_transfer_buffer_) {
+    return ESP_ERR_NO_MEM;
+  }
+
+  const bool unsupported_bit_depth =
+      (input_stream_info.get_bits_per_sample() > 32) || (output_stream_info.get_bits_per_sample() > 32);
+  if (unsupported_bit_depth || (this->input_stream_info_.get_channels() != output_stream_info.get_channels())) {
+    return ESP_ERR_NOT_SUPPORTED;  // Only up to 32 bits per sample and equal channel counts are handled
+  }
+
+  const bool same_sample_rate = input_stream_info.get_sample_rate() == output_stream_info.get_sample_rate();
+  const bool same_bit_depth = input_stream_info.get_bits_per_sample() == output_stream_info.get_bits_per_sample();
+  if (same_sample_rate && same_bit_depth) {
+    return ESP_OK;  // Nothing to convert, so no resampler instance is created
+  }
+
+  this->resampler_ = make_unique<esp_audio_libs::resampler::Resampler>(
+      input_stream_info.bytes_to_samples(this->input_buffer_size_),
+      output_stream_info.bytes_to_samples(this->output_buffer_size_));
+
+  // Downsampling enables the cascaded biquad filters to avoid aliasing
+  const bool downsampling = output_stream_info.get_sample_rate() < input_stream_info.get_sample_rate();
+
+  esp_audio_libs::resampler::ResamplerConfiguration resample_config = {
+      .source_sample_rate = static_cast<float>(input_stream_info.get_sample_rate()),
+      .target_sample_rate = static_cast<float>(output_stream_info.get_sample_rate()),
+      .source_bits_per_sample = input_stream_info.get_bits_per_sample(),
+      .target_bits_per_sample = output_stream_info.get_bits_per_sample(),
+      .channels = this->input_stream_info_.get_channels(),
+      .use_pre_or_post_filter = downsampling,
+      .subsample_interpolate = false,  // Would double the CPU load; more filters is the better trade-off
+      .number_of_taps = number_of_taps,
+      .number_of_filters = number_of_filters,
+  };
+
+  // initialize() fails when the resampler's internal buffers can't be allocated
+  return this->resampler_->initialize(resample_config) ? ESP_OK : ESP_ERR_NO_MEM;
+}
+
+AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_differential) {
+  // Bind the out-parameter once: tolerate nullptr and report 0 on every path that converts no audio
+  int32_t discarded_differential = 0;
+  int32_t &differential_ms = (ms_differential != nullptr) ? *ms_differential : discarded_differential;
+  differential_ms = 0;
+
+  if (stop_gracefully && !this->input_transfer_buffer_->has_buffered_data() &&
+      (this->output_transfer_buffer_->available() == 0)) {
+    return AudioResamplerState::FINISHED;
+  }
+
+  if (!this->pause_output_) {
+    // Move audio data to the sink
+    this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+  } else {
+    // If paused, block to avoid wasting CPU resources
+    delay(READ_WRITE_TIMEOUT_MS);
+  }
+
+  this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+
+  if (this->input_transfer_buffer_->available() == 0) {
+    // No samples available to process
+    return AudioResamplerState::RESAMPLING;
+  }
+
+  const size_t bytes_free = this->output_transfer_buffer_->free();
+  const uint32_t frames_free = this->output_stream_info_.bytes_to_frames(bytes_free);
+
+  const size_t bytes_available = this->input_transfer_buffer_->available();
+  const uint32_t frames_available = this->input_stream_info_.bytes_to_frames(bytes_available);
+
+  if ((this->input_stream_info_.get_sample_rate() != this->output_stream_info_.get_sample_rate()) ||
+      (this->input_stream_info_.get_bits_per_sample() != this->output_stream_info_.get_bits_per_sample())) {
+    esp_audio_libs::resampler::ResamplerResults results =
+        this->resampler_->resample(this->input_transfer_buffer_->get_buffer_start(),
+                                   this->output_transfer_buffer_->get_buffer_end(), frames_available, frames_free, -3);
+
+    this->input_transfer_buffer_->decrease_buffer_length(this->input_stream_info_.frames_to_bytes(results.frames_used));
+    this->output_transfer_buffer_->increase_buffer_length(
+        this->output_stream_info_.frames_to_bytes(results.frames_generated));
+
+    // Resampling causes slight differences in the durations used versus generated. Computes the difference in
+    // milliseconds. The callback function passing the played audio duration uses the difference to convert from output
+    // duration to input duration.
+    this->accumulated_frames_used_ += results.frames_used;
+    this->accumulated_frames_generated_ += results.frames_generated;
+
+    const int32_t used_ms =
+        this->input_stream_info_.frames_to_milliseconds_with_remainder(&this->accumulated_frames_used_);
+    const int32_t generated_ms =
+        this->output_stream_info_.frames_to_milliseconds_with_remainder(&this->accumulated_frames_generated_);
+    differential_ms = used_ms - generated_ms;
+  } else {
+    // No resampling required, copy samples directly to the output transfer buffer (the differential stays 0)
+    const size_t bytes_to_transfer = std::min(this->output_stream_info_.frames_to_bytes(frames_free),
+                                              this->input_stream_info_.frames_to_bytes(frames_available));
+
+    std::memcpy((void *) this->output_transfer_buffer_->get_buffer_end(),
+                (void *) this->input_transfer_buffer_->get_buffer_start(), bytes_to_transfer);
+
+    this->input_transfer_buffer_->decrease_buffer_length(bytes_to_transfer);
+    this->output_transfer_buffer_->increase_buffer_length(bytes_to_transfer);
+  }
+
+  return AudioResamplerState::RESAMPLING;
+}
+
+}  // namespace audio
+}  // namespace esphome
+
+#endif
diff --git a/esphome/components/audio/audio_resampler.h b/esphome/components/audio/audio_resampler.h
new file mode 100644
index 0000000000..a348aaf783
--- /dev/null
+++ b/esphome/components/audio/audio_resampler.h
@@ -0,0 +1,100 @@
+#pragma once
+
+#ifdef USE_ESP32
+
+#include "audio.h"
+#include "audio_transfer_buffer.h"
+
+#ifdef USE_SPEAKER
+#include "esphome/components/speaker/speaker.h"
+#endif
+
+#include "esphome/core/ring_buffer.h"
+
+#include "esp_err.h"
+
+#include <resampler.h>  // esp-audio-libs
+
+namespace esphome {
+namespace audio {
+
+enum class AudioResamplerState : uint8_t {
+  RESAMPLING,  // Not finished yet; resample() also returns this when no input samples were available
+  FINISHED,    // Stopping gracefully: no buffered input remains and the output transfer buffer is empty
+  FAILED,      // Unused state included for consistency among Audio classes
+};
+
+class AudioResampler {
+  /*
+   * @brief Class that facilitates resampling audio.
+   * The audio data is read from a ring buffer source, resampled, and sent to an audio sink (ring buffer or speaker
+   * component). Also supports converting bits per sample.
+   */
+ public:
+  /// @brief Allocates the input and output transfer buffers; a failed allocation is reported later as ESP_ERR_NO_MEM.
+  /// @param input_buffer_size Size of the input transfer buffer in bytes.
+  /// @param output_buffer_size Size of the output transfer buffer in bytes.
+  AudioResampler(size_t input_buffer_size, size_t output_buffer_size);
+
+  /// @brief Adds a source ring buffer for audio data. Takes ownership of the ring buffer in a shared_ptr.
+  /// @param input_ring_buffer weak_ptr of a shared_ptr of the source ring buffer to transfer ownership
+  /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
+  esp_err_t add_source(std::weak_ptr<RingBuffer> &input_ring_buffer);
+
+  /// @brief Adds a sink ring buffer for resampled audio. Takes ownership of the ring buffer in a shared_ptr.
+  /// @param output_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership
+  /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
+  esp_err_t add_sink(std::weak_ptr<RingBuffer> &output_ring_buffer);
+
+#ifdef USE_SPEAKER
+  /// @brief Adds a sink speaker for resampled audio.
+  /// @param speaker pointer to speaker component
+  /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
+  esp_err_t add_sink(speaker::Speaker *speaker);
+#endif
+
+  /// @brief Sets up the class to resample.
+  /// @param input_stream_info The incoming sample rate, bits per sample, and number of channels
+  /// @param output_stream_info The desired outgoing sample rate, bits per sample, and number of channels
+  /// @param number_of_taps Number of taps per FIR filter
+  /// @param number_of_filters Number of FIR filters
+  /// @return ESP_OK if it is able to convert the incoming stream,
+  ///         ESP_ERR_NO_MEM if the transfer buffers or the resampler's internal buffers failed to allocate,
+  ///         ESP_ERR_NOT_SUPPORTED if a stream exceeds 32 bits per sample or the channel counts differ.
+  esp_err_t start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info, uint16_t number_of_taps,
+                  uint16_t number_of_filters);
+
+  /// @brief Resamples audio from the ring buffer source and writes to the sink.
+  /// @param stop_gracefully If true, it indicates the file decoder is finished. The resampler will resample all the
+  ///                        remaining audio and then finish.
+  /// @param ms_differential Pointer to a (int32_t) variable that will store the difference, in milliseconds, between
+  ///                        the duration of input audio used and the duration of output audio generated.
+  /// @return AudioResamplerState
+  AudioResamplerState resample(bool stop_gracefully, int32_t *ms_differential);
+
+  /// @brief Pauses sending resampled audio to the sink. If paused, it will continue to process internal buffers.
+  /// @param pause_state If true, audio data is not sent to the sink.
+  void set_pause_output_state(bool pause_state) { this->pause_output_ = pause_state; }
+
+ protected:
+  std::unique_ptr<AudioSourceTransferBuffer> input_transfer_buffer_;  // nullptr if allocation failed
+  std::unique_ptr<AudioSinkTransferBuffer> output_transfer_buffer_;   // nullptr if allocation failed
+
+  size_t input_buffer_size_;   // Requested input transfer buffer size in bytes
+  size_t output_buffer_size_;  // Requested output transfer buffer size in bytes
+
+  uint32_t accumulated_frames_used_{0};       // Input frames consumed; feeds the milliseconds differential
+  uint32_t accumulated_frames_generated_{0};  // Output frames produced; feeds the milliseconds differential
+
+  bool pause_output_{false};
+
+  AudioStreamInfo input_stream_info_;   // Copied from the arguments of start()
+  AudioStreamInfo output_stream_info_;  // Copied from the arguments of start()
+
+  std::unique_ptr<esp_audio_libs::resampler::Resampler> resampler_;  // Only set when start() sets up a conversion
+};
+
+}  // namespace audio
+}  // namespace esphome
+
+#endif