mirror of https://github.com/esphome/esphome.git synced 2025-03-15 15:18:16 +00:00

Merge branch 'dev' into vornado-ir

This commit is contained in:
Jordan Zucker 2025-02-04 16:56:37 -08:00
commit d9b8a7bc17
1210 changed files with 12002 additions and 20523 deletions

View File

@@ -37,7 +37,7 @@ jobs:
strategy:
fail-fast: false
matrix:
arch: [amd64, armv7, aarch64]
arch: [amd64, aarch64]
build_type: ["ha-addon", "docker", "lint"]
steps:
- uses: actions/checkout@v4.1.7

View File

@@ -80,7 +80,6 @@ jobs:
matrix:
platform:
- linux/amd64
- linux/arm/v7
- linux/arm64
steps:
- uses: actions/checkout@v4.1.7

View File

@@ -277,6 +277,7 @@ esphome/components/mics_4514/* @jesserockz
esphome/components/midea/* @dudanov
esphome/components/midea_ir/* @dudanov
esphome/components/mitsubishi/* @RubyBailey
esphome/components/mixer/speaker/* @kahrendt
esphome/components/mlx90393/* @functionpointer
esphome/components/mlx90614/* @jesserockz
esphome/components/mmc5603/* @benhoff
@@ -343,6 +344,7 @@ esphome/components/radon_eye_rd200/* @jeffeb3
esphome/components/rc522/* @glmnet
esphome/components/rc522_i2c/* @glmnet
esphome/components/rc522_spi/* @glmnet
esphome/components/resampler/speaker/* @kahrendt
esphome/components/restart/* @esphome/core
esphome/components/rf_bridge/* @jesserockz
esphome/components/rgbct/* @jesserockz
@@ -499,5 +501,6 @@ esphome/components/xiaomi_mhoc401/* @vevsvevs
esphome/components/xiaomi_rtcgq02lm/* @jesserockz
esphome/components/xl9535/* @mreditor97
esphome/components/xpt2046/touchscreen/* @nielsnl68 @numo68
esphome/components/xxtea/* @clydebarrow
esphome/components/zhlt01/* @cfeenstra1024
esphome/components/zio_ultrasonic/* @kahrendt

View File

@@ -51,19 +51,7 @@ ENV \
# Store globally installed pio libs in /piolibs
PLATFORMIO_GLOBALLIB_DIR=/piolibs
# Support legacy binaries on Debian multiarch system. There is no "correct" way
# to do this, other than using properly built toolchains...
# See: https://unix.stackexchange.com/questions/553743/correct-way-to-add-lib-ld-linux-so-3-in-debian
RUN \
if [ "$TARGETARCH$TARGETVARIANT" = "armv7" ]; then \
ln -s /lib/arm-linux-gnueabihf/ld-linux-armhf.so.3 /lib/ld-linux.so.3; \
fi
RUN \
# Ubuntu python3-pip is missing wheel
if [ "$TARGETARCH$TARGETVARIANT" = "armv7" ]; then \
export PIP_EXTRA_INDEX_URL="https://www.piwheels.org/simple"; \
fi; \
pip3 install \
--break-system-packages --no-cache-dir \
# Keep platformio version in sync with requirements.txt
@@ -82,14 +70,6 @@ RUN --mount=type=tmpfs,target=/root/.cargo <<END-OF-RUN
# Fail on any non-zero status
set -e
if [ "$TARGETARCH$TARGETVARIANT" = "armv7" ]
then
curl -L https://www.piwheels.org/cp311/cryptography-43.0.0-cp37-abi3-linux_armv7l.whl -o /tmp/cryptography-43.0.0-cp37-abi3-linux_armv7l.whl
pip3 install --break-system-packages --no-cache-dir /tmp/cryptography-43.0.0-cp37-abi3-linux_armv7l.whl
rm /tmp/cryptography-43.0.0-cp37-abi3-linux_armv7l.whl
export PIP_EXTRA_INDEX_URL="https://www.piwheels.org/simple";
fi
# install build tools in case wheels are not available
BUILD_DEPS="
build-essential=12.9
@@ -106,7 +86,7 @@ LIB_DEPS="
libtiff6=4.5.0-6+deb12u1
libopenjp2-7=2.5.0-2
"
if [ "$TARGETARCH$TARGETVARIANT" = "arm64" ] || [ "$TARGETARCH$TARGETVARIANT" = "armv7" ]
if [ "$TARGETARCH$TARGETVARIANT" = "arm64" ]
then
apt-get update
apt-get install -y --no-install-recommends $BUILD_DEPS $LIB_DEPS
@@ -115,7 +95,7 @@ fi
CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse CARGO_HOME=/root/.cargo
pip3 install --break-system-packages --no-cache-dir -r /requirements.txt -r /requirements_optional.txt
if [ "$TARGETARCH$TARGETVARIANT" = "arm64" ] || [ "$TARGETARCH$TARGETVARIANT" = "armv7" ]
if [ "$TARGETARCH$TARGETVARIANT" = "arm64" ]
then
apt-get remove -y --purge --auto-remove $BUILD_DEPS
rm -rf /tmp/* /var/{cache,log}/* /var/lib/apt/lists/*
@@ -135,11 +115,7 @@ FROM base AS docker
# Copy esphome and install
COPY . /esphome
RUN if [ "$TARGETARCH$TARGETVARIANT" = "armv7" ]; then \
export PIP_EXTRA_INDEX_URL="https://www.piwheels.org/simple"; \
fi; \
pip3 install \
--break-system-packages --no-cache-dir -e /esphome
RUN pip3 install --break-system-packages --no-cache-dir -e /esphome
# Settings for dashboard
ENV USERNAME="" PASSWORD=""
@@ -197,11 +173,7 @@ COPY docker/ha-addon-rootfs/ /
# Copy esphome and install
COPY . /esphome
RUN if [ "$TARGETARCH$TARGETVARIANT" = "armv7" ]; then \
export PIP_EXTRA_INDEX_URL="https://www.piwheels.org/simple"; \
fi; \
pip3 install \
--break-system-packages --no-cache-dir -e /esphome
RUN pip3 install --break-system-packages --no-cache-dir -e /esphome
# Labels
LABEL \
@@ -232,21 +204,14 @@ RUN \
nano=7.2-1+deb12u1 \
build-essential=12.9 \
python3-dev=3.11.2-1+b1 \
&& if [ "$TARGETARCH$TARGETVARIANT" != "armv7" ]; then \
# move this up after armv7 is retired
apt-get install -y --no-install-recommends clang-tidy-18=1:18.1.8~++20240731024826+3b5b5c1ec4a3-1~exp1~20240731144843.145 ; \
fi; \
rm -rf \
clang-tidy-18=1:18.1.8~++20240731024826+3b5b5c1ec4a3-1~exp1~20240731144843.145 \
&& rm -rf \
/tmp/* \
/var/{cache,log}/* \
/var/lib/apt/lists/*
COPY requirements_test.txt /
RUN if [ "$TARGETARCH$TARGETVARIANT" = "armv7" ]; then \
export PIP_EXTRA_INDEX_URL="https://www.piwheels.org/simple"; \
fi; \
pip3 install \
--break-system-packages --no-cache-dir -r /requirements_test.txt
RUN pip3 install --break-system-packages --no-cache-dir -r /requirements_test.txt
VOLUME ["/esphome"]
WORKDIR /esphome

View File

@@ -1,22 +1,19 @@
#!/usr/bin/env python3
from dataclasses import dataclass
import subprocess
import argparse
from platform import machine
import shlex
from dataclasses import dataclass
import re
import shlex
import subprocess
import sys
CHANNEL_DEV = "dev"
CHANNEL_BETA = "beta"
CHANNEL_RELEASE = "release"
CHANNELS = [CHANNEL_DEV, CHANNEL_BETA, CHANNEL_RELEASE]
ARCH_AMD64 = "amd64"
ARCH_ARMV7 = "armv7"
ARCH_AARCH64 = "aarch64"
ARCHS = [ARCH_AMD64, ARCH_ARMV7, ARCH_AARCH64]
ARCHS = [ARCH_AMD64, ARCH_AARCH64]
TYPE_DOCKER = "docker"
TYPE_HA_ADDON = "ha-addon"
@@ -76,7 +73,6 @@ class DockerParams:
}[build_type]
platform = {
ARCH_AMD64: "linux/amd64",
ARCH_ARMV7: "linux/arm/v7",
ARCH_AARCH64: "linux/arm64",
}[arch]
target = {

View File

@@ -1,9 +1,121 @@
import esphome.codegen as cg
import esphome.config_validation as cv
from esphome.const import CONF_BITS_PER_SAMPLE, CONF_NUM_CHANNELS, CONF_SAMPLE_RATE
import esphome.final_validate as fv
CODEOWNERS = ["@kahrendt"]
audio_ns = cg.esphome_ns.namespace("audio")
AudioFile = audio_ns.struct("AudioFile")
AudioFileType = audio_ns.enum("AudioFileType", is_class=True)
AUDIO_FILE_TYPE_ENUM = {
"NONE": AudioFileType.NONE,
"WAV": AudioFileType.WAV,
"MP3": AudioFileType.MP3,
"FLAC": AudioFileType.FLAC,
}
CONF_MIN_BITS_PER_SAMPLE = "min_bits_per_sample"
CONF_MAX_BITS_PER_SAMPLE = "max_bits_per_sample"
CONF_MIN_CHANNELS = "min_channels"
CONF_MAX_CHANNELS = "max_channels"
CONF_MIN_SAMPLE_RATE = "min_sample_rate"
CONF_MAX_SAMPLE_RATE = "max_sample_rate"
CONFIG_SCHEMA = cv.All(
cv.Schema({}),
)
AUDIO_COMPONENT_SCHEMA = cv.Schema(
{
cv.Optional(CONF_BITS_PER_SAMPLE): cv.int_range(8, 32),
cv.Optional(CONF_NUM_CHANNELS): cv.int_range(1, 2),
cv.Optional(CONF_SAMPLE_RATE): cv.int_range(8000, 48000),
}
)
_UNDEF = object()
def set_stream_limits(
min_bits_per_sample: int = _UNDEF,
max_bits_per_sample: int = _UNDEF,
min_channels: int = _UNDEF,
max_channels: int = _UNDEF,
min_sample_rate: int = _UNDEF,
max_sample_rate: int = _UNDEF,
):
def set_limits_in_config(config):
if min_bits_per_sample is not _UNDEF:
config[CONF_MIN_BITS_PER_SAMPLE] = min_bits_per_sample
if max_bits_per_sample is not _UNDEF:
config[CONF_MAX_BITS_PER_SAMPLE] = max_bits_per_sample
if min_channels is not _UNDEF:
config[CONF_MIN_CHANNELS] = min_channels
if max_channels is not _UNDEF:
config[CONF_MAX_CHANNELS] = max_channels
if min_sample_rate is not _UNDEF:
config[CONF_MIN_SAMPLE_RATE] = min_sample_rate
if max_sample_rate is not _UNDEF:
config[CONF_MAX_SAMPLE_RATE] = max_sample_rate
return set_limits_in_config
def final_validate_audio_schema(
name: str,
*,
audio_device: str,
bits_per_sample: int,
channels: int,
sample_rate: int,
):
def validate_audio_compatiblity(audio_config):
audio_schema = {}
try:
cv.int_range(
min=audio_config.get(CONF_MIN_BITS_PER_SAMPLE),
max=audio_config.get(CONF_MAX_BITS_PER_SAMPLE),
)(bits_per_sample)
except cv.Invalid as exc:
raise cv.Invalid(
f"Invalid configuration for the {name} component. The {CONF_BITS_PER_SAMPLE} {str(exc)}"
) from exc
try:
cv.int_range(
min=audio_config.get(CONF_MIN_CHANNELS),
max=audio_config.get(CONF_MAX_CHANNELS),
)(channels)
except cv.Invalid as exc:
raise cv.Invalid(
f"Invalid configuration for the {name} component. The {CONF_NUM_CHANNELS} {str(exc)}"
) from exc
try:
cv.int_range(
min=audio_config.get(CONF_MIN_SAMPLE_RATE),
max=audio_config.get(CONF_MAX_SAMPLE_RATE),
)(sample_rate)
return cv.Schema(audio_schema, extra=cv.ALLOW_EXTRA)(audio_config)
except cv.Invalid as exc:
raise cv.Invalid(
f"Invalid configuration for the {name} component. The {CONF_SAMPLE_RATE} {str(exc)}"
) from exc
return cv.Schema(
{
cv.Required(audio_device): fv.id_declaration_match_schema(
validate_audio_compatiblity
)
},
extra=cv.ALLOW_EXTRA,
)
async def to_code(config):
cg.add_library("esphome/esp-audio-libs", "1.1.1")

View File

@@ -0,0 +1,67 @@
#include "audio.h"
namespace esphome {
namespace audio {
// Euclid's algorithm for finding the greatest common divisor
static uint32_t gcd(uint32_t a, uint32_t b) {
while (b != 0) {
uint32_t t = b;
b = a % b;
a = t;
}
return a;
}
AudioStreamInfo::AudioStreamInfo(uint8_t bits_per_sample, uint8_t channels, uint32_t sample_rate)
: bits_per_sample_(bits_per_sample), channels_(channels), sample_rate_(sample_rate) {
this->ms_sample_rate_gcd_ = gcd(1000, this->sample_rate_);
this->bytes_per_sample_ = (this->bits_per_sample_ + 7) / 8;
}
uint32_t AudioStreamInfo::frames_to_microseconds(uint32_t frames) const {
return (frames * 1000000 + (this->sample_rate_ >> 1)) / this->sample_rate_;
}
uint32_t AudioStreamInfo::frames_to_milliseconds_with_remainder(uint32_t *total_frames) const {
uint32_t unprocessable_frames = *total_frames % (this->sample_rate_ / this->ms_sample_rate_gcd_);
uint32_t frames_for_ms_calculation = *total_frames - unprocessable_frames;
uint32_t playback_ms = (frames_for_ms_calculation * 1000) / this->sample_rate_;
*total_frames = unprocessable_frames;
return playback_ms;
}
bool AudioStreamInfo::operator==(const AudioStreamInfo &rhs) const {
return (this->bits_per_sample_ == rhs.get_bits_per_sample()) && (this->channels_ == rhs.get_channels()) &&
(this->sample_rate_ == rhs.get_sample_rate());
}
const char *audio_file_type_to_string(AudioFileType file_type) {
switch (file_type) {
#ifdef USE_AUDIO_FLAC_SUPPORT
case AudioFileType::FLAC:
return "FLAC";
#endif
#ifdef USE_AUDIO_MP3_SUPPORT
case AudioFileType::MP3:
return "MP3";
#endif
case AudioFileType::WAV:
return "WAV";
default:
return "unknown";
}
}
void scale_audio_samples(const int16_t *audio_samples, int16_t *output_buffer, int16_t scale_factor,
size_t samples_to_scale) {
// Note the assembly dsps_mulc function has audio glitches if the input and output buffers are the same.
for (int i = 0; i < samples_to_scale; i++) {
int32_t acc = (int32_t) audio_samples[i] * (int32_t) scale_factor;
output_buffer[i] = (int16_t) (acc >> 15);
}
}
} // namespace audio
} // namespace esphome
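For reference, a minimal standalone sketch (not part of this commit) of the Q15 fixed-point arithmetic that scale_audio_samples applies; the sample values and scale factor below are arbitrary:

#include <cstdint>
#include <cstdio>

// Q15: scale_factor = 16384 represents 16384 / 32768 = 0.5.
// Each sample is widened to 32 bits, multiplied, then shifted right by 15 to drop the fractional bits,
// mirroring the loop in scale_audio_samples above.
int main() {
  const int16_t samples[4] = {1000, -2000, 32767, -32768};
  const int16_t scale_factor = 16384;  // 0.5 in Q15
  for (int i = 0; i < 4; i++) {
    int32_t acc = (int32_t) samples[i] * (int32_t) scale_factor;
    int16_t scaled = (int16_t) (acc >> 15);
    printf("%d -> %d\n", samples[i], scaled);  // 1000 -> 500, -2000 -> -1000, 32767 -> 16383, -32768 -> -16384
  }
  return 0;
}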

View File

@@ -1,21 +1,139 @@
#pragma once
#include "esphome/core/defines.h"
#include <cstddef>
#include <cstdint>
namespace esphome {
namespace audio {
struct AudioStreamInfo {
bool operator==(const AudioStreamInfo &rhs) const {
return (channels == rhs.channels) && (bits_per_sample == rhs.bits_per_sample) && (sample_rate == rhs.sample_rate);
class AudioStreamInfo {
/* Class to represent important parameters of the audio stream that also provides helper functions to convert between
 * various audio-related units.
 *
 * - An audio sample represents a unit of audio for one channel.
 * - A frame represents a unit of audio with a sample for every channel.
 *
 * In general, converting between bytes, samples, and frames shouldn't result in rounding errors so long as frames
 * are used as the main unit when transferring audio data. Durations may result in rounding for certain sample rates;
 * e.g., 44.1 kHz. The ``frames_to_milliseconds_with_remainder`` function should be used for accuracy, as it takes
 * into account the remainder rather than just ignoring any rounding.
*/
public:
AudioStreamInfo()
: AudioStreamInfo(16, 1, 16000){}; // Default values represent ESPHome's audio components historical values
AudioStreamInfo(uint8_t bits_per_sample, uint8_t channels, uint32_t sample_rate);
uint8_t get_bits_per_sample() const { return this->bits_per_sample_; }
uint8_t get_channels() const { return this->channels_; }
uint32_t get_sample_rate() const { return this->sample_rate_; }
/// @brief Convert bytes to duration in milliseconds.
/// @param bytes Number of bytes to convert
/// @return Duration in milliseconds that will store `bytes` bytes of audio. May round down for certain sample rates
/// or values of `bytes`.
uint32_t bytes_to_ms(size_t bytes) const {
return bytes * 1000 / (this->sample_rate_ * this->bytes_per_sample_ * this->channels_);
}
/// @brief Convert bytes to frames.
/// @param bytes Number of bytes to convert
/// @return Audio frames that will store `bytes` bytes.
uint32_t bytes_to_frames(size_t bytes) const { return (bytes / (this->bytes_per_sample_ * this->channels_)); }
/// @brief Convert bytes to samples.
/// @param bytes Number of bytes to convert
/// @return Audio samples that will store `bytes` bytes.
uint32_t bytes_to_samples(size_t bytes) const { return (bytes / this->bytes_per_sample_); }
/// @brief Converts frames to bytes.
/// @param frames Number of frames to convert.
/// @return Number of bytes that will store `frames` frames of audio.
size_t frames_to_bytes(uint32_t frames) const { return frames * this->bytes_per_sample_ * this->channels_; }
/// @brief Converts samples to bytes.
/// @param samples Number of samples to convert.
/// @return Number of bytes that will store `samples` samples of audio.
size_t samples_to_bytes(uint32_t samples) const { return samples * this->bytes_per_sample_; }
/// @brief Converts duration to frames.
/// @param ms Duration in milliseconds
/// @return Audio frames that will store `ms` milliseconds of audio. May round down for certain sample rates.
uint32_t ms_to_frames(uint32_t ms) const { return (ms * this->sample_rate_) / 1000; }
/// @brief Converts duration to samples.
/// @param ms Duration in milliseconds
/// @return Audio samples that will store `ms` milliseconds of audio. May round down for certain sample rates.
uint32_t ms_to_samples(uint32_t ms) const { return (ms * this->channels_ * this->sample_rate_) / 1000; }
/// @brief Converts duration to bytes. May round down for certain sample rates.
/// @param ms Duration in milliseconds
/// @return Bytes that will store `ms` milliseconds of audio. May round down for certain sample rates.
size_t ms_to_bytes(uint32_t ms) const {
return (ms * this->bytes_per_sample_ * this->channels_ * this->sample_rate_) / 1000;
}
/// @brief Computes the duration, in microseconds, the given amount of frames represents.
/// @param frames Number of audio frames
/// @return Duration in microseconds `frames` represents. May be slightly inaccurate due to integer division rounding
/// for certain sample rates.
uint32_t frames_to_microseconds(uint32_t frames) const;
/// @brief Computes the duration, in milliseconds, the given amount of frames represents. Avoids
/// accumulating rounding errors by updating `frames` with the remainder after converting.
/// @param frames Pointer to uint32_t with the number of audio frames. Replaced with the remainder.
/// @return Duration in milliseconds `frames` represents. Always less than or equal to the actual value due to
/// rounding.
uint32_t frames_to_milliseconds_with_remainder(uint32_t *frames) const;
// Class comparison operators
bool operator==(const AudioStreamInfo &rhs) const;
bool operator!=(const AudioStreamInfo &rhs) const { return !operator==(rhs); }
size_t get_bytes_per_sample() const { return bits_per_sample / 8; }
uint8_t channels = 1;
uint8_t bits_per_sample = 16;
uint32_t sample_rate = 16000;
protected:
uint8_t bits_per_sample_;
uint8_t channels_;
uint32_t sample_rate_;
// The greatest common divisor between 1000 ms = 1 second and the sample rate. Used to avoid accumulating error when
// converting from frames to duration. Computed at construction.
uint32_t ms_sample_rate_gcd_;
// Conversion factor derived from the number of bits per sample. Assumes audio data is aligned to the byte. Computed
// at construction.
size_t bytes_per_sample_;
};
enum class AudioFileType : uint8_t {
NONE = 0,
#ifdef USE_AUDIO_FLAC_SUPPORT
FLAC,
#endif
#ifdef USE_AUDIO_MP3_SUPPORT
MP3,
#endif
WAV,
};
struct AudioFile {
const uint8_t *data;
size_t length;
AudioFileType file_type;
};
/// @brief Helper function to convert file type to a const char string
/// @param file_type
/// @return const char pointer to the readable file type
const char *audio_file_type_to_string(AudioFileType file_type);
/// @brief Scales Q15 fixed point audio samples. Scales in place if audio_samples == output_buffer.
/// @param audio_samples PCM int16 audio samples
/// @param output_buffer Buffer to store the scaled samples
/// @param scale_factor Q15 fixed point scaling factor
/// @param samples_to_scale Number of samples to scale
void scale_audio_samples(const int16_t *audio_samples, int16_t *output_buffer, int16_t scale_factor,
size_t samples_to_scale);
} // namespace audio
} // namespace esphome
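To make the remainder bookkeeping concrete, here is a minimal standalone sketch (not part of this commit) of the same arithmetic frames_to_milliseconds_with_remainder performs, applied to a 44.1 kHz stream; the chunk size and iteration count are arbitrary:

#include <cstdint>
#include <cstdio>

// For 44.1 kHz, gcd(1000, 44100) = 100, so only multiples of 44100 / 100 = 441 frames convert to a whole
// number of milliseconds exactly. Frames that don't yet form such a multiple are carried to the next call,
// so no rounding error accumulates over a long playback.
int main() {
  const uint32_t sample_rate = 44100;
  const uint32_t ms_sample_rate_gcd = 100;  // gcd(1000, 44100)
  uint32_t pending_frames = 0;
  uint32_t playback_ms = 0;
  for (int chunk = 0; chunk < 5; chunk++) {
    pending_frames += 1024;  // pretend 1024 frames were written to the sink in this chunk
    uint32_t remainder = pending_frames % (sample_rate / ms_sample_rate_gcd);
    uint32_t usable_frames = pending_frames - remainder;
    playback_ms += (usable_frames * 1000) / sample_rate;  // exact by construction
    pending_frames = remainder;                           // carried into the next conversion
  }
  printf("counted %u ms with %u frames still pending\n", (unsigned) playback_ms, (unsigned) pending_frames);
  return 0;
}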

View File

@@ -0,0 +1,361 @@
#include "audio_decoder.h"
#ifdef USE_ESP32
#include "esphome/core/hal.h"
namespace esphome {
namespace audio {
static const uint32_t DECODING_TIMEOUT_MS = 50; // The decode function will yield after this duration
static const uint32_t READ_WRITE_TIMEOUT_MS = 20; // Timeout for transferring audio data
static const uint32_t MAX_POTENTIALLY_FAILED_COUNT = 10;
AudioDecoder::AudioDecoder(size_t input_buffer_size, size_t output_buffer_size) {
this->input_transfer_buffer_ = AudioSourceTransferBuffer::create(input_buffer_size);
this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(output_buffer_size);
}
AudioDecoder::~AudioDecoder() {
#ifdef USE_AUDIO_MP3_SUPPORT
if (this->audio_file_type_ == AudioFileType::MP3) {
esp_audio_libs::helix_decoder::MP3FreeDecoder(this->mp3_decoder_);
}
#endif
}
esp_err_t AudioDecoder::add_source(std::weak_ptr<RingBuffer> &input_ring_buffer) {
if (this->input_transfer_buffer_ != nullptr) {
this->input_transfer_buffer_->set_source(input_ring_buffer);
return ESP_OK;
}
return ESP_ERR_NO_MEM;
}
esp_err_t AudioDecoder::add_sink(std::weak_ptr<RingBuffer> &output_ring_buffer) {
if (this->output_transfer_buffer_ != nullptr) {
this->output_transfer_buffer_->set_sink(output_ring_buffer);
return ESP_OK;
}
return ESP_ERR_NO_MEM;
}
#ifdef USE_SPEAKER
esp_err_t AudioDecoder::add_sink(speaker::Speaker *speaker) {
if (this->output_transfer_buffer_ != nullptr) {
this->output_transfer_buffer_->set_sink(speaker);
return ESP_OK;
}
return ESP_ERR_NO_MEM;
}
#endif
esp_err_t AudioDecoder::start(AudioFileType audio_file_type) {
if ((this->input_transfer_buffer_ == nullptr) || (this->output_transfer_buffer_ == nullptr)) {
return ESP_ERR_NO_MEM;
}
this->audio_file_type_ = audio_file_type;
this->potentially_failed_count_ = 0;
this->end_of_file_ = false;
switch (this->audio_file_type_) {
#ifdef USE_AUDIO_FLAC_SUPPORT
case AudioFileType::FLAC:
this->flac_decoder_ = make_unique<esp_audio_libs::flac::FLACDecoder>();
this->free_buffer_required_ =
this->output_transfer_buffer_->capacity(); // We'll revise this after reading the header
break;
#endif
#ifdef USE_AUDIO_MP3_SUPPORT
case AudioFileType::MP3:
this->mp3_decoder_ = esp_audio_libs::helix_decoder::MP3InitDecoder();
this->free_buffer_required_ = 1152 * sizeof(int16_t) * 2; // samples * size per sample * channels
break;
#endif
case AudioFileType::WAV:
this->wav_decoder_ = make_unique<esp_audio_libs::wav_decoder::WAVDecoder>();
this->wav_decoder_->reset();
this->free_buffer_required_ = 1024;
break;
case AudioFileType::NONE:
default:
return ESP_ERR_NOT_SUPPORTED;
break;
}
return ESP_OK;
}
AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {
if (stop_gracefully) {
if (this->output_transfer_buffer_->available() == 0) {
if (this->end_of_file_) {
// The file decoder indicates it reached the end of file
return AudioDecoderState::FINISHED;
}
if (!this->input_transfer_buffer_->has_buffered_data()) {
// If all the internal buffers are empty, the decoding is done
return AudioDecoderState::FINISHED;
}
}
}
if (this->potentially_failed_count_ > MAX_POTENTIALLY_FAILED_COUNT) {
if (stop_gracefully) {
// No more new data is going to come in, so decoding is done
return AudioDecoderState::FINISHED;
}
return AudioDecoderState::FAILED;
}
FileDecoderState state = FileDecoderState::MORE_TO_PROCESS;
uint32_t decoding_start = millis();
while (state == FileDecoderState::MORE_TO_PROCESS) {
// Transfer decoded out
if (!this->pause_output_) {
size_t bytes_written = this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
if (this->audio_stream_info_.has_value()) {
this->accumulated_frames_written_ += this->audio_stream_info_.value().bytes_to_frames(bytes_written);
this->playback_ms_ +=
this->audio_stream_info_.value().frames_to_milliseconds_with_remainder(&this->accumulated_frames_written_);
}
} else {
// If paused, block to avoid wasting CPU resources
delay(READ_WRITE_TIMEOUT_MS);
}
// Verify there is enough space to store more decoded audio and that the function hasn't been running too long
if ((this->output_transfer_buffer_->free() < this->free_buffer_required_) ||
(millis() - decoding_start > DECODING_TIMEOUT_MS)) {
return AudioDecoderState::DECODING;
}
// Decode more audio
size_t bytes_read = this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
if ((this->potentially_failed_count_ > 0) && (bytes_read == 0)) {
// Failed to decode in last attempt and there is no new data
if (this->input_transfer_buffer_->free() == 0) {
// The input buffer is full. Since it previously failed on the exact same data, we can never recover
state = FileDecoderState::FAILED;
} else {
// Attempt to get more data next time
state = FileDecoderState::IDLE;
}
} else if (this->input_transfer_buffer_->available() == 0) {
// No data to decode, attempt to get more data next time
state = FileDecoderState::IDLE;
} else {
switch (this->audio_file_type_) {
#ifdef USE_AUDIO_FLAC_SUPPORT
case AudioFileType::FLAC:
state = this->decode_flac_();
break;
#endif
#ifdef USE_AUDIO_MP3_SUPPORT
case AudioFileType::MP3:
state = this->decode_mp3_();
break;
#endif
case AudioFileType::WAV:
state = this->decode_wav_();
break;
case AudioFileType::NONE:
default:
state = FileDecoderState::IDLE;
break;
}
}
if (state == FileDecoderState::POTENTIALLY_FAILED) {
++this->potentially_failed_count_;
} else if (state == FileDecoderState::END_OF_FILE) {
this->end_of_file_ = true;
} else if (state == FileDecoderState::FAILED) {
return AudioDecoderState::FAILED;
} else if (state == FileDecoderState::MORE_TO_PROCESS) {
this->potentially_failed_count_ = 0;
}
}
return AudioDecoderState::DECODING;
}
#ifdef USE_AUDIO_FLAC_SUPPORT
FileDecoderState AudioDecoder::decode_flac_() {
if (!this->audio_stream_info_.has_value()) {
// Header hasn't been read
auto result = this->flac_decoder_->read_header(this->input_transfer_buffer_->get_buffer_start(),
this->input_transfer_buffer_->available());
if (result == esp_audio_libs::flac::FLAC_DECODER_HEADER_OUT_OF_DATA) {
return FileDecoderState::POTENTIALLY_FAILED;
}
if (result != esp_audio_libs::flac::FLAC_DECODER_SUCCESS) {
// Couldn't read FLAC header
return FileDecoderState::FAILED;
}
size_t bytes_consumed = this->flac_decoder_->get_bytes_index();
this->input_transfer_buffer_->decrease_buffer_length(bytes_consumed);
this->free_buffer_required_ = flac_decoder_->get_output_buffer_size_bytes();
if (this->output_transfer_buffer_->capacity() < this->free_buffer_required_) {
// Output buffer is not big enough
if (!this->output_transfer_buffer_->reallocate(this->free_buffer_required_)) {
// Couldn't reallocate output buffer
return FileDecoderState::FAILED;
}
}
this->audio_stream_info_ =
audio::AudioStreamInfo(this->flac_decoder_->get_sample_depth(), this->flac_decoder_->get_num_channels(),
this->flac_decoder_->get_sample_rate());
return FileDecoderState::MORE_TO_PROCESS;
}
uint32_t output_samples = 0;
auto result = this->flac_decoder_->decode_frame(
this->input_transfer_buffer_->get_buffer_start(), this->input_transfer_buffer_->available(),
reinterpret_cast<int16_t *>(this->output_transfer_buffer_->get_buffer_end()), &output_samples);
if (result == esp_audio_libs::flac::FLAC_DECODER_ERROR_OUT_OF_DATA) {
// Not an issue, just needs more data that we'll get next time.
return FileDecoderState::POTENTIALLY_FAILED;
}
size_t bytes_consumed = this->flac_decoder_->get_bytes_index();
this->input_transfer_buffer_->decrease_buffer_length(bytes_consumed);
if (result > esp_audio_libs::flac::FLAC_DECODER_ERROR_OUT_OF_DATA) {
// Corrupted frame, don't retry with current buffer content, wait for new sync
return FileDecoderState::POTENTIALLY_FAILED;
}
// We have successfully decoded some input data and have new output data
this->output_transfer_buffer_->increase_buffer_length(
this->audio_stream_info_.value().samples_to_bytes(output_samples));
if (result == esp_audio_libs::flac::FLAC_DECODER_NO_MORE_FRAMES) {
return FileDecoderState::END_OF_FILE;
}
return FileDecoderState::MORE_TO_PROCESS;
}
#endif
#ifdef USE_AUDIO_MP3_SUPPORT
FileDecoderState AudioDecoder::decode_mp3_() {
// Look for the next sync word
int buffer_length = (int) this->input_transfer_buffer_->available();
int32_t offset =
esp_audio_libs::helix_decoder::MP3FindSyncWord(this->input_transfer_buffer_->get_buffer_start(), buffer_length);
if (offset < 0) {
// New data may have the sync word
this->input_transfer_buffer_->decrease_buffer_length(buffer_length);
return FileDecoderState::POTENTIALLY_FAILED;
}
// Advance read pointer to match the offset for the syncword
this->input_transfer_buffer_->decrease_buffer_length(offset);
uint8_t *buffer_start = this->input_transfer_buffer_->get_buffer_start();
buffer_length = (int) this->input_transfer_buffer_->available();
int err = esp_audio_libs::helix_decoder::MP3Decode(this->mp3_decoder_, &buffer_start, &buffer_length,
(int16_t *) this->output_transfer_buffer_->get_buffer_end(), 0);
size_t consumed = this->input_transfer_buffer_->available() - buffer_length;
this->input_transfer_buffer_->decrease_buffer_length(consumed);
if (err) {
switch (err) {
case esp_audio_libs::helix_decoder::ERR_MP3_OUT_OF_MEMORY:
// Intentional fallthrough
case esp_audio_libs::helix_decoder::ERR_MP3_NULL_POINTER:
return FileDecoderState::FAILED;
break;
default:
// Most errors are recoverable by moving on to the next frame, so mark as potentially failed
return FileDecoderState::POTENTIALLY_FAILED;
break;
}
} else {
esp_audio_libs::helix_decoder::MP3FrameInfo mp3_frame_info;
esp_audio_libs::helix_decoder::MP3GetLastFrameInfo(this->mp3_decoder_, &mp3_frame_info);
if (mp3_frame_info.outputSamps > 0) {
int bytes_per_sample = (mp3_frame_info.bitsPerSample / 8);
this->output_transfer_buffer_->increase_buffer_length(mp3_frame_info.outputSamps * bytes_per_sample);
if (!this->audio_stream_info_.has_value()) {
this->audio_stream_info_ =
audio::AudioStreamInfo(mp3_frame_info.bitsPerSample, mp3_frame_info.nChans, mp3_frame_info.samprate);
}
}
}
return FileDecoderState::MORE_TO_PROCESS;
}
#endif
FileDecoderState AudioDecoder::decode_wav_() {
if (!this->audio_stream_info_.has_value()) {
// Header hasn't been processed
esp_audio_libs::wav_decoder::WAVDecoderResult result = this->wav_decoder_->decode_header(
this->input_transfer_buffer_->get_buffer_start(), this->input_transfer_buffer_->available());
if (result == esp_audio_libs::wav_decoder::WAV_DECODER_SUCCESS_IN_DATA) {
this->input_transfer_buffer_->decrease_buffer_length(this->wav_decoder_->bytes_processed());
this->audio_stream_info_ = audio::AudioStreamInfo(
this->wav_decoder_->bits_per_sample(), this->wav_decoder_->num_channels(), this->wav_decoder_->sample_rate());
this->wav_bytes_left_ = this->wav_decoder_->chunk_bytes_left();
this->wav_has_known_end_ = (this->wav_bytes_left_ > 0);
return FileDecoderState::MORE_TO_PROCESS;
} else if (result == esp_audio_libs::wav_decoder::WAV_DECODER_WARNING_INCOMPLETE_DATA) {
// Available data didn't have the full header
return FileDecoderState::POTENTIALLY_FAILED;
} else {
return FileDecoderState::FAILED;
}
} else {
if (!this->wav_has_known_end_ || (this->wav_bytes_left_ > 0)) {
size_t bytes_to_copy = this->input_transfer_buffer_->available();
if (this->wav_has_known_end_) {
bytes_to_copy = std::min(bytes_to_copy, this->wav_bytes_left_);
}
bytes_to_copy = std::min(bytes_to_copy, this->output_transfer_buffer_->free());
if (bytes_to_copy > 0) {
std::memcpy(this->output_transfer_buffer_->get_buffer_end(), this->input_transfer_buffer_->get_buffer_start(),
bytes_to_copy);
this->input_transfer_buffer_->decrease_buffer_length(bytes_to_copy);
this->output_transfer_buffer_->increase_buffer_length(bytes_to_copy);
if (this->wav_has_known_end_) {
this->wav_bytes_left_ -= bytes_to_copy;
}
}
return FileDecoderState::IDLE;
}
}
return FileDecoderState::END_OF_FILE;
}
} // namespace audio
} // namespace esphome
#endif

View File

@@ -0,0 +1,135 @@
#pragma once
#ifdef USE_ESP32
#include "audio.h"
#include "audio_transfer_buffer.h"
#include "esphome/core/defines.h"
#include "esphome/core/helpers.h"
#include "esphome/core/ring_buffer.h"
#ifdef USE_SPEAKER
#include "esphome/components/speaker/speaker.h"
#endif
#include "esp_err.h"
// esp-audio-libs
#ifdef USE_AUDIO_FLAC_SUPPORT
#include <flac_decoder.h>
#endif
#ifdef USE_AUDIO_MP3_SUPPORT
#include <mp3_decoder.h>
#endif
#include <wav_decoder.h>
namespace esphome {
namespace audio {
enum class AudioDecoderState : uint8_t {
DECODING = 0, // More data is available to decode
FINISHED, // All file data has been decoded and transferred
FAILED, // Encountered an error
};
// Only used within the AudioDecoder class; conveys the state of the particular file type decoder
enum class FileDecoderState : uint8_t {
MORE_TO_PROCESS, // Successfully read a file chunk and more data is available to decode
IDLE, // Not enough data to decode, waiting for more to be transferred
POTENTIALLY_FAILED, // Decoder encountered a potentially recoverable error if more file data is available
FAILED, // Decoder encountered an unrecoverable error
END_OF_FILE, // The specific file decoder knows it's the end of the file
};
class AudioDecoder {
/*
* @brief Class that facilitates decoding an audio file.
* The audio file is read from a ring buffer source, decoded, and sent to an audio sink (ring buffer or speaker
* component).
* Supports wav, flac, and mp3 formats.
*/
public:
/// @brief Allocates the input and output transfer buffers
/// @param input_buffer_size Size of the input transfer buffer in bytes.
/// @param output_buffer_size Size of the output transfer buffer in bytes.
AudioDecoder(size_t input_buffer_size, size_t output_buffer_size);
/// @brief Deallocates the MP3 decoder (the flac and wav decoders are deallocated automatically)
~AudioDecoder();
/// @brief Adds a source ring buffer for raw file data. Takes ownership of the ring buffer in a shared_ptr.
/// @param input_ring_buffer weak_ptr of a shared_ptr of the source ring buffer to transfer ownership
/// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
esp_err_t add_source(std::weak_ptr<RingBuffer> &input_ring_buffer);
/// @brief Adds a sink ring buffer for decoded audio. Takes ownership of the ring buffer in a shared_ptr.
/// @param output_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership
/// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
esp_err_t add_sink(std::weak_ptr<RingBuffer> &output_ring_buffer);
#ifdef USE_SPEAKER
/// @brief Adds a sink speaker for decoded audio.
/// @param speaker pointer to speaker component
/// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
esp_err_t add_sink(speaker::Speaker *speaker);
#endif
/// @brief Sets up decoding the file
/// @param audio_file_type AudioFileType of the file
/// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffers fail to allocate, or ESP_ERR_NOT_SUPPORTED if
/// the format isn't supported.
esp_err_t start(AudioFileType audio_file_type);
/// @brief Decodes audio from the ring buffer source and writes to the sink.
/// @param stop_gracefully If true, it indicates the file source is finished. The decoder will decode all the
/// remaining data and then finish.
/// @return AudioDecoderState
AudioDecoderState decode(bool stop_gracefully);
/// @brief Gets the audio stream information, if it has been decoded from the file's header
/// @return optional<AudioStreamInfo> with the audio information. If not available yet, returns no value.
const optional<audio::AudioStreamInfo> &get_audio_stream_info() const { return this->audio_stream_info_; }
/// @brief Returns the duration of audio (in milliseconds) decoded and sent to the sink
/// @return Duration of decoded audio in milliseconds
uint32_t get_playback_ms() const { return this->playback_ms_; }
/// @brief Pauses sending decoded audio to the sink. If paused, it will continue to process internal buffers.
/// @param pause_state If true, audio data is not sent to the sink.
void set_pause_output_state(bool pause_state) { this->pause_output_ = pause_state; }
protected:
std::unique_ptr<esp_audio_libs::wav_decoder::WAVDecoder> wav_decoder_;
#ifdef USE_AUDIO_FLAC_SUPPORT
FileDecoderState decode_flac_();
std::unique_ptr<esp_audio_libs::flac::FLACDecoder> flac_decoder_;
#endif
#ifdef USE_AUDIO_MP3_SUPPORT
FileDecoderState decode_mp3_();
esp_audio_libs::helix_decoder::HMP3Decoder mp3_decoder_;
#endif
FileDecoderState decode_wav_();
std::unique_ptr<AudioSourceTransferBuffer> input_transfer_buffer_;
std::unique_ptr<AudioSinkTransferBuffer> output_transfer_buffer_;
AudioFileType audio_file_type_{AudioFileType::NONE};
optional<AudioStreamInfo> audio_stream_info_{};
size_t free_buffer_required_{0};
size_t wav_bytes_left_{0};
uint32_t potentially_failed_count_{0};
bool end_of_file_{false};
bool wav_has_known_end_{false};
bool pause_output_{false};
uint32_t accumulated_frames_written_{0};
uint32_t playback_ms_{0};
};
} // namespace audio
} // namespace esphome
#endif
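A rough usage sketch (not part of this commit) of how a caller task might drive the decoder declared above, with encoded MP3 data arriving in a ring buffer and decoded audio sent to a speaker. The helper name, buffer sizes, and the RingBuffer::create factory from esphome/core/ring_buffer.h are assumptions; USE_SPEAKER and MP3 support are presumed enabled:

#include "esphome/components/audio/audio_decoder.h"
#include "esphome/core/ring_buffer.h"

#include <atomic>

using namespace esphome;

// Hypothetical helper: decode an MP3 stream out of `encoded_input` and play it on `spkr`.
void decode_mp3_to_speaker(std::shared_ptr<RingBuffer> encoded_input, speaker::Speaker *spkr,
                           std::atomic<bool> &source_finished) {
  audio::AudioDecoder decoder(8192, 16384);  // input/output transfer buffer sizes are illustrative

  std::weak_ptr<RingBuffer> weak_input = encoded_input;
  if (decoder.add_source(weak_input) != ESP_OK)
    return;  // transfer buffer was not allocated
  if (decoder.add_sink(spkr) != ESP_OK)
    return;
  if (decoder.start(audio::AudioFileType::MP3) != ESP_OK)
    return;

  while (true) {
    // The producer sets source_finished once the whole file is in the ring buffer (stop_gracefully).
    audio::AudioDecoderState state = decoder.decode(source_finished.load());
    if (state != audio::AudioDecoderState::DECODING)
      break;  // FINISHED or FAILED
    // decode() yields after roughly 50 ms, so the owning task can refill the ring buffer here.
  }
}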

View File

@@ -0,0 +1,308 @@
#include "audio_reader.h"
#ifdef USE_ESP_IDF
#include "esphome/core/defines.h"
#include "esphome/core/hal.h"
#include "esphome/core/helpers.h"
#if CONFIG_MBEDTLS_CERTIFICATE_BUNDLE
#include "esp_crt_bundle.h"
#endif
namespace esphome {
namespace audio {
static const uint32_t READ_WRITE_TIMEOUT_MS = 20;
// The number of times the http read times out with no data before throwing an error
static const uint32_t ERROR_COUNT_NO_DATA_READ_TIMEOUT = 100;
static const size_t HTTP_STREAM_BUFFER_SIZE = 2048;
static const uint8_t MAX_REDIRECTION = 5;
// Some common HTTP status codes - borrowed from http_request component accessed 20241224
enum HttpStatus {
HTTP_STATUS_OK = 200,
HTTP_STATUS_NO_CONTENT = 204,
HTTP_STATUS_PARTIAL_CONTENT = 206,
/* 3xx - Redirection */
HTTP_STATUS_MULTIPLE_CHOICES = 300,
HTTP_STATUS_MOVED_PERMANENTLY = 301,
HTTP_STATUS_FOUND = 302,
HTTP_STATUS_SEE_OTHER = 303,
HTTP_STATUS_NOT_MODIFIED = 304,
HTTP_STATUS_TEMPORARY_REDIRECT = 307,
HTTP_STATUS_PERMANENT_REDIRECT = 308,
/* 4XX - CLIENT ERROR */
HTTP_STATUS_BAD_REQUEST = 400,
HTTP_STATUS_UNAUTHORIZED = 401,
HTTP_STATUS_FORBIDDEN = 403,
HTTP_STATUS_NOT_FOUND = 404,
HTTP_STATUS_METHOD_NOT_ALLOWED = 405,
HTTP_STATUS_NOT_ACCEPTABLE = 406,
HTTP_STATUS_LENGTH_REQUIRED = 411,
/* 5xx - Server Error */
HTTP_STATUS_INTERNAL_ERROR = 500
};
AudioReader::~AudioReader() { this->cleanup_connection_(); }
esp_err_t AudioReader::add_sink(const std::weak_ptr<RingBuffer> &output_ring_buffer) {
if (current_audio_file_ != nullptr) {
// A transfer buffer isn't necessary for a local file
this->file_ring_buffer_ = output_ring_buffer.lock();
return ESP_OK;
}
if (this->output_transfer_buffer_ != nullptr) {
this->output_transfer_buffer_->set_sink(output_ring_buffer);
return ESP_OK;
}
return ESP_ERR_INVALID_STATE;
}
esp_err_t AudioReader::start(AudioFile *audio_file, AudioFileType &file_type) {
file_type = AudioFileType::NONE;
this->current_audio_file_ = audio_file;
this->file_current_ = audio_file->data;
file_type = audio_file->file_type;
return ESP_OK;
}
esp_err_t AudioReader::start(const std::string &uri, AudioFileType &file_type) {
file_type = AudioFileType::NONE;
this->cleanup_connection_();
if (uri.empty()) {
return ESP_ERR_INVALID_ARG;
}
esp_http_client_config_t client_config = {};
client_config.url = uri.c_str();
client_config.cert_pem = nullptr;
client_config.disable_auto_redirect = false;
client_config.max_redirection_count = 10;
client_config.event_handler = http_event_handler;
client_config.user_data = this;
client_config.buffer_size = HTTP_STREAM_BUFFER_SIZE;
client_config.keep_alive_enable = true;
client_config.timeout_ms = 5000; // Shouldn't trigger watchdog resets if caller runs in a task
#if CONFIG_MBEDTLS_CERTIFICATE_BUNDLE
if (uri.find("https:") != std::string::npos) {
client_config.crt_bundle_attach = esp_crt_bundle_attach;
}
#endif
this->client_ = esp_http_client_init(&client_config);
if (this->client_ == nullptr) {
return ESP_FAIL;
}
esp_err_t err = esp_http_client_open(this->client_, 0);
if (err != ESP_OK) {
this->cleanup_connection_();
return err;
}
int64_t header_length = esp_http_client_fetch_headers(this->client_);
if (header_length < 0) {
this->cleanup_connection_();
return ESP_FAIL;
}
int status_code = esp_http_client_get_status_code(this->client_);
if ((status_code < HTTP_STATUS_OK) || (status_code > HTTP_STATUS_PERMANENT_REDIRECT)) {
this->cleanup_connection_();
return ESP_FAIL;
}
ssize_t redirect_count = 0;
while ((esp_http_client_set_redirection(this->client_) == ESP_OK) && (redirect_count < MAX_REDIRECTION)) {
err = esp_http_client_open(this->client_, 0);
if (err != ESP_OK) {
this->cleanup_connection_();
return ESP_FAIL;
}
header_length = esp_http_client_fetch_headers(this->client_);
if (header_length < 0) {
this->cleanup_connection_();
return ESP_FAIL;
}
status_code = esp_http_client_get_status_code(this->client_);
if ((status_code < HTTP_STATUS_OK) || (status_code > HTTP_STATUS_PERMANENT_REDIRECT)) {
this->cleanup_connection_();
return ESP_FAIL;
}
++redirect_count;
}
if (this->audio_file_type_ == AudioFileType::NONE) {
// Failed to determine the file type from the header, fallback to using the url
char url[500];
err = esp_http_client_get_url(this->client_, url, 500);
if (err != ESP_OK) {
this->cleanup_connection_();
return err;
}
std::string url_string = str_lower_case(url);
if (str_endswith(url_string, ".wav")) {
file_type = AudioFileType::WAV;
}
#ifdef USE_AUDIO_MP3_SUPPORT
else if (str_endswith(url_string, ".mp3")) {
file_type = AudioFileType::MP3;
}
#endif
#ifdef USE_AUDIO_FLAC_SUPPORT
else if (str_endswith(url_string, ".flac")) {
file_type = AudioFileType::FLAC;
}
#endif
else {
file_type = AudioFileType::NONE;
this->cleanup_connection_();
return ESP_ERR_NOT_SUPPORTED;
}
} else {
file_type = this->audio_file_type_;
}
this->no_data_read_count_ = 0;
this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(this->buffer_size_);
if (this->output_transfer_buffer_ == nullptr) {
return ESP_ERR_NO_MEM;
}
return ESP_OK;
}
AudioReaderState AudioReader::read() {
if (this->client_ != nullptr) {
return this->http_read_();
} else if (this->current_audio_file_ != nullptr) {
return this->file_read_();
}
return AudioReaderState::FAILED;
}
AudioFileType AudioReader::get_audio_type(const char *content_type) {
#ifdef USE_AUDIO_MP3_SUPPORT
if (strcasecmp(content_type, "mp3") == 0 || strcasecmp(content_type, "audio/mp3") == 0 ||
strcasecmp(content_type, "audio/mpeg") == 0) {
return AudioFileType::MP3;
}
#endif
if (strcasecmp(content_type, "audio/wav") == 0) {
return AudioFileType::WAV;
}
#ifdef USE_AUDIO_FLAC_SUPPORT
if (strcasecmp(content_type, "audio/flac") == 0 || strcasecmp(content_type, "audio/x-flac") == 0) {
return AudioFileType::FLAC;
}
#endif
return AudioFileType::NONE;
}
esp_err_t AudioReader::http_event_handler(esp_http_client_event_t *evt) {
// Based on https://github.com/maroc81/WeatherLily/tree/main/main/net accessed 20241224
AudioReader *this_reader = (AudioReader *) evt->user_data;
switch (evt->event_id) {
case HTTP_EVENT_ON_HEADER:
if (strcasecmp(evt->header_key, "Content-Type") == 0) {
this_reader->audio_file_type_ = get_audio_type(evt->header_value);
}
break;
default:
break;
}
return ESP_OK;
}
AudioReaderState AudioReader::file_read_() {
size_t remaining_bytes = this->current_audio_file_->length - (this->file_current_ - this->current_audio_file_->data);
if (remaining_bytes > 0) {
size_t bytes_written = this->file_ring_buffer_->write_without_replacement(this->file_current_, remaining_bytes,
pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
this->file_current_ += bytes_written;
return AudioReaderState::READING;
}
return AudioReaderState::FINISHED;
}
AudioReaderState AudioReader::http_read_() {
this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
if (esp_http_client_is_complete_data_received(this->client_)) {
if (this->output_transfer_buffer_->available() == 0) {
this->cleanup_connection_();
return AudioReaderState::FINISHED;
}
} else {
size_t bytes_to_read = this->output_transfer_buffer_->free();
int received_len =
esp_http_client_read(this->client_, (char *) this->output_transfer_buffer_->get_buffer_end(), bytes_to_read);
if (received_len > 0) {
this->output_transfer_buffer_->increase_buffer_length(received_len);
this->no_data_read_count_ = 0;
} else if (received_len < 0) {
// HTTP read error
this->cleanup_connection_();
return AudioReaderState::FAILED;
} else {
if (bytes_to_read > 0) {
// Read timed out
++this->no_data_read_count_;
if (this->no_data_read_count_ >= ERROR_COUNT_NO_DATA_READ_TIMEOUT) {
// Timed out with no data read too many times, so the http read has failed
this->cleanup_connection_();
return AudioReaderState::FAILED;
}
delay(READ_WRITE_TIMEOUT_MS);
}
}
}
return AudioReaderState::READING;
}
void AudioReader::cleanup_connection_() {
if (this->client_ != nullptr) {
esp_http_client_close(this->client_);
esp_http_client_cleanup(this->client_);
this->client_ = nullptr;
}
}
} // namespace audio
} // namespace esphome
#endif

View File

@@ -0,0 +1,85 @@
#pragma once
#ifdef USE_ESP_IDF
#include "audio.h"
#include "audio_transfer_buffer.h"
#include "esphome/core/ring_buffer.h"
#include "esp_err.h"
#include <esp_http_client.h>
namespace esphome {
namespace audio {
enum class AudioReaderState : uint8_t {
READING = 0, // More data is available to read
FINISHED, // All data has been read and transferred
FAILED, // Encountered an error
};
class AudioReader {
/*
* @brief Class that facilitates reading a raw audio file.
* Files can be read from flash (stored in an AudioFile struct) or from an http source.
* The file data is sent to a ring buffer sink.
*/
public:
/// @brief Constructs an AudioReader object.
/// The transfer buffer isn't allocated here, but only if necessary (an http source) in the start function.
/// @param buffer_size Transfer buffer size in bytes.
AudioReader(size_t buffer_size) : buffer_size_(buffer_size) {}
~AudioReader();
/// @brief Adds a sink ring buffer for audio data. Takes ownership of the ring buffer in a shared_ptr
/// @param output_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership
/// @return ESP_OK if successful, ESP_ERR_INVALID_STATE otherwise
esp_err_t add_sink(const std::weak_ptr<RingBuffer> &output_ring_buffer);
/// @brief Starts reading an audio file from an http source. The transfer buffer is allocated here.
/// @param uri Web url to the http file.
/// @param file_type AudioFileType variable passed-by-reference indicating the type of file being read.
/// @return ESP_OK if successful, an ESP_ERR* code otherwise.
esp_err_t start(const std::string &uri, AudioFileType &file_type);
/// @brief Starts reading an audio file from flash. No transfer buffer is allocated.
/// @param audio_file AudioFile struct containing the file.
/// @param file_type AudioFileType variable passed-by-reference indicating the type of file being read.
/// @return ESP_OK
esp_err_t start(AudioFile *audio_file, AudioFileType &file_type);
/// @brief Reads new file data from the source and sends to the ring buffer sink.
/// @return AudioReaderState
AudioReaderState read();
protected:
/// @brief Monitors the http client events to attempt to determine the file type from the Content-Type header
static esp_err_t http_event_handler(esp_http_client_event_t *evt);
/// @brief Determines the audio file type from the http header's Content-Type key
/// @param content_type string with the Content-Type key
/// @return AudioFileType of the url, if it can be determined. If not, return AudioFileType::NONE.
static AudioFileType get_audio_type(const char *content_type);
AudioReaderState file_read_();
AudioReaderState http_read_();
std::shared_ptr<RingBuffer> file_ring_buffer_;
std::unique_ptr<AudioSinkTransferBuffer> output_transfer_buffer_;
void cleanup_connection_();
size_t buffer_size_;
uint32_t no_data_read_count_;
esp_http_client_handle_t client_{nullptr};
AudioFile *current_audio_file_{nullptr};
AudioFileType audio_file_type_{AudioFileType::NONE};
const uint8_t *file_current_{nullptr};
};
} // namespace audio
} // namespace esphome
#endif
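A rough usage sketch (not part of this commit) of the reader declared above streaming an HTTP file into a ring buffer for a downstream decoder; the helper name and sizes are assumptions. Note that for an HTTP source start() has to run before add_sink(), since start() allocates the transfer buffer:

#include "esphome/components/audio/audio_reader.h"
#include "esphome/core/ring_buffer.h"

using namespace esphome;

// Hypothetical helper: copy the file at `url` into `encoded_output` and report its detected type.
void read_url_to_ring_buffer(const std::string &url, std::shared_ptr<RingBuffer> encoded_output) {
  audio::AudioReader reader(4096);  // HTTP transfer buffer size is illustrative

  audio::AudioFileType file_type = audio::AudioFileType::NONE;
  if (reader.start(url, file_type) != ESP_OK)
    return;  // connection failed, bad status code, or unsupported file type

  std::weak_ptr<RingBuffer> weak_output = encoded_output;
  if (reader.add_sink(weak_output) != ESP_OK)
    return;

  while (reader.read() == audio::AudioReaderState::READING) {
    // Each read() call blocks at most ~20 ms; loop until FINISHED (all data transferred) or FAILED.
  }
}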

View File

@@ -0,0 +1,159 @@
#include "audio_resampler.h"
#ifdef USE_ESP32
#include "esphome/core/hal.h"
namespace esphome {
namespace audio {
static const uint32_t READ_WRITE_TIMEOUT_MS = 20;
AudioResampler::AudioResampler(size_t input_buffer_size, size_t output_buffer_size)
: input_buffer_size_(input_buffer_size), output_buffer_size_(output_buffer_size) {
this->input_transfer_buffer_ = AudioSourceTransferBuffer::create(input_buffer_size);
this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(output_buffer_size);
}
esp_err_t AudioResampler::add_source(std::weak_ptr<RingBuffer> &input_ring_buffer) {
if (this->input_transfer_buffer_ != nullptr) {
this->input_transfer_buffer_->set_source(input_ring_buffer);
return ESP_OK;
}
return ESP_ERR_NO_MEM;
}
esp_err_t AudioResampler::add_sink(std::weak_ptr<RingBuffer> &output_ring_buffer) {
if (this->output_transfer_buffer_ != nullptr) {
this->output_transfer_buffer_->set_sink(output_ring_buffer);
return ESP_OK;
}
return ESP_ERR_NO_MEM;
}
#ifdef USE_SPEAKER
esp_err_t AudioResampler::add_sink(speaker::Speaker *speaker) {
if (this->output_transfer_buffer_ != nullptr) {
this->output_transfer_buffer_->set_sink(speaker);
return ESP_OK;
}
return ESP_ERR_NO_MEM;
}
#endif
esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info,
uint16_t number_of_taps, uint16_t number_of_filters) {
this->input_stream_info_ = input_stream_info;
this->output_stream_info_ = output_stream_info;
if ((this->input_transfer_buffer_ == nullptr) || (this->output_transfer_buffer_ == nullptr)) {
return ESP_ERR_NO_MEM;
}
if ((input_stream_info.get_bits_per_sample() > 32) || (output_stream_info.get_bits_per_sample() > 32) ||
(input_stream_info_.get_channels() != output_stream_info.get_channels())) {
return ESP_ERR_NOT_SUPPORTED;
}
if ((input_stream_info.get_sample_rate() != output_stream_info.get_sample_rate()) ||
(input_stream_info.get_bits_per_sample() != output_stream_info.get_bits_per_sample())) {
this->resampler_ = make_unique<esp_audio_libs::resampler::Resampler>(
input_stream_info.bytes_to_samples(this->input_buffer_size_),
output_stream_info.bytes_to_samples(this->output_buffer_size_));
// Use cascaded biquad filters when downsampling to avoid aliasing
bool use_pre_filter = output_stream_info.get_sample_rate() < input_stream_info.get_sample_rate();
esp_audio_libs::resampler::ResamplerConfiguration resample_config = {
.source_sample_rate = static_cast<float>(input_stream_info.get_sample_rate()),
.target_sample_rate = static_cast<float>(output_stream_info.get_sample_rate()),
.source_bits_per_sample = input_stream_info.get_bits_per_sample(),
.target_bits_per_sample = output_stream_info.get_bits_per_sample(),
.channels = input_stream_info_.get_channels(),
.use_pre_or_post_filter = use_pre_filter,
.subsample_interpolate = false, // Doubles the CPU load. Using more filters is a better alternative
.number_of_taps = number_of_taps,
.number_of_filters = number_of_filters,
};
if (!this->resampler_->initialize(resample_config)) {
// Failed to allocate the resampler's internal buffers
return ESP_ERR_NO_MEM;
}
}
return ESP_OK;
}
AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_differential) {
if (stop_gracefully) {
if (!this->input_transfer_buffer_->has_buffered_data() && (this->output_transfer_buffer_->available() == 0)) {
return AudioResamplerState::FINISHED;
}
}
if (!this->pause_output_) {
// Move audio data to the sink
this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
} else {
// If paused, block to avoid wasting CPU resources
delay(READ_WRITE_TIMEOUT_MS);
}
this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
if (this->input_transfer_buffer_->available() == 0) {
// No samples available to process
return AudioResamplerState::RESAMPLING;
}
const size_t bytes_free = this->output_transfer_buffer_->free();
const uint32_t frames_free = this->output_stream_info_.bytes_to_frames(bytes_free);
const size_t bytes_available = this->input_transfer_buffer_->available();
const uint32_t frames_available = this->input_stream_info_.bytes_to_frames(bytes_available);
if ((this->input_stream_info_.get_sample_rate() != this->output_stream_info_.get_sample_rate()) ||
(this->input_stream_info_.get_bits_per_sample() != this->output_stream_info_.get_bits_per_sample())) {
esp_audio_libs::resampler::ResamplerResults results =
this->resampler_->resample(this->input_transfer_buffer_->get_buffer_start(),
this->output_transfer_buffer_->get_buffer_end(), frames_available, frames_free, -3);
this->input_transfer_buffer_->decrease_buffer_length(this->input_stream_info_.frames_to_bytes(results.frames_used));
this->output_transfer_buffer_->increase_buffer_length(
this->output_stream_info_.frames_to_bytes(results.frames_generated));
// Resampling causes slight differences in the durations used versus generated. Computes the difference in
// milliseconds. The callback function passing the played audio duration uses the difference to convert from output
// duration to input duration.
this->accumulated_frames_used_ += results.frames_used;
this->accumulated_frames_generated_ += results.frames_generated;
const int32_t used_ms =
this->input_stream_info_.frames_to_milliseconds_with_remainder(&this->accumulated_frames_used_);
const int32_t generated_ms =
this->output_stream_info_.frames_to_milliseconds_with_remainder(&this->accumulated_frames_generated_);
*ms_differential = used_ms - generated_ms;
} else {
// No resampling required, copy samples directly to the output transfer buffer
*ms_differential = 0;
const size_t bytes_to_transfer = std::min(this->output_stream_info_.frames_to_bytes(frames_free),
this->input_stream_info_.frames_to_bytes(frames_available));
std::memcpy((void *) this->output_transfer_buffer_->get_buffer_end(),
(void *) this->input_transfer_buffer_->get_buffer_start(), bytes_to_transfer);
this->input_transfer_buffer_->decrease_buffer_length(bytes_to_transfer);
this->output_transfer_buffer_->increase_buffer_length(bytes_to_transfer);
}
return AudioResamplerState::RESAMPLING;
}
} // namespace audio
} // namespace esphome
#endif

View File

@@ -0,0 +1,101 @@
#pragma once
#ifdef USE_ESP32
#include "audio.h"
#include "audio_transfer_buffer.h"
#include "esphome/core/defines.h"
#include "esphome/core/ring_buffer.h"
#ifdef USE_SPEAKER
#include "esphome/components/speaker/speaker.h"
#endif
#include "esp_err.h"
#include <resampler.h> // esp-audio-libs
namespace esphome {
namespace audio {
enum class AudioResamplerState : uint8_t {
RESAMPLING, // More data is available to resample
FINISHED, // All file data has been resampled and transferred
FAILED, // Unused state included for consistency among Audio classes
};
class AudioResampler {
/*
* @brief Class that facilitates resampling audio.
* The audio data is read from a ring buffer source, resampled, and sent to an audio sink (ring buffer or speaker
* component). Also supports converting bits per sample.
*/
public:
/// @brief Allocates the input and output transfer buffers
/// @param input_buffer_size Size of the input transfer buffer in bytes.
/// @param output_buffer_size Size of the output transfer buffer in bytes.
AudioResampler(size_t input_buffer_size, size_t output_buffer_size);
/// @brief Adds a source ring buffer for audio data. Takes ownership of the ring buffer in a shared_ptr.
/// @param input_ring_buffer weak_ptr of a shared_ptr of the source ring buffer to transfer ownership
/// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
esp_err_t add_source(std::weak_ptr<RingBuffer> &input_ring_buffer);
/// @brief Adds a sink ring buffer for resampled audio. Takes ownership of the ring buffer in a shared_ptr.
/// @param output_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership
/// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
esp_err_t add_sink(std::weak_ptr<RingBuffer> &output_ring_buffer);
#ifdef USE_SPEAKER
/// @brief Adds a sink speaker for resampled audio.
/// @param speaker pointer to speaker component
/// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
esp_err_t add_sink(speaker::Speaker *speaker);
#endif
/// @brief Sets up the class to resample.
/// @param input_stream_info The incoming sample rate, bits per sample, and number of channels
/// @param output_stream_info The desired outgoing sample rate, bits per sample, and number of channels
/// @param number_of_taps Number of taps per FIR filter
/// @param number_of_filters Number of FIR filters
/// @return ESP_OK if it is able to convert the incoming stream,
/// ESP_ERR_NO_MEM if the transfer buffers failed to allocate,
/// ESP_ERR_NOT_SUPPORTED if the stream can't be converted.
esp_err_t start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info, uint16_t number_of_taps,
uint16_t number_of_filters);
/// @brief Resamples audio from the ring buffer source and writes to the sink.
/// @param stop_gracefully If true, it indicates the file decoder is finished. The resampler will resample all the
/// remaining audio and then finish.
/// @param ms_differential Pointer to a (int32_t) variable that will store the difference, in milliseconds, between
/// the duration of input audio used and the duration of output audio generated.
/// @return AudioResamplerState
AudioResamplerState resample(bool stop_gracefully, int32_t *ms_differential);
/// @brief Pauses sending resampled audio to the sink. If paused, it will continue to process internal buffers.
/// @param pause_state If true, audio data is not sent to the sink.
void set_pause_output_state(bool pause_state) { this->pause_output_ = pause_state; }
protected:
std::unique_ptr<AudioSourceTransferBuffer> input_transfer_buffer_;
std::unique_ptr<AudioSinkTransferBuffer> output_transfer_buffer_;
size_t input_buffer_size_;
size_t output_buffer_size_;
uint32_t accumulated_frames_used_{0};
uint32_t accumulated_frames_generated_{0};
bool pause_output_{false};
AudioStreamInfo input_stream_info_;
AudioStreamInfo output_stream_info_;
std::unique_ptr<esp_audio_libs::resampler::Resampler> resampler_;
};
} // namespace audio
} // namespace esphome
#endif
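A rough usage sketch (not part of this commit) of the resampler declared above converting a 44.1 kHz stream to 16 kHz between two ring buffers; the helper name, buffer sizes, and FIR tap/filter counts are placeholder values, not values taken from this change:

#include "esphome/components/audio/audio_resampler.h"
#include "esphome/core/ring_buffer.h"

#include <atomic>

using namespace esphome;

// Hypothetical helper: resample 16-bit stereo audio from 44.1 kHz down to 16 kHz.
void resample_between_ring_buffers(std::shared_ptr<RingBuffer> input, std::shared_ptr<RingBuffer> output,
                                   std::atomic<bool> &source_finished) {
  audio::AudioResampler resampler(8192, 8192);  // transfer buffer sizes are illustrative

  std::weak_ptr<RingBuffer> weak_in = input;
  std::weak_ptr<RingBuffer> weak_out = output;
  if (resampler.add_source(weak_in) != ESP_OK || resampler.add_sink(weak_out) != ESP_OK)
    return;

  audio::AudioStreamInfo source_info(16, 2, 44100);
  audio::AudioStreamInfo target_info(16, 2, 16000);  // channel counts must match
  if (resampler.start(source_info, target_info, 16, 16) != ESP_OK)  // 16 taps, 16 filters: placeholder settings
    return;

  int32_t ms_differential = 0;  // per-call drift between input duration consumed and output duration produced
  while (resampler.resample(source_finished.load(), &ms_differential) == audio::AudioResamplerState::RESAMPLING) {
    // A playback-position callback could add ms_differential when mapping output time back to input time.
  }
}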

View File

@@ -0,0 +1,165 @@
#include "audio_transfer_buffer.h"
#ifdef USE_ESP32
#include "esphome/core/helpers.h"
namespace esphome {
namespace audio {
AudioTransferBuffer::~AudioTransferBuffer() { this->deallocate_buffer_(); };
std::unique_ptr<AudioSinkTransferBuffer> AudioSinkTransferBuffer::create(size_t buffer_size) {
std::unique_ptr<AudioSinkTransferBuffer> sink_buffer = make_unique<AudioSinkTransferBuffer>();
if (!sink_buffer->allocate_buffer_(buffer_size)) {
return nullptr;
}
return sink_buffer;
}
std::unique_ptr<AudioSourceTransferBuffer> AudioSourceTransferBuffer::create(size_t buffer_size) {
std::unique_ptr<AudioSourceTransferBuffer> source_buffer = make_unique<AudioSourceTransferBuffer>();
if (!source_buffer->allocate_buffer_(buffer_size)) {
return nullptr;
}
return source_buffer;
}
size_t AudioTransferBuffer::free() const {
if (this->buffer_size_ == 0) {
return 0;
}
return this->buffer_size_ - (this->buffer_length_ - (this->data_start_ - this->buffer_));
}
void AudioTransferBuffer::decrease_buffer_length(size_t bytes) {
this->buffer_length_ -= bytes;
this->data_start_ += bytes;
}
void AudioTransferBuffer::increase_buffer_length(size_t bytes) { this->buffer_length_ += bytes; }
void AudioTransferBuffer::clear_buffered_data() {
this->buffer_length_ = 0;
if (this->ring_buffer_.use_count() > 0) {
this->ring_buffer_->reset();
}
}
void AudioSinkTransferBuffer::clear_buffered_data() {
this->buffer_length_ = 0;
if (this->ring_buffer_.use_count() > 0) {
this->ring_buffer_->reset();
}
#ifdef USE_SPEAKER
if (this->speaker_ != nullptr) {
this->speaker_->stop();
}
#endif
}
bool AudioTransferBuffer::has_buffered_data() const {
if (this->ring_buffer_.use_count() > 0) {
return ((this->ring_buffer_->available() > 0) || (this->available() > 0));
}
return (this->available() > 0);
}
bool AudioTransferBuffer::reallocate(size_t new_buffer_size) {
if (this->buffer_length_ > 0) {
// Already has data in the buffer, fail
return false;
}
this->deallocate_buffer_();
return this->allocate_buffer_(new_buffer_size);
}
bool AudioTransferBuffer::allocate_buffer_(size_t buffer_size) {
this->buffer_size_ = buffer_size;
RAMAllocator<uint8_t> allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
this->buffer_ = allocator.allocate(this->buffer_size_);
if (this->buffer_ == nullptr) {
return false;
}
this->data_start_ = this->buffer_;
this->buffer_length_ = 0;
return true;
}
void AudioTransferBuffer::deallocate_buffer_() {
if (this->buffer_ != nullptr) {
RAMAllocator<uint8_t> allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
allocator.deallocate(this->buffer_, this->buffer_size_);
this->buffer_ = nullptr;
this->data_start_ = nullptr;
}
this->buffer_size_ = 0;
this->buffer_length_ = 0;
}
size_t AudioSourceTransferBuffer::transfer_data_from_source(TickType_t ticks_to_wait) {
// Shift data in buffer to start
if (this->buffer_length_ > 0) {
memmove(this->buffer_, this->data_start_, this->buffer_length_);
}
this->data_start_ = this->buffer_;
size_t bytes_to_read = this->free();
size_t bytes_read = 0;
if (bytes_to_read > 0) {
if (this->ring_buffer_.use_count() > 0) {
bytes_read = this->ring_buffer_->read((void *) this->get_buffer_end(), bytes_to_read, ticks_to_wait);
}
this->increase_buffer_length(bytes_read);
}
return bytes_read;
}
size_t AudioSinkTransferBuffer::transfer_data_to_sink(TickType_t ticks_to_wait) {
size_t bytes_written = 0;
if (this->available()) {
#ifdef USE_SPEAKER
if (this->speaker_ != nullptr) {
bytes_written = this->speaker_->play(this->data_start_, this->available(), ticks_to_wait);
} else
#endif
if (this->ring_buffer_.use_count() > 0) {
bytes_written =
this->ring_buffer_->write_without_replacement((void *) this->data_start_, this->available(), ticks_to_wait);
}
this->decrease_buffer_length(bytes_written);
// Shift unwritten data to the start of the buffer
memmove(this->buffer_, this->data_start_, this->buffer_length_);
this->data_start_ = this->buffer_;
}
return bytes_written;
}
bool AudioSinkTransferBuffer::has_buffered_data() const {
#ifdef USE_SPEAKER
if (this->speaker_ != nullptr) {
return (this->speaker_->has_buffered_data() || (this->available() > 0));
}
#endif
if (this->ring_buffer_.use_count() > 0) {
return ((this->ring_buffer_->available() > 0) || (this->available() > 0));
}
return (this->available() > 0);
}
} // namespace audio
} // namespace esphome
#endif
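
A worked example of the buffer geometry used by free() and available() above (the numbers are illustrative, not taken from the diff):

// capacity 1024 bytes; 100 bytes already consumed from the front; 300 valid bytes remain
//   buffer_length_ = 300                 -> available() == 300
//   data_start_    = buffer_ + 100
//   free()         = 1024 - (300 + 100)  == 624  (space left after the valid data)
// transfer_data_from_source() first memmoves the 300 valid bytes back to buffer_,
// after which data_start_ == buffer_ and free() == 1024 - 300 == 724.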

View File

@ -0,0 +1,139 @@
#pragma once
#ifdef USE_ESP32
#include "esphome/core/defines.h"
#include "esphome/core/ring_buffer.h"
#ifdef USE_SPEAKER
#include "esphome/components/speaker/speaker.h"
#endif
#include "esp_err.h"
#include <freertos/FreeRTOS.h>
namespace esphome {
namespace audio {
class AudioTransferBuffer {
/*
* @brief Class that facilitates transferring data between a buffer and an audio source or sink.
* The transfer buffer is a typical C array that temporarily holds data for processing in other audio components.
* Both sink and source transfer buffers can use a ring buffer as the sink/source.
* - The ring buffer is stored in a shared_ptr, so destroying the transfer buffer object will release ownership.
*/
public:
/// @brief Destructor that deallocates the transfer buffer
~AudioTransferBuffer();
/// @brief Returns a pointer to the start of the transfer buffer where available() bytes of existing data can be read
uint8_t *get_buffer_start() const { return this->data_start_; }
/// @brief Returns a pointer to the end of the transfer buffer where free() bytes of new data can be written
uint8_t *get_buffer_end() const { return this->data_start_ + this->buffer_length_; }
/// @brief Updates the internal state of the transfer buffer. This should be called after reading data
/// @param bytes The number of bytes consumed/read
void decrease_buffer_length(size_t bytes);
/// @brief Updates the internal state of the transfer buffer. This should be called after writing data
/// @param bytes The number of bytes written
void increase_buffer_length(size_t bytes);
/// @brief Returns the transfer buffer's currently available bytes to read
size_t available() const { return this->buffer_length_; }
/// @brief Returns the transfer buffer's allocated bytes
size_t capacity() const { return this->buffer_size_; }
/// @brief Returns the transfer buffer's currently free bytes available to write
size_t free() const;
/// @brief Clears data in the transfer buffer and, if possible, the source/sink.
virtual void clear_buffered_data();
/// @brief Tests if there is any data in the transfer buffer or the source/sink.
/// @return True if there is data, false otherwise.
virtual bool has_buffered_data() const;
bool reallocate(size_t new_buffer_size);
protected:
/// @brief Allocates the transfer buffer in external memory, if available.
/// @return True if successful, false otherwise.
bool allocate_buffer_(size_t buffer_size);
/// @brief Deallocates the buffer and resets the class variables.
void deallocate_buffer_();
// A possible source or sink for the transfer buffer
std::shared_ptr<RingBuffer> ring_buffer_;
uint8_t *buffer_{nullptr};
uint8_t *data_start_{nullptr};
size_t buffer_size_{0};
size_t buffer_length_{0};
};
class AudioSinkTransferBuffer : public AudioTransferBuffer {
/*
* @brief A class that implements a transfer buffer for audio sinks.
* Supports writing processed data in the transfer buffer to a ring buffer or a speaker component.
*/
public:
/// @brief Creates a new sink transfer buffer.
/// @param buffer_size Size of the transfer buffer in bytes.
/// @return unique_ptr if successfully allocated, nullptr otherwise
static std::unique_ptr<AudioSinkTransferBuffer> create(size_t buffer_size);
/// @brief Writes any available data in the transfer buffer to the sink.
/// @param ticks_to_wait FreeRTOS ticks to block while waiting for the sink to have enough space
/// @return Number of bytes written
size_t transfer_data_to_sink(TickType_t ticks_to_wait);
/// @brief Adds a ring buffer as the transfer buffer's sink.
/// @param ring_buffer weak_ptr to the allocated ring buffer
void set_sink(const std::weak_ptr<RingBuffer> &ring_buffer) { this->ring_buffer_ = ring_buffer.lock(); }
#ifdef USE_SPEAKER
/// @brief Adds a speaker as the transfer buffer's sink.
/// @param speaker Pointer to the speaker component
void set_sink(speaker::Speaker *speaker) { this->speaker_ = speaker; }
#endif
void clear_buffered_data() override;
bool has_buffered_data() const override;
protected:
#ifdef USE_SPEAKER
speaker::Speaker *speaker_{nullptr};
#endif
};
class AudioSourceTransferBuffer : public AudioTransferBuffer {
/*
* @brief A class that implements a transfer buffer for audio sources.
* Supports reading audio data from a ring buffer into the transfer buffer for processing.
*/
public:
/// @brief Creates a new source transfer buffer.
/// @param buffer_size Size of the transfer buffer in bytes.
/// @return unique_ptr if successfully allocated, nullptr otherwise
static std::unique_ptr<AudioSourceTransferBuffer> create(size_t buffer_size);
/// @brief Reads any available data from the sink into the transfer buffer.
/// @param ticks_to_wait FreeRTOS ticks to block while waiting for the source to have enough data
/// @return Number of bytes read
size_t transfer_data_from_source(TickType_t ticks_to_wait);
/// @brief Adds a ring buffer as the transfer buffer's source.
/// @param ring_buffer weak_ptr to the allocated ring buffer
void set_source(const std::weak_ptr<RingBuffer> &ring_buffer) { this->ring_buffer_ = ring_buffer.lock(); };
};
} // namespace audio
} // namespace esphome
#endif
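
A hedged sketch of the read/process/consume cycle these declarations imply, based only on the API above; the buffer sizes and the process_samples() helper are placeholders invented for illustration:

using namespace esphome::audio;

void example_pass() {
  // Source of raw audio and a 2 KiB staging buffer (sizes are arbitrary here).
  std::shared_ptr<esphome::RingBuffer> rb = esphome::RingBuffer::create(4096);
  std::unique_ptr<AudioSourceTransferBuffer> in_buf = AudioSourceTransferBuffer::create(2048);
  if (in_buf == nullptr)
    return;  // allocation failed
  in_buf->set_source(rb);  // implicit shared_ptr -> weak_ptr conversion

  // One pass: pull bytes from the ring buffer, process them, then mark them consumed.
  in_buf->transfer_data_from_source(pdMS_TO_TICKS(10));
  size_t bytes_processed = process_samples(in_buf->get_buffer_start(), in_buf->available());  // placeholder
  in_buf->decrease_buffer_length(bytes_processed);
}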

View File

@ -57,6 +57,8 @@ class CH422GGPIOPin : public GPIOPin {
void set_inverted(bool inverted) { inverted_ = inverted; }
void set_flags(gpio::Flags flags);
gpio::Flags get_flags() const override { return this->flags_; }
protected:
CH422GComponent *parent_{};
uint8_t pin_{};

View File

@ -13,6 +13,7 @@ class ESP32InternalGPIOPin : public InternalGPIOPin {
void set_inverted(bool inverted) { inverted_ = inverted; }
void set_drive_strength(gpio_drive_cap_t drive_strength) { drive_strength_ = drive_strength; }
void set_flags(gpio::Flags flags) { flags_ = flags; }
void setup() override;
void pin_mode(gpio::Flags flags) override;
bool digital_read() override;
@ -21,6 +22,7 @@ class ESP32InternalGPIOPin : public InternalGPIOPin {
void detach_interrupt() const override;
ISRInternalGPIOPin to_isr() const override;
uint8_t get_pin() const override { return (uint8_t) pin_; }
gpio::Flags get_flags() const override { return flags_; }
bool is_inverted() const override { return inverted_; }
protected:

View File

@ -22,6 +22,7 @@ class ESP8266GPIOPin : public InternalGPIOPin {
void detach_interrupt() const override;
ISRInternalGPIOPin to_isr() const override;
uint8_t get_pin() const override { return pin_; }
gpio::Flags get_flags() const override { return flags_; }
bool is_inverted() const override { return inverted_; }
protected:

View File

@ -21,6 +21,7 @@ class HostGPIOPin : public InternalGPIOPin {
void detach_interrupt() const override;
ISRInternalGPIOPin to_isr() const override;
uint8_t get_pin() const override { return pin_; }
gpio::Flags get_flags() const override { return flags_; }
bool is_inverted() const override { return inverted_; }
protected:

View File

@ -39,6 +39,10 @@ void IDFI2CBus::setup() {
conf.scl_io_num = scl_pin_;
conf.scl_pullup_en = scl_pullup_enabled_;
conf.master.clk_speed = frequency_;
#ifdef USE_ESP32_VARIANT_ESP32S2
// workaround for https://github.com/esphome/issues/issues/6718
conf.clk_flags = I2C_SCLK_SRC_FLAG_AWARE_DFS;
#endif
esp_err_t err = i2c_param_config(port_, &conf);
if (err != ESP_OK) {
ESP_LOGW(TAG, "i2c_param_config failed: %s", esp_err_to_name(err));

View File

@ -1,13 +1,25 @@
from esphome import pins
import esphome.codegen as cg
from esphome.components import esp32, speaker
from esphome.components import audio, esp32, speaker
import esphome.config_validation as cv
from esphome.const import CONF_CHANNEL, CONF_ID, CONF_MODE, CONF_TIMEOUT
from esphome.const import (
CONF_BITS_PER_SAMPLE,
CONF_BUFFER_DURATION,
CONF_CHANNEL,
CONF_ID,
CONF_MODE,
CONF_NEVER,
CONF_NUM_CHANNELS,
CONF_SAMPLE_RATE,
CONF_TIMEOUT,
)
from .. import (
CONF_I2S_DOUT_PIN,
CONF_I2S_MODE,
CONF_LEFT,
CONF_MONO,
CONF_PRIMARY,
CONF_RIGHT,
CONF_STEREO,
I2SAudioOut,
@ -24,10 +36,8 @@ I2SAudioSpeaker = i2s_audio_ns.class_(
"I2SAudioSpeaker", cg.Component, speaker.Speaker, I2SAudioOut
)
CONF_BUFFER_DURATION = "buffer_duration"
CONF_DAC_TYPE = "dac_type"
CONF_I2S_COMM_FMT = "i2s_comm_fmt"
CONF_NEVER = "never"
i2s_dac_mode_t = cg.global_ns.enum("i2s_dac_mode_t")
INTERNAL_DAC_OPTIONS = {
@ -53,7 +63,41 @@ I2C_COMM_FMT_OPTIONS = {
NO_INTERNAL_DAC_VARIANTS = [esp32.const.VARIANT_ESP32S2]
def validate_esp32_variant(config):
def _set_num_channels_from_config(config):
if config[CONF_CHANNEL] in (CONF_MONO, CONF_LEFT, CONF_RIGHT):
config[CONF_NUM_CHANNELS] = 1
else:
config[CONF_NUM_CHANNELS] = 2
return config
def _set_stream_limits(config):
if config[CONF_I2S_MODE] == CONF_PRIMARY:
# Primary mode has modifiable stream settings
audio.set_stream_limits(
min_bits_per_sample=8,
max_bits_per_sample=32,
min_channels=1,
max_channels=2,
min_sample_rate=16000,
max_sample_rate=48000,
)(config)
else:
# Secondary mode has unmodifiable max bits per sample and min/max sample rates
audio.set_stream_limits(
min_bits_per_sample=8,
max_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
min_channels=1,
max_channels=2,
min_sample_rate=config.get(CONF_SAMPLE_RATE),
max_sample_rate=config.get(CONF_SAMPLE_RATE),
)
return config
def _validate_esp32_variant(config):
if config[CONF_DAC_TYPE] != "internal":
return config
variant = esp32.get_esp32_variant()
@ -85,6 +129,7 @@ BASE_SCHEMA = (
.extend(cv.COMPONENT_SCHEMA)
)
CONFIG_SCHEMA = cv.All(
cv.typed_schema(
{
@ -106,7 +151,9 @@ CONFIG_SCHEMA = cv.All(
},
key=CONF_DAC_TYPE,
),
validate_esp32_variant,
_validate_esp32_variant,
_set_num_channels_from_config,
_set_stream_limits,
)

View File

@ -148,9 +148,11 @@ void I2SAudioSpeaker::loop() {
this->status_set_error("Failed to adjust I2S bus to match the incoming audio");
ESP_LOGE(TAG,
"Incompatible audio format: sample rate = %" PRIu32 ", channels = %" PRIu8 ", bits per sample = %" PRIu8,
this->audio_stream_info_.sample_rate, this->audio_stream_info_.channels,
this->audio_stream_info_.bits_per_sample);
this->audio_stream_info_.get_sample_rate(), this->audio_stream_info_.get_channels(),
this->audio_stream_info_.get_bits_per_sample());
}
xEventGroupClearBits(this->event_group_, ALL_ERR_ESP_BITS);
}
void I2SAudioSpeaker::set_volume(float volume) {
@ -201,6 +203,12 @@ size_t I2SAudioSpeaker::play(const uint8_t *data, size_t length, TickType_t tick
this->start();
}
if ((this->state_ != speaker::STATE_RUNNING) || (this->audio_ring_buffer_.use_count() != 1)) {
// Unable to write data to a running speaker, so delay the max amount of time so it can get ready
vTaskDelay(ticks_to_wait);
ticks_to_wait = 0;
}
size_t bytes_written = 0;
if ((this->state_ == speaker::STATE_RUNNING) && (this->audio_ring_buffer_.use_count() == 1)) {
// Only one owner of the ring buffer (the speaker task), so the ring buffer is allocated and no other components are
@ -223,6 +231,8 @@ bool I2SAudioSpeaker::has_buffered_data() const {
void I2SAudioSpeaker::speaker_task(void *params) {
I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) params;
this_speaker->task_created_ = true;
uint32_t event_group_bits =
xEventGroupWaitBits(this_speaker->event_group_,
SpeakerEventGroupBits::COMMAND_START | SpeakerEventGroupBits::COMMAND_STOP |
@ -240,19 +250,20 @@ void I2SAudioSpeaker::speaker_task(void *params) {
audio::AudioStreamInfo audio_stream_info = this_speaker->audio_stream_info_;
const uint32_t bytes_per_ms =
audio_stream_info.channels * audio_stream_info.get_bytes_per_sample() * audio_stream_info.sample_rate / 1000;
const uint32_t dma_buffers_duration_ms = DMA_BUFFER_DURATION_MS * DMA_BUFFERS_COUNT;
// Ensure ring buffer duration is at least the duration of all DMA buffers
const uint32_t ring_buffer_duration = std::max(dma_buffers_duration_ms, this_speaker->buffer_duration_ms_);
const size_t dma_buffers_size = DMA_BUFFERS_COUNT * DMA_BUFFER_DURATION_MS * bytes_per_ms;
// The DMA buffers may have more bits per sample, so calculate buffer sizes based on the input audio stream info
const size_t data_buffer_size = audio_stream_info.ms_to_bytes(dma_buffers_duration_ms);
const size_t ring_buffer_size = audio_stream_info.ms_to_bytes(ring_buffer_duration);
// Ensure ring buffer is at least as large as the total size of the DMA buffers
const size_t ring_buffer_size =
std::max((uint32_t) dma_buffers_size, this_speaker->buffer_duration_ms_ * bytes_per_ms);
const size_t single_dma_buffer_input_size = data_buffer_size / DMA_BUFFERS_COUNT;
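// Illustrative sizing, assuming DMA_BUFFER_DURATION_MS == 15 and DMA_BUFFERS_COUNT == 4
// (those constants are defined elsewhere in this file and may differ) with 16-bit stereo
// audio at 48 kHz, i.e. 192 bytes per millisecond:
//   dma_buffers_duration_ms      = 15 * 4    = 60 ms
//   data_buffer_size             = 60 * 192  = 11520 bytes
//   single_dma_buffer_input_size = 11520 / 4 = 2880 bytes
//   ring_buffer_size             = max(60 ms, buffer_duration_ms_) * 192, e.g. 500 ms -> 96000 bytes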
if (this_speaker->send_esp_err_to_event_group_(this_speaker->allocate_buffers_(dma_buffers_size, ring_buffer_size))) {
if (this_speaker->send_esp_err_to_event_group_(this_speaker->allocate_buffers_(data_buffer_size, ring_buffer_size))) {
// Failed to allocate buffers
xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
this_speaker->delete_task_(dma_buffers_size);
this_speaker->delete_task_(data_buffer_size);
}
if (!this_speaker->send_esp_err_to_event_group_(this_speaker->start_i2s_driver_(audio_stream_info))) {
@ -262,20 +273,25 @@ void I2SAudioSpeaker::speaker_task(void *params) {
uint32_t last_data_received_time = millis();
bool tx_dma_underflow = false;
while (!this_speaker->timeout_.has_value() ||
this_speaker->accumulated_frames_written_ = 0;
// Keep looping if paused, there is no timeout configured, or data was received more recently than the configured
// timeout
while (this_speaker->pause_state_ || !this_speaker->timeout_.has_value() ||
(millis() - last_data_received_time) <= this_speaker->timeout_.value()) {
event_group_bits = xEventGroupGetBits(this_speaker->event_group_);
if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
xEventGroupClearBits(this_speaker->event_group_, SpeakerEventGroupBits::COMMAND_STOP);
break;
}
if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY) {
xEventGroupClearBits(this_speaker->event_group_, SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY);
stop_gracefully = true;
}
if (this_speaker->audio_stream_info_ != audio_stream_info) {
// Audio stream info has changed, stop the speaker task so it will restart with the proper settings.
// Audio stream info changed, stop the speaker task so it will restart with the proper settings.
break;
}
@ -286,33 +302,64 @@ void I2SAudioSpeaker::speaker_task(void *params) {
}
}
size_t bytes_to_read = dma_buffers_size;
size_t bytes_read = this_speaker->audio_ring_buffer_->read((void *) this_speaker->data_buffer_, bytes_to_read,
if (this_speaker->pause_state_) {
// Pause state is accessed atomically, so thread safe
// Delay so the task can yield, then skip transferring audio data
delay(TASK_DELAY_MS);
continue;
}
size_t bytes_read = this_speaker->audio_ring_buffer_->read((void *) this_speaker->data_buffer_, data_buffer_size,
pdMS_TO_TICKS(TASK_DELAY_MS));
if (bytes_read > 0) {
size_t bytes_written = 0;
if ((audio_stream_info.bits_per_sample == 16) && (this_speaker->q15_volume_factor_ < INT16_MAX)) {
if ((audio_stream_info.get_bits_per_sample() == 16) && (this_speaker->q15_volume_factor_ < INT16_MAX)) {
// Scale samples by the volume factor in place
q15_multiplication((int16_t *) this_speaker->data_buffer_, (int16_t *) this_speaker->data_buffer_,
bytes_read / sizeof(int16_t), this_speaker->q15_volume_factor_);
}
if (audio_stream_info.bits_per_sample == (uint8_t) this_speaker->bits_per_sample_) {
i2s_write(this_speaker->parent_->get_port(), this_speaker->data_buffer_, bytes_read, &bytes_written,
portMAX_DELAY);
} else if (audio_stream_info.bits_per_sample < (uint8_t) this_speaker->bits_per_sample_) {
i2s_write_expand(this_speaker->parent_->get_port(), this_speaker->data_buffer_, bytes_read,
audio_stream_info.bits_per_sample, this_speaker->bits_per_sample_, &bytes_written,
portMAX_DELAY);
}
// Write the audio data to a single DMA buffer at a time to reduce latency for the audio duration played
// callback.
const uint32_t batches = (bytes_read + single_dma_buffer_input_size - 1) / single_dma_buffer_input_size;
if (bytes_written != bytes_read) {
xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE);
for (uint32_t i = 0; i < batches; ++i) {
size_t bytes_written = 0;
size_t bytes_to_write = std::min(single_dma_buffer_input_size, bytes_read);
if (audio_stream_info.get_bits_per_sample() == (uint8_t) this_speaker->bits_per_sample_) {
i2s_write(this_speaker->parent_->get_port(), this_speaker->data_buffer_ + i * single_dma_buffer_input_size,
bytes_to_write, &bytes_written, pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * 5));
} else if (audio_stream_info.get_bits_per_sample() < (uint8_t) this_speaker->bits_per_sample_) {
i2s_write_expand(this_speaker->parent_->get_port(),
this_speaker->data_buffer_ + i * single_dma_buffer_input_size, bytes_to_write,
audio_stream_info.get_bits_per_sample(), this_speaker->bits_per_sample_, &bytes_written,
pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * 5));
}
uint32_t write_timestamp = micros();
if (bytes_written != bytes_to_write) {
xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE);
}
bytes_read -= bytes_written;
this_speaker->accumulated_frames_written_ += audio_stream_info.bytes_to_frames(bytes_written);
const uint32_t new_playback_ms =
audio_stream_info.frames_to_milliseconds_with_remainder(&this_speaker->accumulated_frames_written_);
const uint32_t remainder_us =
audio_stream_info.frames_to_microseconds(this_speaker->accumulated_frames_written_);
uint32_t pending_frames =
audio_stream_info.bytes_to_frames(bytes_read + this_speaker->audio_ring_buffer_->available());
const uint32_t pending_ms = audio_stream_info.frames_to_milliseconds_with_remainder(&pending_frames);
this_speaker->audio_output_callback_(new_playback_ms, remainder_us, pending_ms, write_timestamp);
tx_dma_underflow = false;
last_data_received_time = millis();
}
tx_dma_underflow = false;
last_data_received_time = millis();
} else {
// No data received
if (stop_gracefully && tx_dma_underflow) {
@ -328,7 +375,7 @@ void I2SAudioSpeaker::speaker_task(void *params) {
this_speaker->parent_->unlock();
}
this_speaker->delete_task_(dma_buffers_size);
this_speaker->delete_task_(data_buffer_size);
}
void I2SAudioSpeaker::start() {
@ -337,16 +384,15 @@ void I2SAudioSpeaker::start() {
if ((this->state_ == speaker::STATE_STARTING) || (this->state_ == speaker::STATE_RUNNING))
return;
if (this->speaker_task_handle_ == nullptr) {
if (!this->task_created_ && (this->speaker_task_handle_ == nullptr)) {
xTaskCreate(I2SAudioSpeaker::speaker_task, "speaker_task", TASK_STACK_SIZE, (void *) this, TASK_PRIORITY,
&this->speaker_task_handle_);
}
if (this->speaker_task_handle_ != nullptr) {
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);
this->task_created_ = true;
} else {
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);
if (this->speaker_task_handle_ != nullptr) {
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);
} else {
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);
}
}
}
@ -416,12 +462,12 @@ esp_err_t I2SAudioSpeaker::allocate_buffers_(size_t data_buffer_size, size_t rin
}
esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_stream_info) {
if ((this->i2s_mode_ & I2S_MODE_SLAVE) && (this->sample_rate_ != audio_stream_info.sample_rate)) { // NOLINT
// Can't reconfigure I2S bus, so the sample rate must match the configured value
if ((this->i2s_mode_ & I2S_MODE_SLAVE) && (this->sample_rate_ != audio_stream_info.get_sample_rate())) { // NOLINT
// Can't reconfigure I2S bus, so the sample rate must match the configured value
return ESP_ERR_NOT_SUPPORTED;
}
if ((i2s_bits_per_sample_t) audio_stream_info.bits_per_sample > this->bits_per_sample_) {
if ((i2s_bits_per_sample_t) audio_stream_info.get_bits_per_sample() > this->bits_per_sample_) {
// Currently can't handle the case when the incoming audio has more bits per sample than the configured value
return ESP_ERR_NOT_SUPPORTED;
}
@ -432,21 +478,21 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
i2s_channel_fmt_t channel = this->channel_;
if (audio_stream_info.channels == 1) {
if (audio_stream_info.get_channels() == 1) {
if (this->channel_ == I2S_CHANNEL_FMT_ONLY_LEFT) {
channel = I2S_CHANNEL_FMT_ONLY_LEFT;
} else {
channel = I2S_CHANNEL_FMT_ONLY_RIGHT;
}
} else if (audio_stream_info.channels == 2) {
} else if (audio_stream_info.get_channels() == 2) {
channel = I2S_CHANNEL_FMT_RIGHT_LEFT;
}
int dma_buffer_length = DMA_BUFFER_DURATION_MS * this->sample_rate_ / 1000;
int dma_buffer_length = audio_stream_info.ms_to_frames(DMA_BUFFER_DURATION_MS);
i2s_driver_config_t config = {
.mode = (i2s_mode_t) (this->i2s_mode_ | I2S_MODE_TX),
.sample_rate = audio_stream_info.sample_rate,
.sample_rate = audio_stream_info.get_sample_rate(),
.bits_per_sample = this->bits_per_sample_,
.channel_format = channel,
.communication_format = this->i2s_comm_fmt_,
@ -504,7 +550,7 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
}
void I2SAudioSpeaker::delete_task_(size_t buffer_size) {
this->audio_ring_buffer_.reset(); // Releases onwership of the shared_ptr
this->audio_ring_buffer_.reset(); // Releases ownership of the shared_ptr
if (this->data_buffer_ != nullptr) {
ExternalRAMAllocator<uint8_t> allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);

View File

@ -40,6 +40,9 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
void stop() override;
void finish() override;
void set_pause_state(bool pause_state) override { this->pause_state_ = pause_state; }
bool get_pause_state() const override { return this->pause_state_; }
/// @brief Plays the provided audio data.
/// Starts the speaker task, if necessary. Writes the audio data to the ring buffer.
/// @param data Audio data in the format set by the parent speaker classes ``set_audio_stream_info`` method.
@ -121,13 +124,18 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
uint8_t dout_pin_;
bool task_created_{false};
bool pause_state_{false};
int16_t q15_volume_factor_{INT16_MAX};
size_t bytes_written_{0};
#if SOC_I2S_SUPPORTS_DAC
i2s_dac_mode_t internal_dac_mode_{I2S_DAC_CHANNEL_DISABLE};
#endif
i2s_comm_format_t i2s_comm_fmt_;
uint32_t accumulated_frames_written_{0};
};
} // namespace i2s_audio

View File

@ -20,6 +20,7 @@ class ArduinoInternalGPIOPin : public InternalGPIOPin {
void detach_interrupt() const override;
ISRInternalGPIOPin to_isr() const override;
uint8_t get_pin() const override { return pin_; }
gpio::Flags get_flags() const override { return flags_; }
bool is_inverted() const override { return inverted_; }
protected:

View File

@ -61,7 +61,14 @@ from .types import (
lv_style_t,
lvgl_ns,
)
from .widgets import Widget, add_widgets, get_scr_act, set_obj_properties, styles_used
from .widgets import (
LvScrActType,
Widget,
add_widgets,
get_scr_act,
set_obj_properties,
styles_used,
)
from .widgets.animimg import animimg_spec
from .widgets.arc import arc_spec
from .widgets.button import button_spec
@ -318,7 +325,7 @@ async def to_code(configs):
config[df.CONF_RESUME_ON_INPUT],
)
await cg.register_component(lv_component, config)
Widget.create(config[CONF_ID], lv_component, obj_spec, config)
Widget.create(config[CONF_ID], lv_component, LvScrActType(), config)
lv_scr_act = get_scr_act(lv_component)
async with LvContext():
@ -391,7 +398,7 @@ FINAL_VALIDATE_SCHEMA = final_validation
LVGL_SCHEMA = (
cv.polling_component_schema("1s")
.extend(obj_schema(obj_spec))
.extend(obj_schema(LvScrActType()))
.extend(
{
cv.GenerateID(CONF_ID): cv.declare_id(LvglComponent),

View File

@ -146,6 +146,8 @@ TYPE_FLEX = "flex"
TYPE_GRID = "grid"
TYPE_NONE = "none"
DIRECTIONS = LvConstant("LV_DIR_", "LEFT", "RIGHT", "BOTTOM", "TOP")
LV_FONTS = list(f"montserrat_{s}" for s in range(8, 50, 2)) + [
"dejavu_16_persian_hebrew",
"simsun_16_cjk",
@ -169,9 +171,13 @@ LV_EVENT_MAP = {
"CANCEL": "CANCEL",
"ALL_EVENTS": "ALL",
"CHANGE": "VALUE_CHANGED",
"GESTURE": "GESTURE",
}
LV_EVENT_TRIGGERS = tuple(f"on_{x.lower()}" for x in LV_EVENT_MAP)
SWIPE_TRIGGERS = tuple(
f"on_swipe_{x.lower()}" for x in DIRECTIONS.choices + ("up", "down")
)
LV_ANIM = LvConstant(
@ -250,7 +256,6 @@ KEYBOARD_MODES = LvConstant(
"NUMBER",
)
ROLLER_MODES = LvConstant("LV_ROLLER_MODE_", "NORMAL", "INFINITE")
DIRECTIONS = LvConstant("LV_DIR_", "LEFT", "RIGHT", "BOTTOM", "TOP")
TILE_DIRECTIONS = DIRECTIONS.extend("HOR", "VER", "ALL")
CHILD_ALIGNMENTS = LvConstant(
"LV_ALIGN_",

View File

@ -211,10 +211,9 @@ def part_schema(parts):
def automation_schema(typ: LvType):
events = df.LV_EVENT_TRIGGERS + df.SWIPE_TRIGGERS
if typ.has_on_value:
events = df.LV_EVENT_TRIGGERS + (CONF_ON_VALUE,)
else:
events = df.LV_EVENT_TRIGGERS
events = events + (CONF_ON_VALUE,)
args = typ.get_arg_type() if isinstance(typ, LvType) else []
args.append(lv_event_t_ptr)
return {

View File

@ -7,8 +7,10 @@ from .defines import (
CONF_ALIGN_TO,
CONF_X,
CONF_Y,
DIRECTIONS,
LV_EVENT_MAP,
LV_EVENT_TRIGGERS,
SWIPE_TRIGGERS,
literal,
)
from .lvcode import (
@ -23,7 +25,7 @@ from .lvcode import (
lvgl_static,
)
from .types import LV_EVENT
from .widgets import widget_map
from .widgets import LvScrActType, get_scr_act, widget_map
async def generate_triggers():
@ -33,6 +35,9 @@ async def generate_triggers():
"""
for w in widget_map.values():
if isinstance(w.type, LvScrActType):
w = get_scr_act(w.var)
if w.config:
for event, conf in {
event: conf
@ -43,6 +48,24 @@ async def generate_triggers():
w.add_flag("LV_OBJ_FLAG_CLICKABLE")
event = literal("LV_EVENT_" + LV_EVENT_MAP[event[3:].upper()])
await add_trigger(conf, w, event)
for event, conf in {
event: conf
for event, conf in w.config.items()
if event in SWIPE_TRIGGERS
}.items():
conf = conf[0]
dir = event[9:].upper()
dir = {"UP": "TOP", "DOWN": "BOTTOM"}.get(dir, dir)
dir = DIRECTIONS.mapper(dir)
w.clear_flag("LV_OBJ_FLAG_SCROLLABLE")
selected = literal(
f"lv_indev_get_gesture_dir(lv_indev_get_act()) == {dir}"
)
await add_trigger(
conf, w, literal("LV_EVENT_GESTURE"), is_selected=selected
)
for conf in w.config.get(CONF_ON_VALUE, ()):
await add_trigger(
conf,
@ -61,13 +84,14 @@ async def generate_triggers():
lv.obj_align_to(w.obj, target, align, x, y)
async def add_trigger(conf, w, *events):
async def add_trigger(conf, w, *events, is_selected=None):
is_selected = is_selected or w.is_selected()
tid = conf[CONF_TRIGGER_ID]
trigger = cg.new_Pvariable(tid)
args = w.get_args() + [(lv_event_t_ptr, "event")]
value = w.get_values()
await automation.build_automation(trigger, args, conf)
async with LambdaContext(EVENT_ARG, where=tid) as context:
with LvConditional(w.is_selected()):
with LvConditional(is_selected):
lv_add(trigger.trigger(*value, literal("event")))
lv_add(lvgl_static.add_event_cb(w.obj, await context.get_lambda(), *events))

View File

@ -83,6 +83,8 @@ class MAX6956GPIOPin : public GPIOPin {
void set_inverted(bool inverted) { inverted_ = inverted; }
void set_flags(gpio::Flags flags) { flags_ = flags; }
gpio::Flags get_flags() const override { return this->flags_; }
protected:
MAX6956 *parent_;
uint8_t pin_;

View File

@ -61,6 +61,8 @@ class MCP23016GPIOPin : public GPIOPin {
void set_inverted(bool inverted) { inverted_ = inverted; }
void set_flags(gpio::Flags flags) { flags_ = flags; }
gpio::Flags get_flags() const override { return this->flags_; }
protected:
MCP23016 *parent_;
uint8_t pin_;

View File

@ -43,6 +43,8 @@ class MCP23XXXGPIOPin : public GPIOPin {
void set_flags(gpio::Flags flags) { flags_ = flags; }
void set_interrupt_mode(MCP23XXXInterruptMode interrupt_mode) { interrupt_mode_ = interrupt_mode; }
gpio::Flags get_flags() const override { return this->flags_; }
protected:
MCP23XXXBase *parent_;
uint8_t pin_;

View File

View File

@ -0,0 +1,172 @@
from esphome import automation
import esphome.codegen as cg
from esphome.components import audio, esp32, speaker
import esphome.config_validation as cv
from esphome.const import (
CONF_BITS_PER_SAMPLE,
CONF_BUFFER_DURATION,
CONF_DURATION,
CONF_ID,
CONF_NEVER,
CONF_NUM_CHANNELS,
CONF_OUTPUT_SPEAKER,
CONF_SAMPLE_RATE,
CONF_TASK_STACK_IN_PSRAM,
CONF_TIMEOUT,
PLATFORM_ESP32,
)
from esphome.core.entity_helpers import inherit_property_from
import esphome.final_validate as fv
AUTO_LOAD = ["audio"]
CODEOWNERS = ["@kahrendt"]
mixer_speaker_ns = cg.esphome_ns.namespace("mixer_speaker")
MixerSpeaker = mixer_speaker_ns.class_("MixerSpeaker", cg.Component)
SourceSpeaker = mixer_speaker_ns.class_("SourceSpeaker", cg.Component, speaker.Speaker)
CONF_DECIBEL_REDUCTION = "decibel_reduction"
CONF_QUEUE_MODE = "queue_mode"
CONF_SOURCE_SPEAKERS = "source_speakers"
DuckingApplyAction = mixer_speaker_ns.class_(
"DuckingApplyAction", automation.Action, cg.Parented.template(SourceSpeaker)
)
SOURCE_SPEAKER_SCHEMA = speaker.SPEAKER_SCHEMA.extend(
{
cv.GenerateID(): cv.declare_id(SourceSpeaker),
cv.Optional(
CONF_BUFFER_DURATION, default="100ms"
): cv.positive_time_period_milliseconds,
cv.Optional(CONF_TIMEOUT, default="500ms"): cv.Any(
cv.positive_time_period_milliseconds,
cv.one_of(CONF_NEVER, lower=True),
),
cv.Optional(CONF_BITS_PER_SAMPLE, default=16): cv.int_range(16, 16),
}
)
def _set_stream_limits(config):
audio.set_stream_limits(
min_bits_per_sample=16,
max_bits_per_sample=16,
)(config)
return config
def _validate_source_speaker(config):
fconf = fv.full_config.get()
# Get ID for the output speaker and add it to the source speakers config to easily inherit properties
path = fconf.get_path_for_id(config[CONF_ID])[:-3]
path.append(CONF_OUTPUT_SPEAKER)
output_speaker_id = fconf.get_config_for_path(path)
config[CONF_OUTPUT_SPEAKER] = output_speaker_id
inherit_property_from(CONF_NUM_CHANNELS, CONF_OUTPUT_SPEAKER)(config)
inherit_property_from(CONF_SAMPLE_RATE, CONF_OUTPUT_SPEAKER)(config)
audio.final_validate_audio_schema(
"mixer",
audio_device=CONF_OUTPUT_SPEAKER,
bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
channels=config.get(CONF_NUM_CHANNELS),
sample_rate=config.get(CONF_SAMPLE_RATE),
)(config)
return config
CONFIG_SCHEMA = cv.All(
cv.Schema(
{
cv.GenerateID(): cv.declare_id(MixerSpeaker),
cv.Required(CONF_OUTPUT_SPEAKER): cv.use_id(speaker.Speaker),
cv.Required(CONF_SOURCE_SPEAKERS): cv.All(
cv.ensure_list(SOURCE_SPEAKER_SCHEMA),
cv.Length(min=2, max=8),
[_set_stream_limits],
),
cv.Optional(CONF_NUM_CHANNELS): cv.int_range(min=1, max=2),
cv.Optional(CONF_QUEUE_MODE, default=False): cv.boolean,
cv.SplitDefault(CONF_TASK_STACK_IN_PSRAM, esp32_idf=False): cv.All(
cv.boolean, cv.only_with_esp_idf
),
}
),
cv.only_on([PLATFORM_ESP32]),
)
FINAL_VALIDATE_SCHEMA = cv.All(
cv.Schema(
{
cv.Optional(CONF_SOURCE_SPEAKERS): [_validate_source_speaker],
},
extra=cv.ALLOW_EXTRA,
),
inherit_property_from(CONF_NUM_CHANNELS, CONF_OUTPUT_SPEAKER),
)
async def to_code(config):
var = cg.new_Pvariable(config[CONF_ID])
await cg.register_component(var, config)
spkr = await cg.get_variable(config[CONF_OUTPUT_SPEAKER])
cg.add(var.set_output_channels(config[CONF_NUM_CHANNELS]))
cg.add(var.set_output_speaker(spkr))
cg.add(var.set_queue_mode(config[CONF_QUEUE_MODE]))
if task_stack_in_psram := config.get(CONF_TASK_STACK_IN_PSRAM):
cg.add(var.set_task_stack_in_psram(task_stack_in_psram))
if task_stack_in_psram:
esp32.add_idf_sdkconfig_option(
"CONFIG_SPIRAM_ALLOW_STACK_EXTERNAL_MEMORY", True
)
for speaker_config in config[CONF_SOURCE_SPEAKERS]:
source_speaker = cg.new_Pvariable(speaker_config[CONF_ID])
cg.add(source_speaker.set_buffer_duration(speaker_config[CONF_BUFFER_DURATION]))
if speaker_config[CONF_TIMEOUT] != CONF_NEVER:
cg.add(source_speaker.set_timeout(speaker_config[CONF_TIMEOUT]))
await cg.register_component(source_speaker, speaker_config)
await cg.register_parented(source_speaker, config[CONF_ID])
await speaker.register_speaker(source_speaker, speaker_config)
cg.add(var.add_source_speaker(source_speaker))
@automation.register_action(
"mixer_speaker.apply_ducking",
DuckingApplyAction,
cv.Schema(
{
cv.GenerateID(): cv.use_id(SourceSpeaker),
cv.Required(CONF_DECIBEL_REDUCTION): cv.templatable(
cv.int_range(min=0, max=51)
),
cv.Optional(CONF_DURATION, default="0.0s"): cv.templatable(
cv.positive_time_period_milliseconds
),
}
),
)
async def ducking_set_to_code(config, action_id, template_arg, args):
var = cg.new_Pvariable(action_id, template_arg)
await cg.register_parented(var, config[CONF_ID])
decibel_reduction = await cg.templatable(
config[CONF_DECIBEL_REDUCTION], args, cg.uint8
)
cg.add(var.set_decibel_reduction(decibel_reduction))
duration = await cg.templatable(config[CONF_DURATION], args, cg.uint32)
cg.add(var.set_duration(duration))
return var

View File

@ -0,0 +1,19 @@
#pragma once
#include "mixer_speaker.h"
#ifdef USE_ESP32
namespace esphome {
namespace mixer_speaker {
template<typename... Ts> class DuckingApplyAction : public Action<Ts...>, public Parented<SourceSpeaker> {
TEMPLATABLE_VALUE(uint8_t, decibel_reduction)
TEMPLATABLE_VALUE(uint32_t, duration)
void play(Ts... x) override {
this->parent_->apply_ducking(this->decibel_reduction_.value(x...), this->duration_.value(x...));
}
};
} // namespace mixer_speaker
} // namespace esphome
#endif
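
The generated action ultimately reduces to a single call on the parented SourceSpeaker; an equivalent direct call on a SourceSpeaker pointer (here source_speaker, values purely illustrative) would be:

// Duck this source by 20 dB, ramping the change over 1500 ms.
source_speaker->apply_ducking(/*decibel_reduction=*/20, /*duration=*/1500);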

View File

@ -0,0 +1,624 @@
#include "mixer_speaker.h"
#ifdef USE_ESP32
#include "esphome/core/hal.h"
#include "esphome/core/helpers.h"
#include "esphome/core/log.h"
#include <algorithm>
#include <cstring>
namespace esphome {
namespace mixer_speaker {
static const UBaseType_t MIXER_TASK_PRIORITY = 10;
static const uint32_t TRANSFER_BUFFER_DURATION_MS = 50;
static const uint32_t TASK_DELAY_MS = 25;
static const size_t TASK_STACK_SIZE = 4096;
static const int16_t MAX_AUDIO_SAMPLE_VALUE = INT16_MAX;
static const int16_t MIN_AUDIO_SAMPLE_VALUE = INT16_MIN;
static const char *const TAG = "speaker_mixer";
// Gives the Q15 fixed point scaling factor to reduce by 0 dB, 1dB, ..., 50 dB
// dB to PCM scaling factor formula: floating_point_scale_factor = 2^(-db/6.014)
// float to Q15 fixed point formula: q15_scale_factor = floating_point_scale_factor * 2^(15)
static const std::vector<int16_t> DECIBEL_REDUCTION_TABLE = {
32767, 29201, 26022, 23189, 20665, 18415, 16410, 14624, 13032, 11613, 10349, 9222, 8218, 7324, 6527, 5816, 5183,
4619, 4116, 3668, 3269, 2913, 2596, 2313, 2061, 1837, 1637, 1459, 1300, 1158, 1032, 920, 820, 731,
651, 580, 517, 461, 411, 366, 326, 291, 259, 231, 206, 183, 163, 146, 130, 116, 103};
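// Spot-checking the formula above (illustrative): 1 dB -> 2^(-1/6.014) = 0.8911, * 2^15 = 29201
// (table index 1); 6 dB -> 2^(-6/6.014) = 0.5008, * 2^15 = 16410 (table index 6). Index 0 is
// capped at 32767, the maximum Q15 value, rather than 2^15.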
enum MixerEventGroupBits : uint32_t {
COMMAND_STOP = (1 << 0), // stops the mixer task
STATE_STARTING = (1 << 10),
STATE_RUNNING = (1 << 11),
STATE_STOPPING = (1 << 12),
STATE_STOPPED = (1 << 13),
ERR_ESP_NO_MEM = (1 << 19),
ALL_BITS = 0x00FFFFFF, // All valid FreeRTOS event group bits
};
void SourceSpeaker::dump_config() {
ESP_LOGCONFIG(TAG, "Mixer Source Speaker");
ESP_LOGCONFIG(TAG, " Buffer Duration: %" PRIu32 " ms", this->buffer_duration_ms_);
if (this->timeout_ms_.has_value()) {
ESP_LOGCONFIG(TAG, " Timeout: %" PRIu32 " ms", this->timeout_ms_.value());
} else {
ESP_LOGCONFIG(TAG, " Timeout: never");
}
}
void SourceSpeaker::setup() {
this->parent_->get_output_speaker()->add_audio_output_callback(
[this](uint32_t new_playback_ms, uint32_t remainder_us, uint32_t pending_ms, uint32_t write_timestamp) {
uint32_t personal_playback_ms = std::min(new_playback_ms, this->pending_playback_ms_);
if (personal_playback_ms > 0) {
this->pending_playback_ms_ -= personal_playback_ms;
this->audio_output_callback_(personal_playback_ms, remainder_us, this->pending_playback_ms_, write_timestamp);
}
});
}
void SourceSpeaker::loop() {
switch (this->state_) {
case speaker::STATE_STARTING: {
esp_err_t err = this->start_();
if (err == ESP_OK) {
this->state_ = speaker::STATE_RUNNING;
this->stop_gracefully_ = false;
this->last_seen_data_ms_ = millis();
this->status_clear_error();
} else {
switch (err) {
case ESP_ERR_NO_MEM:
this->status_set_error("Failed to start mixer: not enough memory");
break;
case ESP_ERR_NOT_SUPPORTED:
this->status_set_error("Failed to start mixer: unsupported bits per sample");
break;
case ESP_ERR_INVALID_ARG:
this->status_set_error("Failed to start mixer: audio stream isn't compatible with the other audio stream.");
break;
case ESP_ERR_INVALID_STATE:
this->status_set_error("Failed to start mixer: mixer task failed to start");
break;
default:
this->status_set_error("Failed to start mixer");
break;
}
this->state_ = speaker::STATE_STOPPING;
}
break;
}
case speaker::STATE_RUNNING:
if (!this->transfer_buffer_->has_buffered_data()) {
if ((this->timeout_ms_.has_value() && ((millis() - this->last_seen_data_ms_) > this->timeout_ms_.value())) ||
this->stop_gracefully_) {
this->state_ = speaker::STATE_STOPPING;
}
}
break;
case speaker::STATE_STOPPING:
this->stop_();
this->stop_gracefully_ = false;
this->state_ = speaker::STATE_STOPPED;
break;
case speaker::STATE_STOPPED:
break;
}
}
size_t SourceSpeaker::play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) {
if (this->is_stopped()) {
this->start();
}
size_t bytes_written = 0;
if (this->ring_buffer_.use_count() == 1) {
std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_.lock();
bytes_written = temp_ring_buffer->write_without_replacement(data, length, ticks_to_wait);
if (bytes_written > 0) {
this->last_seen_data_ms_ = millis();
}
}
return bytes_written;
}
void SourceSpeaker::start() { this->state_ = speaker::STATE_STARTING; }
esp_err_t SourceSpeaker::start_() {
const size_t ring_buffer_size = this->audio_stream_info_.ms_to_bytes(this->buffer_duration_ms_);
if (this->transfer_buffer_.use_count() == 0) {
this->transfer_buffer_ =
audio::AudioSourceTransferBuffer::create(this->audio_stream_info_.ms_to_bytes(TRANSFER_BUFFER_DURATION_MS));
if (this->transfer_buffer_ == nullptr) {
return ESP_ERR_NO_MEM;
}
std::shared_ptr<RingBuffer> temp_ring_buffer;
if (!this->ring_buffer_.use_count()) {
temp_ring_buffer = RingBuffer::create(ring_buffer_size);
this->ring_buffer_ = temp_ring_buffer;
}
if (!this->ring_buffer_.use_count()) {
return ESP_ERR_NO_MEM;
} else {
this->transfer_buffer_->set_source(temp_ring_buffer);
}
}
return this->parent_->start(this->audio_stream_info_);
}
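// Ownership note (illustrative, derived from the code above): ring_buffer_ is only a weak_ptr;
// once set_source() runs, the transfer buffer holds the sole shared_ptr. play() therefore treats
// use_count() == 1 as "allocated and exclusively owned" before locking and writing, and stop_()
// frees the ring buffer simply by resetting transfer_buffer_.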
void SourceSpeaker::stop() {
if (this->state_ != speaker::STATE_STOPPED) {
this->state_ = speaker::STATE_STOPPING;
}
}
void SourceSpeaker::stop_() {
this->transfer_buffer_.reset(); // deallocates the transfer buffer
}
void SourceSpeaker::finish() { this->stop_gracefully_ = true; }
bool SourceSpeaker::has_buffered_data() const {
return ((this->transfer_buffer_.use_count() > 0) && this->transfer_buffer_->has_buffered_data());
}
void SourceSpeaker::set_mute_state(bool mute_state) {
this->mute_state_ = mute_state;
this->parent_->get_output_speaker()->set_mute_state(mute_state);
}
void SourceSpeaker::set_volume(float volume) {
this->volume_ = volume;
this->parent_->get_output_speaker()->set_volume(volume);
}
size_t SourceSpeaker::process_data_from_source(TickType_t ticks_to_wait) {
if (!this->transfer_buffer_.use_count()) {
return 0;
}
// Store current offset, as these samples are already ducked
const size_t current_length = this->transfer_buffer_->available();
size_t bytes_read = this->transfer_buffer_->transfer_data_from_source(ticks_to_wait);
uint32_t samples_to_duck = this->audio_stream_info_.bytes_to_samples(bytes_read);
if (samples_to_duck > 0) {
int16_t *current_buffer = reinterpret_cast<int16_t *>(this->transfer_buffer_->get_buffer_start() + current_length);
duck_samples(current_buffer, samples_to_duck, &this->current_ducking_db_reduction_,
&this->ducking_transition_samples_remaining_, this->samples_per_ducking_step_,
this->db_change_per_ducking_step_);
}
return bytes_read;
}
void SourceSpeaker::apply_ducking(uint8_t decibel_reduction, uint32_t duration) {
if (this->target_ducking_db_reduction_ != decibel_reduction) {
this->current_ducking_db_reduction_ = this->target_ducking_db_reduction_;
this->target_ducking_db_reduction_ = decibel_reduction;
uint8_t total_ducking_steps = 0;
if (this->target_ducking_db_reduction_ > this->current_ducking_db_reduction_) {
// The dB reduction level is increasing (which results in quieter audio)
total_ducking_steps = this->target_ducking_db_reduction_ - this->current_ducking_db_reduction_ - 1;
this->db_change_per_ducking_step_ = 1;
} else {
// The dB reduction level is decreasing (which results in louder audio)
total_ducking_steps = this->current_ducking_db_reduction_ - this->target_ducking_db_reduction_ - 1;
this->db_change_per_ducking_step_ = -1;
}
if ((duration > 0) && (total_ducking_steps > 0)) {
this->ducking_transition_samples_remaining_ = this->audio_stream_info_.ms_to_samples(duration);
this->samples_per_ducking_step_ = this->ducking_transition_samples_remaining_ / total_ducking_steps;
this->ducking_transition_samples_remaining_ =
this->samples_per_ducking_step_ * total_ducking_steps; // Adjust for integer division rounding
this->current_ducking_db_reduction_ += this->db_change_per_ducking_step_;
} else {
this->ducking_transition_samples_remaining_ = 0;
this->current_ducking_db_reduction_ = this->target_ducking_db_reduction_;
}
}
}
void SourceSpeaker::duck_samples(int16_t *input_buffer, uint32_t input_samples_to_duck,
int8_t *current_ducking_db_reduction, uint32_t *ducking_transition_samples_remaining,
uint32_t samples_per_ducking_step, int8_t db_change_per_ducking_step) {
if (*ducking_transition_samples_remaining > 0) {
// Ducking level is still transitioning
// Takes the ceiling of input_samples_to_duck/samples_per_ducking_step
uint32_t ducking_steps_in_batch =
input_samples_to_duck / samples_per_ducking_step + (input_samples_to_duck % samples_per_ducking_step != 0);
for (uint32_t i = 0; i < ducking_steps_in_batch; ++i) {
uint32_t samples_left_in_step = *ducking_transition_samples_remaining % samples_per_ducking_step;
if (samples_left_in_step == 0) {
samples_left_in_step = samples_per_ducking_step;
}
uint32_t samples_to_duck = std::min(input_samples_to_duck, samples_left_in_step);
samples_to_duck = std::min(samples_to_duck, *ducking_transition_samples_remaining);
// Ensure we only point to valid index in the Q15 scaling factor table
uint8_t safe_db_reduction_index =
clamp<uint8_t>(*current_ducking_db_reduction, 0, DECIBEL_REDUCTION_TABLE.size() - 1);
int16_t q15_scale_factor = DECIBEL_REDUCTION_TABLE[safe_db_reduction_index];
audio::scale_audio_samples(input_buffer, input_buffer, q15_scale_factor, samples_to_duck);
if (samples_left_in_step - samples_to_duck == 0) {
// After scaling the current samples, we are ready to transition to the next step
*current_ducking_db_reduction += db_change_per_ducking_step;
}
input_buffer += samples_to_duck;
*ducking_transition_samples_remaining -= samples_to_duck;
input_samples_to_duck -= samples_to_duck;
}
}
if ((*current_ducking_db_reduction > 0) && (input_samples_to_duck > 0)) {
// Audio is ducked, but it's not in the middle of a transition step
uint8_t safe_db_reduction_index =
clamp<uint8_t>(*current_ducking_db_reduction, 0, DECIBEL_REDUCTION_TABLE.size() - 1);
int16_t q15_scale_factor = DECIBEL_REDUCTION_TABLE[safe_db_reduction_index];
audio::scale_audio_samples(input_buffer, input_buffer, q15_scale_factor, input_samples_to_duck);
}
}
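// Illustrative transition through apply_ducking()/duck_samples(), assuming
// ms_to_samples(100) == 4800 (e.g. 48 kHz mono; the exact conversion depends on the stream):
// going from 0 dB to 10 dB over 100 ms gives
//   total_ducking_steps                   = 10 - 0 - 1 = 9
//   samples_per_ducking_step_             = 4800 / 9   = 533
//   ducking_transition_samples_remaining_ = 533 * 9    = 4797 (rounded down to whole steps)
// so successive 533-sample runs are scaled at 1 dB, 2 dB, ..., 9 dB, after which the steady
// 10 dB reduction from the table above is applied to all remaining samples.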
void MixerSpeaker::dump_config() {
ESP_LOGCONFIG(TAG, "Speaker Mixer:");
ESP_LOGCONFIG(TAG, " Number of output channels: %u", this->output_channels_);
}
void MixerSpeaker::setup() {
this->event_group_ = xEventGroupCreate();
if (this->event_group_ == nullptr) {
ESP_LOGE(TAG, "Failed to create event group");
this->mark_failed();
return;
}
}
void MixerSpeaker::loop() {
uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);
if (event_group_bits & MixerEventGroupBits::STATE_STARTING) {
ESP_LOGD(TAG, "Starting speaker mixer");
xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_STARTING);
}
if (event_group_bits & MixerEventGroupBits::ERR_ESP_NO_MEM) {
this->status_set_error("Failed to allocate the mixer's internal buffer");
xEventGroupClearBits(this->event_group_, MixerEventGroupBits::ERR_ESP_NO_MEM);
}
if (event_group_bits & MixerEventGroupBits::STATE_RUNNING) {
ESP_LOGD(TAG, "Started speaker mixer");
this->status_clear_error();
xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_RUNNING);
}
if (event_group_bits & MixerEventGroupBits::STATE_STOPPING) {
ESP_LOGD(TAG, "Stopping speaker mixer");
xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_STOPPING);
}
if (event_group_bits & MixerEventGroupBits::STATE_STOPPED) {
if (this->delete_task_() == ESP_OK) {
xEventGroupClearBits(this->event_group_, MixerEventGroupBits::ALL_BITS);
}
}
if (this->task_handle_ != nullptr) {
bool all_stopped = true;
for (auto &speaker : this->source_speakers_) {
all_stopped &= speaker->is_stopped();
}
if (all_stopped) {
this->stop();
}
}
}
esp_err_t MixerSpeaker::start(audio::AudioStreamInfo &stream_info) {
if (!this->audio_stream_info_.has_value()) {
if (stream_info.get_bits_per_sample() != 16) {
// Audio streams that don't have 16 bits per sample are not supported
return ESP_ERR_NOT_SUPPORTED;
}
this->audio_stream_info_ = audio::AudioStreamInfo(stream_info.get_bits_per_sample(), this->output_channels_,
stream_info.get_sample_rate());
this->output_speaker_->set_audio_stream_info(this->audio_stream_info_.value());
} else {
if (!this->queue_mode_ && (stream_info.get_sample_rate() != this->audio_stream_info_.value().get_sample_rate())) {
// The two audio streams must have the same sample rate to mix properly if not in queue mode
return ESP_ERR_INVALID_ARG;
}
}
return this->start_task_();
}
esp_err_t MixerSpeaker::start_task_() {
if (this->task_stack_buffer_ == nullptr) {
if (this->task_stack_in_psram_) {
RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_EXTERNAL);
this->task_stack_buffer_ = stack_allocator.allocate(TASK_STACK_SIZE);
} else {
RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_INTERNAL);
this->task_stack_buffer_ = stack_allocator.allocate(TASK_STACK_SIZE);
}
}
if (this->task_stack_buffer_ == nullptr) {
return ESP_ERR_NO_MEM;
}
if (this->task_handle_ == nullptr) {
this->task_handle_ = xTaskCreateStatic(audio_mixer_task, "mixer", TASK_STACK_SIZE, (void *) this,
MIXER_TASK_PRIORITY, this->task_stack_buffer_, &this->task_stack_);
}
if (this->task_handle_ == nullptr) {
return ESP_ERR_INVALID_STATE;
}
return ESP_OK;
}
esp_err_t MixerSpeaker::delete_task_() {
if (!this->task_created_) {
this->task_handle_ = nullptr;
if (this->task_stack_buffer_ != nullptr) {
if (this->task_stack_in_psram_) {
RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_EXTERNAL);
stack_allocator.deallocate(this->task_stack_buffer_, TASK_STACK_SIZE);
} else {
RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_INTERNAL);
stack_allocator.deallocate(this->task_stack_buffer_, TASK_STACK_SIZE);
}
this->task_stack_buffer_ = nullptr;
}
return ESP_OK;
}
return ESP_ERR_INVALID_STATE;
}
void MixerSpeaker::stop() { xEventGroupSetBits(this->event_group_, MixerEventGroupBits::COMMAND_STOP); }
void MixerSpeaker::copy_frames(const int16_t *input_buffer, audio::AudioStreamInfo input_stream_info,
int16_t *output_buffer, audio::AudioStreamInfo output_stream_info,
uint32_t frames_to_transfer) {
uint8_t input_channels = input_stream_info.get_channels();
uint8_t output_channels = output_stream_info.get_channels();
const uint8_t max_input_channel_index = input_channels - 1;
if (input_channels == output_channels) {
size_t bytes_to_copy = input_stream_info.frames_to_bytes(frames_to_transfer);
memcpy(output_buffer, input_buffer, bytes_to_copy);
return;
}
for (uint32_t frame_index = 0; frame_index < frames_to_transfer; ++frame_index) {
for (uint8_t output_channel_index = 0; output_channel_index < output_channels; ++output_channel_index) {
uint8_t input_channel_index = std::min(output_channel_index, max_input_channel_index);
output_buffer[output_channels * frame_index + output_channel_index] =
input_buffer[input_channels * frame_index + input_channel_index];
}
}
}
void MixerSpeaker::mix_audio_samples(const int16_t *primary_buffer, audio::AudioStreamInfo primary_stream_info,
const int16_t *secondary_buffer, audio::AudioStreamInfo secondary_stream_info,
int16_t *output_buffer, audio::AudioStreamInfo output_stream_info,
uint32_t frames_to_mix) {
const uint8_t primary_channels = primary_stream_info.get_channels();
const uint8_t secondary_channels = secondary_stream_info.get_channels();
const uint8_t output_channels = output_stream_info.get_channels();
const uint8_t max_primary_channel_index = primary_channels - 1;
const uint8_t max_secondary_channel_index = secondary_channels - 1;
for (uint32_t frames_index = 0; frames_index < frames_to_mix; ++frames_index) {
for (uint8_t output_channel_index = 0; output_channel_index < output_channels; ++output_channel_index) {
const uint32_t secondary_channel_index = std::min(output_channel_index, max_secondary_channel_index);
const int32_t secondary_sample = secondary_buffer[frames_index * secondary_channels + secondary_channel_index];
const uint32_t primary_channel_index = std::min(output_channel_index, max_primary_channel_index);
const int32_t primary_sample =
static_cast<int32_t>(primary_buffer[frames_index * primary_channels + primary_channel_index]);
const int32_t added_sample = secondary_sample + primary_sample;
output_buffer[frames_index * output_channels + output_channel_index] =
static_cast<int16_t>(clamp<int32_t>(added_sample, MIN_AUDIO_SAMPLE_VALUE, MAX_AUDIO_SAMPLE_VALUE));
}
}
}
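// Illustrative frame: mixing a mono primary sample {20000} with a mono secondary sample {15000}
// into stereo output duplicates the single input channel onto both output channels and clamps
// the 32-bit sum, so both output samples become clamp(20000 + 15000) = 32767
// (MAX_AUDIO_SAMPLE_VALUE). copy_frames() above applies the same duplicate-or-drop channel
// mapping without summing.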
void MixerSpeaker::audio_mixer_task(void *params) {
MixerSpeaker *this_mixer = (MixerSpeaker *) params;
xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STARTING);
this_mixer->task_created_ = true;
std::unique_ptr<audio::AudioSinkTransferBuffer> output_transfer_buffer = audio::AudioSinkTransferBuffer::create(
this_mixer->audio_stream_info_.value().ms_to_bytes(TRANSFER_BUFFER_DURATION_MS));
if (output_transfer_buffer == nullptr) {
xEventGroupSetBits(this_mixer->event_group_,
MixerEventGroupBits::STATE_STOPPED | MixerEventGroupBits::ERR_ESP_NO_MEM);
this_mixer->task_created_ = false;
vTaskDelete(nullptr);
}
output_transfer_buffer->set_sink(this_mixer->output_speaker_);
xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_RUNNING);
bool sent_finished = false;
while (true) {
uint32_t event_group_bits = xEventGroupGetBits(this_mixer->event_group_);
if (event_group_bits & MixerEventGroupBits::COMMAND_STOP) {
break;
}
output_transfer_buffer->transfer_data_to_sink(pdMS_TO_TICKS(TASK_DELAY_MS));
const uint32_t output_frames_free =
this_mixer->audio_stream_info_.value().bytes_to_frames(output_transfer_buffer->free());
std::vector<SourceSpeaker *> speakers_with_data;
std::vector<std::shared_ptr<audio::AudioSourceTransferBuffer>> transfer_buffers_with_data;
for (auto &speaker : this_mixer->source_speakers_) {
if (speaker->get_transfer_buffer().use_count() > 0) {
std::shared_ptr<audio::AudioSourceTransferBuffer> transfer_buffer = speaker->get_transfer_buffer().lock();
speaker->process_data_from_source(0); // Transfers and ducks audio from source ring buffers
if ((transfer_buffer->available() > 0) && !speaker->get_pause_state()) {
// Store the locked transfer buffers in their own vector to avoid releasing ownership until after the loop
transfer_buffers_with_data.push_back(transfer_buffer);
speakers_with_data.push_back(speaker);
}
}
}
if (transfer_buffers_with_data.empty()) {
// No audio available for transferring, block task temporarily
delay(TASK_DELAY_MS);
continue;
}
uint32_t frames_to_mix = output_frames_free;
if ((transfer_buffers_with_data.size() == 1) || this_mixer->queue_mode_) {
// Only one speaker has audio data, just copy samples over
audio::AudioStreamInfo active_stream_info = speakers_with_data[0]->get_audio_stream_info();
if (active_stream_info.get_sample_rate() ==
this_mixer->output_speaker_->get_audio_stream_info().get_sample_rate()) {
// Speaker's sample rate matches the output speaker's, copy directly
const uint32_t frames_available_in_buffer =
active_stream_info.bytes_to_frames(transfer_buffers_with_data[0]->available());
frames_to_mix = std::min(frames_to_mix, frames_available_in_buffer);
copy_frames(reinterpret_cast<int16_t *>(transfer_buffers_with_data[0]->get_buffer_start()), active_stream_info,
reinterpret_cast<int16_t *>(output_transfer_buffer->get_buffer_end()),
this_mixer->audio_stream_info_.value(), frames_to_mix);
// Update source speaker buffer length
transfer_buffers_with_data[0]->decrease_buffer_length(active_stream_info.frames_to_bytes(frames_to_mix));
speakers_with_data[0]->accumulated_frames_read_ += frames_to_mix;
// Add new audio duration to the source speaker pending playback
speakers_with_data[0]->pending_playback_ms_ +=
active_stream_info.frames_to_milliseconds_with_remainder(&speakers_with_data[0]->accumulated_frames_read_);
// Update output transfer buffer length
output_transfer_buffer->increase_buffer_length(
this_mixer->audio_stream_info_.value().frames_to_bytes(frames_to_mix));
} else {
// Speaker's stream info doesn't match the output speaker's, so it's a new source speaker
if (!this_mixer->output_speaker_->is_stopped()) {
if (!sent_finished) {
this_mixer->output_speaker_->finish();
sent_finished = true; // Avoid repeatedly sending the finish command
}
} else {
// Speaker has finished writing the current audio, update the stream information and restart the speaker
this_mixer->audio_stream_info_ =
audio::AudioStreamInfo(active_stream_info.get_bits_per_sample(), this_mixer->output_channels_,
active_stream_info.get_sample_rate());
this_mixer->output_speaker_->set_audio_stream_info(this_mixer->audio_stream_info_.value());
this_mixer->output_speaker_->start();
sent_finished = false;
}
}
} else {
// Determine how many frames to mix
for (int i = 0; i < transfer_buffers_with_data.size(); ++i) {
const uint32_t frames_available_in_buffer =
speakers_with_data[i]->get_audio_stream_info().bytes_to_frames(transfer_buffers_with_data[i]->available());
frames_to_mix = std::min(frames_to_mix, frames_available_in_buffer);
}
int16_t *primary_buffer = reinterpret_cast<int16_t *>(transfer_buffers_with_data[0]->get_buffer_start());
audio::AudioStreamInfo primary_stream_info = speakers_with_data[0]->get_audio_stream_info();
// Mix two streams together
for (int i = 1; i < transfer_buffers_with_data.size(); ++i) {
mix_audio_samples(primary_buffer, primary_stream_info,
reinterpret_cast<int16_t *>(transfer_buffers_with_data[i]->get_buffer_start()),
speakers_with_data[i]->get_audio_stream_info(),
reinterpret_cast<int16_t *>(output_transfer_buffer->get_buffer_end()),
this_mixer->audio_stream_info_.value(), frames_to_mix);
speakers_with_data[i]->pending_playback_ms_ +=
speakers_with_data[i]->get_audio_stream_info().frames_to_milliseconds_with_remainder(
&speakers_with_data[i]->accumulated_frames_read_);
if (i != transfer_buffers_with_data.size() - 1) {
// Need to mix more streams together, point primary buffer and stream info to the already mixed output
primary_buffer = reinterpret_cast<int16_t *>(output_transfer_buffer->get_buffer_end());
primary_stream_info = this_mixer->audio_stream_info_.value();
}
}
// Update source transfer buffer lengths and add new audio durations to the source speaker pending playbacks
for (int i = 0; i < transfer_buffers_with_data.size(); ++i) {
transfer_buffers_with_data[i]->decrease_buffer_length(
speakers_with_data[i]->get_audio_stream_info().frames_to_bytes(frames_to_mix));
speakers_with_data[i]->accumulated_frames_read_ += frames_to_mix;
speakers_with_data[i]->pending_playback_ms_ +=
speakers_with_data[i]->get_audio_stream_info().frames_to_milliseconds_with_remainder(
&speakers_with_data[i]->accumulated_frames_read_);
}
// Update output transfer buffer length
output_transfer_buffer->increase_buffer_length(
this_mixer->audio_stream_info_.value().frames_to_bytes(frames_to_mix));
}
}
xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STOPPING);
output_transfer_buffer.reset();
xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STOPPED);
this_mixer->task_created_ = false;
vTaskDelete(nullptr);
}
} // namespace mixer_speaker
} // namespace esphome
#endif

View File

@ -0,0 +1,207 @@
#pragma once
#ifdef USE_ESP32
#include "esphome/components/audio/audio.h"
#include "esphome/components/audio/audio_transfer_buffer.h"
#include "esphome/components/speaker/speaker.h"
#include "esphome/core/component.h"
#include <freertos/event_groups.h>
#include <freertos/FreeRTOS.h>
namespace esphome {
namespace mixer_speaker {
/* Classes for mixing several source speaker audio streams and writing the result to another speaker component.
* - Volume controls are passed through to the output speaker
* - Directly handles pausing at the SourceSpeaker level; pause state is not passed through to the output speaker.
* - Audio sent to the SourceSpeakers must have 16 bits per sample.
* - Audio sent to the SourceSpeaker can have any number of channels. They are duplicated or ignored as needed to match
* the number of channels required for the output speaker.
* - In queue mode, the audio sent to the SourceSpeakers can have different sample rates.
* - In non-queue mode, the audio sent to the SourceSpeakers must have the same sample rate.
* - SourceSpeaker has an internal ring buffer. It also allocates a shared_ptr for an AudioTransferBuffer object.
* - Audio Data Flow:
* - Audio data played on a SourceSpeaker first writes to its internal ring buffer.
* - MixerSpeaker task temporarily takes shared ownership of each SourceSpeaker's AudioTransferBuffer.
* - MixerSpeaker calls SourceSpeaker's `process_data_from_source`, which transfers audio from the SourceSpeaker's
* ring buffer to its AudioTransferBuffer. Audio ducking is applied at this step.
* - In queue mode, MixerSpeaker prioritizes the earliest configured SourceSpeaker with audio data. Audio data is
* sent to the output speaker.
* - In non-queue mode, MixerSpeaker mixes the audio data from all SourceSpeakers into one stream that is written
* to the output speaker.
*/
class MixerSpeaker;
class SourceSpeaker : public speaker::Speaker, public Component {
public:
void dump_config() override;
void setup() override;
void loop() override;
size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) override;
size_t play(const uint8_t *data, size_t length) override { return this->play(data, length, 0); }
void start() override;
void stop() override;
void finish() override;
bool has_buffered_data() const override;
/// @brief Mute state changes are passed to the parent's output speaker
void set_mute_state(bool mute_state) override;
/// @brief Volume state changes are passed to the parent's output speaker
void set_volume(float volume) override;
void set_pause_state(bool pause_state) override { this->pause_state_ = pause_state; }
bool get_pause_state() const override { return this->pause_state_; }
/// @brief Transfers audio from the ring buffer into the transfer buffer. Ducks audio while transferring.
/// @param ticks_to_wait FreeRTOS ticks to wait while waiting to read from the ring buffer.
/// @return Number of bytes transferred from the ring buffer.
size_t process_data_from_source(TickType_t ticks_to_wait);
/// @brief Sets the ducking level for the source speaker.
/// @param decibel_reduction (uint8_t) The dB reduction level. For example, 0 is no change, 10 is a reduction by 10 dB
/// @param duration (uint32_t) The number of milliseconds to transition from the current level to the new level
void apply_ducking(uint8_t decibel_reduction, uint32_t duration);
void set_buffer_duration(uint32_t buffer_duration_ms) { this->buffer_duration_ms_ = buffer_duration_ms; }
void set_parent(MixerSpeaker *parent) { this->parent_ = parent; }
void set_timeout(uint32_t ms) { this->timeout_ms_ = ms; }
std::weak_ptr<audio::AudioSourceTransferBuffer> get_transfer_buffer() { return this->transfer_buffer_; }
protected:
friend class MixerSpeaker;
esp_err_t start_();
void stop_();
/// @brief Ducks audio samples by a specified amount. When changing the ducking amount, it can transition gradually
/// over a specified number of samples.
/// @param input_buffer buffer with audio samples to be ducked in place
/// @param input_samples_to_duck number of samples to process in ``input_buffer``
/// @param current_ducking_db_reduction pointer to the current dB reduction
/// @param ducking_transition_samples_remaining pointer to the total number of samples left before the
/// transition is finished
/// @param samples_per_ducking_step total number of samples per ducking step for the transition
/// @param db_change_per_ducking_step the change in dB reduction per step
static void duck_samples(int16_t *input_buffer, uint32_t input_samples_to_duck, int8_t *current_ducking_db_reduction,
uint32_t *ducking_transition_samples_remaining, uint32_t samples_per_ducking_step,
int8_t db_change_per_ducking_step);
MixerSpeaker *parent_;
std::shared_ptr<audio::AudioSourceTransferBuffer> transfer_buffer_;
std::weak_ptr<RingBuffer> ring_buffer_;
uint32_t buffer_duration_ms_;
uint32_t last_seen_data_ms_{0};
optional<uint32_t> timeout_ms_;
bool stop_gracefully_{false};
bool pause_state_{false};
int8_t target_ducking_db_reduction_{0};
int8_t current_ducking_db_reduction_{0};
int8_t db_change_per_ducking_step_{1};
uint32_t ducking_transition_samples_remaining_{0};
uint32_t samples_per_ducking_step_{0};
uint32_t accumulated_frames_read_{0};
uint32_t pending_playback_ms_{0};
};
class MixerSpeaker : public Component {
public:
void dump_config() override;
void setup() override;
void loop() override;
void add_source_speaker(SourceSpeaker *source_speaker) { this->source_speakers_.push_back(source_speaker); }
/// @brief Starts the mixer task. Called by a source speaker giving the current audio stream information
/// @param stream_info The calling source speaker's audio stream information
/// @return ESP_ERR_NOT_SUPPORTED if the incoming stream is incompatible due to unsupported bits per sample
/// ESP_ERR_INVALID_ARG if the incoming stream cannot be mixed with the other input audio streams
/// ESP_ERR_NO_MEM if there isn't enough memory for the task's stack
/// ESP_ERR_INVALID_STATE if the task fails to start
/// ESP_OK if the incoming stream is compatible and the mixer task starts
esp_err_t start(audio::AudioStreamInfo &stream_info);
void stop();
void set_output_channels(uint8_t output_channels) { this->output_channels_ = output_channels; }
void set_output_speaker(speaker::Speaker *speaker) { this->output_speaker_ = speaker; }
void set_queue_mode(bool queue_mode) { this->queue_mode_ = queue_mode; }
void set_task_stack_in_psram(bool task_stack_in_psram) { this->task_stack_in_psram_ = task_stack_in_psram; }
speaker::Speaker *get_output_speaker() const { return this->output_speaker_; }
protected:
/// @brief Copies audio frames from the input buffer to the output buffer taking into account the number of channels
/// in each stream. If the output stream has more channels, the input samples are duplicated. If the output stream has
/// fewer channels, the extra input channel samples are dropped.
/// @param input_buffer
/// @param input_stream_info
/// @param output_buffer
/// @param output_stream_info
/// @param frames_to_transfer number of frames (consisting of a sample for each channel) to copy from the input buffer
static void copy_frames(const int16_t *input_buffer, audio::AudioStreamInfo input_stream_info, int16_t *output_buffer,
audio::AudioStreamInfo output_stream_info, uint32_t frames_to_transfer);
/// @brief Mixes the primary and secondary streams taking into account the number of channels in each stream. Primary
/// and secondary samples are duplicated or dropped as necessary to ensure the output stream has the configured number
/// of channels. Output samples are clamped to the corresponding int16 min or max values if the mixed sample
/// overflows.
/// @param primary_buffer (int16_t *) samples buffer for the primary stream
/// @param primary_stream_info stream info for the primary stream
/// @param secondary_buffer (int16_t *) samples buffer for secondary stream
/// @param secondary_stream_info stream info for the secondary stream
/// @param output_buffer (int16_t *) buffer for the mixed samples
/// @param output_stream_info stream info for the output buffer
/// @param frames_to_mix number of frames in the primary and secondary buffers to mix together
static void mix_audio_samples(const int16_t *primary_buffer, audio::AudioStreamInfo primary_stream_info,
const int16_t *secondary_buffer, audio::AudioStreamInfo secondary_stream_info,
int16_t *output_buffer, audio::AudioStreamInfo output_stream_info,
uint32_t frames_to_mix);
static void audio_mixer_task(void *params);
/// @brief Starts the mixer task after allocating memory for the task stack.
/// @return ESP_ERR_NO_MEM if there isn't enough memory for the task's stack
/// ESP_ERR_INVALID_STATE if the task didn't start
/// ESP_OK if successful
esp_err_t start_task_();
/// @brief If the task is stopped, sets the task handle to nullptr and deallocates its stack
/// @return ESP_OK if the task was stopped, ESP_ERR_INVALID_STATE otherwise.
esp_err_t delete_task_();
EventGroupHandle_t event_group_{nullptr};
std::vector<SourceSpeaker *> source_speakers_;
speaker::Speaker *output_speaker_{nullptr};
uint8_t output_channels_;
bool queue_mode_;
bool task_stack_in_psram_{false};
bool task_created_{false};
TaskHandle_t task_handle_{nullptr};
StaticTask_t task_stack_;
StackType_t *task_stack_buffer_{nullptr};
optional<audio::AudioStreamInfo> audio_stream_info_;
};
} // namespace mixer_speaker
} // namespace esphome
#endif
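The copy_frames and mix_audio_samples helpers declared above duplicate or drop channels as needed and clamp mixed samples to the int16 range. Below is a minimal sketch of that per-frame arithmetic for the mono-to-stereo case, assuming 16-bit samples; the helper names and fixed channel counts are illustrative only, not the component's actual implementation.
#include <algorithm>
#include <cstdint>
// Sum two 16-bit samples in 32-bit arithmetic and clamp to the int16 range to avoid wrap-around.
static int16_t mix_clamped(int16_t a, int16_t b) {
  int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
  return static_cast<int16_t>(std::clamp<int32_t>(sum, INT16_MIN, INT16_MAX));
}
// Mix a mono primary stream into a stereo secondary stream, producing stereo output:
// each mono sample is duplicated across both output channels before mixing.
static void mix_mono_into_stereo(const int16_t *mono, const int16_t *stereo, int16_t *out, uint32_t frames) {
  for (uint32_t frame = 0; frame < frames; ++frame) {
    out[2 * frame + 0] = mix_clamped(mono[frame], stereo[2 * frame + 0]);
    out[2 * frame + 1] = mix_clamped(mono[frame], stereo[2 * frame + 1]);
  }
}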
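apply_ducking and duck_samples above express attenuation as a dB reduction applied to the 16-bit samples, optionally ramped over a number of steps. A small sketch of the underlying gain math follows, using a straightforward floating-point conversion; the component itself may use a different, for example fixed-point, formulation.
#include <cmath>
#include <cstdint>
// Convert a dB reduction (0 = unchanged, 10 = attenuate by 10 dB) into a linear amplitude gain.
static float ducking_gain(int8_t decibel_reduction) { return std::pow(10.0f, -decibel_reduction / 20.0f); }
// Attenuate a block of 16-bit samples in place by the given dB reduction.
static void duck_block(int16_t *samples, uint32_t count, int8_t decibel_reduction) {
  const float gain = ducking_gain(decibel_reduction);
  for (uint32_t i = 0; i < count; ++i) {
    samples[i] = static_cast<int16_t>(samples[i] * gain);
  }
}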

View File

@ -117,6 +117,8 @@ class MPR121GPIOPin : public GPIOPin {
void set_inverted(bool inverted) { this->inverted_ = inverted; }
void set_flags(gpio::Flags flags) { this->flags_ = flags; }
gpio::Flags get_flags() const override { return this->flags_; }
protected:
MPR121Component *parent_;
uint8_t pin_;

View File

@ -52,6 +52,8 @@ class PCA6416AGPIOPin : public GPIOPin {
void set_inverted(bool inverted) { inverted_ = inverted; }
void set_flags(gpio::Flags flags) { flags_ = flags; }
gpio::Flags get_flags() const override { return this->flags_; }
protected:
PCA6416AComponent *parent_;
uint8_t pin_;

View File

@ -65,6 +65,8 @@ class PCA9554GPIOPin : public GPIOPin {
void set_inverted(bool inverted) { inverted_ = inverted; }
void set_flags(gpio::Flags flags) { flags_ = flags; }
gpio::Flags get_flags() const override { return this->flags_; }
protected:
PCA9554Component *parent_;
uint8_t pin_;

View File

@ -54,6 +54,8 @@ class PCF8574GPIOPin : public GPIOPin {
void set_inverted(bool inverted) { inverted_ = inverted; }
void set_flags(gpio::Flags flags) { flags_ = flags; }
gpio::Flags get_flags() const override { return this->flags_; }
protected:
PCF8574Component *parent_;
uint8_t pin_;

View File

View File

@ -0,0 +1,103 @@
import esphome.codegen as cg
from esphome.components import audio, esp32, speaker
import esphome.config_validation as cv
from esphome.const import (
CONF_BITS_PER_SAMPLE,
CONF_BUFFER_DURATION,
CONF_FILTERS,
CONF_ID,
CONF_NUM_CHANNELS,
CONF_OUTPUT_SPEAKER,
CONF_SAMPLE_RATE,
CONF_TASK_STACK_IN_PSRAM,
PLATFORM_ESP32,
)
from esphome.core.entity_helpers import inherit_property_from
AUTO_LOAD = ["audio"]
CODEOWNERS = ["@kahrendt"]
resampler_ns = cg.esphome_ns.namespace("resampler")
ResamplerSpeaker = resampler_ns.class_(
"ResamplerSpeaker", cg.Component, speaker.Speaker
)
CONF_TAPS = "taps"
def _set_stream_limits(config):
audio.set_stream_limits(
min_bits_per_sample=16,
max_bits_per_sample=32,
)(config)
return config
def _validate_audio_compatibility(config):
inherit_property_from(CONF_BITS_PER_SAMPLE, CONF_OUTPUT_SPEAKER)(config)
inherit_property_from(CONF_NUM_CHANNELS, CONF_OUTPUT_SPEAKER)(config)
inherit_property_from(CONF_SAMPLE_RATE, CONF_OUTPUT_SPEAKER)(config)
audio.final_validate_audio_schema(
"source_speaker",
audio_device=CONF_OUTPUT_SPEAKER,
bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
channels=config.get(CONF_NUM_CHANNELS),
sample_rate=config.get(CONF_SAMPLE_RATE),
)(config)
def _validate_taps(taps):
value = cv.int_range(min=16, max=128)(taps)
if value % 4 != 0:
raise cv.Invalid("Number of taps must be divisible by 4")
return value
CONFIG_SCHEMA = cv.All(
speaker.SPEAKER_SCHEMA.extend(
{
cv.GenerateID(): cv.declare_id(ResamplerSpeaker),
cv.Required(CONF_OUTPUT_SPEAKER): cv.use_id(speaker.Speaker),
cv.Optional(
CONF_BUFFER_DURATION, default="100ms"
): cv.positive_time_period_milliseconds,
cv.SplitDefault(CONF_TASK_STACK_IN_PSRAM, esp32_idf=False): cv.All(
cv.boolean, cv.only_with_esp_idf
),
cv.Optional(CONF_FILTERS, default=16): cv.int_range(min=2, max=1024),
cv.Optional(CONF_TAPS, default=16): _validate_taps,
}
).extend(cv.COMPONENT_SCHEMA),
cv.only_on([PLATFORM_ESP32]),
_set_stream_limits,
)
FINAL_VALIDATE_SCHEMA = _validate_audio_compatibility
async def to_code(config):
var = cg.new_Pvariable(config[CONF_ID])
await cg.register_component(var, config)
await speaker.register_speaker(var, config)
output_spkr = await cg.get_variable(config[CONF_OUTPUT_SPEAKER])
cg.add(var.set_output_speaker(output_spkr))
cg.add(var.set_buffer_duration(config[CONF_BUFFER_DURATION]))
if task_stack_in_psram := config.get(CONF_TASK_STACK_IN_PSRAM):
cg.add(var.set_task_stack_in_psram(task_stack_in_psram))
if task_stack_in_psram:
esp32.add_idf_sdkconfig_option(
"CONFIG_SPIRAM_ALLOW_STACK_EXTERNAL_MEMORY", True
)
cg.add(var.set_target_bits_per_sample(config[CONF_BITS_PER_SAMPLE]))
cg.add(var.set_target_sample_rate(config[CONF_SAMPLE_RATE]))
cg.add(var.set_filters(config[CONF_FILTERS]))
cg.add(var.set_taps(config[CONF_TAPS]))

View File

@ -0,0 +1,318 @@
#include "resampler_speaker.h"
#ifdef USE_ESP32
#include "esphome/components/audio/audio_resampler.h"
#include "esphome/core/helpers.h"
#include "esphome/core/log.h"
#include <algorithm>
#include <cstring>
namespace esphome {
namespace resampler {
static const UBaseType_t RESAMPLER_TASK_PRIORITY = 1;
static const uint32_t TRANSFER_BUFFER_DURATION_MS = 50;
static const uint32_t TASK_DELAY_MS = 20;
static const uint32_t TASK_STACK_SIZE = 3072;
static const char *const TAG = "resampler_speaker";
enum ResamplingEventGroupBits : uint32_t {
COMMAND_STOP = (1 << 0), // stops the resampler task
STATE_STARTING = (1 << 10),
STATE_RUNNING = (1 << 11),
STATE_STOPPING = (1 << 12),
STATE_STOPPED = (1 << 13),
ERR_ESP_NO_MEM = (1 << 19),
ERR_ESP_NOT_SUPPORTED = (1 << 20),
ERR_ESP_FAIL = (1 << 21),
ALL_BITS = 0x00FFFFFF, // All valid FreeRTOS event group bits
};
void ResamplerSpeaker::setup() {
this->event_group_ = xEventGroupCreate();
if (this->event_group_ == nullptr) {
ESP_LOGE(TAG, "Failed to create event group");
this->mark_failed();
return;
}
this->output_speaker_->add_audio_output_callback(
[this](uint32_t new_playback_ms, uint32_t remainder_us, uint32_t pending_ms, uint32_t write_timestamp) {
int32_t adjustment = this->playback_differential_ms_;
this->playback_differential_ms_ -= adjustment;
int32_t adjusted_playback_ms = static_cast<int32_t>(new_playback_ms) + adjustment;
this->audio_output_callback_(adjusted_playback_ms, remainder_us, pending_ms, write_timestamp);
});
}
void ResamplerSpeaker::loop() {
uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);
if (event_group_bits & ResamplingEventGroupBits::STATE_STARTING) {
ESP_LOGD(TAG, "Starting resampler task");
xEventGroupClearBits(this->event_group_, ResamplingEventGroupBits::STATE_STARTING);
}
if (event_group_bits & ResamplingEventGroupBits::ERR_ESP_NO_MEM) {
this->status_set_error("Resampler task failed to allocate the internal buffers");
xEventGroupClearBits(this->event_group_, ResamplingEventGroupBits::ERR_ESP_NO_MEM);
this->state_ = speaker::STATE_STOPPING;
}
if (event_group_bits & ResamplingEventGroupBits::ERR_ESP_NOT_SUPPORTED) {
this->status_set_error("Cannot resample due to an unsupported audio stream");
xEventGroupClearBits(this->event_group_, ResamplingEventGroupBits::ERR_ESP_NOT_SUPPORTED);
this->state_ = speaker::STATE_STOPPING;
}
if (event_group_bits & ResamplingEventGroupBits::ERR_ESP_FAIL) {
this->status_set_error("Resampler task failed");
xEventGroupClearBits(this->event_group_, ResamplingEventGroupBits::ERR_ESP_FAIL);
this->state_ = speaker::STATE_STOPPING;
}
if (event_group_bits & ResamplingEventGroupBits::STATE_RUNNING) {
ESP_LOGD(TAG, "Started resampler task");
this->status_clear_error();
xEventGroupClearBits(this->event_group_, ResamplingEventGroupBits::STATE_RUNNING);
}
if (event_group_bits & ResamplingEventGroupBits::STATE_STOPPING) {
ESP_LOGD(TAG, "Stopping resampler task");
xEventGroupClearBits(this->event_group_, ResamplingEventGroupBits::STATE_STOPPING);
}
if (event_group_bits & ResamplingEventGroupBits::STATE_STOPPED) {
if (this->delete_task_() == ESP_OK) {
ESP_LOGD(TAG, "Stopped resampler task");
xEventGroupClearBits(this->event_group_, ResamplingEventGroupBits::ALL_BITS);
}
}
switch (this->state_) {
case speaker::STATE_STARTING: {
esp_err_t err = this->start_();
if (err == ESP_OK) {
this->status_clear_error();
this->state_ = speaker::STATE_RUNNING;
} else {
switch (err) {
case ESP_ERR_INVALID_STATE:
this->status_set_error("Failed to start resampler: resampler task failed to start");
break;
case ESP_ERR_NO_MEM:
this->status_set_error("Failed to start resampler: not enough memory for task stack");
default:
this->status_set_error("Failed to start resampler");
break;
}
this->state_ = speaker::STATE_STOPPING;
}
break;
}
case speaker::STATE_RUNNING:
if (this->output_speaker_->is_stopped()) {
this->state_ = speaker::STATE_STOPPING;
}
break;
case speaker::STATE_STOPPING:
this->stop_();
this->state_ = speaker::STATE_STOPPED;
break;
case speaker::STATE_STOPPED:
break;
}
}
size_t ResamplerSpeaker::play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) {
if (this->is_stopped()) {
this->start();
}
size_t bytes_written = 0;
if ((this->output_speaker_->is_running()) && (!this->requires_resampling_())) {
bytes_written = this->output_speaker_->play(data, length, ticks_to_wait);
} else {
if (this->ring_buffer_.use_count() == 1) {
std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_.lock();
bytes_written = temp_ring_buffer->write_without_replacement(data, length, ticks_to_wait);
}
}
return bytes_written;
}
void ResamplerSpeaker::start() { this->state_ = speaker::STATE_STARTING; }
esp_err_t ResamplerSpeaker::start_() {
this->target_stream_info_ = audio::AudioStreamInfo(
this->target_bits_per_sample_, this->audio_stream_info_.get_channels(), this->target_sample_rate_);
this->output_speaker_->set_audio_stream_info(this->target_stream_info_);
this->output_speaker_->start();
if (this->requires_resampling_()) {
// Start the resampler task to handle converting sample rates
return this->start_task_();
}
return ESP_OK;
}
esp_err_t ResamplerSpeaker::start_task_() {
if (this->task_stack_buffer_ == nullptr) {
if (this->task_stack_in_psram_) {
RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_EXTERNAL);
this->task_stack_buffer_ = stack_allocator.allocate(TASK_STACK_SIZE);
} else {
RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_INTERNAL);
this->task_stack_buffer_ = stack_allocator.allocate(TASK_STACK_SIZE);
}
}
if (this->task_stack_buffer_ == nullptr) {
return ESP_ERR_NO_MEM;
}
if (this->task_handle_ == nullptr) {
this->task_handle_ = xTaskCreateStatic(resample_task, "sample", TASK_STACK_SIZE, (void *) this,
RESAMPLER_TASK_PRIORITY, this->task_stack_buffer_, &this->task_stack_);
}
if (this->task_handle_ == nullptr) {
return ESP_ERR_INVALID_STATE;
}
return ESP_OK;
}
void ResamplerSpeaker::stop() { this->state_ = speaker::STATE_STOPPING; }
void ResamplerSpeaker::stop_() {
if (this->task_handle_ != nullptr) {
xEventGroupSetBits(this->event_group_, ResamplingEventGroupBits::COMMAND_STOP);
}
this->output_speaker_->stop();
}
esp_err_t ResamplerSpeaker::delete_task_() {
if (!this->task_created_) {
this->task_handle_ = nullptr;
if (this->task_stack_buffer_ != nullptr) {
if (this->task_stack_in_psram_) {
RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_EXTERNAL);
stack_allocator.deallocate(this->task_stack_buffer_, TASK_STACK_SIZE);
} else {
RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_INTERNAL);
stack_allocator.deallocate(this->task_stack_buffer_, TASK_STACK_SIZE);
}
this->task_stack_buffer_ = nullptr;
}
return ESP_OK;
}
return ESP_ERR_INVALID_STATE;
}
void ResamplerSpeaker::finish() { this->output_speaker_->finish(); }
bool ResamplerSpeaker::has_buffered_data() const {
bool has_ring_buffer_data = false;
if (this->requires_resampling_() && (this->ring_buffer_.use_count() > 0)) {
has_ring_buffer_data = (this->ring_buffer_.lock()->available() > 0);
}
return (has_ring_buffer_data || this->output_speaker_->has_buffered_data());
}
void ResamplerSpeaker::set_mute_state(bool mute_state) {
this->mute_state_ = mute_state;
this->output_speaker_->set_mute_state(mute_state);
}
void ResamplerSpeaker::set_volume(float volume) {
this->volume_ = volume;
this->output_speaker_->set_volume(volume);
}
bool ResamplerSpeaker::requires_resampling_() const {
return (this->audio_stream_info_.get_sample_rate() != this->target_sample_rate_) ||
(this->audio_stream_info_.get_bits_per_sample() != this->target_bits_per_sample_);
}
void ResamplerSpeaker::resample_task(void *params) {
ResamplerSpeaker *this_resampler = (ResamplerSpeaker *) params;
this_resampler->task_created_ = true;
xEventGroupSetBits(this_resampler->event_group_, ResamplingEventGroupBits::STATE_STARTING);
std::unique_ptr<audio::AudioResampler> resampler =
make_unique<audio::AudioResampler>(this_resampler->audio_stream_info_.ms_to_bytes(TRANSFER_BUFFER_DURATION_MS),
this_resampler->target_stream_info_.ms_to_bytes(TRANSFER_BUFFER_DURATION_MS));
esp_err_t err = resampler->start(this_resampler->audio_stream_info_, this_resampler->target_stream_info_,
this_resampler->taps_, this_resampler->filters_);
if (err == ESP_OK) {
std::shared_ptr<RingBuffer> temp_ring_buffer =
RingBuffer::create(this_resampler->audio_stream_info_.ms_to_bytes(this_resampler->buffer_duration_ms_));
if (temp_ring_buffer.use_count() == 0) {
err = ESP_ERR_NO_MEM;
} else {
this_resampler->ring_buffer_ = temp_ring_buffer;
resampler->add_source(this_resampler->ring_buffer_);
this_resampler->output_speaker_->set_audio_stream_info(this_resampler->target_stream_info_);
resampler->add_sink(this_resampler->output_speaker_);
}
}
if (err == ESP_OK) {
xEventGroupSetBits(this_resampler->event_group_, ResamplingEventGroupBits::STATE_RUNNING);
} else if (err == ESP_ERR_NO_MEM) {
xEventGroupSetBits(this_resampler->event_group_, ResamplingEventGroupBits::ERR_ESP_NO_MEM);
} else if (err == ESP_ERR_NOT_SUPPORTED) {
xEventGroupSetBits(this_resampler->event_group_, ResamplingEventGroupBits::ERR_ESP_NOT_SUPPORTED);
}
this_resampler->playback_differential_ms_ = 0;
while (err == ESP_OK) {
uint32_t event_bits = xEventGroupGetBits(this_resampler->event_group_);
if (event_bits & ResamplingEventGroupBits::COMMAND_STOP) {
break;
}
// Stop gracefully if the decoder is done
int32_t ms_differential = 0;
audio::AudioResamplerState resampler_state = resampler->resample(false, &ms_differential);
this_resampler->playback_differential_ms_ += ms_differential;
if (resampler_state == audio::AudioResamplerState::FINISHED) {
break;
} else if (resampler_state == audio::AudioResamplerState::FAILED) {
xEventGroupSetBits(this_resampler->event_group_, ResamplingEventGroupBits::ERR_ESP_FAIL);
break;
}
}
xEventGroupSetBits(this_resampler->event_group_, ResamplingEventGroupBits::STATE_STOPPING);
resampler.reset();
xEventGroupSetBits(this_resampler->event_group_, ResamplingEventGroupBits::STATE_STOPPED);
this_resampler->task_created_ = false;
vTaskDelete(nullptr);
}
} // namespace resampler
} // namespace esphome
#endif

View File

@ -0,0 +1,107 @@
#pragma once
#ifdef USE_ESP32
#include "esphome/components/audio/audio.h"
#include "esphome/components/audio/audio_transfer_buffer.h"
#include "esphome/components/speaker/speaker.h"
#include "esphome/core/component.h"
#include <freertos/event_groups.h>
#include <freertos/FreeRTOS.h>
namespace esphome {
namespace resampler {
class ResamplerSpeaker : public Component, public speaker::Speaker {
public:
float get_setup_priority() const override { return esphome::setup_priority::DATA; }
void setup() override;
void loop() override;
size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) override;
size_t play(const uint8_t *data, size_t length) override { return this->play(data, length, 0); }
void start() override;
void stop() override;
void finish() override;
void set_pause_state(bool pause_state) override { this->output_speaker_->set_pause_state(pause_state); }
bool get_pause_state() const override { return this->output_speaker_->get_pause_state(); }
bool has_buffered_data() const override;
/// @brief Mute state changes are passed to the parent's output speaker
void set_mute_state(bool mute_state) override;
/// @brief Volume state changes are passed to the parent's output speaker
void set_volume(float volume) override;
void set_output_speaker(speaker::Speaker *speaker) { this->output_speaker_ = speaker; }
void set_task_stack_in_psram(bool task_stack_in_psram) { this->task_stack_in_psram_ = task_stack_in_psram; }
void set_target_bits_per_sample(uint8_t target_bits_per_sample) {
this->target_bits_per_sample_ = target_bits_per_sample;
}
void set_target_sample_rate(uint32_t target_sample_rate) { this->target_sample_rate_ = target_sample_rate; }
void set_filters(uint16_t filters) { this->filters_ = filters; }
void set_taps(uint16_t taps) { this->taps_ = taps; }
void set_buffer_duration(uint32_t buffer_duration_ms) { this->buffer_duration_ms_ = buffer_duration_ms; }
protected:
/// @brief Starts the output speaker after setting the resampled stream info. If resampling is required, it starts the
/// task.
/// @return ESP_OK if resampling is not required
/// return value of start_task_() if resampling is required
esp_err_t start_();
/// @brief Starts the resampler task after allocating the task stack
/// @return ESP_OK if successful,
/// ESP_ERR_NO_MEM if the task stack couldn't be allocated
/// ESP_ERR_INVALID_STATE if the task wasn't created
esp_err_t start_task_();
/// @brief Stops the output speaker. If the resampling task is running, it sends the stop command.
void stop_();
/// @brief Deallocates the task stack and resets the pointers.
/// @return ESP_OK if successful
/// ESP_ERR_INVALID_STATE if the task hasn't stopped itself
esp_err_t delete_task_();
inline bool requires_resampling_() const;
static void resample_task(void *params);
EventGroupHandle_t event_group_{nullptr};
std::weak_ptr<RingBuffer> ring_buffer_;
speaker::Speaker *output_speaker_{nullptr};
bool task_stack_in_psram_{false};
bool task_created_{false};
TaskHandle_t task_handle_{nullptr};
StaticTask_t task_stack_;
StackType_t *task_stack_buffer_{nullptr};
audio::AudioStreamInfo target_stream_info_;
uint16_t taps_;
uint16_t filters_;
uint8_t target_bits_per_sample_;
uint32_t target_sample_rate_;
uint32_t buffer_duration_ms_;
int32_t playback_differential_ms_{0};
};
} // namespace resampler
} // namespace esphome
#endif

View File

@ -22,6 +22,7 @@ class RP2040GPIOPin : public InternalGPIOPin {
void detach_interrupt() const override;
ISRInternalGPIOPin to_isr() const override;
uint8_t get_pin() const override { return pin_; }
gpio::Flags get_flags() const override { return flags_; }
bool is_inverted() const override { return inverted_; }
protected:

View File

@ -52,6 +52,9 @@ class SN74HC165GPIOPin : public GPIOPin, public Parented<SN74HC165Component> {
void set_pin(uint16_t pin) { pin_ = pin; }
void set_inverted(bool inverted) { inverted_ = inverted; }
/// Always returns `gpio::Flags::FLAG_INPUT`.
gpio::Flags get_flags() const override { return gpio::Flags::FLAG_INPUT; }
protected:
uint16_t pin_;
bool inverted_;

View File

@ -59,6 +59,9 @@ class SN74HC595GPIOPin : public GPIOPin, public Parented<SN74HC595Component> {
void set_pin(uint16_t pin) { pin_ = pin; }
void set_inverted(bool inverted) { inverted_ = inverted; }
/// Always returns `gpio::Flags::FLAG_OUTPUT`.
gpio::Flags get_flags() const override { return gpio::Flags::FLAG_OUTPUT; }
protected:
uint16_t pin_;
bool inverted_;

View File

@ -1,7 +1,6 @@
from esphome import automation
from esphome.automation import maybe_simple_id
import esphome.codegen as cg
from esphome.components import audio_dac
from esphome.components import audio, audio_dac
import esphome.config_validation as cv
from esphome.const import CONF_DATA, CONF_ID, CONF_VOLUME
from esphome.core import CORE
@ -54,13 +53,15 @@ async def register_speaker(var, config):
await setup_speaker_core_(var, config)
SPEAKER_SCHEMA = cv.Schema(
SPEAKER_SCHEMA = cv.Schema.extend(audio.AUDIO_COMPONENT_SCHEMA).extend(
{
cv.Optional(CONF_AUDIO_DAC): cv.use_id(audio_dac.AudioDac),
}
)
SPEAKER_AUTOMATION_SCHEMA = maybe_simple_id({cv.GenerateID(): cv.use_id(Speaker)})
SPEAKER_AUTOMATION_SCHEMA = automation.maybe_simple_id(
{cv.GenerateID(): cv.use_id(Speaker)}
)
async def speaker_action(config, action_id, template_arg, args):

View File

@ -9,6 +9,7 @@
#endif
#include "esphome/core/defines.h"
#include "esphome/core/helpers.h"
#include "esphome/components/audio/audio.h"
#ifdef USE_AUDIO_DAC
@ -56,6 +57,10 @@ class Speaker {
// When finish() is not implemented on the platform component it should just do a normal stop.
virtual void finish() { this->stop(); }
// Pauses processing incoming audio. Needs to be implemented specifically per speaker component
virtual void set_pause_state(bool pause_state) {}
virtual bool get_pause_state() const { return false; }
virtual bool has_buffered_data() const = 0;
bool is_running() const { return this->state_ == STATE_RUNNING; }
@ -95,6 +100,19 @@ class Speaker {
this->audio_stream_info_ = audio_stream_info;
}
audio::AudioStreamInfo &get_audio_stream_info() { return this->audio_stream_info_; }
/// Callback function for sending the duration of the audio written to the speaker since the last callback.
/// Parameters:
/// - Duration in milliseconds. Never rounded and should always be less than or equal to the actual duration.
/// - Remainder duration in microseconds. Rounded duration after subtracting the previous parameter from the actual
/// duration.
/// - Duration of remaining, unwritten audio buffered in the speaker in milliseconds.
/// - System time in microseconds when the last write was completed.
void add_audio_output_callback(std::function<void(uint32_t, uint32_t, uint32_t, uint32_t)> &&callback) {
this->audio_output_callback_.add(std::move(callback));
}
protected:
State state_{STATE_STOPPED};
audio::AudioStreamInfo audio_stream_info_;
@ -104,6 +122,8 @@ class Speaker {
#ifdef USE_AUDIO_DAC
audio_dac::AudioDac *audio_dac_{nullptr};
#endif
CallbackManager<void(uint32_t, uint32_t, uint32_t, uint32_t)> audio_output_callback_{};
};
} // namespace speaker
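The add_audio_output_callback hook added above reports, for each write, the newly played duration in whole milliseconds, a microsecond remainder, the audio still buffered in the speaker, and the write timestamp. A minimal sketch of registering such a callback to keep a running total of played audio; the speaker pointer and counter are illustrative and must outlive the callback.
// Sketch only: `spk` is assumed to point at any speaker::Speaker implementation,
// and this fragment is assumed to run inside a component's setup().
static uint32_t total_played_ms = 0;
spk->add_audio_output_callback(
    [](uint32_t new_playback_ms, uint32_t remainder_us, uint32_t pending_ms, uint32_t write_timestamp) {
      total_played_ms += new_playback_ms;  // whole milliseconds written since the previous callback
      (void) remainder_us;                 // sub-millisecond remainder of that duration
      (void) pending_ms;                   // audio still buffered in the speaker, in milliseconds
      (void) write_timestamp;              // system time in microseconds when the write completed
    });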

View File

@ -114,6 +114,8 @@ class NullPin : public GPIOPin {
void pin_mode(gpio::Flags flags) override {}
gpio::Flags get_flags() const override { return gpio::Flags::FLAG_NONE; }
bool digital_read() override { return false; }
void digital_write(bool value) override {}

View File

@ -20,6 +20,8 @@ class SX1509GPIOPin : public GPIOPin {
void set_inverted(bool inverted) { this->inverted_ = inverted; }
void set_flags(gpio::Flags flags) { this->flags_ = flags; }
gpio::Flags get_flags() const override { return this->flags_; }
protected:
SX1509Component *parent_;
uint8_t pin_;

View File

@ -54,6 +54,8 @@ class TCA9555GPIOPin : public GPIOPin, public Parented<TCA9555Component> {
void set_inverted(bool inverted) { this->inverted_ = inverted; }
void set_flags(gpio::Flags flags) { this->flags_ = flags; }
gpio::Flags get_flags() const override { return this->flags_; }
protected:
uint8_t pin_;
bool inverted_;

View File

@ -18,7 +18,7 @@ from esphome.cpp_generator import MockObjClass
CODEOWNERS = ["@clydebarrow"]
DEPENDENCIES = ["network"]
AUTO_LOAD = ["socket"]
AUTO_LOAD = ["socket", "xxtea"]
MULTI_CONF = True
udp_ns = cg.esphome_ns.namespace("udp")

View File

@ -3,6 +3,8 @@
#include "esphome/components/network/util.h"
#include "udp_component.h"
#include "esphome/components/xxtea/xxtea.h"
namespace esphome {
namespace udp {
@ -47,54 +49,7 @@ namespace udp {
*/
static const char *const TAG = "udp";
/**
* XXTEA implementation, using 256 bit key.
*/
static const uint32_t DELTA = 0x9e3779b9;
#define MX ((((z >> 5) ^ (y << 2)) + ((y >> 3) ^ (z << 4))) ^ ((sum ^ y) + (k[(p ^ e) & 7] ^ z)))
/**
* Encrypt a block of data in-place
*/
static void xxtea_encrypt(uint32_t *v, size_t n, const uint32_t *k) {
uint32_t z, y, sum, e;
size_t p;
size_t q = 6 + 52 / n;
sum = 0;
z = v[n - 1];
while (q-- != 0) {
sum += DELTA;
e = (sum >> 2);
for (p = 0; p != n - 1; p++) {
y = v[p + 1];
z = v[p] += MX;
}
y = v[0];
z = v[n - 1] += MX;
}
}
static void xxtea_decrypt(uint32_t *v, size_t n, const uint32_t *k) {
uint32_t z, y, sum, e;
size_t p;
size_t q = 6 + 52 / n;
sum = q * DELTA;
y = v[0];
while (q-- != 0) {
e = (sum >> 2);
for (p = n - 1; p != 0; p--) {
z = v[p - 1];
y = v[p] -= MX;
}
z = v[n - 1];
y = v[0] -= MX;
sum -= DELTA;
}
}
inline static size_t round4(size_t value) { return (value + 3) & ~3; }
static size_t round4(size_t value) { return (value + 3) & ~3; }
union FuData {
uint32_t u32;
@ -312,7 +267,7 @@ void UDPComponent::flush_() {
memcpy(buffer, this->header_.data(), this->header_.size());
memcpy(buffer + header_len, this->data_.data(), this->data_.size());
if (this->is_encrypted_()) {
xxtea_encrypt(buffer + header_len, len, (uint32_t *) this->encryption_key_.data());
xxtea::encrypt(buffer + header_len, len, (uint32_t *) this->encryption_key_.data());
}
auto total_len = (header_len + len) * 4;
this->send_packet_(buffer, total_len);
@ -503,7 +458,7 @@ void UDPComponent::process_(uint8_t *buf, const size_t len) {
#endif
if (!provider.encryption_key.empty()) {
xxtea_decrypt((uint32_t *) buf, (end - buf) / 4, (uint32_t *) provider.encryption_key.data());
xxtea::decrypt((uint32_t *) buf, (end - buf) / 4, (uint32_t *) provider.encryption_key.data());
}
byte = *buf++;
if (byte == ROLLING_CODE_KEY) {

View File

@ -275,6 +275,8 @@ class WeikaiGPIOPin : public GPIOPin {
void set_inverted(bool inverted) { this->inverted_ = inverted; }
void set_flags(gpio::Flags flags) { this->flags_ = flags; }
gpio::Flags get_flags() const override { return this->flags_; }
void setup() override;
std::string dump_summary() const override;
void pin_mode(gpio::Flags flags) override { this->parent_->set_pin_direction_(this->pin_, flags); }

View File

@ -36,6 +36,8 @@ class XL9535GPIOPin : public GPIOPin {
void set_inverted(bool inverted) { this->inverted_ = inverted; }
void set_flags(gpio::Flags flags) { this->flags_ = flags; }
gpio::Flags get_flags() const override { return this->flags_; }
void setup() override;
std::string dump_summary() const override;
void pin_mode(gpio::Flags flags) override;

View File

@ -0,0 +1,3 @@
"""ESPHome XXTEA encryption component."""
CODEOWNERS = ["@clydebarrow"]

View File

@ -0,0 +1,46 @@
#include "xxtea.h"
namespace esphome {
namespace xxtea {
static const uint32_t DELTA = 0x9e3779b9;
#define MX ((((z >> 5) ^ (y << 2)) + ((y >> 3) ^ (z << 4))) ^ ((sum ^ y) + (k[(p ^ e) & 7] ^ z)))
void encrypt(uint32_t *v, size_t n, const uint32_t *k) {
uint32_t z, y, sum, e;
size_t p;
size_t q = 6 + 52 / n;
sum = 0;
z = v[n - 1];
while (q-- != 0) {
sum += DELTA;
e = (sum >> 2);
for (p = 0; p != n - 1; p++) {
y = v[p + 1];
z = v[p] += MX;
}
y = v[0];
z = v[n - 1] += MX;
}
}
void decrypt(uint32_t *v, size_t n, const uint32_t *k) {
uint32_t z, y, sum, e;
size_t p;
size_t q = 6 + 52 / n;
sum = q * DELTA;
y = v[0];
while (q-- != 0) {
e = (sum >> 2);
for (p = n - 1; p != 0; p--) {
z = v[p - 1];
y = v[p] -= MX;
}
z = v[n - 1];
y = v[0] -= MX;
sum -= DELTA;
}
}
} // namespace xxtea
} // namespace esphome

View File

@ -0,0 +1,26 @@
#pragma once
#include <cstdint>
#include <cstddef>
namespace esphome {
namespace xxtea {
/**
* Encrypt a block of data in-place using XXTEA algorithm with 256-bit key
* @param v Data to encrypt (as array of 32-bit words)
* @param n Number of 32-bit words in data
* @param k Key (array of 8 32-bit words)
*/
void encrypt(uint32_t *v, size_t n, const uint32_t *k);
/**
* Decrypt a block of data in-place using XXTEA algorithm with 256-bit key
* @param v Data to decrypt (as array of 32-bit words)
* @param n Number of 32-bit words in data
* @param k Key (array of 8 32-bit words)
*/
void decrypt(uint32_t *v, size_t n, const uint32_t *k);
} // namespace xxtea
} // namespace esphome
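The xxtea helpers above operate in place on 32-bit words with an eight-word (256-bit) key, so callers pad their data to a whole number of words (at least two) before encrypting. A small round-trip sketch under those assumptions; the key and plaintext here are purely illustrative.
#include "esphome/components/xxtea/xxtea.h"
#include <cstring>
void xxtea_roundtrip_example() {
  // 256-bit key expressed as eight 32-bit words.
  const uint32_t key[8] = {0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210,
                           0x0f1e2d3c, 0x4b5a6978, 0x8796a5b4, 0xc3d2e1f0};
  // Plaintext padded with zeros to a whole number of 32-bit words (XXTEA needs at least two).
  uint32_t data[4] = {0};
  std::memcpy(data, "hello xxtea", 11);
  esphome::xxtea::encrypt(data, 4, key);  // encrypt in place
  esphome::xxtea::decrypt(data, 4, key);  // decrypt back to the original plaintext
}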

View File

@ -94,6 +94,7 @@ CONF_BRIGHTNESS = "brightness"
CONF_BRIGHTNESS_LIMITS = "brightness_limits"
CONF_BROKER = "broker"
CONF_BSSID = "bssid"
CONF_BUFFER_DURATION = "buffer_duration"
CONF_BUFFER_SIZE = "buffer_size"
CONF_BUILD_PATH = "build_path"
CONF_BUS_VOLTAGE = "bus_voltage"
@ -527,6 +528,7 @@ CONF_NAME_FONT = "name_font"
CONF_NBITS = "nbits"
CONF_NEC = "nec"
CONF_NETWORKS = "networks"
CONF_NEVER = "never"
CONF_NEW_PASSWORD = "new_password"
CONF_NITROGEN_DIOXIDE = "nitrogen_dioxide"
CONF_NOISE_LEVEL = "noise_level"
@ -615,6 +617,7 @@ CONF_OTA = "ota"
CONF_OUTDOOR_TEMPERATURE = "outdoor_temperature"
CONF_OUTPUT = "output"
CONF_OUTPUT_ID = "output_id"
CONF_OUTPUT_SPEAKER = "output_speaker"
CONF_OUTPUTS = "outputs"
CONF_OVERSAMPLING = "oversampling"
CONF_PACKAGES = "packages"
@ -859,6 +862,7 @@ CONF_TARGET_TEMPERATURE_LOW = "target_temperature_low"
CONF_TARGET_TEMPERATURE_LOW_COMMAND_TOPIC = "target_temperature_low_command_topic"
CONF_TARGET_TEMPERATURE_LOW_STATE_TOPIC = "target_temperature_low_state_topic"
CONF_TARGET_TEMPERATURE_STATE_TOPIC = "target_temperature_state_topic"
CONF_TASK_STACK_IN_PSRAM = "task_stack_in_psram"
CONF_TEMPERATURE = "temperature"
CONF_TEMPERATURE_COMPENSATION = "temperature_compensation"
CONF_TEMPERATURE_OFFSET = "temperature_offset"

View File

@ -689,7 +689,7 @@ class EsphomeCore:
_LOGGER.debug("Adding: %s", expression)
return expression
def add_global(self, expression):
def add_global(self, expression, prepend=False):
from esphome.cpp_generator import Expression, Statement, statement
if isinstance(expression, Expression):
@ -698,7 +698,10 @@ class EsphomeCore:
raise ValueError(
f"Add '{expression}' must be expression or statement, not {type(expression)}"
)
self.global_statements.append(expression)
if prepend:
self.global_statements.insert(0, expression)
else:
self.global_statements.append(expression)
_LOGGER.debug("Adding global: %s", expression)
return expression

View File

@ -72,6 +72,9 @@ def validate_hostname(config):
def valid_include(value):
# Look for "<...>" includes
if value.startswith("<") and value.endswith(">"):
return value
try:
return cv.directory(value)
except cv.Invalid:
@ -360,7 +363,19 @@ async def to_code(config):
CORE.add_job(add_arduino_global_workaround)
if config[CONF_INCLUDES]:
CORE.add_job(add_includes, config[CONF_INCLUDES])
# Get the <...> includes
system_includes = []
other_includes = []
for include in config[CONF_INCLUDES]:
if include.startswith("<") and include.endswith(">"):
system_includes.append(include)
else:
other_includes.append(include)
# <...> includes should be at the start
for include in system_includes:
cg.add_global(cg.RawStatement(f"#include {include}"), prepend=True)
# Other includes should be at the end
CORE.add_job(add_includes, other_includes)
if project_conf := config.get(CONF_PROJECT):
cg.add_define("ESPHOME_PROJECT_NAME", project_conf[CONF_NAME])

View File

@ -16,6 +16,8 @@
// Feature flags
#define USE_ALARM_CONTROL_PANEL
#define USE_AUDIO_FLAC_SUPPORT
#define USE_AUDIO_MP3_SUPPORT
#define USE_API
#define USE_API_NOISE
#define USE_API_PLAINTEXT

View File

@ -53,6 +53,13 @@ class GPIOPin {
virtual void pin_mode(gpio::Flags flags) = 0;
/**
* @brief Retrieve GPIO pin flags.
*
* @return The GPIO flags describing the pin mode and properties.
*/
virtual gpio::Flags get_flags() const = 0;
virtual bool digital_read() = 0;
virtual void digital_write(bool value) = 0;
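The new pure-virtual get_flags accessor lets generic code inspect how a pin was configured without knowing the concrete pin class. A tiny illustrative sketch of the kind of check this enables; the helper function is hypothetical, not part of the component.
#include "esphome/core/gpio.h"
namespace esphome {
// Only drive pins that were actually configured as outputs.
void write_if_output(GPIOPin *pin, bool value) {
  if (pin->get_flags() & gpio::Flags::FLAG_OUTPUT) {
    pin->digital_write(value);
  }
}
}  // namespace esphome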

View File

@ -588,9 +588,9 @@ def add(expression: Union[Expression, Statement]):
CORE.add(expression)
def add_global(expression: Union[SafeExpType, Statement]):
def add_global(expression: Union[SafeExpType, Statement], prepend: bool = False):
"""Add an expression to the codegen global storage (above setup())."""
CORE.add_global(expression)
CORE.add_global(expression, prepend)
def add_library(name: str, version: Optional[str], repository: Optional[str] = None):

View File

@ -127,7 +127,8 @@ lib_deps =
ESPmDNS ; mdns (Arduino built-in)
DNSServer ; captive_portal (Arduino built-in)
esphome/ESP32-audioI2S@2.0.7 ; i2s_audio
droscy/esp_wireguard@0.4.2 ; wireguard
droscy/esp_wireguard@0.4.2 ; wireguard
esphome/esp-audio-libs@1.1.1 ; audio
build_flags =
${common:arduino.build_flags}
@ -148,6 +149,7 @@ lib_deps =
${common:idf.lib_deps}
droscy/esp_wireguard@0.4.2 ; wireguard
kahrendt/ESPMicroSpeechFeatures@1.1.0 ; micro_wake_word
esphome/esp-audio-libs@1.1.1 ; audio
build_flags =
${common:idf.build_flags}
-Wno-nonnull-compare

View File

@ -14,7 +14,7 @@ esptool==4.7.0
click==8.1.7
esphome-dashboard==20241217.1
aioesphomeapi==24.6.2
zeroconf==0.132.2
zeroconf==0.143.0
puremagic==1.27
ruamel.yaml==0.18.6 # dashboard_import
glyphsets==1.0.0

View File

@ -0,0 +1,11 @@
uart:
- id: uart_a02yyuw
tx_pin: ${tx_pin}
rx_pin: ${rx_pin}
baud_rate: 9600
sensor:
- platform: a02yyuw
id: a02yyuw_sensor
name: a02yyuw Distance
uart_id: uart_a02yyuw

View File

@ -1,13 +1,5 @@
uart:
- id: uart_a02yyuw
tx_pin:
number: 17
rx_pin:
number: 16
baud_rate: 9600
substitutions:
tx_pin: GPIO17
rx_pin: GPIO16
sensor:
- platform: a02yyuw
id: a02yyuw_sensor
name: a02yyuw Distance
uart_id: uart_a02yyuw
<<: !include common.yaml

View File

@ -1,13 +1,5 @@
uart:
- id: uart_a02yyuw
tx_pin:
number: 4
rx_pin:
number: 5
baud_rate: 9600
substitutions:
tx_pin: GPIO4
rx_pin: GPIO5
sensor:
- platform: a02yyuw
id: a02yyuw_sensor
name: a02yyuw Distance
uart_id: uart_a02yyuw
<<: !include common.yaml

View File

@ -1,13 +1,5 @@
uart:
- id: uart_a02yyuw
tx_pin:
number: 4
rx_pin:
number: 5
baud_rate: 9600
substitutions:
tx_pin: GPIO4
rx_pin: GPIO5
sensor:
- platform: a02yyuw
id: a02yyuw_sensor
name: a02yyuw Distance
uart_id: uart_a02yyuw
<<: !include common.yaml

View File

@ -1,13 +1,5 @@
uart:
- id: uart_a02yyuw
tx_pin:
number: 17
rx_pin:
number: 16
baud_rate: 9600
substitutions:
tx_pin: GPIO17
rx_pin: GPIO16
sensor:
- platform: a02yyuw
id: a02yyuw_sensor
name: a02yyuw Distance
uart_id: uart_a02yyuw
<<: !include common.yaml

View File

@ -1,13 +1,5 @@
uart:
- id: uart_a02yyuw
tx_pin:
number: 4
rx_pin:
number: 5
baud_rate: 9600
substitutions:
tx_pin: GPIO4
rx_pin: GPIO5
sensor:
- platform: a02yyuw
id: a02yyuw_sensor
name: a02yyuw Distance
uart_id: uart_a02yyuw
<<: !include common.yaml

View File

@ -1,13 +1,5 @@
uart:
- id: uart_a02yyuw
tx_pin:
number: 4
rx_pin:
number: 5
baud_rate: 9600
substitutions:
tx_pin: GPIO4
rx_pin: GPIO5
sensor:
- platform: a02yyuw
id: a02yyuw_sensor
name: a02yyuw Distance
uart_id: uart_a02yyuw
<<: !include common.yaml

View File

@ -0,0 +1,9 @@
stepper:
- platform: a4988
id: a4988_stepper
step_pin: ${step_pin}
dir_pin: ${dir_pin}
sleep_pin: ${sleep_pin}
max_speed: 250 steps/s
acceleration: 100 steps/s^2
deceleration: 200 steps/s^2

View File

@ -1,12 +1,6 @@
stepper:
- platform: a4988
id: a4988_stepper
step_pin:
number: 22
dir_pin:
number: 23
sleep_pin:
number: 25
max_speed: 250 steps/s
acceleration: 100 steps/s^2
deceleration: 200 steps/s^2
substitutions:
step_pin: GPIO22
dir_pin: GPIO23
sleep_pin: GPIO25
<<: !include common.yaml

View File

@ -1,12 +1,6 @@
stepper:
- platform: a4988
id: a4988_stepper
step_pin:
number: 2
dir_pin:
number: 3
sleep_pin:
number: 5
max_speed: 250 steps/s
acceleration: 100 steps/s^2
deceleration: 200 steps/s^2
substitutions:
step_pin: GPIO2
dir_pin: GPIO3
sleep_pin: GPIO5
<<: !include common.yaml

View File

@ -1,12 +1,6 @@
stepper:
- platform: a4988
id: a4988_stepper
step_pin:
number: 2
dir_pin:
number: 3
sleep_pin:
number: 5
max_speed: 250 steps/s
acceleration: 100 steps/s^2
deceleration: 200 steps/s^2
substitutions:
step_pin: GPIO2
dir_pin: GPIO3
sleep_pin: GPIO5
<<: !include common.yaml

View File

@ -1,12 +1,6 @@
stepper:
- platform: a4988
id: a4988_stepper
step_pin:
number: 22
dir_pin:
number: 23
sleep_pin:
number: 25
max_speed: 250 steps/s
acceleration: 100 steps/s^2
deceleration: 200 steps/s^2
substitutions:
step_pin: GPIO22
dir_pin: GPIO23
sleep_pin: GPIO25
<<: !include common.yaml

View File

@ -1,12 +1,6 @@
stepper:
- platform: a4988
id: a4988_stepper
step_pin:
number: 1
dir_pin:
number: 2
sleep_pin:
number: 5
max_speed: 250 steps/s
acceleration: 100 steps/s^2
deceleration: 200 steps/s^2
substitutions:
step_pin: GPIO1
dir_pin: GPIO2
sleep_pin: GPIO5
<<: !include common.yaml

View File

@ -1,12 +1,6 @@
stepper:
- platform: a4988
id: a4988_stepper
step_pin:
number: 2
dir_pin:
number: 3
sleep_pin:
number: 5
max_speed: 250 steps/s
acceleration: 100 steps/s^2
deceleration: 200 steps/s^2
substitutions:
step_pin: GPIO2
dir_pin: GPIO3
sleep_pin: GPIO5
<<: !include common.yaml

View File

@ -0,0 +1,5 @@
output:
- platform: ac_dimmer
id: ac_dimmer_1
gate_pin: ${gate_pin}
zero_cross_pin: ${zero_cross_pin}

View File

@ -1,7 +1,5 @@
output:
- platform: ac_dimmer
id: ac_dimmer_1
gate_pin:
number: 12
zero_cross_pin:
number: 13
substitutions:
gate_pin: GPIO18
zero_cross_pin: GPIO19
<<: !include common.yaml

View File

@ -1,7 +1,5 @@
output:
- platform: ac_dimmer
id: ac_dimmer_1
gate_pin:
number: 5
zero_cross_pin:
number: 6
substitutions:
gate_pin: GPIO5
zero_cross_pin: GPIO4
<<: !include common.yaml

View File

@ -1,7 +1,5 @@
output:
- platform: ac_dimmer
id: ac_dimmer_1
gate_pin:
number: 5
zero_cross_pin:
number: 4
substitutions:
gate_pin: GPIO5
zero_cross_pin: GPIO4
<<: !include common.yaml

View File

@ -1,7 +1,5 @@
output:
- platform: ac_dimmer
id: ac_dimmer_1
gate_pin:
number: 5
zero_cross_pin:
number: 6
substitutions:
gate_pin: GPIO5
zero_cross_pin: GPIO4
<<: !include common.yaml

View File

@ -2,4 +2,4 @@ sensor:
- platform: adc
id: my_sensor
pin: 4
attenuation: 11db
attenuation: 12db

View File

@ -2,4 +2,4 @@ sensor:
- platform: adc
id: my_sensor
pin: 1
attenuation: 11db
attenuation: 12db

View File

@ -2,4 +2,4 @@ sensor:
- platform: adc
id: my_sensor
pin: 1
attenuation: 11db
attenuation: 12db

View File

@ -0,0 +1,14 @@
spi:
- id: spi_adc128s102
clk_pin: ${clk_pin}
mosi_pin: ${mosi_pin}
miso_pin: ${miso_pin}
adc128s102:
cs_pin: ${cs_pin}
id: adc128s102_adc
sensor:
- platform: adc128s102
id: adc128s102_channel_0
channel: 0

View File

@ -1,14 +1,7 @@
spi:
- id: spi_adc128s102
clk_pin: 16
mosi_pin: 17
miso_pin: 15
substitutions:
clk_pin: GPIO16
mosi_pin: GPIO17
miso_pin: GPIO15
cs_pin: GPIO12
adc128s102:
cs_pin: 12
id: adc128s102_adc
sensor:
- platform: adc128s102
id: adc128s102_channel_0
channel: 0
<<: !include common.yaml

View File

@ -1,14 +1,7 @@
spi:
- id: spi_adc128s102
clk_pin: 6
mosi_pin: 7
miso_pin: 5
substitutions:
clk_pin: GPIO6
mosi_pin: GPIO7
miso_pin: GPIO5
cs_pin: GPIO2
adc128s102:
cs_pin: 8
id: adc128s102_adc
sensor:
- platform: adc128s102
id: adc128s102_channel_0
channel: 0
<<: !include common.yaml

View File

@ -1,14 +1,7 @@
spi:
- id: spi_adc128s102
clk_pin: 6
mosi_pin: 7
miso_pin: 5
substitutions:
clk_pin: GPIO6
mosi_pin: GPIO7
miso_pin: GPIO5
cs_pin: GPIO2
adc128s102:
cs_pin: 8
id: adc128s102_adc
sensor:
- platform: adc128s102
id: adc128s102_channel_0
channel: 0
<<: !include common.yaml

View File

@ -1,14 +1,7 @@
spi:
- id: spi_adc128s102
clk_pin: 16
mosi_pin: 17
miso_pin: 15
substitutions:
clk_pin: GPIO16
mosi_pin: GPIO17
miso_pin: GPIO15
cs_pin: GPIO12
adc128s102:
cs_pin: 12
id: adc128s102_adc
sensor:
- platform: adc128s102
id: adc128s102_channel_0
channel: 0
<<: !include common.yaml

View File

@ -1,14 +1,7 @@
spi:
- id: spi_adc128s102
clk_pin: 14
mosi_pin: 13
miso_pin: 12
substitutions:
clk_pin: GPIO14
mosi_pin: GPIO13
miso_pin: GPIO12
cs_pin: GPIO15
adc128s102:
cs_pin: 15
id: adc128s102_adc
sensor:
- platform: adc128s102
id: adc128s102_channel_0
channel: 0
<<: !include common.yaml

View File

@ -1,14 +1,7 @@
spi:
- id: spi_adc128s102
clk_pin: 2
mosi_pin: 3
miso_pin: 4
substitutions:
clk_pin: GPIO2
mosi_pin: GPIO3
miso_pin: GPIO4
cs_pin: GPIO5
adc128s102:
cs_pin: 5
id: adc128s102_adc
sensor:
- platform: adc128s102
id: adc128s102_channel_0
channel: 0
<<: !include common.yaml

View File

@ -5,7 +5,7 @@ light:
chipset: ws2812
rgb_order: GRB
num_leds: 256
pin: 2
pin: ${pin}
rmt_channel: 0
display:

View File

@ -3,7 +3,7 @@ light:
id: led_matrix_32x8
name: led_matrix_32x8
chipset: WS2812B
pin: 2
pin: ${pin}
num_leds: 256
rgb_order: GRB
default_transition_length: 0s

View File

@ -5,7 +5,7 @@ light:
chipset: ws2812
rgb_order: GRB
num_leds: 256
pin: 2
pin: ${pin}
display:
- platform: addressable_light

Some files were not shown because too many files have changed in this diff