diff --git a/CODEOWNERS b/CODEOWNERS
index 9fbf191be0..d26e153c1a 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -277,6 +277,7 @@ esphome/components/mics_4514/* @jesserockz
 esphome/components/midea/* @dudanov
 esphome/components/midea_ir/* @dudanov
 esphome/components/mitsubishi/* @RubyBailey
+esphome/components/mixer/speaker/* @kahrendt
 esphome/components/mlx90393/* @functionpointer
 esphome/components/mlx90614/* @jesserockz
 esphome/components/mmc5603/* @benhoff
diff --git a/esphome/components/mixer/__init__.py b/esphome/components/mixer/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/esphome/components/mixer/speaker/__init__.py b/esphome/components/mixer/speaker/__init__.py
new file mode 100644
index 0000000000..a451f2b7b4
--- /dev/null
+++ b/esphome/components/mixer/speaker/__init__.py
@@ -0,0 +1,172 @@
+from esphome import automation
+import esphome.codegen as cg
+from esphome.components import audio, esp32, speaker
+import esphome.config_validation as cv
+from esphome.const import (
+    CONF_BITS_PER_SAMPLE,
+    CONF_BUFFER_DURATION,
+    CONF_DURATION,
+    CONF_ID,
+    CONF_NEVER,
+    CONF_NUM_CHANNELS,
+    CONF_OUTPUT_SPEAKER,
+    CONF_SAMPLE_RATE,
+    CONF_TASK_STACK_IN_PSRAM,
+    CONF_TIMEOUT,
+    PLATFORM_ESP32,
+)
+from esphome.core.entity_helpers import inherit_property_from
+import esphome.final_validate as fv
+
+AUTO_LOAD = ["audio"]
+CODEOWNERS = ["@kahrendt"]
+
+mixer_speaker_ns = cg.esphome_ns.namespace("mixer_speaker")  # C++ namespace of the classes below
+MixerSpeaker = mixer_speaker_ns.class_("MixerSpeaker", cg.Component)  # the mixer component
+SourceSpeaker = mixer_speaker_ns.class_("SourceSpeaker", cg.Component, speaker.Speaker)
+
+CONF_DECIBEL_REDUCTION = "decibel_reduction"  # option of the apply_ducking action
+CONF_QUEUE_MODE = "queue_mode"  # option of the mixer component
+CONF_SOURCE_SPEAKERS = "source_speakers"  # list of source speaker configs
+
+DuckingApplyAction = mixer_speaker_ns.class_(  # C++ action behind mixer_speaker.apply_ducking
+    "DuckingApplyAction", automation.Action, cg.Parented.template(SourceSpeaker)
+)
+
+
+SOURCE_SPEAKER_SCHEMA = speaker.SPEAKER_SCHEMA.extend(  # options of one source speaker
+    {
+        cv.GenerateID(): cv.declare_id(SourceSpeaker),
+        cv.Optional(  # passed to set_buffer_duration() in to_code
+            CONF_BUFFER_DURATION, default="100ms"
+        ): cv.positive_time_period_milliseconds,
+        cv.Optional(CONF_TIMEOUT, default="500ms"): cv.Any(  # a duration or "never"
+            cv.positive_time_period_milliseconds,
+            cv.one_of(CONF_NEVER, lower=True),  # to_code skips set_timeout() for "never"
+        ),
+        cv.Optional(CONF_BITS_PER_SAMPLE, default=16): cv.int_range(16, 16),  # 16 only
+    }
+)
+
+
+def _set_stream_limits(config):
+    audio.set_stream_limits(
+        min_bits_per_sample=16,
+        max_bits_per_sample=16,
+    )(config)
+
+    return config
+
+
+def _validate_source_speaker(config):
+    fconf = fv.full_config.get()
+
+    # Get ID for the output speaker and add it to the source speakrs config to easily inherit properties
+    path = fconf.get_path_for_id(config[CONF_ID])[:-3]
+    path.append(CONF_OUTPUT_SPEAKER)
+    output_speaker_id = fconf.get_config_for_path(path)
+    config[CONF_OUTPUT_SPEAKER] = output_speaker_id
+
+    inherit_property_from(CONF_NUM_CHANNELS, CONF_OUTPUT_SPEAKER)(config)
+    inherit_property_from(CONF_SAMPLE_RATE, CONF_OUTPUT_SPEAKER)(config)
+
+    audio.final_validate_audio_schema(
+        "mixer",
+        audio_device=CONF_OUTPUT_SPEAKER,
+        bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
+        channels=config.get(CONF_NUM_CHANNELS),
+        sample_rate=config.get(CONF_SAMPLE_RATE),
+    )(config)
+
+    return config
+
+
+CONFIG_SCHEMA = cv.All(  # schema of the mixer component itself
+    cv.Schema(
+        {
+            cv.GenerateID(): cv.declare_id(MixerSpeaker),
+            cv.Required(CONF_OUTPUT_SPEAKER): cv.use_id(speaker.Speaker),
+            cv.Required(CONF_SOURCE_SPEAKERS): cv.All(
+                cv.ensure_list(SOURCE_SPEAKER_SCHEMA),
+                cv.Length(min=2, max=8),  # between two and eight source speakers
+                [_set_stream_limits],  # applied to every source speaker entry
+            ),
+            cv.Optional(CONF_NUM_CHANNELS): cv.int_range(min=1, max=2),  # see FINAL_VALIDATE_SCHEMA
+            cv.Optional(CONF_QUEUE_MODE, default=False): cv.boolean,
+            cv.SplitDefault(CONF_TASK_STACK_IN_PSRAM, esp32_idf=False): cv.All(
+                cv.boolean, cv.only_with_esp_idf
+            ),
+        }
+    ),
+    cv.only_on([PLATFORM_ESP32]),
+)
+
+FINAL_VALIDATE_SCHEMA = cv.All(  # validators here consult the full config (fv.full_config)
+    cv.Schema(
+        {
+            cv.Optional(CONF_SOURCE_SPEAKERS): [_validate_source_speaker],  # run per source speaker
+        },
+        extra=cv.ALLOW_EXTRA,
+    ),
+    inherit_property_from(CONF_NUM_CHANNELS, CONF_OUTPUT_SPEAKER),  # mixer takes it from the output speaker
+)
+
+
+async def to_code(config):
+    var = cg.new_Pvariable(config[CONF_ID])
+    await cg.register_component(var, config)
+
+    spkr = await cg.get_variable(config[CONF_OUTPUT_SPEAKER])
+
+    cg.add(var.set_output_channels(config[CONF_NUM_CHANNELS]))
+    cg.add(var.set_output_speaker(spkr))
+    cg.add(var.set_queue_mode(config[CONF_QUEUE_MODE]))
+
+    if task_stack_in_psram := config.get(CONF_TASK_STACK_IN_PSRAM):
+        cg.add(var.set_task_stack_in_psram(task_stack_in_psram))
+        if task_stack_in_psram:
+            if config[CONF_TASK_STACK_IN_PSRAM]:
+                esp32.add_idf_sdkconfig_option(
+                    "CONFIG_SPIRAM_ALLOW_STACK_EXTERNAL_MEMORY", True
+                )
+
+    for speaker_config in config[CONF_SOURCE_SPEAKERS]:
+        source_speaker = cg.new_Pvariable(speaker_config[CONF_ID])
+
+        cg.add(source_speaker.set_buffer_duration(speaker_config[CONF_BUFFER_DURATION]))
+
+        if speaker_config[CONF_TIMEOUT] != CONF_NEVER:
+            cg.add(source_speaker.set_timeout(speaker_config[CONF_TIMEOUT]))
+
+        await cg.register_component(source_speaker, speaker_config)
+        await cg.register_parented(source_speaker, config[CONF_ID])
+        await speaker.register_speaker(source_speaker, speaker_config)
+
+        cg.add(var.add_source_speaker(source_speaker))
+
+
+@automation.register_action(
+    "mixer_speaker.apply_ducking",
+    DuckingApplyAction,
+    cv.Schema(
+        {
+            cv.GenerateID(): cv.use_id(SourceSpeaker),
+            cv.Required(CONF_DECIBEL_REDUCTION): cv.templatable(
+                cv.int_range(min=0, max=51)
+            ),
+            cv.Optional(CONF_DURATION, default="0.0s"): cv.templatable(
+                cv.positive_time_period_milliseconds
+            ),
+        }
+    ),
+)
+async def ducking_set_to_code(config, action_id, template_arg, args):
+    var = cg.new_Pvariable(action_id, template_arg)
+    await cg.register_parented(var, config[CONF_ID])
+    decibel_reduction = await cg.templatable(
+        config[CONF_DECIBEL_REDUCTION], args, cg.uint8
+    )
+    cg.add(var.set_decibel_reduction(decibel_reduction))
+    duration = await cg.templatable(config[CONF_DURATION], args, cg.uint32)
+    cg.add(var.set_duration(duration))
+    return var
diff --git a/esphome/components/mixer/speaker/automation.h b/esphome/components/mixer/speaker/automation.h
new file mode 100644
index 0000000000..b688fa2c1e
--- /dev/null
+++ b/esphome/components/mixer/speaker/automation.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "mixer_speaker.h"
+
+#ifdef USE_ESP32
+
+namespace esphome {
+namespace mixer_speaker {
+template<typename... Ts> class DuckingApplyAction : public Action<Ts...>, public Parented<SourceSpeaker> {
+  TEMPLATABLE_VALUE(uint8_t, decibel_reduction)  // dB reduction handed to the parent source speaker
+  TEMPLATABLE_VALUE(uint32_t, duration)          // transition duration handed to the parent source speaker
+  void play(Ts... x) override {                  // evaluates both values and forwards them to apply_ducking()
+    this->parent_->apply_ducking(this->decibel_reduction_.value(x...), this->duration_.value(x...));
+  }
+};
+}  // namespace mixer_speaker
+}  // namespace esphome
+
+#endif
diff --git a/esphome/components/mixer/speaker/mixer_speaker.cpp b/esphome/components/mixer/speaker/mixer_speaker.cpp
new file mode 100644
index 0000000000..60cff95eb2
--- /dev/null
+++ b/esphome/components/mixer/speaker/mixer_speaker.cpp
@@ -0,0 +1,624 @@
+#include "mixer_speaker.h"
+
+#ifdef USE_ESP32
+
+#include "esphome/core/hal.h"
+#include "esphome/core/helpers.h"
+#include "esphome/core/log.h"
+
+#include <algorithm>
+#include <cstring>
+
+namespace esphome {
+namespace mixer_speaker {
+
+static const UBaseType_t MIXER_TASK_PRIORITY = 10;  // priority passed to xTaskCreateStatic
+
+static const uint32_t TRANSFER_BUFFER_DURATION_MS = 50;  // audio duration used to size the transfer buffers
+static const uint32_t TASK_DELAY_MS = 25;                // mixer task's sink-write timeout and idle delay
+
+static const size_t TASK_STACK_SIZE = 4096;  // passed to the stack allocator and to xTaskCreateStatic
+
+static const int16_t MAX_AUDIO_SAMPLE_VALUE = INT16_MAX;  // upper clamp for mixed samples
+static const int16_t MIN_AUDIO_SAMPLE_VALUE = INT16_MIN;  // lower clamp for mixed samples
+
+static const char *const TAG = "speaker_mixer";
+
+// Gives the Q15 fixed point scaling factor to reduce by 0 dB, 1 dB, ..., 50 dB (51 entries, indexed by dB reduction)
+// dB to PCM scaling factor formula: floating_point_scale_factor = 2^(-db/6.014)
+// float to Q15 fixed point formula: q15_scale_factor = floating_point_scale_factor * 2^(15)
+static const std::vector<int16_t> DECIBEL_REDUCTION_TABLE = {
+    32767, 29201, 26022, 23189, 20665, 18415, 16410, 14624, 13032, 11613, 10349, 9222, 8218, 7324, 6527, 5816, 5183,
+    4619,  4116,  3668,  3269,  2913,  2596,  2313,  2061,  1837,  1637,  1459,  1300, 1158, 1032, 920,  820,  731,
+    651,   580,   517,   461,   411,   366,   326,   291,   259,   231,   206,   183,  163,  146,  130,  116,  103};
+
+enum MixerEventGroupBits : uint32_t {
+  COMMAND_STOP = (1 << 0),     // stops the mixer task
+  STATE_STARTING = (1 << 10),  // set by the mixer task when it begins executing
+  STATE_RUNNING = (1 << 11),   // set by the mixer task once its output transfer buffer is ready
+  STATE_STOPPING = (1 << 12),  // set by the mixer task after it leaves its main loop
+  STATE_STOPPED = (1 << 13),   // set by the mixer task just before it deletes itself
+  ERR_ESP_NO_MEM = (1 << 19),  // the mixer task failed to allocate its output transfer buffer
+  ALL_BITS = 0x00FFFFFF,       // All valid FreeRTOS event group bits
+};
+
+void SourceSpeaker::dump_config() {  // Logs this source speaker's buffer duration and idle timeout
+  ESP_LOGCONFIG(TAG, "Mixer Source Speaker");
+  ESP_LOGCONFIG(TAG, "  Buffer Duration: %" PRIu32 " ms", this->buffer_duration_ms_);
+  if (!this->timeout_ms_.has_value()) {
+    ESP_LOGCONFIG(TAG, "  Timeout: never");
+    return;
+  }
+  ESP_LOGCONFIG(TAG, "  Timeout: %" PRIu32 " ms", this->timeout_ms_.value());
+}
+
+void SourceSpeaker::setup() {  // Hooks into the output speaker's playback reports and forwards this source's share
+  this->parent_->get_output_speaker()->add_audio_output_callback(
+      [this](uint32_t new_playback_ms, uint32_t remainder_us, uint32_t pending_ms, uint32_t write_timestamp) {
+        uint32_t personal_playback_ms = std::min(new_playback_ms, this->pending_playback_ms_);
+        if (personal_playback_ms > 0) {  // Skip when nothing was played or this source has nothing pending
+          this->pending_playback_ms_ -= personal_playback_ms;
+          this->audio_output_callback_(personal_playback_ms, remainder_us, this->pending_playback_ms_, write_timestamp);
+        }
+      });
+}
+
+void SourceSpeaker::loop() {  // Advances this source speaker's state machine
+  switch (this->state_) {
+    case speaker::STATE_STARTING: {
+      esp_err_t err = this->start_();
+      if (err == ESP_OK) {
+        this->state_ = speaker::STATE_RUNNING;
+        this->stop_gracefully_ = false;
+        this->last_seen_data_ms_ = millis();  // Start of the idle timeout window
+        this->status_clear_error();
+      } else {
+        switch (err) {  // Report the failure reason as the component's error status
+          case ESP_ERR_NO_MEM:
+            this->status_set_error("Failed to start mixer: not enough memory");
+            break;
+          case ESP_ERR_NOT_SUPPORTED:
+            this->status_set_error("Failed to start mixer: unsupported bits per sample");
+            break;
+          case ESP_ERR_INVALID_ARG:
+            this->status_set_error("Failed to start mixer: audio stream isn't compatible with the other audio stream.");
+            break;
+          case ESP_ERR_INVALID_STATE:
+            this->status_set_error("Failed to start mixer: mixer task failed to start");
+            break;
+          default:
+            this->status_set_error("Failed to start mixer");
+            break;
+        }
+
+        this->state_ = speaker::STATE_STOPPING;  // The STOPPING case calls stop_(), releasing the transfer buffer
+      }
+      break;
+    }
+    case speaker::STATE_RUNNING:
+      if (!this->transfer_buffer_->has_buffered_data()) {  // Only consider stopping once no audio is buffered
+        if ((this->timeout_ms_.has_value() && ((millis() - this->last_seen_data_ms_) > this->timeout_ms_.value())) ||
+            this->stop_gracefully_) {
+          this->state_ = speaker::STATE_STOPPING;
+        }
+      }
+      break;
+    case speaker::STATE_STOPPING:
+      this->stop_();
+      this->stop_gracefully_ = false;
+      this->state_ = speaker::STATE_STOPPED;
+      break;
+    case speaker::STATE_STOPPED:
+      break;
+  }
+}
+
+size_t SourceSpeaker::play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) {
+  if (this->is_stopped()) {
+    this->start();  // Only requests the start; loop() performs it
+  }
+  size_t bytes_written = 0;
+  if (this->ring_buffer_.use_count() == 1) {  // Write only while the ring buffer has exactly one owner
+    std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_.lock();
+    bytes_written = temp_ring_buffer->write_without_replacement(data, length, ticks_to_wait);
+    if (bytes_written > 0) {
+      this->last_seen_data_ms_ = millis();  // New audio arrived, so restart the idle timeout window
+    }
+  }
+  return bytes_written;  // 0 when nothing could be written
+}
+
+void SourceSpeaker::start() { this->state_ = speaker::STATE_STARTING; }  // loop() performs the actual start
+
+esp_err_t SourceSpeaker::start_() {
+  // Allocates the transfer and ring buffers if needed, then starts the parent mixer (ESP_ERR_NO_MEM on failure)
+  const size_t ring_buffer_size = this->audio_stream_info_.ms_to_bytes(this->buffer_duration_ms_);
+  if (this->transfer_buffer_.use_count() == 0) {
+    this->transfer_buffer_ =
+        audio::AudioSourceTransferBuffer::create(this->audio_stream_info_.ms_to_bytes(TRANSFER_BUFFER_DURATION_MS));
+    if (this->transfer_buffer_ == nullptr) {
+      return ESP_ERR_NO_MEM;
+    }
+
+    // Lock the weak pointer so that a ring buffer which is still owned elsewhere gets reused; otherwise the new
+    // transfer buffer would be given an empty shared_ptr as its source
+    std::shared_ptr<RingBuffer> temp_ring_buffer = this->ring_buffer_.lock();
+    if (temp_ring_buffer == nullptr) {
+      temp_ring_buffer = RingBuffer::create(ring_buffer_size);
+      this->ring_buffer_ = temp_ring_buffer;
+    }
+    if (temp_ring_buffer == nullptr) {
+      return ESP_ERR_NO_MEM;
+    }
+    this->transfer_buffer_->set_source(temp_ring_buffer);
+  }
+
+  return this->parent_->start(this->audio_stream_info_);
+}
+
+void SourceSpeaker::stop() {  // Requests a stop; loop() calls stop_() on its next pass
+  if (this->state_ != speaker::STATE_STOPPED) {
+    this->state_ = speaker::STATE_STOPPING;
+  }
+}
+
+void SourceSpeaker::stop_() {
+  this->transfer_buffer_.reset();  // releases this speaker's ownership; freed once no other shared owner remains
+}
+
+void SourceSpeaker::finish() { this->stop_gracefully_ = true; }  // loop() stops once no audio is buffered
+
+bool SourceSpeaker::has_buffered_data() const {  // False whenever no transfer buffer is currently allocated
+  return (this->transfer_buffer_ != nullptr) && this->transfer_buffer_->has_buffered_data();
+}
+
+void SourceSpeaker::set_mute_state(bool mute_state) {  // Stored locally and passed through to the output speaker
+  this->mute_state_ = mute_state;
+  this->parent_->get_output_speaker()->set_mute_state(mute_state);
+}
+
+void SourceSpeaker::set_volume(float volume) {  // Stored locally and passed through to the output speaker
+  this->volume_ = volume;
+  this->parent_->get_output_speaker()->set_volume(volume);
+}
+
+size_t SourceSpeaker::process_data_from_source(TickType_t ticks_to_wait) {
+  // Pulls new audio from the source into the transfer buffer and ducks only the newly transferred samples.
+  // Returns the number of bytes transferred, or 0 if no transfer buffer is allocated.
+  if (this->transfer_buffer_.use_count() == 0) {
+    return 0;
+  }
+
+  // Bytes already in the transfer buffer were ducked on an earlier call, so remember where the new data begins
+  const size_t already_ducked_bytes = this->transfer_buffer_->available();
+  const size_t new_bytes = this->transfer_buffer_->transfer_data_from_source(ticks_to_wait);
+  const uint32_t new_samples = this->audio_stream_info_.bytes_to_samples(new_bytes);
+  if (new_samples == 0) {
+    return new_bytes;
+  }
+
+  auto *new_data_start = this->transfer_buffer_->get_buffer_start() + already_ducked_bytes;
+  duck_samples(reinterpret_cast<int16_t *>(new_data_start), new_samples, &this->current_ducking_db_reduction_,
+               &this->ducking_transition_samples_remaining_, this->samples_per_ducking_step_,
+               this->db_change_per_ducking_step_);
+  return new_bytes;
+}
+
+void SourceSpeaker::apply_ducking(uint8_t decibel_reduction, uint32_t duration) {
+  // Sets a new target dB reduction, reached in 1 dB steps spread evenly over the samples that `duration` covers
+  if (this->target_ducking_db_reduction_ != decibel_reduction) {
+    // A transition that is still in progress is treated as complete before the new one begins
+    this->current_ducking_db_reduction_ = this->target_ducking_db_reduction_;
+    this->target_ducking_db_reduction_ = decibel_reduction;
+    // The first step is applied immediately below, so one fewer step is spread over the transition
+    uint8_t total_ducking_steps = 0;
+    if (this->target_ducking_db_reduction_ > this->current_ducking_db_reduction_) {
+      // The dB reduction level is increasing (which results in quieter audio)
+      total_ducking_steps = this->target_ducking_db_reduction_ - this->current_ducking_db_reduction_ - 1;
+      this->db_change_per_ducking_step_ = 1;
+    } else {
+      // The dB reduction level is decreasing (which results in louder audio)
+      total_ducking_steps = this->current_ducking_db_reduction_ - this->target_ducking_db_reduction_ - 1;
+      this->db_change_per_ducking_step_ = -1;
+    }
+    // If the duration is too short for even one sample per step, jump to the target instead of stalling part-way
+    const uint32_t transition_samples = (duration > 0) ? this->audio_stream_info_.ms_to_samples(duration) : 0;
+    const uint32_t samples_per_step = (total_ducking_steps > 0) ? (transition_samples / total_ducking_steps) : 0;
+    if (samples_per_step > 0) {
+      this->samples_per_ducking_step_ = samples_per_step;
+      this->ducking_transition_samples_remaining_ = samples_per_step * total_ducking_steps;  // Drops the remainder
+      this->current_ducking_db_reduction_ += this->db_change_per_ducking_step_;
+    } else {
+      this->ducking_transition_samples_remaining_ = 0;
+      this->current_ducking_db_reduction_ = this->target_ducking_db_reduction_;
+    }
+  }
+}
+
+void SourceSpeaker::duck_samples(int16_t *input_buffer, uint32_t input_samples_to_duck,
+                                 int8_t *current_ducking_db_reduction, uint32_t *ducking_transition_samples_remaining,
+                                 uint32_t samples_per_ducking_step, int8_t db_change_per_ducking_step) {
+  if (*ducking_transition_samples_remaining > 0) {
+    // Ducking level is still transitioning: the samples are scaled in place, and the dB reduction advances by
+    // db_change_per_ducking_step each time samples_per_ducking_step samples have been processed
+    // Takes the ceiling of input_samples_to_duck/samples_per_ducking_step
+    uint32_t ducking_steps_in_batch =
+        input_samples_to_duck / samples_per_ducking_step + (input_samples_to_duck % samples_per_ducking_step != 0);
+    // NOTE(review): samples_per_ducking_step is a divisor here; assumed non-zero while transition samples remain
+    for (uint32_t i = 0; i < ducking_steps_in_batch; ++i) {
+      uint32_t samples_left_in_step = *ducking_transition_samples_remaining % samples_per_ducking_step;
+
+      if (samples_left_in_step == 0) {  // A remainder of zero means a full step is still ahead
+        samples_left_in_step = samples_per_ducking_step;
+      }
+
+      uint32_t samples_to_duck = std::min(input_samples_to_duck, samples_left_in_step);
+      samples_to_duck = std::min(samples_to_duck, *ducking_transition_samples_remaining);
+
+      // Ensure we only point to valid index in the Q15 scaling factor table
+      uint8_t safe_db_reduction_index =
+          clamp<uint8_t>(*current_ducking_db_reduction, 0, DECIBEL_REDUCTION_TABLE.size() - 1);
+      int16_t q15_scale_factor = DECIBEL_REDUCTION_TABLE[safe_db_reduction_index];
+
+      audio::scale_audio_samples(input_buffer, input_buffer, q15_scale_factor, samples_to_duck);
+
+      if (samples_left_in_step - samples_to_duck == 0) {
+        // After scaling the current samples, we are ready to transition to the next step
+        *current_ducking_db_reduction += db_change_per_ducking_step;
+      }
+
+      input_buffer += samples_to_duck;  // Advance past the samples that were just scaled
+      *ducking_transition_samples_remaining -= samples_to_duck;
+      input_samples_to_duck -= samples_to_duck;
+    }
+  }
+
+  if ((*current_ducking_db_reduction > 0) && (input_samples_to_duck > 0)) {
+    // Audio is ducked, but its not in the middle of a transition step
+
+    uint8_t safe_db_reduction_index =
+        clamp<uint8_t>(*current_ducking_db_reduction, 0, DECIBEL_REDUCTION_TABLE.size() - 1);
+    int16_t q15_scale_factor = DECIBEL_REDUCTION_TABLE[safe_db_reduction_index];
+
+    audio::scale_audio_samples(input_buffer, input_buffer, q15_scale_factor, input_samples_to_duck);
+  }
+}
+
+void MixerSpeaker::dump_config() {  // Logs the mixer's configured number of output channels
+  ESP_LOGCONFIG(TAG, "Speaker Mixer:");
+  ESP_LOGCONFIG(TAG, "  Number of output channels: %u", this->output_channels_);
+}
+
+void MixerSpeaker::setup() {
+  // The event group carries the stop command to, and the state reports from, the mixer task
+  this->event_group_ = xEventGroupCreate();
+  if (this->event_group_ != nullptr) {
+    return;
+  }
+  ESP_LOGE(TAG, "Failed to create event group");
+  this->mark_failed();
+}
+
+void MixerSpeaker::loop() {  // Handles the mixer task's state reports and stops the task when all sources are idle
+  uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);
+
+  if (event_group_bits & MixerEventGroupBits::STATE_STARTING) {
+    ESP_LOGD(TAG, "Starting speaker mixer");
+    xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_STARTING);
+  }
+  if (event_group_bits & MixerEventGroupBits::ERR_ESP_NO_MEM) {
+    this->status_set_error("Failed to allocate the mixer's internal buffer");
+    xEventGroupClearBits(this->event_group_, MixerEventGroupBits::ERR_ESP_NO_MEM);
+  }
+  if (event_group_bits & MixerEventGroupBits::STATE_RUNNING) {
+    ESP_LOGD(TAG, "Started speaker mixer");
+    this->status_clear_error();
+    xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_RUNNING);
+  }
+  if (event_group_bits & MixerEventGroupBits::STATE_STOPPING) {
+    ESP_LOGD(TAG, "Stopping speaker mixer");
+    xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_STOPPING);
+  }
+  if (event_group_bits & MixerEventGroupBits::STATE_STOPPED) {
+    if (this->delete_task_() == ESP_OK) {  // Bits stay set until the task's resources have been released
+      xEventGroupClearBits(this->event_group_, MixerEventGroupBits::ALL_BITS);
+    }
+  }
+
+  if (this->task_handle_ != nullptr) {  // A task exists; stop it once every source speaker is stopped
+    bool all_stopped = true;
+
+    for (auto &speaker : this->source_speakers_) {
+      all_stopped &= speaker->is_stopped();
+    }
+
+    if (all_stopped) {
+      this->stop();
+    }
+  }
+}
+
+esp_err_t MixerSpeaker::start(audio::AudioStreamInfo &stream_info) {
+  if (!this->audio_stream_info_.has_value()) {  // No output format yet: derive it from this stream
+    if (stream_info.get_bits_per_sample() != 16) {
+      // Audio streams that don't have 16 bits per sample are not supported
+      return ESP_ERR_NOT_SUPPORTED;
+    }
+
+    this->audio_stream_info_ = audio::AudioStreamInfo(stream_info.get_bits_per_sample(), this->output_channels_,
+                                                      stream_info.get_sample_rate());
+    this->output_speaker_->set_audio_stream_info(this->audio_stream_info_.value());
+  } else {
+    if (!this->queue_mode_ && (stream_info.get_sample_rate() != this->audio_stream_info_.value().get_sample_rate())) {
+      // The two audio streams must have the same sample rate to mix properly if not in queue mode
+      return ESP_ERR_INVALID_ARG;
+    }
+  }
+
+  return this->start_task_();  // Creates the mixer task unless a task handle already exists
+}
+
+esp_err_t MixerSpeaker::start_task_() {
+  // Allocates the task stack on first use (in external RAM when configured) and creates the mixer task if needed.
+  // Returns ESP_ERR_NO_MEM if the stack can't be allocated and ESP_ERR_INVALID_STATE if task creation fails.
+  if (this->task_stack_buffer_ == nullptr) {
+    const auto stack_location = this->task_stack_in_psram_ ? RAMAllocator<StackType_t>::ALLOC_EXTERNAL
+                                                           : RAMAllocator<StackType_t>::ALLOC_INTERNAL;
+    RAMAllocator<StackType_t> stack_allocator(stack_location);
+    this->task_stack_buffer_ = stack_allocator.allocate(TASK_STACK_SIZE);
+  }
+
+  if (this->task_stack_buffer_ == nullptr) {
+    return ESP_ERR_NO_MEM;
+  }
+
+  if (this->task_handle_ != nullptr) {
+    return ESP_OK;  // A task handle already exists, nothing to create
+  }
+
+  this->task_handle_ = xTaskCreateStatic(audio_mixer_task, "mixer", TASK_STACK_SIZE, (void *) this,
+                                         MIXER_TASK_PRIORITY, this->task_stack_buffer_, &this->task_stack_);
+
+  if (this->task_handle_ == nullptr) {
+    return ESP_ERR_INVALID_STATE;
+  }
+  return ESP_OK;
+}
+
+esp_err_t MixerSpeaker::delete_task_() {
+  // Clears the task handle and frees the task stack once the mixer task has cleared task_created_
+  if (this->task_created_) {
+    return ESP_ERR_INVALID_STATE;  // The task has not finished yet
+  }
+
+  this->task_handle_ = nullptr;
+
+  if (this->task_stack_buffer_ == nullptr) {
+    return ESP_OK;
+  }
+
+  // Free the stack with the same allocator flag that start_task_() used to allocate it
+  const auto stack_location = this->task_stack_in_psram_ ? RAMAllocator<StackType_t>::ALLOC_EXTERNAL
+                                                         : RAMAllocator<StackType_t>::ALLOC_INTERNAL;
+  RAMAllocator<StackType_t> stack_allocator(stack_location);
+  stack_allocator.deallocate(this->task_stack_buffer_, TASK_STACK_SIZE);
+  this->task_stack_buffer_ = nullptr;
+
+  return ESP_OK;
+}
+
+void MixerSpeaker::stop() { xEventGroupSetBits(this->event_group_, MixerEventGroupBits::COMMAND_STOP); }  // Stops task
+
+void MixerSpeaker::copy_frames(const int16_t *input_buffer, audio::AudioStreamInfo input_stream_info,
+                               int16_t *output_buffer, audio::AudioStreamInfo output_stream_info,
+                               uint32_t frames_to_transfer) {
+  // Copies frames from input to output, repeating the last input channel or dropping extra channels as needed
+  const uint8_t in_channels = input_stream_info.get_channels();
+  const uint8_t out_channels = output_stream_info.get_channels();
+
+  if (in_channels == out_channels) {
+    // Identical channel counts, so the frames can be copied as one contiguous block
+    memcpy(output_buffer, input_buffer, input_stream_info.frames_to_bytes(frames_to_transfer));
+    return;
+  }
+
+  const uint8_t last_in_channel = in_channels - 1;
+  const int16_t *in_frame = input_buffer;
+  int16_t *out_sample = output_buffer;
+  for (uint32_t frame = 0; frame < frames_to_transfer; ++frame, in_frame += in_channels) {
+    for (uint8_t channel = 0; channel < out_channels; ++channel) {
+      *out_sample++ = in_frame[std::min(channel, last_in_channel)];
+    }
+  }
+}
+
+void MixerSpeaker::mix_audio_samples(const int16_t *primary_buffer, audio::AudioStreamInfo primary_stream_info,
+                                     const int16_t *secondary_buffer, audio::AudioStreamInfo secondary_stream_info,
+                                     int16_t *output_buffer, audio::AudioStreamInfo output_stream_info,
+                                     uint32_t frames_to_mix) {
+  // Adds the primary and secondary streams sample by sample into the output layout, clamping to the int16 range.
+  // A stream with fewer channels than the output repeats its last channel; its extra channels are ignored.
+  const uint8_t primary_channels = primary_stream_info.get_channels();
+  const uint8_t secondary_channels = secondary_stream_info.get_channels();
+  const uint8_t output_channels = output_stream_info.get_channels();
+
+  const uint8_t last_primary_channel = primary_channels - 1;
+  const uint8_t last_secondary_channel = secondary_channels - 1;
+
+  int16_t *mixed = output_buffer;
+
+  for (uint32_t frame = 0; frame < frames_to_mix; ++frame) {
+    const int16_t *primary_frame = primary_buffer + frame * primary_channels;
+    const int16_t *secondary_frame = secondary_buffer + frame * secondary_channels;
+
+    for (uint8_t channel = 0; channel < output_channels; ++channel) {
+      const int32_t sum = static_cast<int32_t>(primary_frame[std::min(channel, last_primary_channel)]) +
+                          static_cast<int32_t>(secondary_frame[std::min(channel, last_secondary_channel)]);
+      // Both inputs are read before the write, so the output may alias an input with the same channel count
+      *mixed++ = static_cast<int16_t>(clamp<int32_t>(sum, MIN_AUDIO_SAMPLE_VALUE, MAX_AUDIO_SAMPLE_VALUE));
+    }
+  }
+}
+
+void MixerSpeaker::audio_mixer_task(void *params) {
+  MixerSpeaker *this_mixer = (MixerSpeaker *) params;
+  // Task body: moves audio from the source speakers' transfer buffers to the output speaker until COMMAND_STOP is set
+  xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STARTING);
+
+  this_mixer->task_created_ = true;
+
+  std::unique_ptr<audio::AudioSinkTransferBuffer> output_transfer_buffer = audio::AudioSinkTransferBuffer::create(
+      this_mixer->audio_stream_info_.value().ms_to_bytes(TRANSFER_BUFFER_DURATION_MS));
+
+  if (output_transfer_buffer == nullptr) {
+    xEventGroupSetBits(this_mixer->event_group_,
+                       MixerEventGroupBits::STATE_STOPPED | MixerEventGroupBits::ERR_ESP_NO_MEM);
+
+    this_mixer->task_created_ = false;
+    vTaskDelete(nullptr);  // Deletes the calling task
+  }
+
+  output_transfer_buffer->set_sink(this_mixer->output_speaker_);
+
+  xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_RUNNING);
+
+  bool sent_finished = false;
+
+  while (true) {
+    uint32_t event_group_bits = xEventGroupGetBits(this_mixer->event_group_);
+    if (event_group_bits & MixerEventGroupBits::COMMAND_STOP) {
+      break;
+    }
+
+    output_transfer_buffer->transfer_data_to_sink(pdMS_TO_TICKS(TASK_DELAY_MS));
+
+    const uint32_t output_frames_free =
+        this_mixer->audio_stream_info_.value().bytes_to_frames(output_transfer_buffer->free());
+
+    std::vector<SourceSpeaker *> speakers_with_data;
+    std::vector<std::shared_ptr<audio::AudioSourceTransferBuffer>> transfer_buffers_with_data;
+
+    for (auto &speaker : this_mixer->source_speakers_) {
+      if (speaker->get_transfer_buffer().use_count() > 0) {
+        std::shared_ptr<audio::AudioSourceTransferBuffer> transfer_buffer = speaker->get_transfer_buffer().lock();
+        speaker->process_data_from_source(0);  // Transfers and ducks audio from source ring buffers
+
+        if ((transfer_buffer->available() > 0) && !speaker->get_pause_state()) {
+          // Store the locked transfer buffers in their own vector to avoid releasing ownership until after the loop
+          transfer_buffers_with_data.push_back(transfer_buffer);
+          speakers_with_data.push_back(speaker);
+        }
+      }
+    }
+
+    if (transfer_buffers_with_data.empty()) {
+      // No audio available for transferring, block task temporarily
+      delay(TASK_DELAY_MS);
+      continue;
+    }
+
+    uint32_t frames_to_mix = output_frames_free;  // Upper bound; reduced below to what the sources can supply
+
+    if ((transfer_buffers_with_data.size() == 1) || this_mixer->queue_mode_) {
+      // Only one speaker has audio data, just copy samples over
+      // (in queue mode, only the first speaker in the list that has data is used)
+      audio::AudioStreamInfo active_stream_info = speakers_with_data[0]->get_audio_stream_info();
+
+      if (active_stream_info.get_sample_rate() ==
+          this_mixer->output_speaker_->get_audio_stream_info().get_sample_rate()) {
+        // Speaker's sample rate matches the output speaker's, copy directly
+
+        const uint32_t frames_available_in_buffer =
+            active_stream_info.bytes_to_frames(transfer_buffers_with_data[0]->available());
+        frames_to_mix = std::min(frames_to_mix, frames_available_in_buffer);
+        copy_frames(reinterpret_cast<int16_t *>(transfer_buffers_with_data[0]->get_buffer_start()), active_stream_info,
+                    reinterpret_cast<int16_t *>(output_transfer_buffer->get_buffer_end()),
+                    this_mixer->audio_stream_info_.value(), frames_to_mix);
+
+        // Update source speaker buffer length
+        transfer_buffers_with_data[0]->decrease_buffer_length(active_stream_info.frames_to_bytes(frames_to_mix));
+        speakers_with_data[0]->accumulated_frames_read_ += frames_to_mix;
+
+        // Add new audio duration to the source speaker pending playback
+        speakers_with_data[0]->pending_playback_ms_ +=
+            active_stream_info.frames_to_milliseconds_with_remainder(&speakers_with_data[0]->accumulated_frames_read_);
+
+        // Update output transfer buffer length
+        output_transfer_buffer->increase_buffer_length(
+            this_mixer->audio_stream_info_.value().frames_to_bytes(frames_to_mix));
+      } else {
+        // Speaker's stream info doesn't match the output speaker's, so it's a new source speaker
+        if (!this_mixer->output_speaker_->is_stopped()) {
+          if (!sent_finished) {
+            this_mixer->output_speaker_->finish();
+            sent_finished = true;  // Avoid repeatedly sending the finish command
+          }
+        } else {
+          // Speaker has finished writing the current audio, update the stream information and restart the speaker
+          this_mixer->audio_stream_info_ =
+              audio::AudioStreamInfo(active_stream_info.get_bits_per_sample(), this_mixer->output_channels_,
+                                     active_stream_info.get_sample_rate());
+          this_mixer->output_speaker_->set_audio_stream_info(this_mixer->audio_stream_info_.value());
+          this_mixer->output_speaker_->start();
+          sent_finished = false;
+        }
+      }
+    } else {
+      // Determine how many frames to mix
+      for (int i = 0; i < transfer_buffers_with_data.size(); ++i) {
+        const uint32_t frames_available_in_buffer =
+            speakers_with_data[i]->get_audio_stream_info().bytes_to_frames(transfer_buffers_with_data[i]->available());
+        frames_to_mix = std::min(frames_to_mix, frames_available_in_buffer);
+      }
+      int16_t *primary_buffer = reinterpret_cast<int16_t *>(transfer_buffers_with_data[0]->get_buffer_start());
+      audio::AudioStreamInfo primary_stream_info = speakers_with_data[0]->get_audio_stream_info();
+
+      // Mix two streams together
+      for (int i = 1; i < transfer_buffers_with_data.size(); ++i) {
+        mix_audio_samples(primary_buffer, primary_stream_info,
+                          reinterpret_cast<int16_t *>(transfer_buffers_with_data[i]->get_buffer_start()),
+                          speakers_with_data[i]->get_audio_stream_info(),
+                          reinterpret_cast<int16_t *>(output_transfer_buffer->get_buffer_end()),
+                          this_mixer->audio_stream_info_.value(), frames_to_mix);
+        // NOTE(review): accumulated_frames_read_ is only advanced in the loop below; this update looks redundant
+        speakers_with_data[i]->pending_playback_ms_ +=
+            speakers_with_data[i]->get_audio_stream_info().frames_to_milliseconds_with_remainder(
+                &speakers_with_data[i]->accumulated_frames_read_);
+
+        if (i != transfer_buffers_with_data.size() - 1) {
+          // Need to mix more streams together, point primary buffer and stream info to the already mixed output
+          primary_buffer = reinterpret_cast<int16_t *>(output_transfer_buffer->get_buffer_end());
+          primary_stream_info = this_mixer->audio_stream_info_.value();
+        }
+      }
+
+      // Update source transfer buffer lengths and add new audio durations to the source speaker pending playbacks
+      for (int i = 0; i < transfer_buffers_with_data.size(); ++i) {
+        transfer_buffers_with_data[i]->decrease_buffer_length(
+            speakers_with_data[i]->get_audio_stream_info().frames_to_bytes(frames_to_mix));
+        speakers_with_data[i]->accumulated_frames_read_ += frames_to_mix;
+
+        speakers_with_data[i]->pending_playback_ms_ +=
+            speakers_with_data[i]->get_audio_stream_info().frames_to_milliseconds_with_remainder(
+                &speakers_with_data[i]->accumulated_frames_read_);
+      }
+
+      // Update output transfer buffer length
+      output_transfer_buffer->increase_buffer_length(
+          this_mixer->audio_stream_info_.value().frames_to_bytes(frames_to_mix));
+    }
+  }
+
+  xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STOPPING);
+
+  output_transfer_buffer.reset();  // Free the output buffer before reporting that the task has stopped
+
+  xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STOPPED);
+  this_mixer->task_created_ = false;  // Lets delete_task_() release the task handle and stack
+  vTaskDelete(nullptr);
+}
+
+}  // namespace mixer_speaker
+}  // namespace esphome
+
+#endif
diff --git a/esphome/components/mixer/speaker/mixer_speaker.h b/esphome/components/mixer/speaker/mixer_speaker.h
new file mode 100644
index 0000000000..b2cb3e1e39
--- /dev/null
+++ b/esphome/components/mixer/speaker/mixer_speaker.h
@@ -0,0 +1,207 @@
+#pragma once
+
+#ifdef USE_ESP32
+
+#include "esphome/components/audio/audio.h"
+#include "esphome/components/audio/audio_transfer_buffer.h"
+#include "esphome/components/speaker/speaker.h"
+
+#include "esphome/core/component.h"
+
+#include <freertos/event_groups.h>
+#include <freertos/FreeRTOS.h>
+
+namespace esphome {
+namespace mixer_speaker {
+
+/* Classes for mixing several source speaker audio streams and writing it to another speaker component.
+ *  - Volume controls are passed through to the output speaker
+ *  - Directly handles pausing at the SourceSpeaker level; pause state is not passed through to the output speaker.
+ *  - Audio sent to the SourceSpeakers must have 16 bits per sample.
+ *  - Audio sent to the SourceSpeaker can have any number of channels. They are duplicated or ignored as needed to match
+ *    the number of channels required for the output speaker.
+ *  - In queue mode, the audio sent to the SourceSpeakers can have different sample rates.
+ *  - In non-queue mode, the audio sent to the SourceSpeakers must have the same sample rates.
+ *  - SourceSpeaker has an internal ring buffer. It also allocates a shared_ptr for an AudioTransferBuffer object.
+ *  - Audio Data Flow:
+ *      - Audio data played on a SourceSpeaker first writes to its internal ring buffer.
+ *      - MixerSpeaker task temporarily takes shared ownership of each SourceSpeaker's AudioTransferBuffer.
+ *      - MixerSpeaker calls SourceSpeaker's `process_data_from_source`, which transfers audio from the SourceSpeaker's
+ *        ring buffer to its AudioTransferBuffer. Audio ducking is applied at this step.
+ *      - In queue mode, MixerSpeaker prioritizes the earliest configured SourceSpeaker with audio data. Audio data is
+ *        sent to the output speaker.
+ *      - In non-queue mode, MixerSpeaker adds all the audio data in each SourceSpeaker into one stream that is written
+ *        to the output speaker.
+ */
+
+class MixerSpeaker;
+
+class SourceSpeaker : public speaker::Speaker, public Component {
+ public:
+  void dump_config() override;
+  void setup() override;
+  void loop() override;
+
+  size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) override;
+  size_t play(const uint8_t *data, size_t length) override { return this->play(data, length, 0); }
+
+  void start() override;
+  void stop() override;
+  void finish() override;
+
+  bool has_buffered_data() const override;
+
+  /// @brief Mute state changes are passed to the parent's output speaker
+  void set_mute_state(bool mute_state) override;
+
+  /// @brief Volume state changes are passed to the parent's output speaker
+  void set_volume(float volume) override;
+
+  void set_pause_state(bool pause_state) override { this->pause_state_ = pause_state; }
+  bool get_pause_state() const override { return this->pause_state_; }
+
+  /// @brief Transfers audio from the ring buffer into the transfer buffer. Ducks audio while transferring.
+  /// @param ticks_to_wait FreeRTOS ticks to wait while waiting to read from the ring buffer.
+  /// @return Number of bytes transferred from the ring buffer.
+  size_t process_data_from_source(TickType_t ticks_to_wait);
+
+  /// @brief Sets the ducking level for the source speaker.
+  /// @param decibel_reduction (uint8_t) The dB reduction level. For example, 0 is no change, 10 is a reduction by 10 dB
+  /// @param duration (uint32_t) The number of milliseconds to transition from the current level to the new level
+  void apply_ducking(uint8_t decibel_reduction, uint32_t duration);
+
+  void set_buffer_duration(uint32_t buffer_duration_ms) { this->buffer_duration_ms_ = buffer_duration_ms; }
+  void set_parent(MixerSpeaker *parent) { this->parent_ = parent; }
+  void set_timeout(uint32_t ms) { this->timeout_ms_ = ms; }
+
+  std::weak_ptr<audio::AudioSourceTransferBuffer> get_transfer_buffer() { return this->transfer_buffer_; }
+
+ protected:
+  friend class MixerSpeaker;
+  esp_err_t start_();
+  void stop_();
+
+  /// @brief Ducks audio samples by a specified amount. When changing the ducking amount, it can transition gradually
+  /// over a specified number of samples.
+  /// @param input_buffer buffer with audio samples to be ducked in place
+  /// @param input_samples_to_duck number of samples to process in ``input_buffer``
+  /// @param current_ducking_db_reduction pointer to the current dB reduction
+  /// @param ducking_transition_samples_remaining pointer to the total number of samples left before the
+  ///         transition is finished
+  /// @param samples_per_ducking_step total number of samples per ducking step for the transition
+  /// @param db_change_per_ducking_step the change in dB reduction per step
+  static void duck_samples(int16_t *input_buffer, uint32_t input_samples_to_duck, int8_t *current_ducking_db_reduction,
+                           uint32_t *ducking_transition_samples_remaining, uint32_t samples_per_ducking_step,
+                           int8_t db_change_per_ducking_step);
+
+  MixerSpeaker *parent_{nullptr};  // Assigned via set_parent(); defaulted so it is never indeterminate
+
+  std::shared_ptr<audio::AudioSourceTransferBuffer> transfer_buffer_;
+  std::weak_ptr<RingBuffer> ring_buffer_;
+
+  uint32_t buffer_duration_ms_{0};  // Assigned via set_buffer_duration(); defaulted so it is never indeterminate
+  uint32_t last_seen_data_ms_{0};
+  optional<uint32_t> timeout_ms_;
+  bool stop_gracefully_{false};
+
+  bool pause_state_{false};
+
+  int8_t target_ducking_db_reduction_{0};
+  int8_t current_ducking_db_reduction_{0};
+  int8_t db_change_per_ducking_step_{1};
+  uint32_t ducking_transition_samples_remaining_{0};
+  uint32_t samples_per_ducking_step_{0};
+
+  uint32_t accumulated_frames_read_{0};
+
+  uint32_t pending_playback_ms_{0};
+};
+
+class MixerSpeaker : public Component {
+ public:
+  void dump_config() override;
+  void setup() override;
+  void loop() override;
+
+  void add_source_speaker(SourceSpeaker *source_speaker) { this->source_speakers_.push_back(source_speaker); }
+
+  /// @brief Starts the mixer task. Called by a source speaker giving the current audio stream information
+  /// @param stream_info The calling source speaker's audio stream information
+  /// @return ESP_ERR_NOT_SUPPORTED if the incoming stream is incompatible due to unsupported bits per sample
+  ///         ESP_ERR_INVALID_ARG if the incoming stream is incompatible to be mixed with the other input audio stream
+  ///         ESP_ERR_NO_MEM if there isn't enough memory for the task's stack
+  ///         ESP_ERR_INVALID_STATE if the task fails to start
+  ///         ESP_OK if the incoming stream is compatible and the mixer task starts
+  esp_err_t start(audio::AudioStreamInfo &stream_info);
+
+  void stop();
+
+  void set_output_channels(uint8_t output_channels) { this->output_channels_ = output_channels; }
+  void set_output_speaker(speaker::Speaker *speaker) { this->output_speaker_ = speaker; }
+  void set_queue_mode(bool queue_mode) { this->queue_mode_ = queue_mode; }
+  void set_task_stack_in_psram(bool task_stack_in_psram) { this->task_stack_in_psram_ = task_stack_in_psram; }
+
+  speaker::Speaker *get_output_speaker() const { return this->output_speaker_; }
+
+ protected:
+  /// @brief Copies audio frames from the input buffer to the output buffer taking into account the number of channels
+  /// in each stream. If the output stream has more channels, the input samples are duplicated. If the output stream has
+  /// fewer channels, the extra channel input samples are dropped.
+  /// @param input_buffer (const int16_t *) samples buffer for the input stream
+  /// @param input_stream_info stream info for the input stream
+  /// @param output_buffer (int16_t *) buffer for the copied samples
+  /// @param output_stream_info stream info for the output buffer
+  /// @param frames_to_transfer number of frames (consisting of a sample for each channel) to copy from the input buffer
+  static void copy_frames(const int16_t *input_buffer, audio::AudioStreamInfo input_stream_info, int16_t *output_buffer,
+                          audio::AudioStreamInfo output_stream_info, uint32_t frames_to_transfer);
+
+  /// @brief Mixes the primary and secondary streams taking into account the number of channels in each stream. Primary
+  /// and secondary samples are duplicated or dropped as necessary to ensure the output stream has the configured number
+  /// of channels. Output samples are clamped to the corresponding int16 min or max values if the mixed sample
+  /// overflows.
+  /// @param primary_buffer (int16_t *) samples buffer for the primary stream
+  /// @param primary_stream_info stream info for the primary stream
+  /// @param secondary_buffer (int16_t *) samples buffer for secondary stream
+  /// @param secondary_stream_info stream info for the secondary stream
+  /// @param output_buffer (int16_t *) buffer for the mixed samples
+  /// @param output_stream_info stream info for the output buffer
+  /// @param frames_to_mix number of frames in the primary and secondary buffers to mix together
+  static void mix_audio_samples(const int16_t *primary_buffer, audio::AudioStreamInfo primary_stream_info,
+                                const int16_t *secondary_buffer, audio::AudioStreamInfo secondary_stream_info,
+                                int16_t *output_buffer, audio::AudioStreamInfo output_stream_info,
+                                uint32_t frames_to_mix);
+
+  static void audio_mixer_task(void *params);
+
+  /// @brief Starts the mixer task after allocating memory for the task stack.
+  /// @return ESP_ERR_NO_MEM if there isn't enough memory for the task's stack
+  ///         ESP_ERR_INVALID_STATE if the task didn't start
+  ///         ESP_OK if successful
+  esp_err_t start_task_();
+
+  /// @brief If the task is stopped, it sets the task handle to the nullptr and deallocates its stack
+  /// @return ESP_OK if the task was stopped, ESP_ERR_INVALID_STATE otherwise.
+  esp_err_t delete_task_();
+
+  EventGroupHandle_t event_group_{nullptr};
+
+  std::vector<SourceSpeaker *> source_speakers_;
+  speaker::Speaker *output_speaker_{nullptr};
+
+  uint8_t output_channels_{0};  // Assigned via set_output_channels(); defaulted so it is never indeterminate
+  bool queue_mode_{false};  // Assigned via set_queue_mode(); defaulted so it is never indeterminate
+  bool task_stack_in_psram_{false};
+
+  bool task_created_{false};
+
+  TaskHandle_t task_handle_{nullptr};
+  StaticTask_t task_stack_;
+  StackType_t *task_stack_buffer_{nullptr};
+
+  optional<audio::AudioStreamInfo> audio_stream_info_;
+};
+
+}  // namespace mixer_speaker
+}  // namespace esphome
+
+#endif
diff --git a/tests/components/mixer/common.yaml b/tests/components/mixer/common.yaml
new file mode 100644
index 0000000000..e171b9499c
--- /dev/null
+++ b/tests/components/mixer/common.yaml
@@ -0,0 +1,23 @@
+esphome:
+  on_boot:
+    then:
+      - mixer_speaker.apply_ducking:  # exercises the ducking action against one of the mixer's source speakers
+          id: source_speaker_1_id
+          decibel_reduction: 10  # dB of reduction to apply
+          duration: 1s  # time to transition from the current level to the new level
+
+i2s_audio:
+  i2s_lrclk_pin: ${lrclk_pin}  # pin substitutions are defined by each per-target test file that includes this one
+  i2s_bclk_pin: ${bclk_pin}
+  i2s_mclk_pin: ${mclk_pin}
+
+speaker:
+  - platform: i2s_audio
+    id: speaker_id
+    dac_type: external
+    i2s_dout_pin: ${dout_pin}
+  - platform: mixer
+    output_speaker: speaker_id  # the mixer writes its output to the i2s_audio speaker declared above
+    source_speakers:
+      - id: source_speaker_1_id
+      - id: source_speaker_2_id
diff --git a/tests/components/mixer/test.esp32-ard.yaml b/tests/components/mixer/test.esp32-ard.yaml
new file mode 100644
index 0000000000..96d2d37458
--- /dev/null
+++ b/tests/components/mixer/test.esp32-ard.yaml
@@ -0,0 +1,7 @@
+substitutions:
+  lrclk_pin: GPIO16
+  bclk_pin: GPIO17
+  mclk_pin: GPIO15
+  dout_pin: GPIO14
+
+<<: !include common.yaml
diff --git a/tests/components/mixer/test.esp32-c3-ard.yaml b/tests/components/mixer/test.esp32-c3-ard.yaml
new file mode 100644
index 0000000000..f1721f0862
--- /dev/null
+++ b/tests/components/mixer/test.esp32-c3-ard.yaml
@@ -0,0 +1,7 @@
+substitutions:
+  lrclk_pin: GPIO4
+  bclk_pin: GPIO5
+  mclk_pin: GPIO6
+  dout_pin: GPIO7
+
+<<: !include common.yaml
diff --git a/tests/components/mixer/test.esp32-c3-idf.yaml b/tests/components/mixer/test.esp32-c3-idf.yaml
new file mode 100644
index 0000000000..f1721f0862
--- /dev/null
+++ b/tests/components/mixer/test.esp32-c3-idf.yaml
@@ -0,0 +1,7 @@
+substitutions:
+  lrclk_pin: GPIO4
+  bclk_pin: GPIO5
+  mclk_pin: GPIO6
+  dout_pin: GPIO7
+
+<<: !include common.yaml
diff --git a/tests/components/mixer/test.esp32-idf.yaml b/tests/components/mixer/test.esp32-idf.yaml
new file mode 100644
index 0000000000..96d2d37458
--- /dev/null
+++ b/tests/components/mixer/test.esp32-idf.yaml
@@ -0,0 +1,7 @@
+substitutions:
+  lrclk_pin: GPIO16
+  bclk_pin: GPIO17
+  mclk_pin: GPIO15
+  dout_pin: GPIO14
+
+<<: !include common.yaml
diff --git a/tests/components/mixer/test.esp32-s3-ard.yaml b/tests/components/mixer/test.esp32-s3-ard.yaml
new file mode 100644
index 0000000000..f1721f0862
--- /dev/null
+++ b/tests/components/mixer/test.esp32-s3-ard.yaml
@@ -0,0 +1,7 @@
+substitutions:
+  lrclk_pin: GPIO4
+  bclk_pin: GPIO5
+  mclk_pin: GPIO6
+  dout_pin: GPIO7
+
+<<: !include common.yaml
diff --git a/tests/components/mixer/test.esp32-s3-idf.yaml b/tests/components/mixer/test.esp32-s3-idf.yaml
new file mode 100644
index 0000000000..f1721f0862
--- /dev/null
+++ b/tests/components/mixer/test.esp32-s3-idf.yaml
@@ -0,0 +1,7 @@
+substitutions:
+  lrclk_pin: GPIO4
+  bclk_pin: GPIO5
+  mclk_pin: GPIO6
+  dout_pin: GPIO7
+
+<<: !include common.yaml