From 84836f15db00e8eecc73fa06b169fa7265ed653b Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kevin.ahrendt@nabucasa.com>
Date: Mon, 10 Feb 2025 13:00:23 -0600
Subject: [PATCH] [speaker] Media Player Components PR9 (#8171)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
---
 CODEOWNERS                                    |   1 +
 esphome/components/media_player/__init__.py   |  59 +-
 esphome/components/media_player/automation.h  |  13 +-
 .../components/media_player/media_player.cpp  |   8 +
 .../components/media_player/media_player.h    |  12 +-
 .../speaker/media_player/__init__.py          | 458 ++++++++++++++
 .../speaker/media_player/audio_pipeline.cpp   | 560 +++++++++++++++++
 .../speaker/media_player/audio_pipeline.h     | 158 +++++
 .../speaker/media_player/automation.h         |  26 +
 .../media_player/speaker_media_player.cpp     | 577 ++++++++++++++++++
 .../media_player/speaker_media_player.h       | 160 +++++
 tests/components/media_player/common.yaml     |   2 +
 .../speaker/common-media_player.yaml          |  12 +
 .../speaker/media_player.esp32-idf.yaml       |   9 +
 .../speaker/media_player.esp32-s3-idf.yaml    |   9 +
 15 files changed, 2043 insertions(+), 21 deletions(-)
 create mode 100644 esphome/components/speaker/media_player/__init__.py
 create mode 100644 esphome/components/speaker/media_player/audio_pipeline.cpp
 create mode 100644 esphome/components/speaker/media_player/audio_pipeline.h
 create mode 100644 esphome/components/speaker/media_player/automation.h
 create mode 100644 esphome/components/speaker/media_player/speaker_media_player.cpp
 create mode 100644 esphome/components/speaker/media_player/speaker_media_player.h
 create mode 100644 tests/components/speaker/common-media_player.yaml
 create mode 100644 tests/components/speaker/media_player.esp32-idf.yaml
 create mode 100644 tests/components/speaker/media_player.esp32-s3-idf.yaml

diff --git a/CODEOWNERS b/CODEOWNERS
index d4b3d7eff9..26e36befe5 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -390,6 +390,7 @@ esphome/components/sn74hc165/* @jesserockz
 esphome/components/socket/* @esphome/core
 esphome/components/sonoff_d1/* @anatoly-savchenkov
 esphome/components/speaker/* @jesserockz @kahrendt
+esphome/components/speaker/media_player/* @kahrendt @synesthesiam
 esphome/components/spi/* @clydebarrow @esphome/core
 esphome/components/spi_device/* @clydebarrow
 esphome/components/spi_led_strip/* @clydebarrow
diff --git a/esphome/components/media_player/__init__.py b/esphome/components/media_player/__init__.py
index a46b30db29..b2543ac05f 100644
--- a/esphome/components/media_player/__init__.py
+++ b/esphome/components/media_player/__init__.py
@@ -1,5 +1,4 @@
 from esphome import automation
-from esphome.automation import maybe_simple_id
 import esphome.codegen as cg
 import esphome.config_validation as cv
 from esphome.const import (
@@ -21,6 +20,16 @@ media_player_ns = cg.esphome_ns.namespace("media_player")
 
 MediaPlayer = media_player_ns.class_("MediaPlayer")
 
+MediaPlayerSupportedFormat = media_player_ns.struct("MediaPlayerSupportedFormat")
+
+MediaPlayerFormatPurpose = media_player_ns.enum(
+    "MediaPlayerFormatPurpose", is_class=True
+)
+MEDIA_PLAYER_FORMAT_PURPOSE_ENUM = {
+    "default": MediaPlayerFormatPurpose.PURPOSE_DEFAULT,
+    "announcement": MediaPlayerFormatPurpose.PURPOSE_ANNOUNCEMENT,
+}
+
 
 PlayAction = media_player_ns.class_(
     "PlayAction", automation.Action, cg.Parented.template(MediaPlayer)
@@ -47,7 +56,7 @@ VolumeSetAction = media_player_ns.class_(
     "VolumeSetAction", automation.Action, cg.Parented.template(MediaPlayer)
 )
 
-
+CONF_ANNOUNCEMENT = "announcement"
 CONF_ON_PLAY = "on_play"
 CONF_ON_PAUSE = "on_pause"
 CONF_ON_ANNOUNCEMENT = "on_announcement"
@@ -125,7 +134,16 @@ MEDIA_PLAYER_SCHEMA = cv.ENTITY_BASE_SCHEMA.extend(
 )
 
 
-MEDIA_PLAYER_ACTION_SCHEMA = maybe_simple_id({cv.GenerateID(): cv.use_id(MediaPlayer)})
+MEDIA_PLAYER_ACTION_SCHEMA = cv.Schema(
+    {
+        cv.GenerateID(): cv.use_id(MediaPlayer),
+        cv.Optional(CONF_ANNOUNCEMENT, default=False): cv.templatable(cv.boolean),
+    }
+)
+
+MEDIA_PLAYER_CONDITION_SCHEMA = automation.maybe_simple_id(
+    {cv.GenerateID(): cv.use_id(MediaPlayer)}
+)
 
 
 @automation.register_action(
@@ -135,6 +153,7 @@ MEDIA_PLAYER_ACTION_SCHEMA = maybe_simple_id({cv.GenerateID(): cv.use_id(MediaPl
         {
             cv.GenerateID(): cv.use_id(MediaPlayer),
             cv.Required(CONF_MEDIA_URL): cv.templatable(cv.url),
+            cv.Optional(CONF_ANNOUNCEMENT, default=False): cv.templatable(cv.boolean),
         },
         key=CONF_MEDIA_URL,
     ),
@@ -143,7 +162,9 @@ async def media_player_play_media_action(config, action_id, template_arg, args):
     var = cg.new_Pvariable(action_id, template_arg)
     await cg.register_parented(var, config[CONF_ID])
     media_url = await cg.templatable(config[CONF_MEDIA_URL], args, cg.std_string)
+    announcement = await cg.templatable(config[CONF_ANNOUNCEMENT], args, cg.bool_)
     cg.add(var.set_media_url(media_url))
+    cg.add(var.set_announcement(announcement))
     return var
 
 
@@ -161,19 +182,27 @@ async def media_player_play_media_action(config, action_id, template_arg, args):
 @automation.register_action(
     "media_player.volume_down", VolumeDownAction, MEDIA_PLAYER_ACTION_SCHEMA
 )
-@automation.register_condition(
-    "media_player.is_idle", IsIdleCondition, MEDIA_PLAYER_ACTION_SCHEMA
-)
-@automation.register_condition(
-    "media_player.is_paused", IsPausedCondition, MEDIA_PLAYER_ACTION_SCHEMA
-)
-@automation.register_condition(
-    "media_player.is_playing", IsPlayingCondition, MEDIA_PLAYER_ACTION_SCHEMA
-)
-@automation.register_condition(
-    "media_player.is_announcing", IsAnnouncingCondition, MEDIA_PLAYER_ACTION_SCHEMA
-)
 async def media_player_action(config, action_id, template_arg, args):
+    var = cg.new_Pvariable(action_id, template_arg)
+    await cg.register_parented(var, config[CONF_ID])
+    announcement = await cg.templatable(config[CONF_ANNOUNCEMENT], args, cg.bool_)
+    cg.add(var.set_announcement(announcement))
+    return var
+
+
+@automation.register_condition(
+    "media_player.is_idle", IsIdleCondition, MEDIA_PLAYER_CONDITION_SCHEMA
+)
+@automation.register_condition(
+    "media_player.is_paused", IsPausedCondition, MEDIA_PLAYER_CONDITION_SCHEMA
+)
+@automation.register_condition(
+    "media_player.is_playing", IsPlayingCondition, MEDIA_PLAYER_CONDITION_SCHEMA
+)
+@automation.register_condition(
+    "media_player.is_announcing", IsAnnouncingCondition, MEDIA_PLAYER_CONDITION_SCHEMA
+)
+async def media_player_condition(config, action_id, template_arg, args):
     var = cg.new_Pvariable(action_id, template_arg)
     await cg.register_parented(var, config[CONF_ID])
     return var
diff --git a/esphome/components/media_player/automation.h b/esphome/components/media_player/automation.h
index 7b9220c4a5..422c224a85 100644
--- a/esphome/components/media_player/automation.h
+++ b/esphome/components/media_player/automation.h
@@ -10,7 +10,10 @@ namespace media_player {
 template<MediaPlayerCommand Command, typename... Ts>
 class MediaPlayerCommandAction : public Action<Ts...>, public Parented<MediaPlayer> {
  public:
-  void play(Ts... x) override { this->parent_->make_call().set_command(Command).perform(); }
+  TEMPLATABLE_VALUE(bool, announcement);
+  void play(Ts... x) override {
+    this->parent_->make_call().set_command(Command).set_announcement(this->announcement_.value(x...)).perform();
+  }
 };
 
 template<typename... Ts>
@@ -28,7 +31,13 @@ using VolumeDownAction = MediaPlayerCommandAction<MediaPlayerCommand::MEDIA_PLAY
 
 template<typename... Ts> class PlayMediaAction : public Action<Ts...>, public Parented<MediaPlayer> {
   TEMPLATABLE_VALUE(std::string, media_url)
-  void play(Ts... x) override { this->parent_->make_call().set_media_url(this->media_url_.value(x...)).perform(); }
+  TEMPLATABLE_VALUE(bool, announcement)
+  void play(Ts... x) override {
+    this->parent_->make_call()
+        .set_media_url(this->media_url_.value(x...))
+        .set_announcement(this->announcement_.value(x...))
+        .perform();
+  }
 };
 
 template<typename... Ts> class VolumeSetAction : public Action<Ts...>, public Parented<MediaPlayer> {
diff --git a/esphome/components/media_player/media_player.cpp b/esphome/components/media_player/media_player.cpp
index b5190d8573..01304d9135 100644
--- a/esphome/components/media_player/media_player.cpp
+++ b/esphome/components/media_player/media_player.cpp
@@ -41,6 +41,14 @@ const char *media_player_command_to_string(MediaPlayerCommand command) {
       return "VOLUME_UP";
     case MEDIA_PLAYER_COMMAND_VOLUME_DOWN:
       return "VOLUME_DOWN";
+    case MEDIA_PLAYER_COMMAND_ENQUEUE:
+      return "ENQUEUE";
+    case MEDIA_PLAYER_COMMAND_REPEAT_ONE:
+      return "REPEAT_ONE";
+    case MEDIA_PLAYER_COMMAND_REPEAT_OFF:
+      return "REPEAT_OFF";
+    case MEDIA_PLAYER_COMMAND_CLEAR_PLAYLIST:
+      return "CLEAR_PLAYLIST";
     default:
       return "UNKNOWN";
   }
diff --git a/esphome/components/media_player/media_player.h b/esphome/components/media_player/media_player.h
index 78b3ed6216..ee5889901c 100644
--- a/esphome/components/media_player/media_player.h
+++ b/esphome/components/media_player/media_player.h
@@ -24,6 +24,10 @@ enum MediaPlayerCommand : uint8_t {
   MEDIA_PLAYER_COMMAND_TOGGLE = 5,
   MEDIA_PLAYER_COMMAND_VOLUME_UP = 6,
   MEDIA_PLAYER_COMMAND_VOLUME_DOWN = 7,
+  MEDIA_PLAYER_COMMAND_ENQUEUE = 8,
+  MEDIA_PLAYER_COMMAND_REPEAT_ONE = 9,
+  MEDIA_PLAYER_COMMAND_REPEAT_OFF = 10,
+  MEDIA_PLAYER_COMMAND_CLEAR_PLAYLIST = 11,
 };
 const char *media_player_command_to_string(MediaPlayerCommand command);
 
@@ -72,10 +76,10 @@ class MediaPlayerCall {
 
   void perform();
 
-  const optional<MediaPlayerCommand> &get_command() const { return command_; }
-  const optional<std::string> &get_media_url() const { return media_url_; }
-  const optional<float> &get_volume() const { return volume_; }
-  const optional<bool> &get_announcement() const { return announcement_; }
+  const optional<MediaPlayerCommand> &get_command() const { return this->command_; }
+  const optional<std::string> &get_media_url() const { return this->media_url_; }
+  const optional<float> &get_volume() const { return this->volume_; }
+  const optional<bool> &get_announcement() const { return this->announcement_; }
 
  protected:
   void validate_();
diff --git a/esphome/components/speaker/media_player/__init__.py b/esphome/components/speaker/media_player/__init__.py
new file mode 100644
index 0000000000..14b72cacc0
--- /dev/null
+++ b/esphome/components/speaker/media_player/__init__.py
@@ -0,0 +1,458 @@
+"""Speaker Media Player Setup."""
+
+import hashlib
+import logging
+from pathlib import Path
+
+from esphome import automation, external_files
+import esphome.codegen as cg
+from esphome.components import audio, esp32, media_player, speaker
+import esphome.config_validation as cv
+from esphome.const import (
+    CONF_BUFFER_SIZE,
+    CONF_FILE,
+    CONF_FILES,
+    CONF_FORMAT,
+    CONF_ID,
+    CONF_NUM_CHANNELS,
+    CONF_PATH,
+    CONF_RAW_DATA_ID,
+    CONF_SAMPLE_RATE,
+    CONF_SPEAKER,
+    CONF_TASK_STACK_IN_PSRAM,
+    CONF_TYPE,
+    CONF_URL,
+)
+from esphome.core import CORE, HexInt
+from esphome.core.entity_helpers import inherit_property_from
+from esphome.external_files import download_content
+
+_LOGGER = logging.getLogger(__name__)
+
+AUTO_LOAD = ["audio", "psram"]
+
+CODEOWNERS = ["@kahrendt", "@synesthesiam"]
+DOMAIN = "media_player"
+
+TYPE_LOCAL = "local"
+TYPE_WEB = "web"
+
+CONF_ANNOUNCEMENT = "announcement"
+CONF_ANNOUNCEMENT_PIPELINE = "announcement_pipeline"
+CONF_CODEC_SUPPORT_ENABLED = "codec_support_enabled"
+CONF_ENQUEUE = "enqueue"
+CONF_MEDIA_FILE = "media_file"
+CONF_MEDIA_PIPELINE = "media_pipeline"
+CONF_ON_MUTE = "on_mute"
+CONF_ON_UNMUTE = "on_unmute"
+CONF_ON_VOLUME = "on_volume"
+CONF_STREAM = "stream"
+CONF_VOLUME_INCREMENT = "volume_increment"
+CONF_VOLUME_MIN = "volume_min"
+CONF_VOLUME_MAX = "volume_max"
+
+
+speaker_ns = cg.esphome_ns.namespace("speaker")
+SpeakerMediaPlayer = speaker_ns.class_(
+    "SpeakerMediaPlayer",
+    media_player.MediaPlayer,
+    cg.Component,
+)
+
+AudioPipeline = speaker_ns.class_("AudioPipeline")
+AudioPipelineType = speaker_ns.enum("AudioPipelineType", is_class=True)
+AUDIO_PIPELINE_TYPE_ENUM = {
+    "MEDIA": AudioPipelineType.MEDIA,
+    "ANNOUNCEMENT": AudioPipelineType.ANNOUNCEMENT,
+}
+
+PlayOnDeviceMediaAction = speaker_ns.class_(
+    "PlayOnDeviceMediaAction",
+    automation.Action,
+    cg.Parented.template(SpeakerMediaPlayer),
+)
+StopStreamAction = speaker_ns.class_(
+    "StopStreamAction", automation.Action, cg.Parented.template(SpeakerMediaPlayer)
+)
+
+
+def _compute_local_file_path(value: dict) -> Path:
+    url = value[CONF_URL]
+    h = hashlib.new("sha256")
+    h.update(url.encode())
+    key = h.hexdigest()[:8]
+    base_dir = external_files.compute_local_file_dir(DOMAIN)
+    _LOGGER.debug("_compute_local_file_path: base_dir=%s", base_dir / key)
+    return base_dir / key
+
+
+def _download_web_file(value):
+    url = value[CONF_URL]
+    path = _compute_local_file_path(value)
+
+    download_content(url, path)
+    _LOGGER.debug("download_web_file: path=%s", path)
+    return value
+
+
+# Returns a media_player.MediaPlayerSupportedFormat struct with the configured
+# format, sample rate, number of channels, purpose, and bytes per sample
+def _get_supported_format_struct(pipeline, type):
+    args = [
+        media_player.MediaPlayerSupportedFormat,
+    ]
+
+    if pipeline[CONF_FORMAT] == "FLAC":
+        args.append(("format", "flac"))
+    elif pipeline[CONF_FORMAT] == "MP3":
+        args.append(("format", "mp3"))
+    elif pipeline[CONF_FORMAT] == "WAV":
+        args.append(("format", "wav"))
+
+    args.append(("sample_rate", pipeline[CONF_SAMPLE_RATE]))
+    args.append(("num_channels", pipeline[CONF_NUM_CHANNELS]))
+
+    if type == "MEDIA":
+        args.append(
+            (
+                "purpose",
+                media_player.MEDIA_PLAYER_FORMAT_PURPOSE_ENUM["default"],
+            )
+        )
+    elif type == "ANNOUNCEMENT":
+        args.append(
+            (
+                "purpose",
+                media_player.MEDIA_PLAYER_FORMAT_PURPOSE_ENUM["announcement"],
+            )
+        )
+    if pipeline[CONF_FORMAT] != "MP3":
+        args.append(("sample_bytes", 2))
+
+    return cg.StructInitializer(*args)
+
+
+def _file_schema(value):
+    if isinstance(value, str):
+        return _validate_file_shorthand(value)
+    return TYPED_FILE_SCHEMA(value)
+
+
+def _read_audio_file_and_type(file_config):
+    conf_file = file_config[CONF_FILE]
+    file_source = conf_file[CONF_TYPE]
+    if file_source == TYPE_LOCAL:
+        path = CORE.relative_config_path(conf_file[CONF_PATH])
+    elif file_source == TYPE_WEB:
+        path = _compute_local_file_path(conf_file)
+    else:
+        raise cv.Invalid("Unsupported file source.")
+
+    with open(path, "rb") as f:
+        data = f.read()
+
+    import puremagic
+
+    file_type: str = puremagic.from_string(data)
+    if file_type.startswith("."):
+        file_type = file_type[1:]
+
+    media_file_type = audio.AUDIO_FILE_TYPE_ENUM["NONE"]
+    if file_type in ("wav"):
+        media_file_type = audio.AUDIO_FILE_TYPE_ENUM["WAV"]
+    elif file_type in ("mp3", "mpeg", "mpga"):
+        media_file_type = audio.AUDIO_FILE_TYPE_ENUM["MP3"]
+    elif file_type in ("flac"):
+        media_file_type = audio.AUDIO_FILE_TYPE_ENUM["FLAC"]
+
+    return data, media_file_type
+
+
+def _validate_file_shorthand(value):
+    value = cv.string_strict(value)
+    if value.startswith("http://") or value.startswith("https://"):
+        return _file_schema(
+            {
+                CONF_TYPE: TYPE_WEB,
+                CONF_URL: value,
+            }
+        )
+    return _file_schema(
+        {
+            CONF_TYPE: TYPE_LOCAL,
+            CONF_PATH: value,
+        }
+    )
+
+
+def _validate_pipeline(config):
+    # Inherit transcoder settings from speaker if not manually set
+    inherit_property_from(CONF_NUM_CHANNELS, CONF_SPEAKER)(config)
+    inherit_property_from(CONF_SAMPLE_RATE, CONF_SPEAKER)(config)
+
+    # Validate the transcoder settings is compatible with the speaker
+    audio.final_validate_audio_schema(
+        "speaker media_player",
+        audio_device=CONF_SPEAKER,
+        bits_per_sample=16,
+        channels=config.get(CONF_NUM_CHANNELS),
+        sample_rate=config.get(CONF_SAMPLE_RATE),
+    )(config)
+
+    return config
+
+
+def _validate_repeated_speaker(config):
+    if (announcement_config := config.get(CONF_ANNOUNCEMENT_PIPELINE)) and (
+        media_config := config.get(CONF_MEDIA_PIPELINE)
+    ):
+        if announcement_config[CONF_SPEAKER] == media_config[CONF_SPEAKER]:
+            raise cv.Invalid(
+                "The announcement and media pipelines cannot use the same speaker. Use the `mixer` speaker component to create two source speakers."
+            )
+
+    return config
+
+
+def _validate_supported_local_file(config):
+    for file_config in config.get(CONF_FILES, []):
+        _, media_file_type = _read_audio_file_and_type(file_config)
+        if str(media_file_type) == str(audio.AUDIO_FILE_TYPE_ENUM["NONE"]):
+            raise cv.Invalid("Unsupported local media file.")
+        if not config[CONF_CODEC_SUPPORT_ENABLED] and str(media_file_type) != str(
+            audio.AUDIO_FILE_TYPE_ENUM["WAV"]
+        ):
+            # Only wav files are supported
+            raise cv.Invalid(
+                f"Unsupported local media file type, set {CONF_CODEC_SUPPORT_ENABLED} to true or convert the media file to wav"
+            )
+
+    return config
+
+
+LOCAL_SCHEMA = cv.Schema(
+    {
+        cv.Required(CONF_PATH): cv.file_,
+    }
+)
+
+WEB_SCHEMA = cv.All(
+    {
+        cv.Required(CONF_URL): cv.url,
+    },
+    _download_web_file,
+)
+
+
+TYPED_FILE_SCHEMA = cv.typed_schema(
+    {
+        TYPE_LOCAL: LOCAL_SCHEMA,
+        TYPE_WEB: WEB_SCHEMA,
+    },
+)
+
+
+MEDIA_FILE_TYPE_SCHEMA = cv.Schema(
+    {
+        cv.Required(CONF_ID): cv.declare_id(audio.AudioFile),
+        cv.Required(CONF_FILE): _file_schema,
+        cv.GenerateID(CONF_RAW_DATA_ID): cv.declare_id(cg.uint8),
+    }
+)
+
+PIPELINE_SCHEMA = cv.Schema(
+    {
+        cv.GenerateID(): cv.declare_id(AudioPipeline),
+        cv.Required(CONF_SPEAKER): cv.use_id(speaker.Speaker),
+        cv.Optional(CONF_FORMAT, default="FLAC"): cv.enum(audio.AUDIO_FILE_TYPE_ENUM),
+        cv.Optional(CONF_SAMPLE_RATE): cv.int_range(min=1),
+        cv.Optional(CONF_NUM_CHANNELS): cv.int_range(1, 2),
+    }
+)
+
+CONFIG_SCHEMA = cv.All(
+    media_player.MEDIA_PLAYER_SCHEMA.extend(
+        {
+            cv.GenerateID(): cv.declare_id(SpeakerMediaPlayer),
+            cv.Required(CONF_ANNOUNCEMENT_PIPELINE): PIPELINE_SCHEMA,
+            cv.Optional(CONF_MEDIA_PIPELINE): PIPELINE_SCHEMA,
+            cv.Optional(CONF_BUFFER_SIZE, default=1000000): cv.int_range(
+                min=4000, max=4000000
+            ),
+            cv.Optional(CONF_CODEC_SUPPORT_ENABLED, default=True): cv.boolean,
+            cv.Optional(CONF_FILES): cv.ensure_list(MEDIA_FILE_TYPE_SCHEMA),
+            cv.Optional(CONF_TASK_STACK_IN_PSRAM, default=False): cv.boolean,
+            cv.Optional(CONF_VOLUME_INCREMENT, default=0.05): cv.percentage,
+            cv.Optional(CONF_VOLUME_MAX, default=1.0): cv.percentage,
+            cv.Optional(CONF_VOLUME_MIN, default=0.0): cv.percentage,
+            cv.Optional(CONF_ON_MUTE): automation.validate_automation(single=True),
+            cv.Optional(CONF_ON_UNMUTE): automation.validate_automation(single=True),
+            cv.Optional(CONF_ON_VOLUME): automation.validate_automation(single=True),
+        }
+    ),
+    cv.only_with_esp_idf,
+    _validate_repeated_speaker,
+)
+
+
+FINAL_VALIDATE_SCHEMA = cv.All(
+    cv.Schema(
+        {
+            cv.Optional(CONF_ANNOUNCEMENT_PIPELINE): _validate_pipeline,
+            cv.Optional(CONF_MEDIA_PIPELINE): _validate_pipeline,
+        },
+        extra=cv.ALLOW_EXTRA,
+    ),
+    _validate_supported_local_file,
+)
+
+
+async def to_code(config):
+    if config[CONF_CODEC_SUPPORT_ENABLED]:
+        # Compile all supported audio codecs and optimize the wifi settings
+
+        cg.add_define("USE_AUDIO_FLAC_SUPPORT", True)
+        cg.add_define("USE_AUDIO_MP3_SUPPORT", True)
+
+        # Wifi settings based on https://github.com/espressif/esp-adf/issues/297#issuecomment-783811702
+        esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_STATIC_RX_BUFFER_NUM", 16)
+        esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_DYNAMIC_RX_BUFFER_NUM", 512)
+        esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_STATIC_TX_BUFFER", True)
+        esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_TX_BUFFER_TYPE", 0)
+        esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_STATIC_TX_BUFFER_NUM", 8)
+        esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_CACHE_TX_BUFFER_NUM", 32)
+        esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_AMPDU_TX_ENABLED", True)
+        esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_TX_BA_WIN", 16)
+        esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_AMPDU_RX_ENABLED", True)
+        esp32.add_idf_sdkconfig_option("CONFIG_ESP32_WIFI_RX_BA_WIN", 32)
+        esp32.add_idf_sdkconfig_option("CONFIG_LWIP_MAX_ACTIVE_TCP", 16)
+        esp32.add_idf_sdkconfig_option("CONFIG_LWIP_MAX_LISTENING_TCP", 16)
+        esp32.add_idf_sdkconfig_option("CONFIG_TCP_MAXRTX", 12)
+        esp32.add_idf_sdkconfig_option("CONFIG_TCP_SYNMAXRTX", 6)
+        esp32.add_idf_sdkconfig_option("CONFIG_TCP_MSS", 1436)
+        esp32.add_idf_sdkconfig_option("CONFIG_TCP_MSL", 60000)
+        esp32.add_idf_sdkconfig_option("CONFIG_TCP_SND_BUF_DEFAULT", 65535)
+        esp32.add_idf_sdkconfig_option(
+            "CONFIG_TCP_WND_DEFAULT", 65535
+        )  # Adjusted from referenced settings to avoid compilation error
+        esp32.add_idf_sdkconfig_option("CONFIG_TCP_RECVMBOX_SIZE", 512)
+        esp32.add_idf_sdkconfig_option("CONFIG_TCP_QUEUE_OOSEQ", True)
+        esp32.add_idf_sdkconfig_option("CONFIG_TCP_OVERSIZE_MSS", True)
+        esp32.add_idf_sdkconfig_option("CONFIG_LWIP_WND_SCALE", True)
+        esp32.add_idf_sdkconfig_option("CONFIG_TCP_RCV_SCALE", 3)
+        esp32.add_idf_sdkconfig_option("CONFIG_LWIP_TCPIP_RECVMBOX_SIZE", 512)
+
+        # Allocate wifi buffers in PSRAM
+        esp32.add_idf_sdkconfig_option("CONFIG_SPIRAM_TRY_ALLOCATE_WIFI_LWIP", True)
+
+    var = cg.new_Pvariable(config[CONF_ID])
+    await cg.register_component(var, config)
+    await media_player.register_media_player(var, config)
+
+    cg.add_define("USE_OTA_STATE_CALLBACK")
+
+    cg.add(var.set_buffer_size(config[CONF_BUFFER_SIZE]))
+
+    cg.add(var.set_task_stack_in_psram(config[CONF_TASK_STACK_IN_PSRAM]))
+    if config[CONF_TASK_STACK_IN_PSRAM]:
+        esp32.add_idf_sdkconfig_option(
+            "CONFIG_SPIRAM_ALLOW_STACK_EXTERNAL_MEMORY", True
+        )
+
+    cg.add(var.set_volume_increment(config[CONF_VOLUME_INCREMENT]))
+    cg.add(var.set_volume_max(config[CONF_VOLUME_MAX]))
+    cg.add(var.set_volume_min(config[CONF_VOLUME_MIN]))
+
+    announcement_pipeline_config = config[CONF_ANNOUNCEMENT_PIPELINE]
+    spkr = await cg.get_variable(announcement_pipeline_config[CONF_SPEAKER])
+    cg.add(var.set_announcement_speaker(spkr))
+    if announcement_pipeline_config[CONF_FORMAT] != "NONE":
+        cg.add(
+            var.set_announcement_format(
+                _get_supported_format_struct(
+                    announcement_pipeline_config, "ANNOUNCEMENT"
+                )
+            )
+        )
+
+    if media_pipeline_config := config.get(CONF_MEDIA_PIPELINE):
+        spkr = await cg.get_variable(media_pipeline_config[CONF_SPEAKER])
+        cg.add(var.set_media_speaker(spkr))
+        if media_pipeline_config[CONF_FORMAT] != "NONE":
+            cg.add(
+                var.set_media_format(
+                    _get_supported_format_struct(media_pipeline_config, "MEDIA")
+                )
+            )
+
+    if on_mute := config.get(CONF_ON_MUTE):
+        await automation.build_automation(
+            var.get_mute_trigger(),
+            [],
+            on_mute,
+        )
+    if on_unmute := config.get(CONF_ON_UNMUTE):
+        await automation.build_automation(
+            var.get_unmute_trigger(),
+            [],
+            on_unmute,
+        )
+    if on_volume := config.get(CONF_ON_VOLUME):
+        await automation.build_automation(
+            var.get_volume_trigger(),
+            [(cg.float_, "x")],
+            on_volume,
+        )
+
+    for file_config in config.get(CONF_FILES, []):
+        data, media_file_type = _read_audio_file_and_type(file_config)
+
+        rhs = [HexInt(x) for x in data]
+        prog_arr = cg.progmem_array(file_config[CONF_RAW_DATA_ID], rhs)
+
+        media_files_struct = cg.StructInitializer(
+            audio.AudioFile,
+            (
+                "data",
+                prog_arr,
+            ),
+            (
+                "length",
+                len(rhs),
+            ),
+            (
+                "file_type",
+                media_file_type,
+            ),
+        )
+
+        cg.new_Pvariable(
+            file_config[CONF_ID],
+            media_files_struct,
+        )
+
+
+@automation.register_action(
+    "media_player.speaker.play_on_device_media_file",
+    PlayOnDeviceMediaAction,
+    cv.maybe_simple_value(
+        {
+            cv.GenerateID(): cv.use_id(SpeakerMediaPlayer),
+            cv.Required(CONF_MEDIA_FILE): cv.use_id(audio.AudioFile),
+            cv.Optional(CONF_ANNOUNCEMENT, default=False): cv.templatable(cv.boolean),
+            cv.Optional(CONF_ENQUEUE, default=False): cv.templatable(cv.boolean),
+        },
+        key=CONF_MEDIA_FILE,
+    ),
+)
+async def play_on_device_media_media_action(config, action_id, template_arg, args):
+    var = cg.new_Pvariable(action_id, template_arg)
+    await cg.register_parented(var, config[CONF_ID])
+    media_file = await cg.get_variable(config[CONF_MEDIA_FILE])
+    announcement = await cg.templatable(config[CONF_ANNOUNCEMENT], args, cg.bool_)
+    enqueue = await cg.templatable(config[CONF_ENQUEUE], args, cg.bool_)
+
+    cg.add(var.set_audio_file(media_file))
+    cg.add(var.set_announcement(announcement))
+    cg.add(var.set_enqueue(enqueue))
+    return var
diff --git a/esphome/components/speaker/media_player/audio_pipeline.cpp b/esphome/components/speaker/media_player/audio_pipeline.cpp
new file mode 100644
index 0000000000..73ec5a3334
--- /dev/null
+++ b/esphome/components/speaker/media_player/audio_pipeline.cpp
@@ -0,0 +1,560 @@
+#include "audio_pipeline.h"
+
+#ifdef USE_ESP_IDF
+
+#include "esphome/core/defines.h"
+#include "esphome/core/hal.h"
+#include "esphome/core/helpers.h"
+#include "esphome/core/log.h"
+
+namespace esphome {
+namespace speaker {
+
+static const uint32_t INITIAL_BUFFER_MS = 1000;  // Start playback after buffering this duration of the file
+
+static const uint32_t READ_TASK_STACK_SIZE = 5 * 1024;    // Stack depth given to xTaskCreateStatic() for read_task
+static const uint32_t DECODE_TASK_STACK_SIZE = 3 * 1024;  // Stack depth given to xTaskCreateStatic() for decode_task
+
+static const uint32_t INFO_ERROR_QUEUE_COUNT = 5;  // Number of InfoErrorEvent items info_error_queue_ can hold
+
+static const char *const TAG = "speaker_media_player.pipeline";  // Log tag used by the ESP_LOG* calls in this file
+
+enum EventGroupBits : uint32_t {
+  // MESSAGE_* bits are only set by their respective tasks
+
+  // Stops the pipeline; set by stop(), start_url()/start_file() or a failing task; cleared by process_state()
+  PIPELINE_COMMAND_STOP = (1 << 0),
+
+  // Read audio from an HTTP source; set by process_state() after start_url(); cleared by reader task
+  READER_COMMAND_INIT_HTTP = (1 << 4),
+  // Read audio from an audio file in flash; set by process_state() after start_file(); cleared by reader task
+  READER_COMMAND_INIT_FILE = (1 << 5),
+
+  // Audio file type is read after checking it is supported; set by reader task and cleared by decoder task
+  READER_MESSAGE_LOADED_MEDIA_TYPE = (1 << 6),
+  // Reader is done (either through a failure or just end of the stream); cleared by reader task
+  READER_MESSAGE_FINISHED = (1 << 7),
+  // Error reading the file; cleared by process_state()
+  READER_MESSAGE_ERROR = (1 << 8),
+
+  // Decoder is done (either through a failure or the end of the stream); cleared by decoder task
+  DECODER_MESSAGE_FINISHED = (1 << 12),
+  // Error decoding the file; cleared by process_state()
+  DECODER_MESSAGE_ERROR = (1 << 13),
+};
+
+AudioPipeline::AudioPipeline(speaker::Speaker *speaker, size_t buffer_size, bool task_stack_in_psram,
+                             std::string base_name, UBaseType_t priority)
+    : base_name_(std::move(base_name)),
+      priority_(priority),
+      task_stack_in_psram_(task_stack_in_psram),
+      speaker_(speaker),
+      buffer_size_(buffer_size) {  // No tasks are created here; process_state() starts them on demand
+  this->allocate_communications_();  // Creates the event group and info/error queue; NOTE(review): result is ignored
+  this->transfer_buffer_size_ = std::min(buffer_size_ / 4, DEFAULT_TRANSFER_BUFFER_SIZE);
+}
+
+void AudioPipeline::start_url(const std::string &uri) {  // Queues a URL; process_state() issues the reader command
+  if (this->is_playing_) {
+    xEventGroupSetBits(this->event_group_, PIPELINE_COMMAND_STOP);  // Ask the current playback to stop first
+  }
+  this->current_uri_ = uri;
+  this->pending_url_ = true;  // Consumed by process_state() once no stop command is pending
+}
+
+void AudioPipeline::start_file(audio::AudioFile *audio_file) {  // Queues a file; process_state() issues the command
+  if (this->is_playing_) {
+    xEventGroupSetBits(this->event_group_, PIPELINE_COMMAND_STOP);  // Ask the current playback to stop first
+  }
+  this->current_audio_file_ = audio_file;
+  this->pending_file_ = true;  // Consumed by process_state() once no stop command is pending
+}
+
+esp_err_t AudioPipeline::stop() {  // Only requests the stop; always returns ESP_OK without waiting for the tasks
+  xEventGroupSetBits(this->event_group_, EventGroupBits::PIPELINE_COMMAND_STOP);
+
+  return ESP_OK;
+}
+void AudioPipeline::set_pause_state(bool pause_state) {  // Forwards the pause state to the speaker and records it
+  this->speaker_->set_pause_state(pause_state);
+
+  this->pause_state_ = pause_state;  // Read by decode_task and by process_state()
+}
+
+void AudioPipeline::suspend_tasks() {  // Suspends whichever of the reader/decoder tasks currently exist
+  if (this->read_task_handle_ != nullptr) {
+    vTaskSuspend(this->read_task_handle_);
+  }
+  if (this->decode_task_handle_ != nullptr) {
+    vTaskSuspend(this->decode_task_handle_);
+  }
+}
+
+void AudioPipeline::resume_tasks() {  // Resumes whichever of the reader/decoder tasks currently exist
+  if (this->read_task_handle_ != nullptr) {
+    vTaskResume(this->read_task_handle_);
+  }
+  if (this->decode_task_handle_ != nullptr) {
+    vTaskResume(this->decode_task_handle_);
+  }
+}
+
+AudioPipelineState AudioPipeline::process_state() {
+  /*
+   * Log items from info error queue (non-blocking; drains everything the tasks have queued so far)
+   */
+  InfoErrorEvent event;
+  if (this->info_error_queue_ != nullptr) {
+    while (xQueueReceive(this->info_error_queue_, &event, 0)) {
+      switch (event.source) {
+        case InfoErrorSource::READER:
+          if (event.err.has_value()) {
+            ESP_LOGE(TAG, "Media reader encountered an error: %s", esp_err_to_name(event.err.value()));
+          } else if (event.file_type.has_value()) {
+            ESP_LOGD(TAG, "Reading %s file type", audio_file_type_to_string(event.file_type.value()));
+          }
+
+          break;
+        case InfoErrorSource::DECODER:
+          if (event.err.has_value()) {
+            ESP_LOGE(TAG, "Decoder encountered an error: %s", esp_err_to_name(event.err.value()));
+          }
+
+          if (event.audio_stream_info.has_value()) {
+            ESP_LOGD(TAG, "Decoded audio has %d channels, %" PRId32 " Hz sample rate, and %d bits per sample",
+                     event.audio_stream_info.value().get_channels(), event.audio_stream_info.value().get_sample_rate(),
+                     event.audio_stream_info.value().get_bits_per_sample());
+          }
+
+          if (event.decoding_err.has_value()) {
+            switch (event.decoding_err.value()) {
+              case DecodingError::FAILED_HEADER:
+                ESP_LOGE(TAG, "Failed to parse the file's header.");
+                break;
+              case DecodingError::INCOMPATIBLE_BITS_PER_SAMPLE:
+                ESP_LOGE(TAG, "Incompatible bits per sample. Only 16 bits per sample is supported");
+                break;
+              case DecodingError::INCOMPATIBLE_CHANNELS:
+                ESP_LOGE(TAG, "Incompatible number of channels. Only 1 or 2 channel audio is supported.");
+                break;
+            }
+          }
+          break;
+      }
+    }
+  }
+
+  /*
+   * Determine the current state based on the event group bits and tasks' status
+   */
+
+  EventBits_t event_bits = xEventGroupGetBits(this->event_group_);
+
+  if (this->pending_url_ || this->pending_file_) {
+    // Init command pending
+    if (!(event_bits & EventGroupBits::PIPELINE_COMMAND_STOP)) {
+      // Only start if there is no pending stop command
+      if ((this->read_task_handle_ == nullptr) || (this->decode_task_handle_ == nullptr)) {
+        // At least one task isn't running. NOTE(review): the esp_err_t returned by start_tasks_() is ignored
+        this->start_tasks_();
+      }
+
+      if (this->pending_url_) {
+        xEventGroupSetBits(this->event_group_, EventGroupBits::READER_COMMAND_INIT_HTTP);
+        this->playback_ms_ = 0;
+        this->pending_url_ = false;
+      } else if (this->pending_file_) {
+        xEventGroupSetBits(this->event_group_, EventGroupBits::READER_COMMAND_INIT_FILE);
+        this->playback_ms_ = 0;
+        this->pending_file_ = false;
+      }
+
+      this->is_playing_ = true;
+      return AudioPipelineState::PLAYING;
+    }
+  }
+
+  if ((event_bits & EventGroupBits::READER_MESSAGE_FINISHED) &&
+      (!(event_bits & EventGroupBits::READER_MESSAGE_LOADED_MEDIA_TYPE) &&
+       (event_bits & EventGroupBits::DECODER_MESSAGE_FINISHED))) {
+    // Tasks are finished and there's no media in between the reader and decoder
+
+    if (event_bits & EventGroupBits::PIPELINE_COMMAND_STOP) {
+      // Stop command is fully processed, so clear the command bit
+      xEventGroupClearBits(this->event_group_, EventGroupBits::PIPELINE_COMMAND_STOP);
+    }
+
+    if (!this->is_playing_) {
+      // The tasks have been stopped for two ``process_state`` calls in a row, so delete the tasks
+      if ((this->read_task_handle_ != nullptr) || (this->decode_task_handle_ != nullptr)) {
+        this->delete_tasks_();
+        this->speaker_->stop();
+      }
+    }
+    this->is_playing_ = false;
+    return AudioPipelineState::STOPPED;
+  }
+
+  if ((event_bits & EventGroupBits::READER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, EventGroupBits::READER_MESSAGE_ERROR);  // Report the error only once
+    return AudioPipelineState::ERROR_READING;
+  }
+
+  if ((event_bits & EventGroupBits::DECODER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, EventGroupBits::DECODER_MESSAGE_ERROR);  // Report the error only once
+    return AudioPipelineState::ERROR_DECODING;
+  }
+
+  if (this->pause_state_) {
+    return AudioPipelineState::PAUSED;
+  }
+
+  if ((this->read_task_handle_ == nullptr) && (this->decode_task_handle_ == nullptr)) {
+    // No tasks are running, so the pipeline is stopped.
+    xEventGroupClearBits(this->event_group_, EventGroupBits::PIPELINE_COMMAND_STOP);
+    return AudioPipelineState::STOPPED;
+  }
+
+  this->is_playing_ = true;
+  return AudioPipelineState::PLAYING;
+}
+
+esp_err_t AudioPipeline::allocate_communications_() {
+  // Create the event group and the info/error queue on first use; handles that already exist are reused
+  if (this->event_group_ == nullptr) {
+    this->event_group_ = xEventGroupCreate();
+    if (this->event_group_ == nullptr) {
+      return ESP_ERR_NO_MEM;
+    }
+  }
+  if (this->info_error_queue_ == nullptr) {
+    this->info_error_queue_ = xQueueCreate(INFO_ERROR_QUEUE_COUNT, sizeof(InfoErrorEvent));
+    if (this->info_error_queue_ == nullptr) {
+      return ESP_ERR_NO_MEM;
+    }
+  }
+  return ESP_OK;
+}
+
+esp_err_t AudioPipeline::start_tasks_() {  // Creates whichever of the two static tasks does not exist yet
+  if (this->read_task_handle_ == nullptr) {
+    if (this->read_task_stack_buffer_ == nullptr) {  // A stack left over from an earlier attempt is reused
+      if (this->task_stack_in_psram_) {
+        RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_EXTERNAL);
+        this->read_task_stack_buffer_ = stack_allocator.allocate(READ_TASK_STACK_SIZE);
+      } else {
+        RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_INTERNAL);
+        this->read_task_stack_buffer_ = stack_allocator.allocate(READ_TASK_STACK_SIZE);
+      }
+    }
+
+    if (this->read_task_stack_buffer_ == nullptr) {
+      return ESP_ERR_NO_MEM;
+    }
+
+    if (this->read_task_handle_ == nullptr) {  // Always true here; the enclosing check already established it
+      this->read_task_handle_ =
+          xTaskCreateStatic(read_task, (this->base_name_ + "_read").c_str(), READ_TASK_STACK_SIZE, (void *) this,
+                            this->priority_, this->read_task_stack_buffer_, &this->read_task_stack_);
+    }
+
+    if (this->read_task_handle_ == nullptr) {
+      return ESP_ERR_INVALID_STATE;  // The allocated stack is kept and reused by the next call
+    }
+  }
+
+  if (this->decode_task_handle_ == nullptr) {
+    if (this->decode_task_stack_buffer_ == nullptr) {  // A stack left over from an earlier attempt is reused
+      if (this->task_stack_in_psram_) {
+        RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_EXTERNAL);
+        this->decode_task_stack_buffer_ = stack_allocator.allocate(DECODE_TASK_STACK_SIZE);
+      } else {
+        RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_INTERNAL);
+        this->decode_task_stack_buffer_ = stack_allocator.allocate(DECODE_TASK_STACK_SIZE);
+      }
+    }
+
+    if (this->decode_task_stack_buffer_ == nullptr) {
+      return ESP_ERR_NO_MEM;  // NOTE(review): the read task created above keeps running in this case
+    }
+
+    if (this->decode_task_handle_ == nullptr) {  // Always true here; the enclosing check already established it
+      this->decode_task_handle_ =
+          xTaskCreateStatic(decode_task, (this->base_name_ + "_decode").c_str(), DECODE_TASK_STACK_SIZE, (void *) this,
+                            this->priority_, this->decode_task_stack_buffer_, &this->decode_task_stack_);
+    }
+
+    if (this->decode_task_handle_ == nullptr) {
+      return ESP_ERR_INVALID_STATE;  // The allocated stack is kept and reused by the next call
+    }
+  }
+
+  return ESP_OK;
+}
+
+void AudioPipeline::delete_tasks_() {  // Deletes each existing task, frees its stack and clears its pointers
+  if (this->read_task_handle_ != nullptr) {
+    vTaskDelete(this->read_task_handle_);
+
+    if (this->read_task_stack_buffer_ != nullptr) {
+      if (this->task_stack_in_psram_) {
+        RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_EXTERNAL);
+        stack_allocator.deallocate(this->read_task_stack_buffer_, READ_TASK_STACK_SIZE);
+      } else {
+        RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_INTERNAL);
+        stack_allocator.deallocate(this->read_task_stack_buffer_, READ_TASK_STACK_SIZE);
+      }
+
+      this->read_task_stack_buffer_ = nullptr;
+    }
+    this->read_task_handle_ = nullptr;  // The task was deleted either way, so never keep the stale handle
+  }
+
+  if (this->decode_task_handle_ != nullptr) {
+    vTaskDelete(this->decode_task_handle_);
+
+    if (this->decode_task_stack_buffer_ != nullptr) {
+      if (this->task_stack_in_psram_) {
+        RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_EXTERNAL);
+        stack_allocator.deallocate(this->decode_task_stack_buffer_, DECODE_TASK_STACK_SIZE);
+      } else {
+        RAMAllocator<StackType_t> stack_allocator(RAMAllocator<StackType_t>::ALLOC_INTERNAL);
+        stack_allocator.deallocate(this->decode_task_stack_buffer_, DECODE_TASK_STACK_SIZE);
+      }
+
+      this->decode_task_stack_buffer_ = nullptr;
+    }
+    this->decode_task_handle_ = nullptr;  // The task was deleted either way, so never keep the stale handle
+  }
+}
+
+void AudioPipeline::read_task(void *params) {  // Task entry point; params is the owning AudioPipeline
+  AudioPipeline *this_pipeline = (AudioPipeline *) params;
+
+  while (true) {
+    xEventGroupSetBits(this_pipeline->event_group_, EventGroupBits::READER_MESSAGE_FINISHED);
+
+    // Wait until the pipeline notifies us the source of the media file
+    EventBits_t event_bits =
+        xEventGroupWaitBits(this_pipeline->event_group_,
+                            EventGroupBits::READER_COMMAND_INIT_FILE | EventGroupBits::READER_COMMAND_INIT_HTTP |
+                                EventGroupBits::PIPELINE_COMMAND_STOP,  // Bit message to read
+                            pdFALSE,                                    // Do not clear the bits on exit
+                            pdFALSE,                                    // Wait for any one bit, not all of them
+                            portMAX_DELAY);                             // Block indefinitely until bit is set
+
+    if (!(event_bits & EventGroupBits::PIPELINE_COMMAND_STOP)) {
+      xEventGroupClearBits(this_pipeline->event_group_, EventGroupBits::READER_MESSAGE_FINISHED |
+                                                            EventGroupBits::READER_COMMAND_INIT_FILE |
+                                                            EventGroupBits::READER_COMMAND_INIT_HTTP);
+      InfoErrorEvent event;
+      event.source = InfoErrorSource::READER;
+      esp_err_t err = ESP_OK;
+
+      std::unique_ptr<audio::AudioReader> reader =
+          make_unique<audio::AudioReader>(this_pipeline->transfer_buffer_size_);
+
+      if (event_bits & EventGroupBits::READER_COMMAND_INIT_FILE) {
+        err = reader->start(this_pipeline->current_audio_file_, this_pipeline->current_audio_file_type_);
+      } else {
+        err = reader->start(this_pipeline->current_uri_, this_pipeline->current_audio_file_type_);
+      }
+
+      if (err == ESP_OK) {
+        size_t file_ring_buffer_size = this_pipeline->buffer_size_;
+
+        std::shared_ptr<RingBuffer> temp_ring_buffer;
+
+        if (!this_pipeline->raw_file_ring_buffer_.use_count()) {  // No live ring buffer yet, so create one
+          temp_ring_buffer = RingBuffer::create(file_ring_buffer_size);
+          this_pipeline->raw_file_ring_buffer_ = temp_ring_buffer;
+        }
+
+        if (!this_pipeline->raw_file_ring_buffer_.use_count()) {
+          err = ESP_ERR_NO_MEM;
+        } else {
+          reader->add_sink(this_pipeline->raw_file_ring_buffer_);
+        }
+      }
+
+      if (err != ESP_OK) {
+        // Send specific error message
+        event.err = err;
+        xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY);
+
+        // Setting up the reader failed, stop the pipeline
+        xEventGroupSetBits(this_pipeline->event_group_,
+                           EventGroupBits::READER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP);
+      } else {
+        // Send the file type to the pipeline
+        event.file_type = this_pipeline->current_audio_file_type_;
+        xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY);
+        xEventGroupSetBits(this_pipeline->event_group_, EventGroupBits::READER_MESSAGE_LOADED_MEDIA_TYPE);
+      }
+
+      while (true) {  // Read until the source is exhausted, reading fails, or a stop is requested
+        event_bits = xEventGroupGetBits(this_pipeline->event_group_);
+
+        if (event_bits & EventGroupBits::PIPELINE_COMMAND_STOP) {
+          break;
+        }
+
+        audio::AudioReaderState reader_state = reader->read();
+
+        if (reader_state == audio::AudioReaderState::FINISHED) {
+          break;
+        } else if (reader_state == audio::AudioReaderState::FAILED) {
+          xEventGroupSetBits(this_pipeline->event_group_,
+                             EventGroupBits::READER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP);
+          break;
+        }
+      }
+      event_bits = xEventGroupGetBits(this_pipeline->event_group_);
+      if ((event_bits & EventGroupBits::READER_MESSAGE_LOADED_MEDIA_TYPE) ||
+          (this_pipeline->raw_file_ring_buffer_.use_count() == 1)) {
+        // Decoder task hasn't started yet, so delay a bit before releasing ownership of the ring buffer
+        delay(10);
+      }
+    }
+  }
+}
+
+void AudioPipeline::decode_task(void *params) {  // Task entry point; params is the owning AudioPipeline
+  AudioPipeline *this_pipeline = (AudioPipeline *) params;
+
+  while (true) {
+    xEventGroupSetBits(this_pipeline->event_group_, EventGroupBits::DECODER_MESSAGE_FINISHED);
+
+    // Wait until the reader notifies us that the media type is available
+    EventBits_t event_bits = xEventGroupWaitBits(this_pipeline->event_group_,
+                                                 EventGroupBits::READER_MESSAGE_LOADED_MEDIA_TYPE |
+                                                     EventGroupBits::PIPELINE_COMMAND_STOP,  // Bit message to read
+                                                 pdFALSE,                                    // Do not clear on exit
+                                                 pdFALSE,                                    // Wait for any one bit
+                                                 portMAX_DELAY);  // Block indefinitely until bit is set
+
+    if (!(event_bits & EventGroupBits::PIPELINE_COMMAND_STOP)) {
+      xEventGroupClearBits(this_pipeline->event_group_,
+                           EventGroupBits::DECODER_MESSAGE_FINISHED | EventGroupBits::READER_MESSAGE_LOADED_MEDIA_TYPE);
+      InfoErrorEvent event;
+      event.source = InfoErrorSource::DECODER;
+
+      std::unique_ptr<audio::AudioDecoder> decoder =
+          make_unique<audio::AudioDecoder>(this_pipeline->transfer_buffer_size_, this_pipeline->transfer_buffer_size_);
+
+      esp_err_t err = decoder->start(this_pipeline->current_audio_file_type_);
+      decoder->add_source(this_pipeline->raw_file_ring_buffer_);
+
+      if (err != ESP_OK) {
+        // Send specific error message
+        event.err = err;
+        xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY);
+
+        // Setting up the decoder failed, stop the pipeline
+        xEventGroupSetBits(this_pipeline->event_group_,
+                           EventGroupBits::DECODER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP);
+      }
+
+      bool has_stream_info = false;
+      bool started_playback = false;
+
+      size_t initial_bytes_to_buffer = 0;
+
+      while (true) {
+        event_bits = xEventGroupGetBits(this_pipeline->event_group_);
+
+        if (event_bits & EventGroupBits::PIPELINE_COMMAND_STOP) {
+          break;
+        }
+
+        // Update pause state: output stays paused until playback starts, then it follows the pipeline's pause state
+        if (!started_playback) {
+          if (!(event_bits & EventGroupBits::READER_MESSAGE_FINISHED)) {
+            decoder->set_pause_output_state(true);
+          } else {
+            started_playback = true;
+          }
+        } else {
+          decoder->set_pause_output_state(this_pipeline->pause_state_);
+        }
+
+        // Stop gracefully if the reader has finished
+        audio::AudioDecoderState decoder_state = decoder->decode(event_bits & EventGroupBits::READER_MESSAGE_FINISHED);
+
+        if ((decoder_state == audio::AudioDecoderState::DECODING) ||
+            (decoder_state == audio::AudioDecoderState::FINISHED)) {
+          this_pipeline->playback_ms_ = decoder->get_playback_ms();
+        }
+
+        if (decoder_state == audio::AudioDecoderState::FINISHED) {
+          break;
+        } else if (decoder_state == audio::AudioDecoderState::FAILED) {
+          if (!has_stream_info) {
+            event.decoding_err = DecodingError::FAILED_HEADER;
+            xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY);
+          }
+          xEventGroupSetBits(this_pipeline->event_group_,
+                             EventGroupBits::DECODER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP);
+          break;
+        }
+
+        if (!has_stream_info && decoder->get_audio_stream_info().has_value()) {
+          has_stream_info = true;
+
+          this_pipeline->current_audio_stream_info_ = decoder->get_audio_stream_info().value();
+
+          // Send the stream information to the pipeline
+          event.audio_stream_info = this_pipeline->current_audio_stream_info_;
+
+          if (this_pipeline->current_audio_stream_info_.get_bits_per_sample() != 16) {
+            // Error state, incompatible bits per sample
+            event.decoding_err = DecodingError::INCOMPATIBLE_BITS_PER_SAMPLE;
+            xEventGroupSetBits(this_pipeline->event_group_,
+                               EventGroupBits::DECODER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP);
+          } else if ((this_pipeline->current_audio_stream_info_.get_channels() > 2)) {
+            // Error state, incompatible number of channels
+            event.decoding_err = DecodingError::INCOMPATIBLE_CHANNELS;
+            xEventGroupSetBits(this_pipeline->event_group_,
+                               EventGroupBits::DECODER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP);
+          } else {
+            // Send audio directly to the speaker
+            this_pipeline->speaker_->set_audio_stream_info(this_pipeline->current_audio_stream_info_);
+            decoder->add_sink(this_pipeline->speaker_);
+          }
+
+          initial_bytes_to_buffer = std::min(this_pipeline->current_audio_stream_info_.ms_to_bytes(INITIAL_BUFFER_MS),
+                                             this_pipeline->buffer_size_ * 3 / 4);
+
+          switch (this_pipeline->current_audio_file_type_) {
+#ifdef USE_AUDIO_MP3_SUPPORT
+            case audio::AudioFileType::MP3:
+              initial_bytes_to_buffer /= 8;  // Estimate the MP3 compression factor is 8
+              break;
+#endif
+#ifdef USE_AUDIO_FLAC_SUPPORT
+            case audio::AudioFileType::FLAC:
+              initial_bytes_to_buffer /= 2;  // Estimate the FLAC compression factor is 2
+              break;
+#endif
+            default:
+              break;
+          }
+          xQueueSend(this_pipeline->info_error_queue_, &event, portMAX_DELAY);
+        }
+
+        if (!started_playback && has_stream_info) {
+          // Verify enough data is buffered before starting playback. NOTE(review): lock() result is used unchecked
+          std::shared_ptr<RingBuffer> temp_ring_buffer = this_pipeline->raw_file_ring_buffer_.lock();
+          if (temp_ring_buffer->available() >= initial_bytes_to_buffer) {
+            started_playback = true;
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace speaker
+}  // namespace esphome
+
+#endif
diff --git a/esphome/components/speaker/media_player/audio_pipeline.h b/esphome/components/speaker/media_player/audio_pipeline.h
new file mode 100644
index 0000000000..c382e1eebe
--- /dev/null
+++ b/esphome/components/speaker/media_player/audio_pipeline.h
@@ -0,0 +1,158 @@
+#pragma once
+
+#ifdef USE_ESP_IDF
+
+#include "esphome/components/audio/audio.h"
+#include "esphome/components/audio/audio_reader.h"
+#include "esphome/components/audio/audio_decoder.h"
+#include "esphome/components/speaker/speaker.h"
+
+#include "esphome/core/ring_buffer.h"
+
+#include "esp_err.h"
+
+#include <freertos/FreeRTOS.h>
+#include <freertos/event_groups.h>
+#include <freertos/queue.h>
+
+namespace esphome {
+namespace speaker {
+
+// Upper limit, in bytes, for the internal sink/source transfer buffers of the reader and decoder
+static const size_t DEFAULT_TRANSFER_BUFFER_SIZE = 24 * 1024;
+
+enum class AudioPipelineType : uint8_t {  // NOTE(review): unused by AudioPipeline itself; presumably for callers
+  MEDIA,
+  ANNOUNCEMENT,
+};
+
+enum class AudioPipelineState : uint8_t {  // Value returned by AudioPipeline::process_state()
+  STARTING_FILE,   // NOTE(review): never returned by process_state() in audio_pipeline.cpp
+  STARTING_URL,    // NOTE(review): never returned by process_state() in audio_pipeline.cpp
+  PLAYING,         // A start command was just issued, or tasks exist and no other state applies
+  STOPPING,        // NOTE(review): never returned by process_state() in audio_pipeline.cpp
+  STOPPED,         // Reader and decoder both report finished, or neither task exists
+  PAUSED,          // The pause state is set and no stop or error condition applies
+  ERROR_READING,   // The reader flagged an error; reported once, then the flag is cleared
+  ERROR_DECODING,  // The decoder flagged an error; reported once, then the flag is cleared
+};
+
+enum class InfoErrorSource : uint8_t {  // Identifies the task that produced an InfoErrorEvent
+  READER = 0,
+  DECODER,
+};
+
+enum class DecodingError : uint8_t {
+  FAILED_HEADER = 0,             // Decoding failed before any stream info was available
+  INCOMPATIBLE_BITS_PER_SAMPLE,  // Stream is not 16 bits per sample
+  INCOMPATIBLE_CHANNELS,         // Stream has more than 2 channels
+};
+
+// Sent by each task through info_error_queue_; process_state() receives and logs whichever fields are set.
+struct InfoErrorEvent {
+  InfoErrorSource source;                              // Task that produced the event
+  optional<esp_err_t> err;                             // Set when setting up the reader or decoder failed
+  optional<audio::AudioFileType> file_type;            // Set by the reader after it started successfully
+  optional<audio::AudioStreamInfo> audio_stream_info;  // Set by the decoder once stream info is available
+  optional<DecodingError> decoding_err;                // Set by the decoder for header/format problems
+};
+
+class AudioPipeline {
+ public:
+  /// @param speaker ESPHome speaker component for pipeline's audio output
+  /// @param buffer_size Size of the buffer in bytes between the reader and decoder
+  /// @param task_stack_in_psram True if the task stack should be allocated in PSRAM, false otherwise
+  /// @param base_name FreeRTOS task base name; "_read" and "_decode" are appended for the two tasks
+  /// @param priority FreeRTOS task priority
+  AudioPipeline(speaker::Speaker *speaker, size_t buffer_size, bool task_stack_in_psram, std::string base_name,
+                UBaseType_t priority);
+
+  /// @brief Queues a media url for playback; requests a stop first if the pipeline is currently playing
+  /// @param uri media file url
+  /// @note Returns nothing; the reader is only commanded to start on a later process_state() call
+  void start_url(const std::string &uri);
+
+  /// @brief Queues an AudioFile for playback; requests a stop first if the pipeline is currently playing
+  /// @param audio_file pointer to an AudioFile object
+  /// @note Returns nothing; the reader is only commanded to start on a later process_state() call
+  void start_file(audio::AudioFile *audio_file);
+
+  /// @brief Requests that the pipeline stop by setting the stop command bit. Does not wait for the tasks.
+  /// @return ESP_OK always
+  esp_err_t stop();
+
+  /// @brief Processes the state of the audio pipeline based on the info_error_queue_ and event_group_. Handles creating
+  /// and stopping the pipeline tasks. Needs to be regularly called to update the internal pipeline state.
+  /// @return AudioPipelineState
+  AudioPipelineState process_state();
+
+  /// @brief Suspends any running tasks
+  void suspend_tasks();
+  /// @brief Resumes any running tasks
+  void resume_tasks();
+
+  uint32_t get_playback_ms() { return this->playback_ms_; }  // Updated by the decode task; reset to 0 on each start
+
+  void set_pause_state(bool pause_state);
+
+ protected:
+  /// @brief Allocates the event group and info error queue.
+  /// @return ESP_OK if successful or ESP_ERR_NO_MEM if it is unable to allocate all parts
+  esp_err_t allocate_communications_();
+
+  /// @brief Creates the reader and decoder tasks, allocating their stacks first, for whichever task does not exist.
+  /// @return ESP_OK, ESP_ERR_NO_MEM if a stack allocation fails, or ESP_ERR_INVALID_STATE if a task isn't created
+  esp_err_t start_tasks_();
+
+  /// @brief Deletes the tasks, deallocates their stacks, and resets the task related pointers.
+  void delete_tasks_();
+
+  std::string base_name_;
+  UBaseType_t priority_;
+
+  uint32_t playback_ms_{0};
+
+  bool is_playing_{false};
+  bool pause_state_{false};
+  bool task_stack_in_psram_;
+
+  // Pending file start state used to ensure the pipeline fully stops before attempting to start the next file
+  bool pending_url_{false};
+  bool pending_file_{false};
+
+  speaker::Speaker *speaker_{nullptr};
+
+  std::string current_uri_{};
+  audio::AudioFile *current_audio_file_{nullptr};
+
+  audio::AudioFileType current_audio_file_type_;
+  audio::AudioStreamInfo current_audio_stream_info_;
+
+  size_t buffer_size_;           // Ring buffer between reader and decoder
+  size_t transfer_buffer_size_;  // Internal source/sink buffers for the audio reader and decoder
+
+  std::weak_ptr<RingBuffer> raw_file_ring_buffer_;
+
+  // Handles basic control/state of the two tasks
+  EventGroupHandle_t event_group_{nullptr};
+
+  // Receives detailed info (file type, stream info) or specific errors from the two tasks
+  QueueHandle_t info_error_queue_{nullptr};
+
+  // Handles reading the media file from flash or a url
+  static void read_task(void *params);
+  TaskHandle_t read_task_handle_{nullptr};
+  StaticTask_t read_task_stack_;  // Task buffer handed to xTaskCreateStatic(); the stack is the pointer below
+  StackType_t *read_task_stack_buffer_{nullptr};
+
+  // Decodes the media file into PCM audio
+  static void decode_task(void *params);
+  TaskHandle_t decode_task_handle_{nullptr};
+  StaticTask_t decode_task_stack_;  // Task buffer handed to xTaskCreateStatic(); the stack is the pointer below
+  StackType_t *decode_task_stack_buffer_{nullptr};
+};
+
+}  // namespace speaker
+}  // namespace esphome
+
+#endif
diff --git a/esphome/components/speaker/media_player/automation.h b/esphome/components/speaker/media_player/automation.h
new file mode 100644
index 0000000000..d1a01aabc4
--- /dev/null
+++ b/esphome/components/speaker/media_player/automation.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "speaker_media_player.h"
+
+#ifdef USE_ESP_IDF
+
+#include "esphome/components/audio/audio.h"
+#include "esphome/core/automation.h"
+
+namespace esphome {
+namespace speaker {
+
+template<typename... Ts> class PlayOnDeviceMediaAction : public Action<Ts...>, public Parented<SpeakerMediaPlayer> {
+  TEMPLATABLE_VALUE(audio::AudioFile *, audio_file)  // File handed to play_file()
+  TEMPLATABLE_VALUE(bool, announcement)              // Second argument handed to play_file()
+  TEMPLATABLE_VALUE(bool, enqueue)                   // Third argument handed to play_file()
+  void play(Ts... x) override {  // Resolves the templatable values and forwards them to the parent's play_file()
+    this->parent_->play_file(this->audio_file_.value(x...), this->announcement_.value(x...),
+                             this->enqueue_.value(x...));
+  }
+};
+
+}  // namespace speaker
+}  // namespace esphome
+
+#endif
diff --git a/esphome/components/speaker/media_player/speaker_media_player.cpp b/esphome/components/speaker/media_player/speaker_media_player.cpp
new file mode 100644
index 0000000000..0a2585ce60
--- /dev/null
+++ b/esphome/components/speaker/media_player/speaker_media_player.cpp
@@ -0,0 +1,577 @@
+#include "speaker_media_player.h"
+
+#ifdef USE_ESP_IDF
+
+#include "esphome/core/log.h"
+
+#include "esphome/components/audio/audio.h"
+#ifdef USE_OTA
+#include "esphome/components/ota/ota_backend.h"
+#endif
+
+namespace esphome {
+namespace speaker {
+
+// Framework:
+//  - Media player that can handle two streams: one for media and one for announcements
+//    - Each stream has an individual speaker component for output
+//  - Each stream is handled by an ``AudioPipeline`` object with two parts/tasks
+//    - ``AudioReader`` handles reading from an HTTP source or from a PROGMEM flash set at compile time
+//    - ``AudioDecoder`` handles decoding the audio file. All formats are limited to two channels and 16 bits per sample
+//      - FLAC
+//      - MP3 (based on the libhelix decoder)
+//      - WAV
+//    - Each task runs until it is done processing the file or it receives a stop command
+//    - Inter-task communication uses a FreeRTOS Event Group
+//    - The ``AudioPipeline`` sets up a ring buffer between the reader and decoder tasks. The decoder task outputs audio
+//      directly to a speaker component.
+//    - The pipeline's internal state needs to be processed by regularly calling ``process_state``.
+//  - Generic media player commands are received by the ``control`` function. The commands are added to the
+//    ``media_control_command_queue_`` to be processed in the component's loop
+//    - Local file playback is initiated with ``play_file`` and adds it to the ``media_control_command_queue_``
+//    - Starting a stream initializes the appropriate pipeline or stops it if it is already running
+//    - Volume and mute commands are achieved by the ``mute``, ``unmute``, ``set_volume`` functions.
+//      - Volume commands are ignored if the media control queue is full to avoid crashing with rapid volume
+//        increases/decreases.
+//      - These functions all send the appropriate information to the speakers to implement.
+//    - Pausing is implemented in the decoder task and is also sent directly to the media speaker component to decrease
+//      latency.
+//  - The component's main loop performs housekeeping:
+//    - It reads the media control queue and processes it directly
+//    - It determines the overall state of the media player by considering the state of each pipeline
+//      - announcement playback takes highest priority
+//    - Handles playlists and repeating by starting the appropriate file when a previous file is finished
+//  - Logging only happens in the main loop task to reduce task stack memory usage.
+
+static const uint32_t MEDIA_CONTROLS_QUEUE_LENGTH = 20;  // Number of MediaCallCommand items the control queue holds
+
+static const UBaseType_t MEDIA_PIPELINE_TASK_PRIORITY = 1;         // Priority passed to the media AudioPipeline
+static const UBaseType_t ANNOUNCEMENT_PIPELINE_TASK_PRIORITY = 1;  // Priority passed to the announcement pipeline
+
+static const float FIRST_BOOT_DEFAULT_VOLUME = 0.5f;  // Used by setup() when no saved volume state can be loaded
+
+static const char *const TAG = "speaker_media_player";  // Tag used for this file's log messages
+
+void SpeakerMediaPlayer::setup() {
+  // One-time initialisation: command queue, saved volume/mute state, OTA hooks and the audio pipeline(s).
+  state = media_player::MEDIA_PLAYER_STATE_IDLE;
+  this->media_control_command_queue_ = xQueueCreate(MEDIA_CONTROLS_QUEUE_LENGTH, sizeof(MediaCallCommand));
+
+  this->pref_ = global_preferences->make_preference<VolumeRestoreState>(this->get_object_id_hash());
+  // Restore the saved volume/mute state, or fall back to the first-boot default when nothing could be loaded
+  VolumeRestoreState volume_restore_state;
+  if (this->pref_.load(&volume_restore_state)) {
+    this->set_volume_(volume_restore_state.volume);
+    this->set_mute_state_(volume_restore_state.is_muted);
+  } else {
+    this->set_volume_(FIRST_BOOT_DEFAULT_VOLUME);
+    this->set_mute_state_(false);
+  }
+  // With OTA support: suspend the pipeline tasks when an update starts and resume them if it fails
+#ifdef USE_OTA
+  ota::get_global_ota_callback()->add_on_state_callback(
+      [this](ota::OTAState state, float progress, uint8_t error, ota::OTAComponent *comp) {
+        if (state == ota::OTA_STARTED) {
+          if (this->media_pipeline_ != nullptr) {
+            this->media_pipeline_->suspend_tasks();
+          }
+          if (this->announcement_pipeline_ != nullptr) {
+            this->announcement_pipeline_->suspend_tasks();
+          }
+        } else if (state == ota::OTA_ERROR) {
+          if (this->media_pipeline_ != nullptr) {
+            this->media_pipeline_->resume_tasks();
+          }
+          if (this->announcement_pipeline_ != nullptr) {
+            this->announcement_pipeline_->resume_tasks();
+          }
+        }
+      });
+#endif
+
+  this->announcement_pipeline_ =
+      make_unique<AudioPipeline>(this->announcement_speaker_, this->buffer_size_, this->task_stack_in_psram_, "ann",
+                                 ANNOUNCEMENT_PIPELINE_TASK_PRIORITY);
+
+  if (this->announcement_pipeline_ == nullptr) {
+    ESP_LOGE(TAG, "Failed to create announcement pipeline");
+    this->mark_failed();
+  }
+
+  if (!this->single_pipeline_()) {
+    // Give the media pipeline its own name ("med") instead of reusing the announcement pipeline's "ann"
+    this->media_pipeline_ = make_unique<AudioPipeline>(this->media_speaker_, this->buffer_size_,
+                                                       this->task_stack_in_psram_, "med", MEDIA_PIPELINE_TASK_PRIORITY);
+    if (this->media_pipeline_ == nullptr) {
+      ESP_LOGE(TAG, "Failed to create media pipeline");
+      this->mark_failed();
+    }
+
+    // Setup callback to track the duration of audio played by the media pipeline
+    this->media_speaker_->add_audio_output_callback(
+        [this](uint32_t new_playback_ms, uint32_t remainder_us, uint32_t pending_ms, uint32_t write_timestamp) {
+          this->playback_ms_ += new_playback_ms;
+          this->remainder_us_ = remainder_us;
+          this->pending_ms_ = pending_ms;
+          this->last_audio_write_timestamp_ = write_timestamp;
+          this->playback_us_ = this->playback_ms_ * 1000 + this->remainder_us_;
+        });
+  }
+
+  ESP_LOGI(TAG, "Set up speaker media player");
+}
+
+void SpeakerMediaPlayer::set_playlist_delay_ms(AudioPipelineType pipeline_type, uint32_t delay_ms) {
+  // Stores delay_ms as the playlist delay of the chosen pipeline; any other pipeline_type changes nothing.
+  if (pipeline_type == AudioPipelineType::ANNOUNCEMENT) {
+    this->announcement_playlist_delay_ms_ = delay_ms;
+    return;
+  }
+  if (pipeline_type == AudioPipelineType::MEDIA) {
+    this->media_playlist_delay_ms_ = delay_ms;
+  }
+}
+
+void SpeakerMediaPlayer::watch_media_commands_() {
+  // Pops at most one command from media_control_command_queue_ (without waiting) and applies it.
+  if (!this->is_ready()) {
+    return;
+  }
+  MediaCallCommand media_command;
+  esp_err_t err = ESP_OK;  // NOTE: never reassigned in this function, so the error branch below cannot trigger
+
+  if (xQueueReceive(this->media_control_command_queue_, &media_command, 0) == pdTRUE) {
+    bool new_url = media_command.new_url.has_value() && media_command.new_url.value();
+    bool new_file = media_command.new_file.has_value() && media_command.new_file.value();
+    const bool use_announcement =
+        this->single_pipeline_() || (media_command.announce.has_value() && media_command.announce.value());
+    if (new_url || new_file) {
+      bool enqueue = media_command.enqueue.has_value() && media_command.enqueue.value();
+      if (use_announcement) {
+        // Announcement playlist/pipeline (also used for everything when no media speaker is configured)
+
+        if (!enqueue) {
+          // Clear the queue and ensure the loaded next item doesn't start playing
+          this->cancel_timeout("next_ann");
+          this->announcement_playlist_.clear();
+        }
+
+        PlaylistItem playlist_item;
+        if (new_url) {
+          playlist_item.url = this->announcement_url_;
+          if (!enqueue) {
+            // Not adding to the queue, so directly start playback and internally unpause the pipeline
+            this->announcement_pipeline_->start_url(playlist_item.url.value());
+            this->announcement_pipeline_->set_pause_state(false);
+          }
+        } else {
+          playlist_item.file = this->announcement_file_;
+          if (!enqueue) {
+            // Not adding to the queue, so directly start playback and internally unpause the pipeline
+            this->announcement_pipeline_->start_file(playlist_item.file.value());
+            this->announcement_pipeline_->set_pause_state(false);
+          }
+        }
+        this->announcement_playlist_.push_back(playlist_item);
+      } else {
+        // Media playlist/pipeline
+
+        if (!enqueue) {
+          // Clear the queue and ensure the loaded next item doesn't start playing
+          this->cancel_timeout("next_media");
+          this->media_playlist_.clear();
+        }
+
+        this->is_paused_ = false;
+        PlaylistItem playlist_item;
+        if (new_url) {
+          playlist_item.url = this->media_url_;
+          if (!enqueue) {
+            // Not adding to the queue, so directly start playback and internally unpause the pipeline
+            this->media_pipeline_->start_url(playlist_item.url.value());
+            this->media_pipeline_->set_pause_state(false);
+          }
+        } else {
+          playlist_item.file = this->media_file_;
+          if (!enqueue) {
+            // Not adding to the queue, so directly start playback and internally unpause the pipeline
+            this->media_pipeline_->start_file(playlist_item.file.value());
+            this->media_pipeline_->set_pause_state(false);
+          }
+        }
+        this->media_playlist_.push_back(playlist_item);
+      }
+
+      if (err != ESP_OK) {
+        ESP_LOGE(TAG, "Error starting the audio pipeline: %s", esp_err_to_name(err));
+        this->status_set_error();
+      } else {
+        this->status_clear_error();
+      }
+
+      return;  // Don't process the new file play command further
+    }
+
+    if (media_command.volume.has_value()) {
+      this->set_volume_(media_command.volume.value());
+      this->publish_state();
+    }
+
+    if (media_command.command.has_value()) {
+      switch (media_command.command.value()) {
+        case media_player::MEDIA_PLAYER_COMMAND_PLAY:
+          if ((this->media_pipeline_ != nullptr) && (this->is_paused_)) {
+            this->media_pipeline_->set_pause_state(false);
+          }
+          this->is_paused_ = false;
+          break;
+        case media_player::MEDIA_PLAYER_COMMAND_PAUSE:
+          if ((this->media_pipeline_ != nullptr) && (!this->is_paused_)) {
+            this->media_pipeline_->set_pause_state(true);
+          }
+          this->is_paused_ = true;
+          break;
+        case media_player::MEDIA_PLAYER_COMMAND_STOP:
+          if (use_announcement) {
+            if (this->announcement_pipeline_ != nullptr) {
+              this->cancel_timeout("next_ann");
+              this->announcement_playlist_.clear();
+              this->announcement_pipeline_->stop();
+            }
+          } else {
+            if (this->media_pipeline_ != nullptr) {
+              this->cancel_timeout("next_media");
+              this->media_playlist_.clear();
+              this->media_pipeline_->stop();
+            }
+          }
+          break;
+        case media_player::MEDIA_PLAYER_COMMAND_TOGGLE:
+          if (this->media_pipeline_ != nullptr) {
+            if (this->is_paused_) {
+              this->media_pipeline_->set_pause_state(false);
+              this->is_paused_ = false;
+            } else {
+              this->media_pipeline_->set_pause_state(true);
+              this->is_paused_ = true;
+            }
+          }
+          break;
+        case media_player::MEDIA_PLAYER_COMMAND_MUTE: {
+          this->set_mute_state_(true);
+
+          this->publish_state();
+          break;
+        }
+        case media_player::MEDIA_PLAYER_COMMAND_UNMUTE:
+          this->set_mute_state_(false);
+          this->publish_state();
+          break;
+        case media_player::MEDIA_PLAYER_COMMAND_VOLUME_UP:
+          this->set_volume_(std::min(1.0f, this->volume + this->volume_increment_));
+          this->publish_state();
+          break;
+        case media_player::MEDIA_PLAYER_COMMAND_VOLUME_DOWN:
+          this->set_volume_(std::max(0.0f, this->volume - this->volume_increment_));
+          this->publish_state();
+          break;
+        case media_player::MEDIA_PLAYER_COMMAND_REPEAT_ONE:
+          if (use_announcement) {
+            this->announcement_repeat_one_ = true;
+          } else {
+            this->media_repeat_one_ = true;
+          }
+          break;
+        case media_player::MEDIA_PLAYER_COMMAND_REPEAT_OFF:
+          if (use_announcement) {
+            this->announcement_repeat_one_ = false;
+          } else {
+            this->media_repeat_one_ = false;
+          }
+          break;
+        case media_player::MEDIA_PLAYER_COMMAND_CLEAR_PLAYLIST: {
+          // Keep only the front item (the one currently loaded) and drop everything queued behind it.
+          // Only non-empty playlists are trimmed: resize(1) on an empty deque would insert a blank PlaylistItem
+          // (no url, no file) that loop() has nothing to start for.
+          std::deque<PlaylistItem> &playlist =
+              use_announcement ? this->announcement_playlist_ : this->media_playlist_;
+          if (!playlist.empty()) {
+            playlist.resize(1);
+          }
+          break;
+        }
+        default:
+          break;
+      }
+    }
+  }
+}
+
+void SpeakerMediaPlayer::loop() {
+  // Per-iteration housekeeping: apply one queued command, poll both pipelines, advance playlists, publish state
+  this->watch_media_commands_();
+  // Determine state of the media player
+  media_player::MediaPlayerState old_state = this->state;
+
+  AudioPipelineState old_media_pipeline_state = this->media_pipeline_state_;
+  if (this->media_pipeline_ != nullptr) {
+    this->media_pipeline_state_ = this->media_pipeline_->process_state();
+    this->decoded_playback_ms_ = this->media_pipeline_->get_playback_ms();
+  }
+
+  if (this->media_pipeline_state_ == AudioPipelineState::ERROR_READING) {
+    ESP_LOGE(TAG, "The media pipeline's file reader encountered an error.");
+  } else if (this->media_pipeline_state_ == AudioPipelineState::ERROR_DECODING) {
+    ESP_LOGE(TAG, "The media pipeline's audio decoder encountered an error.");
+  }
+
+  AudioPipelineState old_announcement_pipeline_state = this->announcement_pipeline_state_;
+  if (this->announcement_pipeline_ != nullptr) {
+    this->announcement_pipeline_state_ = this->announcement_pipeline_->process_state();
+  }
+
+  if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_READING) {
+    ESP_LOGE(TAG, "The announcement pipeline's file reader encountered an error.");
+  } else if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_DECODING) {
+    ESP_LOGE(TAG, "The announcement pipeline's audio decoder encountered an error.");
+  }
+  // A running announcement pipeline takes priority over whatever the media pipeline is doing
+  if (this->announcement_pipeline_state_ != AudioPipelineState::STOPPED) {
+    this->state = media_player::MEDIA_PLAYER_STATE_ANNOUNCING;
+  } else {
+    if (!this->announcement_playlist_.empty()) {
+      uint32_t timeout_ms = 0;
+      if (old_announcement_pipeline_state == AudioPipelineState::PLAYING) {
+        // Finished the current announcement file
+        if (!this->announcement_repeat_one_) {
+          //  Pop item off the playlist if repeat is disabled
+          this->announcement_playlist_.pop_front();
+        }
+        // Only delay starting playback if moving on the next playlist item or repeating the current item
+        timeout_ms = this->announcement_playlist_delay_ms_;
+      }
+
+      if (!this->announcement_playlist_.empty()) {
+        // Start the next announcement file
+        PlaylistItem playlist_item = this->announcement_playlist_.front();
+        if (playlist_item.url.has_value()) {
+          this->announcement_pipeline_->start_url(playlist_item.url.value());
+        } else if (playlist_item.file.has_value()) {
+          this->announcement_pipeline_->start_file(playlist_item.file.value());
+        }
+
+        if (timeout_ms > 0) {
+          // Pause pipeline internally to facilitate delay between items
+          this->announcement_pipeline_->set_pause_state(true);
+          // Unpause after the playlist delay; announcements do not follow the media pause state (is_paused_)
+          this->set_timeout("next_ann", timeout_ms,
+                            [this]() { this->announcement_pipeline_->set_pause_state(false); });
+        }
+      }
+    } else {
+      if (this->is_paused_) {
+        this->state = media_player::MEDIA_PLAYER_STATE_PAUSED;
+      } else if (this->media_pipeline_state_ == AudioPipelineState::PLAYING) {
+        this->state = media_player::MEDIA_PLAYER_STATE_PLAYING;
+      } else if (this->media_pipeline_state_ == AudioPipelineState::STOPPED) {
+        // Reset playback durations
+        this->decoded_playback_ms_ = 0;
+        this->playback_us_ = 0;
+        this->playback_ms_ = 0;
+        this->remainder_us_ = 0;
+        this->pending_ms_ = 0;
+
+        if (!this->media_playlist_.empty()) {
+          uint32_t timeout_ms = 0;
+          if (old_media_pipeline_state == AudioPipelineState::PLAYING) {
+            // Finished the current media file
+            if (!this->media_repeat_one_) {
+              // Pop item off the playlist if repeat is disabled
+              this->media_playlist_.pop_front();
+            }
+            // Only delay starting playback if moving on the next playlist item or repeating the current item
+            timeout_ms = this->media_playlist_delay_ms_;
+          }
+          if (!this->media_playlist_.empty()) {
+            PlaylistItem playlist_item = this->media_playlist_.front();
+            if (playlist_item.url.has_value()) {
+              this->media_pipeline_->start_url(playlist_item.url.value());
+            } else if (playlist_item.file.has_value()) {
+              this->media_pipeline_->start_file(playlist_item.file.value());
+            }
+
+            if (timeout_ms > 0) {
+              // Pause pipeline internally to facilitate delay between items
+              this->media_pipeline_->set_pause_state(true);
+              // Internally unpause the pipeline after the delay between playlist items
+              this->set_timeout("next_media", timeout_ms,
+                                [this]() { this->media_pipeline_->set_pause_state(this->is_paused_); });
+            }
+          }
+        } else {
+          this->state = media_player::MEDIA_PLAYER_STATE_IDLE;
+        }
+      }
+    }
+  }
+
+  if (this->state != old_state) {
+    this->publish_state();
+    ESP_LOGD(TAG, "State changed to %s", media_player::media_player_state_to_string(this->state));
+  }
+}
+
+void SpeakerMediaPlayer::play_file(audio::AudioFile *media_file, bool announcement, bool enqueue) {
+  // Queues playback of a local audio file. The file pointer is stashed in announcement_file_ or media_file_
+  // and a "new file" command is posted to the control queue for watch_media_commands_() to pick up.
+  // Calls made while the component is not ready are dropped.
+  if (!this->is_ready()) {
+    return;
+  }
+
+  // Without a media speaker every file is routed through the announcement pipeline
+  const bool to_announcement = this->single_pipeline_() || announcement;
+  audio::AudioFile *&target_slot = to_announcement ? this->announcement_file_ : this->media_file_;
+  target_slot = media_file;
+
+  MediaCallCommand request;
+  request.new_file = true;
+  request.announce = to_announcement;
+  request.enqueue = enqueue;
+  xQueueSend(this->media_control_command_queue_, &request, portMAX_DELAY);  // waits if the queue is full
+}
+
+void SpeakerMediaPlayer::control(const media_player::MediaPlayerCall &call) {
+  // Translates a MediaPlayerCall into a single MediaCallCommand and posts it to the control queue.
+  // Exactly one aspect of the call is forwarded, checked in this order: media URL, volume, command.
+  // Calls made while the component is not ready are dropped.
+  if (!this->is_ready()) {
+    return;
+  }
+
+  // Without a media speaker everything is routed through the announcement pipeline
+  const bool to_announcement =
+      this->single_pipeline_() || (call.get_announcement().has_value() && call.get_announcement().value());
+
+  MediaCallCommand request;
+  request.announce = to_announcement;
+
+  const auto &requested_url = call.get_media_url();
+  const auto &requested_command = call.get_command();
+
+  if (requested_url.has_value()) {
+    // Remember the URL for the target pipeline; the queue item itself only carries flags
+    if (to_announcement) {
+      this->announcement_url_ = requested_url.value();
+    } else {
+      this->media_url_ = requested_url.value();
+    }
+    request.new_url = true;
+    if (requested_command.has_value() &&
+        (requested_command.value() == media_player::MEDIA_PLAYER_COMMAND_ENQUEUE)) {
+      request.enqueue = true;
+    }
+    xQueueSend(this->media_control_command_queue_, &request, portMAX_DELAY);
+    return;
+  }
+
+  if (call.get_volume().has_value()) {
+    request.volume = call.get_volume().value();
+    // Volume sets are not critical: do not wait if the queue is full
+    xQueueSend(this->media_control_command_queue_, &request, 0);
+    return;
+  }
+
+  if (!requested_command.has_value()) {
+    return;
+  }
+  const media_player::MediaPlayerCommand command = requested_command.value();
+  request.command = command;
+  // Volume steps are not critical either and are dropped when the queue is full; other commands wait for space
+  const bool is_volume_step = (command == media_player::MEDIA_PLAYER_COMMAND_VOLUME_UP) ||
+                              (command == media_player::MEDIA_PLAYER_COMMAND_VOLUME_DOWN);
+  const TickType_t ticks_to_wait = is_volume_step ? 0 : portMAX_DELAY;
+  xQueueSend(this->media_control_command_queue_, &request, ticks_to_wait);
+}
+
+media_player::MediaPlayerTraits SpeakerMediaPlayer::get_traits() {
+  // Builds the traits advertised for this player: pause support and the configured audio formats.
+  media_player::MediaPlayerTraits traits{};
+  auto &formats = traits.get_supported_formats();
+  // Pause is only advertised when a separate media pipeline exists
+  if (!this->single_pipeline_()) {
+    traits.set_supports_pause(true);
+  }
+  if (this->announcement_format_.has_value()) {
+    formats.push_back(this->announcement_format_.value());
+  }
+  if (this->media_format_.has_value()) {
+    formats.push_back(this->media_format_.value());
+  } else if (this->single_pipeline_() && this->announcement_format_.has_value()) {
+    // No media pipeline: advertise the announcement format a second time with the default purpose
+    formats.push_back(this->announcement_format_.value());
+    formats.back().purpose = media_player::MediaPlayerFormatPurpose::PURPOSE_DEFAULT;
+  }
+  return traits;
+}
+
+void SpeakerMediaPlayer::save_volume_restore_state_() {
+  // Snapshot the current volume and mute flag (in VolumeRestoreState field order) and pass the
+  // snapshot to the preference object created in setup().
+  VolumeRestoreState snapshot{this->volume, this->is_muted_};
+  this->pref_.save(&snapshot);
+}
+
+void SpeakerMediaPlayer::set_mute_state_(bool mute_state) {
+  // Applies the mute state to every configured speaker, stores it, saves it, and fires the mute or
+  // unmute trigger (deferred) when the stored state actually changed.
+  for (Speaker *output : {this->media_speaker_, this->announcement_speaker_}) {
+    if (output != nullptr) {
+      output->set_mute_state(mute_state);
+    }
+  }
+
+  const bool changed = (this->is_muted_ != mute_state);
+  this->is_muted_ = mute_state;
+  this->save_volume_restore_state_();
+  if (!changed) {
+    return;
+  }
+  if (mute_state) {
+    this->defer([this]() { this->mute_trigger_->trigger(); });
+    return;
+  }
+  this->defer([this]() { this->unmute_trigger_->trigger(); });
+}
+
+void SpeakerMediaPlayer::set_volume_(float volume, bool publish) {
+  // Applies a new volume. `volume` is treated as a 0.0-1.0 fraction; the speakers receive it remapped
+  // into [volume_min_, volume_max_]. With `publish` set, this->volume is updated and saved as well.
+  // The mute state follows the requested volume and the volume trigger is always fired (deferred).
+  const float speaker_volume = remap<float, float>(volume, 0.0f, 1.0f, this->volume_min_, this->volume_max_);
+
+  // Forward the remapped level to every configured speaker
+  for (Speaker *output : {this->media_speaker_, this->announcement_speaker_}) {
+    if (output != nullptr) {
+      output->set_volume(speaker_volume);
+    }
+  }
+
+  // Only a published change updates the reported volume and the saved restore state
+  if (publish) {
+    this->volume = volume;
+    this->save_volume_restore_state_();
+  }
+
+  // A volume that is effectively zero counts as muted; anything else unmutes
+  const bool effectively_silent = volume < 0.001;
+  this->set_mute_state_(effectively_silent);
+
+  // Notify automations through the deferred volume trigger
+  this->defer([this, volume]() { this->volume_trigger_->trigger(volume); });
+}
+
+}  // namespace speaker
+}  // namespace esphome
+
+#endif
diff --git a/esphome/components/speaker/media_player/speaker_media_player.h b/esphome/components/speaker/media_player/speaker_media_player.h
new file mode 100644
index 0000000000..6cbce91866
--- /dev/null
+++ b/esphome/components/speaker/media_player/speaker_media_player.h
@@ -0,0 +1,160 @@
+#pragma once
+
+#ifdef USE_ESP_IDF
+
+#include "audio_pipeline.h"
+
+#include "esphome/components/audio/audio.h"
+
+#include "esphome/components/media_player/media_player.h"
+#include "esphome/components/speaker/speaker.h"
+
+#include "esphome/core/automation.h"
+#include "esphome/core/component.h"
+#include "esphome/core/preferences.h"
+
+#include <deque>
+#include <freertos/FreeRTOS.h>
+#include <freertos/queue.h>
+
+namespace esphome {
+namespace speaker {
+
+struct MediaCallCommand {  // Item type of media_control_command_queue_; only the relevant optionals are set
+  optional<media_player::MediaPlayerCommand> command;  // player command handled in watch_media_commands_()
+  optional<float> volume;                              // new volume passed to set_volume_()
+  optional<bool> announce;                             // true: target the announcement pipeline/playlist
+  optional<bool> new_url;                              // true: control() stored a URL to start or enqueue
+  optional<bool> new_file;                             // true: play_file() stored a file to start or enqueue
+  optional<bool> enqueue;                              // true: append to the playlist instead of replacing it
+};
+
+struct PlaylistItem {  // One playlist entry; loop() starts `url` if set, otherwise `file` if set
+  optional<std::string> url;          // source passed to AudioPipeline::start_url()
+  optional<audio::AudioFile *> file;  // source passed to AudioPipeline::start_file()
+};
+
+struct VolumeRestoreState {  // Data saved/loaded through the ESPPreferenceObject `pref_`
+  float volume;   // copy of the media player's `volume` member
+  bool is_muted;  // copy of `is_muted_`
+};
+
+class SpeakerMediaPlayer : public Component, public media_player::MediaPlayer {
+ public:
+  // Media player that plays announcement and (optionally) media audio through Speaker components.
+  float get_setup_priority() const override { return esphome::setup_priority::PROCESSOR; }
+  void setup() override;
+  void loop() override;
+  // MediaPlayer implementations
+  media_player::MediaPlayerTraits get_traits() override;
+  bool is_muted() const override { return this->is_muted_; }
+  // Buffer size and task-stack placement that setup() passes to the AudioPipeline constructor
+  void set_buffer_size(size_t buffer_size) { this->buffer_size_ = buffer_size; }
+  void set_task_stack_in_psram(bool task_stack_in_psram) { this->task_stack_in_psram_ = task_stack_in_psram; }
+
+  // Percentage to increase or decrease the volume for volume up or volume down commands
+  void set_volume_increment(float volume_increment) { this->volume_increment_ = volume_increment; }
+  // Range that the 0.0-1.0 volume is remapped into before it is sent to the speakers
+  void set_volume_max(float volume_max) { this->volume_max_ = volume_max; }
+  void set_volume_min(float volume_min) { this->volume_min_ = volume_min; }
+  // Output speakers and advertised formats; without a media speaker only the announcement pipeline is used
+  void set_announcement_speaker(Speaker *announcement_speaker) { this->announcement_speaker_ = announcement_speaker; }
+  void set_announcement_format(const media_player::MediaPlayerSupportedFormat &announcement_format) {
+    this->announcement_format_ = announcement_format;
+  }
+  void set_media_speaker(Speaker *media_speaker) { this->media_speaker_ = media_speaker; }
+  void set_media_format(const media_player::MediaPlayerSupportedFormat &media_format) {
+    this->media_format_ = media_format;
+  }
+  // Triggers fired (deferred) by set_mute_state_() and set_volume_()
+  Trigger<> *get_mute_trigger() const { return this->mute_trigger_; }
+  Trigger<> *get_unmute_trigger() const { return this->unmute_trigger_; }
+  Trigger<float> *get_volume_trigger() const { return this->volume_trigger_; }
+  // Queues a local audio file; announcement selects the pipeline, enqueue appends instead of replacing
+  void play_file(audio::AudioFile *media_file, bool announcement, bool enqueue);
+  // Playback progress counters for the media pipeline
+  uint32_t get_playback_ms() const { return this->playback_ms_; }
+  uint32_t get_playback_us() const { return this->playback_us_; }
+  uint32_t get_decoded_playback_ms() const { return this->decoded_playback_ms_; }
+  // Sets the delay applied between consecutive playlist items of the given pipeline
+  void set_playlist_delay_ms(AudioPipelineType pipeline_type, uint32_t delay_ms);
+
+ protected:
+  // Receives commands from HA or from the voice assistant component
+  // Sends commands to the media_control_command_queue_
+  void control(const media_player::MediaPlayerCall &call) override;
+
+  /// @brief Updates this->volume and saves volume/mute state to flash for restoration if publish is true.
+  void set_volume_(float volume, bool publish = true);
+
+  /// @brief Sets the mute state on the speakers and fires the mute/unmute trigger on a change. Always saves
+  /// volume/mute state to flash for restoration.
+  /// @param mute_state If true, audio will be muted. If false, audio will be unmuted
+  void set_mute_state_(bool mute_state);
+
+  /// @brief Saves the current volume and mute state to the flash for restoration.
+  void save_volume_restore_state_();
+
+  /// Returns true if the media player has only the announcement pipeline defined, false if both the announcement and
+  /// media pipelines are defined.
+  inline bool single_pipeline_() { return (this->media_speaker_ == nullptr); }
+
+  // Processes commands from media_control_command_queue_.
+  void watch_media_commands_();
+
+  std::unique_ptr<AudioPipeline> announcement_pipeline_;
+  std::unique_ptr<AudioPipeline> media_pipeline_;
+  Speaker *media_speaker_{nullptr};
+  Speaker *announcement_speaker_{nullptr};
+
+  optional<media_player::MediaPlayerSupportedFormat> media_format_;
+  AudioPipelineState media_pipeline_state_{AudioPipelineState::STOPPED};
+  std::string media_url_{};         // only modified by control function
+  audio::AudioFile *media_file_{};  // only modified by play_file function
+  bool media_repeat_one_{false};
+  uint32_t media_playlist_delay_ms_{0};
+
+  optional<media_player::MediaPlayerSupportedFormat> announcement_format_;
+  AudioPipelineState announcement_pipeline_state_{AudioPipelineState::STOPPED};
+  std::string announcement_url_{};         // only modified by control function
+  audio::AudioFile *announcement_file_{};  // only modified by play_file function
+  bool announcement_repeat_one_{false};
+  uint32_t announcement_playlist_delay_ms_{0};
+  // FreeRTOS queue of MediaCallCommand items, created in setup() and drained by watch_media_commands_()
+  QueueHandle_t media_control_command_queue_;
+  // The front item is the one loop() starts; later items are queued behind it
+  std::deque<PlaylistItem> announcement_playlist_;
+  std::deque<PlaylistItem> media_playlist_;
+
+  size_t buffer_size_;
+
+  bool task_stack_in_psram_;
+
+  bool is_paused_{false};  // pause state requested for the media pipeline
+  bool is_muted_{false};
+
+  // The amount to change the volume on volume up/down commands
+  float volume_increment_;
+
+  float volume_max_;
+  float volume_min_;
+
+  // Used to save volume/mute state for restoration on reboot
+  ESPPreferenceObject pref_;
+
+  Trigger<> *mute_trigger_ = new Trigger<>();
+  Trigger<> *unmute_trigger_ = new Trigger<>();
+  Trigger<float> *volume_trigger_ = new Trigger<float>();
+  // Playback progress; all but decoded_playback_ms_ are written by the media speaker's audio output callback
+  uint32_t decoded_playback_ms_{0};
+  uint32_t playback_us_{0};
+  uint32_t playback_ms_{0};
+  uint32_t remainder_us_{0};
+  uint32_t pending_ms_{0};
+  uint32_t last_audio_write_timestamp_{0};
+};
+
+}  // namespace speaker
+}  // namespace esphome
+
+#endif
diff --git a/tests/components/media_player/common.yaml b/tests/components/media_player/common.yaml
index af0d5c7765..763bc231c0 100644
--- a/tests/components/media_player/common.yaml
+++ b/tests/components/media_player/common.yaml
@@ -21,6 +21,8 @@ media_player:
       - media_player.pause:
     on_play:
       - media_player.stop:
+      - media_player.stop:
+          announcement: true
     on_pause:
       - media_player.toggle:
       - wait_until:
diff --git a/tests/components/speaker/common-media_player.yaml b/tests/components/speaker/common-media_player.yaml
new file mode 100644
index 0000000000..edc9f670fc
--- /dev/null
+++ b/tests/components/speaker/common-media_player.yaml
@@ -0,0 +1,12 @@
+<<: !include common.yaml
+
+media_player:
+  - platform: speaker
+    id: speaker_media_player_id
+    announcement_pipeline:
+      speaker: speaker_id
+    buffer_size: 1000000
+    volume_increment: 0.02
+    volume_max: 0.95
+    volume_min: 0.0
+    task_stack_in_psram: true
diff --git a/tests/components/speaker/media_player.esp32-idf.yaml b/tests/components/speaker/media_player.esp32-idf.yaml
new file mode 100644
index 0000000000..4712e4bae8
--- /dev/null
+++ b/tests/components/speaker/media_player.esp32-idf.yaml
@@ -0,0 +1,9 @@
+substitutions:
+  scl_pin: GPIO16
+  sda_pin: GPIO17
+  i2s_bclk_pin: GPIO27
+  i2s_lrclk_pin: GPIO26
+  i2s_mclk_pin: GPIO25
+  i2s_dout_pin: GPIO23
+
+<<: !include common-media_player.yaml
diff --git a/tests/components/speaker/media_player.esp32-s3-idf.yaml b/tests/components/speaker/media_player.esp32-s3-idf.yaml
new file mode 100644
index 0000000000..b3eec04d23
--- /dev/null
+++ b/tests/components/speaker/media_player.esp32-s3-idf.yaml
@@ -0,0 +1,9 @@
+substitutions:
+  scl_pin: GPIO2
+  sda_pin: GPIO3
+  i2s_bclk_pin: GPIO4
+  i2s_lrclk_pin: GPIO5
+  i2s_mclk_pin: GPIO6
+  i2s_dout_pin: GPIO7
+
+<<: !include common-media_player.yaml