From 9431848bea57f31d068008c984bf34f19d5529d8 Mon Sep 17 00:00:00 2001
From: John Boiles <johnaboiles@gmail.com>
Date: Sat, 11 Jan 2025 03:02:38 -0800
Subject: [PATCH] SPDIF Speaker support

---
 CODEOWNERS                                    |   2 +
 esphome/components/spdif_audio/__init__.py    |   7 +
 .../spdif_audio/speaker/__init__.py           |  61 ++
 .../components/spdif_audio/speaker/spdif.cpp  |  90 +++
 .../components/spdif_audio/speaker/spdif.h    |  52 ++
 .../spdif_audio/speaker/spdif_speaker.cpp     | 540 ++++++++++++++++++
 .../spdif_audio/speaker/spdif_speaker.h       | 132 +++++
 7 files changed, 884 insertions(+)
 create mode 100644 esphome/components/spdif_audio/__init__.py
 create mode 100644 esphome/components/spdif_audio/speaker/__init__.py
 create mode 100644 esphome/components/spdif_audio/speaker/spdif.cpp
 create mode 100644 esphome/components/spdif_audio/speaker/spdif.h
 create mode 100644 esphome/components/spdif_audio/speaker/spdif_speaker.cpp
 create mode 100644 esphome/components/spdif_audio/speaker/spdif_speaker.h

diff --git a/CODEOWNERS b/CODEOWNERS
index 404ad35efc..609f3ddd03 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -382,6 +382,8 @@ esphome/components/smt100/* @piechade
 esphome/components/sn74hc165/* @jesserockz
 esphome/components/socket/* @esphome/core
 esphome/components/sonoff_d1/* @anatoly-savchenkov
+esphome/components/spdif_audio/* @johnboiles
+esphome/components/spdif_audio/speaker/* @johnboiles
 esphome/components/speaker/* @jesserockz @kahrendt
 esphome/components/spi/* @clydebarrow @esphome/core
 esphome/components/spi_device/* @clydebarrow
diff --git a/esphome/components/spdif_audio/__init__.py b/esphome/components/spdif_audio/__init__.py
new file mode 100644
index 0000000000..e03a07528e
--- /dev/null
+++ b/esphome/components/spdif_audio/__init__.py
@@ -0,0 +1,7 @@
+import esphome.codegen as cg
+
+CODEOWNERS = ["@johnboiles"]
+DEPENDENCIES = ["esp32"]
+MULTI_CONF = True
+
+spdif_audio_ns = cg.esphome_ns.namespace("spdif_audio")  # imported by the speaker platform
diff --git a/esphome/components/spdif_audio/speaker/__init__.py b/esphome/components/spdif_audio/speaker/__init__.py
new file mode 100644
index 0000000000..397911ab85
--- /dev/null
+++ b/esphome/components/spdif_audio/speaker/__init__.py
@@ -0,0 +1,61 @@
+from esphome import pins
+import esphome.codegen as cg
+from esphome.components import speaker
+from esphome.components.i2s_audio import I2SAudioComponent
+import esphome.config_validation as cv
+from esphome.const import (
+    CONF_DATA_PIN,
+    CONF_DEBUG,
+    CONF_ID,
+    CONF_SAMPLE_RATE,
+    CONF_TIMEOUT,
+)
+
+from .. import spdif_audio_ns
+
+DEPENDENCIES = ["i2s_audio"]
+CODEOWNERS = ["@johnboiles"]
+
+SPDIFSpeaker = spdif_audio_ns.class_(
+    "SPDIFSpeaker", cg.Component, speaker.Speaker, I2SAudioComponent
+)
+
+CONF_BUFFER_DURATION = "buffer_duration"
+CONF_NEVER = "never"  # `timeout` value that leaves the C++ timeout unset
+
+CONF_I2S_AUDIO_ID = "i2s_audio_id"
+
+CONF_FILL_SILENCE = "fill_silence"  # emitted as the SPDIF_FILL_SILENCE define
+
+CONFIG_SCHEMA = speaker.SPEAKER_SCHEMA.extend(
+    {
+        cv.GenerateID(): cv.declare_id(SPDIFSpeaker),
+        cv.GenerateID(CONF_I2S_AUDIO_ID): cv.use_id(I2SAudioComponent),
+        cv.Required(CONF_DATA_PIN): pins.internal_gpio_output_pin_number,
+        cv.Optional(CONF_SAMPLE_RATE, default=48000): cv.positive_int,
+        cv.Optional(
+            CONF_BUFFER_DURATION, default="500ms"
+        ): cv.positive_time_period_milliseconds,
+        cv.Optional(CONF_TIMEOUT, default="500ms"): cv.Any(
+            cv.positive_time_period_milliseconds,
+            cv.one_of(CONF_NEVER, lower=True),
+        ),
+        cv.Optional(CONF_FILL_SILENCE, default=True): cv.boolean,
+        cv.Optional(CONF_DEBUG, default=False): cv.boolean,  # -> SPDIF_DEBUG
+    }
+).extend(cv.COMPONENT_SCHEMA)
+
+
+async def to_code(config):
+    """Generate the C++ setup code for one SPDIF speaker entry."""
+    spk = cg.new_Pvariable(config[CONF_ID])
+    await cg.register_component(spk, config)
+    await cg.register_parented(spk, config[CONF_I2S_AUDIO_ID])
+    await speaker.register_speaker(spk, config)
+    cg.add(spk.set_data_pin(config[CONF_DATA_PIN]))
+    cg.add(spk.set_sample_rate(config[CONF_SAMPLE_RATE]))
+    cg.add(spk.set_buffer_duration(config[CONF_BUFFER_DURATION]))
+    if config[CONF_TIMEOUT] != CONF_NEVER:  # "never" skips the setter entirely
+        cg.add(spk.set_timeout(config[CONF_TIMEOUT]))
+    cg.add_define("SPDIF_FILL_SILENCE", config[CONF_FILL_SILENCE])
+    cg.add_define("SPDIF_DEBUG", config[CONF_DEBUG])
diff --git a/esphome/components/spdif_audio/speaker/spdif.cpp b/esphome/components/spdif_audio/speaker/spdif.cpp
new file mode 100644
index 0000000000..dfde370085
--- /dev/null
+++ b/esphome/components/spdif_audio/speaker/spdif.cpp
@@ -0,0 +1,90 @@
+#include "spdif.h"
+
+#include "esphome/core/defines.h"
+#include "esphome/core/log.h"
+
+#include <esp_timer.h>
+namespace esphome {
+namespace spdif_audio {
+
+static const char *const TAG = "spdif";
+
+/*
+ * 8bit PCM to 16bit BMC conversion table, LSb first, 1 end
+ */
+static const uint16_t BMC_TABLE[256] = {
+    0x3333, 0xb333, 0xd333, 0x5333, 0xcb33, 0x4b33, 0x2b33, 0xab33, 0xcd33, 0x4d33, 0x2d33, 0xad33, 0x3533, 0xb533,
+    0xd533, 0x5533, 0xccb3, 0x4cb3, 0x2cb3, 0xacb3, 0x34b3, 0xb4b3, 0xd4b3, 0x54b3, 0x32b3, 0xb2b3, 0xd2b3, 0x52b3,
+    0xcab3, 0x4ab3, 0x2ab3, 0xaab3, 0xccd3, 0x4cd3, 0x2cd3, 0xacd3, 0x34d3, 0xb4d3, 0xd4d3, 0x54d3, 0x32d3, 0xb2d3,
+    0xd2d3, 0x52d3, 0xcad3, 0x4ad3, 0x2ad3, 0xaad3, 0x3353, 0xb353, 0xd353, 0x5353, 0xcb53, 0x4b53, 0x2b53, 0xab53,
+    0xcd53, 0x4d53, 0x2d53, 0xad53, 0x3553, 0xb553, 0xd553, 0x5553, 0xcccb, 0x4ccb, 0x2ccb, 0xaccb, 0x34cb, 0xb4cb,
+    0xd4cb, 0x54cb, 0x32cb, 0xb2cb, 0xd2cb, 0x52cb, 0xcacb, 0x4acb, 0x2acb, 0xaacb, 0x334b, 0xb34b, 0xd34b, 0x534b,
+    0xcb4b, 0x4b4b, 0x2b4b, 0xab4b, 0xcd4b, 0x4d4b, 0x2d4b, 0xad4b, 0x354b, 0xb54b, 0xd54b, 0x554b, 0x332b, 0xb32b,
+    0xd32b, 0x532b, 0xcb2b, 0x4b2b, 0x2b2b, 0xab2b, 0xcd2b, 0x4d2b, 0x2d2b, 0xad2b, 0x352b, 0xb52b, 0xd52b, 0x552b,
+    0xccab, 0x4cab, 0x2cab, 0xacab, 0x34ab, 0xb4ab, 0xd4ab, 0x54ab, 0x32ab, 0xb2ab, 0xd2ab, 0x52ab, 0xcaab, 0x4aab,
+    0x2aab, 0xaaab, 0xcccd, 0x4ccd, 0x2ccd, 0xaccd, 0x34cd, 0xb4cd, 0xd4cd, 0x54cd, 0x32cd, 0xb2cd, 0xd2cd, 0x52cd,
+    0xcacd, 0x4acd, 0x2acd, 0xaacd, 0x334d, 0xb34d, 0xd34d, 0x534d, 0xcb4d, 0x4b4d, 0x2b4d, 0xab4d, 0xcd4d, 0x4d4d,
+    0x2d4d, 0xad4d, 0x354d, 0xb54d, 0xd54d, 0x554d, 0x332d, 0xb32d, 0xd32d, 0x532d, 0xcb2d, 0x4b2d, 0x2b2d, 0xab2d,
+    0xcd2d, 0x4d2d, 0x2d2d, 0xad2d, 0x352d, 0xb52d, 0xd52d, 0x552d, 0xccad, 0x4cad, 0x2cad, 0xacad, 0x34ad, 0xb4ad,
+    0xd4ad, 0x54ad, 0x32ad, 0xb2ad, 0xd2ad, 0x52ad, 0xcaad, 0x4aad, 0x2aad, 0xaaad, 0x3335, 0xb335, 0xd335, 0x5335,
+    0xcb35, 0x4b35, 0x2b35, 0xab35, 0xcd35, 0x4d35, 0x2d35, 0xad35, 0x3535, 0xb535, 0xd535, 0x5535, 0xccb5, 0x4cb5,
+    0x2cb5, 0xacb5, 0x34b5, 0xb4b5, 0xd4b5, 0x54b5, 0x32b5, 0xb2b5, 0xd2b5, 0x52b5, 0xcab5, 0x4ab5, 0x2ab5, 0xaab5,
+    0xccd5, 0x4cd5, 0x2cd5, 0xacd5, 0x34d5, 0xb4d5, 0xd4d5, 0x54d5, 0x32d5, 0xb2d5, 0xd2d5, 0x52d5, 0xcad5, 0x4ad5,
+    0x2ad5, 0xaad5, 0x3355, 0xb355, 0xd355, 0x5355, 0xcb55, 0x4b55, 0x2b55, 0xab55, 0xcd55, 0x4d55, 0x2d55, 0xad55,
+    0x3555, 0xb555, 0xd555, 0x5555,
+};
+
+// BMC preambles (32-bit words). BMC_MW_DIF toggles M <-> W; SYNC_FLIP turns M into B within byte SYNC_OFFSET
+static const uint32_t BMC_B = 0x33173333;  // block start
+static const uint32_t BMC_M = 0x331d3333;  // left ch
+static const uint32_t BMC_W = 0x331b3333;  // right ch
+static const uint32_t BMC_MW_DIF = (BMC_M ^ BMC_W);
+static const uint8_t SYNC_OFFSET = 2;  // byte offset of SYNC
+static const uint32_t SYNC_FLIP = ((BMC_B ^ BMC_M) >> (SYNC_OFFSET * 8));
+
+// initialize S/PDIF buffer
+void SPDIF::setup() {
+  ESP_LOGCONFIG(TAG, "Setting up SPDIF...");
+
+  // Pre-fill every even word with a subframe preamble, alternating M (left) and W (right).
+  // The odd words carry the encoded audio and are filled in by write().
+  uint32_t bmc_mw = BMC_W;
+  for (size_t i = 0; i < SPDIF_BLOCK_SIZE_U32; i += 2) {
+    spdif_block_buf_[i] = bmc_mw ^= BMC_MW_DIF;
+  }
+  ESP_LOGD(TAG, "SPDIF buffer initialized to %zu bytes", sizeof(spdif_block_buf_));
+
+  spdif_block_ptr_ = spdif_block_buf_;
+}
+
+esp_err_t SPDIF::write(const uint8_t *src, size_t size, TickType_t ticks_to_wait) {
+  // Each encoded word consumes two source bytes, so an odd trailing byte is ignored instead of reading past src.
+  const uint8_t *const end = src + (size & ~static_cast<size_t>(1));
+  for (const uint8_t *p = src; p < end; p += 2) {
+    // Convert one 16-bit PCM sample into a 32-bit BMC pulse pattern. The lower half is deliberately sign-extended:
+    // when its top bit is set the XOR inverts the upper half. Bit 31 of the result is then cleared.
+    const uint32_t upper = static_cast<uint32_t>(BMC_TABLE[p[0]]) << 16;
+    const uint32_t lower = static_cast<uint32_t>(static_cast<int32_t>(static_cast<int16_t>(BMC_TABLE[p[1]])));
+    spdif_block_ptr_[1] = ((upper ^ lower) << 1) >> 1;
+
+    spdif_block_ptr_ += 2;  // advance to next audio data
+
+    if (spdif_block_ptr_ >= &spdif_block_buf_[SPDIF_BLOCK_SIZE_U32]) {
+      // Mark the block start explicitly. The buffer holds a whole 192-frame block, so toggling the preamble
+      // on every flush (M -> B -> M ...) would put the B preamble on only every second block.
+      spdif_block_buf_[0] = BMC_B;
+
+      // Rewind before flushing so that a failed flush cannot leave the pointer past the end of the buffer
+      spdif_block_ptr_ = spdif_block_buf_;
+      esp_err_t err = block_complete_callback_(spdif_block_buf_, sizeof(spdif_block_buf_), ticks_to_wait);
+      if (err != ESP_OK) {
+        return err;
+      }
+    }
+  }
+
+  return ESP_OK;
+}
+
+}  // namespace spdif_audio
+}  // namespace esphome
diff --git a/esphome/components/spdif_audio/speaker/spdif.h b/esphome/components/spdif_audio/speaker/spdif.h
new file mode 100644
index 0000000000..38b1786cbe
--- /dev/null
+++ b/esphome/components/spdif_audio/speaker/spdif.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <freertos/FreeRTOS.h>
+
+// Number of samples in a SPDIF block
+static const uint16_t SPDIF_BLOCK_SAMPLES = 192;
+// A SPDIF sample is 64-bit
+static const uint8_t SPDIF_BITS_PER_SAMPLE = 64;
+// To emulate bi-phase mark code (BMC) (aka differential Manchester encoding) we send
+// twice as many bits per sample so that we can generate the transitions this encoding requires.
+static const uint8_t EMULATED_BMC_BITS_PER_SAMPLE = SPDIF_BITS_PER_SAMPLE * 2;
+#define SPDIF_BLOCK_SIZE_BYTES (SPDIF_BLOCK_SAMPLES * (EMULATED_BMC_BITS_PER_SAMPLE / 8))
+#define SPDIF_BLOCK_SIZE_U32 (SPDIF_BLOCK_SIZE_BYTES / sizeof(uint32_t))  // One block: 3072 bytes = 768 words
+
+namespace esphome {
+namespace spdif_audio {
+
+class SPDIF {
+ public:
+  /// @brief Fill the working buffer with the alternating subframe preambles and rewind the write position
+  void setup();
+
+  /// @brief Function to call when a block of data is complete (called from write)
+  void set_block_complete_callback(
+      std::function<esp_err_t(uint32_t *data, size_t size, TickType_t ticks_to_wait)> callback) {
+    block_complete_callback_ = std::move(callback);
+  }
+
+  /// @brief Convert PCM audio data to SPDIF BMC encoded data
+  /// @param src Source PCM audio data
+  /// @param size Size of source data in bytes
+  /// @return esp_err_t as returned from block_complete_callback_ (which receives ticks_to_wait unchanged)
+  esp_err_t write(const uint8_t *src, size_t size, TickType_t ticks_to_wait);
+
+  /// @brief Rewind the write position to the start of the SPDIF block buffer (buffer contents are kept)
+  void reset() { spdif_block_ptr_ = spdif_block_buf_; }
+
+ protected:
+  // 8-bit -> 16-bit BMC lookup table. NOTE(review): never defined; spdif.cpp uses a file-local BMC_TABLE instead
+  static const uint16_t BMC_LOOKUP_TABLE[256];
+
+  std::function<esp_err_t(uint32_t *data, size_t size, TickType_t ticks_to_wait)> block_complete_callback_;
+
+  // Working buffer that holds an entire SPDIF block ready for I2S output
+  uint32_t spdif_block_buf_[SPDIF_BLOCK_SIZE_U32];
+  uint32_t *spdif_block_ptr_{nullptr};  // next preamble/data word pair that write() will fill
+};
+
+}  // namespace spdif_audio
+}  // namespace esphome
diff --git a/esphome/components/spdif_audio/speaker/spdif_speaker.cpp b/esphome/components/spdif_audio/speaker/spdif_speaker.cpp
new file mode 100644
index 0000000000..5a31005436
--- /dev/null
+++ b/esphome/components/spdif_audio/speaker/spdif_speaker.cpp
@@ -0,0 +1,540 @@
+#include "spdif_speaker.h"
+
+#ifdef USE_ESP32
+
+#include <driver/i2s.h>
+#include <esp_timer.h>
+
+#include "esphome/components/audio/audio.h"
+
+#include "esphome/core/application.h"
+#include "esphome/core/hal.h"
+#include "esphome/core/log.h"
+
+namespace esphome {
+namespace spdif_audio {
+
+static const size_t DMA_BUFFERS_COUNT = 4;
+
+static const size_t TASK_STACK_SIZE = 4096;
+static const ssize_t TASK_PRIORITY = 23;
+
+static const size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT + 1;
+
+static const char *const TAG = "spdif_audio.speaker";
+
+#if SPDIF_FILL_SILENCE
+// A full BMC block's worth of 16-bit stereo samples
+int16_t silence[SPDIF_BLOCK_SAMPLES * 2];
+#endif
+
+enum SpeakerEventGroupBits : uint32_t {
+  COMMAND_START = (1 << 0),            // starts the speaker task
+  COMMAND_STOP = (1 << 1),             // stops the speaker task
+  COMMAND_STOP_GRACEFULLY = (1 << 2),  // Stops the speaker task once all data has been written
+  STATE_STARTING = (1 << 10),          // task received its start command and is initializing
+  STATE_RUNNING = (1 << 11),           // I2S driver started; task is processing audio
+  STATE_STOPPING = (1 << 12),          // task left its audio loop and is tearing down
+  STATE_STOPPED = (1 << 13),           // task released its buffers and is about to delete itself
+  ERR_TASK_FAILED_TO_START = (1 << 14),
+  ERR_ESP_INVALID_STATE = (1 << 15),
+  ERR_ESP_NOT_SUPPORTED = (1 << 16),
+  ERR_ESP_INVALID_ARG = (1 << 17),
+  ERR_ESP_INVALID_SIZE = (1 << 18),
+  ERR_ESP_NO_MEM = (1 << 19),
+  ERR_ESP_FAIL = (1 << 20),
+  ALL_ERR_ESP_BITS = ERR_ESP_INVALID_STATE | ERR_ESP_NOT_SUPPORTED | ERR_ESP_INVALID_ARG | ERR_ESP_INVALID_SIZE |
+                     ERR_ESP_NO_MEM | ERR_ESP_FAIL,
+  ALL_BITS = 0x00FFFFFF,  // All valid FreeRTOS event group bits
+};
+
+// Translates a SpeakerEventGroupBits ERR_ESP bit to the corresponding esp_err_t
+static esp_err_t err_bit_to_esp_err(uint32_t bit) {
+  // Compare against each single-bit value in turn. Anything that is not exactly one of the
+  // recognised bits (including several bits at once) falls through to ESP_FAIL.
+  if (bit == SpeakerEventGroupBits::ERR_ESP_INVALID_STATE)
+    return ESP_ERR_INVALID_STATE;
+  if (bit == SpeakerEventGroupBits::ERR_ESP_INVALID_ARG)
+    return ESP_ERR_INVALID_ARG;
+  if (bit == SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE)
+    return ESP_ERR_INVALID_SIZE;
+  if (bit == SpeakerEventGroupBits::ERR_ESP_NO_MEM)
+    return ESP_ERR_NO_MEM;
+  if (bit == SpeakerEventGroupBits::ERR_ESP_NOT_SUPPORTED)
+    return ESP_ERR_NOT_SUPPORTED;
+
+  return ESP_FAIL;
+}
+
+/// @brief Multiplies the input array of Q15 numbers by a Q15 constant factor
+///
+/// Based on `dsps_mulc_s16_ansi` from the esp-dsp library:
+/// https://github.com/espressif/esp-dsp/blob/master/modules/math/mulc/fixed/dsps_mulc_s16_ansi.c
+/// (accessed on 2024-09-30).
+/// @param input Array of Q15 numbers
+/// @param output Array of Q15 numbers
+/// @param len Length of array
+/// @param c Q15 constant factor
+static void q15_multiplication(const int16_t *input, int16_t *output, size_t len, int16_t c) {
+  for (size_t i = 0; i < len; i++) {  // size_t index matches len, avoiding a signed/unsigned comparison
+    int32_t acc = (int32_t) input[i] * (int32_t) c;
+    output[i] = (int16_t) (acc >> 15);  // drop the 15 fractional bits of the Q30 product
+  }
+}
+
+// Lists the Q15 fixed point scaling factor for volume reduction.
+// Has 100 values representing silence and a reduction [49, 48.5, ... 0.5, 0] dB.
+// dB to PCM scaling factor formula: floating_point_scale_factor = 2^(-db/6.014)
+// float to Q15 fixed point formula: q15_scale_factor = floating_point_scale_factor * 2^(15)
+static const std::vector<int16_t> Q15_VOLUME_SCALING_FACTORS = {
+    0,     116,   122,   130,   137,   146,   154,   163,   173,   183,   194,   206,   218,   231,   244,
+    259,   274,   291,   308,   326,   345,   366,   388,   411,   435,   461,   488,   517,   548,   580,
+    615,   651,   690,   731,   774,   820,   868,   920,   974,   1032,  1094,  1158,  1227,  1300,  1377,
+    1459,  1545,  1637,  1734,  1837,  1946,  2061,  2184,  2313,  2450,  2596,  2750,  2913,  3085,  3269,
+    3462,  3668,  3885,  4116,  4360,  4619,  4893,  5183,  5490,  5816,  6161,  6527,  6914,  7324,  7758,
+    8218,  8706,  9222,  9770,  10349, 10963, 11613, 12302, 13032, 13805, 14624, 15491, 16410, 17384, 18415,
+    19508, 20665, 21891, 23189, 24565, 26022, 27566, 29201, 30933, 32767};
+
+void SPDIFSpeaker::setup() {
+  ESP_LOGCONFIG(TAG, "Setting up SPDIF Audio Speaker...");
+
+  this->event_group_ = xEventGroupCreate();  // carries commands to, and state/error bits from, the speaker task
+
+  if (this->event_group_ == nullptr) {
+    ESP_LOGE(TAG, "Failed to create event group");
+    this->mark_failed();
+    return;
+  }
+
+  this->spdif_->setup();  // each completed block is then handed to the lambda below
+  this->spdif_->set_block_complete_callback([this](uint32_t *data, size_t size, TickType_t ticks_to_wait) -> esp_err_t {
+    size_t i2s_write_len;  // out-parameter required by i2s_write(); the value is not inspected
+    return i2s_write(this->parent_->get_port(), data, size, &i2s_write_len, ticks_to_wait);
+  });
+
+#if SPDIF_FILL_SILENCE
+  memset(silence, 0, sizeof(silence));
+#endif
+}
+
+void SPDIFSpeaker::loop() {
+  uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);  // snapshot used by every check below
+
+  if (event_group_bits & SpeakerEventGroupBits::STATE_STARTING) {
+    ESP_LOGD(TAG, "Starting Speaker");
+    this->state_ = speaker::STATE_STARTING;
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_STARTING);
+  }
+  if (event_group_bits & SpeakerEventGroupBits::STATE_RUNNING) {
+    ESP_LOGD(TAG, "Started Speaker");
+    this->state_ = speaker::STATE_RUNNING;
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
+    this->status_clear_warning();
+    this->status_clear_error();
+  }
+  if (event_group_bits & SpeakerEventGroupBits::STATE_STOPPING) {
+    ESP_LOGD(TAG, "Stopping Speaker");
+    this->state_ = speaker::STATE_STOPPING;
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
+  }
+  if (event_group_bits & SpeakerEventGroupBits::STATE_STOPPED) {
+    if (!this->task_created_) {  // wait until delete_task_() has cleared the flag
+      ESP_LOGD(TAG, "Stopped Speaker");
+      this->state_ = speaker::STATE_STOPPED;
+      xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ALL_BITS);  // clears commands and errors too
+      this->speaker_task_handle_ = nullptr;
+    }
+  }
+
+  if (event_group_bits & SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START) {
+    this->status_set_error("Failed to start speaker task");
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);
+  }
+
+  if (event_group_bits & SpeakerEventGroupBits::ALL_ERR_ESP_BITS) {
+    uint32_t error_bits = event_group_bits & SpeakerEventGroupBits::ALL_ERR_ESP_BITS;
+    ESP_LOGW(TAG, "Error writing to I2S: %s", esp_err_to_name(err_bit_to_esp_err(error_bits)));
+    this->status_set_warning();
+  }
+
+  if (event_group_bits & SpeakerEventGroupBits::ERR_ESP_NOT_SUPPORTED) {
+    this->status_set_error("Failed to adjust I2S bus to match the incoming audio");
+    ESP_LOGE(TAG,
+             "Incompatible audio format: sample rate = %" PRIu32 ", channels = %" PRIu8 ", bits per sample = %" PRIu8,
+             this->audio_stream_info_.sample_rate, this->audio_stream_info_.channels,
+             this->audio_stream_info_.bits_per_sample);
+  }
+}
+
+void SPDIFSpeaker::set_volume(float volume) {
+  this->volume_ = volume;
+  const float level = std::max(0.0f, std::min(volume, 1.0f));  // clamp so the table index stays in range
+  ssize_t decibel_index = remap<ssize_t, float>(level, 0.0f, 1.0f, 0, Q15_VOLUME_SCALING_FACTORS.size() - 1);
+  this->q15_volume_factor_ = Q15_VOLUME_SCALING_FACTORS[decibel_index];  // software (Q15) volume scaling factor
+}
+
+void SPDIFSpeaker::set_mute_state(bool mute_state) {
+  this->mute_state_ = mute_state;
+  if (!mute_state) {
+    // Unmuting: recompute the scaling factor from the stored volume
+    this->set_volume(this->volume_);
+    return;
+  }
+  // Muted: software volume control scales every sample by 0
+  this->q15_volume_factor_ = 0;
+}
+
+size_t SPDIFSpeaker::play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) {
+  if (this->is_failed()) {
+    ESP_LOGE(TAG, "Cannot play audio, speaker failed to setup");
+    return 0;
+  }
+  const bool starting_or_running =
+      (this->state_ == speaker::STATE_RUNNING) || (this->state_ == speaker::STATE_STARTING);
+  if (!starting_or_running) {
+    this->start();
+  }
+
+  // Data is accepted only while running and while the ring buffer has exactly one owner (the speaker task),
+  // i.e. it is allocated and no other component is attempting to write to it.
+  if ((this->state_ != speaker::STATE_RUNNING) || (this->audio_ring_buffer_.use_count() != 1)) {
+    return 0;
+  }
+
+  // Temporarily share ownership of the ring buffer so it won't be deallocated while writing
+  std::shared_ptr<RingBuffer> ring_buffer = this->audio_ring_buffer_;
+  return ring_buffer->write_without_replacement((void *) data, length, ticks_to_wait);
+}
+
+bool SPDIFSpeaker::has_buffered_data() const {
+  if (this->audio_ring_buffer_ == nullptr) {
+    return false;  // no ring buffer allocated, so nothing can be pending
+  }
+  return this->audio_ring_buffer_->available() > 0;
+}
+
+void SPDIFSpeaker::speaker_task(void *params) {  // FreeRTOS task entry point; params is the owning SPDIFSpeaker
+  SPDIFSpeaker *this_speaker = (SPDIFSpeaker *) params;
+  uint32_t event_group_bits =
+      xEventGroupWaitBits(this_speaker->event_group_,
+                          SpeakerEventGroupBits::COMMAND_START | SpeakerEventGroupBits::COMMAND_STOP |
+                              SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY,  // Bit message to read
+                          pdTRUE,                                              // Clear the bits on exit
+                          pdFALSE,                                             // Don't wait for all the bits,
+                          portMAX_DELAY);                                      // Block indefinitely until a bit is set
+
+  if (event_group_bits & (SpeakerEventGroupBits::COMMAND_STOP | SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY)) {
+    // Received a stop signal before the task was requested to start
+    this_speaker->delete_task_(0);  // deletes this task and does not return
+  }
+
+  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_STARTING);
+
+  audio::AudioStreamInfo audio_stream_info = this_speaker->audio_stream_info_;
+
+  const uint32_t bytes_per_ms =
+      audio_stream_info.channels * audio_stream_info.get_bytes_per_sample() * audio_stream_info.sample_rate / 1000;
+
+  const size_t dma_buffers_size =
+      DMA_BUFFERS_COUNT * SPDIF_BLOCK_SAMPLES * audio_stream_info.channels * audio_stream_info.get_bytes_per_sample();
+
+  int task_delay_ms = bytes_per_ms * DMA_BUFFERS_COUNT / 2;  // NOTE(review): computed from bytes/ms - confirm units
+
+  // Ensure ring buffer is at least as large as the total size of the DMA buffers
+  const size_t ring_buffer_size =
+      std::max((uint32_t) dma_buffers_size, this_speaker->buffer_duration_ms_ * bytes_per_ms);
+
+  if (this_speaker->send_esp_err_to_event_group_(this_speaker->allocate_buffers_(dma_buffers_size, ring_buffer_size))) {
+    // Failed to allocate buffers
+    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
+    this_speaker->delete_task_(dma_buffers_size);  // deletes this task and does not return
+  }
+
+  if (!this_speaker->send_esp_err_to_event_group_(this_speaker->start_i2s_driver_(audio_stream_info))) {
+    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
+
+    bool stop_gracefully = false;
+    uint32_t last_data_received_time = millis();
+    bool tx_dma_underflow = false;
+
+    while (!this_speaker->timeout_.has_value() ||
+           (millis() - last_data_received_time) <= this_speaker->timeout_.value()) {
+      event_group_bits = xEventGroupGetBits(this_speaker->event_group_);
+
+      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
+        break;
+      }
+      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY) {
+        stop_gracefully = true;
+      }
+
+      if (this_speaker->audio_stream_info_ != audio_stream_info) {
+        // Audio stream info has changed, stop the speaker task so it will restart with the proper settings.
+
+        break;
+      }
+
+      i2s_event_t i2s_event;
+      while (xQueueReceive(this_speaker->i2s_event_queue_, &i2s_event, 0)) {
+        if (i2s_event.type == I2S_EVENT_TX_Q_OVF) {
+#if SPDIF_DEBUG
+          static int64_t last_overflow_log_time = 0;  // static: must persist across events for the rate limit to work
+          const int64_t min_log_interval_us = 1000000;
+          int64_t current_time = esp_timer_get_time();
+          if (current_time - last_overflow_log_time >= min_log_interval_us) {
+            ESP_LOGE(TAG, "I2S_EVENT_TX_Q_OVF");
+            last_overflow_log_time = current_time;
+          }
+#endif
+#if SPDIF_FILL_SILENCE
+          // Queue DMA a couple buffers full of silence when we don't have anything else to play
+          this_speaker->spdif_->reset();
+          this_speaker->spdif_->write(reinterpret_cast<uint8_t *>(silence), sizeof(silence), 0);
+          this_speaker->spdif_->write(reinterpret_cast<uint8_t *>(silence), sizeof(silence), 0);
+          // this_speaker->spdif_->write(reinterpret_cast<uint8_t *>(silence), sizeof(silence), 0);
+          // this_speaker->spdif_->write(reinterpret_cast<uint8_t *>(silence), sizeof(silence), 0);
+#endif
+          tx_dma_underflow = true;
+        }
+      }
+
+      size_t bytes_to_read = dma_buffers_size;
+
+      size_t bytes_read = this_speaker->audio_ring_buffer_->read((void *) this_speaker->data_buffer_, bytes_to_read,
+                                                                 pdMS_TO_TICKS(task_delay_ms));
+      if (bytes_read > 0) {
+        if ((audio_stream_info.bits_per_sample == 16) && (this_speaker->q15_volume_factor_ < INT16_MAX)) {
+          // Scale samples by the volume factor in place
+          q15_multiplication((int16_t *) this_speaker->data_buffer_, (int16_t *) this_speaker->data_buffer_,
+                             bytes_read / sizeof(int16_t), this_speaker->q15_volume_factor_);
+        }
+
+        this_speaker->spdif_->write(this_speaker->data_buffer_, bytes_read, portMAX_DELAY);
+
+#if SPDIF_DEBUG
+        static uint64_t total_bytes = 0;
+        static uint64_t last_log_time = 0;
+        static uint64_t last_log_bytes = 0;
+
+        total_bytes += bytes_read;
+        int64_t current_time = esp_timer_get_time();
+
+        if (last_log_time == 0) {
+          last_log_time = current_time;
+          last_log_bytes = total_bytes;
+        }
+
+        // Check if it's time to log sample statistics (every 5 seconds)
+        if (current_time - last_log_time >= 5000000) {
+          uint64_t elapsed_time = current_time - last_log_time;
+          uint64_t bytes_since_last_log = total_bytes - last_log_bytes;
+          // 4 bytes per 16-bit stereo sample
+          uint64_t samples = bytes_since_last_log / 4;
+          float seconds = elapsed_time / 1000000.0f;
+          float hz = samples / seconds;
+
+          ESP_LOGD(TAG, "%llu samples in %.2fs (%.2fHz)", samples, seconds, hz);
+
+          // Reset for next log
+          last_log_time = current_time;
+          last_log_bytes = total_bytes;
+        }
+#endif
+
+        tx_dma_underflow = false;
+        last_data_received_time = millis();
+      } else {
+        // No data received
+        if (stop_gracefully && tx_dma_underflow) {
+          break;
+        }
+      }
+    }
+
+    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
+
+    i2s_driver_uninstall(this_speaker->parent_->get_port());
+
+    this_speaker->parent_->unlock();
+  }
+
+  this_speaker->delete_task_(dma_buffers_size);
+}
+
+void SPDIFSpeaker::start() {
+  if (!this->is_ready() || this->is_failed() || this->status_has_error())
+    return;
+  if ((this->state_ == speaker::STATE_STARTING) || (this->state_ == speaker::STATE_RUNNING))
+    return;
+
+  if (this->speaker_task_handle_ == nullptr) {  // only create a task when none is currently tracked
+    xTaskCreate(SPDIFSpeaker::speaker_task, "speaker_task", TASK_STACK_SIZE, (void *) this, TASK_PRIORITY,
+                &this->speaker_task_handle_);
+  }
+
+  if (this->speaker_task_handle_ != nullptr) {
+    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);  // releases the waiting task
+    this->task_created_ = true;  // cleared again by delete_task_()
+  } else {
+    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);  // reported by loop()
+  }
+}
+
+void SPDIFSpeaker::stop() { this->stop_(false); }  // immediate stop: requests COMMAND_STOP
+
+void SPDIFSpeaker::finish() { this->stop_(true); }  // graceful stop: requests COMMAND_STOP_GRACEFULLY
+
+void SPDIFSpeaker::stop_(bool wait_on_empty) {
+  // Nothing to stop when setup failed or the speaker is already stopped
+  if (this->is_failed() || (this->state_ == speaker::STATE_STOPPED))
+    return;
+
+  // wait_on_empty selects the graceful command, which the task acts on only once it has run out of data;
+  // otherwise the immediate stop command is sent.
+  const uint32_t command =
+      wait_on_empty ? SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY : SpeakerEventGroupBits::COMMAND_STOP;
+
+  xEventGroupSetBits(this->event_group_, command);
+}
+
+bool SPDIFSpeaker::send_esp_err_to_event_group_(esp_err_t err) {
+  // ESP_OK is the only value that is not reported
+  if (err == ESP_OK) {
+    return false;
+  }
+
+  // Choose the event-group bit that mirrors the error code; unrecognised codes collapse into ERR_ESP_FAIL
+  uint32_t error_bit = SpeakerEventGroupBits::ERR_ESP_FAIL;
+  if (err == ESP_ERR_INVALID_STATE) {
+    error_bit = SpeakerEventGroupBits::ERR_ESP_INVALID_STATE;
+  } else if (err == ESP_ERR_INVALID_ARG) {
+    error_bit = SpeakerEventGroupBits::ERR_ESP_INVALID_ARG;
+  } else if (err == ESP_ERR_INVALID_SIZE) {
+    error_bit = SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE;
+  } else if (err == ESP_ERR_NO_MEM) {
+    error_bit = SpeakerEventGroupBits::ERR_ESP_NO_MEM;
+  } else if (err == ESP_ERR_NOT_SUPPORTED) {
+    error_bit = SpeakerEventGroupBits::ERR_ESP_NOT_SUPPORTED;
+  }
+
+  // Publish the bit so loop() can report it, and tell the caller that an error occurred
+  xEventGroupSetBits(this->event_group_, error_bit);
+  return true;
+}
+
+esp_err_t SPDIFSpeaker::allocate_buffers_(size_t data_buffer_size, size_t ring_buffer_size) {
+  // Data buffer: temporarily stores audio from the ring buffer before it is written to the I2S bus.
+  // A buffer that already exists is kept as it is.
+  if (this->data_buffer_ == nullptr) {
+    ExternalRAMAllocator<uint8_t> allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
+    this->data_buffer_ = allocator.allocate(data_buffer_size);
+    if (this->data_buffer_ == nullptr) {
+      ESP_LOGE(TAG, "Failed to allocate data_buffer_");
+      return ESP_ERR_NO_MEM;
+    }
+  }
+
+  // Ring buffer: uses a shared_ptr to ensure it isn't improperly deallocated.
+  // It is only created when nothing owns one at the moment.
+  if (this->audio_ring_buffer_.use_count() == 0) {
+    this->audio_ring_buffer_ = RingBuffer::create(ring_buffer_size);
+    if (this->audio_ring_buffer_ == nullptr) {
+      ESP_LOGE(TAG, "Failed to allocate audio_ring_buffer_");
+      return ESP_ERR_NO_MEM;
+    }
+  }
+
+  return ESP_OK;
+}
+
+esp_err_t SPDIFSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_stream_info) {
+  if (this->sample_rate_ != audio_stream_info.sample_rate) {  // NOLINT
+    //  Can't reconfigure I2S bus, so the sample rate must match the configured value
+    ESP_LOGE(TAG, "SPDIF only supports a single sample rate");
+    return ESP_ERR_NOT_SUPPORTED;
+  }
+
+  // Currently only 16-bit samples are supported
+  if (audio_stream_info.bits_per_sample != 16) {
+    ESP_LOGE(TAG, "SPDIF only supports 16 bits per sample");
+    return ESP_ERR_NOT_SUPPORTED;
+  }
+
+  // Currently only stereo is supported
+  if (audio_stream_info.channels != 2) {
+    ESP_LOGE(TAG, "SPDIF only supports stereo");
+    return ESP_ERR_NOT_SUPPORTED;
+  }
+
+  if (!this->parent_->try_lock()) {  // on success the lock is released on each failure path below
+    return ESP_ERR_INVALID_STATE;
+  }
+
+  constexpr uint32_t i2s_bits_per_sample = 32;
+  uint32_t sample_rate = this->sample_rate_ * 2;  // one 128-bit emulated BMC sample spans two 64-bit I2S frames
+  i2s_config_t config = {
+    .mode = static_cast<i2s_mode_t>(I2S_MODE_MASTER | I2S_MODE_TX),
+    .sample_rate = sample_rate,
+    .bits_per_sample = static_cast<i2s_bits_per_sample_t>(i2s_bits_per_sample),
+    .channel_format = I2S_CHANNEL_FMT_RIGHT_LEFT,
+    .communication_format = I2S_COMM_FORMAT_STAND_I2S,
+    .intr_alloc_flags = 0,
+    .dma_buf_count = DMA_BUFFERS_COUNT,
+    .dma_buf_len = SPDIF_BLOCK_SIZE_U32,
+    .use_apll = true,
+#if SPDIF_FILL_SILENCE
+    .tx_desc_auto_clear = false,
+#else
+    .tx_desc_auto_clear = true,
+#endif
+    .fixed_mclk = 0,
+    .mclk_multiple = I2S_MCLK_MULTIPLE_256,
+    .bits_per_chan = I2S_BITS_PER_CHAN_DEFAULT,
+  };
+
+  esp_err_t err =
+      i2s_driver_install(this->parent_->get_port(), &config, I2S_EVENT_QUEUE_COUNT, &this->i2s_event_queue_);
+  if (err != ESP_OK) {
+    // Failed to install the driver, so unlock the I2S port
+    this->parent_->unlock();
+    return err;
+  }
+
+  i2s_pin_config_t pin_config = {  // only data_out is assigned; every other pin is passed as -1
+      .mck_io_num = -1,
+      .bck_io_num = -1,
+      .ws_io_num = -1,
+      .data_out_num = this->data_pin_,
+      .data_in_num = -1,
+  };
+
+  err = i2s_set_pin(this->parent_->get_port(), &pin_config);
+
+  if (err != ESP_OK) {
+    // Failed to set the data out pin, so uninstall the driver and unlock the I2S port
+    i2s_driver_uninstall(this->parent_->get_port());
+    this->parent_->unlock();
+  }
+
+  return err;  // ESP_OK here means the driver is installed and the parent is still locked
+}
+
+void SPDIFSpeaker::delete_task_(size_t buffer_size) {
+  this->audio_ring_buffer_.reset();  // Releases ownership of the shared_ptr
+
+  if (this->data_buffer_ != nullptr) {
+    ExternalRAMAllocator<uint8_t> allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
+    allocator.deallocate(this->data_buffer_, buffer_size);  // buffer_size is the size passed by the caller
+    this->data_buffer_ = nullptr;
+  }
+
+  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPED);
+
+  this->task_created_ = false;  // lets loop() complete the STATE_STOPPED transition
+  vTaskDelete(nullptr);         // nullptr selects the calling task, so nothing after this line runs
+}
+
+}  // namespace spdif_audio
+}  // namespace esphome
+
+#endif  // USE_ESP32
diff --git a/esphome/components/spdif_audio/speaker/spdif_speaker.h b/esphome/components/spdif_audio/speaker/spdif_speaker.h
new file mode 100644
index 0000000000..4cdad84822
--- /dev/null
+++ b/esphome/components/spdif_audio/speaker/spdif_speaker.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#ifdef USE_ESP32
+
+#include "esphome/components/i2s_audio/i2s_audio.h"
+
+#include <driver/i2s.h>
+
+#include <freertos/event_groups.h>
+#include <freertos/queue.h>
+#include <freertos/FreeRTOS.h>
+
+#include "spdif.h"
+#include "esphome/components/audio/audio.h"
+#include "esphome/components/speaker/speaker.h"
+
+#include "esphome/core/component.h"
+#include "esphome/core/gpio.h"
+#include "esphome/core/helpers.h"
+#include "esphome/core/ring_buffer.h"
+
+namespace esphome {
+namespace spdif_audio {
+
+class SPDIFSpeaker : public Parented<esphome::i2s_audio::I2SAudioComponent>, public speaker::Speaker, public Component {
+ public:
+  SPDIFSpeaker() : spdif_(new SPDIF()) {}  // This class declares no destructor, so the SPDIF helper is never freed
+  float get_setup_priority() const override { return esphome::setup_priority::PROCESSOR; }
+
+  void setup() override;
+  void loop() override;
+
+  void set_buffer_duration(uint32_t buffer_duration_ms) { this->buffer_duration_ms_ = buffer_duration_ms; }
+  void set_timeout(uint32_t ms) { this->timeout_ = ms; }
+  void set_data_pin(uint8_t pin) { this->data_pin_ = pin; }
+  void set_sample_rate(uint32_t rate) { this->sample_rate_ = rate; }
+
+  void start() override;
+  void stop() override;
+  void finish() override;
+
+  /// @brief Plays the provided audio data.
+  /// Starts the speaker task, if necessary. Writes the audio data to the ring buffer.
+  /// @param data Audio data in the format set by the parent speaker classes ``set_audio_stream_info`` method.
+  /// @param length The length of the audio data in bytes.
+  /// @param ticks_to_wait The FreeRTOS ticks to wait before writing as much data as possible to the ring buffer.
+  /// @return The number of bytes that were actually written to the ring buffer.
+  size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) override;
+  size_t play(const uint8_t *data, size_t length) override { return play(data, length, 0); }
+
+  bool has_buffered_data() const override;
+
+  /// @brief Sets the volume of the speaker. Uses the speaker's configured audio dac component. If unavailable, it is
+  /// implemented as a software volume control. Overrides the default setter to convert the floating point volume to a
+  /// Q15 fixed-point factor.
+  /// @param volume between 0.0 and 1.0
+  void set_volume(float volume) override;
+
+  /// @brief Mutes or unmutes the speaker. Uses the speaker's configured audio dac component. If unavailable, it is
+  /// implemented as a software volume control. Overrides the default setter to convert the floating point volume to a
+  /// Q15 fixed-point factor.
+  /// @param mute_state true for muting, false for unmuting
+  void set_mute_state(bool mute_state) override;
+
+ protected:
+  /// @brief Function for the FreeRTOS task handling audio output.
+  /// After receiving the COMMAND_START signal, allocates space for the buffers, starts the I2S driver, and reads
+  /// audio from the ring buffer and writes audio to the I2S port. Stops immediately after receiving the COMMAND_STOP
+  /// signal and stops only after the ring buffer is empty after receiving the COMMAND_STOP_GRACEFULLY signal. Stops if
+  /// the ring buffer hasn't read data for more than timeout_ milliseconds. When stopping, it deallocates the buffers,
+  /// stops the I2S driver, unlocks the I2S port, and deletes the task. It communicates the state and any errors via
+  /// event_group_.
+  /// @param params SPDIFSpeaker component, passed as the opaque task parameter
+  static void speaker_task(void *params);
+
+  /// @brief Sends a stop command to the speaker task via event_group_.
+  /// @param wait_on_empty If false, sends the COMMAND_STOP signal. If true, sends the COMMAND_STOP_GRACEFULLY signal.
+  void stop_(bool wait_on_empty);
+
+  /// @brief Sets the corresponding ERR_ESP event group bits.
+  /// @param err esp_err_t error code.
+  /// @return True if an ERR_ESP bit is set and false if err == ESP_OK
+  bool send_esp_err_to_event_group_(esp_err_t err);
+
+  /// @brief Allocates the data buffer and ring buffer
+  /// @param data_buffer_size Number of bytes to allocate for the data buffer.
+  /// @param ring_buffer_size Number of bytes to allocate for the ring buffer.
+  /// @return ESP_ERR_NO_MEM if either buffer fails to allocate
+  ///         ESP_OK if successful
+  esp_err_t allocate_buffers_(size_t data_buffer_size, size_t ring_buffer_size);
+
+  /// @brief Starts the ESP32 I2S driver.
+  /// Attempts to lock the I2S port, starts the I2S driver using the passed in stream information, and sets the data out
+  /// pin. If it fails, it will unlock the I2S port and uninstall the driver, if necessary.
+  /// @param audio_stream_info Stream information for the I2S driver.
+  /// @return ESP_ERR_NOT_ALLOWED if the I2S port can't play the incoming audio stream.
+  ///         ESP_ERR_INVALID_STATE if the I2S port is already locked.
+  ///         ESP_ERR_INVALID_ARG if installing the driver or setting the data out pin fails due to a parameter error.
+  ///         ESP_ERR_NO_MEM if the driver fails to install due to a memory allocation error.
+  ///         ESP_FAIL if setting the data out pin fails due to an IO error, or ESP_OK if successful.
+  esp_err_t start_i2s_driver_(audio::AudioStreamInfo &audio_stream_info);
+
+  /// @brief Deletes the speaker's task.
+  /// Deallocates the data_buffer_ and audio_ring_buffer_, if necessary, and deletes the task. Should only be called by
+  /// the speaker_task itself.
+  /// @param buffer_size The allocated size of the data_buffer_.
+  void delete_task_(size_t buffer_size);
+
+  SPDIF *spdif_{nullptr};
+  TaskHandle_t speaker_task_handle_{nullptr};
+  EventGroupHandle_t event_group_{nullptr};
+
+  QueueHandle_t i2s_event_queue_{nullptr};  // Null until i2s_driver_install() stores the driver's event queue here
+
+  uint8_t *data_buffer_{nullptr};
+  std::shared_ptr<RingBuffer> audio_ring_buffer_;
+
+  uint32_t buffer_duration_ms_{0};  // Zero until set_buffer_duration() is called
+
+  optional<uint32_t> timeout_;
+  uint8_t data_pin_{0};      // Zero until set_data_pin() is called
+  uint32_t sample_rate_{0};  // Zero until set_sample_rate() is called
+
+  bool task_created_{false};
+
+  int16_t q15_volume_factor_{INT16_MAX};
+};
+
+}  // namespace spdif_audio
+}  // namespace esphome
+
+#endif  // USE_ESP32