[i2s_audio] Speaker improvements: CPU core agnostic and more accurate timestamps (#9800)

Co-authored-by: NP v/d Spek <github_mail@lumensoft.nl>
2025-10-31 07:03:55 +00:00 · 2025-07-24 04:14:00 +01:00
parent 108e447072
commit 6398bb2fdf
3 changed files with 302 additions and 307 deletions
--- a/esphome/components/i2s_audio/init.py
+++ b/esphome/components/i2s_audio/init.py
@@ -1,6 +1,6 @@
 from esphome import pins
 import esphome.codegen as cg
-from esphome.components.esp32 import get_esp32_variant
+from esphome.components.esp32 import add_idf_sdkconfig_option, get_esp32_variant
 from esphome.components.esp32.const import (
    VARIANT_ESP32,
    VARIANT_ESP32C3,
@@ -258,6 +258,10 @@ async def to_code(config):
    if use_legacy():
        cg.add_define("USE_I2S_LEGACY")
    # Helps avoid callbacks being skipped due to processor load
    if CORE.using_esp_idf:
        add_idf_sdkconfig_option("CONFIG_I2S_ISR_IRAM_SAFE", True)
    cg.add(var.set_lrclk_pin(config[CONF_I2S_LRCLK_PIN]))
    if CONF_I2S_BCLK_PIN in config:
        cg.add(var.set_bclk_pin(config[CONF_I2S_BCLK_PIN]))
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
@@ -9,6 +9,7 @@
 #endif
 #include "esphome/components/audio/audio.h"
 #include "esphome/components/audio/audio_transfer_buffer.h"
 #include "esphome/core/application.h"
 #include "esphome/core/hal.h"
@@ -19,72 +20,33 @@
 namespace esphome {
 namespace i2s_audio {
-static const uint8_t DMA_BUFFER_DURATION_MS = 15;
+static const uint32_t DMA_BUFFER_DURATION_MS = 15;
 static const size_t DMA_BUFFERS_COUNT = 4;
 static const size_t TASK_DELAY_MS = DMA_BUFFER_DURATION_MS * DMA_BUFFERS_COUNT / 2;
 static const size_t TASK_STACK_SIZE = 4096;
-static const ssize_t TASK_PRIORITY = 23;
+static const ssize_t TASK_PRIORITY = 19;
 static const size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT + 1;
 static const char *const TAG = "i2s_audio.speaker";
 enum SpeakerEventGroupBits : uint32_t {
-  COMMAND_START = (1 << 0),            // starts the speaker task
+  COMMAND_START = (1 << 0),            // indicates loop should start speaker task
  COMMAND_STOP = (1 << 1),             // stops the speaker task
  COMMAND_STOP_GRACEFULLY = (1 << 2),  // Stops the speaker task once all data has been written
-  STATE_STARTING = (1 << 10),
+
-  STATE_RUNNING = (1 << 11),
+  TASK_STARTING = (1 << 10),
-  STATE_STOPPING = (1 << 12),
+  TASK_RUNNING = (1 << 11),
-  STATE_STOPPED = (1 << 13),
+  TASK_STOPPING = (1 << 12),
-  ERR_TASK_FAILED_TO_START = (1 << 14),
+  TASK_STOPPED = (1 << 13),
-  ERR_ESP_INVALID_STATE = (1 << 15),
+
  ERR_ESP_NOT_SUPPORTED = (1 << 16),
  ERR_ESP_INVALID_ARG = (1 << 17),
  ERR_ESP_INVALID_SIZE = (1 << 18),
  ERR_ESP_NO_MEM = (1 << 19),
-  ERR_ESP_FAIL = (1 << 20),
+
-  ALL_ERR_ESP_BITS = ERR_ESP_INVALID_STATE | ERR_ESP_NOT_SUPPORTED | ERR_ESP_INVALID_ARG | ERR_ESP_INVALID_SIZE |
+  WARN_DROPPED_EVENT = (1 << 20),
-                     ERR_ESP_NO_MEM | ERR_ESP_FAIL,
+
  ALL_BITS = 0x00FFFFFF,  // All valid FreeRTOS event group bits
 };
 // Translates a SpeakerEventGroupBits ERR_ESP bit to the coressponding esp_err_t
 static esp_err_t err_bit_to_esp_err(uint32_t bit) {
  switch (bit) {
    case SpeakerEventGroupBits::ERR_ESP_INVALID_STATE:
      return ESP_ERR_INVALID_STATE;
    case SpeakerEventGroupBits::ERR_ESP_INVALID_ARG:
      return ESP_ERR_INVALID_ARG;
    case SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE:
      return ESP_ERR_INVALID_SIZE;
    case SpeakerEventGroupBits::ERR_ESP_NO_MEM:
      return ESP_ERR_NO_MEM;
    case SpeakerEventGroupBits::ERR_ESP_NOT_SUPPORTED:
      return ESP_ERR_NOT_SUPPORTED;
    default:
      return ESP_FAIL;
  }
 }
 /// @brief Multiplies the input array of Q15 numbers by a Q15 constant factor
 ///
 /// Based on `dsps_mulc_s16_ansi` from the esp-dsp library:
 /// https://github.com/espressif/esp-dsp/blob/master/modules/math/mulc/fixed/dsps_mulc_s16_ansi.c
 /// (accessed on 2024-09-30).
 /// @param input Array of Q15 numbers
 /// @param output Array of Q15 numbers
 /// @param len Length of array
 /// @param c Q15 constant factor
 static void q15_multiplication(const int16_t *input, int16_t *output, size_t len, int16_t c) {
  for (int i = 0; i < len; i++) {
    int32_t acc = (int32_t) input[i] * (int32_t) c;
    output[i] = (int16_t) (acc >> 15);
  }
 }
 // Lists the Q15 fixed point scaling factor for volume reduction.
 // Has 100 values representing silence and a reduction [49, 48.5, ... 0.5, 0] dB.
 // dB to PCM scaling factor formula: floating_point_scale_factor = 2^(-db/6.014)
@@ -132,51 +94,80 @@ void I2SAudioSpeaker::dump_config() {
 void I2SAudioSpeaker::loop() {
  uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);
-  if (event_group_bits & SpeakerEventGroupBits::STATE_STARTING) {
+  if ((event_group_bits & SpeakerEventGroupBits::COMMAND_START) && (this->state_ == speaker::STATE_STOPPED)) {
    ESP_LOGD(TAG, "Starting");
    this->state_ = speaker::STATE_STARTING;
-    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_STARTING);
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);
  }
-  if (event_group_bits & SpeakerEventGroupBits::STATE_RUNNING) {
+
  // Handle the task's state
  if (event_group_bits & SpeakerEventGroupBits::TASK_STARTING) {
    ESP_LOGD(TAG, "Starting");
    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::TASK_STARTING);
  }
  if (event_group_bits & SpeakerEventGroupBits::TASK_RUNNING) {
    ESP_LOGD(TAG, "Started");
    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::TASK_RUNNING);
    this->state_ = speaker::STATE_RUNNING;
    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
    this->status_clear_warning();
    this->status_clear_error();
  }
-  if (event_group_bits & SpeakerEventGroupBits::STATE_STOPPING) {
+  if (event_group_bits & SpeakerEventGroupBits::TASK_STOPPING) {
    ESP_LOGD(TAG, "Stopping");
    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::TASK_STOPPING);
    this->state_ = speaker::STATE_STOPPING;
    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
  }
-  if (event_group_bits & SpeakerEventGroupBits::STATE_STOPPED) {
+  if (event_group_bits & SpeakerEventGroupBits::TASK_STOPPED) {
    if (!this->task_created_) {
    ESP_LOGD(TAG, "Stopped");
-      this->state_ = speaker::STATE_STOPPED;
+
-      xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ALL_BITS);
+    vTaskDelete(this->speaker_task_handle_);
    this->speaker_task_handle_ = nullptr;
-    }
+
    this->stop_i2s_driver_();
    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ALL_BITS);
    this->status_clear_error();
    this->state_ = speaker::STATE_STOPPED;
  }
-  if (event_group_bits & SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START) {
+  // Log any errors encounted by the task
-    this->status_set_error("Failed to start task");
+  if (event_group_bits & SpeakerEventGroupBits::ERR_ESP_NO_MEM) {
-    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);
+    ESP_LOGE(TAG, "Not enough memory");
    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
  }
-  if (event_group_bits & SpeakerEventGroupBits::ALL_ERR_ESP_BITS) {
+  // Warn if any playback timestamp events are dropped, which drastically reduces synced playback accuracy
-    uint32_t error_bits = event_group_bits & SpeakerEventGroupBits::ALL_ERR_ESP_BITS;
+  if (event_group_bits & SpeakerEventGroupBits::WARN_DROPPED_EVENT) {
-    ESP_LOGW(TAG, "Writing failed: %s", esp_err_to_name(err_bit_to_esp_err(error_bits)));
+    ESP_LOGW(TAG, "Event dropped, synchronized playback accuracy is reduced");
-    this->status_set_warning();
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::WARN_DROPPED_EVENT);
  }
-  if (event_group_bits & SpeakerEventGroupBits::ERR_ESP_NOT_SUPPORTED) {
+  // Handle the speaker's state
-    this->status_set_error("Failed to adjust bus to match incoming audio");
+  switch (this->state_) {
-    ESP_LOGE(TAG, "Incompatible audio format: sample rate = %" PRIu32 ", channels = %u, bits per sample = %u",
+    case speaker::STATE_STARTING:
-             this->audio_stream_info_.get_sample_rate(), this->audio_stream_info_.get_channels(),
+      if (this->status_has_error()) {
-             this->audio_stream_info_.get_bits_per_sample());
+        break;
      }
-  xEventGroupClearBits(this->event_group_, ALL_ERR_ESP_BITS);
+      if (this->start_i2s_driver_(this->audio_stream_info_) != ESP_OK) {
        ESP_LOGE(TAG, "Driver failed to start; retrying in 1 second");
        this->status_momentary_error("driver-faiure", 1000);
        break;
      }
      if (this->speaker_task_handle_ == nullptr) {
        xTaskCreate(I2SAudioSpeaker::speaker_task, "speaker_task", TASK_STACK_SIZE, (void *) this, TASK_PRIORITY,
                    &this->speaker_task_handle_);
        if (this->speaker_task_handle_ == nullptr) {
          ESP_LOGE(TAG, "Task failed to start, retrying in 1 second");
          this->status_momentary_error("task-failure", 1000);
          this->stop_i2s_driver_();  // Stops the driver to return the lock; will be reloaded in next attempt
        }
      }
      break;
    case speaker::STATE_RUNNING:   // Intentional fallthrough
    case speaker::STATE_STOPPING:  // Intentional fallthrough
    case speaker::STATE_STOPPED:
      break;
  }
 }
 void I2SAudioSpeaker::set_volume(float volume) {
@@ -227,83 +218,76 @@ size_t I2SAudioSpeaker::play(const uint8_t *data, size_t length, TickType_t tick
    this->start();
  }
-  if ((this->state_ != speaker::STATE_RUNNING) || (this->audio_ring_buffer_.use_count() != 1)) {
+  if (this->state_ != speaker::STATE_RUNNING) {
    // Unable to write data to a running speaker, so delay the max amount of time so it can get ready
    vTaskDelay(ticks_to_wait);
    ticks_to_wait = 0;
  }
  size_t bytes_written = 0;
-  if ((this->state_ == speaker::STATE_RUNNING) && (this->audio_ring_buffer_.use_count() == 1)) {
+  if (this->state_ == speaker::STATE_RUNNING) {
-    // Only one owner of the ring buffer (the speaker task), so the ring buffer is allocated and no other components are
+    std::shared_ptr<RingBuffer> temp_ring_buffer = this->audio_ring_buffer_.lock();
-    // attempting to write to it.
+    if (temp_ring_buffer.use_count() == 2) {
-
+      // Only the speaker task and this temp_ring_buffer own the ring buffer, so its safe to write to
    // Temporarily share ownership of the ring buffer so it won't be deallocated while writing
    std::shared_ptr<RingBuffer> temp_ring_buffer = this->audio_ring_buffer_;
      bytes_written = temp_ring_buffer->write_without_replacement((void *) data, length, ticks_to_wait);
    }
  }
  return bytes_written;
 }
 bool I2SAudioSpeaker::has_buffered_data() const {
-  if (this->audio_ring_buffer_ != nullptr) {
+  if (this->audio_ring_buffer_.use_count() > 0) {
-    return this->audio_ring_buffer_->available() > 0;
+    std::shared_ptr<RingBuffer> temp_ring_buffer = this->audio_ring_buffer_.lock();
    return temp_ring_buffer->available() > 0;
  }
  return false;
 }
 void I2SAudioSpeaker::speaker_task(void *params) {
  I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) params;
  this_speaker->task_created_ = true;
-  uint32_t event_group_bits =
+  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::TASK_STARTING);
      xEventGroupWaitBits(this_speaker->event_group_,
                          SpeakerEventGroupBits::COMMAND_START | SpeakerEventGroupBits::COMMAND_STOP |
                              SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY,  // Bit message to read
                          pdTRUE,                                              // Clear the bits on exit
                          pdFALSE,                                             // Don't wait for all the bits,
                          portMAX_DELAY);                                      // Block indefinitely until a bit is set
  if (event_group_bits & (SpeakerEventGroupBits::COMMAND_STOP | SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY)) {
    // Received a stop signal before the task was requested to start
    this_speaker->delete_task_(0);
  }
  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_STARTING);
  audio::AudioStreamInfo audio_stream_info = this_speaker->audio_stream_info_;
  const uint32_t dma_buffers_duration_ms = DMA_BUFFER_DURATION_MS * DMA_BUFFERS_COUNT;
  // Ensure ring buffer duration is at least the duration of all DMA buffers
  const uint32_t ring_buffer_duration = std::max(dma_buffers_duration_ms, this_speaker->buffer_duration_ms_);
  // The DMA buffers may have more bits per sample, so calculate buffer sizes based in the input audio stream info
-  const size_t data_buffer_size = audio_stream_info.ms_to_bytes(dma_buffers_duration_ms);
+  const size_t ring_buffer_size = this_speaker->current_stream_info_.ms_to_bytes(ring_buffer_duration);
  const size_t ring_buffer_size = audio_stream_info.ms_to_bytes(ring_buffer_duration);
-  const size_t single_dma_buffer_input_size = data_buffer_size / DMA_BUFFERS_COUNT;
+  const uint32_t frames_to_fill_single_dma_buffer =
      this_speaker->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
  const size_t bytes_to_fill_single_dma_buffer =
      this_speaker->current_stream_info_.frames_to_bytes(frames_to_fill_single_dma_buffer);
-  if (this_speaker->send_esp_err_to_event_group_(this_speaker->allocate_buffers_(data_buffer_size, ring_buffer_size))) {
+  bool successful_setup = false;
-    // Failed to allocate buffers
+  std::unique_ptr<audio::AudioSourceTransferBuffer> transfer_buffer =
-    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
+      audio::AudioSourceTransferBuffer::create(bytes_to_fill_single_dma_buffer);
-    this_speaker->delete_task_(data_buffer_size);
+
  if (transfer_buffer != nullptr) {
    std::shared_ptr<RingBuffer> temp_ring_buffer = RingBuffer::create(ring_buffer_size);
    if (temp_ring_buffer.use_count() == 1) {
      transfer_buffer->set_source(temp_ring_buffer);
      this_speaker->audio_ring_buffer_ = temp_ring_buffer;
      successful_setup = true;
    }
  }
-  if (!this_speaker->send_esp_err_to_event_group_(this_speaker->start_i2s_driver_(audio_stream_info))) {
+  if (!successful_setup) {
-    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
+    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
-
+  } else {
    bool stop_gracefully = false;
    bool tx_dma_underflow = true;
    uint32_t frames_written = 0;
    uint32_t last_data_received_time = millis();
    bool tx_dma_underflow = false;
-    this_speaker->accumulated_frames_written_ = 0;
+    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::TASK_RUNNING);
    // Keep looping if paused, there is no timeout configured, or data was received more recently than the configured
    // timeout
    while (this_speaker->pause_state_ || !this_speaker->timeout_.has_value() ||
           (millis() - last_data_received_time) <= this_speaker->timeout_.value()) {
-      event_group_bits = xEventGroupGetBits(this_speaker->event_group_);
+      uint32_t event_group_bits = xEventGroupGetBits(this_speaker->event_group_);
      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
        xEventGroupClearBits(this_speaker->event_group_, SpeakerEventGroupBits::COMMAND_STOP);
@@ -314,7 +298,7 @@ void I2SAudioSpeaker::speaker_task(void *params) {
        stop_gracefully = true;
      }
-      if (this_speaker->audio_stream_info_ != audio_stream_info) {
+      if (this_speaker->audio_stream_info_ != this_speaker->current_stream_info_) {
        // Audio stream info changed, stop the speaker task so it will restart with the proper settings.
        break;
      }
@@ -326,36 +310,75 @@ void I2SAudioSpeaker::speaker_task(void *params) {
        }
      }
 #else
-      bool overflow;
+      int64_t write_timestamp;
-      while (xQueueReceive(this_speaker->i2s_event_queue_, &overflow, 0)) {
+      while (xQueueReceive(this_speaker->i2s_event_queue_, &write_timestamp, 0)) {
-        if (overflow) {
+        // Receives timing events from the I2S on_sent callback. If actual audio data was sent in this event, it passes
        // on the timing info via the audio_output_callback.
        uint32_t frames_sent = frames_to_fill_single_dma_buffer;
        if (frames_to_fill_single_dma_buffer > frames_written) {
          tx_dma_underflow = true;
          frames_sent = frames_written;
          const uint32_t frames_zeroed = frames_to_fill_single_dma_buffer - frames_written;
          write_timestamp -= this_speaker->current_stream_info_.frames_to_microseconds(frames_zeroed);
        } else {
          tx_dma_underflow = false;
        }
        frames_written -= frames_sent;
        if (frames_sent > 0) {
          this_speaker->audio_output_callback_(frames_sent, write_timestamp);
        }
      }
 #endif
      if (this_speaker->pause_state_) {
        // Pause state is accessed atomically, so thread safe
-        // Delay so the task can yields, then skip transferring audio data
+        // Delay so the task yields, then skip transferring audio data
-        delay(TASK_DELAY_MS);
+        vTaskDelay(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS));
        continue;
      }
-      size_t bytes_read = this_speaker->audio_ring_buffer_->read((void *) this_speaker->data_buffer_, data_buffer_size,
+      // Wait half the duration of the data already written to the DMA buffers for new audio data
-                                                                 pdMS_TO_TICKS(TASK_DELAY_MS));
+      // The millisecond helper modifies the frames_written variable, so use the microsecond helper and divide by 1000
      const uint32_t read_delay =
          (this_speaker->current_stream_info_.frames_to_microseconds(frames_written) / 1000) / 2;
      uint8_t *new_data = transfer_buffer->get_buffer_end();  // track start of any newly copied bytes
      size_t bytes_read = transfer_buffer->transfer_data_from_source(pdMS_TO_TICKS(read_delay));
      if (bytes_read > 0) {
-        if ((audio_stream_info.get_bits_per_sample() == 16) && (this_speaker->q15_volume_factor_ < INT16_MAX)) {
+        if (this_speaker->q15_volume_factor_ < INT16_MAX) {
-          // Scale samples by the volume factor in place
+          // Apply the software volume adjustment by unpacking the sample into a Q31 fixed-point number, shifting it,
-          q15_multiplication((int16_t *) this_speaker->data_buffer_, (int16_t *) this_speaker->data_buffer_,
+          // multiplying by the volume factor, and packing the sample back into the original bytes per sample.
-                             bytes_read / sizeof(int16_t), this_speaker->q15_volume_factor_);
+
          const size_t bytes_per_sample = this_speaker->current_stream_info_.samples_to_bytes(1);
          const uint32_t len = bytes_read / bytes_per_sample;
          // Use Q16 for samples with 1 or 2 bytes: shifted_sample * gain_factor is Q16 * Q15 -> Q31
          int32_t shift = 15;                                      // Q31 -> Q16
          int32_t gain_factor = this_speaker->q15_volume_factor_;  // Q15
          if (bytes_per_sample >= 3) {
            // Use Q23 for samples with 3 or 4 bytes: shifted_sample * gain_factor is Q23 * Q8 -> Q31
            shift = 8;          // Q31 -> Q23
            gain_factor >>= 7;  // Q15 -> Q8
          }
          for (uint32_t i = 0; i < len; ++i) {
            int32_t sample =
                audio::unpack_audio_sample_to_q31(&new_data[i * bytes_per_sample], bytes_per_sample);  // Q31
            sample >>= shift;
            sample *= gain_factor;  // Q31
            audio::pack_q31_as_audio_sample(sample, &new_data[i * bytes_per_sample], bytes_per_sample);
          }
        }
 #ifdef USE_ESP32_VARIANT_ESP32
        // For ESP32 8/16 bit mono mode samples need to be switched.
-        if (audio_stream_info.get_channels() == 1 && audio_stream_info.get_bits_per_sample() <= 16) {
+        if (this_speaker->current_stream_info_.get_channels() == 1 &&
            this_speaker->current_stream_info_.get_bits_per_sample() <= 16) {
          size_t len = bytes_read / sizeof(int16_t);
-          int16_t *tmp_buf = (int16_t *) this_speaker->data_buffer_;
+          int16_t *tmp_buf = (int16_t *) new_data;
          for (int i = 0; i < len; i += 2) {
            int16_t tmp = tmp_buf[i];
            tmp_buf[i] = tmp_buf[i + 1];
@@ -363,62 +386,87 @@ void I2SAudioSpeaker::speaker_task(void *params) {
          }
        }
 #endif
        // Write the audio data to a single DMA buffer at a time to reduce latency for the audio duration played
        // callback.
        const uint32_t batches = (bytes_read + single_dma_buffer_input_size - 1) / single_dma_buffer_input_size;
        for (uint32_t i = 0; i < batches; ++i) {
          size_t bytes_written = 0;
          size_t bytes_to_write = std::min(single_dma_buffer_input_size, bytes_read);
 #ifdef USE_I2S_LEGACY
          if (audio_stream_info.get_bits_per_sample() == (uint8_t) this_speaker->bits_per_sample_) {
            i2s_write(this_speaker->parent_->get_port(), this_speaker->data_buffer_ + i * single_dma_buffer_input_size,
                      bytes_to_write, &bytes_written, pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * 5));
          } else if (audio_stream_info.get_bits_per_sample() < (uint8_t) this_speaker->bits_per_sample_) {
            i2s_write_expand(this_speaker->parent_->get_port(),
                             this_speaker->data_buffer_ + i * single_dma_buffer_input_size, bytes_to_write,
                             audio_stream_info.get_bits_per_sample(), this_speaker->bits_per_sample_, &bytes_written,
                             pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * 5));
      }
 #else
          i2s_channel_write(this_speaker->tx_handle_, this_speaker->data_buffer_ + i * single_dma_buffer_input_size,
                            bytes_to_write, &bytes_written, pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * 5));
 #endif
-          int64_t now = esp_timer_get_time();
+      if (transfer_buffer->available() == 0) {
          if (bytes_written != bytes_to_write) {
            xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE);
          }
          bytes_read -= bytes_written;
          this_speaker->audio_output_callback_(audio_stream_info.bytes_to_frames(bytes_written),
                                               now + dma_buffers_duration_ms * 1000);
          tx_dma_underflow = false;
          last_data_received_time = millis();
        }
      } else {
        // No data received
        if (stop_gracefully && tx_dma_underflow) {
          break;
        }
-      }
+        vTaskDelay(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS / 2));
-    }
+      } else {
-
+        size_t bytes_written = 0;
    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
 #ifdef USE_I2S_LEGACY
-    i2s_driver_uninstall(this_speaker->parent_->get_port());
+        if (this_speaker->current_stream_info_.get_bits_per_sample() == (uint8_t) this_speaker->bits_per_sample_) {
          i2s_write(this_speaker->parent_->get_port(), transfer_buffer->get_buffer_start(),
                    transfer_buffer->available(), &bytes_written, pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS));
        } else if (this_speaker->current_stream_info_.get_bits_per_sample() <
                   (uint8_t) this_speaker->bits_per_sample_) {
          i2s_write_expand(this_speaker->parent_->get_port(), transfer_buffer->get_buffer_start(),
                           transfer_buffer->available(), this_speaker->current_stream_info_.get_bits_per_sample(),
                           this_speaker->bits_per_sample_, &bytes_written, pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS));
        }
 #else
        if (tx_dma_underflow) {
          // Temporarily disable channel and callback to reset the I2S driver's internal DMA buffer queue so timing
          // callbacks are accurate. Preload the data.
          i2s_channel_disable(this_speaker->tx_handle_);
-    i2s_del_channel(this_speaker->tx_handle_);
+          const i2s_event_callbacks_t callbacks = {
-#endif
+              .on_sent = nullptr,
          };
-    this_speaker->parent_->unlock();
+          i2s_channel_register_event_callback(this_speaker->tx_handle_, &callbacks, this_speaker);
          i2s_channel_preload_data(this_speaker->tx_handle_, transfer_buffer->get_buffer_start(),
                                   transfer_buffer->available(), &bytes_written);
        } else {
          // Audio is already playing, use regular I2S write to add to the DMA buffers
          i2s_channel_write(this_speaker->tx_handle_, transfer_buffer->get_buffer_start(), transfer_buffer->available(),
                            &bytes_written, DMA_BUFFER_DURATION_MS);
        }
 #endif
        if (bytes_written > 0) {
          last_data_received_time = millis();
          frames_written += this_speaker->current_stream_info_.bytes_to_frames(bytes_written);
          transfer_buffer->decrease_buffer_length(bytes_written);
          if (tx_dma_underflow) {
            tx_dma_underflow = false;
 #ifndef USE_I2S_LEGACY
            // Reset the event queue timestamps
            // Enable the on_sent callback to accurately track the timestamps of played audio
            // Enable the I2S channel to start sending the preloaded audio
            xQueueReset(this_speaker->i2s_event_queue_);
            const i2s_event_callbacks_t callbacks = {
                .on_sent = i2s_on_sent_cb,
            };
            i2s_channel_register_event_callback(this_speaker->tx_handle_, &callbacks, this_speaker);
            i2s_channel_enable(this_speaker->tx_handle_);
 #endif
          }
 #ifdef USE_I2S_LEGACY
          // The legacy driver doesn't easily support the callback approach for timestamps, so fall back to a direct but
          // less accurate approach.
          this_speaker->audio_output_callback_(this_speaker->current_stream_info_.bytes_to_frames(bytes_written),
                                               esp_timer_get_time() + dma_buffers_duration_ms * 1000);
 #endif
        }
      }
    }
  }
-  this_speaker->delete_task_(data_buffer_size);
+  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::TASK_STOPPING);
  if (transfer_buffer != nullptr) {
    transfer_buffer.reset();
  }
  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::TASK_STOPPED);
  while (true) {
    // Continuously delay until the loop method deletes the task
    vTaskDelay(pdMS_TO_TICKS(10));
  }
 }
 void I2SAudioSpeaker::start() {
@@ -427,16 +475,7 @@ void I2SAudioSpeaker::start() {
  if ((this->state_ == speaker::STATE_STARTING) || (this->state_ == speaker::STATE_RUNNING))
    return;
  if (!this->task_created_ && (this->speaker_task_handle_ == nullptr)) {
    xTaskCreate(I2SAudioSpeaker::speaker_task, "speaker_task", TASK_STACK_SIZE, (void *) this, TASK_PRIORITY,
                &this->speaker_task_handle_);
    if (this->speaker_task_handle_ != nullptr) {
  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);
    } else {
      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);
    }
  }
 }
 void I2SAudioSpeaker::stop() { this->stop_(false); }
@@ -456,61 +495,16 @@ void I2SAudioSpeaker::stop_(bool wait_on_empty) {
  }
 }
 bool I2SAudioSpeaker::send_esp_err_to_event_group_(esp_err_t err) {
  switch (err) {
    case ESP_OK:
      return false;
    case ESP_ERR_INVALID_STATE:
      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_STATE);
      return true;
    case ESP_ERR_INVALID_ARG:
      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_ARG);
      return true;
    case ESP_ERR_INVALID_SIZE:
      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE);
      return true;
    case ESP_ERR_NO_MEM:
      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
      return true;
    case ESP_ERR_NOT_SUPPORTED:
      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NOT_SUPPORTED);
      return true;
    default:
      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_FAIL);
      return true;
  }
 }
 esp_err_t I2SAudioSpeaker::allocate_buffers_(size_t data_buffer_size, size_t ring_buffer_size) {
  if (this->data_buffer_ == nullptr) {
    // Allocate data buffer for temporarily storing audio from the ring buffer before writing to the I2S bus
    RAMAllocator<uint8_t> allocator;
    this->data_buffer_ = allocator.allocate(data_buffer_size);
  }
  if (this->data_buffer_ == nullptr) {
    return ESP_ERR_NO_MEM;
  }
  if (this->audio_ring_buffer_.use_count() == 0) {
    // Allocate ring buffer. Uses a shared_ptr to ensure it isn't improperly deallocated.
    this->audio_ring_buffer_ = RingBuffer::create(ring_buffer_size);
  }
  if (this->audio_ring_buffer_ == nullptr) {
    return ESP_ERR_NO_MEM;
  }
  return ESP_OK;
 }
 esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_stream_info) {
  this->current_stream_info_ = audio_stream_info;  // store the stream info settings the driver will use
 #ifdef USE_I2S_LEGACY
  if ((this->i2s_mode_ & I2S_MODE_SLAVE) && (this->sample_rate_ != audio_stream_info.get_sample_rate())) {  // NOLINT
 #else
  if ((this->i2s_role_ & I2S_ROLE_SLAVE) && (this->sample_rate_ != audio_stream_info.get_sample_rate())) {  // NOLINT
 #endif
    // Can't reconfigure I2S bus, so the sample rate must match the configured value
    ESP_LOGE(TAG, "Audio stream settings are not compatible with this I2S configuration");
    return ESP_ERR_NOT_SUPPORTED;
  }
@@ -521,10 +515,12 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
      (i2s_slot_bit_width_t) audio_stream_info.get_bits_per_sample() > this->slot_bit_width_) {
 #endif
    // Currently can't handle the case when the incoming audio has more bits per sample than the configured value
    ESP_LOGE(TAG, "Audio streams with more bits per sample than the I2S speaker's configuration is not supported");
    return ESP_ERR_NOT_SUPPORTED;
  }
  if (!this->parent_->try_lock()) {
    ESP_LOGE(TAG, "Parent I2S bus not free");
    return ESP_ERR_INVALID_STATE;
  }
@@ -575,6 +571,7 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
  esp_err_t err =
      i2s_driver_install(this->parent_->get_port(), &config, I2S_EVENT_QUEUE_COUNT, &this->i2s_event_queue_);
  if (err != ESP_OK) {
    ESP_LOGE(TAG, "Failed to install I2S legacy driver");
    // Failed to install the driver, so unlock the I2S port
    this->parent_->unlock();
    return err;
@@ -595,6 +592,7 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
  if (err != ESP_OK) {
    // Failed to set the data out pin, so uninstall the driver and unlock the I2S port
    ESP_LOGE(TAG, "Failed to set the data out pin");
    i2s_driver_uninstall(this->parent_->get_port());
    this->parent_->unlock();
  }
@@ -605,10 +603,12 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
      .dma_desc_num = DMA_BUFFERS_COUNT,
      .dma_frame_num = dma_buffer_length,
      .auto_clear = true,
      .intr_priority = 3,
  };
  /* Allocate a new TX channel and get the handle of this channel */
  esp_err_t err = i2s_new_channel(&chan_cfg, &this->tx_handle_, NULL);
  if (err != ESP_OK) {
    ESP_LOGE(TAG, "Failed to allocate new I2S channel");
    this->parent_->unlock();
    return err;
  }
@@ -652,7 +652,11 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
  // per sample causes the audio to play too fast. Setting the ws_width to the configured slot bit width seems to
  // make it play at the correct speed while sending more bits per slot.
  if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO) {
-    std_slot_cfg.ws_width = static_cast<uint32_t>(this->slot_bit_width_);
+    uint32_t configured_bit_width = static_cast<uint32_t>(this->slot_bit_width_);
    std_slot_cfg.ws_width = configured_bit_width;
    if (configured_bit_width > 16) {
      std_slot_cfg.msb_right = false;
    }
  }
 #else
  std_slot_cfg.slot_bit_width = this->slot_bit_width_;
@@ -670,54 +674,56 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
  err = i2s_channel_init_std_mode(this->tx_handle_, &std_cfg);
  if (err != ESP_OK) {
    ESP_LOGE(TAG, "Failed to initialize channel");
    i2s_del_channel(this->tx_handle_);
    this->tx_handle_ = nullptr;
    this->parent_->unlock();
    return err;
  }
  if (this->i2s_event_queue_ == nullptr) {
-    this->i2s_event_queue_ = xQueueCreate(1, sizeof(bool));
+    this->i2s_event_queue_ = xQueueCreate(I2S_EVENT_QUEUE_COUNT, sizeof(int64_t));
  }
  const i2s_event_callbacks_t callbacks = {
      .on_send_q_ovf = i2s_overflow_cb,
  };
  i2s_channel_register_event_callback(this->tx_handle_, &callbacks, this);
  /* Before reading data, start the TX channel first */
  i2s_channel_enable(this->tx_handle_);
  if (err != ESP_OK) {
    i2s_del_channel(this->tx_handle_);
    this->parent_->unlock();
  }
 #endif
  return err;
 }
-void I2SAudioSpeaker::delete_task_(size_t buffer_size) {
+#ifndef USE_I2S_LEGACY
-  this->audio_ring_buffer_.reset();  // Releases ownership of the shared_ptr
+bool IRAM_ATTR I2SAudioSpeaker::i2s_on_sent_cb(i2s_chan_handle_t handle, i2s_event_data_t *event, void *user_ctx) {
  int64_t now = esp_timer_get_time();
-  if (this->data_buffer_ != nullptr) {
+  BaseType_t need_yield1 = pdFALSE;
-    RAMAllocator<uint8_t> allocator;
+  BaseType_t need_yield2 = pdFALSE;
-    allocator.deallocate(this->data_buffer_, buffer_size);
+  BaseType_t need_yield3 = pdFALSE;
-    this->data_buffer_ = nullptr;
+
  I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) user_ctx;
  if (xQueueIsQueueFullFromISR(this_speaker->i2s_event_queue_)) {
    // Queue is full, so discard the oldest event and set the warning flag to inform the user
    int64_t dummy;
    xQueueReceiveFromISR(this_speaker->i2s_event_queue_, &dummy, &need_yield1);
    xEventGroupSetBitsFromISR(this_speaker->event_group_, SpeakerEventGroupBits::WARN_DROPPED_EVENT, &need_yield2);
  }
-  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPED);
+  xQueueSendToBackFromISR(this_speaker->i2s_event_queue_, &now, &need_yield3);
-  this->task_created_ = false;
+  return need_yield1 | need_yield2 | need_yield3;
  vTaskDelete(nullptr);
 }
 #ifndef USE_I2S_LEGACY
 bool IRAM_ATTR I2SAudioSpeaker::i2s_overflow_cb(i2s_chan_handle_t handle, i2s_event_data_t *event, void *user_ctx) {
  I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) user_ctx;
  bool overflow = true;
  xQueueOverwrite(this_speaker->i2s_event_queue_, &overflow);
  return false;
 }
 #endif
 void I2SAudioSpeaker::stop_i2s_driver_() {
 #ifdef USE_I2S_LEGACY
  i2s_driver_uninstall(this->parent_->get_port());
 #else
  i2s_channel_disable(this->tx_handle_);
  i2s_del_channel(this->tx_handle_);
  this->tx_handle_ = nullptr;
 #endif
  this->parent_->unlock();
 }
 }  // namespace i2s_audio
 }  // namespace esphome
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
@@ -72,70 +72,57 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
 protected:
  /// @brief Function for the FreeRTOS task handling audio output.
-  /// After receiving the COMMAND_START signal, allocates space for the buffers, starts the I2S driver, and reads
+  /// Allocates space for the buffers, reads audio from the ring buffer and writes audio to the I2S port. Stops
-  /// audio from the ring buffer and writes audio to the I2S port. Stops immmiately after receiving the COMMAND_STOP
+  /// immmiately after receiving the COMMAND_STOP signal and stops only after the ring buffer is empty after receiving
-  /// signal and stops only after the ring buffer is empty after receiving the COMMAND_STOP_GRACEFULLY signal. Stops if
+  /// the COMMAND_STOP_GRACEFULLY signal. Stops if the ring buffer hasn't read data for more than timeout_ milliseconds.
-  /// the ring buffer hasn't read data for more than timeout_ milliseconds. When stopping, it deallocates the buffers,
+  /// When stopping, it deallocates the buffers. It communicates its state and any errors via ``event_group_``.
  /// stops the I2S driver, unlocks the I2S port, and deletes the task. It communicates the state and any errors via
  /// event_group_.
  /// @param params I2SAudioSpeaker component
  static void speaker_task(void *params);
-  /// @brief Sends a stop command to the speaker task via event_group_.
+  /// @brief Sends a stop command to the speaker task via ``event_group_``.
  /// @param wait_on_empty If false, sends the COMMAND_STOP signal. If true, sends the COMMAND_STOP_GRACEFULLY signal.
  void stop_(bool wait_on_empty);
  /// @brief Sets the corresponding ERR_ESP event group bits.
  /// @param err esp_err_t error code.
  /// @return True if an ERR_ESP bit is set and false if err == ESP_OK
  bool send_esp_err_to_event_group_(esp_err_t err);
 #ifndef USE_I2S_LEGACY
-  static bool i2s_overflow_cb(i2s_chan_handle_t handle, i2s_event_data_t *event, void *user_ctx);
+  /// @brief Callback function used to send playback timestamps the to the speaker task.
  /// @param handle (i2s_chan_handle_t)
  /// @param event (i2s_event_data_t)
  /// @param user_ctx (void*) User context pointer that the callback accesses
  /// @return True if a higher priority task was interrupted
  static bool i2s_on_sent_cb(i2s_chan_handle_t handle, i2s_event_data_t *event, void *user_ctx);
 #endif
  /// @brief Allocates the data buffer and ring buffer
  /// @param data_buffer_size Number of bytes to allocate for the data buffer.
  /// @param ring_buffer_size Number of bytes to allocate for the ring buffer.
  /// @return ESP_ERR_NO_MEM if either buffer fails to allocate
  ///         ESP_OK if successful
  esp_err_t allocate_buffers_(size_t data_buffer_size, size_t ring_buffer_size);
  /// @brief Starts the ESP32 I2S driver.
  /// Attempts to lock the I2S port, starts the I2S driver using the passed in stream information, and sets the data out
-  /// pin. If it fails, it will unlock the I2S port and uninstall the driver, if necessary.
+  /// pin. If it fails, it will unlock the I2S port and uninstalls the driver, if necessary.
  /// @param audio_stream_info Stream information for the I2S driver.
  /// @return ESP_ERR_NOT_ALLOWED if the I2S port can't play the incoming audio stream.
  ///         ESP_ERR_INVALID_STATE if the I2S port is already locked.
-  ///         ESP_ERR_INVALID_ARG if nstalling the driver or setting the data outpin fails due to a parameter error.
+  ///         ESP_ERR_INVALID_ARG if installing the driver or setting the data outpin fails due to a parameter error.
  ///         ESP_ERR_NO_MEM if the driver fails to install due to a memory allocation error.
-  ///         ESP_FAIL if setting the data out pin fails due to an IO error ESP_OK if successful
+  ///         ESP_FAIL if setting the data out pin fails due to an IO error
  ///         ESP_OK if successful
  esp_err_t start_i2s_driver_(audio::AudioStreamInfo &audio_stream_info);
-  /// @brief Deletes the speaker's task.
+  /// @brief Stops the I2S driver and unlocks the I2S port
-  /// Deallocates the data_buffer_ and audio_ring_buffer_, if necessary, and deletes the task. Should only be called by
+  void stop_i2s_driver_();
  /// the speaker_task itself.
  /// @param buffer_size The allocated size of the data_buffer_.
  void delete_task_(size_t buffer_size);
  TaskHandle_t speaker_task_handle_{nullptr};
  EventGroupHandle_t event_group_{nullptr};
  QueueHandle_t i2s_event_queue_;
-  uint8_t *data_buffer_;
+  std::weak_ptr<RingBuffer> audio_ring_buffer_;
  std::shared_ptr<RingBuffer> audio_ring_buffer_;
  uint32_t buffer_duration_ms_;
  optional<uint32_t> timeout_;
  bool task_created_{false};
  bool pause_state_{false};
  int16_t q15_volume_factor_{INT16_MAX};
-  size_t bytes_written_{0};
+  audio::AudioStreamInfo current_stream_info_;  // The currently loaded driver's stream info
 #ifdef USE_I2S_LEGACY
 #if SOC_I2S_SUPPORTS_DAC
@@ -148,8 +135,6 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
  std::string i2s_comm_fmt_;
  i2s_chan_handle_t tx_handle_;
 #endif
  uint32_t accumulated_frames_written_{0};
 };
 }  // namespace i2s_audio