[i2s_audio] Speaker improvements: CPU core agnostic and more accurate timestamps (#9800)

Co-authored-by: NP v/d Spek <github_mail@lumensoft.nl>
2025-07-29 14:46:40 +00:00 · 2025-07-24 04:14:00 +01:00 · 2025-07-24 04:14:00 +01:00 · 6398bb2fdf
commit 6398bb2fdf
parent 108e447072
3 changed files with 302 additions and 307 deletions
--- a/esphome/components/i2s_audio/__init__.py
+++ b/esphome/components/i2s_audio/__init__.py
@@ -1,6 +1,6 @@
 from esphome import pins
 import esphome.codegen as cg
-from esphome.components.esp32 import get_esp32_variant
+from esphome.components.esp32 import add_idf_sdkconfig_option, get_esp32_variant
 from esphome.components.esp32.const import (
    VARIANT_ESP32,
    VARIANT_ESP32C3,
@@ -258,6 +258,10 @@ async def to_code(config):
    if use_legacy():
        cg.add_define("USE_I2S_LEGACY")

+    # Helps avoid callbacks being skipped due to processor load
+    if CORE.using_esp_idf:
+        add_idf_sdkconfig_option("CONFIG_I2S_ISR_IRAM_SAFE", True)
+
    cg.add(var.set_lrclk_pin(config[CONF_I2S_LRCLK_PIN]))
    if CONF_I2S_BCLK_PIN in config:
        cg.add(var.set_bclk_pin(config[CONF_I2S_BCLK_PIN]))
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
@@ -9,6 +9,7 @@
 #endif

 #include "esphome/components/audio/audio.h"
+#include "esphome/components/audio/audio_transfer_buffer.h"

 #include "esphome/core/application.h"
 #include "esphome/core/hal.h"
@@ -19,72 +20,33 @@
 namespace esphome {
 namespace i2s_audio {

-static const uint8_t DMA_BUFFER_DURATION_MS = 15;
+static const uint32_t DMA_BUFFER_DURATION_MS = 15;
 static const size_t DMA_BUFFERS_COUNT = 4;

-static const size_t TASK_DELAY_MS = DMA_BUFFER_DURATION_MS * DMA_BUFFERS_COUNT / 2;
-
 static const size_t TASK_STACK_SIZE = 4096;
-static const ssize_t TASK_PRIORITY = 23;
+static const ssize_t TASK_PRIORITY = 19;

 static const size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT + 1;

 static const char *const TAG = "i2s_audio.speaker";

 enum SpeakerEventGroupBits : uint32_t {
-  COMMAND_START = (1 << 0),            // starts the speaker task
+  COMMAND_START = (1 << 0),            // indicates loop should start speaker task
  COMMAND_STOP = (1 << 1),             // stops the speaker task
  COMMAND_STOP_GRACEFULLY = (1 << 2),  // Stops the speaker task once all data has been written
-  STATE_STARTING = (1 << 10),
-  STATE_RUNNING = (1 << 11),
-  STATE_STOPPING = (1 << 12),
-  STATE_STOPPED = (1 << 13),
-  ERR_TASK_FAILED_TO_START = (1 << 14),
-  ERR_ESP_INVALID_STATE = (1 << 15),
-  ERR_ESP_NOT_SUPPORTED = (1 << 16),
-  ERR_ESP_INVALID_ARG = (1 << 17),
-  ERR_ESP_INVALID_SIZE = (1 << 18),
+
+  TASK_STARTING = (1 << 10),
+  TASK_RUNNING = (1 << 11),
+  TASK_STOPPING = (1 << 12),
+  TASK_STOPPED = (1 << 13),
+
  ERR_ESP_NO_MEM = (1 << 19),
-  ERR_ESP_FAIL = (1 << 20),
-  ALL_ERR_ESP_BITS = ERR_ESP_INVALID_STATE | ERR_ESP_NOT_SUPPORTED | ERR_ESP_INVALID_ARG | ERR_ESP_INVALID_SIZE |
-                     ERR_ESP_NO_MEM | ERR_ESP_FAIL,
+
+  WARN_DROPPED_EVENT = (1 << 20),
+
  ALL_BITS = 0x00FFFFFF,  // All valid FreeRTOS event group bits
 };

-// Translates a SpeakerEventGroupBits ERR_ESP bit to the coressponding esp_err_t
-static esp_err_t err_bit_to_esp_err(uint32_t bit) {
-  switch (bit) {
-    case SpeakerEventGroupBits::ERR_ESP_INVALID_STATE:
-      return ESP_ERR_INVALID_STATE;
-    case SpeakerEventGroupBits::ERR_ESP_INVALID_ARG:
-      return ESP_ERR_INVALID_ARG;
-    case SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE:
-      return ESP_ERR_INVALID_SIZE;
-    case SpeakerEventGroupBits::ERR_ESP_NO_MEM:
-      return ESP_ERR_NO_MEM;
-    case SpeakerEventGroupBits::ERR_ESP_NOT_SUPPORTED:
-      return ESP_ERR_NOT_SUPPORTED;
-    default:
-      return ESP_FAIL;
-  }
-}
-
-/// @brief Multiplies the input array of Q15 numbers by a Q15 constant factor
-///
-/// Based on `dsps_mulc_s16_ansi` from the esp-dsp library:
-/// https://github.com/espressif/esp-dsp/blob/master/modules/math/mulc/fixed/dsps_mulc_s16_ansi.c
-/// (accessed on 2024-09-30).
-/// @param input Array of Q15 numbers
-/// @param output Array of Q15 numbers
-/// @param len Length of array
-/// @param c Q15 constant factor
-static void q15_multiplication(const int16_t *input, int16_t *output, size_t len, int16_t c) {
-  for (int i = 0; i < len; i++) {
-    int32_t acc = (int32_t) input[i] * (int32_t) c;
-    output[i] = (int16_t) (acc >> 15);
-  }
-}
-
 // Lists the Q15 fixed point scaling factor for volume reduction.
 // Has 100 values representing silence and a reduction [49, 48.5, ... 0.5, 0] dB.
 // dB to PCM scaling factor formula: floating_point_scale_factor = 2^(-db/6.014)
@@ -132,51 +94,80 @@ void I2SAudioSpeaker::dump_config() {
 void I2SAudioSpeaker::loop() {
  uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);

-  if (event_group_bits & SpeakerEventGroupBits::STATE_STARTING) {
-    ESP_LOGD(TAG, "Starting");
+  if ((event_group_bits & SpeakerEventGroupBits::COMMAND_START) && (this->state_ == speaker::STATE_STOPPED)) {
    this->state_ = speaker::STATE_STARTING;
-    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_STARTING);
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);
  }
-  if (event_group_bits & SpeakerEventGroupBits::STATE_RUNNING) {
+
+  // Handle the task's state
+  if (event_group_bits & SpeakerEventGroupBits::TASK_STARTING) {
+    ESP_LOGD(TAG, "Starting");
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::TASK_STARTING);
+  }
+  if (event_group_bits & SpeakerEventGroupBits::TASK_RUNNING) {
    ESP_LOGD(TAG, "Started");
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::TASK_RUNNING);
    this->state_ = speaker::STATE_RUNNING;
-    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
-    this->status_clear_warning();
-    this->status_clear_error();
  }
-  if (event_group_bits & SpeakerEventGroupBits::STATE_STOPPING) {
+  if (event_group_bits & SpeakerEventGroupBits::TASK_STOPPING) {
    ESP_LOGD(TAG, "Stopping");
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::TASK_STOPPING);
    this->state_ = speaker::STATE_STOPPING;
-    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
  }
-  if (event_group_bits & SpeakerEventGroupBits::STATE_STOPPED) {
-    if (!this->task_created_) {
-      ESP_LOGD(TAG, "Stopped");
-      this->state_ = speaker::STATE_STOPPED;
-      xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ALL_BITS);
-      this->speaker_task_handle_ = nullptr;
-    }
+  if (event_group_bits & SpeakerEventGroupBits::TASK_STOPPED) {
+    ESP_LOGD(TAG, "Stopped");
+
+    vTaskDelete(this->speaker_task_handle_);
+    this->speaker_task_handle_ = nullptr;
+
+    this->stop_i2s_driver_();
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ALL_BITS);
+    this->status_clear_error();
+
+    this->state_ = speaker::STATE_STOPPED;
  }

-  if (event_group_bits & SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START) {
-    this->status_set_error("Failed to start task");
-    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);
+  // Log any errors encountered by the task
+  if (event_group_bits & SpeakerEventGroupBits::ERR_ESP_NO_MEM) {
+    ESP_LOGE(TAG, "Not enough memory");
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
  }

-  if (event_group_bits & SpeakerEventGroupBits::ALL_ERR_ESP_BITS) {
-    uint32_t error_bits = event_group_bits & SpeakerEventGroupBits::ALL_ERR_ESP_BITS;
-    ESP_LOGW(TAG, "Writing failed: %s", esp_err_to_name(err_bit_to_esp_err(error_bits)));
-    this->status_set_warning();
+  // Warn if any playback timestamp events are dropped, which drastically reduces synced playback accuracy
+  if (event_group_bits & SpeakerEventGroupBits::WARN_DROPPED_EVENT) {
+    ESP_LOGW(TAG, "Event dropped, synchronized playback accuracy is reduced");
+    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::WARN_DROPPED_EVENT);
  }

-  if (event_group_bits & SpeakerEventGroupBits::ERR_ESP_NOT_SUPPORTED) {
-    this->status_set_error("Failed to adjust bus to match incoming audio");
-    ESP_LOGE(TAG, "Incompatible audio format: sample rate = %" PRIu32 ", channels = %u, bits per sample = %u",
-             this->audio_stream_info_.get_sample_rate(), this->audio_stream_info_.get_channels(),
-             this->audio_stream_info_.get_bits_per_sample());
-  }
+  // Handle the speaker's state
+  switch (this->state_) {
+    case speaker::STATE_STARTING:
+      if (this->status_has_error()) {
+        break;
+      }

-  xEventGroupClearBits(this->event_group_, ALL_ERR_ESP_BITS);
+      if (this->start_i2s_driver_(this->audio_stream_info_) != ESP_OK) {
+        ESP_LOGE(TAG, "Driver failed to start; retrying in 1 second");
+        this->status_momentary_error("driver-faiure", 1000);
+        break;
+      }
+
+      if (this->speaker_task_handle_ == nullptr) {
+        xTaskCreate(I2SAudioSpeaker::speaker_task, "speaker_task", TASK_STACK_SIZE, (void *) this, TASK_PRIORITY,
+                    &this->speaker_task_handle_);
+
+        if (this->speaker_task_handle_ == nullptr) {
+          ESP_LOGE(TAG, "Task failed to start, retrying in 1 second");
+          this->status_momentary_error("task-failure", 1000);
+          this->stop_i2s_driver_();  // Stops the driver to return the lock; will be reloaded in next attempt
+        }
+      }
+      break;
+    case speaker::STATE_RUNNING:   // Intentional fallthrough
+    case speaker::STATE_STOPPING:  // Intentional fallthrough
+    case speaker::STATE_STOPPED:
+      break;
+  }
 }

 void I2SAudioSpeaker::set_volume(float volume) {
@@ -227,83 +218,76 @@ size_t I2SAudioSpeaker::play(const uint8_t *data, size_t length, TickType_t tick
    this->start();
  }

-  if ((this->state_ != speaker::STATE_RUNNING) || (this->audio_ring_buffer_.use_count() != 1)) {
+  if (this->state_ != speaker::STATE_RUNNING) {
    // Unable to write data to a running speaker, so delay the max amount of time so it can get ready
    vTaskDelay(ticks_to_wait);
    ticks_to_wait = 0;
  }

  size_t bytes_written = 0;
-  if ((this->state_ == speaker::STATE_RUNNING) && (this->audio_ring_buffer_.use_count() == 1)) {
-    // Only one owner of the ring buffer (the speaker task), so the ring buffer is allocated and no other components are
-    // attempting to write to it.
-
-    // Temporarily share ownership of the ring buffer so it won't be deallocated while writing
-    std::shared_ptr<RingBuffer> temp_ring_buffer = this->audio_ring_buffer_;
-    bytes_written = temp_ring_buffer->write_without_replacement((void *) data, length, ticks_to_wait);
+  if (this->state_ == speaker::STATE_RUNNING) {
+    std::shared_ptr<RingBuffer> temp_ring_buffer = this->audio_ring_buffer_.lock();
+    if (temp_ring_buffer.use_count() == 2) {
+      // Only the speaker task and this temp_ring_buffer own the ring buffer, so it's safe to write to
+      bytes_written = temp_ring_buffer->write_without_replacement((void *) data, length, ticks_to_wait);
+    }
  }

  return bytes_written;
 }

 bool I2SAudioSpeaker::has_buffered_data() const {
-  if (this->audio_ring_buffer_ != nullptr) {
-    return this->audio_ring_buffer_->available() > 0;
+  if (this->audio_ring_buffer_.use_count() > 0) {
+    std::shared_ptr<RingBuffer> temp_ring_buffer = this->audio_ring_buffer_.lock();
+    return temp_ring_buffer->available() > 0;
  }
  return false;
 }

 void I2SAudioSpeaker::speaker_task(void *params) {
  I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) params;
-  this_speaker->task_created_ = true;

-  uint32_t event_group_bits =
-      xEventGroupWaitBits(this_speaker->event_group_,
-                          SpeakerEventGroupBits::COMMAND_START | SpeakerEventGroupBits::COMMAND_STOP |
-                              SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY,  // Bit message to read
-                          pdTRUE,                                              // Clear the bits on exit
-                          pdFALSE,                                             // Don't wait for all the bits,
-                          portMAX_DELAY);                                      // Block indefinitely until a bit is set
-
-  if (event_group_bits & (SpeakerEventGroupBits::COMMAND_STOP | SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY)) {
-    // Received a stop signal before the task was requested to start
-    this_speaker->delete_task_(0);
-  }
-
-  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_STARTING);
-
-  audio::AudioStreamInfo audio_stream_info = this_speaker->audio_stream_info_;
+  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::TASK_STARTING);

  const uint32_t dma_buffers_duration_ms = DMA_BUFFER_DURATION_MS * DMA_BUFFERS_COUNT;
  // Ensure ring buffer duration is at least the duration of all DMA buffers
  const uint32_t ring_buffer_duration = std::max(dma_buffers_duration_ms, this_speaker->buffer_duration_ms_);

  // The DMA buffers may have more bits per sample, so calculate buffer sizes based in the input audio stream info
-  const size_t data_buffer_size = audio_stream_info.ms_to_bytes(dma_buffers_duration_ms);
-  const size_t ring_buffer_size = audio_stream_info.ms_to_bytes(ring_buffer_duration);
+  const size_t ring_buffer_size = this_speaker->current_stream_info_.ms_to_bytes(ring_buffer_duration);

-  const size_t single_dma_buffer_input_size = data_buffer_size / DMA_BUFFERS_COUNT;
+  const uint32_t frames_to_fill_single_dma_buffer =
+      this_speaker->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
+  const size_t bytes_to_fill_single_dma_buffer =
+      this_speaker->current_stream_info_.frames_to_bytes(frames_to_fill_single_dma_buffer);

-  if (this_speaker->send_esp_err_to_event_group_(this_speaker->allocate_buffers_(data_buffer_size, ring_buffer_size))) {
-    // Failed to allocate buffers
-    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
-    this_speaker->delete_task_(data_buffer_size);
+  bool successful_setup = false;
+  std::unique_ptr<audio::AudioSourceTransferBuffer> transfer_buffer =
+      audio::AudioSourceTransferBuffer::create(bytes_to_fill_single_dma_buffer);
+
+  if (transfer_buffer != nullptr) {
+    std::shared_ptr<RingBuffer> temp_ring_buffer = RingBuffer::create(ring_buffer_size);
+    if (temp_ring_buffer.use_count() == 1) {
+      transfer_buffer->set_source(temp_ring_buffer);
+      this_speaker->audio_ring_buffer_ = temp_ring_buffer;
+      successful_setup = true;
+    }
  }

-  if (!this_speaker->send_esp_err_to_event_group_(this_speaker->start_i2s_driver_(audio_stream_info))) {
-    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_RUNNING);
-
+  if (!successful_setup) {
+    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
+  } else {
    bool stop_gracefully = false;
+    bool tx_dma_underflow = true;
+
+    uint32_t frames_written = 0;
    uint32_t last_data_received_time = millis();
-    bool tx_dma_underflow = false;

-    this_speaker->accumulated_frames_written_ = 0;
+    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::TASK_RUNNING);

-    // Keep looping if paused, there is no timeout configured, or data was received more recently than the configured
-    // timeout
    while (this_speaker->pause_state_ || !this_speaker->timeout_.has_value() ||
           (millis() - last_data_received_time) <= this_speaker->timeout_.value()) {
-      event_group_bits = xEventGroupGetBits(this_speaker->event_group_);
+      uint32_t event_group_bits = xEventGroupGetBits(this_speaker->event_group_);

      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
        xEventGroupClearBits(this_speaker->event_group_, SpeakerEventGroupBits::COMMAND_STOP);
@@ -314,7 +298,7 @@ void I2SAudioSpeaker::speaker_task(void *params) {
        stop_gracefully = true;
      }

-      if (this_speaker->audio_stream_info_ != audio_stream_info) {
+      if (this_speaker->audio_stream_info_ != this_speaker->current_stream_info_) {
        // Audio stream info changed, stop the speaker task so it will restart with the proper settings.
        break;
      }
@@ -326,36 +310,75 @@ void I2SAudioSpeaker::speaker_task(void *params) {
        }
      }
 #else
-      bool overflow;
-      while (xQueueReceive(this_speaker->i2s_event_queue_, &overflow, 0)) {
-        if (overflow) {
+      int64_t write_timestamp;
+      while (xQueueReceive(this_speaker->i2s_event_queue_, &write_timestamp, 0)) {
+        // Receives timing events from the I2S on_sent callback. If actual audio data was sent in this event, it passes
+        // on the timing info via the audio_output_callback.
+        uint32_t frames_sent = frames_to_fill_single_dma_buffer;
+        if (frames_to_fill_single_dma_buffer > frames_written) {
          tx_dma_underflow = true;
+          frames_sent = frames_written;
+          const uint32_t frames_zeroed = frames_to_fill_single_dma_buffer - frames_written;
+          write_timestamp -= this_speaker->current_stream_info_.frames_to_microseconds(frames_zeroed);
+        } else {
+          tx_dma_underflow = false;
+        }
+        frames_written -= frames_sent;
+        if (frames_sent > 0) {
+          this_speaker->audio_output_callback_(frames_sent, write_timestamp);
        }
      }
 #endif

      if (this_speaker->pause_state_) {
        // Pause state is accessed atomically, so thread safe
-        // Delay so the task can yields, then skip transferring audio data
-        delay(TASK_DELAY_MS);
+        // Delay so the task yields, then skip transferring audio data
+        vTaskDelay(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS));
        continue;
      }

-      size_t bytes_read = this_speaker->audio_ring_buffer_->read((void *) this_speaker->data_buffer_, data_buffer_size,
-                                                                 pdMS_TO_TICKS(TASK_DELAY_MS));
+      // Wait half the duration of the data already written to the DMA buffers for new audio data
+      // The millisecond helper modifies the frames_written variable, so use the microsecond helper and divide by 1000
+      const uint32_t read_delay =
+          (this_speaker->current_stream_info_.frames_to_microseconds(frames_written) / 1000) / 2;
+
+      uint8_t *new_data = transfer_buffer->get_buffer_end();  // track start of any newly copied bytes
+      size_t bytes_read = transfer_buffer->transfer_data_from_source(pdMS_TO_TICKS(read_delay));

      if (bytes_read > 0) {
-        if ((audio_stream_info.get_bits_per_sample() == 16) && (this_speaker->q15_volume_factor_ < INT16_MAX)) {
-          // Scale samples by the volume factor in place
-          q15_multiplication((int16_t *) this_speaker->data_buffer_, (int16_t *) this_speaker->data_buffer_,
-                             bytes_read / sizeof(int16_t), this_speaker->q15_volume_factor_);
+        if (this_speaker->q15_volume_factor_ < INT16_MAX) {
+          // Apply the software volume adjustment by unpacking the sample into a Q31 fixed-point number, shifting it,
+          // multiplying by the volume factor, and packing the sample back into the original bytes per sample.
+
+          const size_t bytes_per_sample = this_speaker->current_stream_info_.samples_to_bytes(1);
+          const uint32_t len = bytes_read / bytes_per_sample;
+
+          // Use Q16 for samples with 1 or 2 bytes: shifted_sample * gain_factor is Q16 * Q15 -> Q31
+          int32_t shift = 15;                                      // Q31 -> Q16
+          int32_t gain_factor = this_speaker->q15_volume_factor_;  // Q15
+
+          if (bytes_per_sample >= 3) {
+            // Use Q23 for samples with 3 or 4 bytes: shifted_sample * gain_factor is Q23 * Q8 -> Q31
+
+            shift = 8;          // Q31 -> Q23
+            gain_factor >>= 7;  // Q15 -> Q8
+          }
+
+          for (uint32_t i = 0; i < len; ++i) {
+            int32_t sample =
+                audio::unpack_audio_sample_to_q31(&new_data[i * bytes_per_sample], bytes_per_sample);  // Q31
+            sample >>= shift;
+            sample *= gain_factor;  // Q31
+            audio::pack_q31_as_audio_sample(sample, &new_data[i * bytes_per_sample], bytes_per_sample);
+          }
        }

 #ifdef USE_ESP32_VARIANT_ESP32
        // For ESP32 8/16 bit mono mode samples need to be switched.
-        if (audio_stream_info.get_channels() == 1 && audio_stream_info.get_bits_per_sample() <= 16) {
+        if (this_speaker->current_stream_info_.get_channels() == 1 &&
+            this_speaker->current_stream_info_.get_bits_per_sample() <= 16) {
          size_t len = bytes_read / sizeof(int16_t);
-          int16_t *tmp_buf = (int16_t *) this_speaker->data_buffer_;
+          int16_t *tmp_buf = (int16_t *) new_data;
          for (int i = 0; i < len; i += 2) {
            int16_t tmp = tmp_buf[i];
            tmp_buf[i] = tmp_buf[i + 1];
@@ -363,62 +386,87 @@ void I2SAudioSpeaker::speaker_task(void *params) {
          }
        }
 #endif
-        // Write the audio data to a single DMA buffer at a time to reduce latency for the audio duration played
-        // callback.
-        const uint32_t batches = (bytes_read + single_dma_buffer_input_size - 1) / single_dma_buffer_input_size;
+      }

-        for (uint32_t i = 0; i < batches; ++i) {
-          size_t bytes_written = 0;
-          size_t bytes_to_write = std::min(single_dma_buffer_input_size, bytes_read);
-
-#ifdef USE_I2S_LEGACY
-          if (audio_stream_info.get_bits_per_sample() == (uint8_t) this_speaker->bits_per_sample_) {
-            i2s_write(this_speaker->parent_->get_port(), this_speaker->data_buffer_ + i * single_dma_buffer_input_size,
-                      bytes_to_write, &bytes_written, pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * 5));
-          } else if (audio_stream_info.get_bits_per_sample() < (uint8_t) this_speaker->bits_per_sample_) {
-            i2s_write_expand(this_speaker->parent_->get_port(),
-                             this_speaker->data_buffer_ + i * single_dma_buffer_input_size, bytes_to_write,
-                             audio_stream_info.get_bits_per_sample(), this_speaker->bits_per_sample_, &bytes_written,
-                             pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * 5));
-          }
-#else
-          i2s_channel_write(this_speaker->tx_handle_, this_speaker->data_buffer_ + i * single_dma_buffer_input_size,
-                            bytes_to_write, &bytes_written, pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * 5));
-#endif
-
-          int64_t now = esp_timer_get_time();
-
-          if (bytes_written != bytes_to_write) {
-            xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE);
-          }
-          bytes_read -= bytes_written;
-
-          this_speaker->audio_output_callback_(audio_stream_info.bytes_to_frames(bytes_written),
-                                               now + dma_buffers_duration_ms * 1000);
-
-          tx_dma_underflow = false;
-          last_data_received_time = millis();
-        }
-      } else {
-        // No data received
+      if (transfer_buffer->available() == 0) {
        if (stop_gracefully && tx_dma_underflow) {
          break;
        }
+        vTaskDelay(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS / 2));
+      } else {
+        size_t bytes_written = 0;
+#ifdef USE_I2S_LEGACY
+        if (this_speaker->current_stream_info_.get_bits_per_sample() == (uint8_t) this_speaker->bits_per_sample_) {
+          i2s_write(this_speaker->parent_->get_port(), transfer_buffer->get_buffer_start(),
+                    transfer_buffer->available(), &bytes_written, pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS));
+        } else if (this_speaker->current_stream_info_.get_bits_per_sample() <
+                   (uint8_t) this_speaker->bits_per_sample_) {
+          i2s_write_expand(this_speaker->parent_->get_port(), transfer_buffer->get_buffer_start(),
+                           transfer_buffer->available(), this_speaker->current_stream_info_.get_bits_per_sample(),
+                           this_speaker->bits_per_sample_, &bytes_written, pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS));
+        }
+#else
+        if (tx_dma_underflow) {
+          // Temporarily disable channel and callback to reset the I2S driver's internal DMA buffer queue so timing
+          // callbacks are accurate. Preload the data.
+          i2s_channel_disable(this_speaker->tx_handle_);
+          const i2s_event_callbacks_t callbacks = {
+              .on_sent = nullptr,
+          };
+
+          i2s_channel_register_event_callback(this_speaker->tx_handle_, &callbacks, this_speaker);
+          i2s_channel_preload_data(this_speaker->tx_handle_, transfer_buffer->get_buffer_start(),
+                                   transfer_buffer->available(), &bytes_written);
+        } else {
+          // Audio is already playing, use regular I2S write to add to the DMA buffers
+          i2s_channel_write(this_speaker->tx_handle_, transfer_buffer->get_buffer_start(), transfer_buffer->available(),
+                            &bytes_written, DMA_BUFFER_DURATION_MS);
+        }
+#endif
+        if (bytes_written > 0) {
+          last_data_received_time = millis();
+          frames_written += this_speaker->current_stream_info_.bytes_to_frames(bytes_written);
+          transfer_buffer->decrease_buffer_length(bytes_written);
+          if (tx_dma_underflow) {
+            tx_dma_underflow = false;
+#ifndef USE_I2S_LEGACY
+            // Reset the event queue timestamps
+            // Enable the on_sent callback to accurately track the timestamps of played audio
+            // Enable the I2S channel to start sending the preloaded audio
+
+            xQueueReset(this_speaker->i2s_event_queue_);
+
+            const i2s_event_callbacks_t callbacks = {
+                .on_sent = i2s_on_sent_cb,
+            };
+            i2s_channel_register_event_callback(this_speaker->tx_handle_, &callbacks, this_speaker);
+
+            i2s_channel_enable(this_speaker->tx_handle_);
+#endif
+          }
+#ifdef USE_I2S_LEGACY
+          // The legacy driver doesn't easily support the callback approach for timestamps, so fall back to a direct but
+          // less accurate approach.
+          this_speaker->audio_output_callback_(this_speaker->current_stream_info_.bytes_to_frames(bytes_written),
+                                               esp_timer_get_time() + dma_buffers_duration_ms * 1000);
+#endif
+        }
      }
    }
-
-    xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::STATE_STOPPING);
-#ifdef USE_I2S_LEGACY
-    i2s_driver_uninstall(this_speaker->parent_->get_port());
-#else
-    i2s_channel_disable(this_speaker->tx_handle_);
-    i2s_del_channel(this_speaker->tx_handle_);
-#endif
-
-    this_speaker->parent_->unlock();
  }

-  this_speaker->delete_task_(data_buffer_size);
+  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::TASK_STOPPING);
+
+  if (transfer_buffer != nullptr) {
+    transfer_buffer.reset();
+  }
+
+  xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::TASK_STOPPED);
+
+  while (true) {
+    // Continuously delay until the loop method deletes the task
+    vTaskDelay(pdMS_TO_TICKS(10));
+  }
 }

 void I2SAudioSpeaker::start() {
@@ -427,16 +475,7 @@ void I2SAudioSpeaker::start() {
  if ((this->state_ == speaker::STATE_STARTING) || (this->state_ == speaker::STATE_RUNNING))
    return;

-  if (!this->task_created_ && (this->speaker_task_handle_ == nullptr)) {
-    xTaskCreate(I2SAudioSpeaker::speaker_task, "speaker_task", TASK_STACK_SIZE, (void *) this, TASK_PRIORITY,
-                &this->speaker_task_handle_);
-
-    if (this->speaker_task_handle_ != nullptr) {
-      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);
-    } else {
-      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_TASK_FAILED_TO_START);
-    }
-  }
+  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::COMMAND_START);
 }

 void I2SAudioSpeaker::stop() { this->stop_(false); }
@@ -456,61 +495,16 @@ void I2SAudioSpeaker::stop_(bool wait_on_empty) {
  }
 }

-bool I2SAudioSpeaker::send_esp_err_to_event_group_(esp_err_t err) {
-  switch (err) {
-    case ESP_OK:
-      return false;
-    case ESP_ERR_INVALID_STATE:
-      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_STATE);
-      return true;
-    case ESP_ERR_INVALID_ARG:
-      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_ARG);
-      return true;
-    case ESP_ERR_INVALID_SIZE:
-      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE);
-      return true;
-    case ESP_ERR_NO_MEM:
-      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
-      return true;
-    case ESP_ERR_NOT_SUPPORTED:
-      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NOT_SUPPORTED);
-      return true;
-    default:
-      xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_FAIL);
-      return true;
-  }
-}
-
-esp_err_t I2SAudioSpeaker::allocate_buffers_(size_t data_buffer_size, size_t ring_buffer_size) {
-  if (this->data_buffer_ == nullptr) {
-    // Allocate data buffer for temporarily storing audio from the ring buffer before writing to the I2S bus
-    RAMAllocator<uint8_t> allocator;
-    this->data_buffer_ = allocator.allocate(data_buffer_size);
-  }
-
-  if (this->data_buffer_ == nullptr) {
-    return ESP_ERR_NO_MEM;
-  }
-
-  if (this->audio_ring_buffer_.use_count() == 0) {
-    // Allocate ring buffer. Uses a shared_ptr to ensure it isn't improperly deallocated.
-    this->audio_ring_buffer_ = RingBuffer::create(ring_buffer_size);
-  }
-
-  if (this->audio_ring_buffer_ == nullptr) {
-    return ESP_ERR_NO_MEM;
-  }
-
-  return ESP_OK;
-}
-
 esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_stream_info) {
+  this->current_stream_info_ = audio_stream_info;  // store the stream info settings the driver will use
+
 #ifdef USE_I2S_LEGACY
  if ((this->i2s_mode_ & I2S_MODE_SLAVE) && (this->sample_rate_ != audio_stream_info.get_sample_rate())) {  // NOLINT
 #else
  if ((this->i2s_role_ & I2S_ROLE_SLAVE) && (this->sample_rate_ != audio_stream_info.get_sample_rate())) {  // NOLINT
 #endif
    // Can't reconfigure I2S bus, so the sample rate must match the configured value
+    ESP_LOGE(TAG, "Audio stream settings are not compatible with this I2S configuration");
    return ESP_ERR_NOT_SUPPORTED;
  }

@@ -521,10 +515,12 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
      (i2s_slot_bit_width_t) audio_stream_info.get_bits_per_sample() > this->slot_bit_width_) {
 #endif
    // Currently can't handle the case when the incoming audio has more bits per sample than the configured value
+    ESP_LOGE(TAG, "Audio streams with more bits per sample than the I2S speaker's configuration is not supported");
    return ESP_ERR_NOT_SUPPORTED;
  }

  if (!this->parent_->try_lock()) {
+    ESP_LOGE(TAG, "Parent I2S bus not free");
    return ESP_ERR_INVALID_STATE;
  }

@@ -575,6 +571,7 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
  esp_err_t err =
      i2s_driver_install(this->parent_->get_port(), &config, I2S_EVENT_QUEUE_COUNT, &this->i2s_event_queue_);
  if (err != ESP_OK) {
+    ESP_LOGE(TAG, "Failed to install I2S legacy driver");
    // Failed to install the driver, so unlock the I2S port
    this->parent_->unlock();
    return err;
@ -595,6 +592,7 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea

  if (err != ESP_OK) {
    // Failed to set the data out pin, so uninstall the driver and unlock the I2S port
+    ESP_LOGE(TAG, "Failed to set the data out pin");
    i2s_driver_uninstall(this->parent_->get_port());
    this->parent_->unlock();
  }
@ -605,10 +603,12 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
      .dma_desc_num = DMA_BUFFERS_COUNT,
      .dma_frame_num = dma_buffer_length,
      .auto_clear = true,
+      .intr_priority = 3,
  };
  /* Allocate a new TX channel and get the handle of this channel */
  esp_err_t err = i2s_new_channel(&chan_cfg, &this->tx_handle_, NULL);
  if (err != ESP_OK) {
+    ESP_LOGE(TAG, "Failed to allocate new I2S channel");
    this->parent_->unlock();
    return err;
  }
@ -652,7 +652,11 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
  // per sample causes the audio to play too fast. Setting the ws_width to the configured slot bit width seems to
  // make it play at the correct speed while sending more bits per slot.
  if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO) {
-    std_slot_cfg.ws_width = static_cast<uint32_t>(this->slot_bit_width_);
+    uint32_t configured_bit_width = static_cast<uint32_t>(this->slot_bit_width_);
+    std_slot_cfg.ws_width = configured_bit_width;
+    if (configured_bit_width > 16) {
+      std_slot_cfg.msb_right = false;
+    }
  }
 #else
  std_slot_cfg.slot_bit_width = this->slot_bit_width_;
@ -670,54 +674,56 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver_(audio::AudioStreamInfo &audio_strea
  err = i2s_channel_init_std_mode(this->tx_handle_, &std_cfg);

  if (err != ESP_OK) {
+    ESP_LOGE(TAG, "Failed to initialize channel");
    i2s_del_channel(this->tx_handle_);
+    this->tx_handle_ = nullptr;
    this->parent_->unlock();
    return err;
  }
  if (this->i2s_event_queue_ == nullptr) {
-    this->i2s_event_queue_ = xQueueCreate(1, sizeof(bool));
+    this->i2s_event_queue_ = xQueueCreate(I2S_EVENT_QUEUE_COUNT, sizeof(int64_t));
  }
-  const i2s_event_callbacks_t callbacks = {
-      .on_send_q_ovf = i2s_overflow_cb,
-  };

-  i2s_channel_register_event_callback(this->tx_handle_, &callbacks, this);
-
-  /* Before reading data, start the TX channel first */
  i2s_channel_enable(this->tx_handle_);
-  if (err != ESP_OK) {
-    i2s_del_channel(this->tx_handle_);
-    this->parent_->unlock();
-  }
 #endif

  return err;
 }

-void I2SAudioSpeaker::delete_task_(size_t buffer_size) {
-  this->audio_ring_buffer_.reset();  // Releases ownership of the shared_ptr
+#ifndef USE_I2S_LEGACY
+bool IRAM_ATTR I2SAudioSpeaker::i2s_on_sent_cb(i2s_chan_handle_t handle, i2s_event_data_t *event, void *user_ctx) {
+  int64_t now = esp_timer_get_time();

-  if (this->data_buffer_ != nullptr) {
-    RAMAllocator<uint8_t> allocator;
-    allocator.deallocate(this->data_buffer_, buffer_size);
-    this->data_buffer_ = nullptr;
+  BaseType_t need_yield1 = pdFALSE;
+  BaseType_t need_yield2 = pdFALSE;
+  BaseType_t need_yield3 = pdFALSE;
+
+  I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) user_ctx;
+
+  if (xQueueIsQueueFullFromISR(this_speaker->i2s_event_queue_)) {
+    // Queue is full, so discard the oldest event and set the warning flag to inform the user
+    int64_t dummy;
+    xQueueReceiveFromISR(this_speaker->i2s_event_queue_, &dummy, &need_yield1);
+    xEventGroupSetBitsFromISR(this_speaker->event_group_, SpeakerEventGroupBits::WARN_DROPPED_EVENT, &need_yield2);
  }

-  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::STATE_STOPPED);
+  xQueueSendToBackFromISR(this_speaker->i2s_event_queue_, &now, &need_yield3);

-  this->task_created_ = false;
-  vTaskDelete(nullptr);
-}
-
-#ifndef USE_I2S_LEGACY
-bool IRAM_ATTR I2SAudioSpeaker::i2s_overflow_cb(i2s_chan_handle_t handle, i2s_event_data_t *event, void *user_ctx) {
-  I2SAudioSpeaker *this_speaker = (I2SAudioSpeaker *) user_ctx;
-  bool overflow = true;
-  xQueueOverwrite(this_speaker->i2s_event_queue_, &overflow);
-  return false;
+  return need_yield1 | need_yield2 | need_yield3;
 }
 #endif

+void I2SAudioSpeaker::stop_i2s_driver_() {
+#ifdef USE_I2S_LEGACY
+  i2s_driver_uninstall(this->parent_->get_port());
+#else
+  i2s_channel_disable(this->tx_handle_);
+  i2s_del_channel(this->tx_handle_);
+  this->tx_handle_ = nullptr;
+#endif
+  this->parent_->unlock();
+}
+
 }  // namespace i2s_audio
 }  // namespace esphome

--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
@ -72,70 +72,57 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp

 protected:
  /// @brief Function for the FreeRTOS task handling audio output.
-  /// After receiving the COMMAND_START signal, allocates space for the buffers, starts the I2S driver, and reads
-  /// audio from the ring buffer and writes audio to the I2S port. Stops immmiately after receiving the COMMAND_STOP
-  /// signal and stops only after the ring buffer is empty after receiving the COMMAND_STOP_GRACEFULLY signal. Stops if
-  /// the ring buffer hasn't read data for more than timeout_ milliseconds. When stopping, it deallocates the buffers,
-  /// stops the I2S driver, unlocks the I2S port, and deletes the task. It communicates the state and any errors via
-  /// event_group_.
+  /// Allocates space for the buffers, reads audio from the ring buffer and writes audio to the I2S port. Stops
+  /// immediately after receiving the COMMAND_STOP signal and stops only after the ring buffer is empty after receiving
+  /// the COMMAND_STOP_GRACEFULLY signal. Stops if the ring buffer hasn't read data for more than timeout_ milliseconds.
+  /// When stopping, it deallocates the buffers. It communicates its state and any errors via ``event_group_``.
  /// @param params I2SAudioSpeaker component
  static void speaker_task(void *params);

-  /// @brief Sends a stop command to the speaker task via event_group_.
+  /// @brief Sends a stop command to the speaker task via ``event_group_``.
  /// @param wait_on_empty If false, sends the COMMAND_STOP signal. If true, sends the COMMAND_STOP_GRACEFULLY signal.
  void stop_(bool wait_on_empty);

-  /// @brief Sets the corresponding ERR_ESP event group bits.
-  /// @param err esp_err_t error code.
-  /// @return True if an ERR_ESP bit is set and false if err == ESP_OK
-  bool send_esp_err_to_event_group_(esp_err_t err);
-
 #ifndef USE_I2S_LEGACY
-  static bool i2s_overflow_cb(i2s_chan_handle_t handle, i2s_event_data_t *event, void *user_ctx);
+  /// @brief Callback function used to send playback timestamps to the speaker task.
+  /// @param handle (i2s_chan_handle_t)
+  /// @param event (i2s_event_data_t)
+  /// @param user_ctx (void*) User context pointer that the callback accesses
+  /// @return True if a higher priority task was woken by the callback and a context switch should be requested
+  static bool i2s_on_sent_cb(i2s_chan_handle_t handle, i2s_event_data_t *event, void *user_ctx);
 #endif

-  /// @brief Allocates the data buffer and ring buffer
-  /// @param data_buffer_size Number of bytes to allocate for the data buffer.
-  /// @param ring_buffer_size Number of bytes to allocate for the ring buffer.
-  /// @return ESP_ERR_NO_MEM if either buffer fails to allocate
-  ///         ESP_OK if successful
-  esp_err_t allocate_buffers_(size_t data_buffer_size, size_t ring_buffer_size);
-
  /// @brief Starts the ESP32 I2S driver.
  /// Attempts to lock the I2S port, starts the I2S driver using the passed in stream information, and sets the data out
-  /// pin. If it fails, it will unlock the I2S port and uninstall the driver, if necessary.
+  /// pin. If it fails, it will unlock the I2S port and uninstall the driver, if necessary.
  /// @param audio_stream_info Stream information for the I2S driver.
  /// @return ESP_ERR_NOT_ALLOWED if the I2S port can't play the incoming audio stream.
  ///         ESP_ERR_INVALID_STATE if the I2S port is already locked.
-  ///         ESP_ERR_INVALID_ARG if nstalling the driver or setting the data outpin fails due to a parameter error.
+  ///         ESP_ERR_INVALID_ARG if installing the driver or setting the data out pin fails due to a parameter error.
  ///         ESP_ERR_NO_MEM if the driver fails to install due to a memory allocation error.
-  ///         ESP_FAIL if setting the data out pin fails due to an IO error ESP_OK if successful
+  ///         ESP_FAIL if setting the data out pin fails due to an IO error
+  ///         ESP_OK if successful
  esp_err_t start_i2s_driver_(audio::AudioStreamInfo &audio_stream_info);

-  /// @brief Deletes the speaker's task.
-  /// Deallocates the data_buffer_ and audio_ring_buffer_, if necessary, and deletes the task. Should only be called by
-  /// the speaker_task itself.
-  /// @param buffer_size The allocated size of the data_buffer_.
-  void delete_task_(size_t buffer_size);
+  /// @brief Stops the I2S driver and unlocks the I2S port
+  void stop_i2s_driver_();

  TaskHandle_t speaker_task_handle_{nullptr};
  EventGroupHandle_t event_group_{nullptr};

  QueueHandle_t i2s_event_queue_;

-  uint8_t *data_buffer_;
-  std::shared_ptr<RingBuffer> audio_ring_buffer_;
+  std::weak_ptr<RingBuffer> audio_ring_buffer_;

  uint32_t buffer_duration_ms_;

  optional<uint32_t> timeout_;

-  bool task_created_{false};
  bool pause_state_{false};

  int16_t q15_volume_factor_{INT16_MAX};

-  size_t bytes_written_{0};
+  audio::AudioStreamInfo current_stream_info_;  // The currently loaded driver's stream info

 #ifdef USE_I2S_LEGACY
 #if SOC_I2S_SUPPORTS_DAC
@ -148,8 +135,6 @@ class I2SAudioSpeaker : public I2SAudioOut, public speaker::Speaker, public Comp
  std::string i2s_comm_fmt_;
  i2s_chan_handle_t tx_handle_;
 #endif
-
-  uint32_t accumulated_frames_written_{0};
 };

 }  // namespace i2s_audio