From e9887625768f9c0146a857a55d5895c34cbf057a Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Tue, 6 May 2025 23:42:59 -0500 Subject: [PATCH] [i2s_audio, mixer, resampler, speaker] Simplify duration played callback (#8703) --- .../i2s_audio/speaker/i2s_audio_speaker.cpp | 18 +++------- .../mixer/speaker/mixer_speaker.cpp | 34 +++++++------------ .../components/mixer/speaker/mixer_speaker.h | 4 +-- .../resampler/speaker/resampler_speaker.cpp | 22 ++++++------ .../resampler/speaker/resampler_speaker.h | 2 +- .../media_player/speaker_media_player.cpp | 18 ---------- .../media_player/speaker_media_player.h | 11 ------ esphome/components/speaker/speaker.h | 11 +++--- 8 files changed, 35 insertions(+), 85 deletions(-) diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp index 7d247003f7..b287177016 100644 --- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp +++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp @@ -14,6 +14,8 @@ #include "esphome/core/hal.h" #include "esphome/core/log.h" +#include "esp_timer.h" + namespace esphome { namespace i2s_audio { @@ -366,25 +368,15 @@ void I2SAudioSpeaker::speaker_task(void *params) { bytes_to_write, &bytes_written, pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * 5)); #endif - uint32_t write_timestamp = micros(); + int64_t now = esp_timer_get_time(); if (bytes_written != bytes_to_write) { xEventGroupSetBits(this_speaker->event_group_, SpeakerEventGroupBits::ERR_ESP_INVALID_SIZE); } - bytes_read -= bytes_written; - this_speaker->accumulated_frames_written_ += audio_stream_info.bytes_to_frames(bytes_written); - const uint32_t new_playback_ms = - audio_stream_info.frames_to_milliseconds_with_remainder(&this_speaker->accumulated_frames_written_); - const uint32_t remainder_us = - audio_stream_info.frames_to_microseconds(this_speaker->accumulated_frames_written_); - - uint32_t pending_frames = - audio_stream_info.bytes_to_frames(bytes_read 
+ this_speaker->audio_ring_buffer_->available()); - const uint32_t pending_ms = audio_stream_info.frames_to_milliseconds_with_remainder(&pending_frames); - - this_speaker->audio_output_callback_(new_playback_ms, remainder_us, pending_ms, write_timestamp); + this_speaker->audio_output_callback_(audio_stream_info.bytes_to_frames(bytes_written), + now + dma_buffers_duration_ms * 1000); tx_dma_underflow = false; last_data_received_time = millis(); diff --git a/esphome/components/mixer/speaker/mixer_speaker.cpp b/esphome/components/mixer/speaker/mixer_speaker.cpp index 121a62392c..8e480dd49b 100644 --- a/esphome/components/mixer/speaker/mixer_speaker.cpp +++ b/esphome/components/mixer/speaker/mixer_speaker.cpp @@ -53,14 +53,15 @@ void SourceSpeaker::dump_config() { } void SourceSpeaker::setup() { - this->parent_->get_output_speaker()->add_audio_output_callback( - [this](uint32_t new_playback_ms, uint32_t remainder_us, uint32_t pending_ms, uint32_t write_timestamp) { - uint32_t personal_playback_ms = std::min(new_playback_ms, this->pending_playback_ms_); - if (personal_playback_ms > 0) { - this->pending_playback_ms_ -= personal_playback_ms; - this->audio_output_callback_(personal_playback_ms, remainder_us, this->pending_playback_ms_, write_timestamp); - } - }); + this->parent_->get_output_speaker()->add_audio_output_callback([this](uint32_t new_frames, int64_t write_timestamp) { + // The SourceSpeaker may not have included any audio in the mixed output, so verify there were pending frames + uint32_t speakers_playback_frames = std::min(new_frames, this->pending_playback_frames_); + this->pending_playback_frames_ -= speakers_playback_frames; + + if (speakers_playback_frames > 0) { + this->audio_output_callback_(speakers_playback_frames, write_timestamp); + } + }); } void SourceSpeaker::loop() { @@ -153,6 +154,7 @@ esp_err_t SourceSpeaker::start_() { } } + this->pending_playback_frames_ = 0; // reset return this->parent_->start(this->audio_stream_info_); } @@ -542,11 +544,7 
@@ void MixerSpeaker::audio_mixer_task(void *params) { // Update source speaker buffer length transfer_buffers_with_data[0]->decrease_buffer_length(active_stream_info.frames_to_bytes(frames_to_mix)); - speakers_with_data[0]->accumulated_frames_read_ += frames_to_mix; - - // Add new audio duration to the source speaker pending playback - speakers_with_data[0]->pending_playback_ms_ += - active_stream_info.frames_to_milliseconds_with_remainder(&speakers_with_data[0]->accumulated_frames_read_); + speakers_with_data[0]->pending_playback_frames_ += frames_to_mix; // Update output transfer buffer length output_transfer_buffer->increase_buffer_length( @@ -586,10 +584,6 @@ void MixerSpeaker::audio_mixer_task(void *params) { reinterpret_cast(output_transfer_buffer->get_buffer_end()), this_mixer->audio_stream_info_.value(), frames_to_mix); - speakers_with_data[i]->pending_playback_ms_ += - speakers_with_data[i]->get_audio_stream_info().frames_to_milliseconds_with_remainder( - &speakers_with_data[i]->accumulated_frames_read_); - if (i != transfer_buffers_with_data.size() - 1) { // Need to mix more streams together, point primary buffer and stream info to the already mixed output primary_buffer = reinterpret_cast(output_transfer_buffer->get_buffer_end()); @@ -601,11 +595,7 @@ void MixerSpeaker::audio_mixer_task(void *params) { for (int i = 0; i < transfer_buffers_with_data.size(); ++i) { transfer_buffers_with_data[i]->decrease_buffer_length( speakers_with_data[i]->get_audio_stream_info().frames_to_bytes(frames_to_mix)); - speakers_with_data[i]->accumulated_frames_read_ += frames_to_mix; - - speakers_with_data[i]->pending_playback_ms_ += - speakers_with_data[i]->get_audio_stream_info().frames_to_milliseconds_with_remainder( - &speakers_with_data[i]->accumulated_frames_read_); + speakers_with_data[i]->pending_playback_frames_ += frames_to_mix; } // Update output transfer buffer length diff --git a/esphome/components/mixer/speaker/mixer_speaker.h 
b/esphome/components/mixer/speaker/mixer_speaker.h index 0bd6b5f4c8..48bacc4471 100644 --- a/esphome/components/mixer/speaker/mixer_speaker.h +++ b/esphome/components/mixer/speaker/mixer_speaker.h @@ -114,9 +114,7 @@ class SourceSpeaker : public speaker::Speaker, public Component { uint32_t ducking_transition_samples_remaining_{0}; uint32_t samples_per_ducking_step_{0}; - uint32_t accumulated_frames_read_{0}; - - uint32_t pending_playback_ms_{0}; + uint32_t pending_playback_frames_{0}; }; class MixerSpeaker : public Component { diff --git a/esphome/components/resampler/speaker/resampler_speaker.cpp b/esphome/components/resampler/speaker/resampler_speaker.cpp index 9bb46ad78c..5e5615cbb9 100644 --- a/esphome/components/resampler/speaker/resampler_speaker.cpp +++ b/esphome/components/resampler/speaker/resampler_speaker.cpp @@ -43,13 +43,18 @@ void ResamplerSpeaker::setup() { return; } - this->output_speaker_->add_audio_output_callback( - [this](uint32_t new_playback_ms, uint32_t remainder_us, uint32_t pending_ms, uint32_t write_timestamp) { - int32_t adjustment = this->playback_differential_ms_; - this->playback_differential_ms_ -= adjustment; - int32_t adjusted_playback_ms = static_cast(new_playback_ms) + adjustment; - this->audio_output_callback_(adjusted_playback_ms, remainder_us, pending_ms, write_timestamp); - }); + this->output_speaker_->add_audio_output_callback([this](uint32_t new_frames, int64_t write_timestamp) { + if (this->audio_stream_info_.get_sample_rate() != this->target_stream_info_.get_sample_rate()) { + // Convert the number of frames from the target sample rate to the source sample rate. Track the remainder to + // avoid losing frames from integer division truncation. 
+ const uint64_t numerator = new_frames * this->audio_stream_info_.get_sample_rate() + this->callback_remainder_; + const uint64_t denominator = this->target_stream_info_.get_sample_rate(); + this->callback_remainder_ = numerator % denominator; + this->audio_output_callback_(numerator / denominator, write_timestamp); + } else { + this->audio_output_callback_(new_frames, write_timestamp); + } + }); } void ResamplerSpeaker::loop() { @@ -283,7 +288,6 @@ void ResamplerSpeaker::resample_task(void *params) { xEventGroupSetBits(this_resampler->event_group_, ResamplingEventGroupBits::ERR_ESP_NOT_SUPPORTED); } - this_resampler->playback_differential_ms_ = 0; while (err == ESP_OK) { uint32_t event_bits = xEventGroupGetBits(this_resampler->event_group_); @@ -295,8 +299,6 @@ void ResamplerSpeaker::resample_task(void *params) { int32_t ms_differential = 0; audio::AudioResamplerState resampler_state = resampler->resample(false, &ms_differential); - this_resampler->playback_differential_ms_ += ms_differential; - if (resampler_state == audio::AudioResamplerState::FINISHED) { break; } else if (resampler_state == audio::AudioResamplerState::FAILED) { diff --git a/esphome/components/resampler/speaker/resampler_speaker.h b/esphome/components/resampler/speaker/resampler_speaker.h index d5e3f2b6d6..51790069d2 100644 --- a/esphome/components/resampler/speaker/resampler_speaker.h +++ b/esphome/components/resampler/speaker/resampler_speaker.h @@ -100,7 +100,7 @@ class ResamplerSpeaker : public Component, public speaker::Speaker { uint32_t buffer_duration_ms_; - int32_t playback_differential_ms_{0}; + uint64_t callback_remainder_{0}; }; } // namespace resampler diff --git a/esphome/components/speaker/media_player/speaker_media_player.cpp b/esphome/components/speaker/media_player/speaker_media_player.cpp index e143920010..fed0207c93 100644 --- a/esphome/components/speaker/media_player/speaker_media_player.cpp +++ b/esphome/components/speaker/media_player/speaker_media_player.cpp @@ -106,16 
+106,6 @@ void SpeakerMediaPlayer::setup() { ESP_LOGE(TAG, "Failed to create media pipeline"); this->mark_failed(); } - - // Setup callback to track the duration of audio played by the media pipeline - this->media_speaker_->add_audio_output_callback( - [this](uint32_t new_playback_ms, uint32_t remainder_us, uint32_t pending_ms, uint32_t write_timestamp) { - this->playback_ms_ += new_playback_ms; - this->remainder_us_ = remainder_us; - this->pending_ms_ = pending_ms; - this->last_audio_write_timestamp_ = write_timestamp; - this->playback_us_ = this->playback_ms_ * 1000 + this->remainder_us_; - }); } ESP_LOGI(TAG, "Set up speaker media player"); @@ -321,7 +311,6 @@ void SpeakerMediaPlayer::loop() { AudioPipelineState old_media_pipeline_state = this->media_pipeline_state_; if (this->media_pipeline_ != nullptr) { this->media_pipeline_state_ = this->media_pipeline_->process_state(); - this->decoded_playback_ms_ = this->media_pipeline_->get_playback_ms(); } if (this->media_pipeline_state_ == AudioPipelineState::ERROR_READING) { @@ -379,13 +368,6 @@ void SpeakerMediaPlayer::loop() { } else if (this->media_pipeline_state_ == AudioPipelineState::PLAYING) { this->state = media_player::MEDIA_PLAYER_STATE_PLAYING; } else if (this->media_pipeline_state_ == AudioPipelineState::STOPPED) { - // Reset playback durations - this->decoded_playback_ms_ = 0; - this->playback_us_ = 0; - this->playback_ms_ = 0; - this->remainder_us_ = 0; - this->pending_ms_ = 0; - if (!media_playlist_.empty()) { uint32_t timeout_ms = 0; if (old_media_pipeline_state == AudioPipelineState::PLAYING) { diff --git a/esphome/components/speaker/media_player/speaker_media_player.h b/esphome/components/speaker/media_player/speaker_media_player.h index 81eb72a830..67e9859a13 100644 --- a/esphome/components/speaker/media_player/speaker_media_player.h +++ b/esphome/components/speaker/media_player/speaker_media_player.h @@ -73,10 +73,6 @@ class SpeakerMediaPlayer : public Component, public media_player::MediaPlayer { 
void play_file(audio::AudioFile *media_file, bool announcement, bool enqueue); - uint32_t get_playback_ms() const { return this->playback_ms_; } - uint32_t get_playback_us() const { return this->playback_us_; } - uint32_t get_decoded_playback_ms() const { return this->decoded_playback_ms_; } - void set_playlist_delay_ms(AudioPipelineType pipeline_type, uint32_t delay_ms); protected: @@ -141,13 +137,6 @@ class SpeakerMediaPlayer : public Component, public media_player::MediaPlayer { Trigger<> *mute_trigger_ = new Trigger<>(); Trigger<> *unmute_trigger_ = new Trigger<>(); Trigger *volume_trigger_ = new Trigger(); - - uint32_t decoded_playback_ms_{0}; - uint32_t playback_us_{0}; - uint32_t playback_ms_{0}; - uint32_t remainder_us_{0}; - uint32_t pending_ms_{0}; - uint32_t last_audio_write_timestamp_{0}; }; } // namespace speaker diff --git a/esphome/components/speaker/speaker.h b/esphome/components/speaker/speaker.h index c4cf912fa6..373d2e3a74 100644 --- a/esphome/components/speaker/speaker.h +++ b/esphome/components/speaker/speaker.h @@ -104,12 +104,9 @@ class Speaker { /// Callback function for sending the duration of the audio written to the speaker since the last callback. /// Parameters: - /// - Duration in milliseconds. Never rounded and should always be less than or equal to the actual duration. - /// - Remainder duration in microseconds. Rounded duration after subtracting the previous parameter from the actual - /// duration. - /// - Duration of remaining, unwritten audio buffered in the speaker in milliseconds. - /// - System time in microseconds when the last write was completed. 
- void add_audio_output_callback(std::function<void(uint32_t, uint32_t, uint32_t, uint32_t)> &&callback) { + /// - Frames played + /// - System time in microseconds when the frames were written to the DAC + void add_audio_output_callback(std::function<void(uint32_t, int64_t)> &&callback) { this->audio_output_callback_.add(std::move(callback)); } @@ -123,7 +120,7 @@ class Speaker { audio_dac::AudioDac *audio_dac_{nullptr}; #endif - CallbackManager<void(uint32_t, uint32_t, uint32_t, uint32_t)> audio_output_callback_{}; + CallbackManager<void(uint32_t, int64_t)> audio_output_callback_{}; }; } // namespace speaker