From 6b55df36c7e21997d69c4688ee1627e545596087 Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt
Date: Mon, 3 Feb 2025 20:58:35 -0600
Subject: [PATCH] [audio] Media Player Components PR6 (#8168)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
---
 esphome/components/audio/audio_resampler.cpp | 186 +++++++++++++++++++
 esphome/components/audio/audio_resampler.h   | 105 +++++++++++
 2 files changed, 291 insertions(+)
 create mode 100644 esphome/components/audio/audio_resampler.cpp
 create mode 100644 esphome/components/audio/audio_resampler.h

diff --git a/esphome/components/audio/audio_resampler.cpp b/esphome/components/audio/audio_resampler.cpp
new file mode 100644
index 0000000000..05e9ff6ca1
--- /dev/null
+++ b/esphome/components/audio/audio_resampler.cpp
@@ -0,0 +1,186 @@
+#include "audio_resampler.h"
+
+#ifdef USE_ESP32
+
+#include "esphome/core/hal.h"
+
+#include <algorithm>
+#include <cstring>
+
+namespace esphome {
+namespace audio {
+
+static const uint32_t READ_WRITE_TIMEOUT_MS = 20;
+
+AudioResampler::AudioResampler(size_t input_buffer_size, size_t output_buffer_size)
+    : input_buffer_size_(input_buffer_size), output_buffer_size_(output_buffer_size) {
+  // create() can fail to allocate; every method that touches the buffers checks them for nullptr first
+  this->input_transfer_buffer_ = AudioSourceTransferBuffer::create(input_buffer_size);
+  this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(output_buffer_size);
+}
+
+esp_err_t AudioResampler::add_source(std::weak_ptr<RingBuffer> &input_ring_buffer) {
+  if (this->input_transfer_buffer_ != nullptr) {
+    this->input_transfer_buffer_->set_source(input_ring_buffer);
+    return ESP_OK;
+  }
+  return ESP_ERR_NO_MEM;
+}
+
+esp_err_t AudioResampler::add_sink(std::weak_ptr<RingBuffer> &output_ring_buffer) {
+  if (this->output_transfer_buffer_ != nullptr) {
+    this->output_transfer_buffer_->set_sink(output_ring_buffer);
+    return ESP_OK;
+  }
+  return ESP_ERR_NO_MEM;
+}
+
+#ifdef USE_SPEAKER
+esp_err_t AudioResampler::add_sink(speaker::Speaker *speaker) {
+  if (this->output_transfer_buffer_ != nullptr) {
+    this->output_transfer_buffer_->set_sink(speaker);
+    return ESP_OK;
+  }
+  return ESP_ERR_NO_MEM;
+}
+#endif
+
+esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info,
+                                uint16_t number_of_taps, uint16_t number_of_filters) {
+  this->input_stream_info_ = input_stream_info;
+  this->output_stream_info_ = output_stream_info;
+
+  // Drop any resampler left over from an earlier start() so it is never paired with the new stream info
+  this->resampler_.reset();
+
+  if ((this->input_transfer_buffer_ == nullptr) || (this->output_transfer_buffer_ == nullptr)) {
+    return ESP_ERR_NO_MEM;
+  }
+
+  if ((input_stream_info.get_bits_per_sample() > 32) || (output_stream_info.get_bits_per_sample() > 32) ||
+      (input_stream_info.get_channels() != output_stream_info.get_channels())) {
+    return ESP_ERR_NOT_SUPPORTED;
+  }
+
+  if ((input_stream_info.get_sample_rate() != output_stream_info.get_sample_rate()) ||
+      (input_stream_info.get_bits_per_sample() != output_stream_info.get_bits_per_sample())) {
+    this->resampler_ = make_unique<esp_audio_libs::resampler::Resampler>(
+        input_stream_info.bytes_to_samples(this->input_buffer_size_),
+        output_stream_info.bytes_to_samples(this->output_buffer_size_));
+
+    // Use cascaded biquad filters when downsampling to avoid aliasing
+    bool use_pre_filter = output_stream_info.get_sample_rate() < input_stream_info.get_sample_rate();
+
+    esp_audio_libs::resampler::ResamplerConfiguration resample_config = {
+        .source_sample_rate = static_cast<float>(input_stream_info.get_sample_rate()),
+        .target_sample_rate = static_cast<float>(output_stream_info.get_sample_rate()),
+        .source_bits_per_sample = input_stream_info.get_bits_per_sample(),
+        .target_bits_per_sample = output_stream_info.get_bits_per_sample(),
+        .channels = input_stream_info.get_channels(),
+        .use_pre_or_post_filter = use_pre_filter,
+        .subsample_interpolate = false,  // Doubles the CPU load. Using more filters is a better alternative
+        .number_of_taps = number_of_taps,
+        .number_of_filters = number_of_filters,
+    };
+
+    if (!this->resampler_->initialize(resample_config)) {
+      // Failed to allocate the resampler's internal buffers. Release it so resample() reports FAILED instead of
+      // running a half-initialized resampler.
+      this->resampler_.reset();
+      return ESP_ERR_NO_MEM;
+    }
+  }
+
+  return ESP_OK;
+}
+
+AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_differential) {
+  if (ms_differential != nullptr) {
+    // Report a well-defined value on every return path, including the ones that process no audio
+    *ms_differential = 0;
+  }
+
+  if ((this->input_transfer_buffer_ == nullptr) || (this->output_transfer_buffer_ == nullptr)) {
+    // The transfer buffers were never allocated, so there is nothing to read from or write to
+    return AudioResamplerState::FAILED;
+  }
+
+  const bool conversion_required =
+      (this->input_stream_info_.get_sample_rate() != this->output_stream_info_.get_sample_rate()) ||
+      (this->input_stream_info_.get_bits_per_sample() != this->output_stream_info_.get_bits_per_sample());
+
+  if (conversion_required && (this->resampler_ == nullptr)) {
+    // start() was never called or it failed, so there is no resampler to convert the stream with
+    return AudioResamplerState::FAILED;
+  }
+
+  if (stop_gracefully) {
+    if (!this->input_transfer_buffer_->has_buffered_data() && (this->output_transfer_buffer_->available() == 0)) {
+      return AudioResamplerState::FINISHED;
+    }
+  }
+
+  if (!this->pause_output_) {
+    // Move audio data to the sink
+    this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+  } else {
+    // If paused, block to avoid wasting CPU resources
+    delay(READ_WRITE_TIMEOUT_MS);
+  }
+
+  this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+
+  if (this->input_transfer_buffer_->available() == 0) {
+    // No samples available to process
+    return AudioResamplerState::RESAMPLING;
+  }
+
+  const size_t bytes_free = this->output_transfer_buffer_->free();
+  const uint32_t frames_free = this->output_stream_info_.bytes_to_frames(bytes_free);
+
+  const size_t bytes_available = this->input_transfer_buffer_->available();
+  const uint32_t frames_available = this->input_stream_info_.bytes_to_frames(bytes_available);
+
+  if (conversion_required) {
+    // NOTE(review): the trailing -3 looks like a gain passed to the esp-audio-libs resampler; confirm its meaning
+    esp_audio_libs::resampler::ResamplerResults results =
+        this->resampler_->resample(this->input_transfer_buffer_->get_buffer_start(),
+                                   this->output_transfer_buffer_->get_buffer_end(), frames_available, frames_free, -3);
+
+    this->input_transfer_buffer_->decrease_buffer_length(this->input_stream_info_.frames_to_bytes(results.frames_used));
+    this->output_transfer_buffer_->increase_buffer_length(
+        this->output_stream_info_.frames_to_bytes(results.frames_generated));
+
+    // Resampling causes slight differences in the durations used versus generated. Computes the difference in
+    // milliseconds. The callback function passing the played audio duration uses the difference to convert from output
+    // duration to input duration.
+    this->accumulated_frames_used_ += results.frames_used;
+    this->accumulated_frames_generated_ += results.frames_generated;
+
+    const int32_t used_ms =
+        this->input_stream_info_.frames_to_milliseconds_with_remainder(&this->accumulated_frames_used_);
+    const int32_t generated_ms =
+        this->output_stream_info_.frames_to_milliseconds_with_remainder(&this->accumulated_frames_generated_);
+
+    if (ms_differential != nullptr) {
+      *ms_differential = used_ms - generated_ms;
+    }
+  } else {
+    // No resampling required, copy samples directly to the output transfer buffer
+    const size_t bytes_to_transfer = std::min(this->output_stream_info_.frames_to_bytes(frames_free),
+                                              this->input_stream_info_.frames_to_bytes(frames_available));
+
+    std::memcpy(this->output_transfer_buffer_->get_buffer_end(), this->input_transfer_buffer_->get_buffer_start(),
+                bytes_to_transfer);
+
+    this->input_transfer_buffer_->decrease_buffer_length(bytes_to_transfer);
+    this->output_transfer_buffer_->increase_buffer_length(bytes_to_transfer);
+  }
+
+  return AudioResamplerState::RESAMPLING;
+}
+
+}  // namespace audio
+}  // namespace esphome
+
+#endif
diff --git a/esphome/components/audio/audio_resampler.h b/esphome/components/audio/audio_resampler.h
new file mode 100644
index 0000000000..a348aaf783
--- /dev/null
+++ b/esphome/components/audio/audio_resampler.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#ifdef USE_ESP32
+
+#include "audio.h"
+#include "audio_transfer_buffer.h"
+
+#ifdef USE_SPEAKER
+#include "esphome/components/speaker/speaker.h"
+#endif
+
+#include "esphome/core/ring_buffer.h"
+
+#include "esp_err.h"
+
+#include <resampler.h>  // esp-audio-libs
+
+#include <memory>
+
+namespace esphome {
+namespace audio {
+
+enum class AudioResamplerState : uint8_t {
+  RESAMPLING,  // More data is available to resample
+  FINISHED,    // All file data has been resampled and transferred
+  FAILED,      // resample() can't run: the transfer buffers or the required resampler were never allocated
+};
+
+class AudioResampler {
+  /*
+   * @brief Class that facilitates resampling audio.
+   * The audio data is read from a ring buffer source, resampled, and sent to an audio sink (ring buffer or speaker
+   * component). Also supports converting bits per sample.
+   */
+ public:
+  /// @brief Allocates the input and output transfer buffers
+  /// @param input_buffer_size Size of the input transfer buffer in bytes.
+  /// @param output_buffer_size Size of the output transfer buffer in bytes.
+  AudioResampler(size_t input_buffer_size, size_t output_buffer_size);
+
+  /// @brief Adds a source ring buffer for audio data. Takes ownership of the ring buffer in a shared_ptr.
+  /// @param input_ring_buffer weak_ptr of a shared_ptr of the source ring buffer to transfer ownership
+  /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
+  esp_err_t add_source(std::weak_ptr<RingBuffer> &input_ring_buffer);
+
+  /// @brief Adds a sink ring buffer for resampled audio. Takes ownership of the ring buffer in a shared_ptr.
+  /// @param output_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership
+  /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
+  esp_err_t add_sink(std::weak_ptr<RingBuffer> &output_ring_buffer);
+
+#ifdef USE_SPEAKER
+  /// @brief Adds a sink speaker for resampled audio.
+  /// @param speaker pointer to speaker component
+  /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
+  esp_err_t add_sink(speaker::Speaker *speaker);
+#endif
+
+  /// @brief Sets up the class to resample.
+  /// @param input_stream_info The incoming sample rate, bits per sample, and number of channels
+  /// @param output_stream_info The desired outgoing sample rate, bits per sample, and number of channels
+  /// @param number_of_taps Number of taps per FIR filter
+  /// @param number_of_filters Number of FIR filters
+  /// @return ESP_OK if it is able to convert the incoming stream,
+  ///         ESP_ERR_NO_MEM if the transfer buffers or the resampler's internal buffers failed to allocate,
+  ///         ESP_ERR_NOT_SUPPORTED if the stream can't be converted.
+  esp_err_t start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info, uint16_t number_of_taps,
+                  uint16_t number_of_filters);
+
+  /// @brief Resamples audio from the ring buffer source and writes to the sink.
+  /// @param stop_gracefully If true, it indicates the file decoder is finished. The resampler will resample all the
+  ///                        remaining audio and then finish.
+  /// @param ms_differential Pointer to a (int32_t) variable that will store the difference, in milliseconds, between
+  ///                        the duration of input audio used and the duration of output audio generated. It is set to
+  ///                        0 when the call converts no audio. May be nullptr if the caller doesn't need the value.
+  /// @return FINISHED if stop_gracefully is set, the input has no buffered data, and the output transfer buffer is
+  ///         empty. FAILED if the transfer buffers or the resampler required by the stream info were never allocated.
+  ///         RESAMPLING otherwise.
+  AudioResamplerState resample(bool stop_gracefully, int32_t *ms_differential);
+
+  /// @brief Pauses sending resampled audio to the sink. If paused, it will continue to process internal buffers.
+  /// @param pause_state If true, audio data is not sent to the sink.
+  void set_pause_output_state(bool pause_state) { this->pause_output_ = pause_state; }
+
+ protected:
+  std::unique_ptr<AudioSourceTransferBuffer> input_transfer_buffer_;
+  std::unique_ptr<AudioSinkTransferBuffer> output_transfer_buffer_;
+
+  size_t input_buffer_size_;
+  size_t output_buffer_size_;
+
+  uint32_t accumulated_frames_used_{0};
+  uint32_t accumulated_frames_generated_{0};
+
+  bool pause_output_{false};
+
+  AudioStreamInfo input_stream_info_;
+  AudioStreamInfo output_stream_info_;
+
+  std::unique_ptr<esp_audio_libs::resampler::Resampler> resampler_;
+};
+
+}  // namespace audio
+}  // namespace esphome
+
+#endif