From c8bbc2e84c2eb53c58ec43f0e69a2a4501f8afab Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Mon, 3 Feb 2025 16:34:20 -0600 Subject: [PATCH 1/3] [audio] Media Player Components PR4 (#8166) Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com> --- esphome/components/audio/audio_reader.cpp | 308 ++++++++++++++++++++++ esphome/components/audio/audio_reader.h | 85 ++++++ 2 files changed, 393 insertions(+) create mode 100644 esphome/components/audio/audio_reader.cpp create mode 100644 esphome/components/audio/audio_reader.h diff --git a/esphome/components/audio/audio_reader.cpp b/esphome/components/audio/audio_reader.cpp new file mode 100644 index 0000000000..b93e4e74ea --- /dev/null +++ b/esphome/components/audio/audio_reader.cpp @@ -0,0 +1,308 @@ +#include "audio_reader.h" + +#ifdef USE_ESP_IDF + +#include "esphome/core/defines.h" +#include "esphome/core/hal.h" +#include "esphome/core/helpers.h" + +#if CONFIG_MBEDTLS_CERTIFICATE_BUNDLE +#include "esp_crt_bundle.h" +#endif + +namespace esphome { +namespace audio { + +static const uint32_t READ_WRITE_TIMEOUT_MS = 20; + +// The number of times the http read times out with no data before throwing an error +static const uint32_t ERROR_COUNT_NO_DATA_READ_TIMEOUT = 100; + +static const size_t HTTP_STREAM_BUFFER_SIZE = 2048; + +static const uint8_t MAX_REDIRECTION = 5; + +// Some common HTTP status codes - borrowed from http_request component accessed 20241224 +enum HttpStatus { + HTTP_STATUS_OK = 200, + HTTP_STATUS_NO_CONTENT = 204, + HTTP_STATUS_PARTIAL_CONTENT = 206, + + /* 3xx - Redirection */ + HTTP_STATUS_MULTIPLE_CHOICES = 300, + HTTP_STATUS_MOVED_PERMANENTLY = 301, + HTTP_STATUS_FOUND = 302, + HTTP_STATUS_SEE_OTHER = 303, + HTTP_STATUS_NOT_MODIFIED = 304, + HTTP_STATUS_TEMPORARY_REDIRECT = 307, + HTTP_STATUS_PERMANENT_REDIRECT = 308, + + /* 4XX - CLIENT ERROR */ + HTTP_STATUS_BAD_REQUEST = 400, + HTTP_STATUS_UNAUTHORIZED = 401, + HTTP_STATUS_FORBIDDEN = 403, + HTTP_STATUS_NOT_FOUND = 404, + 
HTTP_STATUS_METHOD_NOT_ALLOWED = 405, + HTTP_STATUS_NOT_ACCEPTABLE = 406, + HTTP_STATUS_LENGTH_REQUIRED = 411, + + /* 5xx - Server Error */ + HTTP_STATUS_INTERNAL_ERROR = 500 +}; + +AudioReader::~AudioReader() { this->cleanup_connection_(); } + +esp_err_t AudioReader::add_sink(const std::weak_ptr &output_ring_buffer) { + if (current_audio_file_ != nullptr) { + // A transfer buffer isn't ncessary for a local file + this->file_ring_buffer_ = output_ring_buffer.lock(); + return ESP_OK; + } + + if (this->output_transfer_buffer_ != nullptr) { + this->output_transfer_buffer_->set_sink(output_ring_buffer); + return ESP_OK; + } + + return ESP_ERR_INVALID_STATE; +} + +esp_err_t AudioReader::start(AudioFile *audio_file, AudioFileType &file_type) { + file_type = AudioFileType::NONE; + + this->current_audio_file_ = audio_file; + + this->file_current_ = audio_file->data; + file_type = audio_file->file_type; + + return ESP_OK; +} + +esp_err_t AudioReader::start(const std::string &uri, AudioFileType &file_type) { + file_type = AudioFileType::NONE; + + this->cleanup_connection_(); + + if (uri.empty()) { + return ESP_ERR_INVALID_ARG; + } + + esp_http_client_config_t client_config = {}; + + client_config.url = uri.c_str(); + client_config.cert_pem = nullptr; + client_config.disable_auto_redirect = false; + client_config.max_redirection_count = 10; + client_config.event_handler = http_event_handler; + client_config.user_data = this; + client_config.buffer_size = HTTP_STREAM_BUFFER_SIZE; + client_config.keep_alive_enable = true; + client_config.timeout_ms = 5000; // Shouldn't trigger watchdog resets if caller runs in a task + +#if CONFIG_MBEDTLS_CERTIFICATE_BUNDLE + if (uri.find("https:") != std::string::npos) { + client_config.crt_bundle_attach = esp_crt_bundle_attach; + } +#endif + + this->client_ = esp_http_client_init(&client_config); + + if (this->client_ == nullptr) { + return ESP_FAIL; + } + + esp_err_t err = esp_http_client_open(this->client_, 0); + + if (err != ESP_OK) { + 
this->cleanup_connection_(); + return err; + } + + int64_t header_length = esp_http_client_fetch_headers(this->client_); + if (header_length < 0) { + this->cleanup_connection_(); + return ESP_FAIL; + } + + int status_code = esp_http_client_get_status_code(this->client_); + + if ((status_code < HTTP_STATUS_OK) || (status_code > HTTP_STATUS_PERMANENT_REDIRECT)) { + this->cleanup_connection_(); + return ESP_FAIL; + } + + ssize_t redirect_count = 0; + + while ((esp_http_client_set_redirection(this->client_) == ESP_OK) && (redirect_count < MAX_REDIRECTION)) { + err = esp_http_client_open(this->client_, 0); + if (err != ESP_OK) { + this->cleanup_connection_(); + return ESP_FAIL; + } + + header_length = esp_http_client_fetch_headers(this->client_); + if (header_length < 0) { + this->cleanup_connection_(); + return ESP_FAIL; + } + + status_code = esp_http_client_get_status_code(this->client_); + + if ((status_code < HTTP_STATUS_OK) || (status_code > HTTP_STATUS_PERMANENT_REDIRECT)) { + this->cleanup_connection_(); + return ESP_FAIL; + } + + ++redirect_count; + } + + if (this->audio_file_type_ == AudioFileType::NONE) { + // Failed to determine the file type from the header, fallback to using the url + char url[500]; + err = esp_http_client_get_url(this->client_, url, 500); + if (err != ESP_OK) { + this->cleanup_connection_(); + return err; + } + + std::string url_string = str_lower_case(url); + + if (str_endswith(url_string, ".wav")) { + file_type = AudioFileType::WAV; + } +#ifdef USE_AUDIO_MP3_SUPPORT + else if (str_endswith(url_string, ".mp3")) { + file_type = AudioFileType::MP3; + } +#endif +#ifdef USE_AUDIO_FLAC_SUPPORT + else if (str_endswith(url_string, ".flac")) { + file_type = AudioFileType::FLAC; + } +#endif + else { + file_type = AudioFileType::NONE; + this->cleanup_connection_(); + return ESP_ERR_NOT_SUPPORTED; + } + } else { + file_type = this->audio_file_type_; + } + + this->no_data_read_count_ = 0; + + this->output_transfer_buffer_ = 
AudioSinkTransferBuffer::create(this->buffer_size_); + if (this->output_transfer_buffer_ == nullptr) { + return ESP_ERR_NO_MEM; + } + + return ESP_OK; +} + +AudioReaderState AudioReader::read() { + if (this->client_ != nullptr) { + return this->http_read_(); + } else if (this->current_audio_file_ != nullptr) { + return this->file_read_(); + } + + return AudioReaderState::FAILED; +} + +AudioFileType AudioReader::get_audio_type(const char *content_type) { +#ifdef USE_AUDIO_MP3_SUPPORT + if (strcasecmp(content_type, "mp3") == 0 || strcasecmp(content_type, "audio/mp3") == 0 || + strcasecmp(content_type, "audio/mpeg") == 0) { + return AudioFileType::MP3; + } +#endif + if (strcasecmp(content_type, "audio/wav") == 0) { + return AudioFileType::WAV; + } +#ifdef USE_AUDIO_FLAC_SUPPORT + if (strcasecmp(content_type, "audio/flac") == 0 || strcasecmp(content_type, "audio/x-flac") == 0) { + return AudioFileType::FLAC; + } +#endif + return AudioFileType::NONE; +} + +esp_err_t AudioReader::http_event_handler(esp_http_client_event_t *evt) { + // Based on https://github.com/maroc81/WeatherLily/tree/main/main/net accessed 20241224 + AudioReader *this_reader = (AudioReader *) evt->user_data; + + switch (evt->event_id) { + case HTTP_EVENT_ON_HEADER: + if (strcasecmp(evt->header_key, "Content-Type") == 0) { + this_reader->audio_file_type_ = get_audio_type(evt->header_value); + } + break; + default: + break; + } + return ESP_OK; +} + +AudioReaderState AudioReader::file_read_() { + size_t remaining_bytes = this->current_audio_file_->length - (this->file_current_ - this->current_audio_file_->data); + if (remaining_bytes > 0) { + size_t bytes_written = this->file_ring_buffer_->write_without_replacement(this->file_current_, remaining_bytes, + pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + this->file_current_ += bytes_written; + + return AudioReaderState::READING; + } + + return AudioReaderState::FINISHED; +} + +AudioReaderState AudioReader::http_read_() { + 
this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + + if (esp_http_client_is_complete_data_received(this->client_)) { + if (this->output_transfer_buffer_->available() == 0) { + this->cleanup_connection_(); + return AudioReaderState::FINISHED; + } + } else { + size_t bytes_to_read = this->output_transfer_buffer_->free(); + int received_len = + esp_http_client_read(this->client_, (char *) this->output_transfer_buffer_->get_buffer_end(), bytes_to_read); + + if (received_len > 0) { + this->output_transfer_buffer_->increase_buffer_length(received_len); + + this->no_data_read_count_ = 0; + } else if (received_len < 0) { + // HTTP read error + this->cleanup_connection_(); + return AudioReaderState::FAILED; + } else { + if (bytes_to_read > 0) { + // Read timed out + ++this->no_data_read_count_; + if (this->no_data_read_count_ >= ERROR_COUNT_NO_DATA_READ_TIMEOUT) { + // Timed out with no data read too many times, so the http read has failed + this->cleanup_connection_(); + return AudioReaderState::FAILED; + } + delay(READ_WRITE_TIMEOUT_MS); + } + } + } + + return AudioReaderState::READING; +} + +void AudioReader::cleanup_connection_() { + if (this->client_ != nullptr) { + esp_http_client_close(this->client_); + esp_http_client_cleanup(this->client_); + this->client_ = nullptr; + } +} + +} // namespace audio +} // namespace esphome + +#endif diff --git a/esphome/components/audio/audio_reader.h b/esphome/components/audio/audio_reader.h new file mode 100644 index 0000000000..90113e6dda --- /dev/null +++ b/esphome/components/audio/audio_reader.h @@ -0,0 +1,85 @@ +#pragma once + +#ifdef USE_ESP_IDF + +#include "audio.h" +#include "audio_transfer_buffer.h" + +#include "esphome/core/ring_buffer.h" + +#include "esp_err.h" + +#include + +namespace esphome { +namespace audio { + +enum class AudioReaderState : uint8_t { + READING = 0, // More data is available to read + FINISHED, // All data has been read and transferred + FAILED, // 
Encountered an error +}; + +class AudioReader { + /* + * @brief Class that facilitates reading a raw audio file. + * Files can be read from flash (stored in a AudioFile struct) or from an http source. + * The file data is sent to a ring buffer sink. + */ + public: + /// @brief Constructs an AudioReader object. + /// The transfer buffer isn't allocated here, but only if necessary (an http source) in the start function. + /// @param buffer_size Transfer buffer size in bytes. + AudioReader(size_t buffer_size) : buffer_size_(buffer_size) {} + ~AudioReader(); + + /// @brief Adds a sink ring buffer for audio data. Takes ownership of the ring buffer in a shared_ptr + /// @param output_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership + /// @return ESP_OK if successful, ESP_ERR_INVALID_STATE otherwise + esp_err_t add_sink(const std::weak_ptr &output_ring_buffer); + + /// @brief Starts reading an audio file from an http source. The transfer buffer is allocated here. + /// @param uri Web url to the http file. + /// @param file_type AudioFileType variable passed-by-reference indicating the type of file being read. + /// @return ESP_OK if successful, an ESP_ERR* code otherwise. + esp_err_t start(const std::string &uri, AudioFileType &file_type); + + /// @brief Starts reading an audio file from flash. No transfer buffer is allocated. + /// @param audio_file AudioFile struct containing the file. + /// @param file_type AudioFileType variable passed-by-reference indicating the type of file being read. + /// @return ESP_OK + esp_err_t start(AudioFile *audio_file, AudioFileType &file_type); + + /// @brief Reads new file data from the source and sends to the ring buffer sink. 
+ /// @return AudioReaderState + AudioReaderState read(); + + protected: + /// @brief Monitors the http client events to attempt determining the file type from the Content-Type header + static esp_err_t http_event_handler(esp_http_client_event_t *evt); + + /// @brief Determines the audio file type from the value of the http Content-Type header + /// @param content_type string with the Content-Type header value + /// @return AudioFileType of the url, if it can be determined. If not, return AudioFileType::NONE. + static AudioFileType get_audio_type(const char *content_type); + + AudioReaderState file_read_(); + AudioReaderState http_read_(); + + std::shared_ptr file_ring_buffer_; + std::unique_ptr output_transfer_buffer_; + void cleanup_connection_(); + + size_t buffer_size_; + uint32_t no_data_read_count_{0}; + + esp_http_client_handle_t client_{nullptr}; + + AudioFile *current_audio_file_{nullptr}; + AudioFileType audio_file_type_{AudioFileType::NONE}; + const uint8_t *file_current_{nullptr}; +}; +} // namespace audio +} // namespace esphome + +#endif From b8f9eaecd85fc2b9e6b3086f030b6fe76d94c656 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Mon, 3 Feb 2025 17:47:50 -0600 Subject: [PATCH 2/3] [audio] Media Player Components PR5 (#8167) --- esphome/components/audio/audio_decoder.cpp | 362 +++++++++++++++++++++ esphome/components/audio/audio_decoder.h | 135 ++++++++ 2 files changed, 497 insertions(+) create mode 100644 esphome/components/audio/audio_decoder.cpp create mode 100644 esphome/components/audio/audio_decoder.h diff --git a/esphome/components/audio/audio_decoder.cpp b/esphome/components/audio/audio_decoder.cpp new file mode 100644 index 0000000000..b249f1381d --- /dev/null +++ b/esphome/components/audio/audio_decoder.cpp @@ -0,0 +1,362 @@ +#include "audio_decoder.h" + +#ifdef USE_ESP32 + +#include "esphome/core/hal.h" + +namespace esphome { +namespace audio { + +static const uint32_t DECODING_TIMEOUT_MS = 50; // The decode function will yield after this duration +static 
const uint32_t READ_WRITE_TIMEOUT_MS = 20; // Timeout for transferring audio data + +static const uint32_t MAX_POTENTIALLY_FAILED_COUNT = 10; + +AudioDecoder::AudioDecoder(size_t input_buffer_size, size_t output_buffer_size) { + this->input_transfer_buffer_ = AudioSourceTransferBuffer::create(input_buffer_size); + this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(output_buffer_size); +} + +AudioDecoder::~AudioDecoder() { +#ifdef USE_AUDIO_MP3_SUPPORT + if (this->audio_file_type_ == AudioFileType::MP3) { + esp_audio_libs::helix_decoder::MP3FreeDecoder(this->mp3_decoder_); + } +#endif +} + +esp_err_t AudioDecoder::add_source(std::weak_ptr &input_ring_buffer) { + if (this->input_transfer_buffer_ != nullptr) { + this->input_transfer_buffer_->set_source(input_ring_buffer); + return ESP_OK; + } + return ESP_ERR_NO_MEM; +} + +esp_err_t AudioDecoder::add_sink(std::weak_ptr &output_ring_buffer) { + if (this->output_transfer_buffer_ != nullptr) { + this->output_transfer_buffer_->set_sink(output_ring_buffer); + return ESP_OK; + } + return ESP_ERR_NO_MEM; +} + +#ifdef USE_SPEAKER +esp_err_t AudioDecoder::add_sink(speaker::Speaker *speaker) { + if (this->output_transfer_buffer_ != nullptr) { + this->output_transfer_buffer_->set_sink(speaker); + return ESP_OK; + } + return ESP_ERR_NO_MEM; +} +#endif + +esp_err_t AudioDecoder::start(AudioFileType audio_file_type) { + if ((this->input_transfer_buffer_ == nullptr) || (this->output_transfer_buffer_ == nullptr)) { + return ESP_ERR_NO_MEM; + } + + this->audio_file_type_ = audio_file_type; + + this->potentially_failed_count_ = 0; + this->end_of_file_ = false; + + switch (this->audio_file_type_) { +#ifdef USE_AUDIO_FLAC_SUPPORT + case AudioFileType::FLAC: + this->flac_decoder_ = make_unique(); + this->free_buffer_required_ = + this->output_transfer_buffer_->capacity(); // We'll revise this after reading the header + break; +#endif +#ifdef USE_AUDIO_MP3_SUPPORT + case AudioFileType::MP3: + this->mp3_decoder_ = 
esp_audio_libs::helix_decoder::MP3InitDecoder(); + this->free_buffer_required_ = 1152 * sizeof(int16_t) * 2; // samples * size per sample * channels + break; +#endif + case AudioFileType::WAV: + this->wav_decoder_ = make_unique(); + this->wav_decoder_->reset(); + this->free_buffer_required_ = 1024; + break; + case AudioFileType::NONE: + default: + return ESP_ERR_NOT_SUPPORTED; + break; + } + + return ESP_OK; +} + +AudioDecoderState AudioDecoder::decode(bool stop_gracefully) { + if (stop_gracefully) { + if (this->output_transfer_buffer_->available() == 0) { + if (this->end_of_file_) { + // The file decoder indicates it reached the end of file + return AudioDecoderState::FINISHED; + } + + if (!this->input_transfer_buffer_->has_buffered_data()) { + // If all the internal buffers are empty, the decoding is done + return AudioDecoderState::FINISHED; + } + } + } + + if (this->potentially_failed_count_ > MAX_POTENTIALLY_FAILED_COUNT) { + if (stop_gracefully) { + // No more new data is going to come in, so decoding is done + return AudioDecoderState::FINISHED; + } + return AudioDecoderState::FAILED; + } + + FileDecoderState state = FileDecoderState::MORE_TO_PROCESS; + + uint32_t decoding_start = millis(); + + while (state == FileDecoderState::MORE_TO_PROCESS) { + // Transfer decoded out + if (!this->pause_output_) { + size_t bytes_written = this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + if (this->audio_stream_info_.has_value()) { + this->accumulated_frames_written_ += this->audio_stream_info_.value().bytes_to_frames(bytes_written); + this->playback_ms_ += + this->audio_stream_info_.value().frames_to_milliseconds_with_remainder(&this->accumulated_frames_written_); + } + } else { + // If paused, block to avoid wasting CPU resources + delay(READ_WRITE_TIMEOUT_MS); + } + + // Verify there is enough space to store more decoded audio and that the function hasn't been running too long + if ((this->output_transfer_buffer_->free() < 
this->free_buffer_required_) || + (millis() - decoding_start > DECODING_TIMEOUT_MS)) { + return AudioDecoderState::DECODING; + } + + // Decode more audio + + size_t bytes_read = this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + + if ((this->potentially_failed_count_ > 0) && (bytes_read == 0)) { + // Failed to decode in last attempt and there is no new data + + if (this->input_transfer_buffer_->free() == 0) { + // The input buffer is full. Since it previously failed on the exact same data, we can never recover + state = FileDecoderState::FAILED; + } else { + // Attempt to get more data next time + state = FileDecoderState::IDLE; + } + } else if (this->input_transfer_buffer_->available() == 0) { + // No data to decode, attempt to get more data next time + state = FileDecoderState::IDLE; + } else { + switch (this->audio_file_type_) { +#ifdef USE_AUDIO_FLAC_SUPPORT + case AudioFileType::FLAC: + state = this->decode_flac_(); + break; +#endif +#ifdef USE_AUDIO_MP3_SUPPORT + case AudioFileType::MP3: + state = this->decode_mp3_(); + break; +#endif + case AudioFileType::WAV: + state = this->decode_wav_(); + break; + case AudioFileType::NONE: + default: + state = FileDecoderState::IDLE; + break; + } + } + + if (state == FileDecoderState::POTENTIALLY_FAILED) { + ++this->potentially_failed_count_; + } else if (state == FileDecoderState::END_OF_FILE) { + this->end_of_file_ = true; + } else if (state == FileDecoderState::FAILED) { + return AudioDecoderState::FAILED; + } else if (state == FileDecoderState::MORE_TO_PROCESS) { + this->potentially_failed_count_ = 0; + } + } + return AudioDecoderState::DECODING; +} + +#ifdef USE_AUDIO_FLAC_SUPPORT +FileDecoderState AudioDecoder::decode_flac_() { + if (!this->audio_stream_info_.has_value()) { + // Header hasn't been read + auto result = this->flac_decoder_->read_header(this->input_transfer_buffer_->get_buffer_start(), + this->input_transfer_buffer_->available()); + + if (result == 
esp_audio_libs::flac::FLAC_DECODER_HEADER_OUT_OF_DATA) { + return FileDecoderState::POTENTIALLY_FAILED; + } + + if (result != esp_audio_libs::flac::FLAC_DECODER_SUCCESS) { + // Couldn't read FLAC header + return FileDecoderState::FAILED; + } + + size_t bytes_consumed = this->flac_decoder_->get_bytes_index(); + this->input_transfer_buffer_->decrease_buffer_length(bytes_consumed); + + this->free_buffer_required_ = flac_decoder_->get_output_buffer_size_bytes(); + if (this->output_transfer_buffer_->capacity() < this->free_buffer_required_) { + // Output buffer is not big enough + if (!this->output_transfer_buffer_->reallocate(this->free_buffer_required_)) { + // Couldn't reallocate output buffer + return FileDecoderState::FAILED; + } + } + + this->audio_stream_info_ = + audio::AudioStreamInfo(this->flac_decoder_->get_sample_depth(), this->flac_decoder_->get_num_channels(), + this->flac_decoder_->get_sample_rate()); + + return FileDecoderState::MORE_TO_PROCESS; + } + + uint32_t output_samples = 0; + auto result = this->flac_decoder_->decode_frame( + this->input_transfer_buffer_->get_buffer_start(), this->input_transfer_buffer_->available(), + reinterpret_cast(this->output_transfer_buffer_->get_buffer_end()), &output_samples); + + if (result == esp_audio_libs::flac::FLAC_DECODER_ERROR_OUT_OF_DATA) { + // Not an issue, just needs more data that we'll get next time. 
+ return FileDecoderState::POTENTIALLY_FAILED; + } + + size_t bytes_consumed = this->flac_decoder_->get_bytes_index(); + this->input_transfer_buffer_->decrease_buffer_length(bytes_consumed); + + if (result > esp_audio_libs::flac::FLAC_DECODER_ERROR_OUT_OF_DATA) { + // Corrupted frame, don't retry with current buffer content, wait for new sync + return FileDecoderState::POTENTIALLY_FAILED; + } + + // We have successfully decoded some input data and have new output data + this->output_transfer_buffer_->increase_buffer_length( + this->audio_stream_info_.value().samples_to_bytes(output_samples)); + + if (result == esp_audio_libs::flac::FLAC_DECODER_NO_MORE_FRAMES) { + return FileDecoderState::END_OF_FILE; + } + + return FileDecoderState::MORE_TO_PROCESS; +} +#endif + +#ifdef USE_AUDIO_MP3_SUPPORT +FileDecoderState AudioDecoder::decode_mp3_() { + // Look for the next sync word + int buffer_length = (int) this->input_transfer_buffer_->available(); + int32_t offset = + esp_audio_libs::helix_decoder::MP3FindSyncWord(this->input_transfer_buffer_->get_buffer_start(), buffer_length); + + if (offset < 0) { + // New data may have the sync word + this->input_transfer_buffer_->decrease_buffer_length(buffer_length); + return FileDecoderState::POTENTIALLY_FAILED; + } + + // Advance read pointer to match the offset for the syncword + this->input_transfer_buffer_->decrease_buffer_length(offset); + uint8_t *buffer_start = this->input_transfer_buffer_->get_buffer_start(); + + buffer_length = (int) this->input_transfer_buffer_->available(); + int err = esp_audio_libs::helix_decoder::MP3Decode(this->mp3_decoder_, &buffer_start, &buffer_length, + (int16_t *) this->output_transfer_buffer_->get_buffer_end(), 0); + + size_t consumed = this->input_transfer_buffer_->available() - buffer_length; + this->input_transfer_buffer_->decrease_buffer_length(consumed); + + if (err) { + switch (err) { + case esp_audio_libs::helix_decoder::ERR_MP3_OUT_OF_MEMORY: + return FileDecoderState::FAILED; + break; 
+ case esp_audio_libs::helix_decoder::ERR_MP3_NULL_POINTER: + return FileDecoderState::FAILED; + break; + default: + // Most errors are recoverable by moving on to the next frame, so mark as potentially failed + return FileDecoderState::POTENTIALLY_FAILED; + break; + } + } else { + esp_audio_libs::helix_decoder::MP3FrameInfo mp3_frame_info; + esp_audio_libs::helix_decoder::MP3GetLastFrameInfo(this->mp3_decoder_, &mp3_frame_info); + if (mp3_frame_info.outputSamps > 0) { + int bytes_per_sample = (mp3_frame_info.bitsPerSample / 8); + this->output_transfer_buffer_->increase_buffer_length(mp3_frame_info.outputSamps * bytes_per_sample); + + if (!this->audio_stream_info_.has_value()) { + this->audio_stream_info_ = + audio::AudioStreamInfo(mp3_frame_info.bitsPerSample, mp3_frame_info.nChans, mp3_frame_info.samprate); + } + } + } + + return FileDecoderState::MORE_TO_PROCESS; +} +#endif + +FileDecoderState AudioDecoder::decode_wav_() { + if (!this->audio_stream_info_.has_value()) { + // Header hasn't been processed + + esp_audio_libs::wav_decoder::WAVDecoderResult result = this->wav_decoder_->decode_header( + this->input_transfer_buffer_->get_buffer_start(), this->input_transfer_buffer_->available()); + + if (result == esp_audio_libs::wav_decoder::WAV_DECODER_SUCCESS_IN_DATA) { + this->input_transfer_buffer_->decrease_buffer_length(this->wav_decoder_->bytes_processed()); + + this->audio_stream_info_ = audio::AudioStreamInfo( + this->wav_decoder_->bits_per_sample(), this->wav_decoder_->num_channels(), this->wav_decoder_->sample_rate()); + + this->wav_bytes_left_ = this->wav_decoder_->chunk_bytes_left(); + this->wav_has_known_end_ = (this->wav_bytes_left_ > 0); + return FileDecoderState::MORE_TO_PROCESS; + } else if (result == esp_audio_libs::wav_decoder::WAV_DECODER_WARNING_INCOMPLETE_DATA) { + // Available data didn't have the full header + return FileDecoderState::POTENTIALLY_FAILED; + } else { + return FileDecoderState::FAILED; + } + } else { + if 
(!this->wav_has_known_end_ || (this->wav_bytes_left_ > 0)) { + size_t bytes_to_copy = this->input_transfer_buffer_->available(); + + if (this->wav_has_known_end_) { + bytes_to_copy = std::min(bytes_to_copy, this->wav_bytes_left_); + } + + bytes_to_copy = std::min(bytes_to_copy, this->output_transfer_buffer_->free()); + + if (bytes_to_copy > 0) { + std::memcpy(this->output_transfer_buffer_->get_buffer_end(), this->input_transfer_buffer_->get_buffer_start(), + bytes_to_copy); + this->input_transfer_buffer_->decrease_buffer_length(bytes_to_copy); + this->output_transfer_buffer_->increase_buffer_length(bytes_to_copy); + if (this->wav_has_known_end_) { + this->wav_bytes_left_ -= bytes_to_copy; + } + } + return FileDecoderState::IDLE; + } + } + + return FileDecoderState::END_OF_FILE; +} + +} // namespace audio +} // namespace esphome + +#endif diff --git a/esphome/components/audio/audio_decoder.h b/esphome/components/audio/audio_decoder.h new file mode 100644 index 0000000000..2ca1d623fe --- /dev/null +++ b/esphome/components/audio/audio_decoder.h @@ -0,0 +1,135 @@ +#pragma once + +#ifdef USE_ESP32 + +#include "audio.h" +#include "audio_transfer_buffer.h" + +#include "esphome/core/defines.h" +#include "esphome/core/helpers.h" +#include "esphome/core/ring_buffer.h" + +#ifdef USE_SPEAKER +#include "esphome/components/speaker/speaker.h" +#endif + +#include "esp_err.h" + +// esp-audio-libs +#ifdef USE_AUDIO_FLAC_SUPPORT +#include +#endif +#ifdef USE_AUDIO_MP3_SUPPORT +#include +#endif +#include + +namespace esphome { +namespace audio { + +enum class AudioDecoderState : uint8_t { + DECODING = 0, // More data is available to decode + FINISHED, // All file data has been decoded and transferred + FAILED, // Encountered an error +}; + +// Only used within the AudioDecoder class; conveys the state of the particular file type decoder +enum class FileDecoderState : uint8_t { + MORE_TO_PROCESS, // Successfully read a file chunk and more data is available to decode + IDLE, // Not 
enough data to decode, waiting for more to be transferred + POTENTIALLY_FAILED, // Decoder encountered a potentially recoverable error if more file data is available + FAILED, // Decoder encountered an unrecoverable error + END_OF_FILE, // The specific file decoder knows it's the end of the file +}; + +class AudioDecoder { + /* + * @brief Class that facilitates decoding an audio file. + * The audio file is read from a ring buffer source, decoded, and sent to an audio sink (ring buffer or speaker + * component). + * Supports wav, flac, and mp3 formats. + */ + public: + /// @brief Allocates the input and output transfer buffers + /// @param input_buffer_size Size of the input transfer buffer in bytes. + /// @param output_buffer_size Size of the output transfer buffer in bytes. + AudioDecoder(size_t input_buffer_size, size_t output_buffer_size); + + /// @brief Deallocates the MP3 decoder (the flac and wav decoders are deallocated automatically) + ~AudioDecoder(); + + /// @brief Adds a source ring buffer for raw file data. Takes ownership of the ring buffer in a shared_ptr. + /// @param input_ring_buffer weak_ptr of a shared_ptr of the source ring buffer to transfer ownership + /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated + esp_err_t add_source(std::weak_ptr &input_ring_buffer); + + /// @brief Adds a sink ring buffer for decoded audio. Takes ownership of the ring buffer in a shared_ptr. + /// @param output_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership + /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated + esp_err_t add_sink(std::weak_ptr &output_ring_buffer); + +#ifdef USE_SPEAKER + /// @brief Adds a sink speaker for decoded audio. 
+ /// @param speaker pointer to speaker component + /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated + esp_err_t add_sink(speaker::Speaker *speaker); +#endif + + /// @brief Sets up decoding the file + /// @param audio_file_type AudioFileType of the file + /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffers fail to allocate, or ESP_ERR_NOT_SUPPORTED if + /// the format isn't supported. + esp_err_t start(AudioFileType audio_file_type); + + /// @brief Decodes audio from the ring buffer source and writes to the sink. + /// @param stop_gracefully If true, it indicates the file source is finished. The decoder will decode all the + /// remaining data and then finish. + /// @return AudioDecoderState + AudioDecoderState decode(bool stop_gracefully); + + /// @brief Gets the audio stream information, if it has been decoded from the file's header + /// @return optional with the audio information. If not available yet, returns no value. + const optional &get_audio_stream_info() const { return this->audio_stream_info_; } + + /// @brief Returns the duration of audio (in milliseconds) decoded and sent to the sink + /// @return Duration of decoded audio in milliseconds + uint32_t get_playback_ms() const { return this->playback_ms_; } + + /// @brief Pauses sending decoded audio to the sink. If paused, it will continue to process internal buffers. + /// @param pause_state If true, audio data is not sent to the sink. 
+ void set_pause_output_state(bool pause_state) { this->pause_output_ = pause_state; } + + protected: + std::unique_ptr wav_decoder_; +#ifdef USE_AUDIO_FLAC_SUPPORT + FileDecoderState decode_flac_(); + std::unique_ptr flac_decoder_; +#endif +#ifdef USE_AUDIO_MP3_SUPPORT + FileDecoderState decode_mp3_(); + esp_audio_libs::helix_decoder::HMP3Decoder mp3_decoder_; +#endif + FileDecoderState decode_wav_(); + + std::unique_ptr input_transfer_buffer_; + std::unique_ptr output_transfer_buffer_; + + AudioFileType audio_file_type_{AudioFileType::NONE}; + optional audio_stream_info_{}; + + size_t free_buffer_required_{0}; + size_t wav_bytes_left_{0}; + + uint32_t potentially_failed_count_{0}; + bool end_of_file_{false}; + bool wav_has_known_end_{false}; + + bool pause_output_{false}; + + uint32_t accumulated_frames_written_{0}; + uint32_t playback_ms_{0}; +}; +} // namespace audio +} // namespace esphome + +#endif From 6b55df36c7e21997d69c4688ee1627e545596087 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Mon, 3 Feb 2025 20:58:35 -0600 Subject: [PATCH 3/3] [audio] Media Player Components PR6 (#8168) Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com> --- esphome/components/audio/audio_resampler.cpp | 159 +++++++++++++++++++ esphome/components/audio/audio_resampler.h | 100 ++++++++++++ 2 files changed, 259 insertions(+) create mode 100644 esphome/components/audio/audio_resampler.cpp create mode 100644 esphome/components/audio/audio_resampler.h diff --git a/esphome/components/audio/audio_resampler.cpp b/esphome/components/audio/audio_resampler.cpp new file mode 100644 index 0000000000..05e9ff6ca1 --- /dev/null +++ b/esphome/components/audio/audio_resampler.cpp @@ -0,0 +1,159 @@ +#include "audio_resampler.h" + +#ifdef USE_ESP32 + +#include "esphome/core/hal.h" + +namespace esphome { +namespace audio { + +static const uint32_t READ_WRITE_TIMEOUT_MS = 20; + +AudioResampler::AudioResampler(size_t input_buffer_size, size_t output_buffer_size) + : 
input_buffer_size_(input_buffer_size), output_buffer_size_(output_buffer_size) { + this->input_transfer_buffer_ = AudioSourceTransferBuffer::create(input_buffer_size); + this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(output_buffer_size); +} + +esp_err_t AudioResampler::add_source(std::weak_ptr &input_ring_buffer) { + if (this->input_transfer_buffer_ != nullptr) { + this->input_transfer_buffer_->set_source(input_ring_buffer); + return ESP_OK; + } + return ESP_ERR_NO_MEM; +} + +esp_err_t AudioResampler::add_sink(std::weak_ptr &output_ring_buffer) { + if (this->output_transfer_buffer_ != nullptr) { + this->output_transfer_buffer_->set_sink(output_ring_buffer); + return ESP_OK; + } + return ESP_ERR_NO_MEM; +} + +#ifdef USE_SPEAKER +esp_err_t AudioResampler::add_sink(speaker::Speaker *speaker) { + if (this->output_transfer_buffer_ != nullptr) { + this->output_transfer_buffer_->set_sink(speaker); + return ESP_OK; + } + return ESP_ERR_NO_MEM; +} +#endif + +esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info, + uint16_t number_of_taps, uint16_t number_of_filters) { + this->input_stream_info_ = input_stream_info; + this->output_stream_info_ = output_stream_info; + + if ((this->input_transfer_buffer_ == nullptr) || (this->output_transfer_buffer_ == nullptr)) { + return ESP_ERR_NO_MEM; + } + + if ((input_stream_info.get_bits_per_sample() > 32) || (output_stream_info.get_bits_per_sample() > 32) || + (input_stream_info_.get_channels() != output_stream_info.get_channels())) { + return ESP_ERR_NOT_SUPPORTED; + } + + if ((input_stream_info.get_sample_rate() != output_stream_info.get_sample_rate()) || + (input_stream_info.get_bits_per_sample() != output_stream_info.get_bits_per_sample())) { + this->resampler_ = make_unique( + input_stream_info.bytes_to_samples(this->input_buffer_size_), + output_stream_info.bytes_to_samples(this->output_buffer_size_)); + + // Use cascaded biquad filters when downsampling to avoid 
aliasing + bool use_pre_filter = output_stream_info.get_sample_rate() < input_stream_info.get_sample_rate(); + + esp_audio_libs::resampler::ResamplerConfiguration resample_config = { + .source_sample_rate = static_cast(input_stream_info.get_sample_rate()), + .target_sample_rate = static_cast(output_stream_info.get_sample_rate()), + .source_bits_per_sample = input_stream_info.get_bits_per_sample(), + .target_bits_per_sample = output_stream_info.get_bits_per_sample(), + .channels = input_stream_info_.get_channels(), + .use_pre_or_post_filter = use_pre_filter, + .subsample_interpolate = false, // Doubles the CPU load. Using more filters is a better alternative + .number_of_taps = number_of_taps, + .number_of_filters = number_of_filters, + }; + + if (!this->resampler_->initialize(resample_config)) { + // Failed to allocate the resampler's internal buffers + return ESP_ERR_NO_MEM; + } + } + + return ESP_OK; +} + +AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_differential) { + if (stop_gracefully) { + if (!this->input_transfer_buffer_->has_buffered_data() && (this->output_transfer_buffer_->available() == 0)) { + return AudioResamplerState::FINISHED; + } + } + + if (!this->pause_output_) { + // Move audio data to the sink + this->output_transfer_buffer_->transfer_data_to_sink(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + } else { + // If paused, block to avoid wasting CPU resources + delay(READ_WRITE_TIMEOUT_MS); + } + + this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + + if (this->input_transfer_buffer_->available() == 0) { + // No samples available to process + return AudioResamplerState::RESAMPLING; + } + + const size_t bytes_free = this->output_transfer_buffer_->free(); + const uint32_t frames_free = this->output_stream_info_.bytes_to_frames(bytes_free); + + const size_t bytes_available = this->input_transfer_buffer_->available(); + const uint32_t frames_available = 
+ this->input_stream_info_.bytes_to_frames(bytes_available); + + if ((this->input_stream_info_.get_sample_rate() != this->output_stream_info_.get_sample_rate()) || + (this->input_stream_info_.get_bits_per_sample() != this->output_stream_info_.get_bits_per_sample())) { + esp_audio_libs::resampler::ResamplerResults results = + this->resampler_->resample(this->input_transfer_buffer_->get_buffer_start(), + this->output_transfer_buffer_->get_buffer_end(), frames_available, frames_free, -3); + + this->input_transfer_buffer_->decrease_buffer_length(this->input_stream_info_.frames_to_bytes(results.frames_used)); + this->output_transfer_buffer_->increase_buffer_length( + this->output_stream_info_.frames_to_bytes(results.frames_generated)); + + // Resampling causes slight differences in the durations used versus generated. Computes the difference in + // milliseconds. The callback function passing the played audio duration uses the difference to convert from output + // duration to input duration. + this->accumulated_frames_used_ += results.frames_used; + this->accumulated_frames_generated_ += results.frames_generated; + + const int32_t used_ms = + this->input_stream_info_.frames_to_milliseconds_with_remainder(&this->accumulated_frames_used_); + const int32_t generated_ms = + this->output_stream_info_.frames_to_milliseconds_with_remainder(&this->accumulated_frames_generated_); + + *ms_differential = used_ms - generated_ms; + + } else { + // No resampling required, copy samples directly to the output transfer buffer + *ms_differential = 0; + + const size_t bytes_to_transfer = std::min(this->output_stream_info_.frames_to_bytes(frames_free), + this->input_stream_info_.frames_to_bytes(frames_available)); + + std::memcpy((void *) this->output_transfer_buffer_->get_buffer_end(), + (void *) this->input_transfer_buffer_->get_buffer_start(), bytes_to_transfer); + + this->input_transfer_buffer_->decrease_buffer_length(bytes_to_transfer); +
+ this->output_transfer_buffer_->increase_buffer_length(bytes_to_transfer); + } + + return AudioResamplerState::RESAMPLING; +} + +} // namespace audio +} // namespace esphome + +#endif diff --git a/esphome/components/audio/audio_resampler.h b/esphome/components/audio/audio_resampler.h new file mode 100644 index 0000000000..a348aaf783 --- /dev/null +++ b/esphome/components/audio/audio_resampler.h @@ -0,0 +1,100 @@ +#pragma once + +#ifdef USE_ESP32 + +#include "audio.h" +#include "audio_transfer_buffer.h" + +#ifdef USE_SPEAKER +#include "esphome/components/speaker/speaker.h" +#endif + +#include "esphome/core/ring_buffer.h" + +#include "esp_err.h" + +#include // esp-audio-libs + +namespace esphome { +namespace audio { + +enum class AudioResamplerState : uint8_t { + RESAMPLING, // More data is available to resample + FINISHED, // All file data has been resampled and transferred + FAILED, // Unused state included for consistency among Audio classes +}; + +class AudioResampler { + /* + * @brief Class that facilitates resampling audio. + * The audio data is read from a ring buffer source, resampled, and sent to an audio sink (ring buffer or speaker + * component). Also supports converting bits per sample. + */ + public: + /// @brief Allocates the input and output transfer buffers + /// @param input_buffer_size Size of the input transfer buffer in bytes. + /// @param output_buffer_size Size of the output transfer buffer in bytes. + AudioResampler(size_t input_buffer_size, size_t output_buffer_size); + + /// @brief Adds a source ring buffer for audio data. Takes ownership of the ring buffer in a shared_ptr. + /// @param input_ring_buffer weak_ptr of a shared_ptr of the source ring buffer to transfer ownership + /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated + esp_err_t add_source(std::weak_ptr &input_ring_buffer); + + /// @brief Adds a sink ring buffer for resampled audio. Takes ownership of the ring buffer in a shared_ptr.
+ /// @param output_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership + /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated + esp_err_t add_sink(std::weak_ptr &output_ring_buffer); + +#ifdef USE_SPEAKER + /// @brief Adds a sink speaker for resampled audio. + /// @param speaker pointer to speaker component + /// @return ESP_OK if successful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated + esp_err_t add_sink(speaker::Speaker *speaker); +#endif + + /// @brief Sets up the class to resample. + /// @param input_stream_info The incoming sample rate, bits per sample, and number of channels + /// @param output_stream_info The desired outgoing sample rate, bits per sample, and number of channels + /// @param number_of_taps Number of taps per FIR filter + /// @param number_of_filters Number of FIR filters + /// @return ESP_OK if it is able to convert the incoming stream, + /// ESP_ERR_NO_MEM if the transfer buffers failed to allocate, + /// ESP_ERR_NOT_SUPPORTED if the stream can't be converted. + esp_err_t start(AudioStreamInfo &input_stream_info, AudioStreamInfo &output_stream_info, uint16_t number_of_taps, + uint16_t number_of_filters); + + /// @brief Resamples audio from the ring buffer source and writes to the sink. + /// @param stop_gracefully If true, it indicates the file decoder is finished. The resampler will resample all the + /// remaining audio and then finish. + /// @param ms_differential Pointer to an (int32_t) variable that will store the difference, in milliseconds, between + /// the duration of input audio used and the duration of output audio generated. + /// @return AudioResamplerState + AudioResamplerState resample(bool stop_gracefully, int32_t *ms_differential); + + /// @brief Pauses sending resampled audio to the sink. If paused, it will continue to process internal buffers. + /// @param pause_state If true, audio data is not sent to the sink.
+ void set_pause_output_state(bool pause_state) { this->pause_output_ = pause_state; } + + protected: + std::unique_ptr input_transfer_buffer_; + std::unique_ptr output_transfer_buffer_; + + size_t input_buffer_size_; + size_t output_buffer_size_; + + uint32_t accumulated_frames_used_{0}; + uint32_t accumulated_frames_generated_{0}; + + bool pause_output_{false}; + + AudioStreamInfo input_stream_info_; + AudioStreamInfo output_stream_info_; + + std::unique_ptr resampler_; +}; + +} // namespace audio +} // namespace esphome + +#endif