diff --git a/CODEOWNERS b/CODEOWNERS index 9fbf191be0..d26e153c1a 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -277,6 +277,7 @@ esphome/components/mics_4514/* @jesserockz esphome/components/midea/* @dudanov esphome/components/midea_ir/* @dudanov esphome/components/mitsubishi/* @RubyBailey +esphome/components/mixer/speaker/* @kahrendt esphome/components/mlx90393/* @functionpointer esphome/components/mlx90614/* @jesserockz esphome/components/mmc5603/* @benhoff diff --git a/esphome/components/mixer/__init__.py b/esphome/components/mixer/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/esphome/components/mixer/speaker/__init__.py b/esphome/components/mixer/speaker/__init__.py new file mode 100644 index 0000000000..a451f2b7b4 --- /dev/null +++ b/esphome/components/mixer/speaker/__init__.py @@ -0,0 +1,172 @@ +from esphome import automation +import esphome.codegen as cg +from esphome.components import audio, esp32, speaker +import esphome.config_validation as cv +from esphome.const import ( + CONF_BITS_PER_SAMPLE, + CONF_BUFFER_DURATION, + CONF_DURATION, + CONF_ID, + CONF_NEVER, + CONF_NUM_CHANNELS, + CONF_OUTPUT_SPEAKER, + CONF_SAMPLE_RATE, + CONF_TASK_STACK_IN_PSRAM, + CONF_TIMEOUT, + PLATFORM_ESP32, +) +from esphome.core.entity_helpers import inherit_property_from +import esphome.final_validate as fv + +AUTO_LOAD = ["audio"] +CODEOWNERS = ["@kahrendt"] + +mixer_speaker_ns = cg.esphome_ns.namespace("mixer_speaker") +MixerSpeaker = mixer_speaker_ns.class_("MixerSpeaker", cg.Component) +SourceSpeaker = mixer_speaker_ns.class_("SourceSpeaker", cg.Component, speaker.Speaker) + +CONF_DECIBEL_REDUCTION = "decibel_reduction" +CONF_QUEUE_MODE = "queue_mode" +CONF_SOURCE_SPEAKERS = "source_speakers" + +DuckingApplyAction = mixer_speaker_ns.class_( + "DuckingApplyAction", automation.Action, cg.Parented.template(SourceSpeaker) +) + + +SOURCE_SPEAKER_SCHEMA = speaker.SPEAKER_SCHEMA.extend( + { + cv.GenerateID(): cv.declare_id(SourceSpeaker), + cv.Optional( 
+ CONF_BUFFER_DURATION, default="100ms" + ): cv.positive_time_period_milliseconds, + cv.Optional(CONF_TIMEOUT, default="500ms"): cv.Any( + cv.positive_time_period_milliseconds, + cv.one_of(CONF_NEVER, lower=True), + ), + cv.Optional(CONF_BITS_PER_SAMPLE, default=16): cv.int_range(16, 16), + } +) + + +def _set_stream_limits(config): + audio.set_stream_limits( + min_bits_per_sample=16, + max_bits_per_sample=16, + )(config) + + return config + + +def _validate_source_speaker(config): + fconf = fv.full_config.get() + + # Get ID for the output speaker and add it to the source speakrs config to easily inherit properties + path = fconf.get_path_for_id(config[CONF_ID])[:-3] + path.append(CONF_OUTPUT_SPEAKER) + output_speaker_id = fconf.get_config_for_path(path) + config[CONF_OUTPUT_SPEAKER] = output_speaker_id + + inherit_property_from(CONF_NUM_CHANNELS, CONF_OUTPUT_SPEAKER)(config) + inherit_property_from(CONF_SAMPLE_RATE, CONF_OUTPUT_SPEAKER)(config) + + audio.final_validate_audio_schema( + "mixer", + audio_device=CONF_OUTPUT_SPEAKER, + bits_per_sample=config.get(CONF_BITS_PER_SAMPLE), + channels=config.get(CONF_NUM_CHANNELS), + sample_rate=config.get(CONF_SAMPLE_RATE), + )(config) + + return config + + +CONFIG_SCHEMA = cv.All( + cv.Schema( + { + cv.GenerateID(): cv.declare_id(MixerSpeaker), + cv.Required(CONF_OUTPUT_SPEAKER): cv.use_id(speaker.Speaker), + cv.Required(CONF_SOURCE_SPEAKERS): cv.All( + cv.ensure_list(SOURCE_SPEAKER_SCHEMA), + cv.Length(min=2, max=8), + [_set_stream_limits], + ), + cv.Optional(CONF_NUM_CHANNELS): cv.int_range(min=1, max=2), + cv.Optional(CONF_QUEUE_MODE, default=False): cv.boolean, + cv.SplitDefault(CONF_TASK_STACK_IN_PSRAM, esp32_idf=False): cv.All( + cv.boolean, cv.only_with_esp_idf + ), + } + ), + cv.only_on([PLATFORM_ESP32]), +) + +FINAL_VALIDATE_SCHEMA = cv.All( + cv.Schema( + { + cv.Optional(CONF_SOURCE_SPEAKERS): [_validate_source_speaker], + }, + extra=cv.ALLOW_EXTRA, + ), + inherit_property_from(CONF_NUM_CHANNELS, 
CONF_OUTPUT_SPEAKER), +) + + +async def to_code(config): + var = cg.new_Pvariable(config[CONF_ID]) + await cg.register_component(var, config) + + spkr = await cg.get_variable(config[CONF_OUTPUT_SPEAKER]) + + cg.add(var.set_output_channels(config[CONF_NUM_CHANNELS])) + cg.add(var.set_output_speaker(spkr)) + cg.add(var.set_queue_mode(config[CONF_QUEUE_MODE])) + + if task_stack_in_psram := config.get(CONF_TASK_STACK_IN_PSRAM): + cg.add(var.set_task_stack_in_psram(task_stack_in_psram)) + if task_stack_in_psram: + if config[CONF_TASK_STACK_IN_PSRAM]: + esp32.add_idf_sdkconfig_option( + "CONFIG_SPIRAM_ALLOW_STACK_EXTERNAL_MEMORY", True + ) + + for speaker_config in config[CONF_SOURCE_SPEAKERS]: + source_speaker = cg.new_Pvariable(speaker_config[CONF_ID]) + + cg.add(source_speaker.set_buffer_duration(speaker_config[CONF_BUFFER_DURATION])) + + if speaker_config[CONF_TIMEOUT] != CONF_NEVER: + cg.add(source_speaker.set_timeout(speaker_config[CONF_TIMEOUT])) + + await cg.register_component(source_speaker, speaker_config) + await cg.register_parented(source_speaker, config[CONF_ID]) + await speaker.register_speaker(source_speaker, speaker_config) + + cg.add(var.add_source_speaker(source_speaker)) + + +@automation.register_action( + "mixer_speaker.apply_ducking", + DuckingApplyAction, + cv.Schema( + { + cv.GenerateID(): cv.use_id(SourceSpeaker), + cv.Required(CONF_DECIBEL_REDUCTION): cv.templatable( + cv.int_range(min=0, max=51) + ), + cv.Optional(CONF_DURATION, default="0.0s"): cv.templatable( + cv.positive_time_period_milliseconds + ), + } + ), +) +async def ducking_set_to_code(config, action_id, template_arg, args): + var = cg.new_Pvariable(action_id, template_arg) + await cg.register_parented(var, config[CONF_ID]) + decibel_reduction = await cg.templatable( + config[CONF_DECIBEL_REDUCTION], args, cg.uint8 + ) + cg.add(var.set_decibel_reduction(decibel_reduction)) + duration = await cg.templatable(config[CONF_DURATION], args, cg.uint32) + cg.add(var.set_duration(duration)) + 
return var diff --git a/esphome/components/mixer/speaker/automation.h b/esphome/components/mixer/speaker/automation.h new file mode 100644 index 0000000000..b688fa2c1e --- /dev/null +++ b/esphome/components/mixer/speaker/automation.h @@ -0,0 +1,19 @@ +#pragma once + +#include "mixer_speaker.h" + +#ifdef USE_ESP32 + +namespace esphome { +namespace mixer_speaker { +template class DuckingApplyAction : public Action, public Parented { + TEMPLATABLE_VALUE(uint8_t, decibel_reduction) + TEMPLATABLE_VALUE(uint32_t, duration) + void play(Ts... x) override { + this->parent_->apply_ducking(this->decibel_reduction_.value(x...), this->duration_.value(x...)); + } +}; +} // namespace mixer_speaker +} // namespace esphome + +#endif diff --git a/esphome/components/mixer/speaker/mixer_speaker.cpp b/esphome/components/mixer/speaker/mixer_speaker.cpp new file mode 100644 index 0000000000..60cff95eb2 --- /dev/null +++ b/esphome/components/mixer/speaker/mixer_speaker.cpp @@ -0,0 +1,624 @@ +#include "mixer_speaker.h" + +#ifdef USE_ESP32 + +#include "esphome/core/hal.h" +#include "esphome/core/helpers.h" +#include "esphome/core/log.h" + +#include +#include + +namespace esphome { +namespace mixer_speaker { + +static const UBaseType_t MIXER_TASK_PRIORITY = 10; + +static const uint32_t TRANSFER_BUFFER_DURATION_MS = 50; +static const uint32_t TASK_DELAY_MS = 25; + +static const size_t TASK_STACK_SIZE = 4096; + +static const int16_t MAX_AUDIO_SAMPLE_VALUE = INT16_MAX; +static const int16_t MIN_AUDIO_SAMPLE_VALUE = INT16_MIN; + +static const char *const TAG = "speaker_mixer"; + +// Gives the Q15 fixed point scaling factor to reduce by 0 dB, 1dB, ..., 50 dB +// dB to PCM scaling factor formula: floating_point_scale_factor = 2^(-db/6.014) +// float to Q15 fixed point formula: q15_scale_factor = floating_point_scale_factor * 2^(15) +static const std::vector DECIBEL_REDUCTION_TABLE = { + 32767, 29201, 26022, 23189, 20665, 18415, 16410, 14624, 13032, 11613, 10349, 9222, 8218, 7324, 6527, 5816, 5183, 
+ 4619, 4116, 3668, 3269, 2913, 2596, 2313, 2061, 1837, 1637, 1459, 1300, 1158, 1032, 920, 820, 731, + 651, 580, 517, 461, 411, 366, 326, 291, 259, 231, 206, 183, 163, 146, 130, 116, 103}; + +enum MixerEventGroupBits : uint32_t { + COMMAND_STOP = (1 << 0), // stops the mixer task + STATE_STARTING = (1 << 10), + STATE_RUNNING = (1 << 11), + STATE_STOPPING = (1 << 12), + STATE_STOPPED = (1 << 13), + ERR_ESP_NO_MEM = (1 << 19), + ALL_BITS = 0x00FFFFFF, // All valid FreeRTOS event group bits +}; + +void SourceSpeaker::dump_config() { + ESP_LOGCONFIG(TAG, "Mixer Source Speaker"); + ESP_LOGCONFIG(TAG, " Buffer Duration: %" PRIu32 " ms", this->buffer_duration_ms_); + if (this->timeout_ms_.has_value()) { + ESP_LOGCONFIG(TAG, " Timeout: %" PRIu32 " ms", this->timeout_ms_.value()); + } else { + ESP_LOGCONFIG(TAG, " Timeout: never"); + } +} + +void SourceSpeaker::setup() { + this->parent_->get_output_speaker()->add_audio_output_callback( + [this](uint32_t new_playback_ms, uint32_t remainder_us, uint32_t pending_ms, uint32_t write_timestamp) { + uint32_t personal_playback_ms = std::min(new_playback_ms, this->pending_playback_ms_); + if (personal_playback_ms > 0) { + this->pending_playback_ms_ -= personal_playback_ms; + this->audio_output_callback_(personal_playback_ms, remainder_us, this->pending_playback_ms_, write_timestamp); + } + }); +} + +void SourceSpeaker::loop() { + switch (this->state_) { + case speaker::STATE_STARTING: { + esp_err_t err = this->start_(); + if (err == ESP_OK) { + this->state_ = speaker::STATE_RUNNING; + this->stop_gracefully_ = false; + this->last_seen_data_ms_ = millis(); + this->status_clear_error(); + } else { + switch (err) { + case ESP_ERR_NO_MEM: + this->status_set_error("Failed to start mixer: not enough memory"); + break; + case ESP_ERR_NOT_SUPPORTED: + this->status_set_error("Failed to start mixer: unsupported bits per sample"); + break; + case ESP_ERR_INVALID_ARG: + this->status_set_error("Failed to start mixer: audio stream isn't compatible 
with the other audio stream."); + break; + case ESP_ERR_INVALID_STATE: + this->status_set_error("Failed to start mixer: mixer task failed to start"); + break; + default: + this->status_set_error("Failed to start mixer"); + break; + } + + this->state_ = speaker::STATE_STOPPING; + } + break; + } + case speaker::STATE_RUNNING: + if (!this->transfer_buffer_->has_buffered_data()) { + if ((this->timeout_ms_.has_value() && ((millis() - this->last_seen_data_ms_) > this->timeout_ms_.value())) || + this->stop_gracefully_) { + this->state_ = speaker::STATE_STOPPING; + } + } + break; + case speaker::STATE_STOPPING: + this->stop_(); + this->stop_gracefully_ = false; + this->state_ = speaker::STATE_STOPPED; + break; + case speaker::STATE_STOPPED: + break; + } +} + +size_t SourceSpeaker::play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) { + if (this->is_stopped()) { + this->start(); + } + size_t bytes_written = 0; + if (this->ring_buffer_.use_count() == 1) { + std::shared_ptr temp_ring_buffer = this->ring_buffer_.lock(); + bytes_written = temp_ring_buffer->write_without_replacement(data, length, ticks_to_wait); + if (bytes_written > 0) { + this->last_seen_data_ms_ = millis(); + } + } + return bytes_written; +} + +void SourceSpeaker::start() { this->state_ = speaker::STATE_STARTING; } + +esp_err_t SourceSpeaker::start_() { + const size_t ring_buffer_size = this->audio_stream_info_.ms_to_bytes(this->buffer_duration_ms_); + if (this->transfer_buffer_.use_count() == 0) { + this->transfer_buffer_ = + audio::AudioSourceTransferBuffer::create(this->audio_stream_info_.ms_to_bytes(TRANSFER_BUFFER_DURATION_MS)); + + if (this->transfer_buffer_ == nullptr) { + return ESP_ERR_NO_MEM; + } + std::shared_ptr temp_ring_buffer; + + if (!this->ring_buffer_.use_count()) { + temp_ring_buffer = RingBuffer::create(ring_buffer_size); + this->ring_buffer_ = temp_ring_buffer; + } + + if (!this->ring_buffer_.use_count()) { + return ESP_ERR_NO_MEM; + } else { + 
this->transfer_buffer_->set_source(temp_ring_buffer); + } + } + + return this->parent_->start(this->audio_stream_info_); +} + +void SourceSpeaker::stop() { + if (this->state_ != speaker::STATE_STOPPED) { + this->state_ = speaker::STATE_STOPPING; + } +} + +void SourceSpeaker::stop_() { + this->transfer_buffer_.reset(); // deallocates the transfer buffer +} + +void SourceSpeaker::finish() { this->stop_gracefully_ = true; } + +bool SourceSpeaker::has_buffered_data() const { + return ((this->transfer_buffer_.use_count() > 0) && this->transfer_buffer_->has_buffered_data()); +} + +void SourceSpeaker::set_mute_state(bool mute_state) { + this->mute_state_ = mute_state; + this->parent_->get_output_speaker()->set_mute_state(mute_state); +} + +void SourceSpeaker::set_volume(float volume) { + this->volume_ = volume; + this->parent_->get_output_speaker()->set_volume(volume); +} + +size_t SourceSpeaker::process_data_from_source(TickType_t ticks_to_wait) { + if (!this->transfer_buffer_.use_count()) { + return 0; + } + + // Store current offset, as these samples are already ducked + const size_t current_length = this->transfer_buffer_->available(); + + size_t bytes_read = this->transfer_buffer_->transfer_data_from_source(ticks_to_wait); + + uint32_t samples_to_duck = this->audio_stream_info_.bytes_to_samples(bytes_read); + if (samples_to_duck > 0) { + int16_t *current_buffer = reinterpret_cast(this->transfer_buffer_->get_buffer_start() + current_length); + + duck_samples(current_buffer, samples_to_duck, &this->current_ducking_db_reduction_, + &this->ducking_transition_samples_remaining_, this->samples_per_ducking_step_, + this->db_change_per_ducking_step_); + } + + return bytes_read; +} + +void SourceSpeaker::apply_ducking(uint8_t decibel_reduction, uint32_t duration) { + if (this->target_ducking_db_reduction_ != decibel_reduction) { + this->current_ducking_db_reduction_ = this->target_ducking_db_reduction_; + + this->target_ducking_db_reduction_ = decibel_reduction; + + uint8_t 
total_ducking_steps = 0; + if (this->target_ducking_db_reduction_ > this->current_ducking_db_reduction_) { + // The dB reduction level is increasing (which results in quieter audio) + total_ducking_steps = this->target_ducking_db_reduction_ - this->current_ducking_db_reduction_ - 1; + this->db_change_per_ducking_step_ = 1; + } else { + // The dB reduction level is decreasing (which results in louder audio) + total_ducking_steps = this->current_ducking_db_reduction_ - this->target_ducking_db_reduction_ - 1; + this->db_change_per_ducking_step_ = -1; + } + if ((duration > 0) && (total_ducking_steps > 0)) { + this->ducking_transition_samples_remaining_ = this->audio_stream_info_.ms_to_samples(duration); + + this->samples_per_ducking_step_ = this->ducking_transition_samples_remaining_ / total_ducking_steps; + this->ducking_transition_samples_remaining_ = + this->samples_per_ducking_step_ * total_ducking_steps; // Adjust for integer division rounding + + this->current_ducking_db_reduction_ += this->db_change_per_ducking_step_; + } else { + this->ducking_transition_samples_remaining_ = 0; + this->current_ducking_db_reduction_ = this->target_ducking_db_reduction_; + } + } +} + +void SourceSpeaker::duck_samples(int16_t *input_buffer, uint32_t input_samples_to_duck, + int8_t *current_ducking_db_reduction, uint32_t *ducking_transition_samples_remaining, + uint32_t samples_per_ducking_step, int8_t db_change_per_ducking_step) { + if (*ducking_transition_samples_remaining > 0) { + // Ducking level is still transitioning + + // Takes the ceiling of input_samples_to_duck/samples_per_ducking_step + uint32_t ducking_steps_in_batch = + input_samples_to_duck / samples_per_ducking_step + (input_samples_to_duck % samples_per_ducking_step != 0); + + for (uint32_t i = 0; i < ducking_steps_in_batch; ++i) { + uint32_t samples_left_in_step = *ducking_transition_samples_remaining % samples_per_ducking_step; + + if (samples_left_in_step == 0) { + samples_left_in_step = 
samples_per_ducking_step; + } + + uint32_t samples_to_duck = std::min(input_samples_to_duck, samples_left_in_step); + samples_to_duck = std::min(samples_to_duck, *ducking_transition_samples_remaining); + + // Ensure we only point to valid index in the Q15 scaling factor table + uint8_t safe_db_reduction_index = + clamp(*current_ducking_db_reduction, 0, DECIBEL_REDUCTION_TABLE.size() - 1); + int16_t q15_scale_factor = DECIBEL_REDUCTION_TABLE[safe_db_reduction_index]; + + audio::scale_audio_samples(input_buffer, input_buffer, q15_scale_factor, samples_to_duck); + + if (samples_left_in_step - samples_to_duck == 0) { + // After scaling the current samples, we are ready to transition to the next step + *current_ducking_db_reduction += db_change_per_ducking_step; + } + + input_buffer += samples_to_duck; + *ducking_transition_samples_remaining -= samples_to_duck; + input_samples_to_duck -= samples_to_duck; + } + } + + if ((*current_ducking_db_reduction > 0) && (input_samples_to_duck > 0)) { + // Audio is ducked, but its not in the middle of a transition step + + uint8_t safe_db_reduction_index = + clamp(*current_ducking_db_reduction, 0, DECIBEL_REDUCTION_TABLE.size() - 1); + int16_t q15_scale_factor = DECIBEL_REDUCTION_TABLE[safe_db_reduction_index]; + + audio::scale_audio_samples(input_buffer, input_buffer, q15_scale_factor, input_samples_to_duck); + } +} + +void MixerSpeaker::dump_config() { + ESP_LOGCONFIG(TAG, "Speaker Mixer:"); + ESP_LOGCONFIG(TAG, " Number of output channels: %u", this->output_channels_); +} + +void MixerSpeaker::setup() { + this->event_group_ = xEventGroupCreate(); + + if (this->event_group_ == nullptr) { + ESP_LOGE(TAG, "Failed to create event group"); + this->mark_failed(); + return; + } +} + +void MixerSpeaker::loop() { + uint32_t event_group_bits = xEventGroupGetBits(this->event_group_); + + if (event_group_bits & MixerEventGroupBits::STATE_STARTING) { + ESP_LOGD(TAG, "Starting speaker mixer"); + xEventGroupClearBits(this->event_group_, 
MixerEventGroupBits::STATE_STARTING); + } + if (event_group_bits & MixerEventGroupBits::ERR_ESP_NO_MEM) { + this->status_set_error("Failed to allocate the mixer's internal buffer"); + xEventGroupClearBits(this->event_group_, MixerEventGroupBits::ERR_ESP_NO_MEM); + } + if (event_group_bits & MixerEventGroupBits::STATE_RUNNING) { + ESP_LOGD(TAG, "Started speaker mixer"); + this->status_clear_error(); + xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_RUNNING); + } + if (event_group_bits & MixerEventGroupBits::STATE_STOPPING) { + ESP_LOGD(TAG, "Stopping speaker mixer"); + xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_STOPPING); + } + if (event_group_bits & MixerEventGroupBits::STATE_STOPPED) { + if (this->delete_task_() == ESP_OK) { + xEventGroupClearBits(this->event_group_, MixerEventGroupBits::ALL_BITS); + } + } + + if (this->task_handle_ != nullptr) { + bool all_stopped = true; + + for (auto &speaker : this->source_speakers_) { + all_stopped &= speaker->is_stopped(); + } + + if (all_stopped) { + this->stop(); + } + } +} + +esp_err_t MixerSpeaker::start(audio::AudioStreamInfo &stream_info) { + if (!this->audio_stream_info_.has_value()) { + if (stream_info.get_bits_per_sample() != 16) { + // Audio streams that don't have 16 bits per sample are not supported + return ESP_ERR_NOT_SUPPORTED; + } + + this->audio_stream_info_ = audio::AudioStreamInfo(stream_info.get_bits_per_sample(), this->output_channels_, + stream_info.get_sample_rate()); + this->output_speaker_->set_audio_stream_info(this->audio_stream_info_.value()); + } else { + if (!this->queue_mode_ && (stream_info.get_sample_rate() != this->audio_stream_info_.value().get_sample_rate())) { + // The two audio streams must have the same sample rate to mix properly if not in queue mode + return ESP_ERR_INVALID_ARG; + } + } + + return this->start_task_(); +} + +esp_err_t MixerSpeaker::start_task_() { + if (this->task_stack_buffer_ == nullptr) { + if (this->task_stack_in_psram_) 
{ + RAMAllocator stack_allocator(RAMAllocator::ALLOC_EXTERNAL); + this->task_stack_buffer_ = stack_allocator.allocate(TASK_STACK_SIZE); + } else { + RAMAllocator stack_allocator(RAMAllocator::ALLOC_INTERNAL); + this->task_stack_buffer_ = stack_allocator.allocate(TASK_STACK_SIZE); + } + } + + if (this->task_stack_buffer_ == nullptr) { + return ESP_ERR_NO_MEM; + } + + if (this->task_handle_ == nullptr) { + this->task_handle_ = xTaskCreateStatic(audio_mixer_task, "mixer", TASK_STACK_SIZE, (void *) this, + MIXER_TASK_PRIORITY, this->task_stack_buffer_, &this->task_stack_); + } + + if (this->task_handle_ == nullptr) { + return ESP_ERR_INVALID_STATE; + } + + return ESP_OK; +} + +esp_err_t MixerSpeaker::delete_task_() { + if (!this->task_created_) { + this->task_handle_ = nullptr; + + if (this->task_stack_buffer_ != nullptr) { + if (this->task_stack_in_psram_) { + RAMAllocator stack_allocator(RAMAllocator::ALLOC_EXTERNAL); + stack_allocator.deallocate(this->task_stack_buffer_, TASK_STACK_SIZE); + } else { + RAMAllocator stack_allocator(RAMAllocator::ALLOC_INTERNAL); + stack_allocator.deallocate(this->task_stack_buffer_, TASK_STACK_SIZE); + } + + this->task_stack_buffer_ = nullptr; + } + + return ESP_OK; + } + + return ESP_ERR_INVALID_STATE; +} + +void MixerSpeaker::stop() { xEventGroupSetBits(this->event_group_, MixerEventGroupBits::COMMAND_STOP); } + +void MixerSpeaker::copy_frames(const int16_t *input_buffer, audio::AudioStreamInfo input_stream_info, + int16_t *output_buffer, audio::AudioStreamInfo output_stream_info, + uint32_t frames_to_transfer) { + uint8_t input_channels = input_stream_info.get_channels(); + uint8_t output_channels = output_stream_info.get_channels(); + const uint8_t max_input_channel_index = input_channels - 1; + + if (input_channels == output_channels) { + size_t bytes_to_copy = input_stream_info.frames_to_bytes(frames_to_transfer); + memcpy(output_buffer, input_buffer, bytes_to_copy); + + return; + } + + for (uint32_t frame_index = 0; frame_index 
< frames_to_transfer; ++frame_index) { + for (uint8_t output_channel_index = 0; output_channel_index < output_channels; ++output_channel_index) { + uint8_t input_channel_index = std::min(output_channel_index, max_input_channel_index); + output_buffer[output_channels * frame_index + output_channel_index] = + input_buffer[input_channels * frame_index + input_channel_index]; + } + } +} + +void MixerSpeaker::mix_audio_samples(const int16_t *primary_buffer, audio::AudioStreamInfo primary_stream_info, + const int16_t *secondary_buffer, audio::AudioStreamInfo secondary_stream_info, + int16_t *output_buffer, audio::AudioStreamInfo output_stream_info, + uint32_t frames_to_mix) { + const uint8_t primary_channels = primary_stream_info.get_channels(); + const uint8_t secondary_channels = secondary_stream_info.get_channels(); + const uint8_t output_channels = output_stream_info.get_channels(); + + const uint8_t max_primary_channel_index = primary_channels - 1; + const uint8_t max_secondary_channel_index = secondary_channels - 1; + + for (uint32_t frames_index = 0; frames_index < frames_to_mix; ++frames_index) { + for (uint8_t output_channel_index = 0; output_channel_index < output_channels; ++output_channel_index) { + const uint32_t secondary_channel_index = std::min(output_channel_index, max_secondary_channel_index); + const int32_t secondary_sample = secondary_buffer[frames_index * secondary_channels + secondary_channel_index]; + + const uint32_t primary_channel_index = std::min(output_channel_index, max_primary_channel_index); + const int32_t primary_sample = + static_cast(primary_buffer[frames_index * primary_channels + primary_channel_index]); + + const int32_t added_sample = secondary_sample + primary_sample; + + output_buffer[frames_index * output_channels + output_channel_index] = + static_cast(clamp(added_sample, MIN_AUDIO_SAMPLE_VALUE, MAX_AUDIO_SAMPLE_VALUE)); + } + } +} + +void MixerSpeaker::audio_mixer_task(void *params) { + MixerSpeaker *this_mixer = (MixerSpeaker 
*) params; + + xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STARTING); + + this_mixer->task_created_ = true; + + std::unique_ptr output_transfer_buffer = audio::AudioSinkTransferBuffer::create( + this_mixer->audio_stream_info_.value().ms_to_bytes(TRANSFER_BUFFER_DURATION_MS)); + + if (output_transfer_buffer == nullptr) { + xEventGroupSetBits(this_mixer->event_group_, + MixerEventGroupBits::STATE_STOPPED | MixerEventGroupBits::ERR_ESP_NO_MEM); + + this_mixer->task_created_ = false; + vTaskDelete(nullptr); + } + + output_transfer_buffer->set_sink(this_mixer->output_speaker_); + + xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_RUNNING); + + bool sent_finished = false; + + while (true) { + uint32_t event_group_bits = xEventGroupGetBits(this_mixer->event_group_); + if (event_group_bits & MixerEventGroupBits::COMMAND_STOP) { + break; + } + + output_transfer_buffer->transfer_data_to_sink(pdMS_TO_TICKS(TASK_DELAY_MS)); + + const uint32_t output_frames_free = + this_mixer->audio_stream_info_.value().bytes_to_frames(output_transfer_buffer->free()); + + std::vector speakers_with_data; + std::vector> transfer_buffers_with_data; + + for (auto &speaker : this_mixer->source_speakers_) { + if (speaker->get_transfer_buffer().use_count() > 0) { + std::shared_ptr transfer_buffer = speaker->get_transfer_buffer().lock(); + speaker->process_data_from_source(0); // Transfers and ducks audio from source ring buffers + + if ((transfer_buffer->available() > 0) && !speaker->get_pause_state()) { + // Store the locked transfer buffers in their own vector to avoid releasing ownership until after the loop + transfer_buffers_with_data.push_back(transfer_buffer); + speakers_with_data.push_back(speaker); + } + } + } + + if (transfer_buffers_with_data.empty()) { + // No audio available for transferring, block task temporarily + delay(TASK_DELAY_MS); + continue; + } + + uint32_t frames_to_mix = output_frames_free; + + if 
((transfer_buffers_with_data.size() == 1) || this_mixer->queue_mode_) { + // Only one speaker has audio data, just copy samples over + + audio::AudioStreamInfo active_stream_info = speakers_with_data[0]->get_audio_stream_info(); + + if (active_stream_info.get_sample_rate() == + this_mixer->output_speaker_->get_audio_stream_info().get_sample_rate()) { + // Speaker's sample rate matches the output speaker's, copy directly + + const uint32_t frames_available_in_buffer = + active_stream_info.bytes_to_frames(transfer_buffers_with_data[0]->available()); + frames_to_mix = std::min(frames_to_mix, frames_available_in_buffer); + copy_frames(reinterpret_cast(transfer_buffers_with_data[0]->get_buffer_start()), active_stream_info, + reinterpret_cast(output_transfer_buffer->get_buffer_end()), + this_mixer->audio_stream_info_.value(), frames_to_mix); + + // Update source speaker buffer length + transfer_buffers_with_data[0]->decrease_buffer_length(active_stream_info.frames_to_bytes(frames_to_mix)); + speakers_with_data[0]->accumulated_frames_read_ += frames_to_mix; + + // Add new audio duration to the source speaker pending playback + speakers_with_data[0]->pending_playback_ms_ += + active_stream_info.frames_to_milliseconds_with_remainder(&speakers_with_data[0]->accumulated_frames_read_); + + // Update output transfer buffer length + output_transfer_buffer->increase_buffer_length( + this_mixer->audio_stream_info_.value().frames_to_bytes(frames_to_mix)); + } else { + // Speaker's stream info doesn't match the output speaker's, so it's a new source speaker + if (!this_mixer->output_speaker_->is_stopped()) { + if (!sent_finished) { + this_mixer->output_speaker_->finish(); + sent_finished = true; // Avoid repeatedly sending the finish command + } + } else { + // Speaker has finished writing the current audio, update the stream information and restart the speaker + this_mixer->audio_stream_info_ = + audio::AudioStreamInfo(active_stream_info.get_bits_per_sample(), 
this_mixer->output_channels_, + active_stream_info.get_sample_rate()); + this_mixer->output_speaker_->set_audio_stream_info(this_mixer->audio_stream_info_.value()); + this_mixer->output_speaker_->start(); + sent_finished = false; + } + } + } else { + // Determine how many frames to mix + for (int i = 0; i < transfer_buffers_with_data.size(); ++i) { + const uint32_t frames_available_in_buffer = + speakers_with_data[i]->get_audio_stream_info().bytes_to_frames(transfer_buffers_with_data[i]->available()); + frames_to_mix = std::min(frames_to_mix, frames_available_in_buffer); + } + int16_t *primary_buffer = reinterpret_cast(transfer_buffers_with_data[0]->get_buffer_start()); + audio::AudioStreamInfo primary_stream_info = speakers_with_data[0]->get_audio_stream_info(); + + // Mix two streams together + for (int i = 1; i < transfer_buffers_with_data.size(); ++i) { + mix_audio_samples(primary_buffer, primary_stream_info, + reinterpret_cast(transfer_buffers_with_data[i]->get_buffer_start()), + speakers_with_data[i]->get_audio_stream_info(), + reinterpret_cast(output_transfer_buffer->get_buffer_end()), + this_mixer->audio_stream_info_.value(), frames_to_mix); + + speakers_with_data[i]->pending_playback_ms_ += + speakers_with_data[i]->get_audio_stream_info().frames_to_milliseconds_with_remainder( + &speakers_with_data[i]->accumulated_frames_read_); + + if (i != transfer_buffers_with_data.size() - 1) { + // Need to mix more streams together, point primary buffer and stream info to the already mixed output + primary_buffer = reinterpret_cast(output_transfer_buffer->get_buffer_end()); + primary_stream_info = this_mixer->audio_stream_info_.value(); + } + } + + // Update source transfer buffer lengths and add new audio durations to the source speaker pending playbacks + for (int i = 0; i < transfer_buffers_with_data.size(); ++i) { + transfer_buffers_with_data[i]->decrease_buffer_length( + speakers_with_data[i]->get_audio_stream_info().frames_to_bytes(frames_to_mix)); + 
speakers_with_data[i]->accumulated_frames_read_ += frames_to_mix; + + speakers_with_data[i]->pending_playback_ms_ += + speakers_with_data[i]->get_audio_stream_info().frames_to_milliseconds_with_remainder( + &speakers_with_data[i]->accumulated_frames_read_); + } + + // Update output transfer buffer length + output_transfer_buffer->increase_buffer_length( + this_mixer->audio_stream_info_.value().frames_to_bytes(frames_to_mix)); + } + } + + xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STOPPING); + + output_transfer_buffer.reset(); + + xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STOPPED); + this_mixer->task_created_ = false; + vTaskDelete(nullptr); +} + +} // namespace mixer_speaker +} // namespace esphome + +#endif diff --git a/esphome/components/mixer/speaker/mixer_speaker.h b/esphome/components/mixer/speaker/mixer_speaker.h new file mode 100644 index 0000000000..b2cb3e1e39 --- /dev/null +++ b/esphome/components/mixer/speaker/mixer_speaker.h @@ -0,0 +1,207 @@ +#pragma once + +#ifdef USE_ESP32 + +#include "esphome/components/audio/audio.h" +#include "esphome/components/audio/audio_transfer_buffer.h" +#include "esphome/components/speaker/speaker.h" + +#include "esphome/core/component.h" + +#include +#include + +namespace esphome { +namespace mixer_speaker { + +/* Classes for mixing several source speaker audio streams and writing it to another speaker component. + * - Volume controls are passed through to the output speaker + * - Directly handles pausing at the SourceSpeaker level; pause state is not passed through to the output speaker. + * - Audio sent to the SourceSpeaker's must have 16 bits per sample. + * - Audio sent to the SourceSpeaker can have any number of channels. They are duplicated or ignored as needed to match + * the number of channels required for the output speaker. + * - In queue mode, the audio sent to the SoureSpeakers can have different sample rates. 
+ * - In non-queue mode, the audio sent to the SourceSpeakers must have the same sample rates. + * - SourceSpeaker has an internal ring buffer. It also allocates a shared_ptr for an AudioTranserBuffer object. + * - Audio Data Flow: + * - Audio data played on a SourceSpeaker first writes to its internal ring buffer. + * - MixerSpeaker task temporarily takes shared ownership of each SourceSpeaker's AudioTransferBuffer. + * - MixerSpeaker calls SourceSpeaker's `process_data_from_source`, which tranfers audio from the SourceSpeaker's + * ring buffer to its AudioTransferBuffer. Audio ducking is applied at this step. + * - In queue mode, MixerSpeaker prioritizes the earliest configured SourceSpeaker with audio data. Audio data is + * sent to the output speaker. + * - In non-queue mode, MixerSpeaker adds all the audio data in each SourceSpeaker into one stream that is written + * to the output speaker. + */ + +class MixerSpeaker; + +class SourceSpeaker : public speaker::Speaker, public Component { + public: + void dump_config() override; + void setup() override; + void loop() override; + + size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) override; + size_t play(const uint8_t *data, size_t length) override { return this->play(data, length, 0); } + + void start() override; + void stop() override; + void finish() override; + + bool has_buffered_data() const override; + + /// @brief Mute state changes are passed to the parent's output speaker + void set_mute_state(bool mute_state) override; + + /// @brief Volume state changes are passed to the parent's output speaker + void set_volume(float volume) override; + + void set_pause_state(bool pause_state) override { this->pause_state_ = pause_state; } + bool get_pause_state() const override { return this->pause_state_; } + + /// @brief Transfers audio from the ring buffer into the transfer buffer. Ducks audio while transferring. 
+ /// @param ticks_to_wait FreeRTOS ticks to wait while waiting to read from the ring buffer. + /// @return Number of bytes transferred from the ring buffer. + size_t process_data_from_source(TickType_t ticks_to_wait); + + /// @brief Sets the ducking level for the source speaker. + /// @param decibel_reduction (uint8_t) The dB reduction level. For example, 0 is no change, 10 is a reduction by 10 dB + /// @param duration (uint32_t) The number of milliseconds to transition from the current level to the new level + void apply_ducking(uint8_t decibel_reduction, uint32_t duration); + + void set_buffer_duration(uint32_t buffer_duration_ms) { this->buffer_duration_ms_ = buffer_duration_ms; } + void set_parent(MixerSpeaker *parent) { this->parent_ = parent; } + void set_timeout(uint32_t ms) { this->timeout_ms_ = ms; } + + std::weak_ptr get_transfer_buffer() { return this->transfer_buffer_; } + + protected: + friend class MixerSpeaker; + esp_err_t start_(); + void stop_(); + + /// @brief Ducks audio samples by a specified amount. When changing the ducking amount, it can transition gradually + /// over a specified amount of samples. 
+ /// @param input_buffer buffer with audio samples to be ducked in place + /// @param input_samples_to_duck number of samples to process in ``input_buffer`` + /// @param current_ducking_db_reduction pointer to the current dB reduction + /// @param ducking_transition_samples_remaining pointer to the total number of samples left before the + /// transition is finished + /// @param samples_per_ducking_step total number of samples per ducking step for the transition + /// @param db_change_per_ducking_step the change in dB reduction per step + static void duck_samples(int16_t *input_buffer, uint32_t input_samples_to_duck, int8_t *current_ducking_db_reduction, + uint32_t *ducking_transition_samples_remaining, uint32_t samples_per_ducking_step, + int8_t db_change_per_ducking_step); + + MixerSpeaker *parent_; + + std::shared_ptr transfer_buffer_; + std::weak_ptr ring_buffer_; + + uint32_t buffer_duration_ms_; + uint32_t last_seen_data_ms_{0}; + optional timeout_ms_; + bool stop_gracefully_{false}; + + bool pause_state_{false}; + + int8_t target_ducking_db_reduction_{0}; + int8_t current_ducking_db_reduction_{0}; + int8_t db_change_per_ducking_step_{1}; + uint32_t ducking_transition_samples_remaining_{0}; + uint32_t samples_per_ducking_step_{0}; + + uint32_t accumulated_frames_read_{0}; + + uint32_t pending_playback_ms_{0}; +}; + +class MixerSpeaker : public Component { + public: + void dump_config() override; + void setup() override; + void loop() override; + + void add_source_speaker(SourceSpeaker *source_speaker) { this->source_speakers_.push_back(source_speaker); } + + /// @brief Starts the mixer task. 
Called by a source speaker giving the current audio stream information + /// @param stream_info The calling source speaker's audio stream information + /// @return ESP_ERR_NOT_SUPPORTED if the incoming stream is incompatible due to unsupported bits per sample + /// ESP_ERR_INVALID_ARG if the incoming stream is incompatible to be mixed with the other input audio stream + /// ESP_ERR_NO_MEM if there isn't enough memory for the task's stack + /// ESP_ERR_INVALID_STATE if the task fails to start + /// ESP_OK if the incoming stream is compatible and the mixer task starts + esp_err_t start(audio::AudioStreamInfo &stream_info); + + void stop(); + + void set_output_channels(uint8_t output_channels) { this->output_channels_ = output_channels; } + void set_output_speaker(speaker::Speaker *speaker) { this->output_speaker_ = speaker; } + void set_queue_mode(bool queue_mode) { this->queue_mode_ = queue_mode; } + void set_task_stack_in_psram(bool task_stack_in_psram) { this->task_stack_in_psram_ = task_stack_in_psram; } + + speaker::Speaker *get_output_speaker() const { return this->output_speaker_; } + + protected: + /// @brief Copies audio frames from the input buffer to the output buffer taking into account the number of channels + /// in each stream. If the output stream has more channels, the input samples are duplicated. If the output stream has + /// fewer channels, the extra channel input samples are dropped. + /// @param input_buffer + /// @param input_stream_info + /// @param output_buffer + /// @param output_stream_info + /// @param frames_to_transfer number of frames (consisting of a sample for each channel) to copy from the input buffer + static void copy_frames(const int16_t *input_buffer, audio::AudioStreamInfo input_stream_info, int16_t *output_buffer, + audio::AudioStreamInfo output_stream_info, uint32_t frames_to_transfer); + + /// @brief Mixes the primary and secondary streams taking into account the number of channels in each stream. 
Primary + /// and secondary samples are duplicated or dropped as necessary to ensure the output stream has the configured number + /// of channels. Output samples are clamped to the corresponding int16 min or max values if the mixed sample + /// overflows. + /// @param primary_buffer (int16_t *) samples buffer for the primary stream + /// @param primary_stream_info stream info for the primary stream + /// @param secondary_buffer (int16_t *) samples buffer for secondary stream + /// @param secondary_stream_info stream info for the secondary stream + /// @param output_buffer (int16_t *) buffer for the mixed samples + /// @param output_stream_info stream info for the output buffer + /// @param frames_to_mix number of frames in the primary and secondary buffers to mix together + static void mix_audio_samples(const int16_t *primary_buffer, audio::AudioStreamInfo primary_stream_info, + const int16_t *secondary_buffer, audio::AudioStreamInfo secondary_stream_info, + int16_t *output_buffer, audio::AudioStreamInfo output_stream_info, + uint32_t frames_to_mix); + + static void audio_mixer_task(void *params); + + /// @brief Starts the mixer task after allocating memory for the task stack. + /// @return ESP_ERR_NO_MEM if there isn't enough memory for the task's stack + /// ESP_ERR_INVALID_STATE if the task didn't start + /// ESP_OK if successful + esp_err_t start_task_(); + + /// @brief If the task is stopped, it sets the task handle to the nullptr and deallocates its stack + /// @return ESP_OK if the task was stopped, ESP_ERR_INVALID_STATE otherwise. 
+ esp_err_t delete_task_(); + + EventGroupHandle_t event_group_{nullptr}; + + std::vector source_speakers_; + speaker::Speaker *output_speaker_{nullptr}; + + uint8_t output_channels_; + bool queue_mode_; + bool task_stack_in_psram_{false}; + + bool task_created_{false}; + + TaskHandle_t task_handle_{nullptr}; + StaticTask_t task_stack_; + StackType_t *task_stack_buffer_{nullptr}; + + optional audio_stream_info_; +}; + +} // namespace mixer_speaker +} // namespace esphome + +#endif diff --git a/tests/components/mixer/common.yaml b/tests/components/mixer/common.yaml new file mode 100644 index 0000000000..e171b9499c --- /dev/null +++ b/tests/components/mixer/common.yaml @@ -0,0 +1,23 @@ +esphome: + on_boot: + then: + - mixer_speaker.apply_ducking: + id: source_speaker_1_id + decibel_reduction: 10 + duration: 1s + +i2s_audio: + i2s_lrclk_pin: ${lrclk_pin} + i2s_bclk_pin: ${bclk_pin} + i2s_mclk_pin: ${mclk_pin} + +speaker: + - platform: i2s_audio + id: speaker_id + dac_type: external + i2s_dout_pin: ${dout_pin} + - platform: mixer + output_speaker: speaker_id + source_speakers: + - id: source_speaker_1_id + - id: source_speaker_2_id diff --git a/tests/components/mixer/test.esp32-ard.yaml b/tests/components/mixer/test.esp32-ard.yaml new file mode 100644 index 0000000000..96d2d37458 --- /dev/null +++ b/tests/components/mixer/test.esp32-ard.yaml @@ -0,0 +1,7 @@ +substitutions: + lrclk_pin: GPIO16 + bclk_pin: GPIO17 + mclk_pin: GPIO15 + dout_pin: GPIO14 + +<<: !include common.yaml diff --git a/tests/components/mixer/test.esp32-c3-ard.yaml b/tests/components/mixer/test.esp32-c3-ard.yaml new file mode 100644 index 0000000000..f1721f0862 --- /dev/null +++ b/tests/components/mixer/test.esp32-c3-ard.yaml @@ -0,0 +1,7 @@ +substitutions: + lrclk_pin: GPIO4 + bclk_pin: GPIO5 + mclk_pin: GPIO6 + dout_pin: GPIO7 + +<<: !include common.yaml diff --git a/tests/components/mixer/test.esp32-c3-idf.yaml b/tests/components/mixer/test.esp32-c3-idf.yaml new file mode 100644 index 
0000000000..f1721f0862 --- /dev/null +++ b/tests/components/mixer/test.esp32-c3-idf.yaml @@ -0,0 +1,7 @@ +substitutions: + lrclk_pin: GPIO4 + bclk_pin: GPIO5 + mclk_pin: GPIO6 + dout_pin: GPIO7 + +<<: !include common.yaml diff --git a/tests/components/mixer/test.esp32-idf.yaml b/tests/components/mixer/test.esp32-idf.yaml new file mode 100644 index 0000000000..96d2d37458 --- /dev/null +++ b/tests/components/mixer/test.esp32-idf.yaml @@ -0,0 +1,7 @@ +substitutions: + lrclk_pin: GPIO16 + bclk_pin: GPIO17 + mclk_pin: GPIO15 + dout_pin: GPIO14 + +<<: !include common.yaml diff --git a/tests/components/mixer/test.esp32-s3-ard.yaml b/tests/components/mixer/test.esp32-s3-ard.yaml new file mode 100644 index 0000000000..f1721f0862 --- /dev/null +++ b/tests/components/mixer/test.esp32-s3-ard.yaml @@ -0,0 +1,7 @@ +substitutions: + lrclk_pin: GPIO4 + bclk_pin: GPIO5 + mclk_pin: GPIO6 + dout_pin: GPIO7 + +<<: !include common.yaml diff --git a/tests/components/mixer/test.esp32-s3-idf.yaml b/tests/components/mixer/test.esp32-s3-idf.yaml new file mode 100644 index 0000000000..f1721f0862 --- /dev/null +++ b/tests/components/mixer/test.esp32-s3-idf.yaml @@ -0,0 +1,7 @@ +substitutions: + lrclk_pin: GPIO4 + bclk_pin: GPIO5 + mclk_pin: GPIO6 + dout_pin: GPIO7 + +<<: !include common.yaml