[audio, microphone] Add MicrophoneSource helper class (#8641)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
2025-10-31 15:12:06 +00:00 · 2025-04-28 19:05:07 -05:00
parent 43580739ac
commit 844569e96b
8 changed files with 365 additions and 39 deletions
--- a/2
+++ b/2
@@ -278,7 +278,7 @@ esphome/components/mdns/* @esphome/core
 esphome/components/media_player/* @jesserockz
 esphome/components/micro_wake_word/* @jesserockz @kahrendt
 esphome/components/micronova/* @jorre05
-esphome/components/microphone/* @jesserockz
+esphome/components/microphone/* @jesserockz @kahrendt
 esphome/components/mics_4514/* @jesserockz
 esphome/components/midea/* @dudanov
 esphome/components/midea_ir/* @dudanov
--- a/esphome/components/audio/init.py
+++ b/esphome/components/audio/init.py
@@ -48,6 +48,12 @@ def set_stream_limits(
    min_sample_rate: int = _UNDEF,
    max_sample_rate: int = _UNDEF,
 ):
    """Sets the limits for the audio stream that audio component can handle
    When the component sinks audio (e.g., a speaker), these indicate the limits to the audio it can receive.
    When the component sources audio (e.g., a microphone), these indicate the limits to the audio it can send.
    """
    def set_limits_in_config(config):
        if min_bits_per_sample is not _UNDEF:
            config[CONF_MIN_BITS_PER_SAMPLE] = min_bits_per_sample
@@ -69,43 +75,87 @@ def final_validate_audio_schema(
    name: str,
    *,
    audio_device: str,
-    bits_per_sample: int,
+    bits_per_sample: int = _UNDEF,
-    channels: int,
+    channels: int = _UNDEF,
-    sample_rate: int,
+    sample_rate: int = _UNDEF,
    enabled_channels: list[int] = _UNDEF,
    audio_device_issue: bool = False,
 ):
    """Validates audio compatibility when passed between different components.
    The component derived from ``AUDIO_COMPONENT_SCHEMA`` should call ``set_stream_limits`` in a validator to specify its compatible settings
      - If audio_device_issue is True, then the error message indicates the user should adjust the AUDIO_COMPONENT_SCHEMA derived component's configuration to match the values passed to this function
      - If audio_device_issue is False, then the error message indicates the user should adjust the configuration of the component calling this function, as it falls out of the valid stream limits
    Args:
        name (str): Friendly name of the component calling this function with an audio component to validate
        audio_device (str): The configuration parameter name that contains the ID of an AUDIO_COMPONENT_SCHEMA derived component to validate against
        bits_per_sample (int, optional): The desired bits per sample
        channels (int, optional): The desired number of channels
        sample_rate (int, optional): The desired sample rate
        enabled_channels (list[int], optional): The desired enabled channels
        audio_device_issue (bool, optional): Format the error message to indicate the problem is in the configuration for the ``audio_device`` component. Defaults to False.
    """
    def validate_audio_compatiblity(audio_config):
        """Checks each requested setting against the stream limits stored in ``audio_config``.
        Settings left as ``_UNDEF`` are skipped. Raises ``cv.Invalid`` for the first setting outside the limits;
        otherwise returns ``audio_config`` after passing it through a schema that allows extra keys.
        """
        audio_schema = {}
        if bits_per_sample is not _UNDEF:
            try:
                cv.int_range(
                    min=audio_config.get(CONF_MIN_BITS_PER_SAMPLE),
                    max=audio_config.get(CONF_MAX_BITS_PER_SAMPLE),
                )(bits_per_sample)
            except cv.Invalid as exc:
-            raise cv.Invalid(
+                if audio_device_issue:
-                f"Invalid configuration for the {name} component. The {CONF_BITS_PER_SAMPLE} {str(exc)}"
+                    error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires {bits_per_sample} bits per sample."
-            ) from exc
+                else:
                    error_string = f"Invalid configuration for the {name} component. The {CONF_BITS_PER_SAMPLE} {str(exc)}"
                raise cv.Invalid(error_string) from exc
        if channels is not _UNDEF:
            try:
                cv.int_range(
                    min=audio_config.get(CONF_MIN_CHANNELS),
                    max=audio_config.get(CONF_MAX_CHANNELS),
                )(channels)
            except cv.Invalid as exc:
-            raise cv.Invalid(
+                if audio_device_issue:
-                f"Invalid configuration for the {name} component. The {CONF_NUM_CHANNELS} {str(exc)}"
+                    error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires {channels} channels."
-            ) from exc
+                else:
                    error_string = f"Invalid configuration for the {name} component. The {CONF_NUM_CHANNELS} {str(exc)}"
                raise cv.Invalid(error_string) from exc
        if sample_rate is not _UNDEF:
            try:
                cv.int_range(
                    min=audio_config.get(CONF_MIN_SAMPLE_RATE),
                    max=audio_config.get(CONF_MAX_SAMPLE_RATE),
                )(sample_rate)
            except cv.Invalid as exc:
-            raise cv.Invalid(
+                if audio_device_issue:
-                f"Invalid configuration for the {name} component. The {CONF_SAMPLE_RATE} {str(exc)}"
+                    error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires a {sample_rate} sample rate."
-            ) from exc
+                else:
                    error_string = f"Invalid configuration for the {name} component. The {CONF_SAMPLE_RATE} {str(exc)}"
                raise cv.Invalid(error_string) from exc
        if enabled_channels is not _UNDEF:
            for channel in enabled_channels:
                try:
                    # Channels are 0-indexed
                    # NOTE(review): assumes the audio component stored CONF_MAX_CHANNELS in its config - confirm
                    cv.int_range(
                        min=0,
                        max=audio_config.get(CONF_MAX_CHANNELS) - 1,
                    )(channel)
                except cv.Invalid as exc:
                    if audio_device_issue:
                        error_string = f"Invalid configuration for the specified {audio_device}. The {name} component requires channel {channel}."
                    else:
                        error_string = f"Invalid configuration for the {name} component. Enabled channel {channel} {str(exc)}"
                    raise cv.Invalid(error_string) from exc
        # Every requested setting is within the limits
        return cv.Schema(audio_schema, extra=cv.ALLOW_EXTRA)(audio_config)
    return cv.Schema(
        {
--- a/esphome/components/audio/audio_resampler.cpp
+++ b/esphome/components/audio/audio_resampler.cpp
@@ -4,6 +4,8 @@
 #include "esphome/core/hal.h"
 #include <cstring>
 namespace esphome {
 namespace audio {
--- a/esphome/components/audio/audio_resampler.h
+++ b/esphome/components/audio/audio_resampler.h
@@ -6,6 +6,7 @@
 #include "audio_transfer_buffer.h"
 #include "esphome/core/defines.h"
 #include "esphome/core/helpers.h"
 #include "esphome/core/ring_buffer.h"
 #ifdef USE_SPEAKER
--- a/esphome/components/microphone/init.py
+++ b/esphome/components/microphone/init.py
@@ -1,12 +1,21 @@
 from esphome import automation
 from esphome.automation import maybe_simple_id
 import esphome.codegen as cg
 from esphome.components import audio
 import esphome.config_validation as cv
-from esphome.const import CONF_ID, CONF_TRIGGER_ID
+from esphome.const import (
    CONF_BITS_PER_SAMPLE,
    CONF_CHANNELS,
    CONF_GAIN_FACTOR,
    CONF_ID,
    CONF_MICROPHONE,
    CONF_TRIGGER_ID,
 )
 from esphome.core import CORE
 from esphome.coroutine import coroutine_with_priority
-CODEOWNERS = ["@jesserockz"]
+AUTO_LOAD = ["audio"]
 CODEOWNERS = ["@jesserockz", "@kahrendt"]
 IS_PLATFORM_COMPONENT = True
@@ -15,6 +24,7 @@ CONF_ON_DATA = "on_data"
 microphone_ns = cg.esphome_ns.namespace("microphone")
 Microphone = microphone_ns.class_("Microphone")
 MicrophoneSource = microphone_ns.class_("MicrophoneSource")
 CaptureAction = microphone_ns.class_(
    "CaptureAction", automation.Action, cg.Parented.template(Microphone)
@@ -37,6 +47,7 @@ IsCapturingCondition = microphone_ns.class_(
 async def setup_microphone_core_(var, config):
    for conf in config.get(CONF_ON_DATA, []):
        trigger = cg.new_Pvariable(conf[CONF_TRIGGER_ID], var)
        # Future PR will change the vector type to uint8
        await automation.build_automation(
            trigger,
            [(cg.std_vector.template(cg.int16).operator("ref").operator("const"), "x")],
@@ -50,7 +61,7 @@ async def register_microphone(var, config):
    await setup_microphone_core_(var, config)
-MICROPHONE_SCHEMA = cv.Schema(
+MICROPHONE_SCHEMA = cv.Schema.extend(audio.AUDIO_COMPONENT_SCHEMA).extend(
    {
        cv.Optional(CONF_ON_DATA): automation.validate_automation(
            {
@@ -64,7 +75,104 @@ MICROPHONE_SCHEMA = cv.Schema(
 MICROPHONE_ACTION_SCHEMA = maybe_simple_id({cv.GenerateID(): cv.use_id(Microphone)})
-async def media_player_action(config, action_id, template_arg, args):
+def microphone_source_schema(
    min_bits_per_sample: int = 16,
    max_bits_per_sample: int = 16,
    min_channels: int = 1,
    max_channels: int = 1,
 ):
    """Schema for a microphone source
    Components requesting microphone data should use this schema instead of accessing a microphone directly.
    Args:
      min_bits_per_sample (int, optional): Minimum number of bits per sample the requesting component supports. Defaults to 16.
      max_bits_per_sample (int, optional): Maximum number of bits per sample the requesting component supports. Defaults to 16.
      min_channels (int, optional): Minimum number of channels the requesting component supports. Defaults to 1.
      max_channels (int, optional): Maximum number of channels the requesting component supports. Defaults to 1.
    Returns:
      A ``cv.All`` validator wrapping the microphone source schema.
    """
    def _validate_unique_channels(config):
        """Rejects a channel list that contains the same channel more than once."""
        if len(config) != len(set(config)):
            raise cv.Invalid("Channels must be unique")
        return config
    return cv.All(
        # NOTE(review): with ``key=CONF_MICROPHONE``, a bare value presumably stands in for the microphone ID - confirm
        cv.maybe_simple_value(
            {
                cv.GenerateID(CONF_ID): cv.declare_id(MicrophoneSource),
                cv.Required(CONF_MICROPHONE): cv.use_id(Microphone),
                cv.Optional(CONF_BITS_PER_SAMPLE, default=16): cv.int_range(
                    min_bits_per_sample, max_bits_per_sample
                ),
                # Channel numbers are limited to 0-7 and the list length to min_channels..max_channels
                cv.Optional(CONF_CHANNELS, default="0"): cv.All(
                    cv.ensure_list(cv.int_range(0, 7)),
                    cv.Length(min=min_channels, max=max_channels),
                    _validate_unique_channels,
                ),
                # Integer gain between 1 and 64
                cv.Optional(CONF_GAIN_FACTOR, default="1"): cv.int_range(1, 64),
            },
            key=CONF_MICROPHONE,
        ),
    )
 _UNDEF = object()
 def final_validate_microphone_source_schema(
    component_name: str, sample_rate: int = _UNDEF
 ):
    """Builds a final validator that checks a microphone source against the microphone it references.
    The sample rate (when given) and every enabled channel are checked. Nothing else needs checking here:
      - The MicrophoneSource class converts bits per sample automatically
      - microphone_source_schema already enforces unique channels and the supported number of channels
    Args:
        component_name (str): The name of the component requesting mic audio
        sample_rate (int, optional): The sample rate the component requesting mic audio requires
    """
    def _check_microphone_compatibility(config):
        if sample_rate is not _UNDEF:
            # A sample rate mismatch has to be fixed in the microphone's configuration
            check_sample_rate = audio.final_validate_audio_schema(
                component_name,
                audio_device=CONF_MICROPHONE,
                sample_rate=sample_rate,
                audio_device_issue=True,
            )
            check_sample_rate(config)
        # An unavailable channel has to be fixed in the MicrophoneSource configuration
        check_enabled_channels = audio.final_validate_audio_schema(
            component_name,
            audio_device=CONF_MICROPHONE,
            enabled_channels=config[CONF_CHANNELS],
            audio_device_issue=False,
        )
        check_enabled_channels(config)
        return config
    return _check_microphone_compatibility
 async def microphone_source_to_code(config):
    """Generates the MicrophoneSource variable described by ``config`` and returns it.
    The source is constructed from the referenced microphone, the bits per sample and the gain factor; each
    configured channel is then enabled through ``add_channel``.
    """
    microphone_var = await cg.get_variable(config[CONF_MICROPHONE])
    bits_per_sample = config[CONF_BITS_PER_SAMPLE]
    gain_factor = config[CONF_GAIN_FACTOR]
    source_var = cg.new_Pvariable(
        config[CONF_ID], microphone_var, bits_per_sample, gain_factor
    )
    for enabled_channel in config[CONF_CHANNELS]:
        cg.add(source_var.add_channel(enabled_channel))
    return source_var
 async def microphone_action(config, action_id, template_arg, args):
    """Code generator shared by the microphone capture, stop_capture and is_capturing registrations.
    Creates the action/condition variable and parents it to the microphone given by ``config[CONF_ID]``.
    ``args`` is not used.
    """
    var = cg.new_Pvariable(action_id, template_arg)
    await cg.register_parented(var, config[CONF_ID])
    return var
@@ -72,15 +180,15 @@ async def media_player_action(config, action_id, template_arg, args):
 automation.register_action(
    "microphone.capture", CaptureAction, MICROPHONE_ACTION_SCHEMA
-)(media_player_action)
+)(microphone_action)
 automation.register_action(
    "microphone.stop_capture", StopCaptureAction, MICROPHONE_ACTION_SCHEMA
-)(media_player_action)
+)(microphone_action)
 automation.register_condition(
    "microphone.is_capturing", IsCapturingCondition, MICROPHONE_ACTION_SCHEMA
-)(media_player_action)
+)(microphone_action)
@coroutine_with_priority(100.0)
--- a/esphome/components/microphone/microphone.h
+++ b/esphome/components/microphone/microphone.h
@@ -1,5 +1,7 @@
 #pragma once
 #include "esphome/components/audio/audio.h"
 #include <cstddef>
 #include <cstdint>
 #include <functional>
@@ -28,9 +30,13 @@ class Microphone {
  bool is_running() const { return this->state_ == STATE_RUNNING; }
  bool is_stopped() const { return this->state_ == STATE_STOPPED; }
  audio::AudioStreamInfo get_audio_stream_info() { return this->audio_stream_info_; }
 protected:
  State state_{STATE_STOPPED};
  audio::AudioStreamInfo audio_stream_info_;
  CallbackManager<void(const std::vector<int16_t> &)> data_callbacks_{};
 };
--- a/esphome/components/microphone/microphone_source.cpp
+++ b/esphome/components/microphone/microphone_source.cpp
@@ -0,0 +1,96 @@
 #include "microphone_source.h"
 namespace esphome {
 namespace microphone {
 void MicrophoneSource::add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback) {
  // Wrap the requester's callback: raw microphone data is dropped while this source is disabled and is converted
  // by ``process_audio_`` before being handed over otherwise.
  std::function<void(const std::vector<uint8_t> &)> gated_callback =
      [this, data_callback](const std::vector<uint8_t> &raw_audio) {
        if (!this->enabled_) {
          return;
        }
        data_callback(this->process_audio_(raw_audio));
      };
  // Future PR will uncomment this! It requires changing the callback vector to an uint8_t in every component using a
  // mic callback.
  // this->mic_->add_data_callback(std::move(gated_callback));
 }
 // Marks this source as enabled so its wrapped callback forwards audio, then starts the underlying microphone.
 void MicrophoneSource::start() {
  this->enabled_ = true;
  this->mic_->start();
 }
 // Marks this source as disabled so its wrapped callback ignores audio, then stops the underlying microphone.
 void MicrophoneSource::stop() {
  this->enabled_ = false;
  this->mic_->stop();
 }
 // Converts raw microphone bytes into the format this source was configured for:
 //   - only channels enabled in ``channels_`` are kept
 //   - each kept sample is converted to ``bits_per_sample_`` (rounded up to whole bytes)
 //   - each kept sample is multiplied by ``gain_factor_`` and clamped to the target range
 // Samples are read and written least significant byte first. Returns the converted bytes; the result is empty when
 // the source or target sample width is not between 1 and 4 bytes.
 std::vector<uint8_t> MicrophoneSource::process_audio_(const std::vector<uint8_t> &data) {
  // Bit depth conversions are obtained by truncating bits or padding with zeros - no dithering is applied.
  // The getter returns the stream info by value, so fetch it once instead of copying it for every query.
  auto stream_info = this->mic_->get_audio_stream_info();
  const size_t source_bytes_per_sample = stream_info.samples_to_bytes(1);
  const size_t source_channels = stream_info.get_channels();
  const size_t source_bytes_per_frame = stream_info.frames_to_bytes(1);
  const uint32_t total_frames = stream_info.bytes_to_frames(data.size());
  const size_t target_bytes_per_sample = (this->bits_per_sample_ + 7) / 8;
  const size_t target_bytes_per_frame = target_bytes_per_sample * this->channels_.count();
  std::vector<uint8_t> filtered_data;
  // Each sample is packed into a 32-bit word below, so the shift amounts are only valid for 1 to 4 byte samples.
  if (source_bytes_per_sample == 0 || source_bytes_per_sample > 4 || target_bytes_per_sample == 0 ||
      target_bytes_per_sample > 4) {
    return filtered_data;
  }
  filtered_data.reserve(target_bytes_per_frame * total_frames);
  // Limits are computed in 64 bits: ``1 << 31`` does not fit in a 32-bit signed integer for 4 byte targets.
  const int64_t target_max_value = (static_cast<int64_t>(1) << (8 * target_bytes_per_sample - 1)) - 1;
  const int64_t target_min_value = -target_max_value - 1;
  for (size_t frame_index = 0; frame_index < total_frames; ++frame_index) {
    for (size_t channel_index = 0; channel_index < source_channels; ++channel_index) {
      if (this->channels_.test(channel_index)) {
        // Channel's current sample is included in the target mask. Convert bits per sample, if necessary.
        const size_t sample_index = frame_index * source_bytes_per_frame + channel_index * source_bytes_per_sample;
        // Copy the data into the most significant bits of a 32-bit word to ensure the sign bit is correct. The word
        // is unsigned while it is assembled so that no bits are shifted into the sign bit of a signed type.
        uint32_t raw_sample = 0;
        uint8_t bit_offset = (4 - source_bytes_per_sample) * 8;
        for (size_t i = 0; i < source_bytes_per_sample; ++i) {
          raw_sample |= static_cast<uint32_t>(data[sample_index + i]) << bit_offset;
          bit_offset += 8;
        }
        // Reinterpret as signed, then widen so the gain multiplication below cannot overflow before the clamp.
        int64_t sample = static_cast<int32_t>(raw_sample);
        // Shift data back to the least significant bits
        if (source_bytes_per_sample >= target_bytes_per_sample) {
          // Keep source bytes per sample of data so that the gain multiplication uses all significant bits instead of
          // shifting to the target bytes per sample immediately, potentially losing information.
          sample >>= (4 - source_bytes_per_sample) * 8;  // ``source_bytes_per_sample`` bytes of valid data
        } else {
          // Keep padded zeros to match the target bytes per sample
          sample >>= (4 - target_bytes_per_sample) * 8;  // ``target_bytes_per_sample`` bytes of valid data
        }
        // Apply gain using multiplication
        sample *= this->gain_factor_;
        // Match target output bytes by shifting out the least significant bits
        if (source_bytes_per_sample > target_bytes_per_sample) {
          sample >>= 8 * (source_bytes_per_sample -
                          target_bytes_per_sample);  //  ``target_bytes_per_sample`` bytes of valid data
        }
        // Clamp ``sample`` to the target bytes per sample range in case the gain pushed it out of range
        sample = clamp<int64_t>(sample, target_min_value, target_max_value);
        // Copy ``target_bytes_per_sample`` bytes to the output buffer, least significant byte first.
        for (size_t i = 0; i < target_bytes_per_sample; ++i) {
          filtered_data.push_back(static_cast<uint8_t>(sample));
          sample >>= 8;
        }
      }
    }
  }
  return filtered_data;
 }
 }  // namespace microphone
 }  // namespace esphome
--- a/esphome/components/microphone/microphone_source.h
+++ b/esphome/components/microphone/microphone_source.h
@@ -0,0 +1,63 @@
 #pragma once
 #include <bitset>
 #include <cstddef>
 #include <cstdint>
 #include <functional>
 #include <vector>
 #include "microphone.h"
 namespace esphome {
 namespace microphone {
 class MicrophoneSource {
  /*
   * @brief Helper class that handles converting raw microphone data to a requested format.
   * Components requesting microphone audio should register a callback through this class instead of registering a
   * callback directly with the microphone if a particular format is required.
   *
   * Raw microphone data may have a different number of bits per sample and number of channels than the requesting
   * component needs. This class handles the conversion as follows:
   *   - It internally adds a callback to receive the raw microphone data
   *   - ``process_audio_`` handles the raw data
   *     - Only the channels set in the ``channels_`` bitset are passed through
   *     - Passed through samples have the bits per sample converted
   *     - A gain factor is optionally applied to increase the volume - audio may clip!
   *   - The processed audio is passed to the callback of the component requesting microphone data
   *   - It tracks an internal enabled state, so it ignores raw microphone data when the component requesting
   *     microphone data is not actively requesting audio.
   *
   * Note that this class cannot convert sample rates!
   */
 public:
  /// @brief Creates a source bound to a microphone.
  /// @param mic Microphone that provides the raw audio
  /// @param bits_per_sample Bits per sample of the processed audio
  /// @param gain_factor Integer factor each processed sample is multiplied by
  MicrophoneSource(Microphone *mic, uint8_t bits_per_sample, int32_t gain_factor)
      : mic_(mic), bits_per_sample_(bits_per_sample), gain_factor_(gain_factor) {}
  /// @brief Enables a channel to be processed through the callback.
  ///
  /// If the microphone component only reads from one channel, it is always in channel number 0, regardless of
  /// whether it represents left or right. If the microphone reads from both left and right, channel numbers 0 and 1
  /// represent the left and right channels respectively.
  ///
  /// @param channel 0-indexed channel number to enable; must be less than 8, the size of the ``channels_`` bitset
  void add_channel(uint8_t channel) { this->channels_.set(channel); }
  /// @brief Wraps ``data_callback`` so it only receives processed audio while this source is enabled.
  /// @param data_callback Callback of the component requesting microphone data
  void add_data_callback(std::function<void(const std::vector<uint8_t> &)> &&data_callback);
  /// @brief Enables this source and starts the microphone.
  void start();
  /// @brief Disables this source and stops the microphone.
  void stop();
  /// @brief True only when the microphone is running and this source is enabled.
  bool is_running() const { return (this->mic_->is_running() && this->enabled_); }
  /// @brief True whenever this source is not enabled, regardless of the microphone's own state.
  bool is_stopped() const { return !this->enabled_; }
 protected:
  /// @brief Filters channels, converts bits per sample and applies the gain factor to raw microphone data.
  /// @param data Raw bytes received from the microphone
  /// @return The processed audio bytes
  std::vector<uint8_t> process_audio_(const std::vector<uint8_t> &data);
  // Microphone providing the raw audio
  Microphone *mic_;
  // Bits per sample of the processed audio
  uint8_t bits_per_sample_;
  // Bit ``n`` set means source channel ``n`` is passed through
  std::bitset<8> channels_;
  // Factor each processed sample is multiplied by
  int32_t gain_factor_;
  // Set by ``start()`` and cleared by ``stop()``
  bool enabled_{false};
 };
 }  // namespace microphone
 }  // namespace esphome