[i2s_audio, microphone, micro_wake_word, voice_assistant] Use microphone source to process incoming audio (#8645)

Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
2025-09-25 06:32:22 +01:00 · 2025-04-29 17:27:03 -05:00
parent 0fe6c65ba3
commit 9f629dcaa2
15 changed files with 166 additions and 98 deletions
--- a/esphome/components/i2s_audio/microphone/__init__.py
+++ b/esphome/components/i2s_audio/microphone/__init__.py
@@ -1,13 +1,20 @@
 from esphome import pins
 import esphome.codegen as cg
-from esphome.components import esp32, microphone
+from esphome.components import audio, esp32, microphone
 from esphome.components.adc import ESP32_VARIANT_ADC1_PIN_TO_CHANNEL, validate_adc_pin
 import esphome.config_validation as cv
-from esphome.const import CONF_ID, CONF_NUMBER
+from esphome.const import (
+    CONF_BITS_PER_SAMPLE,
+    CONF_CHANNEL,
+    CONF_ID,
+    CONF_NUM_CHANNELS,
+    CONF_NUMBER,
+    CONF_SAMPLE_RATE,
+)

 from .. import (
-    CONF_CHANNEL,
    CONF_I2S_DIN_PIN,
+    CONF_LEFT,
    CONF_MONO,
    CONF_RIGHT,
    I2SAudioIn,
@@ -32,7 +39,7 @@ INTERNAL_ADC_VARIANTS = [esp32.const.VARIANT_ESP32]
 PDM_VARIANTS = [esp32.const.VARIANT_ESP32, esp32.const.VARIANT_ESP32S3]


-def validate_esp32_variant(config):
+def _validate_esp32_variant(config):
    variant = esp32.get_esp32_variant()
    if config[CONF_ADC_TYPE] == "external":
        if config[CONF_PDM]:
@@ -46,12 +53,34 @@ def validate_esp32_variant(config):
    raise NotImplementedError


-def validate_channel(config):
+def _validate_channel(config):
    if config[CONF_CHANNEL] == CONF_MONO:
        raise cv.Invalid(f"I2S microphone does not support {CONF_MONO}.")
    return config


+def _set_num_channels_from_config(config):
+    if config[CONF_CHANNEL] in (CONF_LEFT, CONF_RIGHT):
+        config[CONF_NUM_CHANNELS] = 1
+    else:
+        config[CONF_NUM_CHANNELS] = 2
+
+    return config
+
+
+def _set_stream_limits(config):
+    audio.set_stream_limits(
+        min_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
+        max_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
+        min_channels=config.get(CONF_NUM_CHANNELS),
+        max_channels=config.get(CONF_NUM_CHANNELS),
+        min_sample_rate=config.get(CONF_SAMPLE_RATE),
+        max_sample_rate=config.get(CONF_SAMPLE_RATE),
+    )(config)
+
+    return config
+
+
 BASE_SCHEMA = microphone.MICROPHONE_SCHEMA.extend(
    i2s_audio_component_schema(
        I2SAudioMicrophone,
@@ -79,8 +108,10 @@ CONFIG_SCHEMA = cv.All(
        },
        key=CONF_ADC_TYPE,
    ),
-    validate_esp32_variant,
-    validate_channel,
+    _validate_esp32_variant,
+    _validate_channel,
+    _set_num_channels_from_config,
+    _set_stream_limits,
 )


--- a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp
+++ b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp
@@ -56,6 +56,35 @@ void I2SAudioMicrophone::start_() {
  }
  esp_err_t err;

+  uint8_t channel_count = 1;
+#ifdef USE_I2S_LEGACY
+  uint8_t bits_per_sample = this->bits_per_sample_;
+
+  if (this->channel_ == I2S_CHANNEL_FMT_RIGHT_LEFT) {
+    channel_count = 2;
+  }
+#else
+  if (this->slot_bit_width_ == I2S_SLOT_BIT_WIDTH_AUTO) {
+    this->slot_bit_width_ = I2S_SLOT_BIT_WIDTH_16BIT;
+  }
+  uint8_t bits_per_sample = this->slot_bit_width_;
+
+  if (this->slot_mode_ == I2S_SLOT_MODE_STEREO) {
+    channel_count = 2;
+  }
+#endif
+
+#ifdef USE_ESP32_VARIANT_ESP32
+  // ESP32 reads audio aligned to a multiple of 2 bytes. For example, if configured for 24 bits per sample, then it will
+  // produce 32 bits per sample, where the actual data is in the most significant bits. Other ESP32 variants produce 24
+  // bits per sample in this situation.
+  if (bits_per_sample < 16) {
+    bits_per_sample = 16;
+  } else if ((bits_per_sample > 16) && (bits_per_sample <= 32)) {
+    bits_per_sample = 32;
+  }
+#endif
+
 #ifdef USE_I2S_LEGACY
  i2s_driver_config_t config = {
      .mode = (i2s_mode_t) (this->i2s_mode_ | I2S_MODE_RX),
@@ -144,6 +173,8 @@ void I2SAudioMicrophone::start_() {
  i2s_std_gpio_config_t pin_config = this->parent_->get_pin_config();
 #if SOC_I2S_SUPPORTS_PDM_RX
  if (this->pdm_) {
+    bits_per_sample = 16;  // PDM mics are always 16 bits per sample with the IDF 5 driver
+
    i2s_pdm_rx_clk_config_t clk_cfg = {
        .sample_rate_hz = this->sample_rate_,
        .clk_src = clk_src,
@@ -187,13 +218,8 @@ void I2SAudioMicrophone::start_() {
        .clk_src = clk_src,
        .mclk_multiple = I2S_MCLK_MULTIPLE_256,
    };
-    i2s_data_bit_width_t data_bit_width;
-    if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_8BIT) {
-      data_bit_width = I2S_DATA_BIT_WIDTH_16BIT;
-    } else {
-      data_bit_width = I2S_DATA_BIT_WIDTH_8BIT;
-    }
-    i2s_std_slot_config_t std_slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(data_bit_width, this->slot_mode_);
+    i2s_std_slot_config_t std_slot_cfg =
+        I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) this->slot_bit_width_, this->slot_mode_);
    std_slot_cfg.slot_bit_width = this->slot_bit_width_;
    std_slot_cfg.slot_mask = this->std_slot_mask_;

@@ -222,6 +248,8 @@ void I2SAudioMicrophone::start_() {
  }
 #endif

+  this->audio_stream_info_ = audio::AudioStreamInfo(bits_per_sample, channel_count, this->sample_rate_);
+
  this->state_ = microphone::STATE_RUNNING;
  this->high_freq_.start();
  this->status_clear_error();
@@ -284,7 +312,7 @@ void I2SAudioMicrophone::stop_() {
  this->status_clear_error();
 }

-size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wait) {
+size_t I2SAudioMicrophone::read_(uint8_t *buf, size_t len, TickType_t ticks_to_wait) {
  size_t bytes_read = 0;
 #ifdef USE_I2S_LEGACY
  esp_err_t err = i2s_read(this->parent_->get_port(), buf, len, &bytes_read, ticks_to_wait);
@@ -303,38 +331,7 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wa
    return 0;
  }
  this->status_clear_warning();
-  // ESP-IDF I2S implementation right-extends 8-bit data to 16 bits,
-  // and 24-bit data to 32 bits.
-#ifdef USE_I2S_LEGACY
-  switch (this->bits_per_sample_) {
-    case I2S_BITS_PER_SAMPLE_8BIT:
-    case I2S_BITS_PER_SAMPLE_16BIT:
-      return bytes_read;
-    case I2S_BITS_PER_SAMPLE_24BIT:
-    case I2S_BITS_PER_SAMPLE_32BIT: {
-      size_t samples_read = bytes_read / sizeof(int32_t);
-      for (size_t i = 0; i < samples_read; i++) {
-        int32_t temp = reinterpret_cast<int32_t *>(buf)[i] >> 14;
-        buf[i] = clamp<int16_t>(temp, INT16_MIN, INT16_MAX);
-      }
-      return samples_read * sizeof(int16_t);
-    }
-    default:
-      ESP_LOGE(TAG, "Unsupported bits per sample: %d", this->bits_per_sample_);
-      return 0;
-  }
-#else
-#ifndef USE_ESP32_VARIANT_ESP32
-  // For newer ESP32 variants 8 bit data needs to be extended to 16 bit.
-  if (this->slot_bit_width_ == I2S_SLOT_BIT_WIDTH_8BIT) {
-    size_t samples_read = bytes_read / sizeof(int8_t);
-    for (size_t i = samples_read - 1; i >= 0; i--) {
-      int16_t temp = static_cast<int16_t>(reinterpret_cast<int8_t *>(buf)[i]) << 8;
-      buf[i] = temp;
-    }
-    return samples_read * sizeof(int16_t);
-  }
-#else
+#if defined(USE_ESP32_VARIANT_ESP32) and not defined(USE_I2S_LEGACY)
  // For ESP32 8/16 bit standard mono mode samples need to be switched.
  if (this->slot_mode_ == I2S_SLOT_MODE_MONO && this->slot_bit_width_ <= 16 && !this->pdm_) {
    size_t samples_read = bytes_read / sizeof(int16_t);
@@ -346,14 +343,14 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len, TickType_t ticks_to_wa
  }
 #endif
  return bytes_read;
-#endif
 }

 void I2SAudioMicrophone::read_() {
-  std::vector<int16_t> samples;
-  samples.resize(BUFFER_SIZE);
-  size_t bytes_read = this->read(samples.data(), BUFFER_SIZE * sizeof(int16_t), 0);
-  samples.resize(bytes_read / sizeof(int16_t));
+  std::vector<uint8_t> samples;
+  const size_t bytes_to_read = this->audio_stream_info_.ms_to_bytes(32);
+  samples.resize(bytes_to_read);
+  size_t bytes_read = this->read_(samples.data(), bytes_to_read, 0);
+  samples.resize(bytes_read);
  this->data_callbacks_.call(samples);
 }

--- a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.h
+++ b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.h
@@ -25,9 +25,6 @@ class I2SAudioMicrophone : public I2SAudioIn, public microphone::Microphone, pub

  void set_pdm(bool pdm) { this->pdm_ = pdm; }

-  size_t read(int16_t *buf, size_t len, TickType_t ticks_to_wait);
-  size_t read(int16_t *buf, size_t len) override { return this->read(buf, len, pdMS_TO_TICKS(100)); }
-
 #ifdef USE_I2S_LEGACY
 #if SOC_I2S_SUPPORTS_ADC
  void set_adc_channel(adc1_channel_t channel) {
@@ -41,6 +38,7 @@ class I2SAudioMicrophone : public I2SAudioIn, public microphone::Microphone, pub
  void start_();
  void stop_();
  void read_();
+  size_t read_(uint8_t *buf, size_t len, TickType_t ticks_to_wait);

 #ifdef USE_I2S_LEGACY
  int8_t din_pin_{I2S_PIN_NO_CHANGE};