From 9f121e6016921591d26d08dbe5340d6e5e3766be Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Sun, 17 Mar 2024 15:13:55 -0400 Subject: [PATCH] microWakeWord - add new ops and small improvements (#6360) --- .../micro_wake_word/micro_wake_word.cpp | 66 ++++++++----------- .../micro_wake_word/micro_wake_word.h | 3 +- 2 files changed, 27 insertions(+), 42 deletions(-) diff --git a/esphome/components/micro_wake_word/micro_wake_word.cpp b/esphome/components/micro_wake_word/micro_wake_word.cpp index 7321e5b05b..f637f8b2bb 100644 --- a/esphome/components/micro_wake_word/micro_wake_word.cpp +++ b/esphome/components/micro_wake_word/micro_wake_word.cpp @@ -93,11 +93,18 @@ int MicroWakeWord::read_microphone_() { return 0; } - size_t bytes_written = this->ring_buffer_->write((void *) this->input_buffer_, bytes_read); - if (bytes_written != bytes_read) { - ESP_LOGW(TAG, "Failed to write some data to ring buffer (written=%d, expected=%d)", bytes_written, bytes_read); + size_t bytes_free = this->ring_buffer_->free(); + + if (bytes_free < bytes_read) { + ESP_LOGW(TAG, + "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). " + "Resetting the ring buffer. Wake word detection accuracy will be reduced.", + bytes_free, bytes_read); + + this->ring_buffer_->reset(); } - return bytes_written; + + return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read); } void MicroWakeWord::loop() { @@ -206,12 +213,6 @@ bool MicroWakeWord::initialize_models() { return false; } - this->preprocessor_stride_buffer_ = audio_samples_allocator.allocate(HISTORY_SAMPLES_TO_KEEP); - if (this->preprocessor_stride_buffer_ == nullptr) { - ESP_LOGE(TAG, "Could not allocate the audio preprocessor's stride buffer."); - return false; - } - this->preprocessor_model_ = tflite::GetModel(G_AUDIO_PREPROCESSOR_INT8_TFLITE); if (this->preprocessor_model_->version() != TFLITE_SCHEMA_VERSION) { ESP_LOGE(TAG, "Wake word's audio preprocessor model's schema is not supported"); @@ -225,7 +226,7 @@ bool MicroWakeWord::initialize_models() { } static tflite::MicroMutableOpResolver<18> preprocessor_op_resolver; - static tflite::MicroMutableOpResolver<14> streaming_op_resolver; + static tflite::MicroMutableOpResolver<17> streaming_op_resolver; if (!this->register_preprocessor_ops_(preprocessor_op_resolver)) return false; @@ -329,7 +330,6 @@ bool MicroWakeWord::detect_wake_word_() { } // Perform inference - uint32_t streaming_size = micros(); float streaming_prob = this->perform_streaming_inference_(); // Add the most recent probability to the sliding window @@ -357,6 +357,9 @@ bool MicroWakeWord::detect_wake_word_() { for (auto &prob : this->recent_streaming_probabilities_) { prob = 0; } + + ESP_LOGD(TAG, "Wake word sliding average probability is %.3f and most recent probability is %.3f", + sliding_window_average, streaming_prob); return true; } @@ -371,23 +374,6 @@ void MicroWakeWord::set_sliding_window_average_size(size_t size) { bool MicroWakeWord::slice_available_() { size_t available = this->ring_buffer_->available(); - size_t free = this->ring_buffer_->free(); - - if (free < NEW_SAMPLES_TO_GET * sizeof(int16_t)) { - // If the ring buffer is within one audio slice of being full, then wake word detection will have issues. - // If this is constantly occuring, then some possibilities why are - // 1) there are too many other slow components configured - // 2) the ESP32 isn't fast enough; e.g., an ESP32 is much slower than an ESP32-S3 at inferences. - // 3) the model is too large - // 4) the model uses operations that are not optimized - ESP_LOGW(TAG, - "Audio buffer is nearly full. Wake word detection may be less accurate and have slower reponse times. " -#if !defined(USE_ESP32_VARIANT_ESP32S3) - "microWakeWord is designed for the ESP32-S3. The current platform is too slow for this model." -#endif - ); - } - return available > (NEW_SAMPLES_TO_GET * sizeof(int16_t)); } @@ -396,13 +382,12 @@ bool MicroWakeWord::stride_audio_samples_(int16_t **audio_samples) { return false; } - // Copy 320 bytes (160 samples over 10 ms) into preprocessor_audio_buffer_ from history in - // preprocessor_stride_buffer_ - memcpy((void *) (this->preprocessor_audio_buffer_), (void *) (this->preprocessor_stride_buffer_), + // Copy the last 320 bytes (160 samples over 10 ms) from the audio buffer to the start of the audio buffer + memcpy((void *) (this->preprocessor_audio_buffer_), (void *) (this->preprocessor_audio_buffer_ + NEW_SAMPLES_TO_GET), HISTORY_SAMPLES_TO_KEEP * sizeof(int16_t)); - // Copy 640 bytes (320 samples over 20 ms) from the ring buffer - // The first 320 bytes (160 samples over 10 ms) will be from history + // Copy 640 bytes (320 samples over 20 ms) from the ring buffer into the audio buffer offset 320 bytes (160 samples + // over 10 ms) size_t bytes_read = this->ring_buffer_->read((void *) (this->preprocessor_audio_buffer_ + HISTORY_SAMPLES_TO_KEEP), NEW_SAMPLES_TO_GET * sizeof(int16_t), pdMS_TO_TICKS(200)); @@ -415,11 +400,6 @@ bool MicroWakeWord::stride_audio_samples_(int16_t **audio_samples) { return false; } - // Copy the last 320 bytes (160 samples over 10 ms) from the audio buffer into history stride buffer for the next - // iteration - memcpy((void *) (this->preprocessor_stride_buffer_), (void *) (this->preprocessor_audio_buffer_ + NEW_SAMPLES_TO_GET), - HISTORY_SAMPLES_TO_KEEP * sizeof(int16_t)); - *audio_samples = this->preprocessor_audio_buffer_; return true; } @@ -480,7 +460,7 @@ bool MicroWakeWord::register_preprocessor_ops_(tflite::MicroMutableOpResolver<18 return true; } -bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<14> &op_resolver) { +bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<17> &op_resolver) { if (op_resolver.AddCallOnce() != kTfLiteOk) return false; if (op_resolver.AddVarHandle() != kTfLiteOk) @@ -509,6 +489,12 @@ bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<14> & return false; if (op_resolver.AddQuantize() != kTfLiteOk) return false; + if (op_resolver.AddDepthwiseConv2D() != kTfLiteOk) + return false; + if (op_resolver.AddAveragePool2D() != kTfLiteOk) + return false; + if (op_resolver.AddMaxPool2D() != kTfLiteOk) + return false; return true; } diff --git a/esphome/components/micro_wake_word/micro_wake_word.h b/esphome/components/micro_wake_word/micro_wake_word.h index 27d05c3e09..1d7c18d686 100644 --- a/esphome/components/micro_wake_word/micro_wake_word.h +++ b/esphome/components/micro_wake_word/micro_wake_word.h @@ -128,7 +128,6 @@ class MicroWakeWord : public Component { // Stores audio fed into feature generator preprocessor int16_t *preprocessor_audio_buffer_; - int16_t *preprocessor_stride_buffer_; bool detected_{false}; @@ -181,7 +180,7 @@ class MicroWakeWord : public Component { bool register_preprocessor_ops_(tflite::MicroMutableOpResolver<18> &op_resolver); /// @brief Returns true if successfully registered the streaming model's TensorFlow operations - bool register_streaming_ops_(tflite::MicroMutableOpResolver<14> &op_resolver); + bool register_streaming_ops_(tflite::MicroMutableOpResolver<17> &op_resolver); }; template class StartAction : public Action, public Parented {