#pragma once #include "esphome/core/defines.h" #ifdef USE_VOICE_ASSISTANT #include "esphome/core/automation.h" #include "esphome/core/component.h" #include "esphome/core/helpers.h" #include "esphome/core/ring_buffer.h" #include "esphome/components/api/api_connection.h" #include "esphome/components/api/api_pb2.h" #include "esphome/components/microphone/microphone.h" #ifdef USE_SPEAKER #include "esphome/components/speaker/speaker.h" #endif #ifdef USE_MEDIA_PLAYER #include "esphome/components/media_player/media_player.h" #endif #include "esphome/components/socket/socket.h" #ifdef USE_ESP_ADF #include #endif #include #include namespace esphome { namespace voice_assistant { // Version 1: Initial version // Version 2: Adds raw speaker support static const uint32_t LEGACY_INITIAL_VERSION = 1; static const uint32_t LEGACY_SPEAKER_SUPPORT = 2; enum VoiceAssistantFeature : uint32_t { FEATURE_VOICE_ASSISTANT = 1 << 0, FEATURE_SPEAKER = 1 << 1, FEATURE_API_AUDIO = 1 << 2, FEATURE_TIMERS = 1 << 3, }; enum class State { IDLE, START_MICROPHONE, STARTING_MICROPHONE, WAIT_FOR_VAD, WAITING_FOR_VAD, START_PIPELINE, STARTING_PIPELINE, STREAMING_MICROPHONE, STOP_MICROPHONE, STOPPING_MICROPHONE, AWAITING_RESPONSE, STREAMING_RESPONSE, RESPONSE_FINISHED, }; enum AudioMode : uint8_t { AUDIO_MODE_UDP, AUDIO_MODE_API, }; struct Timer { std::string id; std::string name; uint32_t total_seconds; uint32_t seconds_left; bool is_active; std::string to_string() const { return str_sprintf("Timer(id=%s, name=%s, total_seconds=%" PRIu32 ", seconds_left=%" PRIu32 ", is_active=%s)", this->id.c_str(), this->name.c_str(), this->total_seconds, this->seconds_left, YESNO(this->is_active)); } }; struct WakeWord { std::string id; std::string wake_word; std::vector trained_languages; }; struct Configuration { std::vector available_wake_words; std::vector active_wake_words; uint32_t max_active_wake_words; }; class VoiceAssistant : public Component { public: VoiceAssistant(); void loop() override; float get_setup_priority() const override; void start_streaming(); void start_streaming(struct sockaddr_storage *addr, uint16_t port); void failed_to_start(); void set_microphone(microphone::Microphone *mic) { this->mic_ = mic; } #ifdef USE_SPEAKER void set_speaker(speaker::Speaker *speaker) { this->speaker_ = speaker; this->local_output_ = true; } #endif #ifdef USE_MEDIA_PLAYER void set_media_player(media_player::MediaPlayer *media_player) { this->media_player_ = media_player; this->local_output_ = true; } #endif uint32_t get_legacy_version() const { #ifdef USE_SPEAKER if (this->speaker_ != nullptr) { return LEGACY_SPEAKER_SUPPORT; } #endif return LEGACY_INITIAL_VERSION; } uint32_t get_feature_flags() const { uint32_t flags = 0; flags |= VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT; flags |= VoiceAssistantFeature::FEATURE_API_AUDIO; #ifdef USE_SPEAKER if (this->speaker_ != nullptr) { flags |= VoiceAssistantFeature::FEATURE_SPEAKER; } #endif if (this->has_timers_) { flags |= VoiceAssistantFeature::FEATURE_TIMERS; } return flags; } void request_start(bool continuous, bool silence_detection); void request_stop(); void on_event(const api::VoiceAssistantEventResponse &msg); void on_audio(const api::VoiceAssistantAudio &msg); void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg); void on_announce(const api::VoiceAssistantAnnounceRequest &msg); void on_set_configuration(const std::vector &active_wake_words){}; const Configuration &get_configuration() { return this->config_; }; bool is_running() const { return this->state_ != State::IDLE; } void set_continuous(bool continuous) { this->continuous_ = continuous; } bool is_continuous() const { return this->continuous_; } void set_use_wake_word(bool use_wake_word) { this->use_wake_word_ = use_wake_word; } #ifdef USE_ESP_ADF void set_vad_threshold(uint8_t vad_threshold) { this->vad_threshold_ = vad_threshold; } #endif void set_noise_suppression_level(uint8_t noise_suppression_level) { this->noise_suppression_level_ = noise_suppression_level; } void set_auto_gain(uint8_t auto_gain) { this->auto_gain_ = auto_gain; } void set_volume_multiplier(float volume_multiplier) { this->volume_multiplier_ = volume_multiplier; } void set_conversation_timeout(uint32_t conversation_timeout) { this->conversation_timeout_ = conversation_timeout; } void reset_conversation_id(); Trigger<> *get_intent_end_trigger() const { return this->intent_end_trigger_; } Trigger<> *get_intent_start_trigger() const { return this->intent_start_trigger_; } Trigger<> *get_listening_trigger() const { return this->listening_trigger_; } Trigger<> *get_end_trigger() const { return this->end_trigger_; } Trigger<> *get_start_trigger() const { return this->start_trigger_; } Trigger<> *get_stt_vad_end_trigger() const { return this->stt_vad_end_trigger_; } Trigger<> *get_stt_vad_start_trigger() const { return this->stt_vad_start_trigger_; } #ifdef USE_SPEAKER Trigger<> *get_tts_stream_start_trigger() const { return this->tts_stream_start_trigger_; } Trigger<> *get_tts_stream_end_trigger() const { return this->tts_stream_end_trigger_; } #endif Trigger<> *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; } Trigger *get_stt_end_trigger() const { return this->stt_end_trigger_; } Trigger *get_tts_end_trigger() const { return this->tts_end_trigger_; } Trigger *get_tts_start_trigger() const { return this->tts_start_trigger_; } Trigger *get_error_trigger() const { return this->error_trigger_; } Trigger<> *get_idle_trigger() const { return this->idle_trigger_; } Trigger<> *get_client_connected_trigger() const { return this->client_connected_trigger_; } Trigger<> *get_client_disconnected_trigger() const { return this->client_disconnected_trigger_; } void client_subscription(api::APIConnection *client, bool subscribe); api::APIConnection *get_api_connection() const { return this->api_client_; } void set_wake_word(const std::string &wake_word) { this->wake_word_ = wake_word; } Trigger *get_timer_started_trigger() const { return this->timer_started_trigger_; } Trigger *get_timer_updated_trigger() const { return this->timer_updated_trigger_; } Trigger *get_timer_cancelled_trigger() const { return this->timer_cancelled_trigger_; } Trigger *get_timer_finished_trigger() const { return this->timer_finished_trigger_; } Trigger> *get_timer_tick_trigger() const { return this->timer_tick_trigger_; } void set_has_timers(bool has_timers) { this->has_timers_ = has_timers; } const std::unordered_map &get_timers() const { return this->timers_; } protected: bool allocate_buffers_(); void clear_buffers_(); void deallocate_buffers_(); int read_microphone_(); void set_state_(State state); void set_state_(State state, State desired_state); void signal_stop_(); std::unique_ptr socket_ = nullptr; struct sockaddr_storage dest_addr_; Trigger<> *intent_end_trigger_ = new Trigger<>(); Trigger<> *intent_start_trigger_ = new Trigger<>(); Trigger<> *listening_trigger_ = new Trigger<>(); Trigger<> *end_trigger_ = new Trigger<>(); Trigger<> *start_trigger_ = new Trigger<>(); Trigger<> *stt_vad_start_trigger_ = new Trigger<>(); Trigger<> *stt_vad_end_trigger_ = new Trigger<>(); #ifdef USE_SPEAKER Trigger<> *tts_stream_start_trigger_ = new Trigger<>(); Trigger<> *tts_stream_end_trigger_ = new Trigger<>(); #endif Trigger<> *wake_word_detected_trigger_ = new Trigger<>(); Trigger *stt_end_trigger_ = new Trigger(); Trigger *tts_end_trigger_ = new Trigger(); Trigger *tts_start_trigger_ = new Trigger(); Trigger *error_trigger_ = new Trigger(); Trigger<> *idle_trigger_ = new Trigger<>(); Trigger<> *client_connected_trigger_ = new Trigger<>(); Trigger<> *client_disconnected_trigger_ = new Trigger<>(); api::APIConnection *api_client_{nullptr}; std::unordered_map timers_; void timer_tick_(); Trigger *timer_started_trigger_ = new Trigger(); Trigger *timer_finished_trigger_ = new Trigger(); Trigger *timer_updated_trigger_ = new Trigger(); Trigger *timer_cancelled_trigger_ = new Trigger(); Trigger> *timer_tick_trigger_ = new Trigger>(); bool has_timers_{false}; bool timer_tick_running_{false}; microphone::Microphone *mic_{nullptr}; #ifdef USE_SPEAKER void write_speaker_(); speaker::Speaker *speaker_{nullptr}; uint8_t *speaker_buffer_{nullptr}; size_t speaker_buffer_index_{0}; size_t speaker_buffer_size_{0}; size_t speaker_bytes_received_{0}; bool wait_for_stream_end_{false}; bool stream_ended_{false}; #endif #ifdef USE_MEDIA_PLAYER media_player::MediaPlayer *media_player_{nullptr}; #endif bool local_output_{false}; std::string conversation_id_{""}; std::string wake_word_{""}; HighFrequencyLoopRequester high_freq_; #ifdef USE_ESP_ADF vad_handle_t vad_instance_; uint8_t vad_threshold_{5}; uint8_t vad_counter_{0}; #endif std::unique_ptr ring_buffer_; bool use_wake_word_; uint8_t noise_suppression_level_; uint8_t auto_gain_; float volume_multiplier_; uint32_t conversation_timeout_; uint8_t *send_buffer_{nullptr}; int16_t *input_buffer_{nullptr}; bool continuous_{false}; bool silence_detection_; State state_{State::IDLE}; State desired_state_{State::IDLE}; AudioMode audio_mode_{AUDIO_MODE_UDP}; bool udp_socket_running_{false}; bool start_udp_socket_(); Configuration config_{}; }; template class StartAction : public Action, public Parented { TEMPLATABLE_VALUE(std::string, wake_word); public: void play(Ts... x) override { this->parent_->set_wake_word(this->wake_word_.value(x...)); this->parent_->request_start(false, this->silence_detection_); } void set_silence_detection(bool silence_detection) { this->silence_detection_ = silence_detection; } protected: bool silence_detection_; }; template class StartContinuousAction : public Action, public Parented { public: void play(Ts... x) override { this->parent_->request_start(true, true); } }; template class StopAction : public Action, public Parented { public: void play(Ts... x) override { this->parent_->request_stop(); } }; template class IsRunningCondition : public Condition, public Parented { public: bool check(Ts... x) override { return this->parent_->is_running() || this->parent_->is_continuous(); } }; template class ConnectedCondition : public Condition, public Parented { public: bool check(Ts... x) override { return this->parent_->get_api_connection() != nullptr; } }; extern VoiceAssistant *global_voice_assistant; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) } // namespace voice_assistant } // namespace esphome #endif // USE_VOICE_ASSISTANT