mirror of
				https://github.com/esphome/esphome.git
				synced 2025-10-30 22:53:59 +00:00 
			
		
		
		
	Add stream start and end events (#5545)
This commit is contained in:
		| @@ -1459,6 +1459,8 @@ enum VoiceAssistantEvent { | |||||||
|   VOICE_ASSISTANT_WAKE_WORD_END = 10; |   VOICE_ASSISTANT_WAKE_WORD_END = 10; | ||||||
|   VOICE_ASSISTANT_STT_VAD_START = 11; |   VOICE_ASSISTANT_STT_VAD_START = 11; | ||||||
|   VOICE_ASSISTANT_STT_VAD_END = 12; |   VOICE_ASSISTANT_STT_VAD_END = 12; | ||||||
|  |   VOICE_ASSISTANT_TTS_STREAM_START = 98; | ||||||
|  |   VOICE_ASSISTANT_TTS_STREAM_END = 99; | ||||||
| } | } | ||||||
|  |  | ||||||
| message VoiceAssistantEventData { | message VoiceAssistantEventData { | ||||||
|   | |||||||
| @@ -452,6 +452,10 @@ template<> const char *proto_enum_to_string<enums::VoiceAssistantEvent>(enums::V | |||||||
|       return "VOICE_ASSISTANT_STT_VAD_START"; |       return "VOICE_ASSISTANT_STT_VAD_START"; | ||||||
|     case enums::VOICE_ASSISTANT_STT_VAD_END: |     case enums::VOICE_ASSISTANT_STT_VAD_END: | ||||||
|       return "VOICE_ASSISTANT_STT_VAD_END"; |       return "VOICE_ASSISTANT_STT_VAD_END"; | ||||||
|  |     case enums::VOICE_ASSISTANT_TTS_STREAM_START: | ||||||
|  |       return "VOICE_ASSISTANT_TTS_STREAM_START"; | ||||||
|  |     case enums::VOICE_ASSISTANT_TTS_STREAM_END: | ||||||
|  |       return "VOICE_ASSISTANT_TTS_STREAM_END"; | ||||||
|     default: |     default: | ||||||
|       return "UNKNOWN"; |       return "UNKNOWN"; | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -184,6 +184,8 @@ enum VoiceAssistantEvent : uint32_t { | |||||||
|   VOICE_ASSISTANT_WAKE_WORD_END = 10, |   VOICE_ASSISTANT_WAKE_WORD_END = 10, | ||||||
|   VOICE_ASSISTANT_STT_VAD_START = 11, |   VOICE_ASSISTANT_STT_VAD_START = 11, | ||||||
|   VOICE_ASSISTANT_STT_VAD_END = 12, |   VOICE_ASSISTANT_STT_VAD_END = 12, | ||||||
|  |   VOICE_ASSISTANT_TTS_STREAM_START = 98, | ||||||
|  |   VOICE_ASSISTANT_TTS_STREAM_END = 99, | ||||||
| }; | }; | ||||||
| enum AlarmControlPanelState : uint32_t { | enum AlarmControlPanelState : uint32_t { | ||||||
|   ALARM_STATE_DISARMED = 0, |   ALARM_STATE_DISARMED = 0, | ||||||
|   | |||||||
| @@ -158,8 +158,13 @@ void I2SAudioSpeaker::watch_() { | |||||||
|   if (xQueueReceive(this->event_queue_, &event, 0) == pdTRUE) { |   if (xQueueReceive(this->event_queue_, &event, 0) == pdTRUE) { | ||||||
|     switch (event.type) { |     switch (event.type) { | ||||||
|       case TaskEventType::STARTING: |       case TaskEventType::STARTING: | ||||||
|  |         ESP_LOGD(TAG, "Starting I2S Audio Speaker"); | ||||||
|  |         break; | ||||||
|       case TaskEventType::STARTED: |       case TaskEventType::STARTED: | ||||||
|  |         ESP_LOGD(TAG, "Started I2S Audio Speaker"); | ||||||
|  |         break; | ||||||
|       case TaskEventType::STOPPING: |       case TaskEventType::STOPPING: | ||||||
|  |         ESP_LOGD(TAG, "Stopping I2S Audio Speaker"); | ||||||
|         break; |         break; | ||||||
|       case TaskEventType::PLAYING: |       case TaskEventType::PLAYING: | ||||||
|         this->status_clear_warning(); |         this->status_clear_warning(); | ||||||
| @@ -170,6 +175,7 @@ void I2SAudioSpeaker::watch_() { | |||||||
|         this->player_task_handle_ = nullptr; |         this->player_task_handle_ = nullptr; | ||||||
|         this->parent_->unlock(); |         this->parent_->unlock(); | ||||||
|         xQueueReset(this->buffer_queue_); |         xQueueReset(this->buffer_queue_); | ||||||
|  |         ESP_LOGD(TAG, "Stopped I2S Audio Speaker"); | ||||||
|         break; |         break; | ||||||
|       case TaskEventType::WARNING: |       case TaskEventType::WARNING: | ||||||
|         ESP_LOGW(TAG, "Error writing to I2S: %s", esp_err_to_name(event.err)); |         ESP_LOGW(TAG, "Error writing to I2S: %s", esp_err_to_name(event.err)); | ||||||
|   | |||||||
| @@ -281,11 +281,14 @@ void VoiceAssistant::loop() { | |||||||
|             memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written); |             memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written); | ||||||
|             this->speaker_buffer_size_ -= written; |             this->speaker_buffer_size_ -= written; | ||||||
|             this->speaker_buffer_index_ -= written; |             this->speaker_buffer_index_ -= written; | ||||||
|             this->set_timeout("speaker-timeout", 1000, [this]() { this->speaker_->stop(); }); |             this->set_timeout("speaker-timeout", 2000, [this]() { this->speaker_->stop(); }); | ||||||
|           } else { |           } else { | ||||||
|             ESP_LOGW(TAG, "Speaker buffer full."); |             ESP_LOGW(TAG, "Speaker buffer full."); | ||||||
|           } |           } | ||||||
|         } |         } | ||||||
|  |         if (this->wait_for_stream_end_) { | ||||||
|  |           break;  // We dont want to timeout here as the STREAM_END event will take care of that. | ||||||
|  |         } | ||||||
|         playing = this->speaker_->is_running(); |         playing = this->speaker_->is_running(); | ||||||
|       } |       } | ||||||
| #endif | #endif | ||||||
| @@ -295,28 +298,77 @@ void VoiceAssistant::loop() { | |||||||
|       } |       } | ||||||
| #endif | #endif | ||||||
|       if (playing) { |       if (playing) { | ||||||
|         this->set_timeout("playing", 100, [this]() { |         this->set_timeout("playing", 2000, [this]() { | ||||||
|           this->cancel_timeout("speaker-timeout"); |           this->cancel_timeout("speaker-timeout"); | ||||||
|           this->set_state_(State::IDLE, State::IDLE); |           this->set_state_(State::IDLE, State::IDLE); | ||||||
|         }); |         }); | ||||||
|       } |       } | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|  |     case State::RESPONSE_FINISHED: { | ||||||
|  | #ifdef USE_SPEAKER | ||||||
|  |       if (this->speaker_ != nullptr) { | ||||||
|  |         this->speaker_->stop(); | ||||||
|  |         this->cancel_timeout("speaker-timeout"); | ||||||
|  |         this->cancel_timeout("playing"); | ||||||
|  |         this->speaker_buffer_size_ = 0; | ||||||
|  |         this->speaker_buffer_index_ = 0; | ||||||
|  |         memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE); | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|  |       this->wait_for_stream_end_ = false; | ||||||
|  |       this->set_state_(State::IDLE, State::IDLE); | ||||||
|  |       break; | ||||||
|  |     } | ||||||
|     default: |     default: | ||||||
|       break; |       break; | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static const LogString *voice_assistant_state_to_string(State state) { | ||||||
|  |   switch (state) { | ||||||
|  |     case State::IDLE: | ||||||
|  |       return LOG_STR("IDLE"); | ||||||
|  |     case State::START_MICROPHONE: | ||||||
|  |       return LOG_STR("START_MICROPHONE"); | ||||||
|  |     case State::STARTING_MICROPHONE: | ||||||
|  |       return LOG_STR("STARTING_MICROPHONE"); | ||||||
|  |     case State::WAIT_FOR_VAD: | ||||||
|  |       return LOG_STR("WAIT_FOR_VAD"); | ||||||
|  |     case State::WAITING_FOR_VAD: | ||||||
|  |       return LOG_STR("WAITING_FOR_VAD"); | ||||||
|  |     case State::START_PIPELINE: | ||||||
|  |       return LOG_STR("START_PIPELINE"); | ||||||
|  |     case State::STARTING_PIPELINE: | ||||||
|  |       return LOG_STR("STARTING_PIPELINE"); | ||||||
|  |     case State::STREAMING_MICROPHONE: | ||||||
|  |       return LOG_STR("STREAMING_MICROPHONE"); | ||||||
|  |     case State::STOP_MICROPHONE: | ||||||
|  |       return LOG_STR("STOP_MICROPHONE"); | ||||||
|  |     case State::STOPPING_MICROPHONE: | ||||||
|  |       return LOG_STR("STOPPING_MICROPHONE"); | ||||||
|  |     case State::AWAITING_RESPONSE: | ||||||
|  |       return LOG_STR("AWAITING_RESPONSE"); | ||||||
|  |     case State::STREAMING_RESPONSE: | ||||||
|  |       return LOG_STR("STREAMING_RESPONSE"); | ||||||
|  |     case State::RESPONSE_FINISHED: | ||||||
|  |       return LOG_STR("RESPONSE_FINISHED"); | ||||||
|  |     default: | ||||||
|  |       return LOG_STR("UNKNOWN"); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
| void VoiceAssistant::set_state_(State state) { | void VoiceAssistant::set_state_(State state) { | ||||||
|   State old_state = this->state_; |   State old_state = this->state_; | ||||||
|   this->state_ = state; |   this->state_ = state; | ||||||
|   ESP_LOGD(TAG, "State changed from %d to %d", static_cast<uint8_t>(old_state), static_cast<uint8_t>(state)); |   ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)), | ||||||
|  |            LOG_STR_ARG(voice_assistant_state_to_string(state))); | ||||||
| } | } | ||||||
|  |  | ||||||
| void VoiceAssistant::set_state_(State state, State desired_state) { | void VoiceAssistant::set_state_(State state, State desired_state) { | ||||||
|   this->set_state_(state); |   this->set_state_(state); | ||||||
|   this->desired_state_ = desired_state; |   this->desired_state_ = desired_state; | ||||||
|   ESP_LOGD(TAG, "Desired state set to %d", static_cast<uint8_t>(desired_state)); |   ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state))); | ||||||
| } | } | ||||||
|  |  | ||||||
| void VoiceAssistant::failed_to_start() { | void VoiceAssistant::failed_to_start() { | ||||||
| @@ -400,6 +452,7 @@ void VoiceAssistant::request_stop() { | |||||||
|       break; |       break; | ||||||
|     case State::AWAITING_RESPONSE: |     case State::AWAITING_RESPONSE: | ||||||
|     case State::STREAMING_RESPONSE: |     case State::STREAMING_RESPONSE: | ||||||
|  |     case State::RESPONSE_FINISHED: | ||||||
|       break;  // Let the incoming audio stream finish then it will go to idle. |       break;  // Let the incoming audio stream finish then it will go to idle. | ||||||
|   } |   } | ||||||
| } | } | ||||||
| @@ -531,6 +584,14 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|       this->error_trigger_->trigger(code, message); |       this->error_trigger_->trigger(code, message); | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|  |     case api::enums::VOICE_ASSISTANT_TTS_STREAM_START: { | ||||||
|  |       this->wait_for_stream_end_ = true; | ||||||
|  |       break; | ||||||
|  |     } | ||||||
|  |     case api::enums::VOICE_ASSISTANT_TTS_STREAM_END: { | ||||||
|  |       this->set_state_(State::RESPONSE_FINISHED, State::IDLE); | ||||||
|  |       break; | ||||||
|  |     } | ||||||
|     default: |     default: | ||||||
|       ESP_LOGD(TAG, "Unhandled event type: %d", msg.event_type); |       ESP_LOGD(TAG, "Unhandled event type: %d", msg.event_type); | ||||||
|       break; |       break; | ||||||
|   | |||||||
| @@ -46,6 +46,7 @@ enum class State { | |||||||
|   STOPPING_MICROPHONE, |   STOPPING_MICROPHONE, | ||||||
|   AWAITING_RESPONSE, |   AWAITING_RESPONSE, | ||||||
|   STREAMING_RESPONSE, |   STREAMING_RESPONSE, | ||||||
|  |   RESPONSE_FINISHED, | ||||||
| }; | }; | ||||||
|  |  | ||||||
| class VoiceAssistant : public Component { | class VoiceAssistant : public Component { | ||||||
| @@ -132,10 +133,10 @@ class VoiceAssistant : public Component { | |||||||
|   uint8_t *speaker_buffer_; |   uint8_t *speaker_buffer_; | ||||||
|   size_t speaker_buffer_index_{0}; |   size_t speaker_buffer_index_{0}; | ||||||
|   size_t speaker_buffer_size_{0}; |   size_t speaker_buffer_size_{0}; | ||||||
|  |   bool wait_for_stream_end_{false}; | ||||||
| #endif | #endif | ||||||
| #ifdef USE_MEDIA_PLAYER | #ifdef USE_MEDIA_PLAYER | ||||||
|   media_player::MediaPlayer *media_player_{nullptr}; |   media_player::MediaPlayer *media_player_{nullptr}; | ||||||
|   bool playing_tts_{false}; |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|   bool local_output_{false}; |   bool local_output_{false}; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user