mirror of
				https://github.com/esphome/esphome.git
				synced 2025-10-30 22:53:59 +00:00 
			
		
		
		
	Send/Receive Voice Assistant audio via API (#6471)
Co-authored-by: Michael Hansen <mike@rhasspy.org>
This commit is contained in:
		| @@ -217,7 +217,8 @@ message DeviceInfoResponse { | ||||
|  | ||||
|   string friendly_name = 13; | ||||
|  | ||||
|   uint32 voice_assistant_version = 14; | ||||
|   uint32 legacy_voice_assistant_version = 14; | ||||
|   uint32 voice_assistant_feature_flags = 17; | ||||
|  | ||||
|   string suggested_area = 16; | ||||
| } | ||||
| @@ -1422,12 +1423,18 @@ message BluetoothDeviceClearCacheResponse { | ||||
| } | ||||
|  | ||||
| // ==================== PUSH TO TALK ==================== | ||||
| enum VoiceAssistantSubscribeFlag { | ||||
|   VOICE_ASSISTANT_SUBSCRIBE_NONE = 0; | ||||
|   VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO = 1; | ||||
| } | ||||
|  | ||||
| message SubscribeVoiceAssistantRequest { | ||||
|   option (id) = 89; | ||||
|   option (source) = SOURCE_CLIENT; | ||||
|   option (ifdef) = "USE_VOICE_ASSISTANT"; | ||||
|  | ||||
|   bool subscribe = 1; | ||||
|   uint32 flags = 2; | ||||
| } | ||||
|  | ||||
| enum VoiceAssistantRequestFlag { | ||||
| @@ -1495,6 +1502,16 @@ message VoiceAssistantEventResponse { | ||||
|   repeated VoiceAssistantEventData data = 2; | ||||
| } | ||||
|  | ||||
| message VoiceAssistantAudio { | ||||
|   option (id) = 106; | ||||
|   option (source) = SOURCE_BOTH; | ||||
|   option (ifdef) = "USE_VOICE_ASSISTANT"; | ||||
|  | ||||
|   bytes data = 1; | ||||
|   bool end = 2; | ||||
| } | ||||
|  | ||||
|  | ||||
| // ==================== ALARM CONTROL PANEL ==================== | ||||
| enum AlarmControlPanelState { | ||||
|   ALARM_STATE_DISARMED = 0; | ||||
|   | ||||
| @@ -1040,10 +1040,15 @@ void APIConnection::on_voice_assistant_response(const VoiceAssistantResponse &ms | ||||
|       voice_assistant::global_voice_assistant->failed_to_start(); | ||||
|       return; | ||||
|     } | ||||
|     struct sockaddr_storage storage; | ||||
|     socklen_t len = sizeof(storage); | ||||
|     this->helper_->getpeername((struct sockaddr *) &storage, &len); | ||||
|     voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port); | ||||
|     if (msg.port == 0) { | ||||
|       // Use API Audio | ||||
|       voice_assistant::global_voice_assistant->start_streaming(); | ||||
|     } else { | ||||
|       struct sockaddr_storage storage; | ||||
|       socklen_t len = sizeof(storage); | ||||
|       this->helper_->getpeername((struct sockaddr *) &storage, &len); | ||||
|       voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) { | ||||
| @@ -1055,6 +1060,15 @@ void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventR | ||||
|     voice_assistant::global_voice_assistant->on_event(msg); | ||||
|   } | ||||
| } | ||||
| void APIConnection::on_voice_assistant_audio(const VoiceAssistantAudio &msg) { | ||||
|   if (voice_assistant::global_voice_assistant != nullptr) { | ||||
|     if (voice_assistant::global_voice_assistant->get_api_connection() != this) { | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     voice_assistant::global_voice_assistant->on_audio(msg); | ||||
|   } | ||||
| }; | ||||
|  | ||||
| #endif | ||||
|  | ||||
| @@ -1142,7 +1156,7 @@ HelloResponse APIConnection::hello(const HelloRequest &msg) { | ||||
|  | ||||
|   HelloResponse resp; | ||||
|   resp.api_version_major = 1; | ||||
|   resp.api_version_minor = 9; | ||||
|   resp.api_version_minor = 10; | ||||
|   resp.server_info = App.get_name() + " (esphome v" ESPHOME_VERSION ")"; | ||||
|   resp.name = App.get_name(); | ||||
|  | ||||
| @@ -1203,7 +1217,8 @@ DeviceInfoResponse APIConnection::device_info(const DeviceInfoRequest &msg) { | ||||
|   resp.bluetooth_proxy_feature_flags = bluetooth_proxy::global_bluetooth_proxy->get_feature_flags(); | ||||
| #endif | ||||
| #ifdef USE_VOICE_ASSISTANT | ||||
|   resp.voice_assistant_version = voice_assistant::global_voice_assistant->get_version(); | ||||
|   resp.legacy_voice_assistant_version = voice_assistant::global_voice_assistant->get_legacy_version(); | ||||
|   resp.voice_assistant_feature_flags = voice_assistant::global_voice_assistant->get_feature_flags(); | ||||
| #endif | ||||
|   return resp; | ||||
| } | ||||
|   | ||||
| @@ -134,6 +134,7 @@ class APIConnection : public APIServerConnection { | ||||
|   void subscribe_voice_assistant(const SubscribeVoiceAssistantRequest &msg) override; | ||||
|   void on_voice_assistant_response(const VoiceAssistantResponse &msg) override; | ||||
|   void on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) override; | ||||
|   void on_voice_assistant_audio(const VoiceAssistantAudio &msg) override; | ||||
| #endif | ||||
|  | ||||
| #ifdef USE_ALARM_CONTROL_PANEL | ||||
|   | ||||
| @@ -410,6 +410,19 @@ const char *proto_enum_to_string<enums::BluetoothDeviceRequestType>(enums::Bluet | ||||
| } | ||||
| #endif | ||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | ||||
| template<> | ||||
| const char *proto_enum_to_string<enums::VoiceAssistantSubscribeFlag>(enums::VoiceAssistantSubscribeFlag value) { | ||||
|   switch (value) { | ||||
|     case enums::VOICE_ASSISTANT_SUBSCRIBE_NONE: | ||||
|       return "VOICE_ASSISTANT_SUBSCRIBE_NONE"; | ||||
|     case enums::VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO: | ||||
|       return "VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO"; | ||||
|     default: | ||||
|       return "UNKNOWN"; | ||||
|   } | ||||
| } | ||||
| #endif | ||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | ||||
| template<> const char *proto_enum_to_string<enums::VoiceAssistantRequestFlag>(enums::VoiceAssistantRequestFlag value) { | ||||
|   switch (value) { | ||||
|     case enums::VOICE_ASSISTANT_REQUEST_NONE: | ||||
| @@ -716,7 +729,11 @@ bool DeviceInfoResponse::decode_varint(uint32_t field_id, ProtoVarInt value) { | ||||
|       return true; | ||||
|     } | ||||
|     case 14: { | ||||
|       this->voice_assistant_version = value.as_uint32(); | ||||
|       this->legacy_voice_assistant_version = value.as_uint32(); | ||||
|       return true; | ||||
|     } | ||||
|     case 17: { | ||||
|       this->voice_assistant_feature_flags = value.as_uint32(); | ||||
|       return true; | ||||
|     } | ||||
|     default: | ||||
| @@ -784,7 +801,8 @@ void DeviceInfoResponse::encode(ProtoWriteBuffer buffer) const { | ||||
|   buffer.encode_uint32(15, this->bluetooth_proxy_feature_flags); | ||||
|   buffer.encode_string(12, this->manufacturer); | ||||
|   buffer.encode_string(13, this->friendly_name); | ||||
|   buffer.encode_uint32(14, this->voice_assistant_version); | ||||
|   buffer.encode_uint32(14, this->legacy_voice_assistant_version); | ||||
|   buffer.encode_uint32(17, this->voice_assistant_feature_flags); | ||||
|   buffer.encode_string(16, this->suggested_area); | ||||
| } | ||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | ||||
| @@ -850,8 +868,13 @@ void DeviceInfoResponse::dump_to(std::string &out) const { | ||||
|   out.append("'").append(this->friendly_name).append("'"); | ||||
|   out.append("\n"); | ||||
|  | ||||
|   out.append("  voice_assistant_version: "); | ||||
|   sprintf(buffer, "%" PRIu32, this->voice_assistant_version); | ||||
|   out.append("  legacy_voice_assistant_version: "); | ||||
|   sprintf(buffer, "%" PRIu32, this->legacy_voice_assistant_version); | ||||
|   out.append(buffer); | ||||
|   out.append("\n"); | ||||
|  | ||||
|   out.append("  voice_assistant_feature_flags: "); | ||||
|   sprintf(buffer, "%" PRIu32, this->voice_assistant_feature_flags); | ||||
|   out.append(buffer); | ||||
|   out.append("\n"); | ||||
|  | ||||
| @@ -6514,11 +6537,18 @@ bool SubscribeVoiceAssistantRequest::decode_varint(uint32_t field_id, ProtoVarIn | ||||
|       this->subscribe = value.as_bool(); | ||||
|       return true; | ||||
|     } | ||||
|     case 2: { | ||||
|       this->flags = value.as_uint32(); | ||||
|       return true; | ||||
|     } | ||||
|     default: | ||||
|       return false; | ||||
|   } | ||||
| } | ||||
| void SubscribeVoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { buffer.encode_bool(1, this->subscribe); } | ||||
| void SubscribeVoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { | ||||
|   buffer.encode_bool(1, this->subscribe); | ||||
|   buffer.encode_uint32(2, this->flags); | ||||
| } | ||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | ||||
| void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const { | ||||
|   __attribute__((unused)) char buffer[64]; | ||||
| @@ -6526,6 +6556,11 @@ void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const { | ||||
|   out.append("  subscribe: "); | ||||
|   out.append(YESNO(this->subscribe)); | ||||
|   out.append("\n"); | ||||
|  | ||||
|   out.append("  flags: "); | ||||
|   sprintf(buffer, "%" PRIu32, this->flags); | ||||
|   out.append(buffer); | ||||
|   out.append("\n"); | ||||
|   out.append("}"); | ||||
| } | ||||
| #endif | ||||
| @@ -6752,6 +6787,44 @@ void VoiceAssistantEventResponse::dump_to(std::string &out) const { | ||||
|   out.append("}"); | ||||
| } | ||||
| #endif | ||||
| bool VoiceAssistantAudio::decode_varint(uint32_t field_id, ProtoVarInt value) { | ||||
|   switch (field_id) { | ||||
|     case 2: { | ||||
|       this->end = value.as_bool(); | ||||
|       return true; | ||||
|     } | ||||
|     default: | ||||
|       return false; | ||||
|   } | ||||
| } | ||||
| bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited value) { | ||||
|   switch (field_id) { | ||||
|     case 1: { | ||||
|       this->data = value.as_string(); | ||||
|       return true; | ||||
|     } | ||||
|     default: | ||||
|       return false; | ||||
|   } | ||||
| } | ||||
| void VoiceAssistantAudio::encode(ProtoWriteBuffer buffer) const { | ||||
|   buffer.encode_string(1, this->data); | ||||
|   buffer.encode_bool(2, this->end); | ||||
| } | ||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | ||||
| void VoiceAssistantAudio::dump_to(std::string &out) const { | ||||
|   __attribute__((unused)) char buffer[64]; | ||||
|   out.append("VoiceAssistantAudio {\n"); | ||||
|   out.append("  data: "); | ||||
|   out.append("'").append(this->data).append("'"); | ||||
|   out.append("\n"); | ||||
|  | ||||
|   out.append("  end: "); | ||||
|   out.append(YESNO(this->end)); | ||||
|   out.append("\n"); | ||||
|   out.append("}"); | ||||
| } | ||||
| #endif | ||||
| bool ListEntitiesAlarmControlPanelResponse::decode_varint(uint32_t field_id, ProtoVarInt value) { | ||||
|   switch (field_id) { | ||||
|     case 6: { | ||||
|   | ||||
| @@ -165,6 +165,10 @@ enum BluetoothDeviceRequestType : uint32_t { | ||||
|   BLUETOOTH_DEVICE_REQUEST_TYPE_CONNECT_V3_WITHOUT_CACHE = 5, | ||||
|   BLUETOOTH_DEVICE_REQUEST_TYPE_CLEAR_CACHE = 6, | ||||
| }; | ||||
| enum VoiceAssistantSubscribeFlag : uint32_t { | ||||
|   VOICE_ASSISTANT_SUBSCRIBE_NONE = 0, | ||||
|   VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO = 1, | ||||
| }; | ||||
| enum VoiceAssistantRequestFlag : uint32_t { | ||||
|   VOICE_ASSISTANT_REQUEST_NONE = 0, | ||||
|   VOICE_ASSISTANT_REQUEST_USE_VAD = 1, | ||||
| @@ -327,7 +331,8 @@ class DeviceInfoResponse : public ProtoMessage { | ||||
|   uint32_t bluetooth_proxy_feature_flags{0}; | ||||
|   std::string manufacturer{}; | ||||
|   std::string friendly_name{}; | ||||
|   uint32_t voice_assistant_version{0}; | ||||
|   uint32_t legacy_voice_assistant_version{0}; | ||||
|   uint32_t voice_assistant_feature_flags{0}; | ||||
|   std::string suggested_area{}; | ||||
|   void encode(ProtoWriteBuffer buffer) const override; | ||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | ||||
| @@ -1674,6 +1679,7 @@ class BluetoothDeviceClearCacheResponse : public ProtoMessage { | ||||
| class SubscribeVoiceAssistantRequest : public ProtoMessage { | ||||
|  public: | ||||
|   bool subscribe{false}; | ||||
|   uint32_t flags{0}; | ||||
|   void encode(ProtoWriteBuffer buffer) const override; | ||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | ||||
|   void dump_to(std::string &out) const override; | ||||
| @@ -1749,6 +1755,19 @@ class VoiceAssistantEventResponse : public ProtoMessage { | ||||
|   bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override; | ||||
|   bool decode_varint(uint32_t field_id, ProtoVarInt value) override; | ||||
| }; | ||||
| class VoiceAssistantAudio : public ProtoMessage { | ||||
|  public: | ||||
|   std::string data{}; | ||||
|   bool end{false}; | ||||
|   void encode(ProtoWriteBuffer buffer) const override; | ||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | ||||
|   void dump_to(std::string &out) const override; | ||||
| #endif | ||||
|  | ||||
|  protected: | ||||
|   bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override; | ||||
|   bool decode_varint(uint32_t field_id, ProtoVarInt value) override; | ||||
| }; | ||||
| class ListEntitiesAlarmControlPanelResponse : public ProtoMessage { | ||||
|  public: | ||||
|   std::string object_id{}; | ||||
|   | ||||
| @@ -476,6 +476,14 @@ bool APIServerConnectionBase::send_voice_assistant_request(const VoiceAssistantR | ||||
| #endif | ||||
| #ifdef USE_VOICE_ASSISTANT | ||||
| #endif | ||||
| #ifdef USE_VOICE_ASSISTANT | ||||
| bool APIServerConnectionBase::send_voice_assistant_audio(const VoiceAssistantAudio &msg) { | ||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | ||||
|   ESP_LOGVV(TAG, "send_voice_assistant_audio: %s", msg.dump().c_str()); | ||||
| #endif | ||||
|   return this->send_message_<VoiceAssistantAudio>(msg, 106); | ||||
| } | ||||
| #endif | ||||
| #ifdef USE_ALARM_CONTROL_PANEL | ||||
| bool APIServerConnectionBase::send_list_entities_alarm_control_panel_response( | ||||
|     const ListEntitiesAlarmControlPanelResponse &msg) { | ||||
| @@ -971,6 +979,17 @@ bool APIServerConnectionBase::read_message(uint32_t msg_size, uint32_t msg_type, | ||||
|       ESP_LOGVV(TAG, "on_date_command_request: %s", msg.dump().c_str()); | ||||
| #endif | ||||
|       this->on_date_command_request(msg); | ||||
| #endif | ||||
|       break; | ||||
|     } | ||||
|     case 106: { | ||||
| #ifdef USE_VOICE_ASSISTANT | ||||
|       VoiceAssistantAudio msg; | ||||
|       msg.decode(msg_data, msg_size); | ||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | ||||
|       ESP_LOGVV(TAG, "on_voice_assistant_audio: %s", msg.dump().c_str()); | ||||
| #endif | ||||
|       this->on_voice_assistant_audio(msg); | ||||
| #endif | ||||
|       break; | ||||
|     } | ||||
|   | ||||
| @@ -240,6 +240,10 @@ class APIServerConnectionBase : public ProtoService { | ||||
| #ifdef USE_VOICE_ASSISTANT | ||||
|   virtual void on_voice_assistant_event_response(const VoiceAssistantEventResponse &value){}; | ||||
| #endif | ||||
| #ifdef USE_VOICE_ASSISTANT | ||||
|   bool send_voice_assistant_audio(const VoiceAssistantAudio &msg); | ||||
|   virtual void on_voice_assistant_audio(const VoiceAssistantAudio &value){}; | ||||
| #endif | ||||
| #ifdef USE_ALARM_CONTROL_PANEL | ||||
|   bool send_list_entities_alarm_control_panel_response(const ListEntitiesAlarmControlPanelResponse &msg); | ||||
| #endif | ||||
|   | ||||
| @@ -24,28 +24,24 @@ static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE; | ||||
|  | ||||
| float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; } | ||||
|  | ||||
| void VoiceAssistant::setup() { | ||||
|   ESP_LOGCONFIG(TAG, "Setting up Voice Assistant..."); | ||||
|  | ||||
|   global_voice_assistant = this; | ||||
|  | ||||
| bool VoiceAssistant::start_udp_socket_() { | ||||
|   this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); | ||||
|   if (socket_ == nullptr) { | ||||
|     ESP_LOGW(TAG, "Could not create socket"); | ||||
|   if (this->socket_ == nullptr) { | ||||
|     ESP_LOGE(TAG, "Could not create socket"); | ||||
|     this->mark_failed(); | ||||
|     return; | ||||
|     return false; | ||||
|   } | ||||
|   int enable = 1; | ||||
|   int err = socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); | ||||
|   int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); | ||||
|   if (err != 0) { | ||||
|     ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err); | ||||
|     // we can still continue | ||||
|   } | ||||
|   err = socket_->setblocking(false); | ||||
|   err = this->socket_->setblocking(false); | ||||
|   if (err != 0) { | ||||
|     ESP_LOGW(TAG, "Socket unable to set nonblocking mode: errno %d", err); | ||||
|     ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err); | ||||
|     this->mark_failed(); | ||||
|     return; | ||||
|     return false; | ||||
|   } | ||||
|  | ||||
| #ifdef USE_SPEAKER | ||||
| @@ -54,18 +50,30 @@ void VoiceAssistant::setup() { | ||||
|  | ||||
|     socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055); | ||||
|     if (sl == 0) { | ||||
|       ESP_LOGW(TAG, "Socket unable to set sockaddr: errno %d", errno); | ||||
|       ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno); | ||||
|       this->mark_failed(); | ||||
|       return; | ||||
|       return false; | ||||
|     } | ||||
|  | ||||
|     err = socket_->bind((struct sockaddr *) &server, sizeof(server)); | ||||
|     err = this->socket_->bind((struct sockaddr *) &server, sizeof(server)); | ||||
|     if (err != 0) { | ||||
|       ESP_LOGW(TAG, "Socket unable to bind: errno %d", errno); | ||||
|       ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno); | ||||
|       this->mark_failed(); | ||||
|       return; | ||||
|       return false; | ||||
|     } | ||||
|   } | ||||
| #endif | ||||
|   this->udp_socket_running_ = true; | ||||
|   return true; | ||||
| } | ||||
|  | ||||
| void VoiceAssistant::setup() { | ||||
|   ESP_LOGCONFIG(TAG, "Setting up Voice Assistant..."); | ||||
|  | ||||
|   global_voice_assistant = this; | ||||
|  | ||||
| #ifdef USE_SPEAKER | ||||
|   if (this->speaker_ != nullptr) { | ||||
|     ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); | ||||
|     this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE); | ||||
|     if (this->speaker_buffer_ == nullptr) { | ||||
| @@ -238,8 +246,20 @@ void VoiceAssistant::loop() { | ||||
|       size_t available = this->ring_buffer_->available(); | ||||
|       while (available >= SEND_BUFFER_SIZE) { | ||||
|         size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0); | ||||
|         this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_, | ||||
|                               sizeof(this->dest_addr_)); | ||||
|         if (this->audio_mode_ == AUDIO_MODE_API) { | ||||
|           api::VoiceAssistantAudio msg; | ||||
|           msg.data.assign((char *) this->send_buffer_, read_bytes); | ||||
|           this->api_client_->send_voice_assistant_audio(msg); | ||||
|         } else { | ||||
|           if (!this->udp_socket_running_) { | ||||
|             if (!this->start_udp_socket_()) { | ||||
|               this->set_state_(State::STOP_MICROPHONE, State::IDLE); | ||||
|               break; | ||||
|             } | ||||
|           } | ||||
|           this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_, | ||||
|                                 sizeof(this->dest_addr_)); | ||||
|         } | ||||
|         available = this->ring_buffer_->available(); | ||||
|       } | ||||
|  | ||||
| @@ -268,22 +288,25 @@ void VoiceAssistant::loop() { | ||||
| #ifdef USE_SPEAKER | ||||
|       if (this->speaker_ != nullptr) { | ||||
|         ssize_t received_len = 0; | ||||
|         if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) { | ||||
|           received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE); | ||||
|           if (received_len > 0) { | ||||
|             this->speaker_buffer_index_ += received_len; | ||||
|             this->speaker_buffer_size_ += received_len; | ||||
|             this->speaker_bytes_received_ += received_len; | ||||
|         if (this->audio_mode_ == AUDIO_MODE_UDP) { | ||||
|           if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) { | ||||
|             received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE); | ||||
|             if (received_len > 0) { | ||||
|               this->speaker_buffer_index_ += received_len; | ||||
|               this->speaker_buffer_size_ += received_len; | ||||
|               this->speaker_bytes_received_ += received_len; | ||||
|             } | ||||
|           } else { | ||||
|             ESP_LOGD(TAG, "Receive buffer full"); | ||||
|           } | ||||
|         } else { | ||||
|           ESP_LOGD(TAG, "Receive buffer full"); | ||||
|         } | ||||
|         // Build a small buffer of audio before sending to the speaker | ||||
|         if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4) | ||||
|         bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0); | ||||
|         if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream) | ||||
|           this->write_speaker_(); | ||||
|         if (this->wait_for_stream_end_) { | ||||
|           this->cancel_timeout("playing"); | ||||
|           if (this->stream_ended_ && received_len < 0) { | ||||
|           if (end_of_stream) { | ||||
|             ESP_LOGD(TAG, "End of audio stream received"); | ||||
|             this->cancel_timeout("speaker-timeout"); | ||||
|             this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED); | ||||
| @@ -428,6 +451,22 @@ void VoiceAssistant::failed_to_start() { | ||||
|   this->set_state_(State::STOP_MICROPHONE, State::IDLE); | ||||
| } | ||||
|  | ||||
| void VoiceAssistant::start_streaming() { | ||||
|   if (this->state_ != State::STARTING_PIPELINE) { | ||||
|     this->signal_stop_(); | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   ESP_LOGD(TAG, "Client started, streaming microphone"); | ||||
|   this->audio_mode_ = AUDIO_MODE_API; | ||||
|  | ||||
|   if (this->mic_->is_running()) { | ||||
|     this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE); | ||||
|   } else { | ||||
|     this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE); | ||||
|   } | ||||
| } | ||||
|  | ||||
| void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) { | ||||
|   if (this->state_ != State::STARTING_PIPELINE) { | ||||
|     this->signal_stop_(); | ||||
| @@ -435,6 +474,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por | ||||
|   } | ||||
|  | ||||
|   ESP_LOGD(TAG, "Client started, streaming microphone"); | ||||
|   this->audio_mode_ = AUDIO_MODE_UDP; | ||||
|  | ||||
|   memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_)); | ||||
|   if (this->dest_addr_.ss_family == AF_INET) { | ||||
| @@ -688,6 +728,17 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | ||||
|   } | ||||
| } | ||||
|  | ||||
| void VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg) { | ||||
|   if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) { | ||||
|     memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length()); | ||||
|     this->speaker_buffer_index_ += msg.data.length(); | ||||
|     this->speaker_buffer_size_ += msg.data.length(); | ||||
|     this->speaker_bytes_received_ += msg.data.length(); | ||||
|   } else { | ||||
|     ESP_LOGE(TAG, "Cannot receive audio, buffer is full"); | ||||
|   } | ||||
| } | ||||
|  | ||||
| VoiceAssistant *global_voice_assistant = nullptr;  // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) | ||||
|  | ||||
| }  // namespace voice_assistant | ||||
|   | ||||
| @@ -29,9 +29,14 @@ namespace voice_assistant { | ||||
|  | ||||
| // Version 1: Initial version | ||||
| // Version 2: Adds raw speaker support | ||||
| // Version 3: Unused/skip | ||||
| static const uint32_t INITIAL_VERSION = 1; | ||||
| static const uint32_t SPEAKER_SUPPORT = 2; | ||||
| static const uint32_t LEGACY_INITIAL_VERSION = 1; | ||||
| static const uint32_t LEGACY_SPEAKER_SUPPORT = 2; | ||||
|  | ||||
| enum VoiceAssistantFeature : uint32_t { | ||||
|   FEATURE_VOICE_ASSISTANT = 1 << 0, | ||||
|   FEATURE_SPEAKER = 1 << 1, | ||||
|   FEATURE_API_AUDIO = 1 << 2, | ||||
| }; | ||||
|  | ||||
| enum class State { | ||||
|   IDLE, | ||||
| @@ -49,11 +54,17 @@ enum class State { | ||||
|   RESPONSE_FINISHED, | ||||
| }; | ||||
|  | ||||
| enum AudioMode : uint8_t { | ||||
|   AUDIO_MODE_UDP, | ||||
|   AUDIO_MODE_API, | ||||
| }; | ||||
|  | ||||
| class VoiceAssistant : public Component { | ||||
|  public: | ||||
|   void setup() override; | ||||
|   void loop() override; | ||||
|   float get_setup_priority() const override; | ||||
|   void start_streaming(); | ||||
|   void start_streaming(struct sockaddr_storage *addr, uint16_t port); | ||||
|   void failed_to_start(); | ||||
|  | ||||
| @@ -71,19 +82,32 @@ class VoiceAssistant : public Component { | ||||
|   } | ||||
| #endif | ||||
|  | ||||
|   uint32_t get_version() const { | ||||
|   uint32_t get_legacy_version() const { | ||||
| #ifdef USE_SPEAKER | ||||
|     if (this->speaker_ != nullptr) { | ||||
|       return SPEAKER_SUPPORT; | ||||
|       return LEGACY_SPEAKER_SUPPORT; | ||||
|     } | ||||
| #endif | ||||
|     return INITIAL_VERSION; | ||||
|     return LEGACY_INITIAL_VERSION; | ||||
|   } | ||||
|  | ||||
|   uint32_t get_feature_flags() const { | ||||
|     uint32_t flags = 0; | ||||
|     flags |= VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT; | ||||
| #ifdef USE_SPEAKER | ||||
|     if (this->speaker_ != nullptr) { | ||||
|       flags |= VoiceAssistantFeature::FEATURE_SPEAKER; | ||||
|       flags |= VoiceAssistantFeature::FEATURE_API_AUDIO; | ||||
|     } | ||||
| #endif | ||||
|     return flags; | ||||
|   } | ||||
|  | ||||
|   void request_start(bool continuous, bool silence_detection); | ||||
|   void request_stop(); | ||||
|  | ||||
|   void on_event(const api::VoiceAssistantEventResponse &msg); | ||||
|   void on_audio(const api::VoiceAssistantAudio &msg); | ||||
|  | ||||
|   bool is_running() const { return this->state_ != State::IDLE; } | ||||
|   void set_continuous(bool continuous) { this->continuous_ = continuous; } | ||||
| @@ -201,6 +225,10 @@ class VoiceAssistant : public Component { | ||||
|  | ||||
|   State state_{State::IDLE}; | ||||
|   State desired_state_{State::IDLE}; | ||||
|  | ||||
|   AudioMode audio_mode_{AUDIO_MODE_UDP}; | ||||
|   bool udp_socket_running_{false}; | ||||
|   bool start_udp_socket_(); | ||||
| }; | ||||
|  | ||||
| template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user