mirror of
				https://github.com/esphome/esphome.git
				synced 2025-10-26 12:43:48 +00:00 
			
		
		
		
	Send/Receive Voice Assistant audio via API (#6471)
Co-authored-by: Michael Hansen <mike@rhasspy.org>
This commit is contained in:
		| @@ -217,7 +217,8 @@ message DeviceInfoResponse { | |||||||
|  |  | ||||||
|   string friendly_name = 13; |   string friendly_name = 13; | ||||||
|  |  | ||||||
|   uint32 voice_assistant_version = 14; |   uint32 legacy_voice_assistant_version = 14; | ||||||
|  |   uint32 voice_assistant_feature_flags = 17; | ||||||
|  |  | ||||||
|   string suggested_area = 16; |   string suggested_area = 16; | ||||||
| } | } | ||||||
| @@ -1422,12 +1423,18 @@ message BluetoothDeviceClearCacheResponse { | |||||||
| } | } | ||||||
|  |  | ||||||
| // ==================== PUSH TO TALK ==================== | // ==================== PUSH TO TALK ==================== | ||||||
|  | enum VoiceAssistantSubscribeFlag { | ||||||
|  |   VOICE_ASSISTANT_SUBSCRIBE_NONE = 0; | ||||||
|  |   VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO = 1; | ||||||
|  | } | ||||||
|  |  | ||||||
| message SubscribeVoiceAssistantRequest { | message SubscribeVoiceAssistantRequest { | ||||||
|   option (id) = 89; |   option (id) = 89; | ||||||
|   option (source) = SOURCE_CLIENT; |   option (source) = SOURCE_CLIENT; | ||||||
|   option (ifdef) = "USE_VOICE_ASSISTANT"; |   option (ifdef) = "USE_VOICE_ASSISTANT"; | ||||||
|  |  | ||||||
|   bool subscribe = 1; |   bool subscribe = 1; | ||||||
|  |   uint32 flags = 2; | ||||||
| } | } | ||||||
|  |  | ||||||
| enum VoiceAssistantRequestFlag { | enum VoiceAssistantRequestFlag { | ||||||
| @@ -1495,6 +1502,16 @@ message VoiceAssistantEventResponse { | |||||||
|   repeated VoiceAssistantEventData data = 2; |   repeated VoiceAssistantEventData data = 2; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | message VoiceAssistantAudio { | ||||||
|  |   option (id) = 106; | ||||||
|  |   option (source) = SOURCE_BOTH; | ||||||
|  |   option (ifdef) = "USE_VOICE_ASSISTANT"; | ||||||
|  |  | ||||||
|  |   bytes data = 1; | ||||||
|  |   bool end = 2; | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
| // ==================== ALARM CONTROL PANEL ==================== | // ==================== ALARM CONTROL PANEL ==================== | ||||||
| enum AlarmControlPanelState { | enum AlarmControlPanelState { | ||||||
|   ALARM_STATE_DISARMED = 0; |   ALARM_STATE_DISARMED = 0; | ||||||
|   | |||||||
| @@ -1040,11 +1040,16 @@ void APIConnection::on_voice_assistant_response(const VoiceAssistantResponse &ms | |||||||
|       voice_assistant::global_voice_assistant->failed_to_start(); |       voice_assistant::global_voice_assistant->failed_to_start(); | ||||||
|       return; |       return; | ||||||
|     } |     } | ||||||
|  |     if (msg.port == 0) { | ||||||
|  |       // Use API Audio | ||||||
|  |       voice_assistant::global_voice_assistant->start_streaming(); | ||||||
|  |     } else { | ||||||
|       struct sockaddr_storage storage; |       struct sockaddr_storage storage; | ||||||
|       socklen_t len = sizeof(storage); |       socklen_t len = sizeof(storage); | ||||||
|       this->helper_->getpeername((struct sockaddr *) &storage, &len); |       this->helper_->getpeername((struct sockaddr *) &storage, &len); | ||||||
|       voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port); |       voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port); | ||||||
|     } |     } | ||||||
|  |   } | ||||||
| }; | }; | ||||||
| void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) { | void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) { | ||||||
|   if (voice_assistant::global_voice_assistant != nullptr) { |   if (voice_assistant::global_voice_assistant != nullptr) { | ||||||
| @@ -1055,6 +1060,15 @@ void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventR | |||||||
|     voice_assistant::global_voice_assistant->on_event(msg); |     voice_assistant::global_voice_assistant->on_event(msg); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  | void APIConnection::on_voice_assistant_audio(const VoiceAssistantAudio &msg) { | ||||||
|  |   if (voice_assistant::global_voice_assistant != nullptr) { | ||||||
|  |     if (voice_assistant::global_voice_assistant->get_api_connection() != this) { | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     voice_assistant::global_voice_assistant->on_audio(msg); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| @@ -1142,7 +1156,7 @@ HelloResponse APIConnection::hello(const HelloRequest &msg) { | |||||||
|  |  | ||||||
|   HelloResponse resp; |   HelloResponse resp; | ||||||
|   resp.api_version_major = 1; |   resp.api_version_major = 1; | ||||||
|   resp.api_version_minor = 9; |   resp.api_version_minor = 10; | ||||||
|   resp.server_info = App.get_name() + " (esphome v" ESPHOME_VERSION ")"; |   resp.server_info = App.get_name() + " (esphome v" ESPHOME_VERSION ")"; | ||||||
|   resp.name = App.get_name(); |   resp.name = App.get_name(); | ||||||
|  |  | ||||||
| @@ -1203,7 +1217,8 @@ DeviceInfoResponse APIConnection::device_info(const DeviceInfoRequest &msg) { | |||||||
|   resp.bluetooth_proxy_feature_flags = bluetooth_proxy::global_bluetooth_proxy->get_feature_flags(); |   resp.bluetooth_proxy_feature_flags = bluetooth_proxy::global_bluetooth_proxy->get_feature_flags(); | ||||||
| #endif | #endif | ||||||
| #ifdef USE_VOICE_ASSISTANT | #ifdef USE_VOICE_ASSISTANT | ||||||
|   resp.voice_assistant_version = voice_assistant::global_voice_assistant->get_version(); |   resp.legacy_voice_assistant_version = voice_assistant::global_voice_assistant->get_legacy_version(); | ||||||
|  |   resp.voice_assistant_feature_flags = voice_assistant::global_voice_assistant->get_feature_flags(); | ||||||
| #endif | #endif | ||||||
|   return resp; |   return resp; | ||||||
| } | } | ||||||
|   | |||||||
| @@ -134,6 +134,7 @@ class APIConnection : public APIServerConnection { | |||||||
|   void subscribe_voice_assistant(const SubscribeVoiceAssistantRequest &msg) override; |   void subscribe_voice_assistant(const SubscribeVoiceAssistantRequest &msg) override; | ||||||
|   void on_voice_assistant_response(const VoiceAssistantResponse &msg) override; |   void on_voice_assistant_response(const VoiceAssistantResponse &msg) override; | ||||||
|   void on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) override; |   void on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) override; | ||||||
|  |   void on_voice_assistant_audio(const VoiceAssistantAudio &msg) override; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #ifdef USE_ALARM_CONTROL_PANEL | #ifdef USE_ALARM_CONTROL_PANEL | ||||||
|   | |||||||
| @@ -410,6 +410,19 @@ const char *proto_enum_to_string<enums::BluetoothDeviceRequestType>(enums::Bluet | |||||||
| } | } | ||||||
| #endif | #endif | ||||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | #ifdef HAS_PROTO_MESSAGE_DUMP | ||||||
|  | template<> | ||||||
|  | const char *proto_enum_to_string<enums::VoiceAssistantSubscribeFlag>(enums::VoiceAssistantSubscribeFlag value) { | ||||||
|  |   switch (value) { | ||||||
|  |     case enums::VOICE_ASSISTANT_SUBSCRIBE_NONE: | ||||||
|  |       return "VOICE_ASSISTANT_SUBSCRIBE_NONE"; | ||||||
|  |     case enums::VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO: | ||||||
|  |       return "VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO"; | ||||||
|  |     default: | ||||||
|  |       return "UNKNOWN"; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | #ifdef HAS_PROTO_MESSAGE_DUMP | ||||||
| template<> const char *proto_enum_to_string<enums::VoiceAssistantRequestFlag>(enums::VoiceAssistantRequestFlag value) { | template<> const char *proto_enum_to_string<enums::VoiceAssistantRequestFlag>(enums::VoiceAssistantRequestFlag value) { | ||||||
|   switch (value) { |   switch (value) { | ||||||
|     case enums::VOICE_ASSISTANT_REQUEST_NONE: |     case enums::VOICE_ASSISTANT_REQUEST_NONE: | ||||||
| @@ -716,7 +729,11 @@ bool DeviceInfoResponse::decode_varint(uint32_t field_id, ProtoVarInt value) { | |||||||
|       return true; |       return true; | ||||||
|     } |     } | ||||||
|     case 14: { |     case 14: { | ||||||
|       this->voice_assistant_version = value.as_uint32(); |       this->legacy_voice_assistant_version = value.as_uint32(); | ||||||
|  |       return true; | ||||||
|  |     } | ||||||
|  |     case 17: { | ||||||
|  |       this->voice_assistant_feature_flags = value.as_uint32(); | ||||||
|       return true; |       return true; | ||||||
|     } |     } | ||||||
|     default: |     default: | ||||||
| @@ -784,7 +801,8 @@ void DeviceInfoResponse::encode(ProtoWriteBuffer buffer) const { | |||||||
|   buffer.encode_uint32(15, this->bluetooth_proxy_feature_flags); |   buffer.encode_uint32(15, this->bluetooth_proxy_feature_flags); | ||||||
|   buffer.encode_string(12, this->manufacturer); |   buffer.encode_string(12, this->manufacturer); | ||||||
|   buffer.encode_string(13, this->friendly_name); |   buffer.encode_string(13, this->friendly_name); | ||||||
|   buffer.encode_uint32(14, this->voice_assistant_version); |   buffer.encode_uint32(14, this->legacy_voice_assistant_version); | ||||||
|  |   buffer.encode_uint32(17, this->voice_assistant_feature_flags); | ||||||
|   buffer.encode_string(16, this->suggested_area); |   buffer.encode_string(16, this->suggested_area); | ||||||
| } | } | ||||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | #ifdef HAS_PROTO_MESSAGE_DUMP | ||||||
| @@ -850,8 +868,13 @@ void DeviceInfoResponse::dump_to(std::string &out) const { | |||||||
|   out.append("'").append(this->friendly_name).append("'"); |   out.append("'").append(this->friendly_name).append("'"); | ||||||
|   out.append("\n"); |   out.append("\n"); | ||||||
|  |  | ||||||
|   out.append("  voice_assistant_version: "); |   out.append("  legacy_voice_assistant_version: "); | ||||||
|   sprintf(buffer, "%" PRIu32, this->voice_assistant_version); |   sprintf(buffer, "%" PRIu32, this->legacy_voice_assistant_version); | ||||||
|  |   out.append(buffer); | ||||||
|  |   out.append("\n"); | ||||||
|  |  | ||||||
|  |   out.append("  voice_assistant_feature_flags: "); | ||||||
|  |   sprintf(buffer, "%" PRIu32, this->voice_assistant_feature_flags); | ||||||
|   out.append(buffer); |   out.append(buffer); | ||||||
|   out.append("\n"); |   out.append("\n"); | ||||||
|  |  | ||||||
| @@ -6514,11 +6537,18 @@ bool SubscribeVoiceAssistantRequest::decode_varint(uint32_t field_id, ProtoVarIn | |||||||
|       this->subscribe = value.as_bool(); |       this->subscribe = value.as_bool(); | ||||||
|       return true; |       return true; | ||||||
|     } |     } | ||||||
|  |     case 2: { | ||||||
|  |       this->flags = value.as_uint32(); | ||||||
|  |       return true; | ||||||
|  |     } | ||||||
|     default: |     default: | ||||||
|       return false; |       return false; | ||||||
|   } |   } | ||||||
| } | } | ||||||
| void SubscribeVoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { buffer.encode_bool(1, this->subscribe); } | void SubscribeVoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { | ||||||
|  |   buffer.encode_bool(1, this->subscribe); | ||||||
|  |   buffer.encode_uint32(2, this->flags); | ||||||
|  | } | ||||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | #ifdef HAS_PROTO_MESSAGE_DUMP | ||||||
| void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const { | void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const { | ||||||
|   __attribute__((unused)) char buffer[64]; |   __attribute__((unused)) char buffer[64]; | ||||||
| @@ -6526,6 +6556,11 @@ void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const { | |||||||
|   out.append("  subscribe: "); |   out.append("  subscribe: "); | ||||||
|   out.append(YESNO(this->subscribe)); |   out.append(YESNO(this->subscribe)); | ||||||
|   out.append("\n"); |   out.append("\n"); | ||||||
|  |  | ||||||
|  |   out.append("  flags: "); | ||||||
|  |   sprintf(buffer, "%" PRIu32, this->flags); | ||||||
|  |   out.append(buffer); | ||||||
|  |   out.append("\n"); | ||||||
|   out.append("}"); |   out.append("}"); | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| @@ -6752,6 +6787,44 @@ void VoiceAssistantEventResponse::dump_to(std::string &out) const { | |||||||
|   out.append("}"); |   out.append("}"); | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  | bool VoiceAssistantAudio::decode_varint(uint32_t field_id, ProtoVarInt value) { | ||||||
|  |   switch (field_id) { | ||||||
|  |     case 2: { | ||||||
|  |       this->end = value.as_bool(); | ||||||
|  |       return true; | ||||||
|  |     } | ||||||
|  |     default: | ||||||
|  |       return false; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited value) { | ||||||
|  |   switch (field_id) { | ||||||
|  |     case 1: { | ||||||
|  |       this->data = value.as_string(); | ||||||
|  |       return true; | ||||||
|  |     } | ||||||
|  |     default: | ||||||
|  |       return false; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void VoiceAssistantAudio::encode(ProtoWriteBuffer buffer) const { | ||||||
|  |   buffer.encode_string(1, this->data); | ||||||
|  |   buffer.encode_bool(2, this->end); | ||||||
|  | } | ||||||
|  | #ifdef HAS_PROTO_MESSAGE_DUMP | ||||||
|  | void VoiceAssistantAudio::dump_to(std::string &out) const { | ||||||
|  |   __attribute__((unused)) char buffer[64]; | ||||||
|  |   out.append("VoiceAssistantAudio {\n"); | ||||||
|  |   out.append("  data: "); | ||||||
|  |   out.append("'").append(this->data).append("'"); | ||||||
|  |   out.append("\n"); | ||||||
|  |  | ||||||
|  |   out.append("  end: "); | ||||||
|  |   out.append(YESNO(this->end)); | ||||||
|  |   out.append("\n"); | ||||||
|  |   out.append("}"); | ||||||
|  | } | ||||||
|  | #endif | ||||||
| bool ListEntitiesAlarmControlPanelResponse::decode_varint(uint32_t field_id, ProtoVarInt value) { | bool ListEntitiesAlarmControlPanelResponse::decode_varint(uint32_t field_id, ProtoVarInt value) { | ||||||
|   switch (field_id) { |   switch (field_id) { | ||||||
|     case 6: { |     case 6: { | ||||||
|   | |||||||
| @@ -165,6 +165,10 @@ enum BluetoothDeviceRequestType : uint32_t { | |||||||
|   BLUETOOTH_DEVICE_REQUEST_TYPE_CONNECT_V3_WITHOUT_CACHE = 5, |   BLUETOOTH_DEVICE_REQUEST_TYPE_CONNECT_V3_WITHOUT_CACHE = 5, | ||||||
|   BLUETOOTH_DEVICE_REQUEST_TYPE_CLEAR_CACHE = 6, |   BLUETOOTH_DEVICE_REQUEST_TYPE_CLEAR_CACHE = 6, | ||||||
| }; | }; | ||||||
|  | enum VoiceAssistantSubscribeFlag : uint32_t { | ||||||
|  |   VOICE_ASSISTANT_SUBSCRIBE_NONE = 0, | ||||||
|  |   VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO = 1, | ||||||
|  | }; | ||||||
| enum VoiceAssistantRequestFlag : uint32_t { | enum VoiceAssistantRequestFlag : uint32_t { | ||||||
|   VOICE_ASSISTANT_REQUEST_NONE = 0, |   VOICE_ASSISTANT_REQUEST_NONE = 0, | ||||||
|   VOICE_ASSISTANT_REQUEST_USE_VAD = 1, |   VOICE_ASSISTANT_REQUEST_USE_VAD = 1, | ||||||
| @@ -327,7 +331,8 @@ class DeviceInfoResponse : public ProtoMessage { | |||||||
|   uint32_t bluetooth_proxy_feature_flags{0}; |   uint32_t bluetooth_proxy_feature_flags{0}; | ||||||
|   std::string manufacturer{}; |   std::string manufacturer{}; | ||||||
|   std::string friendly_name{}; |   std::string friendly_name{}; | ||||||
|   uint32_t voice_assistant_version{0}; |   uint32_t legacy_voice_assistant_version{0}; | ||||||
|  |   uint32_t voice_assistant_feature_flags{0}; | ||||||
|   std::string suggested_area{}; |   std::string suggested_area{}; | ||||||
|   void encode(ProtoWriteBuffer buffer) const override; |   void encode(ProtoWriteBuffer buffer) const override; | ||||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | #ifdef HAS_PROTO_MESSAGE_DUMP | ||||||
| @@ -1674,6 +1679,7 @@ class BluetoothDeviceClearCacheResponse : public ProtoMessage { | |||||||
| class SubscribeVoiceAssistantRequest : public ProtoMessage { | class SubscribeVoiceAssistantRequest : public ProtoMessage { | ||||||
|  public: |  public: | ||||||
|   bool subscribe{false}; |   bool subscribe{false}; | ||||||
|  |   uint32_t flags{0}; | ||||||
|   void encode(ProtoWriteBuffer buffer) const override; |   void encode(ProtoWriteBuffer buffer) const override; | ||||||
| #ifdef HAS_PROTO_MESSAGE_DUMP | #ifdef HAS_PROTO_MESSAGE_DUMP | ||||||
|   void dump_to(std::string &out) const override; |   void dump_to(std::string &out) const override; | ||||||
| @@ -1749,6 +1755,19 @@ class VoiceAssistantEventResponse : public ProtoMessage { | |||||||
|   bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override; |   bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override; | ||||||
|   bool decode_varint(uint32_t field_id, ProtoVarInt value) override; |   bool decode_varint(uint32_t field_id, ProtoVarInt value) override; | ||||||
| }; | }; | ||||||
|  | class VoiceAssistantAudio : public ProtoMessage { | ||||||
|  |  public: | ||||||
|  |   std::string data{}; | ||||||
|  |   bool end{false}; | ||||||
|  |   void encode(ProtoWriteBuffer buffer) const override; | ||||||
|  | #ifdef HAS_PROTO_MESSAGE_DUMP | ||||||
|  |   void dump_to(std::string &out) const override; | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |  protected: | ||||||
|  |   bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override; | ||||||
|  |   bool decode_varint(uint32_t field_id, ProtoVarInt value) override; | ||||||
|  | }; | ||||||
| class ListEntitiesAlarmControlPanelResponse : public ProtoMessage { | class ListEntitiesAlarmControlPanelResponse : public ProtoMessage { | ||||||
|  public: |  public: | ||||||
|   std::string object_id{}; |   std::string object_id{}; | ||||||
|   | |||||||
| @@ -476,6 +476,14 @@ bool APIServerConnectionBase::send_voice_assistant_request(const VoiceAssistantR | |||||||
| #endif | #endif | ||||||
| #ifdef USE_VOICE_ASSISTANT | #ifdef USE_VOICE_ASSISTANT | ||||||
| #endif | #endif | ||||||
|  | #ifdef USE_VOICE_ASSISTANT | ||||||
|  | bool APIServerConnectionBase::send_voice_assistant_audio(const VoiceAssistantAudio &msg) { | ||||||
|  | #ifdef HAS_PROTO_MESSAGE_DUMP | ||||||
|  |   ESP_LOGVV(TAG, "send_voice_assistant_audio: %s", msg.dump().c_str()); | ||||||
|  | #endif | ||||||
|  |   return this->send_message_<VoiceAssistantAudio>(msg, 106); | ||||||
|  | } | ||||||
|  | #endif | ||||||
| #ifdef USE_ALARM_CONTROL_PANEL | #ifdef USE_ALARM_CONTROL_PANEL | ||||||
| bool APIServerConnectionBase::send_list_entities_alarm_control_panel_response( | bool APIServerConnectionBase::send_list_entities_alarm_control_panel_response( | ||||||
|     const ListEntitiesAlarmControlPanelResponse &msg) { |     const ListEntitiesAlarmControlPanelResponse &msg) { | ||||||
| @@ -971,6 +979,17 @@ bool APIServerConnectionBase::read_message(uint32_t msg_size, uint32_t msg_type, | |||||||
|       ESP_LOGVV(TAG, "on_date_command_request: %s", msg.dump().c_str()); |       ESP_LOGVV(TAG, "on_date_command_request: %s", msg.dump().c_str()); | ||||||
| #endif | #endif | ||||||
|       this->on_date_command_request(msg); |       this->on_date_command_request(msg); | ||||||
|  | #endif | ||||||
|  |       break; | ||||||
|  |     } | ||||||
|  |     case 106: { | ||||||
|  | #ifdef USE_VOICE_ASSISTANT | ||||||
|  |       VoiceAssistantAudio msg; | ||||||
|  |       msg.decode(msg_data, msg_size); | ||||||
|  | #ifdef HAS_PROTO_MESSAGE_DUMP | ||||||
|  |       ESP_LOGVV(TAG, "on_voice_assistant_audio: %s", msg.dump().c_str()); | ||||||
|  | #endif | ||||||
|  |       this->on_voice_assistant_audio(msg); | ||||||
| #endif | #endif | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -240,6 +240,10 @@ class APIServerConnectionBase : public ProtoService { | |||||||
| #ifdef USE_VOICE_ASSISTANT | #ifdef USE_VOICE_ASSISTANT | ||||||
|   virtual void on_voice_assistant_event_response(const VoiceAssistantEventResponse &value){}; |   virtual void on_voice_assistant_event_response(const VoiceAssistantEventResponse &value){}; | ||||||
| #endif | #endif | ||||||
|  | #ifdef USE_VOICE_ASSISTANT | ||||||
|  |   bool send_voice_assistant_audio(const VoiceAssistantAudio &msg); | ||||||
|  |   virtual void on_voice_assistant_audio(const VoiceAssistantAudio &value){}; | ||||||
|  | #endif | ||||||
| #ifdef USE_ALARM_CONTROL_PANEL | #ifdef USE_ALARM_CONTROL_PANEL | ||||||
|   bool send_list_entities_alarm_control_panel_response(const ListEntitiesAlarmControlPanelResponse &msg); |   bool send_list_entities_alarm_control_panel_response(const ListEntitiesAlarmControlPanelResponse &msg); | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -24,28 +24,24 @@ static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE; | |||||||
|  |  | ||||||
| float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; } | float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; } | ||||||
|  |  | ||||||
| void VoiceAssistant::setup() { | bool VoiceAssistant::start_udp_socket_() { | ||||||
|   ESP_LOGCONFIG(TAG, "Setting up Voice Assistant..."); |  | ||||||
|  |  | ||||||
|   global_voice_assistant = this; |  | ||||||
|  |  | ||||||
|   this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); |   this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); | ||||||
|   if (socket_ == nullptr) { |   if (this->socket_ == nullptr) { | ||||||
|     ESP_LOGW(TAG, "Could not create socket"); |     ESP_LOGE(TAG, "Could not create socket"); | ||||||
|     this->mark_failed(); |     this->mark_failed(); | ||||||
|     return; |     return false; | ||||||
|   } |   } | ||||||
|   int enable = 1; |   int enable = 1; | ||||||
|   int err = socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); |   int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); | ||||||
|   if (err != 0) { |   if (err != 0) { | ||||||
|     ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err); |     ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err); | ||||||
|     // we can still continue |     // we can still continue | ||||||
|   } |   } | ||||||
|   err = socket_->setblocking(false); |   err = this->socket_->setblocking(false); | ||||||
|   if (err != 0) { |   if (err != 0) { | ||||||
|     ESP_LOGW(TAG, "Socket unable to set nonblocking mode: errno %d", err); |     ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err); | ||||||
|     this->mark_failed(); |     this->mark_failed(); | ||||||
|     return; |     return false; | ||||||
|   } |   } | ||||||
|  |  | ||||||
| #ifdef USE_SPEAKER | #ifdef USE_SPEAKER | ||||||
| @@ -54,18 +50,30 @@ void VoiceAssistant::setup() { | |||||||
|  |  | ||||||
|     socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055); |     socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055); | ||||||
|     if (sl == 0) { |     if (sl == 0) { | ||||||
|       ESP_LOGW(TAG, "Socket unable to set sockaddr: errno %d", errno); |       ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno); | ||||||
|       this->mark_failed(); |       this->mark_failed(); | ||||||
|       return; |       return false; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     err = socket_->bind((struct sockaddr *) &server, sizeof(server)); |     err = this->socket_->bind((struct sockaddr *) &server, sizeof(server)); | ||||||
|     if (err != 0) { |     if (err != 0) { | ||||||
|       ESP_LOGW(TAG, "Socket unable to bind: errno %d", errno); |       ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno); | ||||||
|       this->mark_failed(); |       this->mark_failed(); | ||||||
|       return; |       return false; | ||||||
|     } |     } | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   this->udp_socket_running_ = true; | ||||||
|  |   return true; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void VoiceAssistant::setup() { | ||||||
|  |   ESP_LOGCONFIG(TAG, "Setting up Voice Assistant..."); | ||||||
|  |  | ||||||
|  |   global_voice_assistant = this; | ||||||
|  |  | ||||||
|  | #ifdef USE_SPEAKER | ||||||
|  |   if (this->speaker_ != nullptr) { | ||||||
|     ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); |     ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE); | ||||||
|     this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE); |     this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE); | ||||||
|     if (this->speaker_buffer_ == nullptr) { |     if (this->speaker_buffer_ == nullptr) { | ||||||
| @@ -238,8 +246,20 @@ void VoiceAssistant::loop() { | |||||||
|       size_t available = this->ring_buffer_->available(); |       size_t available = this->ring_buffer_->available(); | ||||||
|       while (available >= SEND_BUFFER_SIZE) { |       while (available >= SEND_BUFFER_SIZE) { | ||||||
|         size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0); |         size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0); | ||||||
|  |         if (this->audio_mode_ == AUDIO_MODE_API) { | ||||||
|  |           api::VoiceAssistantAudio msg; | ||||||
|  |           msg.data.assign((char *) this->send_buffer_, read_bytes); | ||||||
|  |           this->api_client_->send_voice_assistant_audio(msg); | ||||||
|  |         } else { | ||||||
|  |           if (!this->udp_socket_running_) { | ||||||
|  |             if (!this->start_udp_socket_()) { | ||||||
|  |               this->set_state_(State::STOP_MICROPHONE, State::IDLE); | ||||||
|  |               break; | ||||||
|  |             } | ||||||
|  |           } | ||||||
|           this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_, |           this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_, | ||||||
|                                 sizeof(this->dest_addr_)); |                                 sizeof(this->dest_addr_)); | ||||||
|  |         } | ||||||
|         available = this->ring_buffer_->available(); |         available = this->ring_buffer_->available(); | ||||||
|       } |       } | ||||||
|  |  | ||||||
| @@ -268,6 +288,7 @@ void VoiceAssistant::loop() { | |||||||
| #ifdef USE_SPEAKER | #ifdef USE_SPEAKER | ||||||
|       if (this->speaker_ != nullptr) { |       if (this->speaker_ != nullptr) { | ||||||
|         ssize_t received_len = 0; |         ssize_t received_len = 0; | ||||||
|  |         if (this->audio_mode_ == AUDIO_MODE_UDP) { | ||||||
|           if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) { |           if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) { | ||||||
|             received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE); |             received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE); | ||||||
|             if (received_len > 0) { |             if (received_len > 0) { | ||||||
| @@ -278,12 +299,14 @@ void VoiceAssistant::loop() { | |||||||
|           } else { |           } else { | ||||||
|             ESP_LOGD(TAG, "Receive buffer full"); |             ESP_LOGD(TAG, "Receive buffer full"); | ||||||
|           } |           } | ||||||
|  |         } | ||||||
|         // Build a small buffer of audio before sending to the speaker |         // Build a small buffer of audio before sending to the speaker | ||||||
|         if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4) |         bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0); | ||||||
|  |         if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream) | ||||||
|           this->write_speaker_(); |           this->write_speaker_(); | ||||||
|         if (this->wait_for_stream_end_) { |         if (this->wait_for_stream_end_) { | ||||||
|           this->cancel_timeout("playing"); |           this->cancel_timeout("playing"); | ||||||
|           if (this->stream_ended_ && received_len < 0) { |           if (end_of_stream) { | ||||||
|             ESP_LOGD(TAG, "End of audio stream received"); |             ESP_LOGD(TAG, "End of audio stream received"); | ||||||
|             this->cancel_timeout("speaker-timeout"); |             this->cancel_timeout("speaker-timeout"); | ||||||
|             this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED); |             this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED); | ||||||
| @@ -428,6 +451,22 @@ void VoiceAssistant::failed_to_start() { | |||||||
|   this->set_state_(State::STOP_MICROPHONE, State::IDLE); |   this->set_state_(State::STOP_MICROPHONE, State::IDLE); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | void VoiceAssistant::start_streaming() { | ||||||
|  |   if (this->state_ != State::STARTING_PIPELINE) { | ||||||
|  |     this->signal_stop_(); | ||||||
|  |     return; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   ESP_LOGD(TAG, "Client started, streaming microphone"); | ||||||
|  |   this->audio_mode_ = AUDIO_MODE_API; | ||||||
|  |  | ||||||
|  |   if (this->mic_->is_running()) { | ||||||
|  |     this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE); | ||||||
|  |   } else { | ||||||
|  |     this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) { | void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) { | ||||||
|   if (this->state_ != State::STARTING_PIPELINE) { |   if (this->state_ != State::STARTING_PIPELINE) { | ||||||
|     this->signal_stop_(); |     this->signal_stop_(); | ||||||
| @@ -435,6 +474,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   ESP_LOGD(TAG, "Client started, streaming microphone"); |   ESP_LOGD(TAG, "Client started, streaming microphone"); | ||||||
|  |   this->audio_mode_ = AUDIO_MODE_UDP; | ||||||
|  |  | ||||||
|   memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_)); |   memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_)); | ||||||
|   if (this->dest_addr_.ss_family == AF_INET) { |   if (this->dest_addr_.ss_family == AF_INET) { | ||||||
| @@ -688,6 +728,17 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | void VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg) { | ||||||
|  |   if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) { | ||||||
|  |     memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length()); | ||||||
|  |     this->speaker_buffer_index_ += msg.data.length(); | ||||||
|  |     this->speaker_buffer_size_ += msg.data.length(); | ||||||
|  |     this->speaker_bytes_received_ += msg.data.length(); | ||||||
|  |   } else { | ||||||
|  |     ESP_LOGE(TAG, "Cannot receive audio, buffer is full"); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| VoiceAssistant *global_voice_assistant = nullptr;  // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) | VoiceAssistant *global_voice_assistant = nullptr;  // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) | ||||||
|  |  | ||||||
| }  // namespace voice_assistant | }  // namespace voice_assistant | ||||||
|   | |||||||
| @@ -29,9 +29,14 @@ namespace voice_assistant { | |||||||
|  |  | ||||||
| // Version 1: Initial version | // Version 1: Initial version | ||||||
| // Version 2: Adds raw speaker support | // Version 2: Adds raw speaker support | ||||||
| // Version 3: Unused/skip | static const uint32_t LEGACY_INITIAL_VERSION = 1; | ||||||
| static const uint32_t INITIAL_VERSION = 1; | static const uint32_t LEGACY_SPEAKER_SUPPORT = 2; | ||||||
| static const uint32_t SPEAKER_SUPPORT = 2; |  | ||||||
|  | enum VoiceAssistantFeature : uint32_t { | ||||||
|  |   FEATURE_VOICE_ASSISTANT = 1 << 0, | ||||||
|  |   FEATURE_SPEAKER = 1 << 1, | ||||||
|  |   FEATURE_API_AUDIO = 1 << 2, | ||||||
|  | }; | ||||||
|  |  | ||||||
| enum class State { | enum class State { | ||||||
|   IDLE, |   IDLE, | ||||||
| @@ -49,11 +54,17 @@ enum class State { | |||||||
|   RESPONSE_FINISHED, |   RESPONSE_FINISHED, | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | enum AudioMode : uint8_t { | ||||||
|  |   AUDIO_MODE_UDP, | ||||||
|  |   AUDIO_MODE_API, | ||||||
|  | }; | ||||||
|  |  | ||||||
| class VoiceAssistant : public Component { | class VoiceAssistant : public Component { | ||||||
|  public: |  public: | ||||||
|   void setup() override; |   void setup() override; | ||||||
|   void loop() override; |   void loop() override; | ||||||
|   float get_setup_priority() const override; |   float get_setup_priority() const override; | ||||||
|  |   void start_streaming(); | ||||||
|   void start_streaming(struct sockaddr_storage *addr, uint16_t port); |   void start_streaming(struct sockaddr_storage *addr, uint16_t port); | ||||||
|   void failed_to_start(); |   void failed_to_start(); | ||||||
|  |  | ||||||
| @@ -71,19 +82,32 @@ class VoiceAssistant : public Component { | |||||||
|   } |   } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|   uint32_t get_version() const { |   uint32_t get_legacy_version() const { | ||||||
| #ifdef USE_SPEAKER | #ifdef USE_SPEAKER | ||||||
|     if (this->speaker_ != nullptr) { |     if (this->speaker_ != nullptr) { | ||||||
|       return SPEAKER_SUPPORT; |       return LEGACY_SPEAKER_SUPPORT; | ||||||
|     } |     } | ||||||
| #endif | #endif | ||||||
|     return INITIAL_VERSION; |     return LEGACY_INITIAL_VERSION; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   uint32_t get_feature_flags() const { | ||||||
|  |     uint32_t flags = 0; | ||||||
|  |     flags |= VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT; | ||||||
|  | #ifdef USE_SPEAKER | ||||||
|  |     if (this->speaker_ != nullptr) { | ||||||
|  |       flags |= VoiceAssistantFeature::FEATURE_SPEAKER; | ||||||
|  |       flags |= VoiceAssistantFeature::FEATURE_API_AUDIO; | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  |     return flags; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void request_start(bool continuous, bool silence_detection); |   void request_start(bool continuous, bool silence_detection); | ||||||
|   void request_stop(); |   void request_stop(); | ||||||
|  |  | ||||||
|   void on_event(const api::VoiceAssistantEventResponse &msg); |   void on_event(const api::VoiceAssistantEventResponse &msg); | ||||||
|  |   void on_audio(const api::VoiceAssistantAudio &msg); | ||||||
|  |  | ||||||
|   bool is_running() const { return this->state_ != State::IDLE; } |   bool is_running() const { return this->state_ != State::IDLE; } | ||||||
|   void set_continuous(bool continuous) { this->continuous_ = continuous; } |   void set_continuous(bool continuous) { this->continuous_ = continuous; } | ||||||
| @@ -201,6 +225,10 @@ class VoiceAssistant : public Component { | |||||||
|  |  | ||||||
|   State state_{State::IDLE}; |   State state_{State::IDLE}; | ||||||
|   State desired_state_{State::IDLE}; |   State desired_state_{State::IDLE}; | ||||||
|  |  | ||||||
|  |   AudioMode audio_mode_{AUDIO_MODE_UDP}; | ||||||
|  |   bool udp_socket_running_{false}; | ||||||
|  |   bool start_udp_socket_(); | ||||||
| }; | }; | ||||||
|  |  | ||||||
| template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> { | template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user