mirror of
https://github.com/esphome/esphome.git
synced 2025-10-30 06:33:51 +00:00
Send/Receive Voice Assistant audio via API (#6471)
Co-authored-by: Michael Hansen <mike@rhasspy.org>
This commit is contained in:
@@ -24,28 +24,24 @@ static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
|
||||
|
||||
// Setup priority reported for this component: setup_priority::AFTER_CONNECTION.
float VoiceAssistant::get_setup_priority() const {
  return setup_priority::AFTER_CONNECTION;
}
|
||||
|
||||
void VoiceAssistant::setup() {
|
||||
ESP_LOGCONFIG(TAG, "Setting up Voice Assistant...");
|
||||
|
||||
global_voice_assistant = this;
|
||||
|
||||
bool VoiceAssistant::start_udp_socket_() {
|
||||
this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
|
||||
if (socket_ == nullptr) {
|
||||
ESP_LOGW(TAG, "Could not create socket");
|
||||
if (this->socket_ == nullptr) {
|
||||
ESP_LOGE(TAG, "Could not create socket");
|
||||
this->mark_failed();
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
int enable = 1;
|
||||
int err = socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
|
||||
int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
|
||||
if (err != 0) {
|
||||
ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
|
||||
// we can still continue
|
||||
}
|
||||
err = socket_->setblocking(false);
|
||||
err = this->socket_->setblocking(false);
|
||||
if (err != 0) {
|
||||
ESP_LOGW(TAG, "Socket unable to set nonblocking mode: errno %d", err);
|
||||
ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
|
||||
this->mark_failed();
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef USE_SPEAKER
|
||||
@@ -54,18 +50,30 @@ void VoiceAssistant::setup() {
|
||||
|
||||
socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
|
||||
if (sl == 0) {
|
||||
ESP_LOGW(TAG, "Socket unable to set sockaddr: errno %d", errno);
|
||||
ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
|
||||
this->mark_failed();
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
err = socket_->bind((struct sockaddr *) &server, sizeof(server));
|
||||
err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
|
||||
if (err != 0) {
|
||||
ESP_LOGW(TAG, "Socket unable to bind: errno %d", errno);
|
||||
ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
|
||||
this->mark_failed();
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
this->udp_socket_running_ = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
void VoiceAssistant::setup() {
|
||||
ESP_LOGCONFIG(TAG, "Setting up Voice Assistant...");
|
||||
|
||||
global_voice_assistant = this;
|
||||
|
||||
#ifdef USE_SPEAKER
|
||||
if (this->speaker_ != nullptr) {
|
||||
ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
|
||||
this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
|
||||
if (this->speaker_buffer_ == nullptr) {
|
||||
@@ -238,8 +246,20 @@ void VoiceAssistant::loop() {
|
||||
size_t available = this->ring_buffer_->available();
|
||||
while (available >= SEND_BUFFER_SIZE) {
|
||||
size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
|
||||
this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
|
||||
sizeof(this->dest_addr_));
|
||||
if (this->audio_mode_ == AUDIO_MODE_API) {
|
||||
api::VoiceAssistantAudio msg;
|
||||
msg.data.assign((char *) this->send_buffer_, read_bytes);
|
||||
this->api_client_->send_voice_assistant_audio(msg);
|
||||
} else {
|
||||
if (!this->udp_socket_running_) {
|
||||
if (!this->start_udp_socket_()) {
|
||||
this->set_state_(State::STOP_MICROPHONE, State::IDLE);
|
||||
break;
|
||||
}
|
||||
}
|
||||
this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
|
||||
sizeof(this->dest_addr_));
|
||||
}
|
||||
available = this->ring_buffer_->available();
|
||||
}
|
||||
|
||||
@@ -268,22 +288,25 @@ void VoiceAssistant::loop() {
|
||||
#ifdef USE_SPEAKER
|
||||
if (this->speaker_ != nullptr) {
|
||||
ssize_t received_len = 0;
|
||||
if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
|
||||
received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
|
||||
if (received_len > 0) {
|
||||
this->speaker_buffer_index_ += received_len;
|
||||
this->speaker_buffer_size_ += received_len;
|
||||
this->speaker_bytes_received_ += received_len;
|
||||
if (this->audio_mode_ == AUDIO_MODE_UDP) {
|
||||
if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
|
||||
received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
|
||||
if (received_len > 0) {
|
||||
this->speaker_buffer_index_ += received_len;
|
||||
this->speaker_buffer_size_ += received_len;
|
||||
this->speaker_bytes_received_ += received_len;
|
||||
}
|
||||
} else {
|
||||
ESP_LOGD(TAG, "Receive buffer full");
|
||||
}
|
||||
} else {
|
||||
ESP_LOGD(TAG, "Receive buffer full");
|
||||
}
|
||||
// Build a small buffer of audio before sending to the speaker
|
||||
if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4)
|
||||
bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
|
||||
if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
|
||||
this->write_speaker_();
|
||||
if (this->wait_for_stream_end_) {
|
||||
this->cancel_timeout("playing");
|
||||
if (this->stream_ended_ && received_len < 0) {
|
||||
if (end_of_stream) {
|
||||
ESP_LOGD(TAG, "End of audio stream received");
|
||||
this->cancel_timeout("speaker-timeout");
|
||||
this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
|
||||
@@ -428,6 +451,22 @@ void VoiceAssistant::failed_to_start() {
|
||||
this->set_state_(State::STOP_MICROPHONE, State::IDLE);
|
||||
}
|
||||
|
||||
void VoiceAssistant::start_streaming() {
|
||||
if (this->state_ != State::STARTING_PIPELINE) {
|
||||
this->signal_stop_();
|
||||
return;
|
||||
}
|
||||
|
||||
ESP_LOGD(TAG, "Client started, streaming microphone");
|
||||
this->audio_mode_ = AUDIO_MODE_API;
|
||||
|
||||
if (this->mic_->is_running()) {
|
||||
this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
|
||||
} else {
|
||||
this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
|
||||
}
|
||||
}
|
||||
|
||||
void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
|
||||
if (this->state_ != State::STARTING_PIPELINE) {
|
||||
this->signal_stop_();
|
||||
@@ -435,6 +474,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
|
||||
}
|
||||
|
||||
ESP_LOGD(TAG, "Client started, streaming microphone");
|
||||
this->audio_mode_ = AUDIO_MODE_UDP;
|
||||
|
||||
memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
|
||||
if (this->dest_addr_.ss_family == AF_INET) {
|
||||
@@ -688,6 +728,17 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
|
||||
}
|
||||
}
|
||||
|
||||
// Handle an audio chunk received over the API and append it to the speaker buffer.
//
// @param msg  API message whose `data` field holds the raw audio bytes.
//
// The chunk is appended only if it fits below SPEAKER_BUFFER_SIZE; otherwise an
// error is logged and the chunk is dropped. The speaker buffer is allocated in
// setup() only when USE_SPEAKER is defined and a speaker is configured, and that
// allocation is allowed to fail, so the chunk is ignored when there is no speaker
// or no buffer instead of copying through an invalid pointer.
void VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg) {
#ifdef USE_SPEAKER
  // Check speaker_ first: without a speaker the buffer was never allocated.
  if (this->speaker_ == nullptr || this->speaker_buffer_ == nullptr) {
    return;
  }

  const size_t length = msg.data.length();
  if (this->speaker_buffer_index_ + length < SPEAKER_BUFFER_SIZE) {
    memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), length);
    this->speaker_buffer_index_ += length;
    this->speaker_buffer_size_ += length;
    this->speaker_bytes_received_ += length;
  } else {
    ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
  }
#endif
}
|
||||
|
||||
// Global pointer to the voice assistant component. Starts as nullptr and is
// assigned `this` in VoiceAssistant::setup().
VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
|
||||
} // namespace voice_assistant
|
||||
|
||||
@@ -29,9 +29,14 @@ namespace voice_assistant {
|
||||
|
||||
// Version 1: Initial version
|
||||
// Version 2: Adds raw speaker support
|
||||
// Version 3: Unused/skip
|
||||
static const uint32_t INITIAL_VERSION = 1;
|
||||
static const uint32_t SPEAKER_SUPPORT = 2;
|
||||
static const uint32_t LEGACY_INITIAL_VERSION = 1;
|
||||
static const uint32_t LEGACY_SPEAKER_SUPPORT = 2;
|
||||
|
||||
// Bit flags combined by VoiceAssistant::get_feature_flags(). Each enumerator
// occupies a single bit so the values can be OR-ed into one uint32_t.
enum VoiceAssistantFeature : uint32_t {
  // Always reported by get_feature_flags().
  FEATURE_VOICE_ASSISTANT = 1U << 0,
  // Reported only when a speaker is configured (USE_SPEAKER builds).
  FEATURE_SPEAKER = 1U << 1,
  // Reported together with FEATURE_SPEAKER by get_feature_flags().
  FEATURE_API_AUDIO = 1U << 2,
};
|
||||
|
||||
enum class State {
|
||||
IDLE,
|
||||
@@ -49,11 +54,17 @@ enum class State {
|
||||
RESPONSE_FINISHED,
|
||||
};
|
||||
|
||||
// Transport used for voice assistant audio.
enum AudioMode : uint8_t {
  // Audio is exchanged over the component's UDP socket; selected by
  // start_streaming(addr, port) and used as the member's default.
  AUDIO_MODE_UDP = 0,
  // Audio is exchanged as VoiceAssistantAudio messages over the API
  // connection; selected by the no-argument start_streaming().
  AUDIO_MODE_API = 1,
};
|
||||
|
||||
class VoiceAssistant : public Component {
|
||||
public:
|
||||
void setup() override;
|
||||
void loop() override;
|
||||
float get_setup_priority() const override;
|
||||
void start_streaming();
|
||||
void start_streaming(struct sockaddr_storage *addr, uint16_t port);
|
||||
void failed_to_start();
|
||||
|
||||
@@ -71,19 +82,32 @@ class VoiceAssistant : public Component {
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32_t get_version() const {
|
||||
uint32_t get_legacy_version() const {
|
||||
#ifdef USE_SPEAKER
|
||||
if (this->speaker_ != nullptr) {
|
||||
return SPEAKER_SUPPORT;
|
||||
return LEGACY_SPEAKER_SUPPORT;
|
||||
}
|
||||
#endif
|
||||
return INITIAL_VERSION;
|
||||
return LEGACY_INITIAL_VERSION;
|
||||
}
|
||||
|
||||
// Build the VoiceAssistantFeature bitmask for this component.
//
// FEATURE_VOICE_ASSISTANT is always set. FEATURE_SPEAKER and FEATURE_API_AUDIO
// are added only in USE_SPEAKER builds when a speaker has been configured.
//
// @return OR-combination of VoiceAssistantFeature bits.
uint32_t get_feature_flags() const {
  uint32_t feature_mask = VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT;
#ifdef USE_SPEAKER
  if (this->speaker_ != nullptr) {
    feature_mask |= VoiceAssistantFeature::FEATURE_SPEAKER | VoiceAssistantFeature::FEATURE_API_AUDIO;
  }
#endif
  return feature_mask;
}
|
||||
|
||||
void request_start(bool continuous, bool silence_detection);
|
||||
void request_stop();
|
||||
|
||||
void on_event(const api::VoiceAssistantEventResponse &msg);
|
||||
void on_audio(const api::VoiceAssistantAudio &msg);
|
||||
|
||||
bool is_running() const { return this->state_ != State::IDLE; }
|
||||
void set_continuous(bool continuous) { this->continuous_ = continuous; }
|
||||
@@ -201,6 +225,10 @@ class VoiceAssistant : public Component {
|
||||
|
||||
State state_{State::IDLE};
|
||||
State desired_state_{State::IDLE};
|
||||
|
||||
AudioMode audio_mode_{AUDIO_MODE_UDP};
|
||||
bool udp_socket_running_{false};
|
||||
bool start_udp_socket_();
|
||||
};
|
||||
|
||||
template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
|
||||
|
||||
Reference in New Issue
Block a user