From 1ea5d90ea32fe9b8f4adf0e120cc3e260f290c34 Mon Sep 17 00:00:00 2001 From: Jesse Hills <3060199+jesserockz@users.noreply.github.com> Date: Wed, 31 May 2023 16:30:53 +1200 Subject: [PATCH] Continuous voice_assistant and silence detection (#4892) --- esphome/components/api/api.proto | 1 + esphome/components/api/api_connection.cpp | 3 +- esphome/components/api/api_connection.h | 2 +- esphome/components/api/api_pb2.cpp | 19 +++++- esphome/components/api/api_pb2.h | 2 + esphome/components/api/api_server.cpp | 6 +- esphome/components/api/api_server.h | 2 +- .../i2s_audio/microphone/__init__.py | 2 +- .../components/voice_assistant/__init__.py | 49 ++++++++++++-- .../voice_assistant/voice_assistant.cpp | 67 ++++++++++++++++--- .../voice_assistant/voice_assistant.h | 47 ++++++++++++- esphome/const.py | 1 + 12 files changed, 176 insertions(+), 25 deletions(-) diff --git a/esphome/components/api/api.proto b/esphome/components/api/api.proto index 4cc98c91d9..34678fde0f 100644 --- a/esphome/components/api/api.proto +++ b/esphome/components/api/api.proto @@ -1397,6 +1397,7 @@ message VoiceAssistantRequest { option (ifdef) = "USE_VOICE_ASSISTANT"; bool start = 1; + string conversation_id = 2; } message VoiceAssistantResponse { diff --git a/esphome/components/api/api_connection.cpp b/esphome/components/api/api_connection.cpp index c350197e68..3983c6403b 100644 --- a/esphome/components/api/api_connection.cpp +++ b/esphome/components/api/api_connection.cpp @@ -895,11 +895,12 @@ BluetoothConnectionsFreeResponse APIConnection::subscribe_bluetooth_connections_ #endif #ifdef USE_VOICE_ASSISTANT -bool APIConnection::request_voice_assistant(bool start) { +bool APIConnection::request_voice_assistant(bool start, const std::string &conversation_id) { if (!this->voice_assistant_subscription_) return false; VoiceAssistantRequest msg; msg.start = start; + msg.conversation_id = conversation_id; return this->send_voice_assistant_request(msg); } void APIConnection::on_voice_assistant_response(const VoiceAssistantResponse &msg) { diff --git a/esphome/components/api/api_connection.h b/esphome/components/api/api_connection.h index 78ecbb98e6..d4e9cc656e 100644 --- a/esphome/components/api/api_connection.h +++ b/esphome/components/api/api_connection.h @@ -128,7 +128,7 @@ class APIConnection : public APIServerConnection { void subscribe_voice_assistant(const SubscribeVoiceAssistantRequest &msg) override { this->voice_assistant_subscription_ = msg.subscribe; } - bool request_voice_assistant(bool start); + bool request_voice_assistant(bool start, const std::string &conversation_id); void on_voice_assistant_response(const VoiceAssistantResponse &msg) override; void on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) override; #endif diff --git a/esphome/components/api/api_pb2.cpp b/esphome/components/api/api_pb2.cpp index 1dd8c82e00..29e8e207dd 100644 --- a/esphome/components/api/api_pb2.cpp +++ b/esphome/components/api/api_pb2.cpp @@ -6187,7 +6187,20 @@ bool VoiceAssistantRequest::decode_varint(uint32_t field_id, ProtoVarInt value) return false; } } -void VoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { buffer.encode_bool(1, this->start); } +bool VoiceAssistantRequest::decode_length(uint32_t field_id, ProtoLengthDelimited value) { + switch (field_id) { + case 2: { + this->conversation_id = value.as_string(); + return true; + } + default: + return false; + } +} +void VoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { + buffer.encode_bool(1, this->start); + buffer.encode_string(2, this->conversation_id); +} #ifdef HAS_PROTO_MESSAGE_DUMP void VoiceAssistantRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; @@ -6195,6 +6208,10 @@ void VoiceAssistantRequest::dump_to(std::string &out) const { out.append(" start: "); out.append(YESNO(this->start)); out.append("\n"); + + out.append(" conversation_id: "); + out.append("'").append(this->conversation_id).append("'"); + out.append("\n"); out.append("}"); } #endif diff --git a/esphome/components/api/api_pb2.h b/esphome/components/api/api_pb2.h index 0f4b79de19..cd1cfb595a 100644 --- a/esphome/components/api/api_pb2.h +++ b/esphome/components/api/api_pb2.h @@ -1604,12 +1604,14 @@ class SubscribeVoiceAssistantRequest : public ProtoMessage { class VoiceAssistantRequest : public ProtoMessage { public: bool start{false}; + std::string conversation_id{}; void encode(ProtoWriteBuffer buffer) const override; #ifdef HAS_PROTO_MESSAGE_DUMP void dump_to(std::string &out) const override; #endif protected: + bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override; bool decode_varint(uint32_t field_id, ProtoVarInt value) override; }; class VoiceAssistantResponse : public ProtoMessage { diff --git a/esphome/components/api/api_server.cpp b/esphome/components/api/api_server.cpp index 068f74315c..3dd47c4dd8 100644 --- a/esphome/components/api/api_server.cpp +++ b/esphome/components/api/api_server.cpp @@ -428,16 +428,16 @@ void APIServer::on_shutdown() { } #ifdef USE_VOICE_ASSISTANT -bool APIServer::start_voice_assistant() { +bool APIServer::start_voice_assistant(const std::string &conversation_id) { for (auto &c : this->clients_) { - if (c->request_voice_assistant(true)) + if (c->request_voice_assistant(true, conversation_id)) return true; } return false; } void APIServer::stop_voice_assistant() { for (auto &c : this->clients_) { - if (c->request_voice_assistant(false)) + if (c->request_voice_assistant(false, "")) return; } } diff --git a/esphome/components/api/api_server.h b/esphome/components/api/api_server.h index a1bec2802f..79ba7b17f1 100644 --- a/esphome/components/api/api_server.h +++ b/esphome/components/api/api_server.h @@ -96,7 +96,7 @@ class APIServer : public Component, public Controller { #endif #ifdef USE_VOICE_ASSISTANT - bool start_voice_assistant(); + bool start_voice_assistant(const std::string &conversation_id); void stop_voice_assistant(); #endif diff --git a/esphome/components/i2s_audio/microphone/__init__.py b/esphome/components/i2s_audio/microphone/__init__.py index 07f5158188..b917da3045 100644 --- a/esphome/components/i2s_audio/microphone/__init__.py +++ b/esphome/components/i2s_audio/microphone/__init__.py @@ -62,7 +62,7 @@ BASE_SCHEMA = microphone.MICROPHONE_SCHEMA.extend( cv.GenerateID(): cv.declare_id(I2SAudioMicrophone), cv.GenerateID(CONF_I2S_AUDIO_ID): cv.use_id(I2SAudioComponent), cv.Optional(CONF_CHANNEL, default="right"): cv.enum(CHANNELS), - cv.Optional(CONF_BITS_PER_SAMPLE, default="16bit"): cv.All( + cv.Optional(CONF_BITS_PER_SAMPLE, default="32bit"): cv.All( _validate_bits, cv.enum(BITS_PER_SAMPLE) ), } diff --git a/esphome/components/voice_assistant/__init__.py b/esphome/components/voice_assistant/__init__.py index 624fcdf52c..55d995be88 100644 --- a/esphome/components/voice_assistant/__init__.py +++ b/esphome/components/voice_assistant/__init__.py @@ -1,16 +1,23 @@ import esphome.config_validation as cv import esphome.codegen as cg -from esphome.const import CONF_ID, CONF_MICROPHONE, CONF_SPEAKER +from esphome.const import ( + CONF_ID, + CONF_MICROPHONE, + CONF_SPEAKER, + CONF_MEDIA_PLAYER, +) from esphome import automation -from esphome.automation import register_action -from esphome.components import microphone, speaker +from esphome.automation import register_action, register_condition +from esphome.components import microphone, speaker, media_player AUTO_LOAD = ["socket"] DEPENDENCIES = ["api", "microphone"] CODEOWNERS = ["@jesserockz"] +CONF_SILENCE_DETECTION = "silence_detection" +CONF_ON_LISTENING = "on_listening" CONF_ON_START = "on_start" CONF_ON_STT_END = "on_stt_end" CONF_ON_TTS_START = "on_tts_start" @@ -25,16 +32,25 @@ VoiceAssistant = voice_assistant_ns.class_("VoiceAssistant", cg.Component) StartAction = voice_assistant_ns.class_( "StartAction", automation.Action, cg.Parented.template(VoiceAssistant) ) +StartContinuousAction = voice_assistant_ns.class_( + "StartContinuousAction", automation.Action, cg.Parented.template(VoiceAssistant) +) StopAction = voice_assistant_ns.class_( "StopAction", automation.Action, cg.Parented.template(VoiceAssistant) ) +IsRunningCondition = voice_assistant_ns.class_( + "IsRunningCondition", automation.Condition, cg.Parented.template(VoiceAssistant) +) CONFIG_SCHEMA = cv.Schema( { cv.GenerateID(): cv.declare_id(VoiceAssistant), cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone), - cv.Optional(CONF_SPEAKER): cv.use_id(speaker.Speaker), + cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker), + cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(media_player.MediaPlayer), + cv.Optional(CONF_SILENCE_DETECTION, default=True): cv.boolean, + cv.Optional(CONF_ON_LISTENING): automation.validate_automation(single=True), cv.Optional(CONF_ON_START): automation.validate_automation(single=True), cv.Optional(CONF_ON_STT_END): automation.validate_automation(single=True), cv.Optional(CONF_ON_TTS_START): automation.validate_automation(single=True), @@ -56,6 +72,17 @@ async def to_code(config): spkr = await cg.get_variable(config[CONF_SPEAKER]) cg.add(var.set_speaker(spkr)) + if CONF_MEDIA_PLAYER in config: + mp = await cg.get_variable(config[CONF_MEDIA_PLAYER]) + cg.add(var.set_media_player(mp)) + + cg.add(var.set_silence_detection(config[CONF_SILENCE_DETECTION])) + + if CONF_ON_LISTENING in config: + await automation.build_automation( + var.get_listening_trigger(), [], config[CONF_ON_LISTENING] + ) + if CONF_ON_START in config: await automation.build_automation( var.get_start_trigger(), [], config[CONF_ON_START] @@ -96,6 +123,11 @@ async def to_code(config): VOICE_ASSISTANT_ACTION_SCHEMA = cv.Schema({cv.GenerateID(): cv.use_id(VoiceAssistant)}) +@register_action( + "voice_assistant.start_continuous", + StartContinuousAction, + VOICE_ASSISTANT_ACTION_SCHEMA, +) @register_action("voice_assistant.start", StartAction, VOICE_ASSISTANT_ACTION_SCHEMA) async def voice_assistant_listen_to_code(config, action_id, template_arg, args): var = cg.new_Pvariable(action_id, template_arg) @@ -108,3 +140,12 @@ async def voice_assistant_stop_to_code(config, action_id, template_arg, args): var = cg.new_Pvariable(action_id, template_arg) await cg.register_parented(var, config[CONF_ID]) return var + + +@register_condition( + "voice_assistant.is_running", IsRunningCondition, VOICE_ASSISTANT_ACTION_SCHEMA +) +async def voice_assistant_is_running_to_code(config, condition_id, template_arg, args): + var = cg.new_Pvariable(condition_id, template_arg) + await cg.register_parented(var, config[CONF_ID]) + return var diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp index 4245578711..44d640ff39 100644 --- a/esphome/components/voice_assistant/voice_assistant.cpp +++ b/esphome/components/voice_assistant/voice_assistant.cpp @@ -69,17 +69,42 @@ void VoiceAssistant::setup() { void VoiceAssistant::loop() { #ifdef USE_SPEAKER - if (this->speaker_ == nullptr) { + if (this->speaker_ != nullptr) { + uint8_t buf[1024]; + auto len = this->socket_->read(buf, sizeof(buf)); + if (len == -1) { + return; + } + this->speaker_->play(buf, len); + this->set_timeout("data-incoming", 200, [this]() { + if (this->continuous_) { + this->request_start(true); + } + }); return; } - - uint8_t buf[1024]; - auto len = this->socket_->read(buf, sizeof(buf)); - if (len == -1) { - return; - } - this->speaker_->play(buf, len); #endif +#ifdef USE_MEDIA_PLAYER + if (this->media_player_ != nullptr) { + if (!this->playing_tts_ || + this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_PLAYING) { + return; + } + this->set_timeout("playing-media", 1000, [this]() { + this->playing_tts_ = false; + if (this->continuous_) { + this->request_start(true); + } + }); + return; + } +#endif + // Set a 1 second timeout to start the voice assistant again. + this->set_timeout("continuous-no-sound", 1000, [this]() { + if (this->continuous_) { + this->request_start(true); + } + }); } void VoiceAssistant::start(struct sockaddr_storage *addr, uint16_t port) { @@ -100,14 +125,19 @@ void VoiceAssistant::start(struct sockaddr_storage *addr, uint16_t port) { } this->running_ = true; this->mic_->start(); + this->listening_trigger_->trigger(); } -void VoiceAssistant::request_start() { +void VoiceAssistant::request_start(bool continuous) { ESP_LOGD(TAG, "Requesting start..."); - if (!api::global_api_server->start_voice_assistant()) { + if (!api::global_api_server->start_voice_assistant(this->conversation_id_)) { ESP_LOGW(TAG, "Could not request start."); this->error_trigger_->trigger("not-connected", "Could not request start."); + this->continuous_ = false; + return; } + this->continuous_ = continuous; + this->set_timeout("reset-conversation_id", 5 * 60 * 1000, [this]() { this->conversation_id_ = ""; }); } void VoiceAssistant::signal_stop() { @@ -136,9 +166,18 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { return; } ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str()); + this->signal_stop(); this->stt_end_trigger_->trigger(text); break; } + case api::enums::VOICE_ASSISTANT_INTENT_END: { + for (auto arg : msg.data) { + if (arg.name == "conversation_id") { + this->conversation_id_ = std::move(arg.value); + } + } + break; + } case api::enums::VOICE_ASSISTANT_TTS_START: { std::string text; for (auto arg : msg.data) { @@ -166,6 +205,12 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { return; } ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str()); +#ifdef USE_MEDIA_PLAYER + if (this->media_player_ != nullptr) { + this->playing_tts_ = true; + this->media_player_->make_call().set_media_url(url).perform(); + } +#endif this->tts_end_trigger_->trigger(url); break; } @@ -184,6 +229,8 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { } } ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str()); + this->continuous_ = false; + this->signal_stop(); this->error_trigger_->trigger(code, message); } default: diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h index c1a6e8883b..b103584509 100644 --- a/esphome/components/voice_assistant/voice_assistant.h +++ b/esphome/components/voice_assistant/voice_assistant.h @@ -15,6 +15,9 @@ #ifdef USE_SPEAKER #include "esphome/components/speaker/speaker.h" #endif +#ifdef USE_MEDIA_PLAYER +#include "esphome/components/media_player/media_player.h" +#endif #include "esphome/components/socket/socket.h" namespace esphome { @@ -22,8 +25,10 @@ namespace voice_assistant { // Version 1: Initial version // Version 2: Adds raw speaker support +// Version 3: Adds continuous support static const uint32_t INITIAL_VERSION = 1; static const uint32_t SPEAKER_SUPPORT = 2; +static const uint32_t SILENCE_DETECTION_SUPPORT = 3; class VoiceAssistant : public Component { public: @@ -36,20 +41,34 @@ class VoiceAssistant : public Component { #ifdef USE_SPEAKER void set_speaker(speaker::Speaker *speaker) { this->speaker_ = speaker; } #endif +#ifdef USE_MEDIA_PLAYER + void set_media_player(media_player::MediaPlayer *media_player) { this->media_player_ = media_player; } +#endif uint32_t get_version() const { #ifdef USE_SPEAKER - if (this->speaker_ != nullptr) + if (this->speaker_ != nullptr) { + if (this->silence_detection_) { + return SILENCE_DETECTION_SUPPORT; + } return SPEAKER_SUPPORT; + } #endif return INITIAL_VERSION; } - void request_start(); + void request_start(bool continuous = false); void signal_stop(); void on_event(const api::VoiceAssistantEventResponse &msg); + bool is_running() const { return this->running_; } + void set_continuous(bool continuous) { this->continuous_ = continuous; } + bool is_continuous() const { return this->continuous_; } + + void set_silence_detection(bool silence_detection) { this->silence_detection_ = silence_detection; } + + Trigger<> *get_listening_trigger() const { return this->listening_trigger_; } Trigger<> *get_start_trigger() const { return this->start_trigger_; } Trigger *get_stt_end_trigger() const { return this->stt_end_trigger_; } Trigger *get_tts_start_trigger() const { return this->tts_start_trigger_; } @@ -61,6 +80,7 @@ class VoiceAssistant : public Component { std::unique_ptr socket_ = nullptr; struct sockaddr_storage dest_addr_; + Trigger<> *listening_trigger_ = new Trigger<>(); Trigger<> *start_trigger_ = new Trigger<>(); Trigger *stt_end_trigger_ = new Trigger(); Trigger *tts_start_trigger_ = new Trigger(); @@ -72,8 +92,16 @@ class VoiceAssistant : public Component { #ifdef USE_SPEAKER speaker::Speaker *speaker_{nullptr}; #endif +#ifdef USE_MEDIA_PLAYER + media_player::MediaPlayer *media_player_{nullptr}; + bool playing_tts_{false}; +#endif + + std::string conversation_id_{""}; bool running_{false}; + bool continuous_{false}; + bool silence_detection_; }; template class StartAction : public Action, public Parented { @@ -81,9 +109,22 @@ template class StartAction : public Action, public Parent void play(Ts... x) override { this->parent_->request_start(); } }; +template class StartContinuousAction : public Action, public Parented { + public: + void play(Ts... x) override { this->parent_->request_start(true); } +}; + template class StopAction : public Action, public Parented { public: - void play(Ts... x) override { this->parent_->signal_stop(); } + void play(Ts... x) override { + this->parent_->set_continuous(false); + this->parent_->signal_stop(); + } +}; + +template class IsRunningCondition : public Condition, public Parented { + public: + bool check(Ts... x) override { return this->parent_->is_running() || this->parent_->is_continuous(); } }; extern VoiceAssistant *global_voice_assistant; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) diff --git a/esphome/const.py b/esphome/const.py index cbc8f428f5..470f8a46e5 100644 --- a/esphome/const.py +++ b/esphome/const.py @@ -399,6 +399,7 @@ CONF_MAX_VOLTAGE = "max_voltage" CONF_MDNS = "mdns" CONF_MEASUREMENT_DURATION = "measurement_duration" CONF_MEASUREMENT_SEQUENCE_NUMBER = "measurement_sequence_number" +CONF_MEDIA_PLAYER = "media_player" CONF_MEDIUM = "medium" CONF_MEMORY_BLOCKS = "memory_blocks" CONF_METHOD = "method"