From ddb762f8f53006a6ac53fe70ca066704de556830 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 22 Jan 2026 08:09:14 -1000 Subject: [PATCH] [api] Limit Nagle batching for log messages to reduce LWIP buffer pressure (#13439) --- esphome/components/api/api_connection.cpp | 19 +------ esphome/components/api/api_frame_helper.h | 67 +++++++++++++++-------- 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/esphome/components/api/api_connection.cpp b/esphome/components/api/api_connection.cpp index 0364879ccd..1626f395e6 100644 --- a/esphome/components/api/api_connection.cpp +++ b/esphome/components/api/api_connection.cpp @@ -1844,23 +1844,8 @@ bool APIConnection::send_buffer(ProtoWriteBuffer buffer, uint8_t message_type) { return false; } - // Toggle Nagle's algorithm based on message type to prevent log messages from - // filling the TCP send buffer and crowding out important state updates. - // - // This honors the `no_delay` proto option - SubscribeLogsResponse is the only - // message with `option (no_delay) = false;` in api.proto, indicating it should - // allow Nagle coalescing. This option existed since 2019 but was never implemented. - // - // - Log messages: Enable Nagle (NODELAY=false) so small log packets coalesce - // into fewer, larger packets. They flush naturally via TCP delayed ACK timer - // (~200ms), buffer filling, or when a state update triggers a flush. - // - // - All other messages (state updates, responses): Disable Nagle (NODELAY=true) - // for immediate delivery. These are time-sensitive and should not be delayed. - // - // This must be done proactively BEFORE the buffer fills up - checking buffer - // state here would be too late since we'd already be in a degraded state. - this->helper_->set_nodelay(!is_log_message); + // Set TCP_NODELAY based on message type - see set_nodelay_for_message() for details + this->helper_->set_nodelay_for_message(is_log_message); APIError err = this->helper_->write_protobuf_packet(message_type, buffer); if (err == APIError::WOULD_BLOCK) diff --git a/esphome/components/api/api_frame_helper.h b/esphome/components/api/api_frame_helper.h index 27ec1ff915..f311e34fd7 100644 --- a/esphome/components/api/api_frame_helper.h +++ b/esphome/components/api/api_frame_helper.h @@ -120,26 +120,39 @@ class APIFrameHelper { } return APIError::OK; } - /// Toggle TCP_NODELAY socket option to control Nagle's algorithm. - /// - /// This is used to allow log messages to coalesce (Nagle enabled) while keeping - /// state updates low-latency (NODELAY enabled). Without this, many small log - /// packets fill the TCP send buffer, crowding out important state updates. - /// - /// State is tracked to minimize setsockopt() overhead - on lwip_raw (ESP8266/RP2040) - /// this is just a boolean assignment; on other platforms it's a lightweight syscall. - /// - /// @param enable true to enable NODELAY (disable Nagle), false to enable Nagle - /// @return true if successful or already in desired state - bool set_nodelay(bool enable) { - if (this->nodelay_enabled_ == enable) - return true; - int val = enable ? 1 : 0; - int err = this->socket_->setsockopt(IPPROTO_TCP, TCP_NODELAY, &val, sizeof(int)); - if (err == 0) { - this->nodelay_enabled_ = enable; + // Manage TCP_NODELAY (Nagle's algorithm) based on message type. + // + // For non-log messages (sensor data, state updates): Always disable Nagle + // (NODELAY on) for immediate delivery - these are time-sensitive. + // + // For log messages: Use Nagle to coalesce multiple small log packets into + // fewer larger packets, reducing WiFi overhead. However, we limit batching + // to 3 messages to avoid excessive LWIP buffer pressure on memory-constrained + // devices like ESP8266. LWIP's TCP_OVERSIZE option coalesces the data into + // shared pbufs, but holding data too long waiting for Nagle's timer causes + // buffer exhaustion and dropped messages. + // + // Flow: Log 1 (Nagle on) -> Log 2 (Nagle on) -> Log 3 (NODELAY, flush all) + // + void set_nodelay_for_message(bool is_log_message) { + if (!is_log_message) { + if (this->nodelay_state_ != NODELAY_ON) { + this->set_nodelay_raw_(true); + this->nodelay_state_ = NODELAY_ON; + } + return; + } + + // Log messages 1-3: state transitions -1 -> 1 -> 2 -> -1 (flush on 3rd) + if (this->nodelay_state_ == NODELAY_ON) { + this->set_nodelay_raw_(false); + this->nodelay_state_ = 1; + } else if (this->nodelay_state_ >= LOG_NAGLE_COUNT) { + this->set_nodelay_raw_(true); + this->nodelay_state_ = NODELAY_ON; + } else { + this->nodelay_state_++; } - return err == 0; } virtual APIError write_protobuf_packet(uint8_t type, ProtoWriteBuffer buffer) = 0; // Write multiple protobuf messages in a single operation @@ -229,10 +242,18 @@ class APIFrameHelper { uint8_t tx_buf_head_{0}; uint8_t tx_buf_tail_{0}; uint8_t tx_buf_count_{0}; - // Tracks TCP_NODELAY state to minimize setsockopt() calls. Initialized to true - // since init_common_() enables NODELAY. Used by set_nodelay() to allow log - // messages to coalesce while keeping state updates low-latency. - bool nodelay_enabled_{true}; + // Nagle batching state for log messages. NODELAY_ON (-1) means NODELAY is enabled + // (immediate send). Values 1-2 count log messages in the current Nagle batch. + // After LOG_NAGLE_COUNT logs, we switch to NODELAY to flush and reset. + static constexpr int8_t NODELAY_ON = -1; + static constexpr int8_t LOG_NAGLE_COUNT = 2; + int8_t nodelay_state_{NODELAY_ON}; + + // Internal helper to set TCP_NODELAY socket option + void set_nodelay_raw_(bool enable) { + int val = enable ? 1 : 0; + this->socket_->setsockopt(IPPROTO_TCP, TCP_NODELAY, &val, sizeof(int)); + } // Common initialization for both plaintext and noise protocols APIError init_common_();