From 4c121502003dd1c8505eab3d443908acd3738c0f Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Tue, 2 Sep 2025 10:31:27 -0500 Subject: [PATCH 01/11] debug logging --- esphome/core/scheduler.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/esphome/core/scheduler.cpp b/esphome/core/scheduler.cpp index 7e2a805793..a8cebf3e9e 100644 --- a/esphome/core/scheduler.cpp +++ b/esphome/core/scheduler.cpp @@ -91,9 +91,15 @@ void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type // Reuse from pool item = std::move(this->scheduler_item_pool_.back()); this->scheduler_item_pool_.pop_back(); +#ifdef ESPHOME_DEBUG_SCHEDULER + ESP_LOGVV(TAG, "Reused item from pool (pool size now: %zu)", this->scheduler_item_pool_.size()); +#endif } else { // Allocate new if pool is empty item = make_unique(); +#ifdef ESPHOME_DEBUG_SCHEDULER + ESP_LOGVV(TAG, "Allocated new item (pool empty)"); +#endif } item->component = component; item->set_name(name_cstr, !is_static_string); @@ -771,6 +777,13 @@ void Scheduler::recycle_item_(std::unique_ptr item) { // Clear dynamic name if any item->clear_dynamic_name(); this->scheduler_item_pool_.push_back(std::move(item)); +#ifdef ESPHOME_DEBUG_SCHEDULER + ESP_LOGVV(TAG, "Recycled item to pool (pool size now: %zu)", this->scheduler_item_pool_.size()); +#endif + } else { +#ifdef ESPHOME_DEBUG_SCHEDULER + ESP_LOGVV(TAG, "Pool full (size: %zu), deleting item", this->scheduler_item_pool_.size()); +#endif } // else: unique_ptr will delete the item when it goes out of scope } From 440053577589a023a9114479e2efe672a3a13e0b Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Tue, 2 Sep 2025 10:45:59 -0500 Subject: [PATCH 02/11] some tests --- esphome/components/api/api_pb2_dump.cpp | 2 +- .../integration/fixtures/scheduler_pool.yaml | 215 ++++++++++++++++++ tests/integration/test_scheduler_pool.py | 194 ++++++++++++++++ 3 files changed, 410 insertions(+), 1 deletion(-) create mode 100644 tests/integration/fixtures/scheduler_pool.yaml create mode 100644 tests/integration/test_scheduler_pool.py diff --git a/esphome/components/api/api_pb2_dump.cpp b/esphome/components/api/api_pb2_dump.cpp index 3e7df9195b..1d7d315419 100644 --- a/esphome/components/api/api_pb2_dump.cpp +++ b/esphome/components/api/api_pb2_dump.cpp @@ -1135,7 +1135,7 @@ void ExecuteServiceArgument::dump_to(std::string &out) const { dump_field(out, "string_", this->string_); dump_field(out, "int_", this->int_); for (const auto it : this->bool_array) { - dump_field(out, "bool_array", it, 4); + dump_field(out, "bool_array", static_cast(it), 4); } for (const auto &it : this->int_array) { dump_field(out, "int_array", it, 4); diff --git a/tests/integration/fixtures/scheduler_pool.yaml b/tests/integration/fixtures/scheduler_pool.yaml new file mode 100644 index 0000000000..196724c021 --- /dev/null +++ b/tests/integration/fixtures/scheduler_pool.yaml @@ -0,0 +1,215 @@ +esphome: + name: scheduler-pool-test + on_boot: + priority: -100 + then: + - logger.log: "Starting scheduler pool tests" + debug_scheduler: true # Enable scheduler debug logging + +host: +api: + services: + - service: run_phase_1 + then: + - script.execute: test_pool_recycling + - service: run_phase_2 + then: + - script.execute: test_sensor_polling + - service: run_phase_3 + then: + - script.execute: test_communication_patterns + - service: run_phase_4 + then: + - script.execute: test_defer_patterns + - service: run_phase_5 + then: + - script.execute: test_pool_reuse_verification + - service: run_complete + then: + - 
script.execute: complete_test +logger: + level: VERY_VERBOSE # Need VERY_VERBOSE to see pool debug messages + +globals: + - id: create_count + type: int + initial_value: '0' + - id: cancel_count + type: int + initial_value: '0' + - id: interval_counter + type: int + initial_value: '0' + - id: pool_test_done + type: bool + initial_value: 'false' + +script: + - id: test_pool_recycling + then: + - logger.log: "Testing scheduler pool recycling with realistic usage patterns" + - lambda: |- + auto *component = id(test_sensor); + + // Simulate realistic component behavior with timeouts that complete naturally + ESP_LOGI("test", "Phase 1: Simulating normal component lifecycle"); + + // Sensor update timeouts (common pattern) + App.scheduler.set_timeout(component, "sensor_init", 100, []() { + ESP_LOGD("test", "Sensor initialized"); + id(create_count)++; + }); + + // Retry timeout (gets cancelled if successful) + App.scheduler.set_timeout(component, "retry_timeout", 500, []() { + ESP_LOGD("test", "Retry timeout executed"); + id(create_count)++; + }); + + // Simulate successful operation - cancel retry + App.scheduler.set_timeout(component, "success_sim", 200, []() { + ESP_LOGD("test", "Operation succeeded, cancelling retry"); + App.scheduler.cancel_timeout(id(test_sensor), "retry_timeout"); + id(cancel_count)++; + }); + + id(create_count) += 3; + ESP_LOGI("test", "Phase 1 complete"); + + - id: test_sensor_polling + then: + - lambda: |- + // Simulate sensor polling pattern + ESP_LOGI("test", "Phase 2: Simulating sensor polling patterns"); + auto *component = id(test_sensor); + + // Multiple sensors with different update intervals + App.scheduler.set_interval(component, "temp_sensor", 1000, []() { + ESP_LOGD("test", "Temperature sensor update"); + id(interval_counter)++; + if (id(interval_counter) >= 3) { + App.scheduler.cancel_interval(id(test_sensor), "temp_sensor"); + ESP_LOGD("test", "Temperature sensor stopped"); + } + }); + + App.scheduler.set_interval(component, "humidity_sensor", 1500, []() { + ESP_LOGD("test", "Humidity sensor update"); + id(interval_counter)++; + if (id(interval_counter) >= 5) { + App.scheduler.cancel_interval(id(test_sensor), "humidity_sensor"); + ESP_LOGD("test", "Humidity sensor stopped"); + } + }); + + id(create_count) += 2; + ESP_LOGI("test", "Phase 2 complete"); + + - id: test_communication_patterns + then: + - lambda: |- + // Simulate communication patterns (WiFi/API reconnects, etc) + ESP_LOGI("test", "Phase 3: Simulating communication patterns"); + auto *component = id(test_sensor); + + // Connection timeout pattern + App.scheduler.set_timeout(component, "connect_timeout", 2000, []() { + ESP_LOGD("test", "Connection timeout - would retry"); + id(create_count)++; + + // Schedule retry + App.scheduler.set_timeout(id(test_sensor), "connect_retry", 1000, []() { + ESP_LOGD("test", "Retrying connection"); + id(create_count)++; + }); + }); + + // Heartbeat pattern + App.scheduler.set_interval(component, "heartbeat", 500, []() { + ESP_LOGD("test", "Heartbeat"); + id(interval_counter)++; + if (id(interval_counter) >= 10) { + App.scheduler.cancel_interval(id(test_sensor), "heartbeat"); + ESP_LOGD("test", "Heartbeat stopped"); + } + }); + + id(create_count) += 2; + ESP_LOGI("test", "Phase 3 complete"); + + - id: test_defer_patterns + then: + - lambda: |- + // Simulate defer patterns (state changes, async operations) + ESP_LOGI("test", "Phase 4: Simulating defer patterns"); + + class TestComponent : public Component { + public: + void simulate_state_changes() { + // Defer state 
changes (common in switches, lights, etc) + this->defer("state_change_1", []() { + ESP_LOGD("test", "State change 1 applied"); + id(create_count)++; + }); + + // Another state change + this->defer("state_change_2", []() { + ESP_LOGD("test", "State change 2 applied"); + id(create_count)++; + }); + + // Cleanup operation + this->defer("cleanup", []() { + ESP_LOGD("test", "Cleanup executed"); + id(create_count)++; + }); + } + }; + + static TestComponent test_comp; + test_comp.simulate_state_changes(); + ESP_LOGI("test", "Phase 4 complete"); + + - id: test_pool_reuse_verification + then: + - lambda: |- + ESP_LOGI("test", "Phase 5: Verifying pool reuse after everything settles"); + + // First, ensure any remaining intervals are cancelled to recycle to pool + auto *component = id(test_sensor); + App.scheduler.cancel_interval(component, "temp_sensor"); + App.scheduler.cancel_interval(component, "humidity_sensor"); + App.scheduler.cancel_interval(component, "heartbeat"); + + // Give a moment for items to be recycled + ESP_LOGD("test", "Cancelled any remaining intervals to build up pool"); + + // Now create 6 new timeouts - they should all reuse from pool + int reuse_test_count = 6; + int initial_pool_reused = 0; + + for (int i = 0; i < reuse_test_count; i++) { + std::string name = "reuse_test_" + std::to_string(i); + App.scheduler.set_timeout(component, name, 100 + i * 50, [i]() { + ESP_LOGD("test", "Reuse test %d completed", i); + }); + } + + ESP_LOGI("test", "Created %d items for reuse verification", reuse_test_count); + id(create_count) += reuse_test_count; + ESP_LOGI("test", "Phase 5 complete"); + + - id: complete_test + then: + - lambda: |- + ESP_LOGI("test", "Pool recycling test complete - created %d items, cancelled %d, intervals %d", + id(create_count), id(cancel_count), id(interval_counter)); + +sensor: + - platform: template + name: Test Sensor + id: test_sensor + lambda: return 1.0; + update_interval: never + +# No interval - tests will be triggered from Python via API services diff --git a/tests/integration/test_scheduler_pool.py b/tests/integration/test_scheduler_pool.py new file mode 100644 index 0000000000..bd604e053f --- /dev/null +++ b/tests/integration/test_scheduler_pool.py @@ -0,0 +1,194 @@ +"""Integration test for scheduler memory pool functionality.""" + +from __future__ import annotations + +import asyncio +import re + +import pytest + +from .types import APIClientConnectedFactory, RunCompiledFunction + + +@pytest.mark.asyncio +async def test_scheduler_pool( + yaml_config: str, + run_compiled: RunCompiledFunction, + api_client_connected: APIClientConnectedFactory, +) -> None: + """Test that the scheduler memory pool is working correctly with realistic usage. + + This test simulates real-world scheduler usage patterns and verifies that: + 1. Items are recycled to the pool when timeouts complete naturally + 2. Items are recycled when intervals/timeouts are cancelled + 3. Items are reused from the pool for new scheduler operations + 4. The pool grows gradually based on actual usage patterns + 5. 
Pool operations are logged correctly with debug scheduler enabled + """ + # Track log messages to verify pool behavior + log_lines: list[str] = [] + pool_reuse_count = 0 + pool_recycle_count = 0 + pool_full_count = 0 + new_alloc_count = 0 + + # Patterns to match pool operations + reuse_pattern = re.compile(r"Reused item from pool \(pool size now: (\d+)\)") + recycle_pattern = re.compile(r"Recycled item to pool \(pool size now: (\d+)\)") + pool_full_pattern = re.compile(r"Pool full \(size: (\d+)\), deleting item") + new_alloc_pattern = re.compile(r"Allocated new item \(pool empty\)") + + # Futures to track when test phases complete + loop = asyncio.get_running_loop() + test_complete_future: asyncio.Future[bool] = loop.create_future() + phase_futures = { + 1: loop.create_future(), + 2: loop.create_future(), + 3: loop.create_future(), + 4: loop.create_future(), + 5: loop.create_future(), + } + + def check_output(line: str) -> None: + """Check log output for pool operations and phase completion.""" + nonlocal pool_reuse_count, pool_recycle_count, pool_full_count, new_alloc_count + log_lines.append(line) + + # Track pool operations + if reuse_pattern.search(line): + pool_reuse_count += 1 + + elif recycle_pattern.search(line): + pool_recycle_count += 1 + + elif pool_full_pattern.search(line): + pool_full_count += 1 + + elif new_alloc_pattern.search(line): + new_alloc_count += 1 + + # Track phase completion + for phase_num in range(1, 6): + if ( + f"Phase {phase_num} complete" in line + and not phase_futures[phase_num].done() + ): + phase_futures[phase_num].set_result(True) + + # Check for test completion + if "Pool recycling test complete" in line and not test_complete_future.done(): + test_complete_future.set_result(True) + + # Run the test with log monitoring + async with ( + run_compiled(yaml_config, line_callback=check_output), + api_client_connected() as client, + ): + # Verify device is running + device_info = await client.device_info() + assert device_info is not None + assert device_info.name == "scheduler-pool-test" + + # Get list of services + entities, services = await client.list_entities_services() + service_names = {s.name for s in services} + + # Verify all test services are available + expected_services = { + "run_phase_1", + "run_phase_2", + "run_phase_3", + "run_phase_4", + "run_phase_5", + "run_complete", + } + assert expected_services.issubset(service_names), ( + f"Missing services: {expected_services - service_names}" + ) + + # Get service objects + phase_services = { + num: next(s for s in services if s.name == f"run_phase_{num}") + for num in range(1, 6) + } + complete_service = next(s for s in services if s.name == "run_complete") + + try: + # Phase 1: Component lifecycle + client.execute_service(phase_services[1], {}) + await asyncio.wait_for(phase_futures[1], timeout=3.0) + await asyncio.sleep(0.5) # Let timeouts complete + + # Phase 2: Sensor polling + client.execute_service(phase_services[2], {}) + await asyncio.wait_for(phase_futures[2], timeout=3.0) + await asyncio.sleep(1.0) # Let intervals run a bit + + # Phase 3: Communication patterns + client.execute_service(phase_services[3], {}) + await asyncio.wait_for(phase_futures[3], timeout=3.0) + await asyncio.sleep(1.0) # Let heartbeat run + + # Phase 4: Defer patterns + client.execute_service(phase_services[4], {}) + await asyncio.wait_for(phase_futures[4], timeout=3.0) + await asyncio.sleep(2.0) # Let everything settle and recycle + + # Phase 5: Pool reuse verification + client.execute_service(phase_services[5], {}) 
+ await asyncio.wait_for(phase_futures[5], timeout=3.0) + await asyncio.sleep(0.5) # Let reuse tests complete + + # Complete test + client.execute_service(complete_service, {}) + await asyncio.wait_for(test_complete_future, timeout=2.0) + + except TimeoutError as e: + # Print debug info if test times out + recent_logs = "\n".join(log_lines[-30:]) + phases_completed = [num for num, fut in phase_futures.items() if fut.done()] + pytest.fail( + f"Test timed out waiting for phase/completion. Error: {e}\n" + f" Phases completed: {phases_completed}\n" + f" Pool stats:\n" + f" Reuse count: {pool_reuse_count}\n" + f" Recycle count: {pool_recycle_count}\n" + f" Pool full count: {pool_full_count}\n" + f" New alloc count: {new_alloc_count}\n" + f"Recent logs:\n{recent_logs}" + ) + + # Verify all test phases ran + for phase_num in range(1, 6): + assert phase_futures[phase_num].done(), f"Phase {phase_num} did not complete" + + # Verify pool behavior + assert pool_recycle_count > 0, "Should have recycled items to pool" + + # Check pool metrics + if pool_recycle_count > 0: + max_pool_size = 0 + for line in log_lines: + if match := recycle_pattern.search(line): + size = int(match.group(1)) + max_pool_size = max(max_pool_size, size) + + # Pool can grow up to its maximum of 8 + assert max_pool_size <= 8, f"Pool grew beyond maximum ({max_pool_size})" + + # Log summary for debugging + print("\nScheduler Pool Test Summary (Python Orchestrated):") + print(f" Items recycled to pool: {pool_recycle_count}") + print(f" Items reused from pool: {pool_reuse_count}") + print(f" Pool full events: {pool_full_count}") + print(f" New allocations: {new_alloc_count}") + print(" All phases completed successfully") + + # Verify reuse happened + if pool_reuse_count == 0 and pool_recycle_count > 3: + pytest.fail("Pool had items recycled but none were reused") + + # Success - pool is working + assert pool_recycle_count > 0 or new_alloc_count < 15, ( + "Pool should either recycle items or limit new allocations" + ) From 50f5728c765decaa40d4d6fa61d49eb9ef99caf2 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Tue, 2 Sep 2025 11:00:18 -0500 Subject: [PATCH 03/11] preen --- tests/integration/fixtures/scheduler_pool.yaml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/integration/fixtures/scheduler_pool.yaml b/tests/integration/fixtures/scheduler_pool.yaml index 196724c021..0541421c26 100644 --- a/tests/integration/fixtures/scheduler_pool.yaml +++ b/tests/integration/fixtures/scheduler_pool.yaml @@ -175,22 +175,26 @@ script: - lambda: |- ESP_LOGI("test", "Phase 5: Verifying pool reuse after everything settles"); - // First, ensure any remaining intervals are cancelled to recycle to pool + // Cancel any remaining intervals auto *component = id(test_sensor); App.scheduler.cancel_interval(component, "temp_sensor"); App.scheduler.cancel_interval(component, "humidity_sensor"); App.scheduler.cancel_interval(component, "heartbeat"); - // Give a moment for items to be recycled - ESP_LOGD("test", "Cancelled any remaining intervals to build up pool"); + ESP_LOGD("test", "Cancelled any remaining intervals"); - // Now create 6 new timeouts - they should all reuse from pool - int reuse_test_count = 6; - int initial_pool_reused = 0; + // The pool should have items from completed timeouts in earlier phases. + // Phase 1 had 3 timeouts that completed and were recycled. + // Phase 3 had 1 timeout that completed and was recycled. + // Phase 4 had 3 defers that completed and were recycled. 
+ // So we should have a decent pool size already from naturally completed items. + + // Now create 8 new timeouts - they should reuse from pool when available + int reuse_test_count = 8; for (int i = 0; i < reuse_test_count; i++) { std::string name = "reuse_test_" + std::to_string(i); - App.scheduler.set_timeout(component, name, 100 + i * 50, [i]() { + App.scheduler.set_timeout(component, name, 50 + i * 10, [i]() { ESP_LOGD("test", "Reuse test %d completed", i); }); } From 154023f0177c956b896afb0152cc18137b64c9f8 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Tue, 2 Sep 2025 11:04:42 -0500 Subject: [PATCH 04/11] preen --- esphome/core/scheduler.cpp | 6 +++--- esphome/core/scheduler.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/esphome/core/scheduler.cpp b/esphome/core/scheduler.cpp index a8cebf3e9e..78024a206f 100644 --- a/esphome/core/scheduler.cpp +++ b/esphome/core/scheduler.cpp @@ -14,7 +14,7 @@ namespace esphome { static const char *const TAG = "scheduler"; -static const uint32_t MAX_LOGICALLY_DELETED_ITEMS = 10; +static const uint32_t MAX_LOGICALLY_DELETED_ITEMS = 6; // Half the 32-bit range - used to detect rollovers vs normal time progression static constexpr uint32_t HALF_MAX_UINT32 = std::numeric_limits::max() / 2; // max delay to start an interval sequence @@ -765,12 +765,12 @@ void Scheduler::recycle_item_(std::unique_ptr item) { if (!item) return; - // Pool size of 8 is a balance between memory usage and performance: + // Pool size of 10 is a balance between memory usage and performance: // - Small enough to not waste memory on simple configs (1-2 timers) // - Large enough to handle complex setups with multiple sensors/components // - Prevents system-wide stalls from heap allocation/deallocation that can // disrupt task synchronization and cause dropped events - static constexpr size_t MAX_POOL_SIZE = 8; + static constexpr size_t MAX_POOL_SIZE = 10; if (this->scheduler_item_pool_.size() < MAX_POOL_SIZE) { // Clear callback to release captured resources item->callback = nullptr; diff --git a/esphome/core/scheduler.h b/esphome/core/scheduler.h index 50cc8da366..300e12117d 100644 --- a/esphome/core/scheduler.h +++ b/esphome/core/scheduler.h @@ -325,7 +325,7 @@ class Scheduler { // Memory pool for recycling SchedulerItem objects to reduce heap churn. // Design decisions: // - std::vector is used instead of a fixed array because many systems only need 1-2 scheduler items - // - The vector grows dynamically up to MAX_POOL_SIZE (8) only when needed, saving memory on simple setups + // - The vector grows dynamically up to MAX_POOL_SIZE (10) only when needed, saving memory on simple setups // - This approach balances memory efficiency for simple configs with performance for complex ones // - The pool significantly reduces heap fragmentation which is critical because heap allocation/deallocation // can stall the entire system, causing timing issues and dropped events for any components that need From 3115c6fdbfaf067149efb4101ea2de1ad89abfe6 Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Tue, 2 Sep 2025 11:05:42 -0500 Subject: [PATCH 05/11] preen --- tests/integration/test_scheduler_pool.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_scheduler_pool.py b/tests/integration/test_scheduler_pool.py index bd604e053f..eee9ab5e65 100644 --- a/tests/integration/test_scheduler_pool.py +++ b/tests/integration/test_scheduler_pool.py @@ -173,8 +173,8 @@ async def test_scheduler_pool( size = int(match.group(1)) max_pool_size = max(max_pool_size, size) - # Pool can grow up to its maximum of 8 - assert max_pool_size <= 8, f"Pool grew beyond maximum ({max_pool_size})" + # Pool can grow up to its maximum of 10 + assert max_pool_size <= 10, f"Pool grew beyond maximum ({max_pool_size})" # Log summary for debugging print("\nScheduler Pool Test Summary (Python Orchestrated):") From ef33f630c2ccdb7678ecb152decd73375efa92a5 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Tue, 2 Sep 2025 11:07:13 -0500 Subject: [PATCH 06/11] preen --- .../integration/fixtures/scheduler_pool.yaml | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/integration/fixtures/scheduler_pool.yaml b/tests/integration/fixtures/scheduler_pool.yaml index 0541421c26..eb1446a710 100644 --- a/tests/integration/fixtures/scheduler_pool.yaml +++ b/tests/integration/fixtures/scheduler_pool.yaml @@ -24,6 +24,9 @@ api: - service: run_phase_5 then: - script.execute: test_pool_reuse_verification + - service: run_phase_6 + then: + - script.execute: test_full_pool_reuse - service: run_complete then: - script.execute: complete_test @@ -203,6 +206,29 @@ script: id(create_count) += reuse_test_count; ESP_LOGI("test", "Phase 5 complete"); + - id: test_full_pool_reuse + then: + - lambda: |- + ESP_LOGI("test", "Phase 6: Testing full pool reuse after Phase 5 items complete"); + + // At this point, all Phase 5 timeouts should have completed and been recycled. + // The pool should be at or near its maximum size (10). + // Creating 10 new items should reuse all from the pool. + + auto *component = id(test_sensor); + int full_reuse_count = 10; + + for (int i = 0; i < full_reuse_count; i++) { + std::string name = "full_reuse_" + std::to_string(i); + App.scheduler.set_timeout(component, name, 50 + i * 10, [i]() { + ESP_LOGD("test", "Full reuse test %d completed", i); + }); + } + + ESP_LOGI("test", "Created %d items for full pool reuse verification", full_reuse_count); + id(create_count) += full_reuse_count; + ESP_LOGI("test", "Phase 6 complete"); + - id: complete_test then: - lambda: |- From 05c71bda91586d70a0a43e747f9754701bc7f929 Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Tue, 2 Sep 2025 11:10:46 -0500 Subject: [PATCH 07/11] preen --- esphome/core/scheduler.cpp | 17 +++++++++++------ tests/integration/test_scheduler_pool.py | 15 +++++++++++---- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/esphome/core/scheduler.cpp b/esphome/core/scheduler.cpp index 78024a206f..33efa52eb3 100644 --- a/esphome/core/scheduler.cpp +++ b/esphome/core/scheduler.cpp @@ -14,7 +14,18 @@ namespace esphome { static const char *const TAG = "scheduler"; +// Memory pool configuration constants +// Pool size of 10 is a balance between memory usage and performance: +// - Small enough to not waste memory on simple configs (1-2 timers) +// - Large enough to handle complex setups with multiple sensors/components +// - Prevents system-wide stalls from heap allocation/deallocation that can +// disrupt task synchronization and cause dropped events +static constexpr size_t MAX_POOL_SIZE = 10; +// Maximum number of cancelled items to keep in the heap before forcing a cleanup. +// Set to 6 to trigger cleanup relatively frequently, ensuring cancelled items are +// recycled to the pool in a timely manner to maintain pool efficiency. static const uint32_t MAX_LOGICALLY_DELETED_ITEMS = 6; + // Half the 32-bit range - used to detect rollovers vs normal time progression static constexpr uint32_t HALF_MAX_UINT32 = std::numeric_limits::max() / 2; // max delay to start an interval sequence @@ -765,12 +776,6 @@ void Scheduler::recycle_item_(std::unique_ptr item) { if (!item) return; - // Pool size of 10 is a balance between memory usage and performance: - // - Small enough to not waste memory on simple configs (1-2 timers) - // - Large enough to handle complex setups with multiple sensors/components - // - Prevents system-wide stalls from heap allocation/deallocation that can - // disrupt task synchronization and cause dropped events - static constexpr size_t MAX_POOL_SIZE = 10; if (this->scheduler_item_pool_.size() < MAX_POOL_SIZE) { // Clear callback to release captured resources item->callback = nullptr; diff --git a/tests/integration/test_scheduler_pool.py b/tests/integration/test_scheduler_pool.py index eee9ab5e65..9da6ba7f10 100644 --- a/tests/integration/test_scheduler_pool.py +++ b/tests/integration/test_scheduler_pool.py @@ -47,6 +47,7 @@ async def test_scheduler_pool( 3: loop.create_future(), 4: loop.create_future(), 5: loop.create_future(), + 6: loop.create_future(), } def check_output(line: str) -> None: @@ -68,7 +69,7 @@ async def test_scheduler_pool( new_alloc_count += 1 # Track phase completion - for phase_num in range(1, 6): + for phase_num in range(1, 7): if ( f"Phase {phase_num} complete" in line and not phase_futures[phase_num].done() @@ -100,6 +101,7 @@ async def test_scheduler_pool( "run_phase_3", "run_phase_4", "run_phase_5", + "run_phase_6", "run_complete", } assert expected_services.issubset(service_names), ( @@ -109,7 +111,7 @@ async def test_scheduler_pool( # Get service objects phase_services = { num: next(s for s in services if s.name == f"run_phase_{num}") - for num in range(1, 6) + for num in range(1, 7) } complete_service = next(s for s in services if s.name == "run_complete") @@ -137,7 +139,12 @@ async def test_scheduler_pool( # Phase 5: Pool reuse verification client.execute_service(phase_services[5], {}) await asyncio.wait_for(phase_futures[5], timeout=3.0) - await asyncio.sleep(0.5) # Let reuse tests complete + await asyncio.sleep(1.0) # Let Phase 5 timeouts complete and recycle + + # Phase 6: Full pool reuse verification + 
client.execute_service(phase_services[6], {}) + await asyncio.wait_for(phase_futures[6], timeout=3.0) + await asyncio.sleep(1.0) # Let Phase 6 timeouts complete # Complete test client.execute_service(complete_service, {}) @@ -159,7 +166,7 @@ async def test_scheduler_pool( ) # Verify all test phases ran - for phase_num in range(1, 6): + for phase_num in range(1, 7): assert phase_futures[phase_num].done(), f"Phase {phase_num} did not complete" # Verify pool behavior From c4efdf57667a60abb18c233ce96ac57c6caba49f Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Tue, 2 Sep 2025 11:14:43 -0500 Subject: [PATCH 08/11] preen --- esphome/core/scheduler.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/esphome/core/scheduler.cpp b/esphome/core/scheduler.cpp index 33efa52eb3..162f40e8e6 100644 --- a/esphome/core/scheduler.cpp +++ b/esphome/core/scheduler.cpp @@ -26,6 +26,11 @@ static constexpr size_t MAX_POOL_SIZE = 10; // recycled to the pool in a timely manner to maintain pool efficiency. static const uint32_t MAX_LOGICALLY_DELETED_ITEMS = 6; +// Ensure MAX_LOGICALLY_DELETED_ITEMS is at least 4 smaller than MAX_POOL_SIZE +// This guarantees we have room in the pool for recycled items when cleanup occurs +static_assert(MAX_LOGICALLY_DELETED_ITEMS + 4 <= MAX_POOL_SIZE, + "MAX_LOGICALLY_DELETED_ITEMS must be at least 4 smaller than MAX_POOL_SIZE"); + // Half the 32-bit range - used to detect rollovers vs normal time progression static constexpr uint32_t HALF_MAX_UINT32 = std::numeric_limits::max() / 2; // max delay to start an interval sequence From b009a0f967d76a687fd06eca31e0548a00815ab8 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Tue, 2 Sep 2025 16:10:28 -0500 Subject: [PATCH 09/11] improve pool hit rate --- esphome/core/scheduler.cpp | 2 + .../integration/fixtures/scheduler_pool.yaml | 52 +++++++++++-------- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/esphome/core/scheduler.cpp b/esphome/core/scheduler.cpp index 162f40e8e6..7df1334aec 100644 --- a/esphome/core/scheduler.cpp +++ b/esphome/core/scheduler.cpp @@ -349,6 +349,8 @@ void HOT Scheduler::call(uint32_t now) { if (!this->should_skip_item_(item.get())) { this->execute_item_(item.get(), now); } + // Recycle the defer item after execution + this->recycle_item_(std::move(item)); } #endif /* not ESPHOME_THREAD_SINGLE */ diff --git a/tests/integration/fixtures/scheduler_pool.yaml b/tests/integration/fixtures/scheduler_pool.yaml index eb1446a710..e3b5c0f42f 100644 --- a/tests/integration/fixtures/scheduler_pool.yaml +++ b/tests/integration/fixtures/scheduler_pool.yaml @@ -144,33 +144,39 @@ script: then: - lambda: |- // Simulate defer patterns (state changes, async operations) - ESP_LOGI("test", "Phase 4: Simulating defer patterns"); + ESP_LOGI("test", "Phase 4: Simulating heavy defer patterns like ratgdo"); - class TestComponent : public Component { - public: - void simulate_state_changes() { - // Defer state changes (common in switches, lights, etc) - this->defer("state_change_1", []() { - ESP_LOGD("test", "State change 1 applied"); - id(create_count)++; - }); + auto *component = id(test_sensor); - // Another state change - this->defer("state_change_2", []() { - ESP_LOGD("test", "State change 2 applied"); - id(create_count)++; - }); + // Simulate a burst of defer operations like ratgdo does with state updates + // These should execute immediately and recycle quickly to the pool + for (int i = 0; i < 10; i++) { + std::string defer_name = "defer_" + std::to_string(i); + App.scheduler.set_timeout(component, 
defer_name, 0, [i]() { + ESP_LOGD("test", "Defer %d executed", i); + // Force a small delay between defer executions to see recycling + if (i == 5) { + ESP_LOGI("test", "Half of defers executed, checking pool status"); + } + }); + } - // Cleanup operation - this->defer("cleanup", []() { - ESP_LOGD("test", "Cleanup executed"); - id(create_count)++; - }); - } - }; + id(create_count) += 10; + ESP_LOGD("test", "Created 10 defer operations (0ms timeouts)"); + + // Also create some named defers that might get replaced + App.scheduler.set_timeout(component, "state_update", 0, []() { + ESP_LOGD("test", "State update 1"); + }); + + // Replace the same named defer (should cancel previous) + App.scheduler.set_timeout(component, "state_update", 0, []() { + ESP_LOGD("test", "State update 2 (replaced)"); + }); + + id(create_count) += 2; + id(cancel_count) += 1; // One cancelled due to replacement - static TestComponent test_comp; - test_comp.simulate_state_changes(); ESP_LOGI("test", "Phase 4 complete"); - id: test_pool_reuse_verification From e0e8a982d57f86975c2d2856c7ba8218f3e8c78c Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Tue, 2 Sep 2025 16:12:59 -0500 Subject: [PATCH 10/11] improve pool hit rate --- .../integration/fixtures/scheduler_pool.yaml | 20 ++++++++-------- tests/integration/test_scheduler_pool.py | 24 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/integration/fixtures/scheduler_pool.yaml b/tests/integration/fixtures/scheduler_pool.yaml index e3b5c0f42f..e488f38e2e 100644 --- a/tests/integration/fixtures/scheduler_pool.yaml +++ b/tests/integration/fixtures/scheduler_pool.yaml @@ -58,19 +58,19 @@ script: ESP_LOGI("test", "Phase 1: Simulating normal component lifecycle"); // Sensor update timeouts (common pattern) - App.scheduler.set_timeout(component, "sensor_init", 100, []() { + App.scheduler.set_timeout(component, "sensor_init", 10, []() { ESP_LOGD("test", "Sensor initialized"); id(create_count)++; }); // Retry timeout (gets cancelled if successful) - App.scheduler.set_timeout(component, "retry_timeout", 500, []() { + App.scheduler.set_timeout(component, "retry_timeout", 50, []() { ESP_LOGD("test", "Retry timeout executed"); id(create_count)++; }); // Simulate successful operation - cancel retry - App.scheduler.set_timeout(component, "success_sim", 200, []() { + App.scheduler.set_timeout(component, "success_sim", 20, []() { ESP_LOGD("test", "Operation succeeded, cancelling retry"); App.scheduler.cancel_timeout(id(test_sensor), "retry_timeout"); id(cancel_count)++; @@ -87,7 +87,7 @@ script: auto *component = id(test_sensor); // Multiple sensors with different update intervals - App.scheduler.set_interval(component, "temp_sensor", 1000, []() { + App.scheduler.set_interval(component, "temp_sensor", 100, []() { ESP_LOGD("test", "Temperature sensor update"); id(interval_counter)++; if (id(interval_counter) >= 3) { @@ -96,7 +96,7 @@ script: } }); - App.scheduler.set_interval(component, "humidity_sensor", 1500, []() { + App.scheduler.set_interval(component, "humidity_sensor", 150, []() { ESP_LOGD("test", "Humidity sensor update"); id(interval_counter)++; if (id(interval_counter) >= 5) { @@ -116,19 +116,19 @@ script: auto *component = id(test_sensor); // Connection timeout pattern - App.scheduler.set_timeout(component, "connect_timeout", 2000, []() { + App.scheduler.set_timeout(component, "connect_timeout", 200, []() { ESP_LOGD("test", "Connection timeout - would retry"); id(create_count)++; // Schedule retry - 
App.scheduler.set_timeout(id(test_sensor), "connect_retry", 1000, []() { + App.scheduler.set_timeout(id(test_sensor), "connect_retry", 100, []() { ESP_LOGD("test", "Retrying connection"); id(create_count)++; }); }); // Heartbeat pattern - App.scheduler.set_interval(component, "heartbeat", 500, []() { + App.scheduler.set_interval(component, "heartbeat", 50, []() { ESP_LOGD("test", "Heartbeat"); id(interval_counter)++; if (id(interval_counter) >= 10) { @@ -203,7 +203,7 @@ script: for (int i = 0; i < reuse_test_count; i++) { std::string name = "reuse_test_" + std::to_string(i); - App.scheduler.set_timeout(component, name, 50 + i * 10, [i]() { + App.scheduler.set_timeout(component, name, 10 + i * 5, [i]() { ESP_LOGD("test", "Reuse test %d completed", i); }); } @@ -226,7 +226,7 @@ script: for (int i = 0; i < full_reuse_count; i++) { std::string name = "full_reuse_" + std::to_string(i); - App.scheduler.set_timeout(component, name, 50 + i * 10, [i]() { + App.scheduler.set_timeout(component, name, 10 + i * 5, [i]() { ESP_LOGD("test", "Full reuse test %d completed", i); }); } diff --git a/tests/integration/test_scheduler_pool.py b/tests/integration/test_scheduler_pool.py index 9da6ba7f10..98e5904ce6 100644 --- a/tests/integration/test_scheduler_pool.py +++ b/tests/integration/test_scheduler_pool.py @@ -118,33 +118,33 @@ async def test_scheduler_pool( try: # Phase 1: Component lifecycle client.execute_service(phase_services[1], {}) - await asyncio.wait_for(phase_futures[1], timeout=3.0) - await asyncio.sleep(0.5) # Let timeouts complete + await asyncio.wait_for(phase_futures[1], timeout=1.0) + await asyncio.sleep(0.05) # Let timeouts complete # Phase 2: Sensor polling client.execute_service(phase_services[2], {}) - await asyncio.wait_for(phase_futures[2], timeout=3.0) - await asyncio.sleep(1.0) # Let intervals run a bit + await asyncio.wait_for(phase_futures[2], timeout=1.0) + await asyncio.sleep(0.1) # Let intervals run a bit # Phase 3: Communication patterns client.execute_service(phase_services[3], {}) - await asyncio.wait_for(phase_futures[3], timeout=3.0) - await asyncio.sleep(1.0) # Let heartbeat run + await asyncio.wait_for(phase_futures[3], timeout=1.0) + await asyncio.sleep(0.1) # Let heartbeat run # Phase 4: Defer patterns client.execute_service(phase_services[4], {}) - await asyncio.wait_for(phase_futures[4], timeout=3.0) - await asyncio.sleep(2.0) # Let everything settle and recycle + await asyncio.wait_for(phase_futures[4], timeout=1.0) + await asyncio.sleep(0.2) # Let everything settle and recycle # Phase 5: Pool reuse verification client.execute_service(phase_services[5], {}) - await asyncio.wait_for(phase_futures[5], timeout=3.0) - await asyncio.sleep(1.0) # Let Phase 5 timeouts complete and recycle + await asyncio.wait_for(phase_futures[5], timeout=1.0) + await asyncio.sleep(0.1) # Let Phase 5 timeouts complete and recycle # Phase 6: Full pool reuse verification client.execute_service(phase_services[6], {}) - await asyncio.wait_for(phase_futures[6], timeout=3.0) - await asyncio.sleep(1.0) # Let Phase 6 timeouts complete + await asyncio.wait_for(phase_futures[6], timeout=1.0) + await asyncio.sleep(0.1) # Let Phase 6 timeouts complete # Complete test client.execute_service(complete_service, {}) From f72f80ed7bbd30770ab48a67a5570b080186ff1c Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Tue, 2 Sep 2025 16:13:51 -0500 Subject: [PATCH 11/11] cleanup --- tests/integration/test_scheduler_pool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_scheduler_pool.py b/tests/integration/test_scheduler_pool.py index 98e5904ce6..bd878be180 100644 --- a/tests/integration/test_scheduler_pool.py +++ b/tests/integration/test_scheduler_pool.py @@ -148,7 +148,7 @@ async def test_scheduler_pool( # Complete test client.execute_service(complete_service, {}) - await asyncio.wait_for(test_complete_future, timeout=2.0) + await asyncio.wait_for(test_complete_future, timeout=0.5) except TimeoutError as e: # Print debug info if test times out