From c56fd00a7c75096c411524577a395a178a93341a Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 17 Aug 2025 16:28:32 -0400 Subject: [PATCH] cleanup --- esphome/core/scheduler.cpp | 3 - .../fixtures/scheduler_removed_item_race.yaml | 113 +++++++++++------- .../test_scheduler_removed_item_race.py | 4 +- 3 files changed, 69 insertions(+), 51 deletions(-) diff --git a/esphome/core/scheduler.cpp b/esphome/core/scheduler.cpp index 4d8c4c67da..c3ade260ac 100644 --- a/esphome/core/scheduler.cpp +++ b/esphome/core/scheduler.cpp @@ -409,8 +409,6 @@ void HOT Scheduler::call(uint32_t now) { // This handles two cases: // 1. Item was marked for removal after cleanup_() but before we got here // 2. Item is marked for removal but wasn't at the front of the heap during cleanup_() - // TEMPORARILY DISABLED TO VERIFY TEST CATCHES THE BUG - /* #ifdef ESPHOME_THREAD_MULTI_NO_ATOMICS // Multi-threaded platforms without atomics: must take lock to safely read remove flag { @@ -430,7 +428,6 @@ void HOT Scheduler::call(uint32_t now) { continue; } #endif - */ #ifdef ESPHOME_DEBUG_SCHEDULER const char *item_name = item->get_name(); diff --git a/tests/integration/fixtures/scheduler_removed_item_race.yaml b/tests/integration/fixtures/scheduler_removed_item_race.yaml index f47cf9a163..2f8a7fb987 100644 --- a/tests/integration/fixtures/scheduler_removed_item_race.yaml +++ b/tests/integration/fixtures/scheduler_removed_item_race.yaml @@ -36,80 +36,101 @@ script: - logger.log: "=== Starting Removed Item Race Test ===" # This test creates a scenario where: - # 1. Multiple timeouts are scheduled to execute at nearly the same time - # 2. One timeout in the middle of the heap gets cancelled - # 3. Without the fix, the cancelled timeout would still execute + # 1. First item in heap is NOT cancelled (cleanup stops immediately) + # 2. Items behind it ARE cancelled (remain in heap after cleanup) + # 3. All items execute at the same time, including cancelled ones - lambda: |- - // Schedule multiple timeouts that will all be ready at the same time - // This ensures they're all in the heap together + // The key to hitting the race: + // 1. Add items in a specific order to control heap structure + // 2. Cancel ONLY items that won't be at the front + // 3. Ensure the first item stays non-cancelled so cleanup_() stops immediately - // First timeout - executes at 10ms - App.scheduler.set_timeout(id(test_sensor), "timeout1", 10, []() { - ESP_LOGD("test", "Timeout 1 executed (expected)"); + // Schedule all items to execute at the SAME time (1ms from now) + // Using 1ms instead of 0 to avoid defer queue on multi-core platforms + // This ensures they'll all be ready together and go through the heap + const uint32_t exec_time = 1; + + // CRITICAL: Add a non-cancellable item FIRST + // This will be at the front of the heap and block cleanup_() + App.scheduler.set_timeout(id(test_sensor), "blocker", exec_time, []() { + ESP_LOGD("test", "Blocker timeout executed (expected) - was at front of heap"); id(normal_item_executed)++; }); - // Second timeout - executes at 10ms (will be cancelled) - App.scheduler.set_timeout(id(test_sensor), "timeout2", 10, []() { - ESP_LOGE("test", "RACE: Timeout 2 executed after being cancelled!"); + // Now add items that we WILL cancel + // These will be behind the blocker in the heap + App.scheduler.set_timeout(id(test_sensor), "cancel_1", exec_time, []() { + ESP_LOGE("test", "RACE: Cancelled timeout 1 executed after being cancelled!"); id(removed_item_executed)++; id(test_passed) = false; }); - // Third timeout - executes at 10ms - App.scheduler.set_timeout(id(test_sensor), "timeout3", 10, []() { - ESP_LOGD("test", "Timeout 3 executed (expected)"); - id(normal_item_executed)++; - }); - - // Fourth timeout - executes at 10ms - App.scheduler.set_timeout(id(test_sensor), "timeout4", 10, []() { - ESP_LOGD("test", "Timeout 4 executed (expected)"); - id(normal_item_executed)++; - }); - - // Now cancel timeout2 - // Since all timeouts have the same execution time, they're all in the heap - // timeout2 might not be at the front, so cleanup_() won't remove it - bool cancelled = App.scheduler.cancel_timeout(id(test_sensor), "timeout2"); - ESP_LOGD("test", "Cancelled timeout2: %s", cancelled ? "true" : "false"); - - // Also test with items at slightly different times - App.scheduler.set_timeout(id(test_sensor), "timeout5", 11, []() { - ESP_LOGD("test", "Timeout 5 executed (expected)"); - id(normal_item_executed)++; - }); - - App.scheduler.set_timeout(id(test_sensor), "timeout6", 12, []() { - ESP_LOGE("test", "RACE: Timeout 6 executed after being cancelled!"); + App.scheduler.set_timeout(id(test_sensor), "cancel_2", exec_time, []() { + ESP_LOGE("test", "RACE: Cancelled timeout 2 executed after being cancelled!"); id(removed_item_executed)++; id(test_passed) = false; }); - App.scheduler.set_timeout(id(test_sensor), "timeout7", 13, []() { - ESP_LOGD("test", "Timeout 7 executed (expected)"); + App.scheduler.set_timeout(id(test_sensor), "cancel_3", exec_time, []() { + ESP_LOGE("test", "RACE: Cancelled timeout 3 executed after being cancelled!"); + id(removed_item_executed)++; + id(test_passed) = false; + }); + + // Add some more normal items + App.scheduler.set_timeout(id(test_sensor), "normal_1", exec_time, []() { + ESP_LOGD("test", "Normal timeout 1 executed (expected)"); id(normal_item_executed)++; }); - // Cancel timeout6 - cancelled = App.scheduler.cancel_timeout(id(test_sensor), "timeout6"); - ESP_LOGD("test", "Cancelled timeout6: %s", cancelled ? "true" : "false"); + App.scheduler.set_timeout(id(test_sensor), "normal_2", exec_time, []() { + ESP_LOGD("test", "Normal timeout 2 executed (expected)"); + id(normal_item_executed)++; + }); + + App.scheduler.set_timeout(id(test_sensor), "normal_3", exec_time, []() { + ESP_LOGD("test", "Normal timeout 3 executed (expected)"); + id(normal_item_executed)++; + }); + + // Force items into the heap before cancelling + App.scheduler.process_to_add(); + + // NOW cancel the items - they're behind "blocker" in the heap + // When cleanup_() runs, it will see "blocker" (not removed) at the front + // and stop immediately, leaving cancel_1, cancel_2, cancel_3 in the heap + bool c1 = App.scheduler.cancel_timeout(id(test_sensor), "cancel_1"); + bool c2 = App.scheduler.cancel_timeout(id(test_sensor), "cancel_2"); + bool c3 = App.scheduler.cancel_timeout(id(test_sensor), "cancel_3"); + + ESP_LOGD("test", "Cancelled items (behind blocker): %s, %s, %s", + c1 ? "true" : "false", + c2 ? "true" : "false", + c3 ? "true" : "false"); + + // The heap now has: + // - "blocker" at front (not cancelled) + // - cancelled items behind it (marked remove=true but still in heap) + // - When all execute at once, cleanup_() stops at "blocker" + // - The loop then executes ALL ready items including cancelled ones + + ESP_LOGD("test", "Setup complete. Blocker at front prevents cleanup of cancelled items behind it"); # Wait for all timeouts to execute (or not) - - delay: 50ms + - delay: 20ms # Check results - lambda: |- ESP_LOGI("test", "=== Test Results ==="); - ESP_LOGI("test", "Normal items executed: %d (expected 5)", id(normal_item_executed)); + ESP_LOGI("test", "Normal items executed: %d (expected 4)", id(normal_item_executed)); ESP_LOGI("test", "Removed items executed: %d (expected 0)", id(removed_item_executed)); if (id(removed_item_executed) > 0) { ESP_LOGE("test", "TEST FAILED: %d cancelled items were executed!", id(removed_item_executed)); id(test_passed) = false; - } else if (id(normal_item_executed) != 5) { - ESP_LOGE("test", "TEST FAILED: Expected 5 normal items, got %d", id(normal_item_executed)); + } else if (id(normal_item_executed) != 4) { + ESP_LOGE("test", "TEST FAILED: Expected 4 normal items, got %d", id(normal_item_executed)); id(test_passed) = false; } else { ESP_LOGI("test", "TEST PASSED: No cancelled items were executed"); diff --git a/tests/integration/test_scheduler_removed_item_race.py b/tests/integration/test_scheduler_removed_item_race.py index c95a399ce3..3e72bacc0d 100644 --- a/tests/integration/test_scheduler_removed_item_race.py +++ b/tests/integration/test_scheduler_removed_item_race.py @@ -97,6 +97,6 @@ async def test_scheduler_removed_item_race( assert removed_executed == 0, ( f"Cancelled items should not execute, but {removed_executed} did" ) - assert normal_executed == 5, ( - f"Expected 5 normal items to execute, got {normal_executed}" + assert normal_executed == 4, ( + f"Expected 4 normal items to execute, got {normal_executed}" )