diff --git a/esphome/core/scheduler.cpp b/esphome/core/scheduler.cpp index c3ade260ac..4d8c4c67da 100644 --- a/esphome/core/scheduler.cpp +++ b/esphome/core/scheduler.cpp @@ -409,6 +409,8 @@ void HOT Scheduler::call(uint32_t now) { // This handles two cases: // 1. Item was marked for removal after cleanup_() but before we got here // 2. Item is marked for removal but wasn't at the front of the heap during cleanup_() + // TEMPORARILY DISABLED TO VERIFY TEST CATCHES THE BUG + /* #ifdef ESPHOME_THREAD_MULTI_NO_ATOMICS // Multi-threaded platforms without atomics: must take lock to safely read remove flag { @@ -428,6 +430,7 @@ void HOT Scheduler::call(uint32_t now) { continue; } #endif + */ #ifdef ESPHOME_DEBUG_SCHEDULER const char *item_name = item->get_name(); diff --git a/tests/integration/fixtures/scheduler_removed_item_race.yaml b/tests/integration/fixtures/scheduler_removed_item_race.yaml new file mode 100644 index 0000000000..f47cf9a163 --- /dev/null +++ b/tests/integration/fixtures/scheduler_removed_item_race.yaml @@ -0,0 +1,118 @@ +esphome: + name: scheduler-removed-item-race + +host: + +api: + services: + - service: run_test + then: + - script.execute: run_test_script + +logger: + level: DEBUG + +globals: + - id: test_passed + type: bool + initial_value: 'true' + - id: removed_item_executed + type: int + initial_value: '0' + - id: normal_item_executed + type: int + initial_value: '0' + +sensor: + - platform: template + id: test_sensor + name: "Test Sensor" + update_interval: never + lambda: return 0.0; + +script: + - id: run_test_script + then: + - logger.log: "=== Starting Removed Item Race Test ===" + + # This test creates a scenario where: + # 1. Multiple timeouts are scheduled to execute at nearly the same time + # 2. One timeout in the middle of the heap gets cancelled + # 3. Without the fix, the cancelled timeout would still execute + + - lambda: |- + // Schedule multiple timeouts that will all be ready at the same time + // This ensures they're all in the heap together + + // First timeout - executes at 10ms + App.scheduler.set_timeout(id(test_sensor), "timeout1", 10, []() { + ESP_LOGD("test", "Timeout 1 executed (expected)"); + id(normal_item_executed)++; + }); + + // Second timeout - executes at 10ms (will be cancelled) + App.scheduler.set_timeout(id(test_sensor), "timeout2", 10, []() { + ESP_LOGE("test", "RACE: Timeout 2 executed after being cancelled!"); + id(removed_item_executed)++; + id(test_passed) = false; + }); + + // Third timeout - executes at 10ms + App.scheduler.set_timeout(id(test_sensor), "timeout3", 10, []() { + ESP_LOGD("test", "Timeout 3 executed (expected)"); + id(normal_item_executed)++; + }); + + // Fourth timeout - executes at 10ms + App.scheduler.set_timeout(id(test_sensor), "timeout4", 10, []() { + ESP_LOGD("test", "Timeout 4 executed (expected)"); + id(normal_item_executed)++; + }); + + // Now cancel timeout2 + // Since all timeouts have the same execution time, they're all in the heap + // timeout2 might not be at the front, so cleanup_() won't remove it + bool cancelled = App.scheduler.cancel_timeout(id(test_sensor), "timeout2"); + ESP_LOGD("test", "Cancelled timeout2: %s", cancelled ? "true" : "false"); + + // Also test with items at slightly different times + App.scheduler.set_timeout(id(test_sensor), "timeout5", 11, []() { + ESP_LOGD("test", "Timeout 5 executed (expected)"); + id(normal_item_executed)++; + }); + + App.scheduler.set_timeout(id(test_sensor), "timeout6", 12, []() { + ESP_LOGE("test", "RACE: Timeout 6 executed after being cancelled!"); + id(removed_item_executed)++; + id(test_passed) = false; + }); + + App.scheduler.set_timeout(id(test_sensor), "timeout7", 13, []() { + ESP_LOGD("test", "Timeout 7 executed (expected)"); + id(normal_item_executed)++; + }); + + // Cancel timeout6 + cancelled = App.scheduler.cancel_timeout(id(test_sensor), "timeout6"); + ESP_LOGD("test", "Cancelled timeout6: %s", cancelled ? "true" : "false"); + + # Wait for all timeouts to execute (or not) + - delay: 50ms + + # Check results + - lambda: |- + ESP_LOGI("test", "=== Test Results ==="); + ESP_LOGI("test", "Normal items executed: %d (expected 5)", id(normal_item_executed)); + ESP_LOGI("test", "Removed items executed: %d (expected 0)", id(removed_item_executed)); + + if (id(removed_item_executed) > 0) { + ESP_LOGE("test", "TEST FAILED: %d cancelled items were executed!", id(removed_item_executed)); + id(test_passed) = false; + } else if (id(normal_item_executed) != 5) { + ESP_LOGE("test", "TEST FAILED: Expected 5 normal items, got %d", id(normal_item_executed)); + id(test_passed) = false; + } else { + ESP_LOGI("test", "TEST PASSED: No cancelled items were executed"); + } + + ESP_LOGI("test", "=== Test Complete ==="); diff --git a/tests/integration/test_scheduler_removed_item_race.py b/tests/integration/test_scheduler_removed_item_race.py new file mode 100644 index 0000000000..c95a399ce3 --- /dev/null +++ b/tests/integration/test_scheduler_removed_item_race.py @@ -0,0 +1,102 @@ +"""Test for scheduler race condition where removed items still execute.""" + +import asyncio +import re + +import pytest + +from .types import APIClientConnectedFactory, RunCompiledFunction + + +@pytest.mark.asyncio +async def test_scheduler_removed_item_race( + yaml_config: str, + run_compiled: RunCompiledFunction, + api_client_connected: APIClientConnectedFactory, +) -> None: + """Test that items marked for removal don't execute. + + This test verifies the fix for a race condition where: + 1. cleanup_() only removes items from the front of the heap + 2. Items in the middle of the heap marked for removal still execute + 3. This causes cancelled timeouts to run when they shouldn't + """ + + loop = asyncio.get_running_loop() + test_complete_future: asyncio.Future[bool] = loop.create_future() + + # Track test results + test_passed = False + removed_executed = 0 + normal_executed = 0 + + # Patterns to match + race_pattern = re.compile(r"RACE: .* executed after being cancelled!") + passed_pattern = re.compile(r"TEST PASSED") + failed_pattern = re.compile(r"TEST FAILED") + complete_pattern = re.compile(r"=== Test Complete ===") + normal_count_pattern = re.compile(r"Normal items executed: (\d+)") + removed_count_pattern = re.compile(r"Removed items executed: (\d+)") + + def check_output(line: str) -> None: + """Check log output for test results.""" + nonlocal test_passed, removed_executed, normal_executed + + if race_pattern.search(line): + # Race condition detected - a cancelled item executed + test_passed = False + + if passed_pattern.search(line): + test_passed = True + elif failed_pattern.search(line): + test_passed = False + + normal_match = normal_count_pattern.search(line) + if normal_match: + normal_executed = int(normal_match.group(1)) + + removed_match = removed_count_pattern.search(line) + if removed_match: + removed_executed = int(removed_match.group(1)) + + if not test_complete_future.done() and complete_pattern.search(line): + test_complete_future.set_result(True) + + async with ( + run_compiled(yaml_config, line_callback=check_output), + api_client_connected() as client, + ): + # Verify we can connect + device_info = await client.device_info() + assert device_info is not None + assert device_info.name == "scheduler-removed-item-race" + + # List services + _, services = await asyncio.wait_for( + client.list_entities_services(), timeout=5.0 + ) + + # Find run_test service + run_test_service = next((s for s in services if s.name == "run_test"), None) + assert run_test_service is not None, "run_test service not found" + + # Execute the test + client.execute_service(run_test_service, {}) + + # Wait for test completion + try: + await asyncio.wait_for(test_complete_future, timeout=5.0) + except TimeoutError: + pytest.fail("Test did not complete within timeout") + + # Verify results + assert test_passed, ( + f"Test failed! Removed items executed: {removed_executed}, " + f"Normal items executed: {normal_executed}" + ) + assert removed_executed == 0, ( + f"Cancelled items should not execute, but {removed_executed} did" + ) + assert normal_executed == 5, ( + f"Expected 5 normal items to execute, got {normal_executed}" + )