fix

2025-10-30 06:33:51 +00:00 · 2025-08-17 16:12:53 -04:00
parent 5b674dc28c
commit e06dbffe9f
3 changed files with 223 additions and 0 deletions
--- a/esphome/core/scheduler.cpp
+++ b/esphome/core/scheduler.cpp
@@ -409,6 +409,8 @@ void HOT Scheduler::call(uint32_t now) {
      // This handles two cases:
      // 1. Item was marked for removal after cleanup_() but before we got here
      // 2. Item is marked for removal but wasn't at the front of the heap during cleanup_()
      // TEMPORARILY DISABLED TO VERIFY TEST CATCHES THE BUG
      /*
 #ifdef ESPHOME_THREAD_MULTI_NO_ATOMICS
      // Multi-threaded platforms without atomics: must take lock to safely read remove flag
      {
@@ -428,6 +430,7 @@ void HOT Scheduler::call(uint32_t now) {
        continue;
      }
 #endif
      */
 #ifdef ESPHOME_DEBUG_SCHEDULER
      const char *item_name = item->get_name();
--- a/tests/integration/fixtures/scheduler_removed_item_race.yaml
+++ b/tests/integration/fixtures/scheduler_removed_item_race.yaml
@@ -0,0 +1,118 @@
 esphome:
  name: scheduler-removed-item-race
 host:
 api:
  services:
    - service: run_test
      then:
        - script.execute: run_test_script
 logger:
  level: DEBUG
 globals:
  - id: test_passed
    type: bool
    initial_value: 'true'
  - id: removed_item_executed
    type: int
    initial_value: '0'
  - id: normal_item_executed
    type: int
    initial_value: '0'
 sensor:
  - platform: template
    id: test_sensor
    name: "Test Sensor"
    update_interval: never
    lambda: return 0.0;
 script:
  - id: run_test_script
    then:
      - logger.log: "=== Starting Removed Item Race Test ==="
      # This test creates a scenario where:
      # 1. Multiple timeouts are scheduled to execute at nearly the same time
      # 2. One timeout in the middle of the heap gets cancelled
      # 3. Without the fix, the cancelled timeout would still execute
      - lambda: |-
          // Schedule multiple timeouts that will all be ready at the same time
          // This ensures they're all in the heap together
          // First timeout - executes at 10ms
          App.scheduler.set_timeout(id(test_sensor), "timeout1", 10, []() {
            ESP_LOGD("test", "Timeout 1 executed (expected)");
            id(normal_item_executed)++;
          });
          // Second timeout - executes at 10ms (will be cancelled)
          App.scheduler.set_timeout(id(test_sensor), "timeout2", 10, []() {
            ESP_LOGE("test", "RACE: Timeout 2 executed after being cancelled!");
            id(removed_item_executed)++;
            id(test_passed) = false;
          });
          // Third timeout - executes at 10ms
          App.scheduler.set_timeout(id(test_sensor), "timeout3", 10, []() {
            ESP_LOGD("test", "Timeout 3 executed (expected)");
            id(normal_item_executed)++;
          });
          // Fourth timeout - executes at 10ms
          App.scheduler.set_timeout(id(test_sensor), "timeout4", 10, []() {
            ESP_LOGD("test", "Timeout 4 executed (expected)");
            id(normal_item_executed)++;
          });
          // Now cancel timeout2
          // Since all timeouts have the same execution time, they're all in the heap
          // timeout2 might not be at the front, so cleanup_() won't remove it
          bool cancelled = App.scheduler.cancel_timeout(id(test_sensor), "timeout2");
          ESP_LOGD("test", "Cancelled timeout2: %s", cancelled ? "true" : "false");
          // Also test with items at slightly different times
          App.scheduler.set_timeout(id(test_sensor), "timeout5", 11, []() {
            ESP_LOGD("test", "Timeout 5 executed (expected)");
            id(normal_item_executed)++;
          });
          App.scheduler.set_timeout(id(test_sensor), "timeout6", 12, []() {
            ESP_LOGE("test", "RACE: Timeout 6 executed after being cancelled!");
            id(removed_item_executed)++;
            id(test_passed) = false;
          });
          App.scheduler.set_timeout(id(test_sensor), "timeout7", 13, []() {
            ESP_LOGD("test", "Timeout 7 executed (expected)");
            id(normal_item_executed)++;
          });
          // Cancel timeout6
          cancelled = App.scheduler.cancel_timeout(id(test_sensor), "timeout6");
          ESP_LOGD("test", "Cancelled timeout6: %s", cancelled ? "true" : "false");
      # Wait for all timeouts to execute (or not)
      - delay: 50ms
      # Check results
      - lambda: |-
          ESP_LOGI("test", "=== Test Results ===");
          ESP_LOGI("test", "Normal items executed: %d (expected 5)", id(normal_item_executed));
          ESP_LOGI("test", "Removed items executed: %d (expected 0)", id(removed_item_executed));
          if (id(removed_item_executed) > 0) {
            ESP_LOGE("test", "TEST FAILED: %d cancelled items were executed!", id(removed_item_executed));
            id(test_passed) = false;
          } else if (id(normal_item_executed) != 5) {
            ESP_LOGE("test", "TEST FAILED: Expected 5 normal items, got %d", id(normal_item_executed));
            id(test_passed) = false;
          } else {
            ESP_LOGI("test", "TEST PASSED: No cancelled items were executed");
          }
          ESP_LOGI("test", "=== Test Complete ===");
--- a/tests/integration/test_scheduler_removed_item_race.py
+++ b/tests/integration/test_scheduler_removed_item_race.py
@@ -0,0 +1,102 @@
 """Test for scheduler race condition where removed items still execute."""
 import asyncio
 import re
 import pytest
 from .types import APIClientConnectedFactory, RunCompiledFunction
@pytest.mark.asyncio
 async def test_scheduler_removed_item_race(
    yaml_config: str,
    run_compiled: RunCompiledFunction,
    api_client_connected: APIClientConnectedFactory,
 ) -> None:
    """Test that items marked for removal don't execute.
    This test verifies the fix for a race condition where:
    1. cleanup_() only removes items from the front of the heap
    2. Items in the middle of the heap marked for removal still execute
    3. This causes cancelled timeouts to run when they shouldn't
    """
    loop = asyncio.get_running_loop()
    test_complete_future: asyncio.Future[bool] = loop.create_future()
    # Track test results
    test_passed = False
    removed_executed = 0
    normal_executed = 0
    # Patterns to match
    race_pattern = re.compile(r"RACE: .* executed after being cancelled!")
    passed_pattern = re.compile(r"TEST PASSED")
    failed_pattern = re.compile(r"TEST FAILED")
    complete_pattern = re.compile(r"=== Test Complete ===")
    normal_count_pattern = re.compile(r"Normal items executed: (\d+)")
    removed_count_pattern = re.compile(r"Removed items executed: (\d+)")
    def check_output(line: str) -> None:
        """Check log output for test results."""
        nonlocal test_passed, removed_executed, normal_executed
        if race_pattern.search(line):
            # Race condition detected - a cancelled item executed
            test_passed = False
        if passed_pattern.search(line):
            test_passed = True
        elif failed_pattern.search(line):
            test_passed = False
        normal_match = normal_count_pattern.search(line)
        if normal_match:
            normal_executed = int(normal_match.group(1))
        removed_match = removed_count_pattern.search(line)
        if removed_match:
            removed_executed = int(removed_match.group(1))
        if not test_complete_future.done() and complete_pattern.search(line):
            test_complete_future.set_result(True)
    async with (
        run_compiled(yaml_config, line_callback=check_output),
        api_client_connected() as client,
    ):
        # Verify we can connect
        device_info = await client.device_info()
        assert device_info is not None
        assert device_info.name == "scheduler-removed-item-race"
        # List services
        _, services = await asyncio.wait_for(
            client.list_entities_services(), timeout=5.0
        )
        # Find run_test service
        run_test_service = next((s for s in services if s.name == "run_test"), None)
        assert run_test_service is not None, "run_test service not found"
        # Execute the test
        client.execute_service(run_test_service, {})
        # Wait for test completion
        try:
            await asyncio.wait_for(test_complete_future, timeout=5.0)
        except TimeoutError:
            pytest.fail("Test did not complete within timeout")
        # Verify results
        assert test_passed, (
            f"Test failed! Removed items executed: {removed_executed}, "
            f"Normal items executed: {normal_executed}"
        )
        assert removed_executed == 0, (
            f"Cancelled items should not execute, but {removed_executed} did"
        )
        assert normal_executed == 5, (
            f"Expected 5 normal items to execute, got {normal_executed}"
        )