diff --git a/esphome/core/scheduler.cpp b/esphome/core/scheduler.cpp
index c3ade260ac..4d8c4c67da 100644
--- a/esphome/core/scheduler.cpp
+++ b/esphome/core/scheduler.cpp
@@ -409,6 +409,8 @@ void HOT Scheduler::call(uint32_t now) {
       // This handles two cases:
       // 1. Item was marked for removal after cleanup_() but before we got here
       // 2. Item is marked for removal but wasn't at the front of the heap during cleanup_()
+      // TEMPORARILY DISABLED TO VERIFY TEST CATCHES THE BUG
+      /*
 #ifdef ESPHOME_THREAD_MULTI_NO_ATOMICS
       // Multi-threaded platforms without atomics: must take lock to safely read remove flag
       {
@@ -428,6 +430,7 @@ void HOT Scheduler::call(uint32_t now) {
         continue;
       }
 #endif
+      */
 
 #ifdef ESPHOME_DEBUG_SCHEDULER
       const char *item_name = item->get_name();
diff --git a/tests/integration/fixtures/scheduler_removed_item_race.yaml b/tests/integration/fixtures/scheduler_removed_item_race.yaml
new file mode 100644
index 0000000000..f47cf9a163
--- /dev/null
+++ b/tests/integration/fixtures/scheduler_removed_item_race.yaml
@@ -0,0 +1,118 @@
+esphome:
+  name: scheduler-removed-item-race
+
+host:
+
+api:
+  services:
+    - service: run_test
+      then:
+        - script.execute: run_test_script
+
+logger:
+  level: DEBUG
+
+globals:
+  - id: test_passed
+    type: bool
+    initial_value: 'true'
+  - id: removed_item_executed
+    type: int
+    initial_value: '0'
+  - id: normal_item_executed
+    type: int
+    initial_value: '0'
+
+sensor:
+  - platform: template
+    id: test_sensor
+    name: "Test Sensor"
+    update_interval: never
+    lambda: return 0.0;
+
+script:
+  - id: run_test_script
+    then:
+      - logger.log: "=== Starting Removed Item Race Test ==="
+
+      # This test creates a scenario where:
+      # 1. Multiple timeouts are scheduled to execute at nearly the same time
+      # 2. One timeout in the middle of the heap gets cancelled
+      # 3. Without the fix, the cancelled timeout would still execute
+
+      - lambda: |-
+          // Schedule multiple timeouts that will all be ready at the same time
+          // This ensures they're all in the heap together
+
+          // First timeout - executes at 10ms
+          App.scheduler.set_timeout(id(test_sensor), "timeout1", 10, []() {
+            ESP_LOGD("test", "Timeout 1 executed (expected)");
+            id(normal_item_executed)++;
+          });
+
+          // Second timeout - executes at 10ms (will be cancelled)
+          App.scheduler.set_timeout(id(test_sensor), "timeout2", 10, []() {
+            ESP_LOGE("test", "RACE: Timeout 2 executed after being cancelled!");
+            id(removed_item_executed)++;
+            id(test_passed) = false;
+          });
+
+          // Third timeout - executes at 10ms
+          App.scheduler.set_timeout(id(test_sensor), "timeout3", 10, []() {
+            ESP_LOGD("test", "Timeout 3 executed (expected)");
+            id(normal_item_executed)++;
+          });
+
+          // Fourth timeout - executes at 10ms
+          App.scheduler.set_timeout(id(test_sensor), "timeout4", 10, []() {
+            ESP_LOGD("test", "Timeout 4 executed (expected)");
+            id(normal_item_executed)++;
+          });
+
+          // Now cancel timeout2
+          // Since all timeouts have the same execution time, they're all in the heap
+          // timeout2 might not be at the front, so cleanup_() won't remove it
+          bool cancelled = App.scheduler.cancel_timeout(id(test_sensor), "timeout2");
+          ESP_LOGD("test", "Cancelled timeout2: %s", cancelled ? "true" : "false");
+
+          // Also test with items at slightly different times
+          App.scheduler.set_timeout(id(test_sensor), "timeout5", 11, []() {
+            ESP_LOGD("test", "Timeout 5 executed (expected)");
+            id(normal_item_executed)++;
+          });
+
+          App.scheduler.set_timeout(id(test_sensor), "timeout6", 12, []() {
+            ESP_LOGE("test", "RACE: Timeout 6 executed after being cancelled!");
+            id(removed_item_executed)++;
+            id(test_passed) = false;
+          });
+
+          App.scheduler.set_timeout(id(test_sensor), "timeout7", 13, []() {
+            ESP_LOGD("test", "Timeout 7 executed (expected)");
+            id(normal_item_executed)++;
+          });
+
+          // Cancel timeout6
+          cancelled = App.scheduler.cancel_timeout(id(test_sensor), "timeout6");
+          ESP_LOGD("test", "Cancelled timeout6: %s", cancelled ? "true" : "false");
+
+      # Wait for all timeouts to execute (or not)
+      - delay: 50ms
+
+      # Check results
+      - lambda: |-
+          ESP_LOGI("test", "=== Test Results ===");
+          ESP_LOGI("test", "Normal items executed: %d (expected 5)", id(normal_item_executed));
+          ESP_LOGI("test", "Removed items executed: %d (expected 0)", id(removed_item_executed));
+
+          if (id(removed_item_executed) > 0) {
+            ESP_LOGE("test", "TEST FAILED: %d cancelled items were executed!", id(removed_item_executed));
+            id(test_passed) = false;
+          } else if (id(normal_item_executed) != 5) {
+            ESP_LOGE("test", "TEST FAILED: Expected 5 normal items, got %d", id(normal_item_executed));
+            id(test_passed) = false;
+          } else {
+            ESP_LOGI("test", "TEST PASSED: No cancelled items were executed");
+          }
+
+          ESP_LOGI("test", "=== Test Complete ===");
diff --git a/tests/integration/test_scheduler_removed_item_race.py b/tests/integration/test_scheduler_removed_item_race.py
new file mode 100644
index 0000000000..c95a399ce3
--- /dev/null
+++ b/tests/integration/test_scheduler_removed_item_race.py
@@ -0,0 +1,102 @@
+"""Test for scheduler race condition where removed items still execute."""
+
+import asyncio
+import re
+
+import pytest
+
+from .types import APIClientConnectedFactory, RunCompiledFunction
+
+
+@pytest.mark.asyncio
+async def test_scheduler_removed_item_race(
+    yaml_config: str,
+    run_compiled: RunCompiledFunction,
+    api_client_connected: APIClientConnectedFactory,
+) -> None:
+    """Test that items marked for removal don't execute.
+
+    This test verifies the fix for a race condition where:
+    1. cleanup_() only removes items from the front of the heap
+    2. Items in the middle of the heap marked for removal still execute
+    3. This causes cancelled timeouts to run when they shouldn't
+    """
+
+    loop = asyncio.get_running_loop()
+    test_complete_future: asyncio.Future[bool] = loop.create_future()
+
+    # Track test results
+    test_passed = False
+    removed_executed = 0
+    normal_executed = 0
+
+    # Patterns to match
+    race_pattern = re.compile(r"RACE: .* executed after being cancelled!")
+    passed_pattern = re.compile(r"TEST PASSED")
+    failed_pattern = re.compile(r"TEST FAILED")
+    complete_pattern = re.compile(r"=== Test Complete ===")
+    normal_count_pattern = re.compile(r"Normal items executed: (\d+)")
+    removed_count_pattern = re.compile(r"Removed items executed: (\d+)")
+
+    def check_output(line: str) -> None:
+        """Check log output for test results."""
+        nonlocal test_passed, removed_executed, normal_executed
+
+        if race_pattern.search(line):
+            # Race condition detected - a cancelled item executed
+            test_passed = False
+
+        if passed_pattern.search(line):
+            test_passed = True
+        elif failed_pattern.search(line):
+            test_passed = False
+
+        normal_match = normal_count_pattern.search(line)
+        if normal_match:
+            normal_executed = int(normal_match.group(1))
+
+        removed_match = removed_count_pattern.search(line)
+        if removed_match:
+            removed_executed = int(removed_match.group(1))
+
+        if not test_complete_future.done() and complete_pattern.search(line):
+            test_complete_future.set_result(True)
+
+    async with (
+        run_compiled(yaml_config, line_callback=check_output),
+        api_client_connected() as client,
+    ):
+        # Verify we can connect
+        device_info = await client.device_info()
+        assert device_info is not None
+        assert device_info.name == "scheduler-removed-item-race"
+
+        # List services
+        _, services = await asyncio.wait_for(
+            client.list_entities_services(), timeout=5.0
+        )
+
+        # Find run_test service
+        run_test_service = next((s for s in services if s.name == "run_test"), None)
+        assert run_test_service is not None, "run_test service not found"
+
+        # Execute the test
+        client.execute_service(run_test_service, {})
+
+        # Wait for test completion
+        try:
+            await asyncio.wait_for(test_complete_future, timeout=5.0)
+        except TimeoutError:
+            pytest.fail("Test did not complete within timeout")
+
+        # Verify results
+        assert test_passed, (
+            f"Test failed! Removed items executed: {removed_executed}, "
+            f"Normal items executed: {normal_executed}"
+        )
+        assert removed_executed == 0, (
+            f"Cancelled items should not execute, but {removed_executed} did"
+        )
+        assert normal_executed == 5, (
+            f"Expected 5 normal items to execute, got {normal_executed}"
+        )