mirror of
				https://github.com/esphome/esphome.git
				synced 2025-10-30 22:53:59 +00:00 
			
		
		
		
	Fix defer() thread safety issues on multi-core platforms (#9317)
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
		
							
								
								
									
										109
									
								
								tests/integration/fixtures/defer_fifo_simple.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										109
									
								
								tests/integration/fixtures/defer_fifo_simple.yaml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,109 @@ | ||||
| esphome: | ||||
|   name: defer-fifo-simple | ||||
|  | ||||
| host: | ||||
|  | ||||
| logger: | ||||
|   level: DEBUG | ||||
|  | ||||
| api: | ||||
|   services: | ||||
|     - service: test_set_timeout | ||||
|       then: | ||||
|         - lambda: |- | ||||
|             // Test set_timeout with 0 delay (direct scheduler call) | ||||
|             static int set_timeout_order = 0; | ||||
|             static bool set_timeout_passed = true; | ||||
|  | ||||
|             // Reset for this test | ||||
|             set_timeout_order = 0; | ||||
|             set_timeout_passed = true; | ||||
|  | ||||
|             ESP_LOGD("defer_test", "Testing set_timeout(0) for FIFO order..."); | ||||
|             for (int i = 0; i < 10; i++) { | ||||
|               int expected = i; | ||||
|               App.scheduler.set_timeout((Component*)nullptr, nullptr, 0, [expected]() { | ||||
|                 ESP_LOGD("defer_test", "set_timeout(0) item %d executed, order %d", expected, set_timeout_order); | ||||
|                 if (set_timeout_order != expected) { | ||||
|                   ESP_LOGE("defer_test", "FIFO violation in set_timeout: expected %d but got execution order %d", expected, set_timeout_order); | ||||
|                   set_timeout_passed = false; | ||||
|                 } | ||||
|                 set_timeout_order++; | ||||
|  | ||||
|                 if (set_timeout_order == 10) { | ||||
|                   if (set_timeout_passed) { | ||||
|                     ESP_LOGI("defer_test", "✓ Test PASSED - set_timeout(0) maintains FIFO order"); | ||||
|                     id(test_result)->trigger("passed"); | ||||
|                   } else { | ||||
|                     ESP_LOGE("defer_test", "✗ Test FAILED - set_timeout(0) executed out of order"); | ||||
|                     id(test_result)->trigger("failed"); | ||||
|                   } | ||||
|                   id(test_complete)->trigger("test_finished"); | ||||
|                 } | ||||
|               }); | ||||
|             } | ||||
|  | ||||
|             ESP_LOGD("defer_test", "Deferred 10 items using set_timeout(0), waiting for execution..."); | ||||
|  | ||||
|     - service: test_defer | ||||
|       then: | ||||
|         - lambda: |- | ||||
|             // Test defer() method (component method) | ||||
|             static int defer_order = 0; | ||||
|             static bool defer_passed = true; | ||||
|  | ||||
|             // Reset for this test | ||||
|             defer_order = 0; | ||||
|             defer_passed = true; | ||||
|  | ||||
|             ESP_LOGD("defer_test", "Testing defer() for FIFO order..."); | ||||
|  | ||||
|             // Create a test component class that exposes defer() | ||||
|             class TestComponent : public Component { | ||||
|             public: | ||||
|               void test_defer() { | ||||
|                 for (int i = 0; i < 10; i++) { | ||||
|                   int expected = i; | ||||
|                   this->defer([expected]() { | ||||
|                     ESP_LOGD("defer_test", "defer() item %d executed, order %d", expected, defer_order); | ||||
|                     if (defer_order != expected) { | ||||
|                       ESP_LOGE("defer_test", "FIFO violation in defer: expected %d but got execution order %d", expected, defer_order); | ||||
|                       defer_passed = false; | ||||
|                     } | ||||
|                     defer_order++; | ||||
|  | ||||
|                     if (defer_order == 10) { | ||||
|                       if (defer_passed) { | ||||
|                         ESP_LOGI("defer_test", "✓ Test PASSED - defer() maintains FIFO order"); | ||||
|                         id(test_result)->trigger("passed"); | ||||
|                       } else { | ||||
|                         ESP_LOGE("defer_test", "✗ Test FAILED - defer() executed out of order"); | ||||
|                         id(test_result)->trigger("failed"); | ||||
|                       } | ||||
|                       id(test_complete)->trigger("test_finished"); | ||||
|                     } | ||||
|                   }); | ||||
|                 } | ||||
|               } | ||||
|             }; | ||||
|  | ||||
|             // Use a static instance so it doesn't go out of scope | ||||
|             static TestComponent test_component; | ||||
|             test_component.test_defer(); | ||||
|  | ||||
|             ESP_LOGD("defer_test", "Deferred 10 items using defer(), waiting for execution..."); | ||||
|  | ||||
| event: | ||||
|   - platform: template | ||||
|     name: "Test Complete" | ||||
|     id: test_complete | ||||
|     device_class: button | ||||
|     event_types: | ||||
|       - "test_finished" | ||||
|   - platform: template | ||||
|     name: "Test Result" | ||||
|     id: test_result | ||||
|     device_class: button | ||||
|     event_types: | ||||
|       - "passed" | ||||
|       - "failed" | ||||
							
								
								
									
										38
									
								
								tests/integration/fixtures/defer_stress.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								tests/integration/fixtures/defer_stress.yaml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| esphome: | ||||
|   name: defer-stress-test | ||||
|  | ||||
| external_components: | ||||
|   - source: | ||||
|       type: local | ||||
|       path: EXTERNAL_COMPONENT_PATH | ||||
|     components: [defer_stress_component] | ||||
|  | ||||
| host: | ||||
|  | ||||
| logger: | ||||
|   level: VERBOSE | ||||
|  | ||||
| defer_stress_component: | ||||
|   id: defer_stress | ||||
|  | ||||
| api: | ||||
|   services: | ||||
|     - service: run_stress_test | ||||
|       then: | ||||
|         - lambda: |- | ||||
|             id(defer_stress)->run_multi_thread_test(); | ||||
|  | ||||
| event: | ||||
|   - platform: template | ||||
|     name: "Test Complete" | ||||
|     id: test_complete | ||||
|     device_class: button | ||||
|     event_types: | ||||
|       - "test_finished" | ||||
|   - platform: template | ||||
|     name: "Test Result" | ||||
|     id: test_result | ||||
|     device_class: button | ||||
|     event_types: | ||||
|       - "passed" | ||||
|       - "failed" | ||||
| @@ -0,0 +1,19 @@ | ||||
| import esphome.codegen as cg | ||||
| import esphome.config_validation as cv | ||||
| from esphome.const import CONF_ID | ||||
|  | ||||
| defer_stress_component_ns = cg.esphome_ns.namespace("defer_stress_component") | ||||
| DeferStressComponent = defer_stress_component_ns.class_( | ||||
|     "DeferStressComponent", cg.Component | ||||
| ) | ||||
|  | ||||
| CONFIG_SCHEMA = cv.Schema( | ||||
|     { | ||||
|         cv.GenerateID(): cv.declare_id(DeferStressComponent), | ||||
|     } | ||||
| ).extend(cv.COMPONENT_SCHEMA) | ||||
|  | ||||
|  | ||||
| async def to_code(config): | ||||
|     var = cg.new_Pvariable(config[CONF_ID]) | ||||
|     await cg.register_component(var, config) | ||||
| @@ -0,0 +1,75 @@ | ||||
| #include "defer_stress_component.h" | ||||
| #include "esphome/core/log.h" | ||||
| #include <thread> | ||||
| #include <atomic> | ||||
| #include <vector> | ||||
| #include <chrono> | ||||
|  | ||||
| namespace esphome { | ||||
| namespace defer_stress_component { | ||||
|  | ||||
| static const char *const TAG = "defer_stress"; | ||||
|  | ||||
| void DeferStressComponent::setup() { ESP_LOGCONFIG(TAG, "DeferStressComponent setup"); } | ||||
|  | ||||
| void DeferStressComponent::run_multi_thread_test() { | ||||
|   // Use member variables instead of static to avoid issues | ||||
|   this->total_defers_ = 0; | ||||
|   this->executed_defers_ = 0; | ||||
|   static constexpr int NUM_THREADS = 10; | ||||
|   static constexpr int DEFERS_PER_THREAD = 100; | ||||
|  | ||||
|   ESP_LOGI(TAG, "Starting defer stress test - multi-threaded concurrent defers"); | ||||
|  | ||||
|   // Ensure we're starting clean | ||||
|   ESP_LOGI(TAG, "Initial counters: total=%d, executed=%d", this->total_defers_.load(), this->executed_defers_.load()); | ||||
|  | ||||
|   // Track start time | ||||
|   auto start_time = std::chrono::steady_clock::now(); | ||||
|  | ||||
|   // Create threads | ||||
|   std::vector<std::thread> threads; | ||||
|  | ||||
|   ESP_LOGI(TAG, "Creating %d threads, each will defer %d callbacks", NUM_THREADS, DEFERS_PER_THREAD); | ||||
|  | ||||
|   threads.reserve(NUM_THREADS); | ||||
|   for (int i = 0; i < NUM_THREADS; i++) { | ||||
|     threads.emplace_back([this, i]() { | ||||
|       ESP_LOGV(TAG, "Thread %d starting", i); | ||||
|       // Each thread directly calls defer() without any locking | ||||
|       for (int j = 0; j < DEFERS_PER_THREAD; j++) { | ||||
|         int defer_id = this->total_defers_.fetch_add(1); | ||||
|         ESP_LOGV(TAG, "Thread %d calling defer for request %d", i, defer_id); | ||||
|  | ||||
|         // Capture this pointer safely for the lambda | ||||
|         auto *component = this; | ||||
|  | ||||
|         // Directly call defer() from this thread - no locking! | ||||
|         this->defer([component, i, j, defer_id]() { | ||||
|           component->executed_defers_.fetch_add(1); | ||||
|           ESP_LOGV(TAG, "Executed defer %d (thread %d, index %d)", defer_id, i, j); | ||||
|         }); | ||||
|  | ||||
|         ESP_LOGV(TAG, "Thread %d called defer for request %d successfully", i, defer_id); | ||||
|  | ||||
|         // Small random delay to increase contention | ||||
|         if (j % 10 == 0) { | ||||
|           std::this_thread::sleep_for(std::chrono::microseconds(100)); | ||||
|         } | ||||
|       } | ||||
|       ESP_LOGV(TAG, "Thread %d finished", i); | ||||
|     }); | ||||
|   } | ||||
|  | ||||
|   // Wait for all threads to complete | ||||
|   for (auto &t : threads) { | ||||
|     t.join(); | ||||
|   } | ||||
|  | ||||
|   auto end_time = std::chrono::steady_clock::now(); | ||||
|   auto thread_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count(); | ||||
|   ESP_LOGI(TAG, "All threads finished in %lldms. Created %d defer requests", thread_time, this->total_defers_.load()); | ||||
| } | ||||
|  | ||||
| }  // namespace defer_stress_component | ||||
| }  // namespace esphome | ||||
| @@ -0,0 +1,20 @@ | ||||
| #pragma once | ||||
|  | ||||
| #include "esphome/core/component.h" | ||||
| #include <atomic> | ||||
|  | ||||
| namespace esphome { | ||||
| namespace defer_stress_component { | ||||
|  | ||||
| class DeferStressComponent : public Component { | ||||
|  public: | ||||
|   void setup() override; | ||||
|   void run_multi_thread_test(); | ||||
|  | ||||
|  private: | ||||
|   std::atomic<int> total_defers_{0}; | ||||
|   std::atomic<int> executed_defers_{0}; | ||||
| }; | ||||
|  | ||||
| }  // namespace defer_stress_component | ||||
| }  // namespace esphome | ||||
							
								
								
									
										117
									
								
								tests/integration/test_defer_fifo_simple.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										117
									
								
								tests/integration/test_defer_fifo_simple.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,117 @@ | ||||
| """Simple test that defer() maintains FIFO order.""" | ||||
|  | ||||
| import asyncio | ||||
|  | ||||
| from aioesphomeapi import EntityState, Event, EventInfo, UserService | ||||
| import pytest | ||||
|  | ||||
| from .types import APIClientConnectedFactory, RunCompiledFunction | ||||
|  | ||||
|  | ||||
| @pytest.mark.asyncio | ||||
| async def test_defer_fifo_simple( | ||||
|     yaml_config: str, | ||||
|     run_compiled: RunCompiledFunction, | ||||
|     api_client_connected: APIClientConnectedFactory, | ||||
| ) -> None: | ||||
|     """Test that defer() maintains FIFO order with a simple test.""" | ||||
|  | ||||
|     async with run_compiled(yaml_config), api_client_connected() as client: | ||||
|         # Verify we can connect | ||||
|         device_info = await client.device_info() | ||||
|         assert device_info is not None | ||||
|         assert device_info.name == "defer-fifo-simple" | ||||
|  | ||||
|         # List entities and services | ||||
|         entity_info, services = await asyncio.wait_for( | ||||
|             client.list_entities_services(), timeout=5.0 | ||||
|         ) | ||||
|  | ||||
|         # Find our test entities | ||||
|         test_complete_entity: EventInfo | None = None | ||||
|         test_result_entity: EventInfo | None = None | ||||
|  | ||||
|         for entity in entity_info: | ||||
|             if isinstance(entity, EventInfo): | ||||
|                 if entity.object_id == "test_complete": | ||||
|                     test_complete_entity = entity | ||||
|                 elif entity.object_id == "test_result": | ||||
|                     test_result_entity = entity | ||||
|  | ||||
|         assert test_complete_entity is not None, "test_complete event not found" | ||||
|         assert test_result_entity is not None, "test_result event not found" | ||||
|  | ||||
|         # Find our test services | ||||
|         test_set_timeout_service: UserService | None = None | ||||
|         test_defer_service: UserService | None = None | ||||
|         for service in services: | ||||
|             if service.name == "test_set_timeout": | ||||
|                 test_set_timeout_service = service | ||||
|             elif service.name == "test_defer": | ||||
|                 test_defer_service = service | ||||
|  | ||||
|         assert test_set_timeout_service is not None, ( | ||||
|             "test_set_timeout service not found" | ||||
|         ) | ||||
|         assert test_defer_service is not None, "test_defer service not found" | ||||
|  | ||||
|         # Get the event loop | ||||
|         loop = asyncio.get_running_loop() | ||||
|  | ||||
|         # Subscribe to states | ||||
|         # (events are delivered as EventStates through subscribe_states) | ||||
|         test_complete_future: asyncio.Future[bool] = loop.create_future() | ||||
|         test_result_future: asyncio.Future[bool] = loop.create_future() | ||||
|  | ||||
|         def on_state(state: EntityState) -> None: | ||||
|             if not isinstance(state, Event): | ||||
|                 return | ||||
|  | ||||
|             if ( | ||||
|                 state.key == test_complete_entity.key | ||||
|                 and state.event_type == "test_finished" | ||||
|                 and not test_complete_future.done() | ||||
|             ): | ||||
|                 test_complete_future.set_result(True) | ||||
|                 return | ||||
|  | ||||
|             if state.key == test_result_entity.key and not test_result_future.done(): | ||||
|                 if state.event_type == "passed": | ||||
|                     test_result_future.set_result(True) | ||||
|                 elif state.event_type == "failed": | ||||
|                     test_result_future.set_result(False) | ||||
|  | ||||
|         client.subscribe_states(on_state) | ||||
|  | ||||
|         # Test 1: Test set_timeout(0) | ||||
|         client.execute_service(test_set_timeout_service, {}) | ||||
|  | ||||
|         # Wait for first test completion | ||||
|         try: | ||||
|             await asyncio.wait_for(test_complete_future, timeout=5.0) | ||||
|             test1_passed = await asyncio.wait_for(test_result_future, timeout=1.0) | ||||
|         except asyncio.TimeoutError: | ||||
|             pytest.fail("Test set_timeout(0) did not complete within 5 seconds") | ||||
|  | ||||
|         assert test1_passed is True, ( | ||||
|             "set_timeout(0) FIFO test failed - items executed out of order" | ||||
|         ) | ||||
|  | ||||
|         # Reset futures for second test | ||||
|         test_complete_future = loop.create_future() | ||||
|         test_result_future = loop.create_future() | ||||
|  | ||||
|         # Test 2: Test defer() | ||||
|         client.execute_service(test_defer_service, {}) | ||||
|  | ||||
|         # Wait for second test completion | ||||
|         try: | ||||
|             await asyncio.wait_for(test_complete_future, timeout=5.0) | ||||
|             test2_passed = await asyncio.wait_for(test_result_future, timeout=1.0) | ||||
|         except asyncio.TimeoutError: | ||||
|             pytest.fail("Test defer() did not complete within 5 seconds") | ||||
|  | ||||
|         # Verify the test passed | ||||
|         assert test2_passed is True, ( | ||||
|             "defer() FIFO test failed - items executed out of order" | ||||
|         ) | ||||
							
								
								
									
										137
									
								
								tests/integration/test_defer_stress.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										137
									
								
								tests/integration/test_defer_stress.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,137 @@ | ||||
| """Stress test for defer() thread safety with multiple threads.""" | ||||
|  | ||||
| import asyncio | ||||
| from pathlib import Path | ||||
| import re | ||||
|  | ||||
| from aioesphomeapi import UserService | ||||
| import pytest | ||||
|  | ||||
| from .types import APIClientConnectedFactory, RunCompiledFunction | ||||
|  | ||||
|  | ||||
| @pytest.mark.asyncio | ||||
| async def test_defer_stress( | ||||
|     yaml_config: str, | ||||
|     run_compiled: RunCompiledFunction, | ||||
|     api_client_connected: APIClientConnectedFactory, | ||||
| ) -> None: | ||||
|     """Test that defer() doesn't crash when called rapidly from multiple threads.""" | ||||
|  | ||||
|     # Get the absolute path to the external components directory | ||||
|     external_components_path = str( | ||||
|         Path(__file__).parent / "fixtures" / "external_components" | ||||
|     ) | ||||
|  | ||||
|     # Replace the placeholder in the YAML config with the actual path | ||||
|     yaml_config = yaml_config.replace( | ||||
|         "EXTERNAL_COMPONENT_PATH", external_components_path | ||||
|     ) | ||||
|  | ||||
|     # Create a future to signal test completion | ||||
|     loop = asyncio.get_event_loop() | ||||
|     test_complete_future: asyncio.Future[None] = loop.create_future() | ||||
|  | ||||
|     # Track executed defers and their order | ||||
|     executed_defers: set[int] = set() | ||||
|     thread_executions: dict[ | ||||
|         int, list[int] | ||||
|     ] = {}  # thread_id -> list of indices in execution order | ||||
|     fifo_violations: list[str] = [] | ||||
|  | ||||
|     def on_log_line(line: str) -> None: | ||||
|         # Track all executed defers with thread and index info | ||||
|         match = re.search(r"Executed defer (\d+) \(thread (\d+), index (\d+)\)", line) | ||||
|         if not match: | ||||
|             return | ||||
|  | ||||
|         defer_id = int(match.group(1)) | ||||
|         thread_id = int(match.group(2)) | ||||
|         index = int(match.group(3)) | ||||
|  | ||||
|         executed_defers.add(defer_id) | ||||
|  | ||||
|         # Track execution order per thread | ||||
|         if thread_id not in thread_executions: | ||||
|             thread_executions[thread_id] = [] | ||||
|  | ||||
|         # Check FIFO ordering within thread | ||||
|         if thread_executions[thread_id] and thread_executions[thread_id][-1] >= index: | ||||
|             fifo_violations.append( | ||||
|                 f"Thread {thread_id}: index {index} executed after " | ||||
|                 f"{thread_executions[thread_id][-1]}" | ||||
|             ) | ||||
|  | ||||
|         thread_executions[thread_id].append(index) | ||||
|  | ||||
|         # Check if we've executed all 1000 defers (0-999) | ||||
|         if len(executed_defers) == 1000 and not test_complete_future.done(): | ||||
|             test_complete_future.set_result(None) | ||||
|  | ||||
|     async with ( | ||||
|         run_compiled(yaml_config, line_callback=on_log_line), | ||||
|         api_client_connected() as client, | ||||
|     ): | ||||
|         # Verify we can connect | ||||
|         device_info = await client.device_info() | ||||
|         assert device_info is not None | ||||
|         assert device_info.name == "defer-stress-test" | ||||
|  | ||||
|         # List entities and services | ||||
|         entity_info, services = await asyncio.wait_for( | ||||
|             client.list_entities_services(), timeout=5.0 | ||||
|         ) | ||||
|  | ||||
|         # Find our test service | ||||
|         run_stress_test_service: UserService | None = None | ||||
|         for service in services: | ||||
|             if service.name == "run_stress_test": | ||||
|                 run_stress_test_service = service | ||||
|                 break | ||||
|  | ||||
|         assert run_stress_test_service is not None, "run_stress_test service not found" | ||||
|  | ||||
|         # Call the run_stress_test service to start the test | ||||
|         client.execute_service(run_stress_test_service, {}) | ||||
|  | ||||
|         # Wait for all defers to execute (should be quick) | ||||
|         try: | ||||
|             await asyncio.wait_for(test_complete_future, timeout=5.0) | ||||
|         except asyncio.TimeoutError: | ||||
|             # Report how many we got | ||||
|             pytest.fail( | ||||
|                 f"Stress test timed out. Only {len(executed_defers)} of " | ||||
|                 f"1000 defers executed. Missing IDs: " | ||||
|                 f"{sorted(set(range(1000)) - executed_defers)[:10]}..." | ||||
|             ) | ||||
|  | ||||
|         # Verify all defers executed | ||||
|         assert len(executed_defers) == 1000, ( | ||||
|             f"Expected 1000 defers, got {len(executed_defers)}" | ||||
|         ) | ||||
|  | ||||
|         # Verify we have all IDs from 0-999 | ||||
|         expected_ids = set(range(1000)) | ||||
|         missing_ids = expected_ids - executed_defers | ||||
|         assert not missing_ids, f"Missing defer IDs: {sorted(missing_ids)}" | ||||
|  | ||||
|         # Verify FIFO ordering was maintained within each thread | ||||
|         assert not fifo_violations, "FIFO ordering violations detected:\n" + "\n".join( | ||||
|             fifo_violations[:10] | ||||
|         ) | ||||
|  | ||||
|         # Verify each thread executed all its defers in order | ||||
|         for thread_id, indices in thread_executions.items(): | ||||
|             assert len(indices) == 100, ( | ||||
|                 f"Thread {thread_id} executed {len(indices)} defers, expected 100" | ||||
|             ) | ||||
|             # Indices should be 0-99 in ascending order | ||||
|             assert indices == list(range(100)), ( | ||||
|                 f"Thread {thread_id} executed indices out of order: {indices[:10]}..." | ||||
|             ) | ||||
|  | ||||
|         # If we got here without crashing and with proper ordering, the test passed | ||||
|         assert True, ( | ||||
|             "Test completed successfully - all 1000 defers executed with " | ||||
|             "FIFO ordering preserved" | ||||
|         ) | ||||
		Reference in New Issue
	
	Block a user