1
0
mirror of https://github.com/esphome/esphome.git synced 2025-10-30 14:43:51 +00:00

[core] Add memory pool to scheduler to reduce heap fragmentation (#10536)

This commit is contained in:
J. Nick Koston
2025-09-07 17:27:58 -05:00
committed by GitHub
parent f24a182ba2
commit 28d16728d3
4 changed files with 633 additions and 53 deletions

View File

@@ -0,0 +1,282 @@
esphome:
name: scheduler-pool-test
on_boot:
priority: -100
then:
- logger.log: "Starting scheduler pool tests"
debug_scheduler: true # Enable scheduler debug logging
host:
api:
services:
- service: run_phase_1
then:
- script.execute: test_pool_recycling
- service: run_phase_2
then:
- script.execute: test_sensor_polling
- service: run_phase_3
then:
- script.execute: test_communication_patterns
- service: run_phase_4
then:
- script.execute: test_defer_patterns
- service: run_phase_5
then:
- script.execute: test_pool_reuse_verification
- service: run_phase_6
then:
- script.execute: test_full_pool_reuse
- service: run_phase_7
then:
- script.execute: test_same_defer_optimization
- service: run_complete
then:
- script.execute: complete_test
logger:
level: VERY_VERBOSE # Need VERY_VERBOSE to see pool debug messages
globals:
- id: create_count
type: int
initial_value: '0'
- id: cancel_count
type: int
initial_value: '0'
- id: interval_counter
type: int
initial_value: '0'
- id: pool_test_done
type: bool
initial_value: 'false'
script:
- id: test_pool_recycling
then:
- logger.log: "Testing scheduler pool recycling with realistic usage patterns"
- lambda: |-
auto *component = id(test_sensor);
// Simulate realistic component behavior with timeouts that complete naturally
ESP_LOGI("test", "Phase 1: Simulating normal component lifecycle");
// Sensor update timeouts (common pattern)
App.scheduler.set_timeout(component, "sensor_init", 10, []() {
ESP_LOGD("test", "Sensor initialized");
id(create_count)++;
});
// Retry timeout (gets cancelled if successful)
App.scheduler.set_timeout(component, "retry_timeout", 50, []() {
ESP_LOGD("test", "Retry timeout executed");
id(create_count)++;
});
// Simulate successful operation - cancel retry
App.scheduler.set_timeout(component, "success_sim", 20, []() {
ESP_LOGD("test", "Operation succeeded, cancelling retry");
App.scheduler.cancel_timeout(id(test_sensor), "retry_timeout");
id(cancel_count)++;
});
id(create_count) += 3;
ESP_LOGI("test", "Phase 1 complete");
- id: test_sensor_polling
then:
- lambda: |-
// Simulate sensor polling pattern
ESP_LOGI("test", "Phase 2: Simulating sensor polling patterns");
auto *component = id(test_sensor);
// Multiple sensors with different update intervals
// These should only allocate once and reuse the same item for each interval execution
App.scheduler.set_interval(component, "temp_sensor", 10, []() {
ESP_LOGD("test", "Temperature sensor update");
id(interval_counter)++;
if (id(interval_counter) >= 3) {
App.scheduler.cancel_interval(id(test_sensor), "temp_sensor");
ESP_LOGD("test", "Temperature sensor stopped");
}
});
App.scheduler.set_interval(component, "humidity_sensor", 15, []() {
ESP_LOGD("test", "Humidity sensor update");
id(interval_counter)++;
if (id(interval_counter) >= 5) {
App.scheduler.cancel_interval(id(test_sensor), "humidity_sensor");
ESP_LOGD("test", "Humidity sensor stopped");
}
});
// Only 2 allocations for the intervals, no matter how many times they execute
id(create_count) += 2;
ESP_LOGD("test", "Created 2 intervals - they will reuse same items for each execution");
ESP_LOGI("test", "Phase 2 complete");
- id: test_communication_patterns
then:
- lambda: |-
// Simulate communication patterns (WiFi/API reconnects, etc)
ESP_LOGI("test", "Phase 3: Simulating communication patterns");
auto *component = id(test_sensor);
// Connection timeout pattern
App.scheduler.set_timeout(component, "connect_timeout", 200, []() {
ESP_LOGD("test", "Connection timeout - would retry");
id(create_count)++;
// Schedule retry
App.scheduler.set_timeout(id(test_sensor), "connect_retry", 100, []() {
ESP_LOGD("test", "Retrying connection");
id(create_count)++;
});
});
// Heartbeat pattern
App.scheduler.set_interval(component, "heartbeat", 50, []() {
ESP_LOGD("test", "Heartbeat");
id(interval_counter)++;
if (id(interval_counter) >= 10) {
App.scheduler.cancel_interval(id(test_sensor), "heartbeat");
ESP_LOGD("test", "Heartbeat stopped");
}
});
id(create_count) += 2;
ESP_LOGI("test", "Phase 3 complete");
- id: test_defer_patterns
then:
- lambda: |-
// Simulate defer patterns (state changes, async operations)
ESP_LOGI("test", "Phase 4: Simulating heavy defer patterns like ratgdo");
auto *component = id(test_sensor);
// Simulate a burst of defer operations like ratgdo does with state updates
// These should execute immediately and recycle quickly to the pool
for (int i = 0; i < 10; i++) {
std::string defer_name = "defer_" + std::to_string(i);
App.scheduler.set_timeout(component, defer_name, 0, [i]() {
ESP_LOGD("test", "Defer %d executed", i);
// Force a small delay between defer executions to see recycling
if (i == 5) {
ESP_LOGI("test", "Half of defers executed, checking pool status");
}
});
}
id(create_count) += 10;
ESP_LOGD("test", "Created 10 defer operations (0ms timeouts)");
// Also create some named defers that might get replaced
App.scheduler.set_timeout(component, "state_update", 0, []() {
ESP_LOGD("test", "State update 1");
});
// Replace the same named defer (should cancel previous)
App.scheduler.set_timeout(component, "state_update", 0, []() {
ESP_LOGD("test", "State update 2 (replaced)");
});
id(create_count) += 2;
id(cancel_count) += 1; // One cancelled due to replacement
ESP_LOGI("test", "Phase 4 complete");
- id: test_pool_reuse_verification
then:
- lambda: |-
ESP_LOGI("test", "Phase 5: Verifying pool reuse after everything settles");
// Cancel any remaining intervals
auto *component = id(test_sensor);
App.scheduler.cancel_interval(component, "temp_sensor");
App.scheduler.cancel_interval(component, "humidity_sensor");
App.scheduler.cancel_interval(component, "heartbeat");
ESP_LOGD("test", "Cancelled any remaining intervals");
// The pool should have items from completed timeouts in earlier phases.
// Phase 1 had 3 timeouts that completed and were recycled.
// Phase 3 had 1 timeout that completed and was recycled.
// Phase 4 had 3 defers that completed and were recycled.
// So we should have a decent pool size already from naturally completed items.
// Now create 8 new timeouts - they should reuse from pool when available
int reuse_test_count = 8;
for (int i = 0; i < reuse_test_count; i++) {
std::string name = "reuse_test_" + std::to_string(i);
App.scheduler.set_timeout(component, name, 10 + i * 5, [i]() {
ESP_LOGD("test", "Reuse test %d completed", i);
});
}
ESP_LOGI("test", "Created %d items for reuse verification", reuse_test_count);
id(create_count) += reuse_test_count;
ESP_LOGI("test", "Phase 5 complete");
- id: test_full_pool_reuse
then:
- lambda: |-
ESP_LOGI("test", "Phase 6: Testing pool size limits after Phase 5 items complete");
// At this point, all Phase 5 timeouts should have completed and been recycled.
// The pool should be at its maximum size (5).
// Creating 10 new items tests that:
// - First 5 items reuse from the pool
// - Remaining 5 items allocate new (pool empty)
// - Pool doesn't grow beyond MAX_POOL_SIZE of 5
auto *component = id(test_sensor);
int full_reuse_count = 10;
for (int i = 0; i < full_reuse_count; i++) {
std::string name = "full_reuse_" + std::to_string(i);
App.scheduler.set_timeout(component, name, 10 + i * 5, [i]() {
ESP_LOGD("test", "Full reuse test %d completed", i);
});
}
ESP_LOGI("test", "Created %d items for full pool reuse verification", full_reuse_count);
id(create_count) += full_reuse_count;
ESP_LOGI("test", "Phase 6 complete");
- id: test_same_defer_optimization
then:
- lambda: |-
ESP_LOGI("test", "Phase 7: Testing same-named defer optimization");
auto *component = id(test_sensor);
// Create 10 defers with the same name - should optimize to update callback in-place
// This pattern is common in components like ratgdo that repeatedly defer state updates
for (int i = 0; i < 10; i++) {
App.scheduler.set_timeout(component, "repeated_defer", 0, [i]() {
ESP_LOGD("test", "Repeated defer executed with value: %d", i);
});
}
// Only the first should allocate, the rest should update in-place
// We expect only 1 allocation for all 10 operations
id(create_count) += 1; // Only count 1 since others should be optimized
ESP_LOGD("test", "Created 10 same-named defers (should only allocate once)");
ESP_LOGI("test", "Phase 7 complete");
- id: complete_test
then:
- lambda: |-
ESP_LOGI("test", "Pool recycling test complete - created %d items, cancelled %d, intervals %d",
id(create_count), id(cancel_count), id(interval_counter));
sensor:
- platform: template
name: Test Sensor
id: test_sensor
lambda: return 1.0;
update_interval: never
# No interval - tests will be triggered from Python via API services

View File

@@ -0,0 +1,209 @@
"""Integration test for scheduler memory pool functionality."""
from __future__ import annotations
import asyncio
import re
import pytest
from .types import APIClientConnectedFactory, RunCompiledFunction
@pytest.mark.asyncio
async def test_scheduler_pool(
yaml_config: str,
run_compiled: RunCompiledFunction,
api_client_connected: APIClientConnectedFactory,
) -> None:
"""Test that the scheduler memory pool is working correctly with realistic usage.
This test simulates real-world scheduler usage patterns and verifies that:
1. Items are recycled to the pool when timeouts complete naturally
2. Items are recycled when intervals/timeouts are cancelled
3. Items are reused from the pool for new scheduler operations
4. The pool grows gradually based on actual usage patterns
5. Pool operations are logged correctly with debug scheduler enabled
"""
# Track log messages to verify pool behavior
log_lines: list[str] = []
pool_reuse_count = 0
pool_recycle_count = 0
pool_full_count = 0
new_alloc_count = 0
# Patterns to match pool operations
reuse_pattern = re.compile(r"Reused item from pool \(pool size now: (\d+)\)")
recycle_pattern = re.compile(r"Recycled item to pool \(pool size now: (\d+)\)")
pool_full_pattern = re.compile(r"Pool full \(size: (\d+)\), deleting item")
new_alloc_pattern = re.compile(r"Allocated new item \(pool empty\)")
# Futures to track when test phases complete
loop = asyncio.get_running_loop()
test_complete_future: asyncio.Future[bool] = loop.create_future()
phase_futures = {
1: loop.create_future(),
2: loop.create_future(),
3: loop.create_future(),
4: loop.create_future(),
5: loop.create_future(),
6: loop.create_future(),
7: loop.create_future(),
}
def check_output(line: str) -> None:
"""Check log output for pool operations and phase completion."""
nonlocal pool_reuse_count, pool_recycle_count, pool_full_count, new_alloc_count
log_lines.append(line)
# Track pool operations
if reuse_pattern.search(line):
pool_reuse_count += 1
elif recycle_pattern.search(line):
pool_recycle_count += 1
elif pool_full_pattern.search(line):
pool_full_count += 1
elif new_alloc_pattern.search(line):
new_alloc_count += 1
# Track phase completion
for phase_num in range(1, 8):
if (
f"Phase {phase_num} complete" in line
and phase_num in phase_futures
and not phase_futures[phase_num].done()
):
phase_futures[phase_num].set_result(True)
# Check for test completion
if "Pool recycling test complete" in line and not test_complete_future.done():
test_complete_future.set_result(True)
# Run the test with log monitoring
async with (
run_compiled(yaml_config, line_callback=check_output),
api_client_connected() as client,
):
# Verify device is running
device_info = await client.device_info()
assert device_info is not None
assert device_info.name == "scheduler-pool-test"
# Get list of services
entities, services = await client.list_entities_services()
service_names = {s.name for s in services}
# Verify all test services are available
expected_services = {
"run_phase_1",
"run_phase_2",
"run_phase_3",
"run_phase_4",
"run_phase_5",
"run_phase_6",
"run_phase_7",
"run_complete",
}
assert expected_services.issubset(service_names), (
f"Missing services: {expected_services - service_names}"
)
# Get service objects
phase_services = {
num: next(s for s in services if s.name == f"run_phase_{num}")
for num in range(1, 8)
}
complete_service = next(s for s in services if s.name == "run_complete")
try:
# Phase 1: Component lifecycle
client.execute_service(phase_services[1], {})
await asyncio.wait_for(phase_futures[1], timeout=1.0)
await asyncio.sleep(0.05) # Let timeouts complete
# Phase 2: Sensor polling
client.execute_service(phase_services[2], {})
await asyncio.wait_for(phase_futures[2], timeout=1.0)
await asyncio.sleep(0.1) # Let intervals run a bit
# Phase 3: Communication patterns
client.execute_service(phase_services[3], {})
await asyncio.wait_for(phase_futures[3], timeout=1.0)
await asyncio.sleep(0.1) # Let heartbeat run
# Phase 4: Defer patterns
client.execute_service(phase_services[4], {})
await asyncio.wait_for(phase_futures[4], timeout=1.0)
await asyncio.sleep(0.2) # Let everything settle and recycle
# Phase 5: Pool reuse verification
client.execute_service(phase_services[5], {})
await asyncio.wait_for(phase_futures[5], timeout=1.0)
await asyncio.sleep(0.1) # Let Phase 5 timeouts complete and recycle
# Phase 6: Full pool reuse verification
client.execute_service(phase_services[6], {})
await asyncio.wait_for(phase_futures[6], timeout=1.0)
await asyncio.sleep(0.1) # Let Phase 6 timeouts complete
# Phase 7: Same-named defer optimization
client.execute_service(phase_services[7], {})
await asyncio.wait_for(phase_futures[7], timeout=1.0)
await asyncio.sleep(0.05) # Let the single defer execute
# Complete test
client.execute_service(complete_service, {})
await asyncio.wait_for(test_complete_future, timeout=0.5)
except TimeoutError as e:
# Print debug info if test times out
recent_logs = "\n".join(log_lines[-30:])
phases_completed = [num for num, fut in phase_futures.items() if fut.done()]
pytest.fail(
f"Test timed out waiting for phase/completion. Error: {e}\n"
f" Phases completed: {phases_completed}\n"
f" Pool stats:\n"
f" Reuse count: {pool_reuse_count}\n"
f" Recycle count: {pool_recycle_count}\n"
f" Pool full count: {pool_full_count}\n"
f" New alloc count: {new_alloc_count}\n"
f"Recent logs:\n{recent_logs}"
)
# Verify all test phases ran
for phase_num in range(1, 8):
assert phase_futures[phase_num].done(), f"Phase {phase_num} did not complete"
# Verify pool behavior
assert pool_recycle_count > 0, "Should have recycled items to pool"
# Check pool metrics
if pool_recycle_count > 0:
max_pool_size = 0
for line in log_lines:
if match := recycle_pattern.search(line):
size = int(match.group(1))
max_pool_size = max(max_pool_size, size)
# Pool can grow up to its maximum of 5
assert max_pool_size <= 5, f"Pool grew beyond maximum ({max_pool_size})"
# Log summary for debugging
print("\nScheduler Pool Test Summary (Python Orchestrated):")
print(f" Items recycled to pool: {pool_recycle_count}")
print(f" Items reused from pool: {pool_reuse_count}")
print(f" Pool full events: {pool_full_count}")
print(f" New allocations: {new_alloc_count}")
print(" All phases completed successfully")
# Verify reuse happened
if pool_reuse_count == 0 and pool_recycle_count > 3:
pytest.fail("Pool had items recycled but none were reused")
# Success - pool is working
assert pool_recycle_count > 0 or new_alloc_count < 15, (
"Pool should either recycle items or limit new allocations"
)