1
0
mirror of https://github.com/esphome/esphome.git synced 2026-02-09 09:11:52 +00:00

Compare commits

...

13 Commits

Author SHA1 Message Date
J. Nick Koston
8f032d5213 wip 2026-02-08 15:04:33 -06:00
J. Nick Koston
12c369cb09 wip 2026-02-08 15:01:01 -06:00
J. Nick Koston
b728ebd5fb tweaks 2026-02-08 14:56:57 -06:00
J. Nick Koston
5caa54761b tweaks 2026-02-08 14:51:11 -06:00
J. Nick Koston
4efdfda8fb wip 2026-02-08 14:34:37 -06:00
J. Nick Koston
6099dd3065 wip 2026-02-08 14:34:00 -06:00
J. Nick Koston
30558262d8 fix dupes 2026-02-08 14:27:09 -06:00
J. Nick Koston
6dd7f45cc9 tweak 2026-02-08 14:12:35 -06:00
J. Nick Koston
17664750b7 tweak 2026-02-08 11:20:54 -06:00
J. Nick Koston
95e92716d9 tweak 2026-02-08 11:19:38 -06:00
J. Nick Koston
0f1ece6bdc [analyze-memory] Attribute PlatformIO library symbols via nm archive scanning 2026-02-08 11:14:34 -06:00
J. Nick Koston
8c4a732eb7 copilot edge cases 2026-02-07 18:00:30 -06:00
J. Nick Koston
4e3ccb4fc5 [analyze-memory] Attribute CSWTCH symbols from SDK archives 2026-02-07 17:52:20 -06:00
2 changed files with 488 additions and 48 deletions

View File

@@ -43,6 +43,7 @@ _READELF_SECTION_PATTERN = re.compile(
# Component category prefixes
_COMPONENT_PREFIX_ESPHOME = "[esphome]"
_COMPONENT_PREFIX_EXTERNAL = "[external]"
_COMPONENT_PREFIX_LIB = "[lib]"
_COMPONENT_CORE = f"{_COMPONENT_PREFIX_ESPHOME}core"
_COMPONENT_API = f"{_COMPONENT_PREFIX_ESPHOME}api"
@@ -56,6 +57,16 @@ SymbolInfoType = tuple[str, int, str]
# RAM sections - symbols in these sections consume RAM
RAM_SECTIONS = frozenset([".data", ".bss"])
# nm symbol types for global/weak defined symbols (used for library symbol mapping)
# Only global (uppercase) and weak symbols are safe to use - local symbols (lowercase)
# can have name collisions across compilation units
_NM_DEFINED_GLOBAL_TYPES = frozenset({"T", "D", "B", "R", "W", "V"})
# Pattern matching compiler-generated local names that can collide across compilation
# units (e.g., packet$19, buf$20, flag$5261). These are unsafe for name-based lookup.
# Does NOT match mangled C++ names with optimization suffixes (e.g., func$isra$0).
_COMPILER_LOCAL_PATTERN = re.compile(r"^[a-zA-Z_]\w*\$\d+$")
@dataclass
class MemorySection:
@@ -179,11 +190,19 @@ class MemoryAnalyzer:
self._sdk_symbols: list[SDKSymbol] = []
# CSWTCH symbols: list of (name, size, source_file, component)
self._cswtch_symbols: list[tuple[str, int, str, str]] = []
# Library symbol mapping: symbol_name -> library_name
self._lib_symbol_map: dict[str, str] = {}
# Library dir to name mapping: "lib641" -> "espsoftwareserial",
# "espressif__mdns" -> "mdns"
self._lib_hash_to_name: dict[str, str] = {}
# Heuristic category to library redirect: "mdns_lib" -> "[lib]mdns"
self._heuristic_to_lib: dict[str, str] = {}
def analyze(self) -> dict[str, ComponentMemory]:
"""Analyze the ELF file and return component memory usage."""
self._parse_sections()
self._parse_symbols()
self._scan_libraries()
self._categorize_symbols()
self._analyze_cswtch_symbols()
self._analyze_sdk_libraries()
@@ -328,15 +347,19 @@ class MemoryAnalyzer:
# If no component match found, it's core
return _COMPONENT_CORE
# Check library symbol map (more accurate than heuristic patterns)
if lib_name := self._lib_symbol_map.get(symbol_name):
return f"{_COMPONENT_PREFIX_LIB}{lib_name}"
# Check against symbol patterns
for component, patterns in SYMBOL_PATTERNS.items():
if any(pattern in symbol_name for pattern in patterns):
return component
return self._heuristic_to_lib.get(component, component)
# Check against demangled patterns
for component, patterns in DEMANGLED_PATTERNS.items():
if any(pattern in demangled for pattern in patterns):
return component
return self._heuristic_to_lib.get(component, component)
# Special cases that need more complex logic
@@ -384,6 +407,327 @@ class MemoryAnalyzer:
return "Other Core"
def _discover_pio_libraries(
self,
libraries: dict[str, list[Path]],
hash_to_name: dict[str, str],
) -> None:
"""Discover PlatformIO third-party libraries from the build directory.
Scans ``lib<hex>/`` directories under ``.pioenvs/<env>/`` to find
library names and their ``.a`` archive or ``.o`` file paths.
Args:
libraries: Dict to populate with library name -> file path list mappings.
Prefers ``.a`` archives when available, falls back to ``.o`` files
(e.g., pioarduino ESP32 Arduino builds only produce ``.o`` files).
hash_to_name: Dict to populate with dir name -> library name mappings
for CSWTCH attribution (e.g., ``lib641`` -> ``espsoftwareserial``).
"""
build_dir = self.elf_path.parent
for entry in build_dir.iterdir():
if not entry.is_dir() or not entry.name.startswith("lib"):
continue
# Validate that the suffix after "lib" is a hex hash
hex_part = entry.name[3:]
if not hex_part:
continue
try:
int(hex_part, 16)
except ValueError:
continue
# Each lib<hex>/ directory contains a subdirectory named after the library
for lib_subdir in entry.iterdir():
if not lib_subdir.is_dir():
continue
lib_name = lib_subdir.name.lower()
# Prefer .a archive (lib<LibraryName>.a), fall back to .o files
# e.g., lib72a/ESPAsyncTCP/... has lib72a/libESPAsyncTCP.a
archive = entry / f"lib{lib_subdir.name}.a"
if archive.exists():
file_paths = [archive]
elif archives := list(entry.glob("*.a")):
# Case-insensitive fallback
file_paths = [archives[0]]
else:
# No .a archive (e.g., pioarduino CMake builds) - use .o files
file_paths = sorted(lib_subdir.rglob("*.o"))
if file_paths:
libraries[lib_name] = file_paths
hash_to_name[entry.name] = lib_name
_LOGGER.debug(
"Discovered PlatformIO library: %s -> %s",
lib_subdir.name,
file_paths[0],
)
def _discover_idf_managed_components(
self,
libraries: dict[str, list[Path]],
hash_to_name: dict[str, str],
) -> None:
"""Discover ESP-IDF managed component libraries from the build directory.
ESP-IDF managed components (from the IDF component registry) use a
``<vendor>__<name>`` naming convention. Source files live under
``managed_components/<vendor>__<name>/`` and the compiled archives are at
``esp-idf/<vendor>__<name>/lib<vendor>__<name>.a``.
Args:
libraries: Dict to populate with library name -> file path list mappings.
hash_to_name: Dict to populate with dir name -> library name mappings
for CSWTCH attribution (e.g., ``espressif__mdns`` -> ``mdns``).
"""
build_dir = self.elf_path.parent
managed_dir = build_dir / "managed_components"
if not managed_dir.is_dir():
return
espidf_dir = build_dir / "esp-idf"
for entry in managed_dir.iterdir():
if not entry.is_dir() or "__" not in entry.name:
continue
# Extract the short name: espressif__mdns -> mdns
full_name = entry.name # e.g., espressif__mdns
short_name = full_name.split("__", 1)[1].lower()
# Find the .a archive under esp-idf/<vendor>__<name>/
archive = espidf_dir / full_name / f"lib{full_name}.a"
if archive.exists():
libraries[short_name] = [archive]
hash_to_name[full_name] = short_name
_LOGGER.debug(
"Discovered IDF managed component: %s -> %s",
short_name,
archive,
)
def _build_library_symbol_map(
self, libraries: dict[str, list[Path]]
) -> dict[str, str]:
"""Build a symbol-to-library mapping from library archives or object files.
Runs ``nm --defined-only`` on each ``.a`` or ``.o`` file to collect
global and weak defined symbols.
Args:
libraries: Dictionary mapping library name to list of file paths
(``.a`` archives or ``.o`` object files).
Returns:
Dictionary mapping symbol name to library name.
"""
symbol_map: dict[str, str] = {}
if not self.nm_path:
return symbol_map
for lib_name, file_paths in libraries.items():
result = run_tool(
[self.nm_path, "--defined-only", *(str(p) for p in file_paths)],
timeout=10,
)
if result is None or result.returncode != 0:
continue
for line in result.stdout.splitlines():
parts = line.split()
if len(parts) < 3:
continue
sym_type = parts[-2]
sym_name = parts[-1]
# Include global defined symbols (uppercase) and weak symbols (W/V)
if sym_type in _NM_DEFINED_GLOBAL_TYPES:
symbol_map[sym_name] = lib_name
return symbol_map
@staticmethod
def _build_heuristic_to_lib_mapping(
library_names: set[str],
) -> dict[str, str]:
"""Build mapping from heuristic pattern categories to discovered libraries.
Heuristic categories like ``mdns_lib``, ``web_server_lib``, ``async_tcp``
exist as approximations for library attribution. When we discover the
actual library, symbols matching those heuristics should be redirected
to the ``[lib]`` category instead.
The mapping is built by checking if the normalized category name
(stripped of ``_lib`` suffix and underscores) appears as a substring
of any discovered library name.
Examples::
mdns_lib -> mdns -> in "mdns" or "esp8266mdns" -> [lib]mdns
web_server_lib -> webserver -> in "espasyncwebserver" -> [lib]espasyncwebserver
async_tcp -> asynctcp -> in "espasynctcp" -> [lib]espasynctcp
Args:
library_names: Set of discovered library names (lowercase).
Returns:
Dictionary mapping heuristic category to ``[lib]<name>`` string.
"""
mapping: dict[str, str] = {}
all_categories = set(SYMBOL_PATTERNS) | set(DEMANGLED_PATTERNS)
for category in all_categories:
base = category.removesuffix("_lib").replace("_", "")
# Collect all libraries whose name contains the base string
candidates = [lib_name for lib_name in library_names if base in lib_name]
if not candidates:
continue
# Choose a deterministic "best" match:
# 1. Prefer exact name matches over substring matches.
# 2. Among non-exact matches, prefer the shortest library name.
# 3. Break remaining ties lexicographically.
best_lib = min(
candidates,
key=lambda lib_name, _base=base: (
lib_name != _base,
len(lib_name),
lib_name,
),
)
mapping[category] = f"{_COMPONENT_PREFIX_LIB}{best_lib}"
if mapping:
_LOGGER.debug(
"Heuristic-to-library redirects: %s",
", ".join(f"{k} -> {v}" for k, v in sorted(mapping.items())),
)
return mapping
def _parse_map_file(self) -> dict[str, str] | None:
"""Parse linker map file to build authoritative symbol-to-library mapping.
The linker map file contains the definitive source attribution for every
symbol, including local/static ones that ``nm`` cannot safely export.
Map file format (GNU ld)::
.text._mdns_service_task
0x400e9fdc 0x65c .pioenvs/env/esp-idf/espressif__mdns/libespressif__mdns.a(mdns.c.o)
Each section entry has a ``.section.symbol_name`` line followed by an
indented line with address, size, and source path.
Returns:
Symbol-to-library dict, or ``None`` if no usable map file exists.
"""
map_path = self.elf_path.with_suffix(".map")
if not map_path.exists() or map_path.stat().st_size < 10000:
return None
_LOGGER.info("Parsing linker map file: %s", map_path.name)
try:
map_text = map_path.read_text(encoding="utf-8", errors="replace")
except OSError as err:
_LOGGER.warning("Failed to read map file: %s", err)
return None
symbol_map: dict[str, str] = {}
current_symbol: str | None = None
section_prefixes = (".text.", ".rodata.", ".data.", ".bss.", ".literal.")
for line in map_text.splitlines():
# Match section.symbol line: " .text.symbol_name"
# Single space indent, starts with dot
if len(line) > 2 and line[0] == " " and line[1] == ".":
stripped = line.strip()
for prefix in section_prefixes:
if stripped.startswith(prefix):
current_symbol = stripped[len(prefix) :]
break
else:
current_symbol = None
continue
# Match source attribution line: " 0xADDR 0xSIZE source_path"
if current_symbol is None:
continue
fields = line.split()
# Skip compiler-generated local names (e.g., packet$19, buf$20)
# that can collide across compilation units
if (
len(fields) >= 3
and fields[0].startswith("0x")
and fields[1].startswith("0x")
and not _COMPILER_LOCAL_PATTERN.match(current_symbol)
):
source_path = fields[2]
# Check if source path contains a known library directory
for dir_key, lib_name in self._lib_hash_to_name.items():
if dir_key in source_path:
symbol_map[current_symbol] = lib_name
break
current_symbol = None
return symbol_map or None
def _scan_libraries(self) -> None:
"""Discover third-party libraries and build symbol mapping.
Scans both PlatformIO ``lib<hex>/`` directories (Arduino builds) and
ESP-IDF ``managed_components/`` (IDF builds) to find library archives.
Uses the linker map file for authoritative symbol attribution when
available, falling back to ``nm`` scanning with heuristic redirects.
"""
libraries: dict[str, list[Path]] = {}
self._discover_pio_libraries(libraries, self._lib_hash_to_name)
self._discover_idf_managed_components(libraries, self._lib_hash_to_name)
if not libraries:
_LOGGER.debug("No third-party libraries found")
return
_LOGGER.info(
"Scanning %d libraries: %s",
len(libraries),
", ".join(sorted(libraries)),
)
# Heuristic redirect catches local symbols (e.g., mdns_task_buffer$14)
# that can't be safely added to the symbol map due to name collisions
self._heuristic_to_lib = self._build_heuristic_to_lib_mapping(
set(libraries.keys())
)
# Try linker map file first (authoritative, includes local symbols)
map_symbols = self._parse_map_file()
if map_symbols is not None:
self._lib_symbol_map = map_symbols
_LOGGER.info(
"Built library symbol map from linker map: %d symbols",
len(self._lib_symbol_map),
)
return
# Fall back to nm scanning (global symbols only)
self._lib_symbol_map = self._build_library_symbol_map(libraries)
_LOGGER.info(
"Built library symbol map from nm: %d symbols from %d libraries",
len(self._lib_symbol_map),
len(libraries),
)
def _find_object_files_dir(self) -> Path | None:
"""Find the directory containing object files for this build.
@@ -397,47 +741,38 @@ class MemoryAnalyzer:
return pioenvs_dir
return None
def _scan_cswtch_in_objects(
self, obj_dir: Path
) -> dict[str, list[tuple[str, int]]]:
"""Scan object files for CSWTCH symbols using a single nm invocation.
@staticmethod
def _parse_nm_cswtch_output(
output: str,
base_dir: Path | None,
cswtch_map: dict[str, list[tuple[str, int]]],
) -> None:
"""Parse nm output for CSWTCH symbols and add to cswtch_map.
Uses ``nm --print-file-name -S`` on all ``.o`` files at once.
Output format: ``/path/to/file.o:address size type name``
Handles both ``.o`` files and ``.a`` archives.
nm output formats::
.o files: /path/file.o:hex_addr hex_size type name
.a files: /path/lib.a:member.o:hex_addr hex_size type name
For ``.o`` files, paths are made relative to *base_dir* when possible.
For ``.a`` archives (detected by ``:`` in the file portion), paths are
formatted as ``archive_stem/member.o`` (e.g. ``liblwip2-536-feat/lwip-esp.o``).
Args:
obj_dir: Directory containing object files (.pioenvs/<env>/)
Returns:
Dict mapping "CSWTCH$NNN:size" to list of (source_file, size) tuples.
output: Raw stdout from ``nm --print-file-name -S``.
base_dir: Base directory for computing relative paths of ``.o`` files.
Pass ``None`` when scanning archives outside the build tree.
cswtch_map: Dict to populate, mapping ``"CSWTCH$N:size"`` to source list.
"""
cswtch_map: dict[str, list[tuple[str, int]]] = defaultdict(list)
if not self.nm_path:
return cswtch_map
# Find all .o files recursively, sorted for deterministic output
obj_files = sorted(obj_dir.rglob("*.o"))
if not obj_files:
return cswtch_map
_LOGGER.debug("Scanning %d object files for CSWTCH symbols", len(obj_files))
# Single nm call with --print-file-name for all object files
result = run_tool(
[self.nm_path, "--print-file-name", "-S"] + [str(f) for f in obj_files],
timeout=30,
)
if result is None or result.returncode != 0:
return cswtch_map
for line in result.stdout.splitlines():
for line in output.splitlines():
if "CSWTCH$" not in line:
continue
# Split on last ":" that precedes a hex address.
# nm --print-file-name format: filepath:hex_addr hex_size type name
# We split from the right: find the last colon followed by hex digits.
# For .o: "filepath.o" : "hex_addr hex_size type name"
# For .a: "filepath.a:member.o" : "hex_addr hex_size type name"
parts_after_colon = line.rsplit(":", 1)
if len(parts_after_colon) != 2:
continue
@@ -457,16 +792,89 @@ class MemoryAnalyzer:
except ValueError:
continue
# Get relative path from obj_dir for readability
try:
rel_path = str(Path(file_path).relative_to(obj_dir))
except ValueError:
# Determine readable source path
# Use ".a:" to detect archive format (not bare ":" which matches
# Windows drive letters like "C:\...\file.o").
if ".a:" in file_path:
# Archive format: "archive.a:member.o" → "archive_stem/member.o"
archive_part, member = file_path.rsplit(":", 1)
archive_name = Path(archive_part).stem
rel_path = f"{archive_name}/{member}"
elif base_dir is not None:
try:
rel_path = str(Path(file_path).relative_to(base_dir))
except ValueError:
rel_path = file_path
else:
rel_path = file_path
key = f"{sym_name}:{size}"
cswtch_map[key].append((rel_path, size))
return cswtch_map
def _run_nm_cswtch_scan(
self,
files: list[Path],
base_dir: Path | None,
cswtch_map: dict[str, list[tuple[str, int]]],
) -> None:
"""Run nm on *files* and add any CSWTCH symbols to *cswtch_map*.
Args:
files: Object (``.o``) or archive (``.a``) files to scan.
base_dir: Base directory for relative path computation (see
:meth:`_parse_nm_cswtch_output`).
cswtch_map: Dict to populate with results.
"""
if not self.nm_path or not files:
return
_LOGGER.debug("Scanning %d files for CSWTCH symbols", len(files))
result = run_tool(
[self.nm_path, "--print-file-name", "-S"] + [str(f) for f in files],
timeout=30,
)
if result is None or result.returncode != 0:
_LOGGER.debug(
"nm failed or timed out scanning %d files for CSWTCH symbols",
len(files),
)
return
self._parse_nm_cswtch_output(result.stdout, base_dir, cswtch_map)
def _scan_cswtch_in_sdk_archives(
self, cswtch_map: dict[str, list[tuple[str, int]]]
) -> None:
"""Scan SDK library archives (.a) for CSWTCH symbols.
Prebuilt SDK libraries (e.g. lwip, bearssl) are not compiled from source,
so their CSWTCH symbols only exist inside ``.a`` archives. Results are
merged into *cswtch_map* for keys not already found in ``.o`` files.
The same source file (e.g. ``lwip-esp.o``) often appears in multiple
library variants (``liblwip2-536.a``, ``liblwip2-1460-feat.a``, etc.),
so results are deduplicated by member name.
"""
sdk_dirs = self._find_sdk_library_dirs()
if not sdk_dirs:
return
sdk_archives = sorted(a for sdk_dir in sdk_dirs for a in sdk_dir.glob("*.a"))
sdk_map: dict[str, list[tuple[str, int]]] = defaultdict(list)
self._run_nm_cswtch_scan(sdk_archives, None, sdk_map)
# Merge SDK results, deduplicating by member name.
for key, sources in sdk_map.items():
if key in cswtch_map:
continue
seen: dict[str, tuple[str, int]] = {}
for path, sz in sources:
member = Path(path).name
if member not in seen:
seen[member] = (path, sz)
cswtch_map[key] = list(seen.values())
def _source_file_to_component(self, source_file: str) -> str:
"""Map a source object file path to its component name.
@@ -495,9 +903,21 @@ class MemoryAnalyzer:
if "esphome" in parts and "components" not in parts:
return _COMPONENT_CORE
# Framework/library files - return the first path component
# e.g., lib65b/ESPAsyncTCP/... -> lib65b
# FrameworkArduino/... -> FrameworkArduino
# Framework/library files - check for PlatformIO library hash dirs
# e.g., lib65b/ESPAsyncTCP/... -> [lib]espasynctcp
if parts and parts[0] in self._lib_hash_to_name:
return f"{_COMPONENT_PREFIX_LIB}{self._lib_hash_to_name[parts[0]]}"
# ESP-IDF managed components: managed_components/espressif__mdns/... -> [lib]mdns
if (
len(parts) >= 2
and parts[0] == "managed_components"
and parts[1] in self._lib_hash_to_name
):
return f"{_COMPONENT_PREFIX_LIB}{self._lib_hash_to_name[parts[1]]}"
# Other framework/library files - return the first path component
# e.g., FrameworkArduino/... -> FrameworkArduino
return parts[0] if parts else source_file
def _analyze_cswtch_symbols(self) -> None:
@@ -505,17 +925,25 @@ class MemoryAnalyzer:
CSWTCH symbols are compiler-generated lookup tables for switch statements.
They are local symbols, so the same name can appear in different object files.
This method scans .o files to attribute them to their source components.
This method scans .o files and SDK archives to attribute them to their
source components.
"""
obj_dir = self._find_object_files_dir()
if obj_dir is None:
_LOGGER.debug("No object files directory found, skipping CSWTCH analysis")
return
# Scan object files for CSWTCH symbols
cswtch_map = self._scan_cswtch_in_objects(obj_dir)
# Scan build-dir object files for CSWTCH symbols
cswtch_map: dict[str, list[tuple[str, int]]] = defaultdict(list)
self._run_nm_cswtch_scan(sorted(obj_dir.rglob("*.o")), obj_dir, cswtch_map)
# Also scan SDK library archives (.a) for CSWTCH symbols.
# Prebuilt SDK libraries (e.g. lwip, bearssl) are not compiled from source
# so their symbols only exist inside .a archives, not as loose .o files.
self._scan_cswtch_in_sdk_archives(cswtch_map)
if not cswtch_map:
_LOGGER.debug("No CSWTCH symbols found in object files")
_LOGGER.debug("No CSWTCH symbols found in object files or SDK archives")
return
# Collect CSWTCH symbols from the ELF (already parsed in sections)

View File

@@ -14,6 +14,7 @@ from . import (
_COMPONENT_CORE,
_COMPONENT_PREFIX_ESPHOME,
_COMPONENT_PREFIX_EXTERNAL,
_COMPONENT_PREFIX_LIB,
RAM_SECTIONS,
MemoryAnalyzer,
)
@@ -407,6 +408,11 @@ class MemoryAnalyzerCLI(MemoryAnalyzer):
for name, mem in components
if name.startswith(_COMPONENT_PREFIX_EXTERNAL)
]
library_components = [
(name, mem)
for name, mem in components
if name.startswith(_COMPONENT_PREFIX_LIB)
]
top_esphome_components = sorted(
esphome_components, key=lambda x: x[1].flash_total, reverse=True
@@ -417,6 +423,11 @@ class MemoryAnalyzerCLI(MemoryAnalyzer):
external_components, key=lambda x: x[1].flash_total, reverse=True
)
# Include all library components
top_library_components = sorted(
library_components, key=lambda x: x[1].flash_total, reverse=True
)
# Check if API component exists and ensure it's included
api_component = None
for name, mem in components:
@@ -435,10 +446,11 @@ class MemoryAnalyzerCLI(MemoryAnalyzer):
if name in system_components_to_include
]
# Combine all components to analyze: top ESPHome + all external + API if not already included + system components
# Combine all components to analyze: top ESPHome + all external + libraries + API if not already included + system components
components_to_analyze = (
list(top_esphome_components)
+ list(top_external_components)
+ list(top_library_components)
+ system_components
)
if api_component and api_component not in components_to_analyze: