1
0
mirror of https://github.com/esphome/esphome.git synced 2025-10-31 23:21:54 +00:00

cache github downloads

This commit is contained in:
J. Nick Koston
2025-10-19 14:33:26 -10:00
parent 0f87e7508b
commit 0eab64ffe5
5 changed files with 870 additions and 4 deletions

279
esphome/github_cache.py Normal file
View File

@@ -0,0 +1,279 @@
"""GitHub download cache for ESPHome.
This module provides caching functionality for GitHub release downloads
to avoid redundant network I/O when switching between platforms.
"""
from __future__ import annotations
import hashlib
import json
import logging
from pathlib import Path
import shutil
import time
import urllib.error
import urllib.request
_LOGGER = logging.getLogger(__name__)
class GitHubCache:
"""Manages caching of GitHub release downloads."""
def __init__(self, cache_dir: Path | None = None):
"""Initialize the cache manager.
Args:
cache_dir: Directory to store cached files.
Defaults to ~/.esphome_cache/github
"""
if cache_dir is None:
cache_dir = Path.home() / ".esphome_cache" / "github"
self.cache_dir = cache_dir
self.cache_dir.mkdir(parents=True, exist_ok=True)
self.metadata_file = self.cache_dir / "cache_metadata.json"
def _load_metadata(self) -> dict:
"""Load cache metadata from disk."""
if self.metadata_file.exists():
try:
with open(self.metadata_file) as f:
return json.load(f)
except Exception:
return {}
return {}
def _save_metadata(self, metadata: dict) -> None:
"""Save cache metadata to disk."""
try:
with open(self.metadata_file, "w") as f:
json.dump(metadata, f, indent=2)
except Exception as e:
_LOGGER.debug("Failed to save cache metadata: %s", e)
@staticmethod
def is_github_url(url: str) -> bool:
"""Check if URL is a GitHub release download."""
return "github.com" in url.lower() and url.endswith(".zip")
def _get_cache_key(self, url: str) -> str:
"""Get cache key (hash) for a URL."""
return hashlib.sha256(url.encode()).hexdigest()
def _get_cache_path(self, url: str) -> Path:
"""Get cache file path for a URL."""
cache_key = self._get_cache_key(url)
ext = Path(url.split("?")[0]).suffix
return self.cache_dir / f"{cache_key}{ext}"
def _check_if_modified(
self,
url: str,
last_modified: str | None = None,
etag: str | None = None,
) -> bool:
"""Check if a URL has been modified using HTTP 304.
Args:
url: URL to check
last_modified: Last-Modified header from previous response
etag: ETag header from previous response
Returns:
True if modified (or unable to check), False if not modified
"""
if not last_modified and not etag:
# No cache headers available, assume modified
return True
try:
request = urllib.request.Request(url)
request.get_method = lambda: "HEAD"
if last_modified:
request.add_header("If-Modified-Since", last_modified)
if etag:
request.add_header("If-None-Match", etag)
try:
urllib.request.urlopen(request, timeout=10)
# 200 OK = file was modified
return True
except urllib.error.HTTPError as e:
if e.code == 304:
# Not modified
_LOGGER.debug("File not modified (HTTP 304): %s", url)
return False
# Other errors, assume modified to be safe
return True
except Exception as e:
# If check fails, assume not modified (use cache)
_LOGGER.debug("Failed to check if modified: %s", e)
return False
def get_cached_path(self, url: str, check_updates: bool = True) -> Path | None:
"""Get path to cached file if available and valid.
Args:
url: URL to check
check_updates: Whether to check for updates using HTTP 304
Returns:
Path to cached file if valid, None if needs download
"""
if not self.is_github_url(url):
return None
cache_path = self._get_cache_path(url)
if not cache_path.exists():
return None
if not check_updates:
_LOGGER.debug("Using cached file (no update check): %s", url)
return cache_path
# Load metadata and check if modified
metadata = self._load_metadata()
cache_key = self._get_cache_key(url)
if cache_key not in metadata:
# Have file but no metadata, use it anyway
_LOGGER.debug("Using cached file (no metadata): %s", url)
return cache_path
last_modified = metadata[cache_key].get("last_modified")
etag = metadata[cache_key].get("etag")
if self._check_if_modified(url, last_modified, etag):
# File was modified, need to re-download
_LOGGER.debug("Cached file is outdated: %s", url)
return None
# File not modified, use cache
_LOGGER.debug("Using cached file: %s", url)
return cache_path
def save_to_cache(self, url: str, source_path: Path) -> None:
"""Save a downloaded file to cache.
Args:
url: URL the file was downloaded from
source_path: Path to the downloaded file
"""
if not self.is_github_url(url):
return
try:
cache_path = self._get_cache_path(url)
# Only copy if source and destination are different
if source_path.resolve() != cache_path.resolve():
shutil.copy2(source_path, cache_path)
# Try to get HTTP headers for caching
last_modified = None
etag = None
try:
request = urllib.request.Request(url)
request.get_method = lambda: "HEAD"
response = urllib.request.urlopen(request, timeout=10)
last_modified = response.headers.get("Last-Modified")
etag = response.headers.get("ETag")
except Exception:
pass
# Update metadata
metadata = self._load_metadata()
cache_key = self._get_cache_key(url)
metadata[cache_key] = {
"url": url,
"size": cache_path.stat().st_size,
"cached_at": time.time(),
"last_modified": last_modified,
"etag": etag,
}
self._save_metadata(metadata)
_LOGGER.debug("Saved to cache: %s", url)
except Exception as e:
_LOGGER.debug("Failed to save to cache: %s", e)
def copy_from_cache(self, url: str, destination: Path) -> bool:
"""Copy a cached file to destination.
Args:
url: URL of the cached file
destination: Where to copy the file
Returns:
True if successful, False otherwise
"""
cached_path = self.get_cached_path(url, check_updates=True)
if not cached_path:
return False
try:
shutil.copy2(cached_path, destination)
_LOGGER.info("Using cached download for %s", url)
return True
except Exception as e:
_LOGGER.warning("Failed to use cache: %s", e)
return False
def cache_size(self) -> int:
"""Get total size of cached files in bytes."""
total = 0
try:
for file_path in self.cache_dir.glob("*"):
if file_path.is_file() and file_path != self.metadata_file:
total += file_path.stat().st_size
except Exception:
pass
return total
def list_cached(self) -> list[dict]:
"""List all cached files with metadata."""
cached_files = []
metadata = self._load_metadata()
for cache_key, meta in metadata.items():
cache_path = (
self.cache_dir / f"{cache_key}{Path(meta['url'].split('?')[0]).suffix}"
)
if cache_path.exists():
cached_files.append(
{
"url": meta["url"],
"path": cache_path,
"size": meta["size"],
"cached_at": meta.get("cached_at"),
"last_modified": meta.get("last_modified"),
"etag": meta.get("etag"),
}
)
return cached_files
def clear_cache(self) -> None:
"""Clear all cached files."""
try:
for file_path in self.cache_dir.glob("*"):
if file_path.is_file():
file_path.unlink()
_LOGGER.info("Cache cleared: %s", self.cache_dir)
except Exception as e:
_LOGGER.warning("Failed to clear cache: %s", e)
# Global cache instance
_cache: GitHubCache | None = None
def get_cache() -> GitHubCache:
"""Get the global GitHub cache instance."""
global _cache # noqa: PLW0603
if _cache is None:
_cache = GitHubCache()
return _cache

View File

@@ -5,7 +5,6 @@ import os
from pathlib import Path
import re
import subprocess
from typing import Any
from esphome.const import CONF_COMPILE_PROCESS_LIMIT, CONF_ESPHOME, KEY_CORE
from esphome.core import CORE, EsphomeError
@@ -44,15 +43,67 @@ def patch_structhash():
def patch_file_downloader():
"""Patch PlatformIO's FileDownloader to retry on PackageException errors."""
"""Patch PlatformIO's FileDownloader to add caching and retry on PackageException errors."""
from platformio.package.download import FileDownloader
from platformio.package.exception import PackageException
# Import our cache module
from esphome.github_cache import GitHubCache
_LOGGER.info("Applying GitHub download cache patch...")
original_init = FileDownloader.__init__
original_start = FileDownloader.start
def patched_init(self, *args: Any, **kwargs: Any) -> None:
# Initialize cache in .platformio directory so it benefits from GitHub Actions cache
platformio_dir = Path.home() / ".platformio"
cache = GitHubCache(cache_dir=platformio_dir / "esphome_download_cache")
_LOGGER.info("GitHub download cache initialized at: %s", cache.cache_dir)
def patched_init(self, *args, **kwargs):
"""Patched init that checks cache before making HTTP connection."""
# Extract URL from args (first positional argument)
url = args[0] if args else kwargs.get("url")
dest_dir = args[1] if len(args) > 1 else kwargs.get("dest_dir")
# Debug: Log all downloads
_LOGGER.debug("[GitHub Cache] Download request for: %s", url)
# Store URL for later use (original FileDownloader doesn't store it)
self._esphome_cache_url = url if cache.is_github_url(url) else None
# Check cache for GitHub URLs BEFORE making HTTP request
if self._esphome_cache_url:
_LOGGER.debug("[GitHub Cache] This is a GitHub URL, checking cache...")
self._esphome_use_cache = cache.get_cached_path(url, check_updates=True)
if self._esphome_use_cache:
_LOGGER.debug(
"[GitHub Cache] Found in cache: %s", self._esphome_use_cache
)
else:
_LOGGER.debug("[GitHub Cache] Not in cache, will download and cache")
else:
self._esphome_use_cache = None
if url and str(url).startswith("http"):
_LOGGER.debug("[GitHub Cache] Not a GitHub URL, skipping cache")
# Only make HTTP connection if we don't have cached file
if self._esphome_use_cache:
# Skip HTTP connection, we'll handle this in start()
# Set minimal attributes to satisfy FileDownloader
self._http_session = None
self._http_response = None
self._fname = Path(url.split("?")[0]).name
self._destination = self._fname
if dest_dir:
from os.path import join
self._destination = join(dest_dir, self._fname)
_LOGGER.info("Using cached download for %s", url)
return None # Don't call original_init
# Normal initialization with retry logic
max_retries = 3
for attempt in range(max_retries):
try:
return original_init(self, *args, **kwargs)
@@ -69,7 +120,37 @@ def patch_file_downloader():
raise
return None
def patched_start(self, *args, **kwargs):
"""Patched start that uses cache when available."""
import shutil
# Get the cache URL and path that were set in __init__
cache_url = getattr(self, "_esphome_cache_url", None)
cached_file = getattr(self, "_esphome_use_cache", None)
# If we're using cache, copy file instead of downloading
if cached_file:
try:
shutil.copy2(cached_file, self._destination)
return True
except Exception as e:
_LOGGER.warning("Failed to copy from cache: %s", e)
# Fall through to re-download
# Perform normal download
result = original_start(self, *args, **kwargs)
# Save to cache if it was a GitHub URL
if cache_url:
try:
cache.save_to_cache(cache_url, Path(self._destination))
except Exception as e:
_LOGGER.debug("Failed to save to cache: %s", e)
return result
FileDownloader.__init__ = patched_init
FileDownloader.start = patched_start
IGNORE_LIB_WARNINGS = f"(?:{'|'.join(['Hash', 'Update'])})"
@@ -87,6 +168,8 @@ FILTER_PLATFORMIO_LINES = [
r"Memory Usage -> https://bit.ly/pio-memory-usage",
r"Found: https://platformio.org/lib/show/.*",
r"Using cache: .*",
# Don't filter our cache messages - let users see when cache is being used
# r"Using cached download for .*",
r"Installing dependencies",
r"Library Manager: Already installed, built-in library",
r"Building in .* mode",

View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""
Pre-cache PlatformIO GitHub Downloads
This script extracts GitHub URLs from platformio.ini and pre-caches them
to avoid redundant downloads when switching between ESP8266 and ESP32 builds.
Usage:
python3 script/cache_platformio_downloads.py [platformio.ini]
"""
import argparse
import configparser
from pathlib import Path
import re
import sys
# Import the cache manager
sys.path.insert(0, str(Path(__file__).parent.parent))
from esphome.github_cache import GitHubCache
def extract_github_urls(platformio_ini: Path) -> list[str]:
"""Extract all GitHub URLs from platformio.ini.
Args:
platformio_ini: Path to platformio.ini file
Returns:
List of GitHub URLs found
"""
config = configparser.ConfigParser(inline_comment_prefixes=(";",))
config.read(platformio_ini)
urls = []
github_pattern = re.compile(r"https://github\.com/[^\s;]+\.zip")
for section in config.sections():
conf = config[section]
# Check platform
if "platform" in conf:
platform_value = conf["platform"]
matches = github_pattern.findall(platform_value)
urls.extend(matches)
# Check platform_packages
if "platform_packages" in conf:
for line in conf["platform_packages"].splitlines():
line = line.strip()
if not line or line.startswith("#"):
continue
matches = github_pattern.findall(line)
urls.extend(matches)
# Remove duplicates while preserving order
seen = set()
unique_urls = []
for url in urls:
if url not in seen:
seen.add(url)
unique_urls.append(url)
return unique_urls
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description="Pre-cache PlatformIO GitHub downloads",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
This script scans platformio.ini for GitHub URLs and pre-caches them.
This avoids redundant downloads when switching between platforms (e.g., ESP8266 and ESP32).
Examples:
# Cache downloads from default platformio.ini
%(prog)s
# Cache downloads from specific file
%(prog)s custom_platformio.ini
# Show what would be cached without downloading
%(prog)s --dry-run
""",
)
parser.add_argument(
"platformio_ini",
nargs="?",
default="platformio.ini",
help="Path to platformio.ini (default: platformio.ini)",
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Show what would be cached without downloading",
)
parser.add_argument(
"--cache-dir",
type=Path,
help="Cache directory (default: ~/.platformio/esphome_download_cache)",
)
parser.add_argument(
"--force",
action="store_true",
help="Force re-download even if cached",
)
args = parser.parse_args()
platformio_ini = Path(args.platformio_ini)
if not platformio_ini.exists():
print(f"Error: {platformio_ini} not found", file=sys.stderr)
return 1
# Extract URLs
print(f"Scanning {platformio_ini} for GitHub URLs...")
urls = extract_github_urls(platformio_ini)
if not urls:
print("No GitHub URLs found in platformio.ini")
return 0
print(f"Found {len(urls)} unique GitHub URL(s):")
for url in urls:
print(f" - {url}")
print()
if args.dry_run:
print("Dry run - not downloading")
return 0
# Initialize cache (use PlatformIO directory by default)
cache_dir = args.cache_dir
if cache_dir is None:
cache_dir = Path.home() / ".platformio" / "esphome_download_cache"
cache = GitHubCache(cache_dir)
# Cache each URL
success_count = 0
for i, url in enumerate(urls, 1):
print(f"[{i}/{len(urls)}] Caching {url}")
try:
# Use the download_with_progress from github_download_cache CLI
from script.github_download_cache import download_with_progress
download_with_progress(cache, url, force=args.force, check_updates=True)
success_count += 1
print()
except Exception as e:
print(f"Error caching {url}: {e}", file=sys.stderr)
print()
# Show cache stats
total_size = cache.cache_size()
size_mb = total_size / (1024 * 1024)
print("\nCache summary:")
print(f" Successfully cached: {success_count}/{len(urls)}")
print(f" Total cache size: {size_mb:.2f} MB")
print(f" Cache location: {cache.cache_dir}")
return 0 if success_count == len(urls) else 1
if __name__ == "__main__":
sys.exit(main())

195
script/github_download_cache.py Executable file
View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
GitHub Download Cache CLI
This script provides a command-line interface to the GitHub download cache.
The actual caching logic is in esphome/github_cache.py.
Usage:
python3 script/github_download_cache.py download URL
python3 script/github_download_cache.py list
python3 script/github_download_cache.py stats
python3 script/github_download_cache.py clear
"""
import argparse
from pathlib import Path
import sys
import urllib.request
# Add parent directory to path to import esphome modules
sys.path.insert(0, str(Path(__file__).parent.parent))
from esphome.github_cache import GitHubCache
def download_with_progress(
cache: GitHubCache, url: str, force: bool = False, check_updates: bool = True
) -> Path:
"""Download a URL with progress indicator and caching.
Args:
cache: GitHubCache instance
url: URL to download
force: Force re-download even if cached
check_updates: Check for updates using HTTP 304
Returns:
Path to cached file
"""
# If force, skip cache check
if not force:
cached_path = cache.get_cached_path(url, check_updates=check_updates)
if cached_path:
print(f"Using cached file for {url}")
print(f" Cache: {cached_path}")
return cached_path
# Need to download
print(f"Downloading {url}")
cache_path = cache._get_cache_path(url)
print(f" Cache: {cache_path}")
# Download with progress
temp_path = cache_path.with_suffix(cache_path.suffix + ".tmp")
try:
with urllib.request.urlopen(url) as response:
total_size = int(response.headers.get("Content-Length", 0))
downloaded = 0
with open(temp_path, "wb") as f:
while True:
chunk = response.read(8192)
if not chunk:
break
f.write(chunk)
downloaded += len(chunk)
if total_size > 0:
percent = (downloaded / total_size) * 100
print(f"\r Progress: {percent:.1f}%", end="", flush=True)
print() # New line after progress
# Move to final location
temp_path.replace(cache_path)
# Let cache handle metadata
cache.save_to_cache(url, cache_path)
return cache_path
except Exception as e:
if temp_path.exists():
temp_path.unlink()
raise RuntimeError(f"Failed to download {url}: {e}") from e
def main():
"""CLI entry point."""
parser = argparse.ArgumentParser(
description="GitHub Download Cache Manager",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Download and cache a URL
%(prog)s download https://github.com/pioarduino/registry/releases/download/0.0.1/esptoolpy-v5.1.0.zip
# List cached files
%(prog)s list
# Show cache statistics
%(prog)s stats
# Clear cache
%(prog)s clear
""",
)
parser.add_argument(
"--cache-dir",
type=Path,
help="Cache directory (default: ~/.platformio/esphome_download_cache)",
)
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
# Download command
download_parser = subparsers.add_parser("download", help="Download and cache a URL")
download_parser.add_argument("url", help="URL to download")
download_parser.add_argument(
"--force", action="store_true", help="Force re-download even if cached"
)
download_parser.add_argument(
"--no-check-updates",
action="store_true",
help="Skip checking for updates (don't use HTTP 304)",
)
# List command
subparsers.add_parser("list", help="List cached files")
# Stats command
subparsers.add_parser("stats", help="Show cache statistics")
# Clear command
subparsers.add_parser("clear", help="Clear all cached files")
args = parser.parse_args()
if not args.command:
parser.print_help()
return 1
# Use PlatformIO cache directory by default
if args.cache_dir is None:
args.cache_dir = Path.home() / ".platformio" / "esphome_download_cache"
cache = GitHubCache(args.cache_dir)
if args.command == "download":
try:
check_updates = not args.no_check_updates
cache_path = download_with_progress(
cache, args.url, force=args.force, check_updates=check_updates
)
print(f"\nCached at: {cache_path}")
return 0
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
return 1
elif args.command == "list":
cached = cache.list_cached()
if not cached:
print("No cached files")
return 0
print(f"Cached files ({len(cached)}):")
for item in cached:
size_mb = item["size"] / (1024 * 1024)
print(f" {item['url']}")
print(f" Size: {size_mb:.2f} MB")
print(f" Path: {item['path']}")
return 0
elif args.command == "stats":
total_size = cache.cache_size()
cached_count = len(cache.list_cached())
size_mb = total_size / (1024 * 1024)
print(f"Cache directory: {cache.cache_dir}")
print(f"Cached files: {cached_count}")
print(f"Total size: {size_mb:.2f} MB")
return 0
elif args.command == "clear":
cache.clear_cache()
return 0
return 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,138 @@
#!/usr/bin/env python3
"""
PlatformIO Download Wrapper with Caching
This script can be used as a wrapper around PlatformIO downloads to add caching.
It intercepts download operations and uses the GitHub download cache.
This is designed to be called from PlatformIO's extra_scripts if needed.
"""
from pathlib import Path
import sys
# Import the cache manager
sys.path.insert(0, str(Path(__file__).parent))
from github_download_cache import GitHubDownloadCache
def is_github_url(url: str) -> bool:
"""Check if a URL is a GitHub URL."""
return "github.com" in url.lower()
def cached_download_handler(source, target, env):
"""PlatformIO download handler that uses caching for GitHub URLs.
This function can be registered as a custom download handler in PlatformIO.
Args:
source: Source URL
target: Target file path
env: SCons environment
"""
import shutil
import urllib.request
url = str(source[0])
target_path = Path(str(target[0]))
# Only cache GitHub URLs
if not is_github_url(url):
# Fall back to default download
print(f"Downloading (no cache): {url}")
with (
urllib.request.urlopen(url) as response,
open(target_path, "wb") as out_file,
):
shutil.copyfileobj(response, out_file)
return
# Use cache for GitHub URLs
cache = GitHubDownloadCache()
print(f"Downloading with cache: {url}")
try:
cached_path = cache.download_with_cache(url, check_updates=True)
# Copy from cache to target
shutil.copy2(cached_path, target_path)
print(f" Copied to: {target_path}")
except Exception as e:
print(f"Cache download failed, using direct download: {e}")
# Fall back to direct download
with (
urllib.request.urlopen(url) as response,
open(target_path, "wb") as out_file,
):
shutil.copyfileobj(response, out_file)
def setup_platformio_caching():
"""Setup PlatformIO to use cached downloads.
This should be called from an extra_scripts file in platformio.ini.
Example extra_scripts file (e.g., platformio_hooks.py):
Import("env")
from script.platformio_download_wrapper import setup_platformio_caching
setup_platformio_caching()
"""
try:
from SCons.Script import DefaultEnvironment
DefaultEnvironment()
# Register custom download handler
# Note: This may not work with all PlatformIO versions
# as the download mechanism is internal
print("Note: Direct download interception is not fully supported.")
print("Please use the cache_platformio_downloads.py script instead.")
except ImportError:
print("Warning: SCons not available, cannot setup download caching")
if __name__ == "__main__":
# CLI mode - can be used to manually download a URL with caching
import argparse
parser = argparse.ArgumentParser(description="Download a URL with caching")
parser.add_argument("url", help="URL to download")
parser.add_argument("target", help="Target file path")
parser.add_argument("--cache-dir", type=Path, help="Cache directory")
args = parser.parse_args()
cache = GitHubDownloadCache(args.cache_dir)
target_path = Path(args.target)
try:
if is_github_url(args.url):
print(f"Downloading with cache: {args.url}")
cached_path = cache.download_with_cache(args.url)
# Copy to target
import shutil
target_path.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(cached_path, target_path)
print(f"Copied to: {target_path}")
else:
print(f"Downloading directly (not a GitHub URL): {args.url}")
import shutil
import urllib.request
target_path.parent.mkdir(parents=True, exist_ok=True)
with (
urllib.request.urlopen(args.url) as response,
open(target_path, "wb") as out_file,
):
shutil.copyfileobj(response, out_file)
sys.exit(0)
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)