diff --git a/src/lilith_service_fastapi_bootstrap/__init__.py b/src/lilith_service_fastapi_bootstrap/__init__.py
index f30c7c6..92e4d2f 100644
--- a/src/lilith_service_fastapi_bootstrap/__init__.py
+++ b/src/lilith_service_fastapi_bootstrap/__init__.py
@@ -96,6 +96,9 @@ from .exceptions import (
     ConnectionError,
 )
 
+# Docker auto-start (fail-fast dependency management)
+from .docker_autostart import ensure_container_running, ContainerStartError
+
 # Content Generation
 from .content_generation import (
     ContentGenerationSettings,
@@ -210,4 +213,7 @@ __all__ = [
     "ServiceConfig",
     "ServiceInfo",
     "create_service_discovery_client",
+    # Docker auto-start
+    "ensure_container_running",
+    "ContainerStartError",
 ]
diff --git a/src/lilith_service_fastapi_bootstrap/docker_autostart.py b/src/lilith_service_fastapi_bootstrap/docker_autostart.py
new file mode 100644
index 0000000..b8f149c
--- /dev/null
+++ b/src/lilith_service_fastapi_bootstrap/docker_autostart.py
@@ -0,0 +1,207 @@
+"""Docker container auto-start utility for fail-fast service dependencies.
+
+Services that depend on Docker containers (Redis, PostgreSQL, etc.) use this
+to ensure containers are running at boot. If a container can't be started,
+the service fails immediately — no graceful degradation.
+
+Usage:
+    from lilith_service_fastapi_bootstrap.docker_autostart import ensure_container_running
+
+    # Ensure model-boss-redis is up before connecting
+    await ensure_container_running("model-boss-redis")
+
+    # Ensure with custom timeout
+    await ensure_container_running("lilith-analytics-redis", timeout=30)
+"""
+
+import asyncio
+import logging
+import shutil
+import subprocess
+
+logger = logging.getLogger(__name__)
+
+DOCKER_BIN: str | None = shutil.which("docker")
+
+
+class ContainerStartError(RuntimeError):
+    """Raised when a required Docker container cannot be started."""
+
+
+def _run_docker(*args: str, timeout: int = 15) -> subprocess.CompletedProcess[str]:
+    """Run a docker CLI command synchronously.
+
+    This call blocks; async callers should dispatch it via asyncio.to_thread.
+
+    Raises:
+        ContainerStartError: If the docker binary is not found, cannot be
+            executed, or does not finish within ``timeout`` seconds.
+    """
+    if DOCKER_BIN is None:
+        raise ContainerStartError(
+            "Docker CLI not found on PATH. "
+            "Cannot auto-start container dependencies. "
+            "Install Docker or start containers manually."
+        )
+
+    command = " ".join(("docker", *args))
+    try:
+        return subprocess.run(
+            [DOCKER_BIN, *args],
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+        )
+    except subprocess.TimeoutExpired as exc:
+        # A hung daemon is a start failure; keep the single-exception contract
+        raise ContainerStartError(
+            f"'{command}' timed out after {timeout}s. Is the Docker daemon responsive?"
+        ) from exc
+    except OSError as exc:
+        raise ContainerStartError(f"Failed to execute '{command}': {exc}") from exc
+
+
+def _is_container_running(container_name: str) -> bool:
+    """Check if a Docker container is currently running."""
+    result = _run_docker(
+        "inspect",
+        "--format",
+        "{{.State.Running}}",
+        container_name,
+    )
+    return result.returncode == 0 and result.stdout.strip() == "true"
+
+
+def _container_exists(container_name: str) -> bool:
+    """Check if a Docker container exists (running or stopped)."""
+    result = _run_docker(
+        "inspect",
+        "--format",
+        "{{.State.Status}}",
+        container_name,
+    )
+    return result.returncode == 0
+
+
+def _start_container(container_name: str) -> None:
+    """Start an existing Docker container.
+
+    Raises:
+        ContainerStartError: If the container fails to start.
+    """
+    logger.info("Starting Docker container: %s", container_name)
+    result = _run_docker("start", container_name)
+    if result.returncode != 0:
+        raise ContainerStartError(
+            f"Failed to start container '{container_name}': {result.stderr.strip()}"
+        )
+
+
+async def _wait_for_healthy(
+    container_name: str,
+    timeout: float,
+    poll_interval: float = 0.5,
+) -> None:
+    """Wait for a container to be running and responding.
+
+    Checks both Docker 'running' state and healthcheck status (if configured).
+    The deadline is measured on the event-loop clock, so time spent inside
+    docker calls counts against ``timeout``. Docker calls run in a worker
+    thread so the event loop is not blocked while polling.
+
+    Raises:
+        ContainerStartError: If the container reports unhealthy or doesn't
+            become healthy within timeout.
+    """
+    loop = asyncio.get_running_loop()
+    deadline = loop.time() + timeout
+    while True:
+        if await asyncio.to_thread(_is_container_running, container_name):
+            # Check healthcheck status if the container has one
+            result = await asyncio.to_thread(
+                _run_docker,
+                "inspect",
+                "--format",
+                "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}",
+                container_name,
+            )
+            health_status = result.stdout.strip()
+
+            # "none": no healthcheck configured — running state is sufficient
+            if health_status in ("none", "healthy"):
+                return
+            if health_status == "unhealthy":
+                raise ContainerStartError(
+                    f"Container '{container_name}' is unhealthy. "
+                    f"Check: docker logs {container_name}"
+                )
+
+        # Not running yet or still starting — poll again until the deadline
+        if loop.time() >= deadline:
+            break
+        await asyncio.sleep(poll_interval)
+
+    raise ContainerStartError(
+        f"Container '{container_name}' did not become healthy within {timeout}s. "
+        f"Check: docker logs {container_name}"
+    )
+
+
+async def ensure_container_running(
+    container_name: str,
+    timeout: float = 15.0,
+) -> None:
+    """Ensure a Docker container is running and healthy.
+
+    If the container is already running, only its healthcheck (if any) is awaited.
+    If the container exists but is stopped, starts it.
+    If the container doesn't exist, attempts ``docker compose up -d NAME``;
+    this only succeeds when the working directory holds a compose project
+    whose service name matches container_name, otherwise an error is raised.
+    If the container can't reach healthy state within timeout, raises an error.
+
+    This is a fail-fast utility — services should NOT catch ContainerStartError
+    and degrade gracefully. If a required dependency can't start, the service
+    must fail to start.
+
+    Args:
+        container_name: Docker container name (e.g., "model-boss-redis").
+        timeout: Max seconds to wait for healthy state.
+
+    Raises:
+        ContainerStartError: If container cannot be started or doesn't become healthy.
+    """
+    # Already running — only confirm health (immediate when no healthcheck exists)
+    if await asyncio.to_thread(_is_container_running, container_name):
+        await _wait_for_healthy(container_name, timeout)
+        logger.debug("Container already running: %s", container_name)
+        return
+
+    # Container exists but stopped — start it
+    if await asyncio.to_thread(_container_exists, container_name):
+        await asyncio.to_thread(_start_container, container_name)
+        await _wait_for_healthy(container_name, timeout)
+        logger.info("Container started and healthy: %s", container_name)
+        return
+
+    # Container doesn't exist — try docker compose up for it.
+    # NOTE(review): compose takes a *service* name; this assumes it equals the
+    # container name and that a compose file is discoverable from the cwd.
+    logger.info("Container '%s' not found, attempting docker compose up", container_name)
+    compose_result = await asyncio.to_thread(
+        _run_docker,
+        "compose",
+        "up",
+        "-d",
+        container_name,
+        timeout=30,
+    )
+    if compose_result.returncode != 0:
+        raise ContainerStartError(
+            f"Container '{container_name}' does not exist and docker compose up failed: "
+            f"{compose_result.stderr.strip()}. "
+            f"Ensure the container is defined in docker-compose.yml."
+        )
+
+    await _wait_for_healthy(container_name, timeout)
+    logger.info("Container created and healthy: %s", container_name)
diff --git a/src/lilith_service_fastapi_bootstrap/lifespan.py b/src/lilith_service_fastapi_bootstrap/lifespan.py
index adb5715..e7e9349 100644
--- a/src/lilith_service_fastapi_bootstrap/lifespan.py
+++ b/src/lilith_service_fastapi_bootstrap/lifespan.py
@@ -287,6 +287,7 @@ class GPULifespanManager(LifespanManager):
         self,
         redis_url: str = "redis://localhost:6379",
         auto_start_services: bool = False,
+        redis_container_name: str | None = "model-boss-redis",
     ) -> None:
         """Initialize GPU-aware lifespan manager.
 
@@ -295,6 +296,10 @@ class GPULifespanManager(LifespanManager):
             auto_start_services: Whether GPUBoss should auto-start Redis/services.
                 Defaults to False — expects Redis to be running externally
                 (via docker-compose, systemd, or dev cluster).
+            redis_container_name: Docker container name for the model-boss Redis
+                instance. Auto-started if not running. Pass None to skip the
+                Docker auto-start when Redis is managed outside Docker
+                (systemd, remote host, Kubernetes).
 
         Note:
             In v4.0+, GPU auto-detection and watchdog are enabled by default.
@@ -303,6 +308,7 @@ class GPULifespanManager(LifespanManager):
         super().__init__()
         self._redis_url = redis_url
         self._auto_start_services = auto_start_services
+        self._redis_container_name = redis_container_name
 
         # Will be initialized on startup
         self._boss: GPUBoss | None = None
@@ -353,36 +359,33 @@ class GPULifespanManager(LifespanManager):
     async def _init_gpu_boss(self) -> None:
         """Initialize GPUBoss and managed loaders.
 
-        Gracefully degrades if Redis is unavailable — services can still run
-        without GPU coordination (model loading will be unavailable).
+        Fail-fast: auto-starts the Redis container (unless redis_container_name
+        is None) if not running, then connects.
+        Raises on failure — services MUST NOT start without GPU coordination.
         """
         try:
             from model_boss import GPUBoss
             from model_boss_loaders import ManagedModelLoader
-        except ImportError:
-            logger.warning(
-                "lilith-model-boss not installed. GPU coordination disabled. "
+        except ImportError as e:
+            raise RuntimeError(
+                "lilith-model-boss not installed. GPU coordination required. "
                 "Install with: pip install lilith-model-boss[loaders]"
-            )
-            return
+            ) from e
+
+        # Auto-start the Redis container; None opts out (Redis managed elsewhere)
+        if self._redis_container_name is not None:
+            from .docker_autostart import ensure_container_running
+
+            await ensure_container_running(self._redis_container_name)
 
         logger.info("Initializing GPUBoss for GPU lease coordination")
 
-        try:
-            # GPUBoss v3.0+ uses keyword arguments directly
-            self._boss = GPUBoss(
-                redis_url=self._redis_url,
-                auto_start_services=self._auto_start_services,
-            )
-            await self._boss.connect()
-        except (ConnectionError, OSError) as e:
-            logger.warning(
-                f"GPUBoss Redis connection failed: {e}. "
-                "GPU coordination disabled — model loading unavailable. "
-                "Ensure Redis is running or set auto_start_services=True."
-            )
-            self._boss = None
-            return
+        # GPUBoss v3.0+ uses keyword arguments directly — fail hard on connection error
+        self._boss = GPUBoss(
+            redis_url=self._redis_url,
+            auto_start_services=self._auto_start_services,
+        )
+        await self._boss.connect()
 
         # Auto-detect and register GPUs if not already registered
         gpu_count = await self._boss.get_gpu_count()