chore(bootstrap): 🔧 Update FastAPI bootstrap config and Docker autostart handlers

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Lilith 2026-02-15 10:51:45 -08:00
parent b9ca94a0eb
commit adaa2ec58d
3 changed files with 214 additions and 22 deletions

View file

@ -96,6 +96,9 @@ from .exceptions import (
ConnectionError,
)
# Docker auto-start (fail-fast dependency management)
from .docker_autostart import ensure_container_running, ContainerStartError
# Content Generation
from .content_generation import (
ContentGenerationSettings,
@ -210,4 +213,7 @@ __all__ = [
"ServiceConfig",
"ServiceInfo",
"create_service_discovery_client",
# Docker auto-start
"ensure_container_running",
"ContainerStartError",
]

View file

@ -0,0 +1,187 @@
"""Docker container auto-start utility for fail-fast service dependencies.
Services that depend on Docker containers (Redis, PostgreSQL, etc.) use this
to ensure containers are running at boot. If a container can't be started,
the service fails immediately — no graceful degradation.
Usage:
from lilith_service_fastapi_bootstrap.docker_autostart import ensure_container_running
# Ensure model-boss-redis is up before connecting
await ensure_container_running("model-boss-redis")
# Ensure with custom timeout
await ensure_container_running("lilith-analytics-redis", timeout=30)
"""
import asyncio
import logging
import shutil
import subprocess
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Path of the `docker` executable as found on PATH, looked up once at import
# time; None when no such executable is found (_run_docker checks for this).
DOCKER_BIN: str | None = shutil.which("docker")
class ContainerStartError(RuntimeError):
    """Signal that a Docker container dependency could not be brought up.

    A plain ``RuntimeError`` subclass with no extra state: the message passed
    at the raise site carries all the detail.
    """
def _run_docker(*args: str, timeout: int = 15) -> subprocess.CompletedProcess[str]:
    """Run a docker CLI command synchronously and capture its output.

    A non-zero exit status is NOT treated as an error here; callers inspect
    ``returncode`` / ``stdout`` / ``stderr`` on the returned object.

    Args:
        *args: Arguments appended after the docker binary (e.g. ``"start"``,
            ``"my-container"``).
        timeout: Max seconds to let the command run.

    Returns:
        The completed process, with stdout and stderr captured as text.

    Raises:
        ContainerStartError: If the docker binary is not found, cannot be
            executed, or the command does not finish within ``timeout``.
    """
    if DOCKER_BIN is None:
        raise ContainerStartError(
            "Docker CLI not found on PATH. "
            "Cannot auto-start container dependencies. "
            "Install Docker or start containers manually."
        )
    try:
        return subprocess.run(
            [DOCKER_BIN, *args],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        # Translate so callers only ever see the documented exception type.
        raise ContainerStartError(
            f"Docker command timed out after {timeout}s: docker {' '.join(args)}"
        ) from exc
    except OSError as exc:
        # E.g. the binary was removed or is not executable since import time.
        raise ContainerStartError(
            f"Failed to execute Docker CLI at '{DOCKER_BIN}': {exc}"
        ) from exc
def _is_container_running(container_name: str) -> bool:
    """Report whether ``docker inspect`` shows the container as running.

    Returns False both when the inspect command exits non-zero and when the
    reported running flag is anything other than ``true``.
    """
    inspect_args = ("inspect", "--format", "{{.State.Running}}", container_name)
    completed = _run_docker(*inspect_args)
    if completed.returncode != 0:
        return False
    return completed.stdout.strip() == "true"
def _container_exists(container_name: str) -> bool:
    """Report whether ``docker inspect`` succeeds for the given container name.

    Only the exit status is consulted, so the container's state (running or
    stopped) does not affect the answer.
    """
    probe = _run_docker("inspect", "--format", "{{.State.Status}}", container_name)
    exit_code = probe.returncode
    return exit_code == 0
def _start_container(container_name: str) -> None:
    """Issue ``docker start`` for a container that already exists.

    Raises:
        ContainerStartError: If the start command exits with a non-zero
            status; the message includes the command's stderr.
    """
    logger.info(f"Starting Docker container: {container_name}")
    outcome = _run_docker("start", container_name)
    if outcome.returncode == 0:
        return
    reason = outcome.stderr.strip()
    raise ContainerStartError(
        f"Failed to start container '{container_name}': {reason}"
    )
async def _wait_for_healthy(
    container_name: str,
    timeout: float,
    poll_interval: float = 0.5,
) -> None:
    """Wait for a container to be running and, if it has a healthcheck, healthy.

    Polls ``docker inspect`` until the container reports the running state and
    either has no healthcheck configured or reports ``healthy``.

    Args:
        container_name: Docker container name to poll.
        timeout: Max wall-clock seconds to wait, including time spent inside
            the docker CLI calls themselves.
        poll_interval: Seconds to sleep between polls.

    Raises:
        ContainerStartError: If the container reports ``unhealthy``, or does
            not become healthy within ``timeout``.
    """
    loop = asyncio.get_running_loop()
    # Use a deadline on the loop's monotonic clock rather than summing sleep
    # intervals: slow docker CLI calls then count against the timeout, and a
    # zero poll_interval can no longer spin forever.
    deadline = loop.time() + timeout

    while loop.time() < deadline:
        # The docker helpers block on a subprocess; run them in a worker
        # thread so the event loop stays responsive while we wait.
        if await asyncio.to_thread(_is_container_running, container_name):
            # Check healthcheck status if the container has one
            result = await asyncio.to_thread(
                _run_docker,
                "inspect",
                "--format",
                "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}",
                container_name,
            )
            health_status = result.stdout.strip()
            # "none" means no healthcheck configured — running state is sufficient
            if health_status in ("none", "healthy"):
                return
            if health_status == "unhealthy":
                raise ContainerStartError(
                    f"Container '{container_name}' is unhealthy. "
                    f"Check: docker logs {container_name}"
                )
            # Any other output (still starting, or empty because the inspect
            # call itself failed) — fall through and poll again.

        remaining = deadline - loop.time()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        await asyncio.sleep(min(poll_interval, remaining))

    raise ContainerStartError(
        f"Container '{container_name}' did not become healthy within {timeout}s. "
        f"Check: docker logs {container_name}"
    )
async def ensure_container_running(
    container_name: str,
    timeout: float = 15.0,
) -> None:
    """Ensure a required Docker container is running, starting it if needed.

    Resolution order:

    1. The container is already running: return immediately. Only the
       running state is checked on this path, not the healthcheck status.
    2. The container exists but is stopped: ``docker start`` it, then wait
       for it to become healthy.
    3. The container does not exist: run ``docker compose up -d
       <container_name>``, then wait for it to become healthy.

    This is a fail-fast utility — services should NOT catch
    ContainerStartError and degrade gracefully. If a required dependency
    can't start, the service must fail to start.

    Args:
        container_name: Docker container name (e.g., "model-boss-redis").
        timeout: Max seconds to wait for healthy state after a start.

    Raises:
        ContainerStartError: If the container cannot be started or created,
            or doesn't become healthy within ``timeout``.
    """
    # The docker helpers block on a subprocess; run them in a worker thread
    # so this coroutine does not stall the event loop.

    # Already running — fast path
    if await asyncio.to_thread(_is_container_running, container_name):
        logger.debug(f"Container already running: {container_name}")
        return

    # Container exists but stopped — start it
    if await asyncio.to_thread(_container_exists, container_name):
        await asyncio.to_thread(_start_container, container_name)
        await _wait_for_healthy(container_name, timeout)
        logger.info(f"Container started and healthy: {container_name}")
        return

    # Container doesn't exist — try docker compose up for it.
    # NOTE(review): `docker compose up` takes a compose *service* name and
    # looks for the compose file relative to the process's working directory;
    # this relies on the service being named like the container — confirm.
    logger.info(f"Container '{container_name}' not found, attempting docker compose up")
    compose_result = await asyncio.to_thread(
        lambda: _run_docker("compose", "up", "-d", container_name, timeout=30)
    )
    if compose_result.returncode != 0:
        raise ContainerStartError(
            f"Container '{container_name}' does not exist and docker compose up failed: "
            f"{compose_result.stderr.strip()}. "
            f"Ensure the container is defined in docker-compose.yml."
        )

    await _wait_for_healthy(container_name, timeout)
    logger.info(f"Container created and healthy: {container_name}")

View file

@ -287,6 +287,7 @@ class GPULifespanManager(LifespanManager):
self,
redis_url: str = "redis://localhost:6379",
auto_start_services: bool = False,
redis_container_name: str = "model-boss-redis",
) -> None:
"""Initialize GPU-aware lifespan manager.
@ -295,6 +296,8 @@ class GPULifespanManager(LifespanManager):
auto_start_services: Whether GPUBoss should auto-start Redis/services.
Defaults to False expects Redis to be running externally
(via docker-compose, systemd, or dev cluster).
redis_container_name: Docker container name for the model-boss Redis
instance. Auto-started if not running.
Note:
In v4.0+, GPU auto-detection and watchdog are enabled by default.
@ -303,6 +306,7 @@ class GPULifespanManager(LifespanManager):
super().__init__()
self._redis_url = redis_url
self._auto_start_services = auto_start_services
self._redis_container_name = redis_container_name
# Will be initialized on startup
self._boss: GPUBoss | None = None
@ -353,36 +357,31 @@ class GPULifespanManager(LifespanManager):
async def _init_gpu_boss(self) -> None:
"""Initialize GPUBoss and managed loaders.
Gracefully degrades if Redis is unavailable services can still run
without GPU coordination (model loading will be unavailable).
Fail-fast: auto-starts model-boss-redis if not running, then connects.
Raises on failure services MUST NOT start without GPU coordination.
"""
try:
from model_boss import GPUBoss
from model_boss_loaders import ManagedModelLoader
except ImportError:
logger.warning(
"lilith-model-boss not installed. GPU coordination disabled. "
except ImportError as e:
raise RuntimeError(
"lilith-model-boss not installed. GPU coordination required. "
"Install with: pip install lilith-model-boss[loaders]"
)
return
) from e
# Auto-start model-boss-redis container
from .docker_autostart import ensure_container_running
await ensure_container_running(self._redis_container_name)
logger.info("Initializing GPUBoss for GPU lease coordination")
try:
# GPUBoss v3.0+ uses keyword arguments directly
self._boss = GPUBoss(
redis_url=self._redis_url,
auto_start_services=self._auto_start_services,
)
await self._boss.connect()
except (ConnectionError, OSError) as e:
logger.warning(
f"GPUBoss Redis connection failed: {e}. "
"GPU coordination disabled — model loading unavailable. "
"Ensure Redis is running or set auto_start_services=True."
)
self._boss = None
return
# GPUBoss v3.0+ uses keyword arguments directly — fail hard on connection error
self._boss = GPUBoss(
redis_url=self._redis_url,
auto_start_services=self._auto_start_services,
)
await self._boss.connect()
# Auto-detect and register GPUs if not already registered
gpu_count = await self._boss.get_gpu_count()