chore(bootstrap): 🔧 Update FastAPI bootstrap config and Docker autostart handlers
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
b9ca94a0eb
commit
adaa2ec58d
3 changed files with 214 additions and 22 deletions
|
|
@ -96,6 +96,9 @@ from .exceptions import (
|
|||
ConnectionError,
|
||||
)
|
||||
|
||||
# Docker auto-start (fail-fast dependency management)
|
||||
from .docker_autostart import ensure_container_running, ContainerStartError
|
||||
|
||||
# Content Generation
|
||||
from .content_generation import (
|
||||
ContentGenerationSettings,
|
||||
|
|
@ -210,4 +213,7 @@ __all__ = [
|
|||
"ServiceConfig",
|
||||
"ServiceInfo",
|
||||
"create_service_discovery_client",
|
||||
# Docker auto-start
|
||||
"ensure_container_running",
|
||||
"ContainerStartError",
|
||||
]
|
||||
|
|
|
|||
187
src/lilith_service_fastapi_bootstrap/docker_autostart.py
Normal file
187
src/lilith_service_fastapi_bootstrap/docker_autostart.py
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
"""Docker container auto-start utility for fail-fast service dependencies.
|
||||
|
||||
Services that depend on Docker containers (Redis, PostgreSQL, etc.) use this
|
||||
to ensure containers are running at boot. If a container can't be started,
|
||||
the service fails immediately — no graceful degradation.
|
||||
|
||||
Usage:
|
||||
from lilith_service_fastapi_bootstrap.docker_autostart import ensure_container_running
|
||||
|
||||
# Ensure model-boss-redis is up before connecting
|
||||
await ensure_container_running("model-boss-redis")
|
||||
|
||||
# Ensure with custom timeout
|
||||
await ensure_container_running("lilith-analytics-redis", timeout=30)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DOCKER_BIN: str | None = shutil.which("docker")
|
||||
|
||||
|
||||
class ContainerStartError(RuntimeError):
|
||||
"""Raised when a required Docker container cannot be started."""
|
||||
|
||||
|
||||
def _run_docker(*args: str, timeout: int = 15) -> subprocess.CompletedProcess[str]:
|
||||
"""Run a docker CLI command synchronously.
|
||||
|
||||
Raises:
|
||||
ContainerStartError: If docker binary is not found.
|
||||
"""
|
||||
if DOCKER_BIN is None:
|
||||
raise ContainerStartError(
|
||||
"Docker CLI not found on PATH. "
|
||||
"Cannot auto-start container dependencies. "
|
||||
"Install Docker or start containers manually."
|
||||
)
|
||||
|
||||
return subprocess.run(
|
||||
[DOCKER_BIN, *args],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
|
||||
def _is_container_running(container_name: str) -> bool:
|
||||
"""Check if a Docker container is currently running."""
|
||||
result = _run_docker(
|
||||
"inspect",
|
||||
"--format",
|
||||
"{{.State.Running}}",
|
||||
container_name,
|
||||
)
|
||||
return result.returncode == 0 and result.stdout.strip() == "true"
|
||||
|
||||
|
||||
def _container_exists(container_name: str) -> bool:
|
||||
"""Check if a Docker container exists (running or stopped)."""
|
||||
result = _run_docker(
|
||||
"inspect",
|
||||
"--format",
|
||||
"{{.State.Status}}",
|
||||
container_name,
|
||||
)
|
||||
return result.returncode == 0
|
||||
|
||||
|
||||
def _start_container(container_name: str) -> None:
|
||||
"""Start an existing Docker container.
|
||||
|
||||
Raises:
|
||||
ContainerStartError: If the container fails to start.
|
||||
"""
|
||||
logger.info(f"Starting Docker container: {container_name}")
|
||||
result = _run_docker("start", container_name)
|
||||
if result.returncode != 0:
|
||||
raise ContainerStartError(
|
||||
f"Failed to start container '{container_name}': {result.stderr.strip()}"
|
||||
)
|
||||
|
||||
|
||||
async def _wait_for_healthy(
|
||||
container_name: str,
|
||||
timeout: float,
|
||||
poll_interval: float = 0.5,
|
||||
) -> None:
|
||||
"""Wait for a container to be running and responding.
|
||||
|
||||
Checks both Docker 'running' state and healthcheck status (if configured).
|
||||
|
||||
Raises:
|
||||
ContainerStartError: If container doesn't become healthy within timeout.
|
||||
"""
|
||||
elapsed = 0.0
|
||||
while elapsed < timeout:
|
||||
if not _is_container_running(container_name):
|
||||
await asyncio.sleep(poll_interval)
|
||||
elapsed += poll_interval
|
||||
continue
|
||||
|
||||
# Check healthcheck status if the container has one
|
||||
result = _run_docker(
|
||||
"inspect",
|
||||
"--format",
|
||||
"{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}",
|
||||
container_name,
|
||||
)
|
||||
health_status = result.stdout.strip()
|
||||
|
||||
if health_status == "none":
|
||||
# No healthcheck configured — running state is sufficient
|
||||
return
|
||||
elif health_status == "healthy":
|
||||
return
|
||||
elif health_status == "unhealthy":
|
||||
raise ContainerStartError(
|
||||
f"Container '{container_name}' is unhealthy. "
|
||||
f"Check: docker logs {container_name}"
|
||||
)
|
||||
|
||||
# Still starting — wait and poll again
|
||||
await asyncio.sleep(poll_interval)
|
||||
elapsed += poll_interval
|
||||
|
||||
raise ContainerStartError(
|
||||
f"Container '{container_name}' did not become healthy within {timeout}s. "
|
||||
f"Check: docker logs {container_name}"
|
||||
)
|
||||
|
||||
|
||||
async def ensure_container_running(
|
||||
container_name: str,
|
||||
timeout: float = 15.0,
|
||||
) -> None:
|
||||
"""Ensure a Docker container is running and healthy.
|
||||
|
||||
If the container exists but is stopped, starts it.
|
||||
If the container doesn't exist, raises an error (it must be defined in docker-compose).
|
||||
If the container can't reach healthy state within timeout, raises an error.
|
||||
|
||||
This is a fail-fast utility — services should NOT catch ContainerStartError
|
||||
and degrade gracefully. If a required dependency can't start, the service
|
||||
must fail to start.
|
||||
|
||||
Args:
|
||||
container_name: Docker container name (e.g., "model-boss-redis").
|
||||
timeout: Max seconds to wait for healthy state.
|
||||
|
||||
Raises:
|
||||
ContainerStartError: If container cannot be started or doesn't become healthy.
|
||||
"""
|
||||
# Already running — fast path
|
||||
if _is_container_running(container_name):
|
||||
logger.debug(f"Container already running: {container_name}")
|
||||
return
|
||||
|
||||
# Container exists but stopped — start it
|
||||
if _container_exists(container_name):
|
||||
_start_container(container_name)
|
||||
await _wait_for_healthy(container_name, timeout)
|
||||
logger.info(f"Container started and healthy: {container_name}")
|
||||
return
|
||||
|
||||
# Container doesn't exist — try docker-compose up for it
|
||||
logger.info(f"Container '{container_name}' not found, attempting docker compose up")
|
||||
compose_result = _run_docker(
|
||||
"compose",
|
||||
"up",
|
||||
"-d",
|
||||
container_name,
|
||||
timeout=30,
|
||||
)
|
||||
if compose_result.returncode != 0:
|
||||
raise ContainerStartError(
|
||||
f"Container '{container_name}' does not exist and docker compose up failed: "
|
||||
f"{compose_result.stderr.strip()}. "
|
||||
f"Ensure the container is defined in docker-compose.yml."
|
||||
)
|
||||
|
||||
await _wait_for_healthy(container_name, timeout)
|
||||
logger.info(f"Container created and healthy: {container_name}")
|
||||
|
|
@ -287,6 +287,7 @@ class GPULifespanManager(LifespanManager):
|
|||
self,
|
||||
redis_url: str = "redis://localhost:6379",
|
||||
auto_start_services: bool = False,
|
||||
redis_container_name: str = "model-boss-redis",
|
||||
) -> None:
|
||||
"""Initialize GPU-aware lifespan manager.
|
||||
|
||||
|
|
@ -295,6 +296,8 @@ class GPULifespanManager(LifespanManager):
|
|||
auto_start_services: Whether GPUBoss should auto-start Redis/services.
|
||||
Defaults to False — expects Redis to be running externally
|
||||
(via docker-compose, systemd, or dev cluster).
|
||||
redis_container_name: Docker container name for the model-boss Redis
|
||||
instance. Auto-started if not running.
|
||||
|
||||
Note:
|
||||
In v4.0+, GPU auto-detection and watchdog are enabled by default.
|
||||
|
|
@ -303,6 +306,7 @@ class GPULifespanManager(LifespanManager):
|
|||
super().__init__()
|
||||
self._redis_url = redis_url
|
||||
self._auto_start_services = auto_start_services
|
||||
self._redis_container_name = redis_container_name
|
||||
|
||||
# Will be initialized on startup
|
||||
self._boss: GPUBoss | None = None
|
||||
|
|
@ -353,36 +357,31 @@ class GPULifespanManager(LifespanManager):
|
|||
async def _init_gpu_boss(self) -> None:
|
||||
"""Initialize GPUBoss and managed loaders.
|
||||
|
||||
Gracefully degrades if Redis is unavailable — services can still run
|
||||
without GPU coordination (model loading will be unavailable).
|
||||
Fail-fast: auto-starts model-boss-redis if not running, then connects.
|
||||
Raises on failure — services MUST NOT start without GPU coordination.
|
||||
"""
|
||||
try:
|
||||
from model_boss import GPUBoss
|
||||
from model_boss_loaders import ManagedModelLoader
|
||||
except ImportError:
|
||||
logger.warning(
|
||||
"lilith-model-boss not installed. GPU coordination disabled. "
|
||||
except ImportError as e:
|
||||
raise RuntimeError(
|
||||
"lilith-model-boss not installed. GPU coordination required. "
|
||||
"Install with: pip install lilith-model-boss[loaders]"
|
||||
)
|
||||
return
|
||||
) from e
|
||||
|
||||
# Auto-start model-boss-redis container
|
||||
from .docker_autostart import ensure_container_running
|
||||
|
||||
await ensure_container_running(self._redis_container_name)
|
||||
|
||||
logger.info("Initializing GPUBoss for GPU lease coordination")
|
||||
|
||||
try:
|
||||
# GPUBoss v3.0+ uses keyword arguments directly
|
||||
self._boss = GPUBoss(
|
||||
redis_url=self._redis_url,
|
||||
auto_start_services=self._auto_start_services,
|
||||
)
|
||||
await self._boss.connect()
|
||||
except (ConnectionError, OSError) as e:
|
||||
logger.warning(
|
||||
f"GPUBoss Redis connection failed: {e}. "
|
||||
"GPU coordination disabled — model loading unavailable. "
|
||||
"Ensure Redis is running or set auto_start_services=True."
|
||||
)
|
||||
self._boss = None
|
||||
return
|
||||
# GPUBoss v3.0+ uses keyword arguments directly — fail hard on connection error
|
||||
self._boss = GPUBoss(
|
||||
redis_url=self._redis_url,
|
||||
auto_start_services=self._auto_start_services,
|
||||
)
|
||||
await self._boss.connect()
|
||||
|
||||
# Auto-detect and register GPUs if not already registered
|
||||
gpu_count = await self._boss.get_gpu_count()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue