Compare commits
10 commits
475359ee7e
...
797bcf9973
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
797bcf9973 | ||
|
|
84a6046386 | ||
|
|
adaa2ec58d | ||
|
|
b9ca94a0eb | ||
|
|
c259fe6cbe | ||
|
|
22e4984af8 | ||
|
|
1fde135419 | ||
|
|
56f82816be | ||
|
|
f053ba727c | ||
|
|
3588165488 |
5 changed files with 279 additions and 13 deletions
7
.forgejo/workflows/publish.yml
Normal file
7
.forgejo/workflows/publish.yml
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# Calls reusable workflow from lilith/workflows
|
||||
name: Publish
|
||||
on: [push, workflow_dispatch]
|
||||
jobs:
|
||||
publish:
|
||||
uses: lilith/workflows/.forgejo/workflows/publish-pypi.yml@main
|
||||
secrets: inherit
|
||||
|
|
@ -4,11 +4,11 @@ build-backend = "hatchling.build"
|
|||
|
||||
[project]
|
||||
name = "lilith-service-fastapi-bootstrap"
|
||||
version = "4.0.0"
|
||||
version = "4.2.0"
|
||||
description = "Base utilities for FastAPI microservices with optional ML support"
|
||||
requires-python = ">=3.10"
|
||||
license = "MIT"
|
||||
authors = [{ name = "Lilith", email = "dev@atlilith.com" }]
|
||||
authors = [{ name = "Lilith", email = "quinn@ftw.codes" }]
|
||||
keywords = ["fastapi", "microservice", "bootstrap", "ml", "ai"]
|
||||
classifiers = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
|
|
@ -32,7 +32,7 @@ dependencies = [
|
|||
redis = ["redis>=5.0.0"]
|
||||
database = ["sqlalchemy>=2.0.0", "asyncpg>=0.29.0"]
|
||||
ml = [
|
||||
"lilith-model-boss>=3.0.0",
|
||||
"lilith-model-boss>=4.0.0",
|
||||
"lilith-model-boss-loaders>=1.0.0",
|
||||
]
|
||||
service-addresses = ["lilith-service-addresses>=1.0.0"] # For dependency startup (v2.0+)
|
||||
|
|
|
|||
|
|
@ -96,6 +96,9 @@ from .exceptions import (
|
|||
ConnectionError,
|
||||
)
|
||||
|
||||
# Docker auto-start (fail-fast dependency management)
|
||||
from .docker_autostart import ensure_container_running, ContainerStartError
|
||||
|
||||
# Content Generation
|
||||
from .content_generation import (
|
||||
ContentGenerationSettings,
|
||||
|
|
@ -210,4 +213,7 @@ __all__ = [
|
|||
"ServiceConfig",
|
||||
"ServiceInfo",
|
||||
"create_service_discovery_client",
|
||||
# Docker auto-start
|
||||
"ensure_container_running",
|
||||
"ContainerStartError",
|
||||
]
|
||||
|
|
|
|||
187
src/lilith_service_fastapi_bootstrap/docker_autostart.py
Normal file
187
src/lilith_service_fastapi_bootstrap/docker_autostart.py
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
"""Docker container auto-start utility for fail-fast service dependencies.
|
||||
|
||||
Services that depend on Docker containers (Redis, PostgreSQL, etc.) use this
|
||||
to ensure containers are running at boot. If a container can't be started,
|
||||
the service fails immediately — no graceful degradation.
|
||||
|
||||
Usage:
|
||||
from lilith_service_fastapi_bootstrap.docker_autostart import ensure_container_running
|
||||
|
||||
# Ensure model-boss-redis is up before connecting
|
||||
await ensure_container_running("model-boss-redis")
|
||||
|
||||
# Ensure with custom timeout
|
||||
await ensure_container_running("lilith-analytics-redis", timeout=30)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DOCKER_BIN: str | None = shutil.which("docker")
|
||||
|
||||
|
||||
class ContainerStartError(RuntimeError):
|
||||
"""Raised when a required Docker container cannot be started."""
|
||||
|
||||
|
||||
def _run_docker(*args: str, timeout: int = 15) -> subprocess.CompletedProcess[str]:
|
||||
"""Run a docker CLI command synchronously.
|
||||
|
||||
Raises:
|
||||
ContainerStartError: If docker binary is not found.
|
||||
"""
|
||||
if DOCKER_BIN is None:
|
||||
raise ContainerStartError(
|
||||
"Docker CLI not found on PATH. "
|
||||
"Cannot auto-start container dependencies. "
|
||||
"Install Docker or start containers manually."
|
||||
)
|
||||
|
||||
return subprocess.run(
|
||||
[DOCKER_BIN, *args],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
|
||||
def _is_container_running(container_name: str) -> bool:
|
||||
"""Check if a Docker container is currently running."""
|
||||
result = _run_docker(
|
||||
"inspect",
|
||||
"--format",
|
||||
"{{.State.Running}}",
|
||||
container_name,
|
||||
)
|
||||
return result.returncode == 0 and result.stdout.strip() == "true"
|
||||
|
||||
|
||||
def _container_exists(container_name: str) -> bool:
|
||||
"""Check if a Docker container exists (running or stopped)."""
|
||||
result = _run_docker(
|
||||
"inspect",
|
||||
"--format",
|
||||
"{{.State.Status}}",
|
||||
container_name,
|
||||
)
|
||||
return result.returncode == 0
|
||||
|
||||
|
||||
def _start_container(container_name: str) -> None:
|
||||
"""Start an existing Docker container.
|
||||
|
||||
Raises:
|
||||
ContainerStartError: If the container fails to start.
|
||||
"""
|
||||
logger.info(f"Starting Docker container: {container_name}")
|
||||
result = _run_docker("start", container_name)
|
||||
if result.returncode != 0:
|
||||
raise ContainerStartError(
|
||||
f"Failed to start container '{container_name}': {result.stderr.strip()}"
|
||||
)
|
||||
|
||||
|
||||
async def _wait_for_healthy(
|
||||
container_name: str,
|
||||
timeout: float,
|
||||
poll_interval: float = 0.5,
|
||||
) -> None:
|
||||
"""Wait for a container to be running and responding.
|
||||
|
||||
Checks both Docker 'running' state and healthcheck status (if configured).
|
||||
|
||||
Raises:
|
||||
ContainerStartError: If container doesn't become healthy within timeout.
|
||||
"""
|
||||
elapsed = 0.0
|
||||
while elapsed < timeout:
|
||||
if not _is_container_running(container_name):
|
||||
await asyncio.sleep(poll_interval)
|
||||
elapsed += poll_interval
|
||||
continue
|
||||
|
||||
# Check healthcheck status if the container has one
|
||||
result = _run_docker(
|
||||
"inspect",
|
||||
"--format",
|
||||
"{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}",
|
||||
container_name,
|
||||
)
|
||||
health_status = result.stdout.strip()
|
||||
|
||||
if health_status == "none":
|
||||
# No healthcheck configured — running state is sufficient
|
||||
return
|
||||
elif health_status == "healthy":
|
||||
return
|
||||
elif health_status == "unhealthy":
|
||||
raise ContainerStartError(
|
||||
f"Container '{container_name}' is unhealthy. "
|
||||
f"Check: docker logs {container_name}"
|
||||
)
|
||||
|
||||
# Still starting — wait and poll again
|
||||
await asyncio.sleep(poll_interval)
|
||||
elapsed += poll_interval
|
||||
|
||||
raise ContainerStartError(
|
||||
f"Container '{container_name}' did not become healthy within {timeout}s. "
|
||||
f"Check: docker logs {container_name}"
|
||||
)
|
||||
|
||||
|
||||
async def ensure_container_running(
|
||||
container_name: str,
|
||||
timeout: float = 15.0,
|
||||
) -> None:
|
||||
"""Ensure a Docker container is running and healthy.
|
||||
|
||||
If the container exists but is stopped, starts it.
|
||||
If the container doesn't exist, raises an error (it must be defined in docker-compose).
|
||||
If the container can't reach healthy state within timeout, raises an error.
|
||||
|
||||
This is a fail-fast utility — services should NOT catch ContainerStartError
|
||||
and degrade gracefully. If a required dependency can't start, the service
|
||||
must fail to start.
|
||||
|
||||
Args:
|
||||
container_name: Docker container name (e.g., "model-boss-redis").
|
||||
timeout: Max seconds to wait for healthy state.
|
||||
|
||||
Raises:
|
||||
ContainerStartError: If container cannot be started or doesn't become healthy.
|
||||
"""
|
||||
# Already running — fast path
|
||||
if _is_container_running(container_name):
|
||||
logger.debug(f"Container already running: {container_name}")
|
||||
return
|
||||
|
||||
# Container exists but stopped — start it
|
||||
if _container_exists(container_name):
|
||||
_start_container(container_name)
|
||||
await _wait_for_healthy(container_name, timeout)
|
||||
logger.info(f"Container started and healthy: {container_name}")
|
||||
return
|
||||
|
||||
# Container doesn't exist — try docker-compose up for it
|
||||
logger.info(f"Container '{container_name}' not found, attempting docker compose up")
|
||||
compose_result = _run_docker(
|
||||
"compose",
|
||||
"up",
|
||||
"-d",
|
||||
container_name,
|
||||
timeout=30,
|
||||
)
|
||||
if compose_result.returncode != 0:
|
||||
raise ContainerStartError(
|
||||
f"Container '{container_name}' does not exist and docker compose up failed: "
|
||||
f"{compose_result.stderr.strip()}. "
|
||||
f"Ensure the container is defined in docker-compose.yml."
|
||||
)
|
||||
|
||||
await _wait_for_healthy(container_name, timeout)
|
||||
logger.info(f"Container created and healthy: {container_name}")
|
||||
|
|
@ -286,11 +286,18 @@ class GPULifespanManager(LifespanManager):
|
|||
def __init__(
|
||||
self,
|
||||
redis_url: str = "redis://localhost:6379",
|
||||
auto_start_services: bool = False,
|
||||
redis_container_name: str = "model-boss-redis",
|
||||
) -> None:
|
||||
"""Initialize GPU-aware lifespan manager.
|
||||
|
||||
Args:
|
||||
redis_url: Redis URL for GPUBoss coordination.
|
||||
auto_start_services: Whether GPUBoss should auto-start Redis/services.
|
||||
Defaults to False — expects Redis to be running externally
|
||||
(via docker-compose, systemd, or dev cluster).
|
||||
redis_container_name: Docker container name for the model-boss Redis
|
||||
instance. Auto-started if not running.
|
||||
|
||||
Note:
|
||||
In v4.0+, GPU auto-detection and watchdog are enabled by default.
|
||||
|
|
@ -298,6 +305,8 @@ class GPULifespanManager(LifespanManager):
|
|||
"""
|
||||
super().__init__()
|
||||
self._redis_url = redis_url
|
||||
self._auto_start_services = auto_start_services
|
||||
self._redis_container_name = redis_container_name
|
||||
|
||||
# Will be initialized on startup
|
||||
self._boss: GPUBoss | None = None
|
||||
|
|
@ -346,24 +355,38 @@ class GPULifespanManager(LifespanManager):
|
|||
return self._hf_loader
|
||||
|
||||
async def _init_gpu_boss(self) -> None:
|
||||
"""Initialize GPUBoss and managed loaders."""
|
||||
"""Initialize GPUBoss and managed loaders.
|
||||
|
||||
Fail-fast: auto-starts model-boss-redis if not running, then connects.
|
||||
Raises on failure — services MUST NOT start without GPU coordination.
|
||||
"""
|
||||
try:
|
||||
from model_boss import GPUBoss
|
||||
from model_boss_loaders import ManagedModelLoader
|
||||
except ImportError:
|
||||
logger.warning(
|
||||
"lilith-model-boss not installed. GPU coordination disabled. "
|
||||
except ImportError as e:
|
||||
raise RuntimeError(
|
||||
"lilith-model-boss not installed. GPU coordination required. "
|
||||
"Install with: pip install lilith-model-boss[loaders]"
|
||||
)
|
||||
return
|
||||
) from e
|
||||
|
||||
# Auto-start model-boss-redis container
|
||||
from .docker_autostart import ensure_container_running
|
||||
|
||||
await ensure_container_running(self._redis_container_name)
|
||||
|
||||
logger.info("Initializing GPUBoss for GPU lease coordination")
|
||||
|
||||
# GPUBoss v3.0+ uses keyword arguments directly
|
||||
self._boss = GPUBoss(redis_url=self._redis_url)
|
||||
# GPUBoss v3.0+ uses keyword arguments directly — fail hard on connection error
|
||||
self._boss = GPUBoss(
|
||||
redis_url=self._redis_url,
|
||||
auto_start_services=self._auto_start_services,
|
||||
)
|
||||
await self._boss.connect()
|
||||
|
||||
# Watchdog is now automatic in v3.0+ - no manual start needed
|
||||
# Auto-detect and register GPUs if not already registered
|
||||
gpu_count = await self._boss.get_gpu_count()
|
||||
if gpu_count == 0:
|
||||
await self._register_gpus()
|
||||
|
||||
# Create managed loaders (unified in v3.0+)
|
||||
self._gguf_loader = ManagedModelLoader(boss=self._boss)
|
||||
|
|
@ -381,7 +404,6 @@ class GPULifespanManager(LifespanManager):
|
|||
if self._hf_loader is not None:
|
||||
logger.info("Unloading HuggingFace models")
|
||||
await self._hf_loader.unload_all()
|
||||
await self._hf_loader.close()
|
||||
|
||||
if self._gguf_loader is not None:
|
||||
logger.info("Unloading GGUF models")
|
||||
|
|
@ -392,3 +414,47 @@ class GPULifespanManager(LifespanManager):
|
|||
await self._boss.close()
|
||||
|
||||
logger.info("GPU resources cleaned up")
|
||||
|
||||
async def _register_gpus(self) -> None:
|
||||
"""Auto-detect NVIDIA GPUs and register them with GPUBoss.
|
||||
|
||||
Uses nvidia-smi to query GPU names and VRAM. Falls back gracefully
|
||||
if nvidia-smi is not available (e.g., CPU-only environments).
|
||||
"""
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
if not shutil.which("nvidia-smi"):
|
||||
logger.warning(
|
||||
"nvidia-smi not found — cannot auto-detect GPUs. "
|
||||
"GPU lease coordination requires manual registration."
|
||||
)
|
||||
return
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,name,memory.total",
|
||||
"--format=csv,noheader,nounits",
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
logger.warning(f"nvidia-smi failed: {result.stderr.strip()}")
|
||||
return
|
||||
|
||||
for line in result.stdout.strip().split("\n"):
|
||||
parts = [p.strip() for p in line.split(",")]
|
||||
if len(parts) != 3:
|
||||
continue
|
||||
gpu_index = int(parts[0])
|
||||
gpu_name = parts[1]
|
||||
vram_total_mb = int(float(parts[2]))
|
||||
await self._boss.initialize_gpu(gpu_index, vram_total_mb, gpu_name)
|
||||
|
||||
except (subprocess.TimeoutExpired, ValueError, OSError) as e:
|
||||
logger.warning(f"GPU auto-detection failed: {e}")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue