chore(api): 🔧 Add LLM client configuration options and update API endpoints

This commit is contained in:
Lilith 2026-01-18 04:31:07 -08:00
parent fc1353e0f1
commit 2df6f4039a
3 changed files with 85 additions and 89 deletions

View file

@ -80,7 +80,7 @@ def _create_lifespan():
if config.llm.backend == "model-boss":
@asynccontextmanager
async def model_boss_lifespan(app: FastAPI):
"""Model-boss lifespan with direct model-boss integration."""
"""Model-boss lifespan with v3 inference service routing."""
global _model_boss_client
setup_logging(service_name=config.service.name, level=config.logging.level)
@ -92,32 +92,26 @@ def _create_lifespan():
logger.info(f"Loading stages from external paths: {stage_paths}")
load_stages_from_paths(stage_paths)
logger.info(f"Initializing model-boss backend with model: {config.llm.model_id}")
logger.info(f"Initializing model-boss v3 backend with model: {config.llm.model_id}")
try:
# Import and create ManagedModelLoader directly
from lilith_model_boss import ManagedModelLoader
managed_loader = ManagedModelLoader()
logger.info("ManagedModelLoader created")
# Create ModelBoss client
# Create ModelBoss v3 client (routes to inference services)
_model_boss_client = ModelBossLLMClient(
model_id=config.llm.model_id,
max_tokens=2048,
)
# Configure client with managed loader
_model_boss_client.set_managed_loader(managed_loader)
# Load the model
loaded = await _model_boss_client.load_model()
if not loaded:
raise RuntimeError(f"Failed to load model: {config.llm.model_id}")
# Connect to inference services
connected = await _model_boss_client.connect()
if not connected:
logger.warning(
f"Could not connect to inference services for {config.llm.model_id}. "
"Service will start but requests may fail until llama-http is available."
)
# Set as global LLM client
set_llm_client(_model_boss_client)
logger.info(f"Model-boss client ready with model: {config.llm.model_id}")
logger.info(f"Model-boss v3 client ready for model: {config.llm.model_id}")
# Initialize reasoning engine
engine = await get_reasoning_engine()
@ -126,10 +120,10 @@ def _create_lifespan():
yield
finally:
# Cleanup model on shutdown
# Cleanup on shutdown
if _model_boss_client is not None:
await _model_boss_client.unload_model()
logger.info("Model unloaded")
await _model_boss_client.disconnect()
logger.info("Model-boss disconnected")
_model_boss_client = None
logger.info("Shutting down CoT Reasoning Service")

View file

@ -57,13 +57,12 @@ class ReasoningConfig(BaseModel):
def _get_default_port() -> int:
"""Get port from environment, service-addresses, or fallback.
"""Get port from environment or service-addresses. Fails fast if not configured.
Priority:
1. PORT environment variable (for multi-instance deployment)
2. COT_SERVICE__PORT environment variable (pydantic-settings)
3. lilith-service-addresses lookup
4. Hardcoded fallback (8110)
"""
import os
@ -71,22 +70,18 @@ def _get_default_port() -> int:
if "PORT" in os.environ:
return int(os.environ["PORT"])
# Try service-addresses
try:
from lilith_service_addresses import get_service_port
return get_service_port("ml", "cot-reasoning")
except Exception:
return 8110 # Fallback
# Service-addresses lookup (required)
from lilith_service_addresses import get_service_port
return get_service_port("ml", "cot-reasoning")
class ServiceConfig(BaseModel):
"""Service configuration.
Port can be set via:
- PORT env var (for lilith-platform orchestration)
- COT_SERVICE__PORT env var (pydantic-settings pattern)
- lilith-service-addresses lookup
- config.yaml
Port resolution (in order):
- PORT env var (orchestrator passes this)
- COT_SERVICE__PORT env var (pydantic-settings)
- lilith-service-addresses lookup (fails if not found)
"""
name: str = "cot-reasoning"

View file

@ -4,17 +4,12 @@ from __future__ import annotations
import logging
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING
import httpx
from lilith_ollama_provider import OllamaAsyncProvider, OllamaConfig, ChatMessage
from ..config import get_config
if TYPE_CHECKING:
from lilith_model_boss import ManagedModelLoader
from lilith_model_boss.llm.models import LoadedModel
logger = logging.getLogger(__name__)
@ -39,9 +34,9 @@ class LLMClient(ABC):
class ModelBossLLMClient(LLMClient):
"""LLM client using lilith-model-boss for GPU-coordinated GGUF model loading.
"""LLM client using model-boss v3 for inference service routing.
Uses ManagedModelLoader for GPU lease coordination and GGUF model inference.
Routes requests to llama-http inference service via ModelBoss.
This is the recommended backend for local model inference.
"""
@ -49,67 +44,68 @@ class ModelBossLLMClient(LLMClient):
"""Initialize ModelBoss client.
Args:
model_id: Model ID from model-boss manifest (e.g., "qwen2.5-3b-instruct")
model_id: Model ID for inference (e.g., "qwen2.5-3b-instruct")
max_tokens: Default max tokens for generation
"""
self._model_id = model_id
self._max_tokens = max_tokens
self._managed_loader: ManagedModelLoader | None = None
self._loaded_model: LoadedModel | None = None
self._boss: "ModelBoss | None" = None
self._connected = False
def set_managed_loader(self, loader: ManagedModelLoader) -> None:
"""Set the managed loader from GPULifespanManager.
This must be called during startup before using the client.
Args:
loader: ManagedModelLoader instance from GPULifespanManager.gguf_loader
"""
self._managed_loader = loader
logger.info("ModelBoss client configured with managed loader")
async def load_model(self) -> bool:
"""Load the model via model-boss with GPU lease coordination.
async def connect(self) -> bool:
"""Connect to inference services via ModelBoss.
Returns:
True if model loaded successfully
True if connected successfully
"""
if self._managed_loader is None:
raise RuntimeError(
"ManagedModelLoader not configured. "
"Call set_managed_loader() during startup."
)
try:
from lilith_model_boss import Priority
from model_boss import ModelBoss
logger.info(f"Loading model via model-boss: {self._model_id}")
self._loaded_model = await self._managed_loader.load(
logger.info(f"Connecting to model-boss for model: {self._model_id}")
self._boss = ModelBoss(
model_id=self._model_id,
priority=Priority.NORMAL,
auto_start_services=False, # Services managed externally
)
logger.info(f"Model loaded: {self._model_id}")
await self._boss.connect()
self._connected = True
logger.info(f"Model-boss connected for: {self._model_id}")
return True
except Exception as e:
logger.error(f"Failed to load model: {e}")
logger.error(f"Failed to connect to model-boss: {e}")
self._connected = False
return False
async def unload_model(self) -> None:
"""Unload the model and release GPU lease."""
if self._loaded_model is not None and self._managed_loader is not None:
async def disconnect(self) -> None:
"""Disconnect from inference services."""
if self._boss is not None:
try:
await self._managed_loader.unload(self._model_id)
logger.info(f"Model unloaded: {self._model_id}")
await self._boss.dispose()
logger.info(f"Model-boss disconnected for: {self._model_id}")
except Exception as e:
logger.warning(f"Error during model unload: {e}")
logger.warning(f"Error during model-boss disconnect: {e}")
self._loaded_model = None
self._boss = None
self._connected = False
@property
def is_connected(self) -> bool:
"""Check if connected to inference services."""
return self._connected and self._boss is not None
# Backwards compatibility aliases
async def load_model(self) -> bool:
"""Alias for connect() - for backwards compatibility."""
return await self.connect()
async def unload_model(self) -> None:
"""Alias for disconnect() - for backwards compatibility."""
await self.disconnect()
@property
def is_loaded(self) -> bool:
"""Check if model is currently loaded."""
return self._loaded_model is not None
"""Alias for is_connected - for backwards compatibility."""
return self.is_connected
async def chat(
self,
@ -118,28 +114,39 @@ class ModelBossLLMClient(LLMClient):
temperature: float = 0.1,
max_tokens: int = 2048,
) -> str:
"""Send chat completion via model-boss loaded model."""
if self._loaded_model is None:
raise RuntimeError("Model not loaded. Call load_model() first.")
from lilith_model_boss.llm.models import ChatMessage as BossChatMessage
"""Send chat completion via model-boss inference router."""
if self._boss is None or not self._connected:
raise RuntimeError("Not connected. Call connect() first.")
messages = [
BossChatMessage(role="system", content=system_prompt),
BossChatMessage(role="user", content=user_prompt),
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
]
response = await self._loaded_model.chat(
response = await self._boss.chat(
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
)
return response.strip()
# Handle both string and iterator responses
if isinstance(response, str):
return response.strip()
# If streaming, collect the response
result = []
async for chunk in response:
result.append(chunk)
return "".join(result).strip()
async def health_check(self) -> bool:
"""Check if the model is loaded and ready."""
return self.is_loaded
"""Check if connected and router has healthy services."""
if not self.is_connected or self._boss is None:
return False
try:
healthy = self._boss.router.get_healthy_services()
return len(healthy) > 0
except Exception:
return False
class OllamaLLMClient(LLMClient):