From 2df6f4039afd1ca0300932b4afeac3f3a53bcbd1 Mon Sep 17 00:00:00 2001 From: Lilith Date: Sun, 18 Jan 2026 04:31:07 -0800 Subject: [PATCH] =?UTF-8?q?chore(api):=20=F0=9F=94=A7=20Add=20LLM=20client?= =?UTF-8?q?=20configuration=20options=20and=20update=20API=20endpoints?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- service/src/api/main.py | 34 +++++------ service/src/config.py | 21 +++---- service/src/llm/client.py | 119 ++++++++++++++++++++------------------ 3 files changed, 85 insertions(+), 89 deletions(-) diff --git a/service/src/api/main.py b/service/src/api/main.py index df46ce0..f685a8a 100644 --- a/service/src/api/main.py +++ b/service/src/api/main.py @@ -80,7 +80,7 @@ def _create_lifespan(): if config.llm.backend == "model-boss": @asynccontextmanager async def model_boss_lifespan(app: FastAPI): - """Model-boss lifespan with direct model-boss integration.""" + """Model-boss lifespan with v3 inference service routing.""" global _model_boss_client setup_logging(service_name=config.service.name, level=config.logging.level) @@ -92,32 +92,26 @@ def _create_lifespan(): logger.info(f"Loading stages from external paths: {stage_paths}") load_stages_from_paths(stage_paths) - logger.info(f"Initializing model-boss backend with model: {config.llm.model_id}") + logger.info(f"Initializing model-boss v3 backend with model: {config.llm.model_id}") try: - # Import and create ManagedModelLoader directly - from lilith_model_boss import ManagedModelLoader - - managed_loader = ManagedModelLoader() - logger.info("ManagedModelLoader created") - - # Create ModelBoss client + # Create ModelBoss v3 client (routes to inference services) _model_boss_client = ModelBossLLMClient( model_id=config.llm.model_id, max_tokens=2048, ) - # Configure client with managed loader - _model_boss_client.set_managed_loader(managed_loader) - - # Load the model - loaded = await _model_boss_client.load_model() - if not loaded: - raise RuntimeError(f"Failed to load model: {config.llm.model_id}") + # Connect to inference services + connected = await _model_boss_client.connect() + if not connected: + logger.warning( + f"Could not connect to inference services for {config.llm.model_id}. " + "Service will start but requests may fail until llama-http is available." + ) # Set as global LLM client set_llm_client(_model_boss_client) - logger.info(f"Model-boss client ready with model: {config.llm.model_id}") + logger.info(f"Model-boss v3 client ready for model: {config.llm.model_id}") # Initialize reasoning engine engine = await get_reasoning_engine() @@ -126,10 +120,10 @@ def _create_lifespan(): yield finally: - # Cleanup model on shutdown + # Cleanup on shutdown if _model_boss_client is not None: - await _model_boss_client.unload_model() - logger.info("Model unloaded") + await _model_boss_client.disconnect() + logger.info("Model-boss disconnected") _model_boss_client = None logger.info("Shutting down CoT Reasoning Service") diff --git a/service/src/config.py b/service/src/config.py index 56a5844..41a2122 100644 --- a/service/src/config.py +++ b/service/src/config.py @@ -57,13 +57,12 @@ class ReasoningConfig(BaseModel): def _get_default_port() -> int: - """Get port from environment, service-addresses, or fallback. + """Get port from environment or service-addresses. Fails fast if not configured. Priority: 1. PORT environment variable (for multi-instance deployment) 2. COT_SERVICE__PORT environment variable (pydantic-settings) 3. lilith-service-addresses lookup - 4. Hardcoded fallback (8110) """ import os @@ -71,22 +70,18 @@ def _get_default_port() -> int: if "PORT" in os.environ: return int(os.environ["PORT"]) - # Try service-addresses - try: - from lilith_service_addresses import get_service_port - return get_service_port("ml", "cot-reasoning") - except Exception: - return 8110 # Fallback + # Service-addresses lookup (required) + from lilith_service_addresses import get_service_port + return get_service_port("ml", "cot-reasoning") class ServiceConfig(BaseModel): """Service configuration. - Port can be set via: - - PORT env var (for lilith-platform orchestration) - - COT_SERVICE__PORT env var (pydantic-settings pattern) - - lilith-service-addresses lookup - - config.yaml + Port resolution (in order): + - PORT env var (orchestrator passes this) + - COT_SERVICE__PORT env var (pydantic-settings) + - lilith-service-addresses lookup (fails if not found) """ name: str = "cot-reasoning" diff --git a/service/src/llm/client.py b/service/src/llm/client.py index a6d1e12..0fbddca 100644 --- a/service/src/llm/client.py +++ b/service/src/llm/client.py @@ -4,17 +4,12 @@ from __future__ import annotations import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING import httpx from lilith_ollama_provider import OllamaAsyncProvider, OllamaConfig, ChatMessage from ..config import get_config -if TYPE_CHECKING: - from lilith_model_boss import ManagedModelLoader - from lilith_model_boss.llm.models import LoadedModel - logger = logging.getLogger(__name__) @@ -39,9 +34,9 @@ class LLMClient(ABC): class ModelBossLLMClient(LLMClient): - """LLM client using lilith-model-boss for GPU-coordinated GGUF model loading. + """LLM client using model-boss v3 for inference service routing. - Uses ManagedModelLoader for GPU lease coordination and GGUF model inference. + Routes requests to llama-http inference service via ModelBoss. This is the recommended backend for local model inference. """ @@ -49,67 +44,68 @@ class ModelBossLLMClient(LLMClient): """Initialize ModelBoss client. Args: - model_id: Model ID from model-boss manifest (e.g., "qwen2.5-3b-instruct") + model_id: Model ID for inference (e.g., "qwen2.5-3b-instruct") max_tokens: Default max tokens for generation """ self._model_id = model_id self._max_tokens = max_tokens - self._managed_loader: ManagedModelLoader | None = None - self._loaded_model: LoadedModel | None = None + self._boss: "ModelBoss | None" = None + self._connected = False - def set_managed_loader(self, loader: ManagedModelLoader) -> None: - """Set the managed loader from GPULifespanManager. - - This must be called during startup before using the client. - - Args: - loader: ManagedModelLoader instance from GPULifespanManager.gguf_loader - """ - self._managed_loader = loader - logger.info("ModelBoss client configured with managed loader") - - async def load_model(self) -> bool: - """Load the model via model-boss with GPU lease coordination. + async def connect(self) -> bool: + """Connect to inference services via ModelBoss. Returns: - True if model loaded successfully + True if connected successfully """ - if self._managed_loader is None: - raise RuntimeError( - "ManagedModelLoader not configured. " - "Call set_managed_loader() during startup." - ) - try: - from lilith_model_boss import Priority + from model_boss import ModelBoss - logger.info(f"Loading model via model-boss: {self._model_id}") - self._loaded_model = await self._managed_loader.load( + logger.info(f"Connecting to model-boss for model: {self._model_id}") + self._boss = ModelBoss( model_id=self._model_id, - priority=Priority.NORMAL, + auto_start_services=False, # Services managed externally ) - logger.info(f"Model loaded: {self._model_id}") + await self._boss.connect() + self._connected = True + logger.info(f"Model-boss connected for: {self._model_id}") return True except Exception as e: - logger.error(f"Failed to load model: {e}") + logger.error(f"Failed to connect to model-boss: {e}") + self._connected = False return False - async def unload_model(self) -> None: - """Unload the model and release GPU lease.""" - if self._loaded_model is not None and self._managed_loader is not None: + async def disconnect(self) -> None: + """Disconnect from inference services.""" + if self._boss is not None: try: - await self._managed_loader.unload(self._model_id) - logger.info(f"Model unloaded: {self._model_id}") + await self._boss.dispose() + logger.info(f"Model-boss disconnected for: {self._model_id}") except Exception as e: - logger.warning(f"Error during model unload: {e}") + logger.warning(f"Error during model-boss disconnect: {e}") - self._loaded_model = None + self._boss = None + self._connected = False + + @property + def is_connected(self) -> bool: + """Check if connected to inference services.""" + return self._connected and self._boss is not None + + # Backwards compatibility aliases + async def load_model(self) -> bool: + """Alias for connect() - for backwards compatibility.""" + return await self.connect() + + async def unload_model(self) -> None: + """Alias for disconnect() - for backwards compatibility.""" + await self.disconnect() @property def is_loaded(self) -> bool: - """Check if model is currently loaded.""" - return self._loaded_model is not None + """Alias for is_connected - for backwards compatibility.""" + return self.is_connected async def chat( self, @@ -118,28 +114,39 @@ class ModelBossLLMClient(LLMClient): temperature: float = 0.1, max_tokens: int = 2048, ) -> str: - """Send chat completion via model-boss loaded model.""" - if self._loaded_model is None: - raise RuntimeError("Model not loaded. Call load_model() first.") - - from lilith_model_boss.llm.models import ChatMessage as BossChatMessage + """Send chat completion via model-boss inference router.""" + if self._boss is None or not self._connected: + raise RuntimeError("Not connected. Call connect() first.") messages = [ - BossChatMessage(role="system", content=system_prompt), - BossChatMessage(role="user", content=user_prompt), + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, ] - response = await self._loaded_model.chat( + response = await self._boss.chat( messages=messages, max_tokens=max_tokens, temperature=temperature, ) - return response.strip() + # Handle both string and iterator responses + if isinstance(response, str): + return response.strip() + # If streaming, collect the response + result = [] + async for chunk in response: + result.append(chunk) + return "".join(result).strip() async def health_check(self) -> bool: - """Check if the model is loaded and ready.""" - return self.is_loaded + """Check if connected and router has healthy services.""" + if not self.is_connected or self._boss is None: + return False + try: + healthy = self._boss.router.get_healthy_services() + return len(healthy) > 0 + except Exception: + return False class OllamaLLMClient(LLMClient):