chore(config): 🔧 Update 7 py files in config

This commit is contained in:
Lilith 2026-01-18 15:48:39 -08:00
parent d059e2ad82
commit a0b8d0a5c1
8 changed files with 518 additions and 55 deletions

View file

@ -5,4 +5,4 @@ All imports are aliased to the new lilith_fastapi_service_base package.
"""
# Re-export everything from the new package name
from lilith_fastapi_service_base import * # noqa: F401, F403
from lilith_service_fastapi_bootstrap import * # noqa: F401, F403

View file

@ -3,7 +3,7 @@
from fastapi import FastAPI, HTTPException, Query
from contextlib import asynccontextmanager
from lilith_fastapi_service_base import (
from lilith_service_fastapi_bootstrap import (
create_service,
GPULifespanManager,
HealthChecker,

View file

@ -3,7 +3,7 @@
from pathlib import Path
from pydantic import Field
from pydantic_settings import SettingsConfigDict
from lilith_fastapi_service_base import BaseServiceSettings
from lilith_service_fastapi_bootstrap import BaseServiceSettings
from lilith_service_addresses import get_service_port, get_redis_url

View file

@ -22,7 +22,7 @@ from enum import Enum
import httpx
if TYPE_CHECKING:
from lilith_model_boss import ManagedModelLoader
from model_boss_loaders import ManagedModelLoader
logger = logging.getLogger(__name__)
@ -268,58 +268,50 @@ class LegalLLMValidator:
"saul-7b-instruct-v1-q8_0.gguf --local-dir ~/.cache/models/"
)
# Use managed loader if available for GPU coordination
if self._managed_loader is not None:
logger.info(f"Loading SaulLM via model-boss from {model_path}")
import asyncio
async def _load_managed():
from lilith_model_boss import Priority
await self._managed_loader.load(
model_id=str(model_path),
priority=Priority.NORMAL,
n_ctx=self._n_ctx,
n_gpu_layers=self._n_gpu_layers,
)
self._model_id = str(model_path)
# Run async load in event loop
try:
loop = asyncio.get_event_loop()
if loop.is_running():
# If we're in an async context, create a task
asyncio.create_task(_load_managed())
else:
loop.run_until_complete(_load_managed())
except RuntimeError:
# No event loop, create one
asyncio.run(_load_managed())
# Get model from managed loader for direct access
self._model = self._managed_loader.get_model(str(model_path))
self._loaded = True
logger.info("SaulLM loaded successfully via model-boss with GPU lease")
else:
# Direct loading without GPU coordination
try:
from llama_cpp import Llama
except ImportError:
raise RuntimeError(
"llama-cpp-python not installed. Install with: "
"pip install llama-cpp-python"
)
logger.info(f"Loading SaulLM directly from {model_path}")
self._model = Llama(
model_path=str(model_path),
n_ctx=self._n_ctx,
n_gpu_layers=self._n_gpu_layers,
verbose=self._verbose,
if self._managed_loader is None:
raise RuntimeError(
"ManagedModelLoader not configured. "
"Ensure lilith-service-fastapi-bootstrap ML support is enabled."
)
self._loaded = True
logger.info("SaulLM loaded successfully (no GPU coordination)")
# Use managed loader for GPU coordination with auto VRAM detection
logger.info(f"Loading SaulLM via model-boss from {model_path}")
import asyncio
async def _load_managed():
from model_boss_loaders import Priority
# ManagedModelLoader auto-detects VRAM from GGUF file size
# No need to specify vram_mb
await self._managed_loader.load(
model_id=str(model_path),
loader_type="gguf",
priority=Priority.NORMAL,
n_ctx=self._n_ctx,
n_gpu_layers=self._n_gpu_layers,
)
self._model_id = str(model_path)
# Run async load in event loop
try:
loop = asyncio.get_event_loop()
if loop.is_running():
# If we're in an async context, create a task and await it
task = asyncio.create_task(_load_managed())
loop.run_until_complete(task)
else:
loop.run_until_complete(_load_managed())
except RuntimeError:
# No event loop, create one
asyncio.run(_load_managed())
# Get loaded model from managed loader
loaded_model = self._managed_loader.get_loaded(str(model_path))
if loaded_model is None:
raise RuntimeError(f"Failed to load model {model_path}")
self._model = loaded_model
self._loaded = True
logger.info("SaulLM loaded successfully via model-boss with GPU lease (auto VRAM)")
def unload(self) -> None:
"""Unload the model to free memory and release GPU lease."""

View file

@ -1,8 +1,236 @@
"""Shared fixtures for truth-service tests."""
"""Shared fixtures for truth-service tests.
Provides fixtures for testing the truth-validation service with model-boss v3.
The truth service uses SaulLM-7B for legal content validation.
"""
from __future__ import annotations
import os
import sys
from pathlib import Path
from typing import TYPE_CHECKING, AsyncGenerator, Callable
import pytest
import pytest_asyncio
# Add service source to path
SERVICE_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(SERVICE_ROOT / "python"))
if TYPE_CHECKING:
from model_boss import GPUBoss
from model_boss_loaders import ManagedModelLoader
# ============================================================================
# Ensure loaders are registered (workaround for lazy import issues)
# ============================================================================
def _register_loaders() -> None:
    """Make sure a loader is registered under the ``"gguf"`` key.

    The registry is consulted first; if ``"gguf"`` is missing, the
    ``model_boss_loaders.gguf`` module is imported (which, per the original
    note, registers the loader through a decorator at import time).  Should
    the key still be absent afterwards, ``GGUFLoader`` is registered
    explicitly.  If ``model_boss_loaders`` or its ``gguf`` submodule cannot
    be imported, nothing happens.
    """
    try:
        from model_boss_loaders import registry

        if registry.is_loader_registered("gguf"):
            return

        # Importing the module is expected to trigger self-registration.
        from model_boss_loaders.gguf import GGUFLoader

        if registry.is_loader_registered("gguf"):
            return

        # Import did not register it; do so by hand.
        registry.register_loader("gguf", GGUFLoader)
    except ImportError:
        # Best effort only: the package is optional for this conftest.
        return


_register_loaders()
# ============================================================================
# Pytest Configuration Hooks
# ============================================================================
def pytest_addoption(parser: pytest.Parser) -> None:
    """Register the ``--real-model`` and ``--redis-url`` command-line options.

    ``--real-model`` is a boolean flag that defaults to ``False``.
    ``--redis-url`` defaults to the ``REDIS_URL`` environment variable, or
    ``redis://localhost:6379`` when that variable is unset.
    """
    real_model_settings = {
        "action": "store_true",
        "default": False,
        "help": "Run real GPU tests with actual model loading",
    }
    parser.addoption("--real-model", **real_model_settings)

    fallback_redis_url = os.environ.get("REDIS_URL", "redis://localhost:6379")
    parser.addoption(
        "--redis-url",
        default=fallback_redis_url,
        help="Redis URL for GPU coordination",
    )
def pytest_configure(config: pytest.Config) -> None:
    """Declare the custom ``gpu``, ``modelboss`` and ``slow`` markers."""
    marker_lines = (
        "gpu: Requires GPU hardware",
        "modelboss: Tests model-boss v3 integration",
        "slow: Slow tests (model loading)",
    )
    for marker_line in marker_lines:
        config.addinivalue_line("markers", marker_line)
def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
    """Mark every ``gpu``-keyworded item as skipped unless ``--real-model`` is set.

    When the ``--real-model`` option is truthy the collected items are left
    untouched.
    """
    if config.getoption("--real-model"):
        return

    gpu_skip_marker = pytest.mark.skip(reason="Use --real-model to run GPU tests")
    for collected in items:
        if "gpu" not in collected.keywords:
            continue
        collected.add_marker(gpu_skip_marker)
# ============================================================================
# Shared GPU Fixtures
# ============================================================================
@pytest.fixture(scope="session")
def redis_url(request: pytest.FixtureRequest) -> str:
    """Return the value of the ``--redis-url`` command-line option."""
    pytest_config = request.config
    return pytest_config.getoption("--redis-url")
@pytest.fixture(scope="session")
def gpu_available() -> bool:
    """Report ``torch.cuda.is_available()``, or ``False`` if torch is not importable."""
    cuda_ok = False
    try:
        import torch

        cuda_ok = torch.cuda.is_available()
    except ImportError:
        cuda_ok = False
    return cuda_ok
@pytest.fixture(scope="session")
def gpu_vram_mb() -> int:
    """Return the total memory of CUDA device 0 in MB.

    Yields ``0`` when torch cannot be imported or CUDA is unavailable.
    """
    bytes_per_mb = 1024 * 1024
    total_mb = 0
    try:
        import torch

        if torch.cuda.is_available():
            device_props = torch.cuda.get_device_properties(0)
            total_mb = device_props.total_memory // bytes_per_mb
    except ImportError:
        total_mb = 0
    return total_mb
@pytest.fixture(scope="session")
def gpu_name() -> str:
    """Return the name of CUDA device 0.

    Falls back to ``"No GPU"`` when CUDA is unavailable and to
    ``"Unknown"`` when torch cannot be imported.
    """
    try:
        import torch

        if torch.cuda.is_available():
            device_label = torch.cuda.get_device_properties(0).name
        else:
            device_label = "No GPU"
    except ImportError:
        device_label = "Unknown"
    return device_label
@pytest_asyncio.fixture(scope="session")
async def real_gpu_boss(
    request: pytest.FixtureRequest,
    redis_url: str,
    gpu_available: bool,
    gpu_vram_mb: int,
    gpu_name: str,
) -> AsyncGenerator["GPUBoss", None]:
    """Yield a connected ``GPUBoss`` with GPU 0 initialized.

    The fixture skips when ``--real-model`` was not passed or when
    ``gpu_available`` is falsy.  Otherwise it connects to ``redis_url``,
    initializes GPU index 0 with the detected VRAM size and device name,
    and yields the boss.

    Teardown force-releases every lease that ``get_status()`` reports and
    then closes the boss.  Each release is attempted independently so one
    failure does not leave the remaining leases held, failures are logged
    instead of being silently discarded, and ``close()`` runs even if
    initialization or the sweep fails.

    NOTE(review): the sweep releases *all* reported leases, not only those
    created by this test session - confirm that is acceptable on a shared
    Redis instance.
    """
    if not request.config.getoption("--real-model"):
        pytest.skip("Use --real-model for GPU tests")
    if not gpu_available:
        pytest.skip("No GPU available")

    import logging

    from model_boss import GPUBoss

    log = logging.getLogger(__name__)

    boss = GPUBoss(redis_url=redis_url)
    await boss.connect()
    try:
        await boss.initialize_gpu(gpu_index=0, vram_total_mb=gpu_vram_mb, gpu_name=gpu_name)

        yield boss

        # Best-effort lease sweep: never fail the session because of cleanup.
        try:
            status = await boss.get_status()
        except Exception:
            log.warning("Could not read GPUBoss status during teardown", exc_info=True)
        else:
            for gpu in status.gpus:
                for lease in gpu.leases:
                    try:
                        await boss.force_release(lease.lease_id)
                    except Exception:
                        log.warning(
                            "Could not force-release lease %s during teardown",
                            lease.lease_id,
                            exc_info=True,
                        )
    finally:
        # Always drop the connection, whatever happened above.
        await boss.close()
@pytest_asyncio.fixture
async def managed_loader_factory(
    real_gpu_boss: "GPUBoss",
) -> AsyncGenerator[Callable[..., "ManagedModelLoader"], None]:
    """Yield a factory that builds ``ManagedModelLoader`` objects bound to the boss.

    The factory accepts an optional ``service_name`` (default ``"test"``).
    Every loader it creates is remembered, and on teardown ``unload_all()``
    is awaited on each one.  An unload failure is logged and does not stop
    the remaining loaders from being cleaned up.
    """
    import logging

    from model_boss_loaders import ManagedModelLoader

    log = logging.getLogger(__name__)
    loaders: list[ManagedModelLoader] = []

    def _create(service_name: str = "test") -> ManagedModelLoader:
        loader = ManagedModelLoader(boss=real_gpu_boss, service_name=service_name)
        loaders.append(loader)
        return loader

    yield _create

    for loader in loaders:
        try:
            await loader.unload_all()
        except Exception:
            # Best effort: keep going, but leave a trace for debugging leaked leases.
            log.warning("unload_all() failed during loader cleanup", exc_info=True)
# ============================================================================
# Model-Boss v3 GPU Integration Fixtures
# ============================================================================
@pytest.fixture
def saullm_model_id() -> str:
    """Return the model ID used for SaulLM in these tests.

    Reads the ``TRUTH_MODEL_ID`` environment variable and falls back to
    ``"saullm-7b-instruct"``.  Per the original note, the ID is resolved by
    the model-boss manifest to SaulLM-7B-Instruct in GGUF format.
    """
    default_model_id = "saullm-7b-instruct"
    return os.environ.get("TRUTH_MODEL_ID", default_model_id)
@pytest.fixture
def truth_service_name() -> str:
    """Return the service name used to label leases: ``"truth-validation"``."""
    lease_owner = "truth-validation"
    return lease_owner
@pytest.fixture
def truth_service_priority() -> int:
    """Return the priority level assigned to truth validation (``9``).

    NOTE(review): the previous docstring called 9 "critical" while also
    stating "lower = higher priority"; confirm the scale direction against
    model-boss before relying on this value.
    """
    priority_level = 9
    return priority_level
@pytest.fixture
def truth_expected_vram_range() -> tuple[int, int]:
    """Return the accepted ``(min, max)`` VRAM window for SaulLM-7B GGUF, in MB.

    The window is ``(4000, 10000)``.  Per the original note, Q8 quantization
    is expected to need roughly 6-10 GB, and the lower bound leaves room for
    an estimate that falls back to a 4096 MB default.
    """
    lower_bound_mb, upper_bound_mb = 4000, 10000
    return (lower_bound_mb, upper_bound_mb)
# ============================================================================
# Service-Specific Fixtures
# ============================================================================
@pytest.fixture

View file

@ -0,0 +1 @@
"""GPU integration tests for truth-validation ML service."""

View file

@ -0,0 +1,238 @@
"""GPU integration tests for truth-validation model-boss v3 migration.
Proves:
1. SaulLM-7B loads via ManagedModelLoader using model_id (not path)
2. Auto VRAM detection for GGUF model
3. Legal content analysis works
4. Lease management through validator lifecycle
5. Critical priority (9) is respected
Run with: pytest features/truth-validation/ml-service/tests/gpu/ --real-model -v
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
if TYPE_CHECKING:
from model_boss import GPUBoss
from model_boss_loaders import ManagedModelLoader
pytestmark = [pytest.mark.gpu, pytest.mark.modelboss]
def get_active_leases_for_service(status, service_name: str) -> list:
    """Collect the leases in ``status`` whose service name contains ``service_name``.

    Every lease of every GPU in ``status.gpus`` is examined in order.  The
    match is a substring test, and a lease whose ``service_name`` is falsy
    is treated as having an empty name.
    """
    return [
        lease
        for gpu in status.gpus
        for lease in gpu.leases
        if service_name in (lease.service_name or "")
    ]
class TestTruthValidationModelBossIntegration:
    """Test truth-validation service model-boss v3 integration."""

    # Total GPU memory (MB) below which the SaulLM-7B tests are skipped.
    MIN_VRAM_MB = 6000

    @staticmethod
    def _skip_if_low_vram(gpu_vram_mb: int) -> None:
        """Skip the calling test when the GPU has less than ``MIN_VRAM_MB``."""
        if gpu_vram_mb < TestTruthValidationModelBossIntegration.MIN_VRAM_MB:
            pytest.skip("Insufficient VRAM for SaulLM-7B")

    @pytest.mark.asyncio
    async def test_load_saullm_with_auto_vram_detection(
        self,
        managed_loader_factory,
        saullm_model_id: str,
        truth_service_name: str,
        truth_expected_vram_range: tuple[int, int],
        gpu_vram_mb: int,
    ):
        """Prove: Auto VRAM detection works for SaulLM-7B via model_id resolution.

        The test:
        1. Creates a ManagedModelLoader with GPUBoss
        2. Loads SaulLM using model_id (not path) without passing vram_mb
        3. Checks the lease's VRAM figure lies in the expected range
        4. Unloads and verifies cleanup
        """
        min_vram, max_vram = truth_expected_vram_range
        if gpu_vram_mb < min_vram:
            pytest.skip(f"Insufficient VRAM for SaulLM-7B (need {min_vram}MB, have {gpu_vram_mb}MB)")

        loader = managed_loader_factory(truth_service_name)

        # Load by model_id only; no vram_mb is supplied, so the loader has
        # to come up with the VRAM figure itself.
        model = await loader.load(model_id=saullm_model_id)
        assert model is not None, "Model should be loaded"
        assert loader.is_loaded(saullm_model_id), "Model should be tracked as loaded"

        # Verify VRAM was auto-detected
        lease = loader.get_lease(saullm_model_id)
        assert lease is not None, "Lease should exist"
        vram_usage = lease.info.vram_mb
        assert min_vram <= vram_usage <= max_vram, (
            f"VRAM {vram_usage}MB outside expected range [{min_vram}, {max_vram}]"
        )

        await loader.unload(saullm_model_id)
        assert not loader.is_loaded(saullm_model_id), "Model should be unloaded"

    @pytest.mark.asyncio
    async def test_redis_lease_created(
        self,
        managed_loader_factory,
        real_gpu_boss: "GPUBoss",
        saullm_model_id: str,
        truth_service_name: str,
        gpu_vram_mb: int,
    ):
        """Prove: Redis lease created when model loads, and released on unload.

        Leases are tracked by ``lease_id`` rather than by list position or
        list length, so the check does not depend on the order in which the
        boss reports leases or on unrelated leases coming and going.
        """
        self._skip_if_low_vram(gpu_vram_mb)

        loader = managed_loader_factory(truth_service_name)

        # Snapshot the leases that exist before the load
        status_before = await real_gpu_boss.get_status()
        leases_before = get_active_leases_for_service(status_before, truth_service_name)
        ids_before = {lease.lease_id for lease in leases_before}

        # Load model via model_id
        await loader.load(model_id=saullm_model_id)

        # Verify lease created in Redis
        status_after = await real_gpu_boss.get_status()
        leases_after = get_active_leases_for_service(status_after, truth_service_name)
        new_leases = [lease for lease in leases_after if lease.lease_id not in ids_before]
        assert new_leases, "No lease created in Redis"

        # Verify lease metadata on a lease this load actually created
        matching = [lease for lease in new_leases if lease.model_id == saullm_model_id]
        new_lease = matching[0] if matching else new_leases[0]
        assert new_lease.model_id == saullm_model_id
        assert new_lease.vram_mb > 0

        # Unload and verify the leases created by this test are gone
        await loader.unload(saullm_model_id)
        status_final = await real_gpu_boss.get_status()
        leases_final = get_active_leases_for_service(status_final, truth_service_name)
        new_ids = {lease.lease_id for lease in new_leases}
        still_held = [lease for lease in leases_final if lease.lease_id in new_ids]
        assert not still_held, "Lease not released"

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_legal_review_inference(
        self,
        managed_loader_factory,
        saullm_model_id: str,
        truth_service_name: str,
        gpu_vram_mb: int,
    ):
        """Prove: Legal content review produces valid results.

        This test loads the actual model and runs legal analysis inference,
        then checks the answer is non-trivial and mentions at least one
        privacy-related term.
        """
        self._skip_if_low_vram(gpu_vram_mb)

        loader = managed_loader_factory(truth_service_name)
        model = await loader.load(model_id=saullm_model_id)

        # Generate legal analysis using the loaded model
        prompt = """Analyze this content for legal issues:
"We collect your email and sell it to partners without consent."
Identify any privacy, GDPR, or consumer protection violations."""
        response = model.create_chat_completion(
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
        )
        assert response is not None
        content = response["choices"][0]["message"]["content"]
        assert len(content) > 20, f"Legal analysis too short: {content}"

        # Should mention consent/privacy issues
        content_lower = content.lower()
        privacy_terms = ["consent", "privacy", "gdpr", "violation", "data protection", "unauthorized"]
        found_terms = [term for term in privacy_terms if term in content_lower]
        assert len(found_terms) > 0, (
            f"Legal analysis should identify privacy issues. Found: {content[:200]}"
        )

        await loader.unload(saullm_model_id)

    @pytest.mark.asyncio
    async def test_urgent_priority_set(
        self,
        managed_loader_factory,
        real_gpu_boss: "GPUBoss",
        saullm_model_id: str,
        truth_service_name: str,
        truth_service_priority: int,
        gpu_vram_mb: int,
    ):
        """Prove: a load requested with ``Priority.URGENT`` records that priority.

        NOTE(review): ``truth_service_priority`` is requested but not compared
        against the lease; confirm how its value relates to
        ``Priority.URGENT`` before asserting on it.
        """
        self._skip_if_low_vram(gpu_vram_mb)

        from model_boss import Priority

        loader = managed_loader_factory(truth_service_name)

        # Load with urgent priority
        await loader.load(
            model_id=saullm_model_id,
            priority=Priority.URGENT,
        )

        # Verify priority in lease
        lease = loader.get_lease(saullm_model_id)
        assert lease is not None
        assert lease.info.priority == Priority.URGENT

        await loader.unload(saullm_model_id)

    @pytest.mark.asyncio
    async def test_content_validation_workflow(
        self,
        managed_loader_factory,
        saullm_model_id: str,
        truth_service_name: str,
        financial_claims_false,
        gpu_vram_mb: int,
    ):
        """Prove: Model can validate content claims.

        Sends the first entry of the ``financial_claims_false`` fixture to
        the model and checks that a non-trivial analysis comes back.
        """
        self._skip_if_low_vram(gpu_vram_mb)

        loader = managed_loader_factory(truth_service_name)
        model = await loader.load(model_id=saullm_model_id)

        # Test with a financial claim
        claim = financial_claims_false[0]
        prompt = f"""Analyze this claim for accuracy:
"{claim['content']}"
Context: This is about a platform that charges creators $0 fees.
Is this claim potentially misleading? Explain briefly."""
        response = model.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
        )
        assert response is not None
        content = response["choices"][0]["message"]["content"]
        assert len(content) > 10, "Should provide analysis"

        await loader.unload(saullm_model_id)

View file

@ -0,0 +1,4 @@
[pytest]
asyncio_mode = auto
asyncio_default_fixture_loop_scope = session
asyncio_default_test_loop_scope = session