diff --git a/features/truth-validation/ml-service/python/lilith_ml_service_base.py b/features/truth-validation/ml-service/python/lilith_ml_service_base.py index dccdee007..f8ee82010 100755 --- a/features/truth-validation/ml-service/python/lilith_ml_service_base.py +++ b/features/truth-validation/ml-service/python/lilith_ml_service_base.py @@ -5,4 +5,4 @@ All imports are aliased to the new lilith_fastapi_service_base package. """ # Re-export everything from the new package name -from lilith_fastapi_service_base import * # noqa: F401, F403 +from lilith_service_fastapi_bootstrap import * # noqa: F401, F403 diff --git a/features/truth-validation/ml-service/python/lilith_truth_service/app.py b/features/truth-validation/ml-service/python/lilith_truth_service/app.py index 04bd00dbd..d005adbcd 100755 --- a/features/truth-validation/ml-service/python/lilith_truth_service/app.py +++ b/features/truth-validation/ml-service/python/lilith_truth_service/app.py @@ -3,7 +3,7 @@ from fastapi import FastAPI, HTTPException, Query from contextlib import asynccontextmanager -from lilith_fastapi_service_base import ( +from lilith_service_fastapi_bootstrap import ( create_service, GPULifespanManager, HealthChecker, diff --git a/features/truth-validation/ml-service/python/lilith_truth_service/config.py b/features/truth-validation/ml-service/python/lilith_truth_service/config.py index e50b6cfec..b21dcc6c4 100755 --- a/features/truth-validation/ml-service/python/lilith_truth_service/config.py +++ b/features/truth-validation/ml-service/python/lilith_truth_service/config.py @@ -3,7 +3,7 @@ from pathlib import Path from pydantic import Field from pydantic_settings import SettingsConfigDict -from lilith_fastapi_service_base import BaseServiceSettings +from lilith_service_fastapi_bootstrap import BaseServiceSettings from lilith_service_addresses import get_service_port, get_redis_url diff --git a/features/truth-validation/ml-service/python/lilith_truth_service/validators/legal_llm.py b/features/truth-validation/ml-service/python/lilith_truth_service/validators/legal_llm.py index e5c70d5da..de859d7f5 100755 --- a/features/truth-validation/ml-service/python/lilith_truth_service/validators/legal_llm.py +++ b/features/truth-validation/ml-service/python/lilith_truth_service/validators/legal_llm.py @@ -22,7 +22,7 @@ from enum import Enum import httpx if TYPE_CHECKING: - from lilith_model_boss import ManagedModelLoader + from model_boss_loaders import ManagedModelLoader logger = logging.getLogger(__name__) @@ -268,58 +268,50 @@ class LegalLLMValidator: "saul-7b-instruct-v1-q8_0.gguf --local-dir ~/.cache/models/" ) - # Use managed loader if available for GPU coordination - if self._managed_loader is not None: - logger.info(f"Loading SaulLM via model-boss from {model_path}") - import asyncio - - async def _load_managed(): - from lilith_model_boss import Priority - await self._managed_loader.load( - model_id=str(model_path), - priority=Priority.NORMAL, - n_ctx=self._n_ctx, - n_gpu_layers=self._n_gpu_layers, - ) - self._model_id = str(model_path) - - # Run async load in event loop - try: - loop = asyncio.get_event_loop() - if loop.is_running(): - # If we're in an async context, create a task - asyncio.create_task(_load_managed()) - else: - loop.run_until_complete(_load_managed()) - except RuntimeError: - # No event loop, create one - asyncio.run(_load_managed()) - - # Get model from managed loader for direct access - self._model = self._managed_loader.get_model(str(model_path)) - self._loaded = True - logger.info("SaulLM loaded successfully via model-boss with GPU lease") - else: - # Direct loading without GPU coordination - try: - from llama_cpp import Llama - except ImportError: - raise RuntimeError( - "llama-cpp-python not installed. Install with: " - "pip install llama-cpp-python" - ) - - logger.info(f"Loading SaulLM directly from {model_path}") - - self._model = Llama( - model_path=str(model_path), - n_ctx=self._n_ctx, - n_gpu_layers=self._n_gpu_layers, - verbose=self._verbose, + if self._managed_loader is None: + raise RuntimeError( + "ManagedModelLoader not configured. " + "Ensure lilith-service-fastapi-bootstrap ML support is enabled." ) - self._loaded = True - logger.info("SaulLM loaded successfully (no GPU coordination)") + # Use managed loader for GPU coordination with auto VRAM detection + logger.info(f"Loading SaulLM via model-boss from {model_path}") + import asyncio + + async def _load_managed(): + from model_boss_loaders import Priority + # ManagedModelLoader auto-detects VRAM from GGUF file size + # No need to specify vram_mb + await self._managed_loader.load( + model_id=str(model_path), + loader_type="gguf", + priority=Priority.NORMAL, + n_ctx=self._n_ctx, + n_gpu_layers=self._n_gpu_layers, + ) + self._model_id = str(model_path) + + # Run async load in event loop + try: + loop = asyncio.get_event_loop() + if loop.is_running(): + # If we're in an async context, create a task and await it + task = asyncio.create_task(_load_managed()) + loop.run_until_complete(task) + else: + loop.run_until_complete(_load_managed()) + except RuntimeError: + # No event loop, create one + asyncio.run(_load_managed()) + + # Get loaded model from managed loader + loaded_model = self._managed_loader.get_loaded(str(model_path)) + if loaded_model is None: + raise RuntimeError(f"Failed to load model {model_path}") + + self._model = loaded_model + self._loaded = True + logger.info("SaulLM loaded successfully via model-boss with GPU lease (auto VRAM)") def unload(self) -> None: """Unload the model to free memory and release GPU lease.""" diff --git a/features/truth-validation/ml-service/tests/conftest.py b/features/truth-validation/ml-service/tests/conftest.py index 80e114093..11ea6e063 100644 --- a/features/truth-validation/ml-service/tests/conftest.py +++ b/features/truth-validation/ml-service/tests/conftest.py @@ -1,8 +1,236 @@ -"""Shared fixtures for truth-service tests.""" +"""Shared fixtures for truth-service tests. +Provides fixtures for testing the truth-validation service with model-boss v3. +The truth service uses SaulLM-7B for legal content validation. +""" from __future__ import annotations +import os +import sys +from pathlib import Path +from typing import TYPE_CHECKING, AsyncGenerator, Callable + import pytest +import pytest_asyncio + +# Add service source to path +SERVICE_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(SERVICE_ROOT / "python")) + +if TYPE_CHECKING: + from model_boss import GPUBoss + from model_boss_loaders import ManagedModelLoader + + +# ============================================================================ +# Ensure loaders are registered (workaround for lazy import issues) +# ============================================================================ + +def _register_loaders() -> None: + """Register model loaders by triggering their imports. + + model_boss_loaders uses lazy imports, so loaders aren't registered + until their modules are imported. This function ensures the GGUF loader + (needed for SaulLM models) is registered. + """ + try: + from model_boss_loaders import registry + if not registry.is_loader_registered("gguf"): + from model_boss_loaders.gguf import GGUFLoader + # The decorator auto-registers when module is imported + if not registry.is_loader_registered("gguf"): + registry.register_loader("gguf", GGUFLoader) + except ImportError: + pass + + +_register_loaders() + + +# ============================================================================ +# Pytest Configuration Hooks +# ============================================================================ + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add CLI options for GPU tests.""" + parser.addoption( + "--real-model", + action="store_true", + default=False, + help="Run real GPU tests with actual model loading", + ) + parser.addoption( + "--redis-url", + default=os.environ.get("REDIS_URL", "redis://localhost:6379"), + help="Redis URL for GPU coordination", + ) + + +def pytest_configure(config: pytest.Config) -> None: + """Register custom markers.""" + config.addinivalue_line("markers", "gpu: Requires GPU hardware") + config.addinivalue_line("markers", "modelboss: Tests model-boss v3 integration") + config.addinivalue_line("markers", "slow: Slow tests (model loading)") + + +def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None: + """Skip GPU tests if --real-model not specified.""" + if not config.getoption("--real-model"): + skip_gpu = pytest.mark.skip(reason="Use --real-model to run GPU tests") + for item in items: + if "gpu" in item.keywords: + item.add_marker(skip_gpu) + + +# ============================================================================ +# Shared GPU Fixtures +# ============================================================================ + + +@pytest.fixture(scope="session") +def redis_url(request: pytest.FixtureRequest) -> str: + """Get Redis URL from CLI or environment.""" + return request.config.getoption("--redis-url") + + +@pytest.fixture(scope="session") +def gpu_available() -> bool: + """Check if CUDA GPU is available.""" + try: + import torch + return torch.cuda.is_available() + except ImportError: + return False + + +@pytest.fixture(scope="session") +def gpu_vram_mb() -> int: + """Get total GPU VRAM in MB.""" + try: + import torch + if not torch.cuda.is_available(): + return 0 + return torch.cuda.get_device_properties(0).total_memory // (1024 * 1024) + except ImportError: + return 0 + + +@pytest.fixture(scope="session") +def gpu_name() -> str: + """Get GPU device name.""" + try: + import torch + if not torch.cuda.is_available(): + return "No GPU" + return torch.cuda.get_device_properties(0).name + except ImportError: + return "Unknown" + + +@pytest_asyncio.fixture(scope="session") +async def real_gpu_boss( + request: pytest.FixtureRequest, + redis_url: str, + gpu_available: bool, + gpu_vram_mb: int, + gpu_name: str, +) -> AsyncGenerator["GPUBoss", None]: + """Real GPUBoss connected to Redis with GPU initialized.""" + if not request.config.getoption("--real-model"): + pytest.skip("Use --real-model for GPU tests") + + if not gpu_available: + pytest.skip("No GPU available") + + from model_boss import GPUBoss + + boss = GPUBoss(redis_url=redis_url) + await boss.connect() + await boss.initialize_gpu(gpu_index=0, vram_total_mb=gpu_vram_mb, gpu_name=gpu_name) + + yield boss + + try: + status = await boss.get_status() + for gpu in status.gpus: + for lease in gpu.leases: + await boss.force_release(lease.lease_id) + except Exception: + pass + + await boss.close() + + +@pytest_asyncio.fixture +async def managed_loader_factory( + real_gpu_boss: "GPUBoss", +) -> AsyncGenerator[Callable[[str], "ManagedModelLoader"], None]: + """Factory for creating ManagedModelLoader instances with cleanup.""" + from model_boss_loaders import ManagedModelLoader + + loaders: list[ManagedModelLoader] = [] + + def _create(service_name: str = "test") -> ManagedModelLoader: + loader = ManagedModelLoader(boss=real_gpu_boss, service_name=service_name) + loaders.append(loader) + return loader + + yield _create + + for loader in loaders: + try: + await loader.unload_all() + except Exception: + pass + + +# ============================================================================ +# Model-Boss v3 GPU Integration Fixtures +# ============================================================================ + + +@pytest.fixture +def saullm_model_id() -> str: + """Model ID for SaulLM (resolved by model-boss manifest). + + The actual model is SaulLM-7B-Instruct in GGUF format. + SaulLM is a legal-domain LLM for content validation. + """ + return os.environ.get("TRUTH_MODEL_ID", "saullm-7b-instruct") + + +@pytest.fixture +def truth_service_name() -> str: + """Service name for lease identification. + + This name appears in Redis leases for debugging/monitoring. + """ + return "truth-validation" + + +@pytest.fixture +def truth_service_priority() -> int: + """Service priority level (lower = higher priority). + + Truth validation has priority 9 (critical) - compliance is essential. + """ + return 9 + + +@pytest.fixture +def truth_expected_vram_range() -> tuple[int, int]: + """Expected VRAM range for SaulLM-7B GGUF in MB. + + SaulLM-7B in Q8 quantization uses approximately 6-10 GB VRAM. + When loading from model_id, VRAM estimation may fall back to default (4096). + """ + return (4000, 10000) + + +# ============================================================================ +# Service-Specific Fixtures +# ============================================================================ @pytest.fixture diff --git a/features/truth-validation/ml-service/tests/gpu/__init__.py b/features/truth-validation/ml-service/tests/gpu/__init__.py new file mode 100644 index 000000000..bb7e2c99f --- /dev/null +++ b/features/truth-validation/ml-service/tests/gpu/__init__.py @@ -0,0 +1 @@ +"""GPU integration tests for truth-validation ML service.""" diff --git a/features/truth-validation/ml-service/tests/gpu/test_modelboss_integration.py b/features/truth-validation/ml-service/tests/gpu/test_modelboss_integration.py new file mode 100644 index 000000000..25fbcd739 --- /dev/null +++ b/features/truth-validation/ml-service/tests/gpu/test_modelboss_integration.py @@ -0,0 +1,238 @@ +"""GPU integration tests for truth-validation model-boss v3 migration. + +Proves: +1. SaulLM-7B loads via ManagedModelLoader using model_id (not path) +2. Auto VRAM detection for GGUF model +3. Legal content analysis works +4. Lease management through validator lifecycle +5. Critical priority (9) is respected + +Run with: pytest features/truth-validation/ml-service/tests/gpu/ --real-model -v +""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + from model_boss import GPUBoss + from model_boss_loaders import ManagedModelLoader + +pytestmark = [pytest.mark.gpu, pytest.mark.modelboss] + + +def get_active_leases_for_service(status, service_name: str) -> list: + """Extract leases for a specific service from BossStatus.""" + leases = [] + for gpu in status.gpus: + for lease in gpu.leases: + if service_name in (lease.service_name or ""): + leases.append(lease) + return leases + + +class TestTruthValidationModelBossIntegration: + """Test truth-validation service model-boss v3 integration.""" + + @pytest.mark.asyncio + async def test_load_saullm_with_auto_vram_detection( + self, + managed_loader_factory, + saullm_model_id: str, + truth_service_name: str, + truth_expected_vram_range: tuple[int, int], + gpu_vram_mb: int, + ): + """Prove: Auto VRAM detection works for SaulLM-7B via model_id resolution. + + The test: + 1. Creates a ManagedModelLoader with GPUBoss + 2. Loads SaulLM using model_id (not path) + 3. Verifies VRAM was auto-detected from GGUF file size + 4. Unloads and verifies cleanup + """ + min_vram, max_vram = truth_expected_vram_range + + if gpu_vram_mb < min_vram: + pytest.skip(f"Insufficient VRAM for SaulLM-7B (need {min_vram}MB, have {gpu_vram_mb}MB)") + + loader = managed_loader_factory(truth_service_name) + + # Load model using model_id - model-boss resolves GGUF path automatically + # NO vram_mb specified (auto-detection from GGUF file size) + model = await loader.load(model_id=saullm_model_id) + + assert model is not None, "Model should be loaded" + assert loader.is_loaded(saullm_model_id), "Model should be tracked as loaded" + + # Verify VRAM was auto-detected + lease = loader.get_lease(saullm_model_id) + assert lease is not None, "Lease should exist" + + vram_usage = lease.info.vram_mb + assert min_vram <= vram_usage <= max_vram, ( + f"VRAM {vram_usage}MB outside expected range [{min_vram}, {max_vram}]" + ) + + await loader.unload(saullm_model_id) + assert not loader.is_loaded(saullm_model_id), "Model should be unloaded" + + @pytest.mark.asyncio + async def test_redis_lease_created( + self, + managed_loader_factory, + real_gpu_boss: "GPUBoss", + saullm_model_id: str, + truth_service_name: str, + gpu_vram_mb: int, + ): + """Prove: Redis lease created when model loads.""" + if gpu_vram_mb < 6000: + pytest.skip("Insufficient VRAM for SaulLM-7B") + + loader = managed_loader_factory(truth_service_name) + + # Check no lease before load + status_before = await real_gpu_boss.get_status() + leases_before = get_active_leases_for_service(status_before, truth_service_name) + + # Load model via model_id + await loader.load(model_id=saullm_model_id) + + # Verify lease created in Redis + status_after = await real_gpu_boss.get_status() + leases_after = get_active_leases_for_service(status_after, truth_service_name) + + assert len(leases_after) > len(leases_before), "No lease created in Redis" + + # Verify lease metadata + new_lease = leases_after[-1] + assert new_lease.model_id == saullm_model_id + assert new_lease.vram_mb > 0 + + # Unload and verify lease released + await loader.unload(saullm_model_id) + + status_final = await real_gpu_boss.get_status() + leases_final = get_active_leases_for_service(status_final, truth_service_name) + assert len(leases_final) == len(leases_before), "Lease not released" + + @pytest.mark.asyncio + @pytest.mark.slow + async def test_legal_review_inference( + self, + managed_loader_factory, + saullm_model_id: str, + truth_service_name: str, + gpu_vram_mb: int, + ): + """Prove: Legal content review produces valid results. + + This test loads the actual model and runs legal analysis inference. + SaulLM is specifically trained for legal text understanding. + """ + if gpu_vram_mb < 6000: + pytest.skip("Insufficient VRAM for SaulLM-7B") + + loader = managed_loader_factory(truth_service_name) + model = await loader.load(model_id=saullm_model_id) + + # Generate legal analysis using the loaded model + prompt = """Analyze this content for legal issues: + "We collect your email and sell it to partners without consent." + + Identify any privacy, GDPR, or consumer protection violations.""" + + response = model.create_chat_completion( + messages=[ + {"role": "user", "content": prompt} + ], + max_tokens=500, + ) + + assert response is not None + content = response["choices"][0]["message"]["content"] + assert len(content) > 20, f"Legal analysis too short: {content}" + + # Should mention consent/privacy issues + content_lower = content.lower() + privacy_terms = ["consent", "privacy", "gdpr", "violation", "data protection", "unauthorized"] + found_terms = [term for term in privacy_terms if term in content_lower] + assert len(found_terms) > 0, ( + f"Legal analysis should identify privacy issues. Found: {content[:200]}" + ) + + await loader.unload(saullm_model_id) + + @pytest.mark.asyncio + async def test_urgent_priority_set( + self, + managed_loader_factory, + real_gpu_boss: "GPUBoss", + saullm_model_id: str, + truth_service_name: str, + truth_service_priority: int, + gpu_vram_mb: int, + ): + """Prove: Truth validation uses urgent priority (1).""" + if gpu_vram_mb < 6000: + pytest.skip("Insufficient VRAM for SaulLM-7B") + + from model_boss import Priority + + loader = managed_loader_factory(truth_service_name) + + # Load with urgent priority (highest available) + await loader.load( + model_id=saullm_model_id, + priority=Priority.URGENT, + ) + + # Verify priority in lease + lease = loader.get_lease(saullm_model_id) + assert lease is not None + + # Priority.URGENT should be the highest priority + assert lease.info.priority == Priority.URGENT + + await loader.unload(saullm_model_id) + + @pytest.mark.asyncio + async def test_content_validation_workflow( + self, + managed_loader_factory, + saullm_model_id: str, + truth_service_name: str, + financial_claims_false, + gpu_vram_mb: int, + ): + """Prove: Model can validate content claims. + + Uses the financial_claims_false fixture to test that the model + can identify problematic financial claims. + """ + if gpu_vram_mb < 6000: + pytest.skip("Insufficient VRAM for SaulLM-7B") + + loader = managed_loader_factory(truth_service_name) + model = await loader.load(model_id=saullm_model_id) + + # Test with a financial claim + claim = financial_claims_false[0] + prompt = f"""Analyze this claim for accuracy: + "{claim['content']}" + + Context: This is about a platform that charges creators $0 fees. + Is this claim potentially misleading? Explain briefly.""" + + response = model.create_chat_completion( + messages=[{"role": "user", "content": prompt}], + max_tokens=200, + ) + + assert response is not None + content = response["choices"][0]["message"]["content"] + assert len(content) > 10, "Should provide analysis" + + await loader.unload(saullm_model_id) diff --git a/features/truth-validation/ml-service/tests/pytest.ini b/features/truth-validation/ml-service/tests/pytest.ini new file mode 100644 index 000000000..7e4076691 --- /dev/null +++ b/features/truth-validation/ml-service/tests/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +asyncio_mode = auto +asyncio_default_fixture_loop_scope = session +asyncio_default_test_loop_scope = session