"""GPU integration tests with real models.

These tests require:
- GPU with loaded models (ministral-14b-reasoning, ministral-3b-instruct)
- The gpu_services session fixture auto-starts llama-http systemd services

Run with: pytest tests/test_integration_gpu.py -v
Skip with: pytest -m "not gpu"

Tests are designed to fail fast:
- Short inference timeouts (30s for reasoning, 15s for instruct)
- Services guaranteed running via gpu_services fixture
"""

import asyncio
import time
from typing import AsyncGenerator

import pytest

from auto_commit_service.llm import MultiModelLlamaClient
from auto_commit_service.pipeline.format_utils import (
    build_format_system_prompt,
    sanitize_message,
    correct_emoji,
    extract_commit_message,
)

# Mark entire module as GPU tests
pytestmark = pytest.mark.gpu

# Upper bounds (seconds) applied via asyncio.wait_for around each inference
# call, so a stalled call fails the test instead of hanging the run.
REASONING_TIMEOUT_S = 30.0  # reasoning model calls
INSTRUCT_TIMEOUT_S = 15.0  # instruct model calls (faster model)
LONG_PROMPT_TIMEOUT_S = 60.0  # extra headroom for the 5000-character prompt test


# =============================================================================
# Fixtures
# =============================================================================


@pytest.fixture
async def multi_model_client(
    gpu_services: dict[str, str],
) -> AsyncGenerator[MultiModelLlamaClient, None]:
    """Create multi-model client backed by gpu_services fixture.

    Services are guaranteed healthy by the session-scoped gpu_services fixture.
    The ``gpu_services`` argument is not read here; requesting it only makes
    pytest set that fixture up before the client is built.

    Yields:
        A configured client, which is closed once the test has finished.
    """
    client = MultiModelLlamaClient(
        reasoning_model_id="ministral-14b-reasoning",
        instruct_model_id="ministral-3b-instruct",
        timeout=30.0,
        temperature=0.2,
    )
    yield client
    # Teardown: runs after the requesting test completes.
    await client.close()


# =============================================================================
# Service Health Tests
# =============================================================================


class TestServiceHealth:
    """Quick health checks for inference services."""

    async def test_reasoning_model_available(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Verify reasoning model (14B) is loaded and responding."""
        is_available = await multi_model_client.is_available()
        assert is_available, "Reasoning model should be available"

    async def test_client_has_correct_model_ids(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Verify client is configured with expected model IDs."""
        assert multi_model_client.reasoning_model_id == "ministral-14b-reasoning"
        assert multi_model_client.instruct_model_id == "ministral-3b-instruct"


# =============================================================================
# Reasoning Model Tests (14B)
# =============================================================================


class TestReasoningModel:
    """Integration tests for the 14B reasoning model."""

    @pytest.fixture
    def sample_diff(self) -> str:
        """Sample diff for analysis."""
        return """diff --git a/src/auth/login.py b/src/auth/login.py
new file mode 100644
--- /dev/null
+++ b/src/auth/login.py
@@ -0,0 +1,15 @@
+from flask import request, jsonify
+
+def login_endpoint():
+    username = request.json.get('username')
+    password = request.json.get('password')
+
+    if not username or not password:
+        return jsonify({'error': 'Missing credentials'}), 400
+
+    if authenticate(username, password):
+        token = generate_token(username)
+        return jsonify({'token': token})
+
+    return jsonify({'error': 'Invalid credentials'}), 401
"""

    async def test_analyze_commit_returns_response(
        self, multi_model_client: MultiModelLlamaClient, sample_diff: str
    ) -> None:
        """Reasoning model returns non-empty analysis for a diff."""
        prompt = f"""Analyze these file changes and provide:
1. TYPE: feat/fix/refactor/chore/docs/test/perf/build/ci/style
2. SCOPE: component or area affected
3. IMPACT: brief description of what changed
4. REASONING: why these changes were made

Changes:
{sample_diff}
"""
        system_prompt = "You are a code analysis assistant. Analyze git diffs concisely."

        response = await asyncio.wait_for(
            multi_model_client.analyze_commit(
                prompt=prompt,
                system_prompt=system_prompt,
                max_tokens=256,
            ),
            timeout=REASONING_TIMEOUT_S,  # 30s timeout for reasoning
        )

        assert response, "Reasoning model should return non-empty response"
        assert len(response) > 20, "Response should be substantial"

        # Should contain some analysis keywords
        response_lower = response.lower()
        assert any(
            kw in response_lower for kw in ["type", "feat", "add", "auth", "login"]
        ), f"Response should contain relevant analysis: {response[:200]}"

    async def test_group_files_returns_response(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Reasoning model can group files logically."""
        files = [
            "src/auth/login.py",
            "src/auth/logout.py",
            "src/auth/session.py",
            "tests/test_auth.py",
            "docs/auth.md",
            "pyproject.toml",
        ]
        diff_summary = "Auth module changes with test and doc updates"

        response = await asyncio.wait_for(
            multi_model_client.group_files(
                files=files,
                diff_summary=diff_summary,
                repo_name="test-repo",
                branch="main",
            ),
            timeout=REASONING_TIMEOUT_S,
        )

        assert response, "Reasoning model should return grouping response"

        # Match case-insensitively, like the other keyword checks in this
        # module, so a capitalised mention ("Auth", "Login") still counts.
        response_lower = response.lower()
        assert any(
            kw in response_lower for kw in ["auth", "login", "test"]
        ), f"Response should reference files: {response[:200]}"


# =============================================================================
# Instruct Model Tests (3B)
# =============================================================================


class TestInstructModel:
    """Integration tests for the 3B instruct model."""

    async def test_format_commit_message_returns_clean_output(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Instruct model formats commit message from analysis data."""
        prompt = """Format this analysis into a conventional commit message:

TYPE: feat
SCOPE: auth
IMPACT: Added user login endpoint with JWT token generation
REASONING: Implementing authentication for the API

Format: type(scope): description
Keep it under 72 characters.
"""
        system_prompt = "You are a commit message formatter. Output only the commit message, no explanation."

        response = await asyncio.wait_for(
            multi_model_client.format_commit_message(
                prompt=prompt,
                system_prompt=system_prompt,
                max_tokens=100,
            ),
            timeout=INSTRUCT_TIMEOUT_S,  # 15s timeout for instruct (faster model)
        )

        assert response, "Instruct model should return formatted message"
        # Should be a short, formatted commit message
        assert len(response) < 200, "Commit message should be concise"

        # Should contain type and some description
        response_lower = response.lower()
        assert any(
            kw in response_lower for kw in ["feat", "add", "auth", "login"]
        ), f"Response should be a proper commit message: {response}"

    async def test_instruct_handles_short_prompts(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Instruct model handles minimal prompts gracefully."""
        prompt = "Format: fix(api): resolve null pointer bug"
        system_prompt = "Output only the commit message."

        response = await asyncio.wait_for(
            multi_model_client.format_commit_message(
                prompt=prompt,
                system_prompt=system_prompt,
                max_tokens=50,
            ),
            timeout=INSTRUCT_TIMEOUT_S,
        )

        assert response, "Should return something for short prompt"


# =============================================================================
# Full Pipeline Tests
# =============================================================================


class TestFullPipeline:
    """End-to-end tests using both models in sequence."""

    async def test_analyze_then_format_pipeline(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Full pipeline: reasoning model analyzes, instruct model formats."""
        # Step 1: Analyze with reasoning model
        analysis_prompt = """Analyze this change:
- File: src/api/health.py (new)
- Content: Added health check endpoint returning {"status": "ok"}

Provide TYPE, SCOPE, IMPACT.
"""
        analysis_system = "Analyze code changes concisely."

        analysis = await asyncio.wait_for(
            multi_model_client.analyze_commit(
                prompt=analysis_prompt,
                system_prompt=analysis_system,
                max_tokens=150,
            ),
            timeout=REASONING_TIMEOUT_S,
        )

        assert analysis, "Analysis step should return response"

        # Step 2: Format with instruct model (use real system prompt like production)
        format_prompt = f"""Based on this analysis, write a commit message:

{analysis}

Format: type(scope): emoji description
"""
        format_system = build_format_system_prompt()

        message = await asyncio.wait_for(
            multi_model_client.format_commit_message(
                prompt=format_prompt,
                system_prompt=format_system,
                max_tokens=100,
            ),
            timeout=INSTRUCT_TIMEOUT_S,
        )

        assert message, "Format step should return commit message"

        # Post-process like the real pipeline; only the first line is
        # length-checked.
        raw = extract_commit_message(message)
        first_line = sanitize_message(correct_emoji(raw)).split("\n")[0]
        assert len(first_line) <= 120, f"Commit message too long: {first_line}"


# =============================================================================
# Error Handling Tests
# =============================================================================


class TestErrorHandling:
    """Tests for graceful error handling."""

    async def test_empty_prompt_handling(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Models handle empty/minimal prompts without crashing."""
        response = await asyncio.wait_for(
            multi_model_client.format_commit_message(
                prompt="",
                system_prompt="Return a default commit message.",
                max_tokens=50,
            ),
            timeout=INSTRUCT_TIMEOUT_S,
        )

        # Should return something, even if generic
        assert isinstance(response, str)

    async def test_very_long_prompt_handling(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Models handle long prompts (truncation test)."""
        # Create a long prompt (simulate large diff)
        long_content = "x" * 5000
        prompt = f"Analyze this change:\n{long_content}\n\nProvide TYPE and SCOPE."

        response = await asyncio.wait_for(
            multi_model_client.analyze_commit(
                prompt=prompt,
                system_prompt="Analyze briefly.",
                max_tokens=100,
            ),
            timeout=LONG_PROMPT_TIMEOUT_S,
        )

        # Should return something (model may truncate or summarize)
        assert isinstance(response, str)


# =============================================================================
# Performance / Smoke Tests
# =============================================================================


class TestPerformance:
    """Quick smoke tests to verify response times are acceptable."""

    async def test_instruct_response_under_10_seconds(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Instruct model responds within 10 seconds for simple prompts."""
        prompt = "Format: feat(api): add endpoint"
        system_prompt = "Output commit message."

        # Time the call directly (no wait_for wrapper) and assert on the
        # measured latency afterwards.
        start = time.monotonic()
        response = await multi_model_client.format_commit_message(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=50,
        )
        elapsed = time.monotonic() - start

        assert response, "Should get response"
        assert elapsed < 10.0, f"Instruct model too slow: {elapsed:.2f}s"

    async def test_reasoning_response_under_30_seconds(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Reasoning model responds within 30 seconds for typical prompts."""
        prompt = """Analyze: Added login endpoint with JWT tokens.
- File: auth/login.py
Provide TYPE, SCOPE, IMPACT."""
        system_prompt = "Analyze concisely."

        start = time.monotonic()
        response = await multi_model_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=150,
        )
        elapsed = time.monotonic() - start

        assert response, "Should get response"
        assert elapsed < 30.0, f"Reasoning model too slow: {elapsed:.2f}s"