# auto-commit-service/tests/test_integration_gpu.py

"""GPU integration tests with real models.

These tests require:
- GPU with loaded models (ministral-14b-reasoning, ministral-3b-instruct)
- The gpu_services session fixture auto-starts llama-http systemd services

Run with: pytest tests/test_integration_gpu.py -v
Skip with: pytest -m "not gpu"

Tests are designed to fail fast:
- Short inference timeouts (30s for reasoning, 15s for instruct)
- Services guaranteed running via gpu_services fixture
"""

import asyncio
import time
from typing import AsyncGenerator

import pytest

from auto_commit_service.llm import MultiModelLlamaClient
from auto_commit_service.pipeline.format_utils import (
    build_format_system_prompt, sanitize_message, correct_emoji, extract_commit_message
)

# Mark entire module as GPU tests
pytestmark = pytest.mark.gpu


# =============================================================================
# Fixtures
# =============================================================================


@pytest.fixture
async def multi_model_client(
    gpu_services: dict[str, str],
) -> AsyncGenerator[MultiModelLlamaClient, None]:
    """Yield a two-model llama client and close it after the test.

    Requesting ``gpu_services`` orders this fixture after that one; the
    mapping it provides is not read here. Per the module docstring that
    fixture is what starts the inference services.
    """
    settings = {
        "reasoning_model_id": "ministral-14b-reasoning",
        "instruct_model_id": "ministral-3b-instruct",
        "timeout": 30.0,
        "temperature": 0.2,
    }
    llama = MultiModelLlamaClient(**settings)

    yield llama

    # Teardown: runs once the requesting test has finished.
    await llama.close()


# =============================================================================
# Service Health Tests
# =============================================================================


class TestServiceHealth:
    """Fast sanity checks run against the shared client fixture."""

    async def test_reasoning_model_available(self, multi_model_client: MultiModelLlamaClient) -> None:
        """The client's ``is_available()`` probe must come back truthy."""
        reachable = await multi_model_client.is_available()
        assert reachable, "Reasoning model should be available"

    async def test_client_has_correct_model_ids(self, multi_model_client: MultiModelLlamaClient) -> None:
        """The client exposes the model IDs the fixture configured it with."""
        reasoning_id = multi_model_client.reasoning_model_id
        assert reasoning_id == "ministral-14b-reasoning"

        instruct_id = multi_model_client.instruct_model_id
        assert instruct_id == "ministral-3b-instruct"


# =============================================================================
# Reasoning Model Tests (14B)
# =============================================================================


class TestReasoningModel:
    """Exercise ``analyze_commit`` and ``group_files`` on the multi-model client."""

    @pytest.fixture
    def sample_diff(self) -> str:
        """Provide a unified diff that adds a Flask login endpoint file."""
        new_login_file = """diff --git a/src/auth/login.py b/src/auth/login.py
new file mode 100644
--- /dev/null
+++ b/src/auth/login.py
@@ -0,0 +1,15 @@
+from flask import request, jsonify
+
+def login_endpoint():
+    username = request.json.get('username')
+    password = request.json.get('password')
+
+    if not username or not password:
+        return jsonify({'error': 'Missing credentials'}), 400
+
+    if authenticate(username, password):
+        token = generate_token(username)
+        return jsonify({'token': token})
+
+    return jsonify({'error': 'Invalid credentials'}), 401
"""
        return new_login_file

    async def test_analyze_commit_returns_response(
        self, multi_model_client: MultiModelLlamaClient, sample_diff: str
    ) -> None:
        """``analyze_commit`` yields a non-trivial, on-topic answer for a diff."""
        system_prompt = "You are a code analysis assistant. Analyze git diffs concisely."
        prompt = f"""Analyze these file changes and provide:
1. TYPE: feat/fix/refactor/chore/docs/test/perf/build/ci/style
2. SCOPE: component or area affected
3. IMPACT: brief description of what changed
4. REASONING: why these changes were made

Changes:
{sample_diff}
"""

        pending = multi_model_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=256,
        )
        # Cap the wait at 30 seconds so a stalled call fails the test quickly.
        response = await asyncio.wait_for(pending, timeout=30.0)

        assert response, "Reasoning model should return non-empty response"
        assert len(response) > 20, "Response should be substantial"

        # At least one expected keyword must show up, ignoring case.
        lowered = response.lower()
        expected_keywords = ("type", "feat", "add", "auth", "login")
        assert any(kw in lowered for kw in expected_keywords), (
            f"Response should contain relevant analysis: {response[:200]}"
        )

    async def test_group_files_returns_response(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """``group_files`` answers with something that mentions the inputs."""
        changed_paths = [
            "src/auth/login.py",
            "src/auth/logout.py",
            "src/auth/session.py",
            "tests/test_auth.py",
            "docs/auth.md",
            "pyproject.toml",
        ]
        summary = "Auth module changes with test and doc updates"

        pending = multi_model_client.group_files(
            files=changed_paths,
            diff_summary=summary,
            repo_name="test-repo",
            branch="main",
        )
        response = await asyncio.wait_for(pending, timeout=30.0)

        assert response, "Reasoning model should return grouping response"
        # Case-sensitive membership check against the raw response.
        markers = ("auth", "login", "test")
        assert any(marker in response for marker in markers), (
            f"Response should reference files: {response[:200]}"
        )


# =============================================================================
# Instruct Model Tests (3B)
# =============================================================================


class TestInstructModel:
    """Exercise ``format_commit_message`` on the multi-model client."""

    async def test_format_commit_message_returns_clean_output(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """A structured analysis is turned into a short, on-topic message."""
        system_prompt = "You are a commit message formatter. Output only the commit message, no explanation."
        prompt = """Format this analysis into a conventional commit message:

TYPE: feat
SCOPE: auth
IMPACT: Added user login endpoint with JWT token generation
REASONING: Implementing authentication for the API

Format: type(scope): description
Keep it under 72 characters.
"""

        pending = multi_model_client.format_commit_message(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=100,
        )
        # Tighter 15 second cap than the analysis tests use.
        response = await asyncio.wait_for(pending, timeout=15.0)

        assert response, "Instruct model should return formatted message"
        # Brevity check on the raw response length.
        assert len(response) < 200, "Commit message should be concise"

        # At least one expected token must show up, ignoring case.
        lowered = response.lower()
        expected_tokens = ("feat", "add", "auth", "login")
        assert any(token in lowered for token in expected_tokens), (
            f"Response should be a proper commit message: {response}"
        )

    async def test_instruct_handles_short_prompts(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """A one-line prompt still produces a non-empty response."""
        pending = multi_model_client.format_commit_message(
            prompt="Format: fix(api): resolve null pointer bug",
            system_prompt="Output only the commit message.",
            max_tokens=50,
        )
        response = await asyncio.wait_for(pending, timeout=15.0)

        assert response, "Should return something for short prompt"


# =============================================================================
# Full Pipeline Tests
# =============================================================================


class TestFullPipeline:
    """Chain ``analyze_commit`` into ``format_commit_message`` end to end."""

    async def test_analyze_then_format_pipeline(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Analysis output feeds the formatter; the cleaned first line stays short."""
        # Stage 1: ask for an analysis of a described change.
        analysis_system = "Analyze code changes concisely."
        analysis_prompt = """Analyze this change:
- File: src/api/health.py (new)
- Content: Added health check endpoint returning {"status": "ok"}

Provide TYPE, SCOPE, IMPACT.
"""
        analysis_call = multi_model_client.analyze_commit(
            prompt=analysis_prompt,
            system_prompt=analysis_system,
            max_tokens=150,
        )
        analysis = await asyncio.wait_for(analysis_call, timeout=30.0)

        assert analysis, "Analysis step should return response"

        # Stage 2: format it, using the system prompt built by format_utils.
        format_system = build_format_system_prompt()
        format_prompt = f"""Based on this analysis, write a commit message:

{analysis}

Format: type(scope): emoji description
"""
        format_call = multi_model_client.format_commit_message(
            prompt=format_prompt,
            system_prompt=format_system,
            max_tokens=100,
        )
        message = await asyncio.wait_for(format_call, timeout=15.0)

        assert message, "Format step should return commit message"

        # Post-process: extract, correct emoji, sanitize, then keep line one.
        extracted = extract_commit_message(message)
        cleaned = sanitize_message(correct_emoji(extracted))
        first_line = cleaned.split("\n")[0]
        assert len(first_line) <= 120, f"Commit message too long: {first_line}"


# =============================================================================
# Error Handling Tests
# =============================================================================


class TestErrorHandling:
    """Check that degenerate prompts still come back as strings."""

    async def test_empty_prompt_handling(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """An empty user prompt must not raise and must yield a ``str``."""
        pending = multi_model_client.format_commit_message(
            prompt="",
            system_prompt="Return a default commit message.",
            max_tokens=50,
        )
        response = await asyncio.wait_for(pending, timeout=15.0)

        # Only the type is checked; an empty string would also pass.
        assert isinstance(response, str)

    async def test_very_long_prompt_handling(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """A prompt padded with 5000 filler characters still yields a ``str``."""
        # Stand-in for a large diff: 5000 repeated characters.
        filler = "x" * 5000
        prompt = "Analyze this change:\n" + filler + "\n\nProvide TYPE and SCOPE."

        pending = multi_model_client.analyze_commit(
            prompt=prompt,
            system_prompt="Analyze briefly.",
            max_tokens=100,
        )
        # This test allows a longer 60 second wait than the others.
        response = await asyncio.wait_for(pending, timeout=60.0)

        # Only the type is checked; the content is not inspected.
        assert isinstance(response, str)


# =============================================================================
# Performance / Smoke Tests
# =============================================================================


class TestPerformance:
    """Quick smoke tests to verify response times are acceptable.

    Each test times a single client call with ``time.monotonic()`` and
    asserts on the elapsed wall-clock seconds. The calls are not wrapped in
    ``asyncio.wait_for``, so a slow call runs to completion (or until the
    client itself gives up) before the elapsed-time assertion is evaluated.
    """

    async def test_instruct_response_under_10_seconds(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Instruct model responds within 10 seconds for simple prompts."""
        prompt = "Format: feat(api): add endpoint"
        system_prompt = "Output commit message."

        # A monotonic clock keeps the measurement immune to system clock changes.
        start = time.monotonic()

        response = await multi_model_client.format_commit_message(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=50,
        )

        elapsed = time.monotonic() - start

        assert response, "Should get response"
        assert elapsed < 10.0, f"Instruct model too slow: {elapsed:.2f}s"

    async def test_reasoning_response_under_30_seconds(
        self, multi_model_client: MultiModelLlamaClient
    ) -> None:
        """Reasoning model responds within 30 seconds for typical prompts."""
        prompt = """Analyze: Added login endpoint with JWT tokens.
- File: auth/login.py
Provide TYPE, SCOPE, IMPACT."""
        system_prompt = "Analyze concisely."

        start = time.monotonic()

        response = await multi_model_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=150,
        )

        elapsed = time.monotonic() - start

        assert response, "Should get response"
        assert elapsed < 30.0, f"Reasoning model too slow: {elapsed:.2f}s"
feat(pipeline): ✨ Implement pipeline stages (analyze, commit, discover, format, group, prefilter, publish_verify, push, recover, retrieve, version_detect) and RAG components (codebase, conventions) for enhanced code analysis and automated commit workflows Co-Authored-By: Lilith Autocommit <noreply@atlilith.com> 2026-04-17 21:20:13 -07:00			`"""GPU integration tests with real models.`

			`These tests require:`
			`- GPU with loaded models (ministral-14b-reasoning, ministral-3b-instruct)`
			`- The gpu_services session fixture auto-starts llama-http systemd services`

			`Run with: pytest tests/test_integration_gpu.py -v`
			`Skip with: pytest -m "not gpu"`

			`Tests are designed to fail fast:`
			`- Short inference timeouts (30s for reasoning, 15s for instruct)`
			`- Services guaranteed running via gpu_services fixture`
			`"""`

			`import asyncio`
			`from typing import AsyncGenerator`

			`import pytest`

			`from auto_commit_service.llm import MultiModelLlamaClient`
			`from auto_commit_service.pipeline.format_utils import (`
			`build_format_system_prompt, sanitize_message, correct_emoji, extract_commit_message`
			`)`

			`# Mark entire module as GPU tests`
			`pytestmark = pytest.mark.gpu`


			`# =============================================================================`
			`# Fixtures`
			`# =============================================================================`


			`@pytest.fixture`
			`async def multi_model_client(`
			`gpu_services: dict[str, str],`
			`) -> AsyncGenerator[MultiModelLlamaClient, None]:`
			`"""Create multi-model client backed by gpu_services fixture.`

			`Services are guaranteed healthy by the session-scoped gpu_services fixture.`
			`"""`
			`client = MultiModelLlamaClient(`
			`reasoning_model_id="ministral-14b-reasoning",`
			`instruct_model_id="ministral-3b-instruct",`
			`timeout=30.0,`
			`temperature=0.2,`
			`)`

			`yield client`

			`await client.close()`


			`# =============================================================================`
			`# Service Health Tests`
			`# =============================================================================`


			`class TestServiceHealth:`
			`"""Quick health checks for inference services."""`

			`async def test_reasoning_model_available(self, multi_model_client: MultiModelLlamaClient) -> None:`
			`"""Verify reasoning model (14B) is loaded and responding."""`
			`is_available = await multi_model_client.is_available()`
			`assert is_available, "Reasoning model should be available"`

			`async def test_client_has_correct_model_ids(self, multi_model_client: MultiModelLlamaClient) -> None:`
			`"""Verify client is configured with expected model IDs."""`
			`assert multi_model_client.reasoning_model_id == "ministral-14b-reasoning"`
			`assert multi_model_client.instruct_model_id == "ministral-3b-instruct"`


			`# =============================================================================`
			`# Reasoning Model Tests (14B)`
			`# =============================================================================`


			`class TestReasoningModel:`
			`"""Integration tests for the 14B reasoning model."""`

			`@pytest.fixture`
			`def sample_diff(self) -> str:`
			`"""Sample diff for analysis."""`
			`return """diff --git a/src/auth/login.py b/src/auth/login.py`
			`new file mode 100644`
			`--- /dev/null`
			`+++ b/src/auth/login.py`
			`@@ -0,0 +1,15 @@`
			`+from flask import request, jsonify`
			`+`
			`+def login_endpoint():`
			`+ username = request.json.get('username')`
			`+ password = request.json.get('password')`
			`+`
			`+ if not username or not password:`
			`+ return jsonify({'error': 'Missing credentials'}), 400`
			`+`
			`+ if authenticate(username, password):`
			`+ token = generate_token(username)`
			`+ return jsonify({'token': token})`
			`+`
			`+ return jsonify({'error': 'Invalid credentials'}), 401`
			`"""`

			`async def test_analyze_commit_returns_response(`
			`self, multi_model_client: MultiModelLlamaClient, sample_diff: str`
			`) -> None:`
			`"""Reasoning model returns non-empty analysis for a diff."""`
			`prompt = f"""Analyze these file changes and provide:`
			`1. TYPE: feat/fix/refactor/chore/docs/test/perf/build/ci/style`
			`2. SCOPE: component or area affected`
			`3. IMPACT: brief description of what changed`
			`4. REASONING: why these changes were made`

			`Changes:`
			`{sample_diff}`
			`"""`
			`system_prompt = "You are a code analysis assistant. Analyze git diffs concisely."`

			`response = await asyncio.wait_for(`
			`multi_model_client.analyze_commit(`
			`prompt=prompt,`
			`system_prompt=system_prompt,`
			`max_tokens=256,`
			`),`
			`timeout=30.0, # 30s timeout for reasoning`
			`)`

			`assert response, "Reasoning model should return non-empty response"`
			`assert len(response) > 20, "Response should be substantial"`
			`# Should contain some analysis keywords`
			`response_lower = response.lower()`
			`assert any(kw in response_lower for kw in ["type", "feat", "add", "auth", "login"]), \`
			`f"Response should contain relevant analysis: {response[:200]}"`

			`async def test_group_files_returns_response(`
			`self, multi_model_client: MultiModelLlamaClient`
			`) -> None:`
			`"""Reasoning model can group files logically."""`
			`files = [`
			`"src/auth/login.py",`
			`"src/auth/logout.py",`
			`"src/auth/session.py",`
			`"tests/test_auth.py",`
			`"docs/auth.md",`
			`"pyproject.toml",`
			`]`
			`diff_summary = "Auth module changes with test and doc updates"`

			`response = await asyncio.wait_for(`
			`multi_model_client.group_files(`
			`files=files,`
			`diff_summary=diff_summary,`
			`repo_name="test-repo",`
			`branch="main",`
			`),`
			`timeout=30.0,`
			`)`

			`assert response, "Reasoning model should return grouping response"`
			`assert any(f in response for f in ["auth", "login", "test"]), \`
			`f"Response should reference files: {response[:200]}"`


			`# =============================================================================`
			`# Instruct Model Tests (3B)`
			`# =============================================================================`


			`class TestInstructModel:`
			`"""Integration tests for the 3B instruct model."""`

			`async def test_format_commit_message_returns_clean_output(`
			`self, multi_model_client: MultiModelLlamaClient`
			`) -> None:`
			`"""Instruct model formats commit message from analysis data."""`
			`prompt = """Format this analysis into a conventional commit message:`

			`TYPE: feat`
			`SCOPE: auth`
			`IMPACT: Added user login endpoint with JWT token generation`
			`REASONING: Implementing authentication for the API`

			`Format: type(scope): description`
			`Keep it under 72 characters.`
			`"""`
			`system_prompt = "You are a commit message formatter. Output only the commit message, no explanation."`

			`response = await asyncio.wait_for(`
			`multi_model_client.format_commit_message(`
			`prompt=prompt,`
			`system_prompt=system_prompt,`
			`max_tokens=100,`
			`),`
			`timeout=15.0, # 15s timeout for instruct (faster model)`
			`)`

			`assert response, "Instruct model should return formatted message"`
			`# Should be a short, formatted commit message`
			`assert len(response) < 200, "Commit message should be concise"`
			`# Should contain type and some description`
			`response_lower = response.lower()`
			`assert any(t in response_lower for t in ["feat", "add", "auth", "login"]), \`
			`f"Response should be a proper commit message: {response}"`

			`async def test_instruct_handles_short_prompts(`
			`self, multi_model_client: MultiModelLlamaClient`
			`) -> None:`
			`"""Instruct model handles minimal prompts gracefully."""`
			`prompt = "Format: fix(api): resolve null pointer bug"`
			`system_prompt = "Output only the commit message."`

			`response = await asyncio.wait_for(`
			`multi_model_client.format_commit_message(`
			`prompt=prompt,`
			`system_prompt=system_prompt,`
			`max_tokens=50,`
			`),`
			`timeout=15.0,`
			`)`

			`assert response, "Should return something for short prompt"`


			`# =============================================================================`
			`# Full Pipeline Tests`
			`# =============================================================================`


			`class TestFullPipeline:`
			`"""End-to-end tests using both models in sequence."""`

			`async def test_analyze_then_format_pipeline(`
			`self, multi_model_client: MultiModelLlamaClient`
			`) -> None:`
			`"""Full pipeline: reasoning model analyzes, instruct model formats."""`
			`# Step 1: Analyze with reasoning model`
			`analysis_prompt = """Analyze this change:`
			`- File: src/api/health.py (new)`
			`- Content: Added health check endpoint returning {"status": "ok"}`

			`Provide TYPE, SCOPE, IMPACT.`
			`"""`
			`analysis_system = "Analyze code changes concisely."`

			`analysis = await asyncio.wait_for(`
			`multi_model_client.analyze_commit(`
			`prompt=analysis_prompt,`
			`system_prompt=analysis_system,`
			`max_tokens=150,`
			`),`
			`timeout=30.0,`
			`)`

			`assert analysis, "Analysis step should return response"`

			`# Step 2: Format with instruct model (use real system prompt like production)`
			`format_prompt = f"""Based on this analysis, write a commit message:`

			`{analysis}`

			`Format: type(scope): emoji description`
			`"""`
			`format_system = build_format_system_prompt()`

			`message = await asyncio.wait_for(`
			`multi_model_client.format_commit_message(`
			`prompt=format_prompt,`
			`system_prompt=format_system,`
			`max_tokens=100,`
			`),`
			`timeout=15.0,`
			`)`

			`assert message, "Format step should return commit message"`
			`# Post-process like the real pipeline`
			`raw = extract_commit_message(message)`
			`first_line = sanitize_message(correct_emoji(raw)).split("\n")[0]`
			`assert len(first_line) <= 120, f"Commit message too long: {first_line}"`


			`# =============================================================================`
			`# Error Handling Tests`
			`# =============================================================================`


			`class TestErrorHandling:`
			`"""Tests for graceful error handling."""`

			`async def test_empty_prompt_handling(`
			`self, multi_model_client: MultiModelLlamaClient`
			`) -> None:`
			`"""Models handle empty/minimal prompts without crashing."""`
			`response = await asyncio.wait_for(`
			`multi_model_client.format_commit_message(`
			`prompt="",`
			`system_prompt="Return a default commit message.",`
			`max_tokens=50,`
			`),`
			`timeout=15.0,`
			`)`
			`# Should return something, even if generic`
			`assert isinstance(response, str)`

			`async def test_very_long_prompt_handling(`
			`self, multi_model_client: MultiModelLlamaClient`
			`) -> None:`
			`"""Models handle long prompts (truncation test)."""`
			`# Create a long prompt (simulate large diff)`
			`long_content = "x" * 5000`
			`prompt = f"Analyze this change:\n{long_content}\n\nProvide TYPE and SCOPE."`

			`response = await asyncio.wait_for(`
			`multi_model_client.analyze_commit(`
			`prompt=prompt,`
			`system_prompt="Analyze briefly.",`
			`max_tokens=100,`
			`),`
			`timeout=60.0,`
			`)`
			`# Should return something (model may truncate or summarize)`
			`assert isinstance(response, str)`


			`# =============================================================================`
			`# Performance / Smoke Tests`
			`# =============================================================================`


			`class TestPerformance:`
			`"""Quick smoke tests to verify response times are acceptable."""`

			`async def test_instruct_response_under_10_seconds(`
			`self, multi_model_client: MultiModelLlamaClient`
			`) -> None:`
			`"""Instruct model responds within 10 seconds for simple prompts."""`
			`prompt = "Format: feat(api): add endpoint"`
			`system_prompt = "Output commit message."`

			`import time`
			`start = time.monotonic()`

			`response = await multi_model_client.format_commit_message(`
			`prompt=prompt,`
			`system_prompt=system_prompt,`
			`max_tokens=50,`
			`)`

			`elapsed = time.monotonic() - start`

			`assert response, "Should get response"`
			`assert elapsed < 10.0, f"Instruct model too slow: {elapsed:.2f}s"`

			`async def test_reasoning_response_under_30_seconds(`
			`self, multi_model_client: MultiModelLlamaClient`
			`) -> None:`
			`"""Reasoning model responds within 30 seconds for typical prompts."""`
			`prompt = """Analyze: Added login endpoint with JWT tokens.`
			`- File: auth/login.py`
			`Provide TYPE, SCOPE, IMPACT."""`
			`system_prompt = "Analyze concisely."`

			`import time`
			`start = time.monotonic()`

			`response = await multi_model_client.analyze_commit(`
			`prompt=prompt,`
			`system_prompt=system_prompt,`
			`max_tokens=150,`
			`)`

			`elapsed = time.monotonic() - start`

			`assert response, "Should get response"`
			`assert elapsed < 30.0, f"Reasoning model too slow: {elapsed:.2f}s"`