# auto-commit-service/tests/test_commit_quality.py

"""GPU integration tests for commit message quality.

These tests exercise the full analyze→format pipeline with real LLM inference
to verify that commit messages meet quality standards.

Run with: pytest -m gpu tests/test_commit_quality.py -v
"""

import pytest

from auto_commit_service.config import AutoCommitSettings
from auto_commit_service.pipeline.format_utils import (
    build_format_system_prompt,
    correct_emoji,
    extract_commit_message,
    sanitize_message,
    sanitize_message_scope,
    sanitize_scope,
)
from auto_commit_service.pipeline.gitmoji import get_valid_types_regex
from auto_commit_service.llm.validator import validate_commit_message

import re


@pytest.fixture
async def llm_client(gpu_services: dict[str, str], gpu_settings: AutoCommitSettings):
    """Yield a ``MultiModelLlamaClient`` that stays open for the duration of a test.

    The client is built with the timeout taken from ``gpu_settings.llm_timeout``
    and is entered as an async context manager, so it is exited again once the
    requesting test has finished.

    ``gpu_services`` is not read here; requesting it only makes pytest set that
    fixture up first (presumably it provides the running GPU services — confirm
    in conftest).
    """
    # Imported lazily so that merely collecting this module does not require
    # the multi-model client to be importable.
    from auto_commit_service.llm.multi_model_client import MultiModelLlamaClient

    request_timeout = gpu_settings.llm_timeout
    multi_model_client = MultiModelLlamaClient(timeout=request_timeout)

    # Yield the client object itself (not whatever ``__aenter__`` returns).
    async with multi_model_client:
        yield multi_model_client


# Header shape ``type(scope): emoji description``. The scope is optional and is
# captured WITHOUT its surrounding parentheses.
_CONVENTIONAL_COMMIT_RE = re.compile(
    r'^(?P<type>\S+?)(?:\((?P<scope>[^)]+)\))?:\s*(?P<emoji>\S+)\s+(?P<description>.+)$'
)

# Scopes that say nothing about which part of the code base changed.
_GENERIC_SCOPES = frozenset({'src', 'lib', 'app', 'code', 'files'})

# Words that indicate natural language leaked into a kebab-case scope.
_SCOPE_STOP_WORDS = frozenset(
    {'the', 'a', 'an', 'is', 'for', 'in', 'of', 'to', 'with', 'as', 'its'}
)


def _assert_conventional_commit(message: str) -> None:
    """Assert a message matches conventional commit format with clean scope.

    The message must look like ``type: emoji description`` or
    ``type(scope): emoji description``. When a scope is present it must:

    * be at most 25 characters long,
    * contain no spaces,
    * not be one of the generic names in ``_GENERIC_SCOPES``,
    * contain no stop word among its ``-``-separated parts.

    Args:
        message: The one-line commit message to check.

    Raises:
        AssertionError: If the message or its scope breaks any rule above.
    """
    match = _CONVENTIONAL_COMMIT_RE.match(message)
    assert match, f"Not a conventional commit: {message}"

    # Take the scope from the header match itself. Searching the whole line for
    # "(...):" would mistake parenthesised text inside the description
    # (e.g. "docs: ... the foo(src): prefix") for the scope.
    scope = match.group('scope')
    if scope is None:
        return  # The scope is optional; nothing more to check.

    # Scope quality checks
    assert len(scope) <= 25, f"Scope too long ({len(scope)}): {scope}"
    assert ' ' not in scope, f"Scope contains spaces: {scope}"
    assert scope.lower() not in _GENERIC_SCOPES, (
        f"Scope too generic: {scope}"
    )

    # No stop words should remain in any kebab-case segment.
    leaked = set(scope.lower().split('-')) & _SCOPE_STOP_WORDS
    assert not leaked, f"Stop words leaked into scope: {leaked} in '{scope}'"


@pytest.mark.gpu
class TestFormatStageQuality:
    """Quality checks for the one-line messages produced by the 3B format model.

    Each test hands a hand-written analysis to ``format_commit_message`` and
    pushes the reply through the same post-processing chain before checking it.
    """

    @staticmethod
    async def _format_once(llm_client, system_prompt, prompt):
        """Request one formatted message and return it fully post-processed.

        The reply goes through ``extract_commit_message``, ``sanitize_message``,
        ``correct_emoji`` and ``sanitize_message_scope``, in that order.
        """
        reply = await llm_client.format_commit_message(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=150,
        )
        candidate = extract_commit_message(reply)
        candidate = sanitize_message(candidate)
        candidate = correct_emoji(candidate)
        return sanitize_message_scope(candidate)

    @pytest.mark.asyncio
    async def test_simple_feature_message(self, llm_client) -> None:
        """A plain ``feat`` analysis should come back as a valid commit line."""
        system_prompt = build_format_system_prompt()
        prompt = """Format this analysis into a conventional commit message:

**Type:** feat
**Scope:** auth
**Impact:** Added OAuth2 login support
**Description:** Add OAuth2 social login with Google and GitHub providers

**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description

Output ONE LINE only."""

        final = await self._format_once(llm_client, system_prompt, prompt)

        _assert_conventional_commit(final)
        result = validate_commit_message(final)
        assert result.valid, f"Quality validation failed: {result.violations}"

    @pytest.mark.asyncio
    async def test_chore_config_message(self, llm_client) -> None:
        """A ``chore`` config analysis should yield a valid message within 3 tries.

        Only a failed ``validate_commit_message`` result triggers another
        attempt; a reply that is not in conventional-commit shape fails the
        test immediately.
        """
        system_prompt = build_format_system_prompt()
        prompt = """Format this analysis into a conventional commit message:

**Type:** chore
**Scope:** config
**Impact:** Updated ESLint and TypeScript configuration
**Description:** Update ESLint rules to enforce strict type checking

**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description

Output ONE LINE only."""

        last_result = None
        for _ in range(3):
            final = await self._format_once(llm_client, system_prompt, prompt)

            # Shape is checked on every attempt, before quality validation.
            _assert_conventional_commit(final)
            last_result = validate_commit_message(final)
            if last_result.valid:
                break

        assert last_result and last_result.valid, (
            f"Quality validation failed after 3 attempts: {last_result.violations}"
        )

    @pytest.mark.asyncio
    async def test_deps_upgrade_message(self, llm_client) -> None:
        """A ``deps-upgrade`` analysis should come back as a valid commit line."""
        system_prompt = build_format_system_prompt()
        prompt = """Format this analysis into a conventional commit message:

**Type:** deps-upgrade
**Scope:** npm
**Impact:** Bumped vite from 5.2 to 6.0
**Description:** Upgrade vite to v6.0 for improved build performance

**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description

Output ONE LINE only."""

        final = await self._format_once(llm_client, system_prompt, prompt)

        _assert_conventional_commit(final)
        result = validate_commit_message(final)
        assert result.valid, f"Quality validation failed: {result.violations}"

    @pytest.mark.asyncio
    async def test_scope_sanitization_catches_bad_llm_scope(self, llm_client) -> None:
        """A natural-language scope in the prompt must not survive post-processing."""
        system_prompt = build_format_system_prompt()
        # The scope below is deliberately a sentence fragment, and the prompt
        # explicitly asks the model to use it verbatim.
        prompt = """Format this analysis into a conventional commit message:

**Type:** refactor
**Scope:** the primary authentication module for users
**Impact:** Extracted shared validation into utility
**Description:** Extract validation logic from auth handlers into shared utility

**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description

Use the type "refactor" and scope "the primary authentication module for users" provided.
Output ONE LINE only."""

        final = await self._format_once(llm_client, system_prompt, prompt)

        _assert_conventional_commit(final)
        # If a scope is still present it has to fit the length budget; a
        # message without any scope is accepted as well.
        scope_found = re.match(r'^[^(]+\(([^)]+)\):', final)
        if scope_found is not None:
            scope = scope_found.group(1)
            assert len(scope) <= 25, f"Scope too long after sanitization: {scope}"


@pytest.mark.gpu
class TestAnalyzeStageQuality:
    """Checks on the TYPE / SCOPE fields returned by the 14B analyze model."""

    @pytest.mark.asyncio
    async def test_analyze_produces_valid_type(self, llm_client) -> None:
        """The TYPE field of the analysis must be one of ``get_all_types()``."""
        from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt, get_all_types

        type_list = format_type_list_for_prompt()
        system_prompt = f"""You are an expert at analyzing code changes.

**CHANGE TYPES** (choose the most specific):
{type_list}

Provide your analysis in this format:
TYPE: [type from list]
SCOPE: [1-2 word identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""

        prompt = """Analyze these file changes:

**Changed Files (2):**
- src/auth/login.ts
- src/auth/oauth.ts

These files implement OAuth2 social login with Google and GitHub providers.

Output TYPE, SCOPE, IMPACT, DESCRIPTION."""

        response = await llm_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=512,
        )

        # The first whitespace-delimited token after "TYPE:" is the candidate.
        type_found = re.search(r'TYPE:\s*(\S+)', response, re.IGNORECASE)
        assert type_found, f"No TYPE found in response: {response[:200]}"
        type_token = type_found.group(1)
        # Lower-case it and drop any '*' wrapped around it (markdown bold).
        commit_type = type_token.lower().strip('*')

        assert commit_type in set(get_all_types()), (
            f"Invalid type '{commit_type}' not in valid set. Response: {response[:200]}"
        )

    @pytest.mark.asyncio
    async def test_analyze_scope_is_clean(self, llm_client) -> None:
        """The SCOPE field, once run through ``sanitize_scope``, must be non-empty,
        at most 25 characters long and free of spaces."""
        from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt

        type_list = format_type_list_for_prompt()
        system_prompt = f"""You are an expert at analyzing code changes.

**CHANGE TYPES**:
{type_list}

**SCOPE RULES** (CRITICAL - follow exactly):
- Scope MUST be 1-2 words, kebab-case, max 20 characters
- GOOD scopes: "auth", "cli", "pipeline", "api-routes"
- BAD scopes: "src" (too generic), "the auth module" (natural language)
- NEVER write natural language in the scope

Provide your analysis in this format:
TYPE: [type]
SCOPE: [short identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""

        prompt = """Analyze these file changes:

**Changed Files (3):**
- packages/ui-theme/src/adapters/cyberpunk-adapter.ts
- packages/ui-theme/src/adapters/luxe-adapter.ts
- packages/ui-theme/src/types/ThemeInterface.ts

The adapters were updated to include new opacity tokens and the ThemeInterface
was extended with an optional opacity field.

Output TYPE, SCOPE, IMPACT, DESCRIPTION."""

        response = await llm_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=512,
        )

        # Everything after "SCOPE:" up to the end of that line is the raw scope.
        scope_found = re.search(r'SCOPE:\s*(.+)', response, re.IGNORECASE)
        assert scope_found, f"No SCOPE found in response: {response[:200]}"

        scope_text = scope_found.group(1)
        raw_scope = scope_text.strip().strip('*')
        clean_scope = sanitize_scope(raw_scope)

        assert clean_scope, f"Scope sanitized to empty from raw: '{raw_scope}'"
        assert len(clean_scope) <= 25, f"Scope too long: '{clean_scope}'"
        assert ' ' not in clean_scope, f"Scope has spaces: '{clean_scope}'"


@pytest.mark.gpu
class TestEndToEndMessageQuality:
    """Chains both model stages: analyze, format, post-process, then validate."""

    @pytest.mark.asyncio
    async def test_full_pipeline_produces_valid_message(self, llm_client) -> None:
        """Feed the analyze output into the format stage and validate the result.

        Fields that cannot be parsed from the analysis fall back to fixed
        defaults, so the test judges the final message, not the parsing.
        """
        from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt, get_all_types

        # ---- Stage 1: analyze with 14B ----
        type_list = format_type_list_for_prompt()
        analyze_system = f"""You are an expert at analyzing code changes.

**CHANGE TYPES**:
{type_list}

**SCOPE RULES**:
- 1-2 words, kebab-case, max 20 chars
- Extract from file paths

Provide:
TYPE: [type]
SCOPE: [short identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""

        analyze_prompt = """Analyze these file changes:

**Changed Files (2):**
- src/pipeline/stages/format.py
- src/pipeline/format_utils.py

The format stage was updated to apply scope sanitization after emoji correction,
and format_utils gained a new sanitize_message_scope function that extracts,
cleans, and re-inserts the scope in formatted commit messages.

Output TYPE, SCOPE, IMPACT, DESCRIPTION."""

        analyze_response = await llm_client.analyze_commit(
            prompt=analyze_prompt,
            system_prompt=analyze_system,
            max_tokens=512,
        )

        def rest_of_line(pattern, fallback):
            """Return the trimmed capture of *pattern*, or *fallback* if absent."""
            hit = re.search(pattern, analyze_response, re.IGNORECASE)
            if hit is None:
                return fallback
            return hit.group(1).strip().strip('*')

        # TYPE: take the first token, peel off markdown markers, then cut at the
        # first whitespace / ':' / '(' / '[' so trailing decoration is dropped.
        type_hit = re.search(r'TYPE:\s*(\S+)', analyze_response, re.IGNORECASE)
        if type_hit is None:
            commit_type = "chore"
        else:
            commit_type = type_hit.group(1).strip().strip('*`').lower()
        if commit_type:
            commit_type = re.split(r'[\s:(\[]+', commit_type)[0]
        else:
            commit_type = "chore"

        raw_scope = rest_of_line(r'SCOPE:\s*(.+)', "pipeline")
        description = rest_of_line(r'DESCRIPTION:\s*(.+)', "Update format pipeline")
        # A missing IMPACT line reuses the description.
        impact = rest_of_line(r'IMPACT:\s*(.+)', description)

        # Anything outside the known type set is replaced by "refactor".
        if commit_type not in set(get_all_types()):
            commit_type = "refactor"

        clean_scope = sanitize_scope(raw_scope) or "format-utils"

        # ---- Stage 2: format with 3B ----
        format_system = build_format_system_prompt()
        format_prompt = f"""Format this analysis into a conventional commit message:

**Type:** {commit_type}
**Scope:** {clean_scope}
**Impact:** {impact}
**Description:** {description}

**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description

Output ONE LINE only."""

        format_response = await llm_client.format_commit_message(
            prompt=format_prompt,
            system_prompt=format_system,
            max_tokens=150,
        )

        # Same post-processing order as the format-stage tests.
        extracted = extract_commit_message(format_response)
        sanitized = sanitize_message(extracted)
        final = sanitize_message_scope(correct_emoji(sanitized))

        # ---- Verdict on the final message ----
        _assert_conventional_commit(final)
        result = validate_commit_message(final)
        assert result.valid, (
            f"End-to-end message failed quality validation.\n"
            f"Message: {final}\n"
            f"Violations: {result.violations}\n"
            f"14B response: {analyze_response[:300]}\n"
            f"3B response: {format_response[:200]}"
        )