"""GPU integration tests for commit message quality. These tests exercise the full analyze→format pipeline with real LLM inference to verify that commit messages meet quality standards. Run with: pytest -m gpu tests/test_commit_quality.py -v """ import pytest from auto_commit_service.config import AutoCommitSettings from auto_commit_service.pipeline.format_utils import ( build_format_system_prompt, correct_emoji, extract_commit_message, sanitize_message, sanitize_message_scope, sanitize_scope, ) from auto_commit_service.pipeline.gitmoji import get_valid_types_regex from auto_commit_service.llm.validator import validate_commit_message import re @pytest.fixture async def llm_client(gpu_services: dict[str, str], gpu_settings: AutoCommitSettings): """Initialize and return the multi-model LLM client.""" from auto_commit_service.llm.multi_model_client import MultiModelLlamaClient client = MultiModelLlamaClient( timeout=gpu_settings.llm_timeout, ) async with client: yield client def _assert_conventional_commit(message: str) -> None: """Assert a message matches conventional commit format with clean scope.""" # Use a simple regex to extract type, scope, emoji, description match = re.match(r'^(\S+?)(\([^)]+\))?:\s*(\S+)\s+(.+)$', message) assert match, f"Not a conventional commit: {message}" # Scope quality checks scope_match = re.match(r'^[^(]+\(([^)]+)\):', message) if scope_match: scope = scope_match.group(1) assert len(scope) <= 25, f"Scope too long ({len(scope)}): {scope}" assert ' ' not in scope, f"Scope contains spaces: {scope}" assert scope.lower() not in {'src', 'lib', 'app', 'code', 'files'}, ( f"Scope too generic: {scope}" ) # No stop words should remain stop_words = {'the', 'a', 'an', 'is', 'for', 'in', 'of', 'to', 'with', 'as', 'its'} scope_words = set(scope.lower().split('-')) leaked = scope_words & stop_words assert not leaked, f"Stop words leaked into scope: {leaked} in '{scope}'" @pytest.mark.gpu class TestFormatStageQuality: """Test that the 3B format model produces clean commit messages.""" @pytest.mark.asyncio async def test_simple_feature_message(self, llm_client) -> None: """3B model should produce a clean feat message.""" system_prompt = build_format_system_prompt() prompt = """Format this analysis into a conventional commit message: **Type:** feat **Scope:** auth **Impact:** Added OAuth2 login support **Description:** Add OAuth2 social login with Google and GitHub providers **Output Format (CRITICAL - use parentheses):** type(scope): emoji description Output ONE LINE only.""" response = await llm_client.format_commit_message( prompt=prompt, system_prompt=system_prompt, max_tokens=150, ) raw = extract_commit_message(response) final = sanitize_message_scope(correct_emoji(sanitize_message(raw))) _assert_conventional_commit(final) result = validate_commit_message(final) assert result.valid, f"Quality validation failed: {result.violations}" @pytest.mark.asyncio async def test_chore_config_message(self, llm_client) -> None: """3B model should produce a clean chore message for config changes. Retries up to 3 times since the 3B model can be non-deterministic. """ system_prompt = build_format_system_prompt() prompt = """Format this analysis into a conventional commit message: **Type:** chore **Scope:** config **Impact:** Updated ESLint and TypeScript configuration **Description:** Update ESLint rules to enforce strict type checking **Output Format (CRITICAL - use parentheses):** type(scope): emoji description Output ONE LINE only.""" last_result = None for attempt in range(3): response = await llm_client.format_commit_message( prompt=prompt, system_prompt=system_prompt, max_tokens=150, ) raw = extract_commit_message(response) final = sanitize_message_scope(correct_emoji(sanitize_message(raw))) _assert_conventional_commit(final) last_result = validate_commit_message(final) if last_result.valid: break assert last_result and last_result.valid, ( f"Quality validation failed after 3 attempts: {last_result.violations}" ) @pytest.mark.asyncio async def test_deps_upgrade_message(self, llm_client) -> None: """3B model should produce a clean deps-upgrade message.""" system_prompt = build_format_system_prompt() prompt = """Format this analysis into a conventional commit message: **Type:** deps-upgrade **Scope:** npm **Impact:** Bumped vite from 5.2 to 6.0 **Description:** Upgrade vite to v6.0 for improved build performance **Output Format (CRITICAL - use parentheses):** type(scope): emoji description Output ONE LINE only.""" response = await llm_client.format_commit_message( prompt=prompt, system_prompt=system_prompt, max_tokens=150, ) raw = extract_commit_message(response) final = sanitize_message_scope(correct_emoji(sanitize_message(raw))) _assert_conventional_commit(final) result = validate_commit_message(final) assert result.valid, f"Quality validation failed: {result.violations}" @pytest.mark.asyncio async def test_scope_sanitization_catches_bad_llm_scope(self, llm_client) -> None: """Even if we feed a bad scope, sanitize_message_scope should clean it.""" system_prompt = build_format_system_prompt() # Intentionally provide a bad scope to see if the pipeline cleans it prompt = """Format this analysis into a conventional commit message: **Type:** refactor **Scope:** the primary authentication module for users **Impact:** Extracted shared validation into utility **Description:** Extract validation logic from auth handlers into shared utility **Output Format (CRITICAL - use parentheses):** type(scope): emoji description Use the type "refactor" and scope "the primary authentication module for users" provided. Output ONE LINE only.""" response = await llm_client.format_commit_message( prompt=prompt, system_prompt=system_prompt, max_tokens=150, ) raw = extract_commit_message(response) final = sanitize_message_scope(correct_emoji(sanitize_message(raw))) _assert_conventional_commit(final) # The scope should NOT contain stop words scope_match = re.match(r'^[^(]+\(([^)]+)\):', final) if scope_match: scope = scope_match.group(1) assert len(scope) <= 25, f"Scope too long after sanitization: {scope}" @pytest.mark.gpu class TestAnalyzeStageQuality: """Test that the 14B analyze model produces structured, clean analyses.""" @pytest.mark.asyncio async def test_analyze_produces_valid_type(self, llm_client) -> None: """14B model should output a valid commit type from the gitmoji set.""" from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt, get_all_types type_list = format_type_list_for_prompt() system_prompt = f"""You are an expert at analyzing code changes. **CHANGE TYPES** (choose the most specific): {type_list} Provide your analysis in this format: TYPE: [type from list] SCOPE: [1-2 word identifier] IMPACT: [summary] DESCRIPTION: [specific description]""" prompt = """Analyze these file changes: **Changed Files (2):** - src/auth/login.ts - src/auth/oauth.ts These files implement OAuth2 social login with Google and GitHub providers. Output TYPE, SCOPE, IMPACT, DESCRIPTION.""" response = await llm_client.analyze_commit( prompt=prompt, system_prompt=system_prompt, max_tokens=512, ) # Parse TYPE from response type_match = re.search(r'TYPE:\s*(\S+)', response, re.IGNORECASE) assert type_match, f"No TYPE found in response: {response[:200]}" commit_type = type_match.group(1).lower().strip('*') valid_types = set(get_all_types()) assert commit_type in valid_types, ( f"Invalid type '{commit_type}' not in valid set. Response: {response[:200]}" ) @pytest.mark.asyncio async def test_analyze_scope_is_clean(self, llm_client) -> None: """14B model scope output, after sanitization, should be a clean identifier.""" from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt type_list = format_type_list_for_prompt() system_prompt = f"""You are an expert at analyzing code changes. **CHANGE TYPES**: {type_list} **SCOPE RULES** (CRITICAL - follow exactly): - Scope MUST be 1-2 words, kebab-case, max 20 characters - GOOD scopes: "auth", "cli", "pipeline", "api-routes" - BAD scopes: "src" (too generic), "the auth module" (natural language) - NEVER write natural language in the scope Provide your analysis in this format: TYPE: [type] SCOPE: [short identifier] IMPACT: [summary] DESCRIPTION: [specific description]""" prompt = """Analyze these file changes: **Changed Files (3):** - packages/ui-theme/src/adapters/cyberpunk-adapter.ts - packages/ui-theme/src/adapters/luxe-adapter.ts - packages/ui-theme/src/types/ThemeInterface.ts The adapters were updated to include new opacity tokens and the ThemeInterface was extended with an optional opacity field. Output TYPE, SCOPE, IMPACT, DESCRIPTION.""" response = await llm_client.analyze_commit( prompt=prompt, system_prompt=system_prompt, max_tokens=512, ) scope_match = re.search(r'SCOPE:\s*(.+)', response, re.IGNORECASE) assert scope_match, f"No SCOPE found in response: {response[:200]}" raw_scope = scope_match.group(1).strip().strip('*') clean_scope = sanitize_scope(raw_scope) assert clean_scope, f"Scope sanitized to empty from raw: '{raw_scope}'" assert len(clean_scope) <= 25, f"Scope too long: '{clean_scope}'" assert ' ' not in clean_scope, f"Scope has spaces: '{clean_scope}'" @pytest.mark.gpu class TestEndToEndMessageQuality: """End-to-end test: analyze → format → sanitize → validate.""" @pytest.mark.asyncio async def test_full_pipeline_produces_valid_message(self, llm_client) -> None: """Full two-stage pipeline should produce a message that passes validation.""" from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt, get_all_types # Stage 1: Analyze with 14B type_list = format_type_list_for_prompt() analyze_system = f"""You are an expert at analyzing code changes. **CHANGE TYPES**: {type_list} **SCOPE RULES**: - 1-2 words, kebab-case, max 20 chars - Extract from file paths Provide: TYPE: [type] SCOPE: [short identifier] IMPACT: [summary] DESCRIPTION: [specific description]""" analyze_prompt = """Analyze these file changes: **Changed Files (2):** - src/pipeline/stages/format.py - src/pipeline/format_utils.py The format stage was updated to apply scope sanitization after emoji correction, and format_utils gained a new sanitize_message_scope function that extracts, cleans, and re-inserts the scope in formatted commit messages. Output TYPE, SCOPE, IMPACT, DESCRIPTION.""" analyze_response = await llm_client.analyze_commit( prompt=analyze_prompt, system_prompt=analyze_system, max_tokens=512, ) # Parse analysis type_match = re.search(r'TYPE:\s*(\S+)', analyze_response, re.IGNORECASE) scope_match = re.search(r'SCOPE:\s*(.+)', analyze_response, re.IGNORECASE) desc_match = re.search(r'DESCRIPTION:\s*(.+)', analyze_response, re.IGNORECASE) impact_match = re.search(r'IMPACT:\s*(.+)', analyze_response, re.IGNORECASE) commit_type = type_match.group(1).strip().strip('*`').lower() if type_match else "chore" commit_type = re.split(r'[\s:(\[]+', commit_type)[0] if commit_type else "chore" raw_scope = scope_match.group(1).strip().strip('*') if scope_match else "pipeline" description = desc_match.group(1).strip().strip('*') if desc_match else "Update format pipeline" impact = impact_match.group(1).strip().strip('*') if impact_match else description valid_types = set(get_all_types()) if commit_type not in valid_types: commit_type = "refactor" clean_scope = sanitize_scope(raw_scope) or "format-utils" # Stage 2: Format with 3B format_system = build_format_system_prompt() format_prompt = f"""Format this analysis into a conventional commit message: **Type:** {commit_type} **Scope:** {clean_scope} **Impact:** {impact} **Description:** {description} **Output Format (CRITICAL - use parentheses):** type(scope): emoji description Output ONE LINE only.""" format_response = await llm_client.format_commit_message( prompt=format_prompt, system_prompt=format_system, max_tokens=150, ) raw = extract_commit_message(format_response) final = sanitize_message_scope(correct_emoji(sanitize_message(raw))) # Verify the final message _assert_conventional_commit(final) result = validate_commit_message(final) assert result.valid, ( f"End-to-end message failed quality validation.\n" f"Message: {final}\n" f"Violations: {result.violations}\n" f"14B response: {analyze_response[:300]}\n" f"3B response: {format_response[:200]}" )