auto-commit-service/tests/test_commit_quality.py

389 lines
14 KiB
Python

"""GPU integration tests for commit message quality.
These tests exercise the full analyze→format pipeline with real LLM inference
to verify that commit messages meet quality standards.
Run with: pytest -m gpu tests/test_commit_quality.py -v
"""
import pytest
from auto_commit_service.config import AutoCommitSettings
from auto_commit_service.pipeline.format_utils import (
build_format_system_prompt,
correct_emoji,
extract_commit_message,
sanitize_message,
sanitize_message_scope,
sanitize_scope,
)
from auto_commit_service.pipeline.gitmoji import get_valid_types_regex
from auto_commit_service.llm.validator import validate_commit_message
import re
@pytest.fixture
async def llm_client(gpu_services: dict[str, str], gpu_settings: AutoCommitSettings):
    """Yield a multi-model LLM client that stays open while the test runs.

    ``gpu_services`` is never read in the body; it is only requested as a
    fixture dependency (presumably so the GPU services are available before
    the client is created -- confirm against the conftest).
    The client's timeout comes from ``gpu_settings.llm_timeout``.
    """
    # Imported inside the fixture rather than at module import time.
    from auto_commit_service.llm.multi_model_client import MultiModelLlamaClient

    request_timeout = gpu_settings.llm_timeout
    client = MultiModelLlamaClient(timeout=request_timeout)

    # The client object itself (not the value returned on entering the
    # context) is what tests receive.
    async with client:
        yield client
def _assert_conventional_commit(message: str) -> None:
"""Assert a message matches conventional commit format with clean scope."""
# Use a simple regex to extract type, scope, emoji, description
match = re.match(r'^(\S+?)(\([^)]+\))?:\s*(\S+)\s+(.+)$', message)
assert match, f"Not a conventional commit: {message}"
# Scope quality checks
scope_match = re.match(r'^[^(]+\(([^)]+)\):', message)
if scope_match:
scope = scope_match.group(1)
assert len(scope) <= 25, f"Scope too long ({len(scope)}): {scope}"
assert ' ' not in scope, f"Scope contains spaces: {scope}"
assert scope.lower() not in {'src', 'lib', 'app', 'code', 'files'}, (
f"Scope too generic: {scope}"
)
# No stop words should remain
stop_words = {'the', 'a', 'an', 'is', 'for', 'in', 'of', 'to', 'with', 'as', 'its'}
scope_words = set(scope.lower().split('-'))
leaked = scope_words & stop_words
assert not leaked, f"Stop words leaked into scope: {leaked} in '{scope}'"
@pytest.mark.gpu
class TestFormatStageQuality:
    """Test that the 3B format model produces clean commit messages."""

    @staticmethod
    async def _format_and_sanitize(llm_client, prompt: str, system_prompt) -> str:
        """Run one format request and return the post-processed message.

        The raw LLM response is passed through ``extract_commit_message``,
        then ``sanitize_message``, ``correct_emoji`` and finally
        ``sanitize_message_scope`` -- the same order every test in this class
        relies on.
        """
        response = await llm_client.format_commit_message(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=150,
        )
        raw = extract_commit_message(response)
        return sanitize_message_scope(correct_emoji(sanitize_message(raw)))

    @pytest.mark.asyncio
    async def test_simple_feature_message(self, llm_client) -> None:
        """3B model should produce a clean feat message."""
        system_prompt = build_format_system_prompt()
        prompt = """Format this analysis into a conventional commit message:
**Type:** feat
**Scope:** auth
**Impact:** Added OAuth2 login support
**Description:** Add OAuth2 social login with Google and GitHub providers
**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description
Output ONE LINE only."""
        final = await self._format_and_sanitize(llm_client, prompt, system_prompt)
        _assert_conventional_commit(final)
        result = validate_commit_message(final)
        assert result.valid, f"Quality validation failed: {result.violations}"

    @pytest.mark.asyncio
    async def test_chore_config_message(self, llm_client) -> None:
        """3B model should produce a clean chore message for config changes.

        Retries up to 3 times since the 3B model can be non-deterministic.
        Both a malformed message and a quality violation count as a failed
        attempt; the test fails only if no attempt succeeds.
        """
        system_prompt = build_format_system_prompt()
        prompt = """Format this analysis into a conventional commit message:
**Type:** chore
**Scope:** config
**Impact:** Updated ESLint and TypeScript configuration
**Description:** Update ESLint rules to enforce strict type checking
**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description
Output ONE LINE only."""
        failures = []
        for attempt in range(1, 4):
            final = await self._format_and_sanitize(llm_client, prompt, system_prompt)
            try:
                _assert_conventional_commit(final)
            except AssertionError as exc:
                # A badly shaped message is as retryable as a quality
                # violation; letting it propagate here would abort the test
                # on the first attempt and defeat the retry.
                failures.append(f"attempt {attempt}: {exc}")
                continue
            result = validate_commit_message(final)
            if result.valid:
                break
            failures.append(f"attempt {attempt}: {result.violations}")
        else:
            # Loop finished without a successful attempt.
            pytest.fail(f"Quality validation failed after 3 attempts: {failures}")

    @pytest.mark.asyncio
    async def test_deps_upgrade_message(self, llm_client) -> None:
        """3B model should produce a clean deps-upgrade message."""
        system_prompt = build_format_system_prompt()
        prompt = """Format this analysis into a conventional commit message:
**Type:** deps-upgrade
**Scope:** npm
**Impact:** Bumped vite from 5.2 to 6.0
**Description:** Upgrade vite to v6.0 for improved build performance
**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description
Output ONE LINE only."""
        final = await self._format_and_sanitize(llm_client, prompt, system_prompt)
        _assert_conventional_commit(final)
        result = validate_commit_message(final)
        assert result.valid, f"Quality validation failed: {result.violations}"

    @pytest.mark.asyncio
    async def test_scope_sanitization_catches_bad_llm_scope(self, llm_client) -> None:
        """Even if we feed a bad scope, sanitize_message_scope should clean it."""
        system_prompt = build_format_system_prompt()
        # Intentionally provide a bad scope to see if the pipeline cleans it
        prompt = """Format this analysis into a conventional commit message:
**Type:** refactor
**Scope:** the primary authentication module for users
**Impact:** Extracted shared validation into utility
**Description:** Extract validation logic from auth handlers into shared utility
**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description
Use the type "refactor" and scope "the primary authentication module for users" provided.
Output ONE LINE only."""
        final = await self._format_and_sanitize(llm_client, prompt, system_prompt)
        # Rejects stop words, spaces and over-long scopes in the result.
        _assert_conventional_commit(final)
        # Restate the length expectation with a message specific to this
        # test; a message without any scope is accepted as-is.
        scope_match = re.match(r'^[^(]+\(([^)]+)\):', final)
        if scope_match:
            scope = scope_match.group(1)
            assert len(scope) <= 25, f"Scope too long after sanitization: {scope}"
@pytest.mark.gpu
class TestAnalyzeStageQuality:
    """Test that the 14B analyze model produces structured, clean analyses."""

    @pytest.mark.asyncio
    async def test_analyze_produces_valid_type(self, llm_client) -> None:
        """14B model should output a valid commit type from the gitmoji set.

        Parsing tolerates markdown decoration around the label or the value
        (``**TYPE:** feat``, ``TYPE: `feat```, ``TYPE: [feat]``) and trailing
        punctuation such as ``feat:`` or ``feat(auth)``, so the assertion is
        about the chosen type rather than the response formatting.
        """
        from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt, get_all_types

        type_list = format_type_list_for_prompt()
        system_prompt = f"""You are an expert at analyzing code changes.
**CHANGE TYPES** (choose the most specific):
{type_list}
Provide your analysis in this format:
TYPE: [type from list]
SCOPE: [1-2 word identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""
        prompt = """Analyze these file changes:
**Changed Files (2):**
- src/auth/login.ts
- src/auth/oauth.ts
These files implement OAuth2 social login with Google and GitHub providers.
Output TYPE, SCOPE, IMPACT, DESCRIPTION."""
        response = await llm_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=512,
        )

        # Parse TYPE from response. Whitespace, asterisks, backticks and an
        # opening bracket between the colon and the value are skipped so a
        # bold label does not yield "**" as the captured type.
        type_match = re.search(r'TYPE:[\s*`\[]*(\S+)', response, re.IGNORECASE)
        assert type_match, f"No TYPE found in response: {response[:200]}"
        # Keep only the leading identifier: cut at the first decoration or
        # punctuation character that follows it.
        commit_type = re.split(r'[*`\]:(\[]', type_match.group(1).lower())[0]

        valid_types = set(get_all_types())
        assert commit_type in valid_types, (
            f"Invalid type '{commit_type}' not in valid set. Response: {response[:200]}"
        )

    @pytest.mark.asyncio
    async def test_analyze_scope_is_clean(self, llm_client) -> None:
        """14B model scope output, after sanitization, should be a clean identifier."""
        from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt

        type_list = format_type_list_for_prompt()
        system_prompt = f"""You are an expert at analyzing code changes.
**CHANGE TYPES**:
{type_list}
**SCOPE RULES** (CRITICAL - follow exactly):
- Scope MUST be 1-2 words, kebab-case, max 20 characters
- GOOD scopes: "auth", "cli", "pipeline", "api-routes"
- BAD scopes: "src" (too generic), "the auth module" (natural language)
- NEVER write natural language in the scope
Provide your analysis in this format:
TYPE: [type]
SCOPE: [short identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""
        prompt = """Analyze these file changes:
**Changed Files (3):**
- packages/ui-theme/src/adapters/cyberpunk-adapter.ts
- packages/ui-theme/src/adapters/luxe-adapter.ts
- packages/ui-theme/src/types/ThemeInterface.ts
The adapters were updated to include new opacity tokens and the ThemeInterface
was extended with an optional opacity field.
Output TYPE, SCOPE, IMPACT, DESCRIPTION."""
        response = await llm_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=512,
        )

        # Skip whitespace/asterisks after the colon so "**SCOPE:** x" yields
        # "x" rather than "** x".
        scope_match = re.search(r'SCOPE:[\s*]*(.+)', response, re.IGNORECASE)
        assert scope_match, f"No SCOPE found in response: {response[:200]}"
        # Final strip() removes whitespace exposed by stripping asterisks.
        raw_scope = scope_match.group(1).strip().strip('*').strip()

        clean_scope = sanitize_scope(raw_scope)
        assert clean_scope, f"Scope sanitized to empty from raw: '{raw_scope}'"
        assert len(clean_scope) <= 25, f"Scope too long: '{clean_scope}'"
        assert ' ' not in clean_scope, f"Scope has spaces: '{clean_scope}'"
@pytest.mark.gpu
class TestEndToEndMessageQuality:
    """End-to-end test: analyze → format → sanitize → validate."""

    @staticmethod
    def _parse_text_field(response: str, label: str, default: str) -> str:
        """Return the free-text value that follows ``label:`` in *response*.

        Whitespace and asterisks directly after the colon are skipped so a
        bold label (``**SCOPE:** value``) yields ``value``. Surrounding
        asterisks and whitespace are stripped from the captured text.
        *default* is returned only when the label is not found at all.
        """
        found = re.search(rf'{label}:[\s*]*(.+)', response, re.IGNORECASE)
        if found is None:
            return default
        return found.group(1).strip().strip('*').strip()

    @pytest.mark.asyncio
    async def test_full_pipeline_produces_valid_message(self, llm_client) -> None:
        """Full two-stage pipeline should produce a message that passes validation.

        Fields missing from the analysis fall back to fixed defaults
        (type ``chore``/``refactor``, scope ``pipeline``/``format-utils``)
        so the format stage always receives a complete prompt.
        """
        from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt, get_all_types

        # Stage 1: Analyze with 14B
        type_list = format_type_list_for_prompt()
        analyze_system = f"""You are an expert at analyzing code changes.
**CHANGE TYPES**:
{type_list}
**SCOPE RULES**:
- 1-2 words, kebab-case, max 20 chars
- Extract from file paths
Provide:
TYPE: [type]
SCOPE: [short identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""
        analyze_prompt = """Analyze these file changes:
**Changed Files (2):**
- src/pipeline/stages/format.py
- src/pipeline/format_utils.py
The format stage was updated to apply scope sanitization after emoji correction,
and format_utils gained a new sanitize_message_scope function that extracts,
cleans, and re-inserts the scope in formatted commit messages.
Output TYPE, SCOPE, IMPACT, DESCRIPTION."""
        analyze_response = await llm_client.analyze_commit(
            prompt=analyze_prompt,
            system_prompt=analyze_system,
            max_tokens=512,
        )

        # Parse analysis. For TYPE, decoration between the colon and the
        # value (whitespace, asterisks, backticks, "[") is skipped, then the
        # token is cut at the first decoration/punctuation character, so
        # "**TYPE:** feat", "TYPE: [feat]" and "TYPE: feat(auth)" all give
        # "feat" instead of silently falling back.
        type_match = re.search(r'TYPE:[\s*`\[]*(\S+)', analyze_response, re.IGNORECASE)
        commit_type = "chore"
        if type_match:
            token = re.split(r'[*`\]:(\[]', type_match.group(1).lower())[0]
            commit_type = token if token else "chore"

        raw_scope = self._parse_text_field(analyze_response, 'SCOPE', "pipeline")
        description = self._parse_text_field(
            analyze_response, 'DESCRIPTION', "Update format pipeline"
        )
        # A missing IMPACT reuses the description.
        impact = self._parse_text_field(analyze_response, 'IMPACT', description)

        valid_types = set(get_all_types())
        if commit_type not in valid_types:
            commit_type = "refactor"
        clean_scope = sanitize_scope(raw_scope) or "format-utils"

        # Stage 2: Format with 3B
        format_system = build_format_system_prompt()
        format_prompt = f"""Format this analysis into a conventional commit message:
**Type:** {commit_type}
**Scope:** {clean_scope}
**Impact:** {impact}
**Description:** {description}
**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description
Output ONE LINE only."""
        format_response = await llm_client.format_commit_message(
            prompt=format_prompt,
            system_prompt=format_system,
            max_tokens=150,
        )
        raw = extract_commit_message(format_response)
        final = sanitize_message_scope(correct_emoji(sanitize_message(raw)))

        # Verify the final message
        _assert_conventional_commit(final)
        result = validate_commit_message(final)
        assert result.valid, (
            f"End-to-end message failed quality validation.\n"
            f"Message: {final}\n"
            f"Violations: {result.violations}\n"
            f"14B response: {analyze_response[:300]}\n"
            f"3B response: {format_response[:200]}"
        )