auto-commit-service/tests/test_commit_quality.py

390 lines
14 KiB
Python
Raw Normal View History

"""GPU integration tests for commit message quality.
These tests exercise the full analyze → format pipeline with real LLM inference
to verify that commit messages meet quality standards.
Run with: pytest -m gpu tests/test_commit_quality.py -v
"""
import pytest
from auto_commit_service.config import AutoCommitSettings
from auto_commit_service.pipeline.format_utils import (
build_format_system_prompt,
correct_emoji,
extract_commit_message,
sanitize_message,
sanitize_message_scope,
sanitize_scope,
)
from auto_commit_service.pipeline.gitmoji import get_valid_types_regex
from auto_commit_service.llm.validator import validate_commit_message
import re
@pytest.fixture
async def llm_client(gpu_services: dict[str, str], gpu_settings: AutoCommitSettings):
    """Yield an entered ``MultiModelLlamaClient`` for the duration of a test.

    The client is constructed with ``gpu_settings.llm_timeout`` as its timeout
    and is used as an async context manager around the ``yield``, so it is
    exited once the requesting test is done with it.  ``gpu_services`` is not
    read in this body; it is only requested as a fixture.
    """
    # Imported inside the fixture body rather than at module level.
    from auto_commit_service.llm.multi_model_client import MultiModelLlamaClient

    request_timeout = gpu_settings.llm_timeout
    multi_model = MultiModelLlamaClient(timeout=request_timeout)
    async with multi_model:
        yield multi_model
def _assert_conventional_commit(message: str) -> None:
    """Assert that ``message`` is a one-line conventional commit with a clean scope.

    The expected shape is ``type(scope): emoji description``; the ``(scope)``
    part is optional.  When a scope is present it must:

    * be at most 25 characters long,
    * contain no spaces,
    * not be one of the generic names ``src``, ``lib``, ``app``, ``code``, ``files``,
    * contain no stop word among its ``-``-separated parts.

    Args:
        message: The final commit message to check.

    Raises:
        AssertionError: If the message does not match the expected shape, or
            its scope breaks one of the rules above.
    """
    # The scope is captured by the same pattern that validates the header.
    # Re-scanning the whole message for "(...):" would mistake parenthesised
    # text in the description (e.g. "fix: 🐛 handle the (for): case") for a scope.
    match = re.match(r'^(\S+?)(?:\(([^)]+)\))?:\s*(\S+)\s+(.+)$', message)
    assert match, f"Not a conventional commit: {message}"

    scope = match.group(2)
    if scope is None:
        # No "(scope)" in the header: nothing further to check.
        return

    # Scope quality checks
    assert len(scope) <= 25, f"Scope too long ({len(scope)}): {scope}"
    assert ' ' not in scope, f"Scope contains spaces: {scope}"
    assert scope.lower() not in {'src', 'lib', 'app', 'code', 'files'}, (
        f"Scope too generic: {scope}"
    )

    # No stop words should remain in any kebab-case segment of the scope.
    stop_words = {'the', 'a', 'an', 'is', 'for', 'in', 'of', 'to', 'with', 'as', 'its'}
    scope_words = set(scope.lower().split('-'))
    leaked = scope_words & stop_words
    assert not leaked, f"Stop words leaked into scope: {leaked} in '{scope}'"
@pytest.mark.gpu
class TestFormatStageQuality:
    """Quality checks for commit messages produced by the 3B format model."""

    @staticmethod
    async def _format_once(llm_client, system_prompt: str, prompt: str) -> str:
        """Request one formatted message and return it after post-processing.

        Calls ``format_commit_message`` with ``max_tokens=150``, then applies
        ``extract_commit_message``, ``sanitize_message``, ``correct_emoji`` and
        ``sanitize_message_scope`` in that order.
        """
        response = await llm_client.format_commit_message(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=150,
        )
        extracted = extract_commit_message(response)
        sanitized = sanitize_message(extracted)
        with_emoji = correct_emoji(sanitized)
        return sanitize_message_scope(with_emoji)

    @pytest.mark.asyncio
    async def test_simple_feature_message(self, llm_client) -> None:
        """A ``feat`` analysis should come back as a clean, valid message."""
        system_prompt = build_format_system_prompt()
        prompt = """Format this analysis into a conventional commit message:
**Type:** feat
**Scope:** auth
**Impact:** Added OAuth2 login support
**Description:** Add OAuth2 social login with Google and GitHub providers
**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description
Output ONE LINE only."""

        message = await self._format_once(llm_client, system_prompt, prompt)

        _assert_conventional_commit(message)
        verdict = validate_commit_message(message)
        assert verdict.valid, f"Quality validation failed: {verdict.violations}"

    @pytest.mark.asyncio
    async def test_chore_config_message(self, llm_client) -> None:
        """A ``chore`` analysis for config changes should yield a clean message.

        Quality validation is attempted up to 3 times, stopping at the first
        valid result.  The conventional-commit shape is asserted on every
        attempt, so a malformed message fails immediately.
        """
        system_prompt = build_format_system_prompt()
        prompt = """Format this analysis into a conventional commit message:
**Type:** chore
**Scope:** config
**Impact:** Updated ESLint and TypeScript configuration
**Description:** Update ESLint rules to enforce strict type checking
**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description
Output ONE LINE only."""

        verdict = None
        for _ in range(3):
            message = await self._format_once(llm_client, system_prompt, prompt)
            _assert_conventional_commit(message)
            verdict = validate_commit_message(message)
            if verdict.valid:
                break

        assert verdict and verdict.valid, (
            f"Quality validation failed after 3 attempts: {verdict.violations}"
        )

    @pytest.mark.asyncio
    async def test_deps_upgrade_message(self, llm_client) -> None:
        """A ``deps-upgrade`` analysis should come back as a clean, valid message."""
        system_prompt = build_format_system_prompt()
        prompt = """Format this analysis into a conventional commit message:
**Type:** deps-upgrade
**Scope:** npm
**Impact:** Bumped vite from 5.2 to 6.0
**Description:** Upgrade vite to v6.0 for improved build performance
**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description
Output ONE LINE only."""

        message = await self._format_once(llm_client, system_prompt, prompt)

        _assert_conventional_commit(message)
        verdict = validate_commit_message(message)
        assert verdict.valid, f"Quality validation failed: {verdict.violations}"

    @pytest.mark.asyncio
    async def test_scope_sanitization_catches_bad_llm_scope(self, llm_client) -> None:
        """A deliberately bad scope in the prompt must be cleaned by the pipeline."""
        system_prompt = build_format_system_prompt()
        # The scope below is natural language on purpose, to exercise
        # sanitize_message_scope on whatever the model echoes back.
        prompt = """Format this analysis into a conventional commit message:
**Type:** refactor
**Scope:** the primary authentication module for users
**Impact:** Extracted shared validation into utility
**Description:** Extract validation logic from auth handlers into shared utility
**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description
Use the type "refactor" and scope "the primary authentication module for users" provided.
Output ONE LINE only."""

        message = await self._format_once(llm_client, system_prompt, prompt)
        _assert_conventional_commit(message)

        # Only the scope length is checked here; it applies when a scope is present.
        if (found := re.match(r'^[^(]+\(([^)]+)\):', message)):
            scope = found.group(1)
            assert len(scope) <= 25, f"Scope too long after sanitization: {scope}"
@pytest.mark.gpu
class TestAnalyzeStageQuality:
    """Checks that the 14B analyze model returns structured, clean analyses."""

    @pytest.mark.asyncio
    async def test_analyze_produces_valid_type(self, llm_client) -> None:
        """The TYPE field of the analysis must be one of the known commit types."""
        from auto_commit_service.pipeline.gitmoji import (
            format_type_list_for_prompt,
            get_all_types,
        )

        type_list = format_type_list_for_prompt()
        system_prompt = f"""You are an expert at analyzing code changes.
**CHANGE TYPES** (choose the most specific):
{type_list}
Provide your analysis in this format:
TYPE: [type from list]
SCOPE: [1-2 word identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""
        prompt = """Analyze these file changes:
**Changed Files (2):**
- src/auth/login.ts
- src/auth/oauth.ts
These files implement OAuth2 social login with Google and GitHub providers.
Output TYPE, SCOPE, IMPACT, DESCRIPTION."""

        analysis = await llm_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=512,
        )

        # Pull the first token after "TYPE:" out of the free-form response.
        found = re.search(r'TYPE:\s*(\S+)', analysis, re.IGNORECASE)
        assert found, f"No TYPE found in response: {analysis[:200]}"

        token = found.group(1)
        # Lower-case, then drop surrounding markdown asterisks.
        commit_type = token.lower().strip('*')
        known_types = set(get_all_types())
        assert commit_type in known_types, (
            f"Invalid type '{commit_type}' not in valid set. Response: {analysis[:200]}"
        )

    @pytest.mark.asyncio
    async def test_analyze_scope_is_clean(self, llm_client) -> None:
        """The SCOPE field, once run through ``sanitize_scope``, is a clean identifier."""
        from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt

        type_list = format_type_list_for_prompt()
        system_prompt = f"""You are an expert at analyzing code changes.
**CHANGE TYPES**:
{type_list}
**SCOPE RULES** (CRITICAL - follow exactly):
- Scope MUST be 1-2 words, kebab-case, max 20 characters
- GOOD scopes: "auth", "cli", "pipeline", "api-routes"
- BAD scopes: "src" (too generic), "the auth module" (natural language)
- NEVER write natural language in the scope
Provide your analysis in this format:
TYPE: [type]
SCOPE: [short identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""
        prompt = """Analyze these file changes:
**Changed Files (3):**
- packages/ui-theme/src/adapters/cyberpunk-adapter.ts
- packages/ui-theme/src/adapters/luxe-adapter.ts
- packages/ui-theme/src/types/ThemeInterface.ts
The adapters were updated to include new opacity tokens and the ThemeInterface
was extended with an optional opacity field.
Output TYPE, SCOPE, IMPACT, DESCRIPTION."""

        analysis = await llm_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=512,
        )

        # Take the rest of the line after "SCOPE:" as the raw scope.
        found = re.search(r'SCOPE:\s*(.+)', analysis, re.IGNORECASE)
        assert found, f"No SCOPE found in response: {analysis[:200]}"

        raw_scope = found.group(1).strip()
        raw_scope = raw_scope.strip('*')
        clean_scope = sanitize_scope(raw_scope)

        assert clean_scope, f"Scope sanitized to empty from raw: '{raw_scope}'"
        assert len(clean_scope) <= 25, f"Scope too long: '{clean_scope}'"
        assert ' ' not in clean_scope, f"Scope has spaces: '{clean_scope}'"
@pytest.mark.gpu
class TestEndToEndMessageQuality:
    """Two-stage run: analyze → format → sanitize → validate."""

    @pytest.mark.asyncio
    async def test_full_pipeline_produces_valid_message(self, llm_client) -> None:
        """Chaining the analyze and format stages must yield a message that validates."""
        from auto_commit_service.pipeline.gitmoji import (
            format_type_list_for_prompt,
            get_all_types,
        )

        # ---- Stage 1: analyze (14B) ----
        type_list = format_type_list_for_prompt()
        analyze_system = f"""You are an expert at analyzing code changes.
**CHANGE TYPES**:
{type_list}
**SCOPE RULES**:
- 1-2 words, kebab-case, max 20 chars
- Extract from file paths
Provide:
TYPE: [type]
SCOPE: [short identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""
        analyze_prompt = """Analyze these file changes:
**Changed Files (2):**
- src/pipeline/stages/format.py
- src/pipeline/format_utils.py
The format stage was updated to apply scope sanitization after emoji correction,
and format_utils gained a new sanitize_message_scope function that extracts,
cleans, and re-inserts the scope in formatted commit messages.
Output TYPE, SCOPE, IMPACT, DESCRIPTION."""
        analyze_response = await llm_client.analyze_commit(
            prompt=analyze_prompt,
            system_prompt=analyze_system,
            max_tokens=512,
        )

        # ---- Parse the analysis, falling back to fixed defaults per field ----
        type_hit = re.search(r'TYPE:\s*(\S+)', analyze_response, re.IGNORECASE)
        scope_hit = re.search(r'SCOPE:\s*(.+)', analyze_response, re.IGNORECASE)
        desc_hit = re.search(r'DESCRIPTION:\s*(.+)', analyze_response, re.IGNORECASE)
        impact_hit = re.search(r'IMPACT:\s*(.+)', analyze_response, re.IGNORECASE)

        if type_hit:
            commit_type = type_hit.group(1).strip().strip('*`').lower()
        else:
            commit_type = "chore"
        if commit_type:
            # Keep only the leading token, cutting at whitespace, ':', '(' or '['.
            commit_type = re.split(r'[\s:(\[]+', commit_type)[0]
        else:
            commit_type = "chore"

        if scope_hit:
            raw_scope = scope_hit.group(1).strip().strip('*')
        else:
            raw_scope = "pipeline"

        if desc_hit:
            description = desc_hit.group(1).strip().strip('*')
        else:
            description = "Update format pipeline"

        # A missing IMPACT line falls back to the description.
        if impact_hit:
            impact = impact_hit.group(1).strip().strip('*')
        else:
            impact = description

        valid_types = set(get_all_types())
        if commit_type not in valid_types:
            commit_type = "refactor"

        clean_scope = sanitize_scope(raw_scope) or "format-utils"

        # ---- Stage 2: format (3B) ----
        format_system = build_format_system_prompt()
        format_prompt = f"""Format this analysis into a conventional commit message:
**Type:** {commit_type}
**Scope:** {clean_scope}
**Impact:** {impact}
**Description:** {description}
**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description
Output ONE LINE only."""
        format_response = await llm_client.format_commit_message(
            prompt=format_prompt,
            system_prompt=format_system,
            max_tokens=150,
        )

        extracted = extract_commit_message(format_response)
        sanitized = sanitize_message(extracted)
        final = sanitize_message_scope(correct_emoji(sanitized))

        # ---- Verify the final message ----
        _assert_conventional_commit(final)
        verdict = validate_commit_message(final)
        assert verdict.valid, (
            f"End-to-end message failed quality validation.\n"
            f"Message: {final}\n"
            f"Violations: {verdict.violations}\n"
            f"14B response: {analyze_response[:300]}\n"
            f"3B response: {format_response[:200]}"
        )