389 lines
14 KiB
Python
389 lines
14 KiB
Python
"""GPU integration tests for commit message quality.
|
|
|
|
These tests exercise the full analyze→format pipeline with real LLM inference
|
|
to verify that commit messages meet quality standards.
|
|
|
|
Run with: pytest -m gpu tests/test_commit_quality.py -v
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from auto_commit_service.config import AutoCommitSettings
|
|
from auto_commit_service.pipeline.format_utils import (
|
|
build_format_system_prompt,
|
|
correct_emoji,
|
|
extract_commit_message,
|
|
sanitize_message,
|
|
sanitize_message_scope,
|
|
sanitize_scope,
|
|
)
|
|
from auto_commit_service.pipeline.gitmoji import get_valid_types_regex
|
|
from auto_commit_service.llm.validator import validate_commit_message
|
|
|
|
import re
|
|
|
|
|
|
@pytest.fixture
async def llm_client(gpu_services: dict[str, str], gpu_settings: AutoCommitSettings):
    """Yield an opened ``MultiModelLlamaClient`` for a single test.

    The client is constructed with ``gpu_settings.llm_timeout`` and entered as
    an async context manager around the ``yield``, so it is exited once the
    requesting test finishes.

    ``gpu_services`` is requested but never read in this fixture; presumably
    it is listed so the GPU-backed services are set up first -- confirm
    against the conftest that defines it.
    """
    # Imported inside the fixture rather than at module import time.
    from auto_commit_service.llm.multi_model_client import MultiModelLlamaClient

    request_timeout = gpu_settings.llm_timeout
    multi_model = MultiModelLlamaClient(timeout=request_timeout)

    # Deliberately no ``as`` target: the object handed to tests is the
    # instance built above, not whatever ``__aenter__`` returns.
    async with multi_model:
        yield multi_model
|
|
|
|
|
|
# One-pass parse of ``type(scope)!: emoji description``.  The type token may
# not contain parentheses or a colon, so a malformed scope can no longer be
# absorbed into the type and slip past the scope checks below.
_COMMIT_LINE_RE = re.compile(
    r'^(?P<type>[^\s():]+)'
    r'(?:\((?P<scope>[^)]*)\))?'
    r'!?:\s*(?P<emoji>\S+)\s+(?P<description>.+)$'
)
_MAX_SCOPE_LENGTH = 25
_GENERIC_SCOPES = frozenset({'src', 'lib', 'app', 'code', 'files'})
_SCOPE_STOP_WORDS = frozenset(
    {'the', 'a', 'an', 'is', 'for', 'in', 'of', 'to', 'with', 'as', 'its'}
)


def _assert_conventional_commit(message: str) -> None:
    """Assert that ``message`` is a conventional commit line with a clean scope.

    The expected shape is ``type(scope): emoji description``.  The scope is
    optional, and a ``!`` breaking-change marker before the colon is tolerated.
    The emoji slot accepts any whitespace-free token.

    When a scope is present it must be:

    * non-empty,
    * at most 25 characters long,
    * free of spaces,
    * not one of the generic names ``src``/``lib``/``app``/``code``/``files``,
    * free of stop words in any of its ``-``-separated parts.

    Args:
        message: The final, already sanitized commit message.

    Raises:
        AssertionError: If the message shape or the scope violates any rule.
    """
    match = _COMMIT_LINE_RE.match(message)
    assert match, f"Not a conventional commit: {message}"

    # The scope is read from the same match that validated the line, so the
    # shape check and the scope checks cannot disagree about what the scope is.
    scope = match.group('scope')
    if scope is None:
        return

    assert scope, f"Empty scope: {message}"
    assert len(scope) <= _MAX_SCOPE_LENGTH, f"Scope too long ({len(scope)}): {scope}"
    assert ' ' not in scope, f"Scope contains spaces: {scope}"
    assert scope.lower() not in _GENERIC_SCOPES, (
        f"Scope too generic: {scope}"
    )

    # No stop words should remain in any kebab-case segment.
    leaked = set(scope.lower().split('-')) & _SCOPE_STOP_WORDS
    assert not leaked, f"Stop words leaked into scope: {leaked} in '{scope}'"
|
|
|
|
|
|
def _build_format_prompt(
    commit_type: str,
    scope: str,
    impact: str,
    description: str,
    extra_instruction: str = "",
) -> str:
    """Render the user prompt sent to the format model.

    Args:
        commit_type: Value for the ``**Type:**`` line.
        scope: Value for the ``**Scope:**`` line.
        impact: Value for the ``**Impact:**`` line.
        description: Value for the ``**Description:**`` line.
        extra_instruction: Optional line inserted directly above the closing
            ``Output ONE LINE only.`` instruction.

    Returns:
        The prompt text, lines joined with ``\\n`` and no trailing newline.
    """
    lines = [
        "Format this analysis into a conventional commit message:",
        "",
        f"**Type:** {commit_type}",
        f"**Scope:** {scope}",
        f"**Impact:** {impact}",
        f"**Description:** {description}",
        "",
        "**Output Format (CRITICAL - use parentheses):**",
        "type(scope): emoji description",
        "",
    ]
    if extra_instruction:
        lines.append(extra_instruction)
    lines.append("Output ONE LINE only.")
    return "\n".join(lines)


async def _run_format_stage(llm_client, prompt: str) -> str:
    """Call the format model once and return the post-processed message.

    The raw response goes through ``extract_commit_message``, then
    ``sanitize_message``, ``correct_emoji`` and ``sanitize_message_scope``,
    in that order.
    """
    response = await llm_client.format_commit_message(
        prompt=prompt,
        system_prompt=build_format_system_prompt(),
        max_tokens=150,
    )
    raw = extract_commit_message(response)
    return sanitize_message_scope(correct_emoji(sanitize_message(raw)))


@pytest.mark.gpu
class TestFormatStageQuality:
    """Test that the 3B format model produces clean commit messages."""

    @pytest.mark.asyncio
    async def test_simple_feature_message(self, llm_client) -> None:
        """3B model should produce a clean feat message."""
        prompt = _build_format_prompt(
            "feat",
            "auth",
            "Added OAuth2 login support",
            "Add OAuth2 social login with Google and GitHub providers",
        )

        final = await _run_format_stage(llm_client, prompt)

        _assert_conventional_commit(final)
        result = validate_commit_message(final)
        assert result.valid, f"Quality validation failed: {result.violations}"

    @pytest.mark.asyncio
    async def test_chore_config_message(self, llm_client) -> None:
        """3B model should produce a clean chore message for config changes.

        Retries up to 3 times since the 3B model can be non-deterministic.
        An attempt counts as failed when the message is either not a clean
        conventional commit or rejected by ``validate_commit_message``; the
        test fails only if every attempt fails.
        """
        max_attempts = 3
        prompt = _build_format_prompt(
            "chore",
            "config",
            "Updated ESLint and TypeScript configuration",
            "Update ESLint rules to enforce strict type checking",
        )

        failures: list[str] = []
        for attempt in range(1, max_attempts + 1):
            final = await _run_format_stage(llm_client, prompt)

            # A malformed message must consume one attempt rather than abort
            # the whole test, otherwise the retry budget is never used.
            try:
                _assert_conventional_commit(final)
            except AssertionError as exc:
                failures.append(f"attempt {attempt}: {exc}")
                continue

            result = validate_commit_message(final)
            if result.valid:
                return
            failures.append(f"attempt {attempt}: {result.violations} in {final!r}")

        pytest.fail(
            f"Quality validation failed after {max_attempts} attempts:\n"
            + "\n".join(failures)
        )

    @pytest.mark.asyncio
    async def test_deps_upgrade_message(self, llm_client) -> None:
        """3B model should produce a clean deps-upgrade message."""
        prompt = _build_format_prompt(
            "deps-upgrade",
            "npm",
            "Bumped vite from 5.2 to 6.0",
            "Upgrade vite to v6.0 for improved build performance",
        )

        final = await _run_format_stage(llm_client, prompt)

        _assert_conventional_commit(final)
        result = validate_commit_message(final)
        assert result.valid, f"Quality validation failed: {result.violations}"

    @pytest.mark.asyncio
    async def test_scope_sanitization_catches_bad_llm_scope(self, llm_client) -> None:
        """Even if we feed a bad scope, sanitize_message_scope should clean it."""
        # Intentionally provide a bad scope to see if the pipeline cleans it.
        bad_scope = "the primary authentication module for users"
        prompt = _build_format_prompt(
            "refactor",
            bad_scope,
            "Extracted shared validation into utility",
            "Extract validation logic from auth handlers into shared utility",
            extra_instruction=(
                f'Use the type "refactor" and scope "{bad_scope}" provided.'
            ),
        )

        final = await _run_format_stage(llm_client, prompt)

        # Rejects spaces, stop words and generic names in the scope.
        _assert_conventional_commit(final)

        # Re-check the length bound with a sanitization-specific message.  A
        # message without any scope is tolerated here, as before.
        scope_match = re.match(r'^[^(]+\(([^)]+)\):', final)
        if scope_match:
            scope = scope_match.group(1)
            assert len(scope) <= 25, f"Scope too long after sanitization: {scope}"
|
|
|
|
|
|
@pytest.mark.gpu
class TestAnalyzeStageQuality:
    """Test that the 14B analyze model produces structured, clean analyses."""

    @pytest.mark.asyncio
    async def test_analyze_produces_valid_type(self, llm_client) -> None:
        """14B model should output a valid commit type from the gitmoji set."""
        from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt, get_all_types

        type_list = format_type_list_for_prompt()
        system_prompt = f"""You are an expert at analyzing code changes.

**CHANGE TYPES** (choose the most specific):
{type_list}

Provide your analysis in this format:
TYPE: [type from list]
SCOPE: [1-2 word identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""

        prompt = """Analyze these file changes:

**Changed Files (2):**
- src/auth/login.ts
- src/auth/oauth.ts

These files implement OAuth2 social login with Google and GitHub providers.

Output TYPE, SCOPE, IMPACT, DESCRIPTION."""

        response = await llm_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=512,
        )

        # Parse TYPE from the response.  Markdown decoration around the label
        # or the value (``**TYPE:** feat``, backticks, ``[feat]``) is skipped
        # so the check is about the chosen type, not the model's formatting.
        type_match = re.search(
            r'TYPE:[\s*`\[]*([A-Za-z][\w-]*)', response, re.IGNORECASE
        )
        assert type_match, f"No TYPE found in response: {response[:200]}"
        commit_type = type_match.group(1).lower()

        valid_types = set(get_all_types())
        assert commit_type in valid_types, (
            f"Invalid type '{commit_type}' not in valid set. Response: {response[:200]}"
        )

    @pytest.mark.asyncio
    async def test_analyze_scope_is_clean(self, llm_client) -> None:
        """14B model scope output, after sanitization, should be a clean identifier."""
        from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt

        type_list = format_type_list_for_prompt()
        system_prompt = f"""You are an expert at analyzing code changes.

**CHANGE TYPES**:
{type_list}

**SCOPE RULES** (CRITICAL - follow exactly):
- Scope MUST be 1-2 words, kebab-case, max 20 characters
- GOOD scopes: "auth", "cli", "pipeline", "api-routes"
- BAD scopes: "src" (too generic), "the auth module" (natural language)
- NEVER write natural language in the scope

Provide your analysis in this format:
TYPE: [type]
SCOPE: [short identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""

        prompt = """Analyze these file changes:

**Changed Files (3):**
- packages/ui-theme/src/adapters/cyberpunk-adapter.ts
- packages/ui-theme/src/adapters/luxe-adapter.ts
- packages/ui-theme/src/types/ThemeInterface.ts

The adapters were updated to include new opacity tokens and the ThemeInterface
was extended with an optional opacity field.

Output TYPE, SCOPE, IMPACT, DESCRIPTION."""

        response = await llm_client.analyze_commit(
            prompt=prompt,
            system_prompt=system_prompt,
            max_tokens=512,
        )

        scope_match = re.search(r'SCOPE:\s*(.+)', response, re.IGNORECASE)
        assert scope_match, f"No SCOPE found in response: {response[:200]}"

        # The trailing strip() removes the space left behind when the label
        # itself was bold (``**SCOPE:** ui-theme`` captures ``** ui-theme``).
        raw_scope = scope_match.group(1).strip().strip('*').strip()
        clean_scope = sanitize_scope(raw_scope)

        assert clean_scope, f"Scope sanitized to empty from raw: '{raw_scope}'"
        assert len(clean_scope) <= 25, f"Scope too long: '{clean_scope}'"
        assert ' ' not in clean_scope, f"Scope has spaces: '{clean_scope}'"
|
|
|
|
|
|
@pytest.mark.gpu
class TestEndToEndMessageQuality:
    """Chain analyze -> format -> sanitize -> validate in one test."""

    @pytest.mark.asyncio
    async def test_full_pipeline_produces_valid_message(self, llm_client) -> None:
        """Run both model stages and require the cleaned message to validate.

        Fields that cannot be parsed from the analysis fall back to fixed
        defaults, so the format stage always receives a complete prompt.
        """
        from auto_commit_service.pipeline.gitmoji import format_type_list_for_prompt, get_all_types

        # ---- Stage 1: analyze (14B) ----
        type_list = format_type_list_for_prompt()
        analyze_system = f"""You are an expert at analyzing code changes.

**CHANGE TYPES**:
{type_list}

**SCOPE RULES**:
- 1-2 words, kebab-case, max 20 chars
- Extract from file paths

Provide:
TYPE: [type]
SCOPE: [short identifier]
IMPACT: [summary]
DESCRIPTION: [specific description]"""

        analyze_prompt = """Analyze these file changes:

**Changed Files (2):**
- src/pipeline/stages/format.py
- src/pipeline/format_utils.py

The format stage was updated to apply scope sanitization after emoji correction,
and format_utils gained a new sanitize_message_scope function that extracts,
cleans, and re-inserts the scope in formatted commit messages.

Output TYPE, SCOPE, IMPACT, DESCRIPTION."""

        analyze_response = await llm_client.analyze_commit(
            prompt=analyze_prompt,
            system_prompt=analyze_system,
            max_tokens=512,
        )

        def _capture(pattern: str):
            """Return group 1 of the first case-insensitive hit, else None."""
            found = re.search(pattern, analyze_response, re.IGNORECASE)
            return found.group(1) if found else None

        type_text = _capture(r'TYPE:\s*(\S+)')
        scope_text = _capture(r'SCOPE:\s*(.+)')
        desc_text = _capture(r'DESCRIPTION:\s*(.+)')
        impact_text = _capture(r'IMPACT:\s*(.+)')

        # Type: drop markdown decoration, then keep only the leading token.
        if type_text is None:
            commit_type = "chore"
        else:
            commit_type = type_text.strip().strip('*`').lower()
        if commit_type:
            commit_type = re.split(r'[\s:(\[]+', commit_type)[0]
        else:
            commit_type = "chore"

        if scope_text is None:
            raw_scope = "pipeline"
        else:
            raw_scope = scope_text.strip().strip('*')

        if desc_text is None:
            description = "Update format pipeline"
        else:
            description = desc_text.strip().strip('*')

        # A missing impact line reuses the description.
        if impact_text is None:
            impact = description
        else:
            impact = impact_text.strip().strip('*')

        if commit_type not in set(get_all_types()):
            commit_type = "refactor"

        clean_scope = sanitize_scope(raw_scope)
        if not clean_scope:
            clean_scope = "format-utils"

        # ---- Stage 2: format (3B) ----
        format_system = build_format_system_prompt()
        format_prompt = f"""Format this analysis into a conventional commit message:

**Type:** {commit_type}
**Scope:** {clean_scope}
**Impact:** {impact}
**Description:** {description}

**Output Format (CRITICAL - use parentheses):**
type(scope): emoji description

Output ONE LINE only."""

        format_response = await llm_client.format_commit_message(
            prompt=format_prompt,
            system_prompt=format_system,
            max_tokens=150,
        )

        # Same post-processing order as the format-stage tests.
        extracted = extract_commit_message(format_response)
        sanitized = sanitize_message(extracted)
        with_emoji = correct_emoji(sanitized)
        final = sanitize_message_scope(with_emoji)

        # ---- Verdict ----
        _assert_conventional_commit(final)
        result = validate_commit_message(final)
        assert result.valid, (
            f"End-to-end message failed quality validation.\n"
            f"Message: {final}\n"
            f"Violations: {result.violations}\n"
            f"14B response: {analyze_response[:300]}\n"
            f"3B response: {format_response[:200]}"
        )
|