# auto-commit-service/tests/test_format_utils.py

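"""Tests for commit message formatting utilities.

Covers the format_utils helpers (sanitize_scope, sanitize_message_scope,
sanitize_message, correct_emoji, auto_correct_format and
extract_commit_message), validate_commit_message's handling of the
GITMOJI_MAP commit types and of scopes, and the validator-driven retry in
FormatCommitMessageStage.
"""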

import pytest

from auto_commit_service.pipeline.format_utils import (
    auto_correct_format,
    correct_emoji,
    extract_commit_message,
    sanitize_message,
    sanitize_message_scope,
    sanitize_scope,
)
from auto_commit_service.pipeline.gitmoji import GITMOJI_MAP
from auto_commit_service.llm.validator import validate_commit_message


class TestSanitizeScope:
    """sanitize_scope must hand back a plain identifier — no emoji, no parens."""

    @pytest.mark.parametrize(
        "raw, expected",
        [
            (scope, scope)
            for scope in ("auth", "api-routes", "pipeline.stages", "core_module")
        ],
    )
    def test_passthrough_clean_scopes(self, raw: str, expected: str) -> None:
        """Scopes that are already clean come back unchanged."""
        cleaned = sanitize_scope(raw)
        assert cleaned == expected

    @pytest.mark.parametrize(
        "raw, expected",
        [
            ("✨ auth", "auth"),
            ("🔧config", "config"),
            ("🐛 api-routes", "api-routes"),
            # "new" is a stop word, so only "feature" survives
            ("✨🎉 new-feature", "feature"),
            ("⚡ perf", "perf"),
        ],
    )
    def test_strips_emoji(self, raw: str, expected: str) -> None:
        """Leading emoji are removed, with or without a separating space."""
        cleaned = sanitize_scope(raw)
        assert cleaned == expected

    def test_strips_parentheses(self) -> None:
        """The stop word "with" must not survive, yet some scope text must remain."""
        cleaned = sanitize_scope("core(with react integration)")
        # NOTE(review): nothing here asserts that "(" / ")" are absent from the
        # result, despite the test name — consider adding that check once the
        # expected output is confirmed against the implementation.
        tokens = cleaned.split("-")
        assert "with" not in tokens
        assert cleaned  # a non-empty scope is still expected

    def test_emoji_only_returns_empty(self) -> None:
        """A scope made of nothing but an emoji collapses to the empty string."""
        cleaned = sanitize_scope("✨")
        assert cleaned == ""

    def test_empty_input(self) -> None:
        """Empty in, empty out."""
        cleaned = sanitize_scope("")
        assert cleaned == ""

    def test_truncates_long_scope(self) -> None:
        """Without an explicit limit the result is at most 25 characters long."""
        wordy = "this is a really long scope value that exceeds the limit"
        cleaned = sanitize_scope(wordy)
        assert len(cleaned) <= 25

    def test_custom_max_length(self) -> None:
        """An explicit max_length caps the length of the result."""
        cleaned = sanitize_scope("authentication", max_length=10)
        assert len(cleaned) <= 10

    def test_collapses_whitespace_to_hyphens(self) -> None:
        """Whitespace between words becomes a single hyphen."""
        cleaned = sanitize_scope("api routes")
        assert cleaned == "api-routes"

    # --- Stop word filtering ---

    def test_strips_stop_words_from_natural_language(self) -> None:
        """Stop words in space-separated prose are discarded."""
        cases = (
            ("the primary auth module", "auth"),
            ("for the config settings", "config-settings"),
        )
        for raw, expected in cases:
            assert sanitize_scope(raw) == expected

    def test_strips_stop_words_from_hyphenated(self) -> None:
        """Stop words are discarded from hyphen-joined input as well."""
        cases = (
            ("theme-primary-as-its-the", "theme"),
            ("ui-tiers-the-package-name-as", "ui-tiers"),
            ("config-settings-for-the-main", "config-settings"),
        )
        for raw, expected in cases:
            assert sanitize_scope(raw) == expected

    def test_empty_after_all_stop_words(self) -> None:
        """Input consisting solely of stop words yields the empty string."""
        cleaned = sanitize_scope("the a an is")
        assert cleaned == ""

    def test_rejects_module_file_package_words(self) -> None:
        """Trailing filler nouns such as "module" and "file" are dropped."""
        cases = (
            ("auth module", "auth"),
            ("config file", "config"),
        )
        for raw, expected in cases:
            assert sanitize_scope(raw) == expected


class TestCorrectEmoji:
    """correct_emoji replaces the emoji with the GITMOJI_MAP entry for the type."""

    # (message carrying a mismatched emoji, message expected after correction)
    _MISMATCHED = [
        ("feat(auth): 🔧 Add login", "feat(auth): ✨ Add login"),
        ("fix(api): ✨ Resolve timeout", "fix(api): 🐛 Resolve timeout"),
        ("chore(config): 🐛 Update rules", "chore(config): 🔧 Update rules"),
        ("perf(db): 🔧 Optimize queries", "perf(db): ⚡ Optimize queries"),
        ("test(auth): 🔧 Add unit tests", "test(auth): ✅ Add unit tests"),
        ("refactor(core): 🔧 Extract utils", "refactor(core): ♻️ Extract utils"),
        ("docs(readme): 🔧 Add setup guide", "docs(readme): 📝 Add setup guide"),
    ]

    # Messages whose emoji already matches their commit type.
    _ALREADY_CANONICAL = [
        "feat(auth): ✨ Add login",
        "fix(api): 🐛 Resolve timeout",
        "chore(config): 🔧 Update ESLint rules",
        "refactor(core): ♻️ Extract validation logic",
        "docs(readme): 📝 Add installation guide",
        "test(auth): ✅ Add integration tests",
        "perf(query): ⚡ Optimize database lookup",
    ]

    @pytest.mark.parametrize("message, expected", _MISMATCHED)
    def test_corrects_wrong_emoji(self, message: str, expected: str) -> None:
        """A mismatched emoji is swapped for the one belonging to the type."""
        fixed = correct_emoji(message)
        assert fixed == expected

    @pytest.mark.parametrize("message", _ALREADY_CANONICAL)
    def test_preserves_correct_emoji(self, message: str) -> None:
        """A message that already carries the right emoji is returned as-is."""
        fixed = correct_emoji(message)
        assert fixed == message

    def test_handles_no_scope(self) -> None:
        """A scopeless message with the right emoji is left alone."""
        scopeless = "docs: 📝 Update README"
        assert correct_emoji(scopeless) == "docs: 📝 Update README"

    def test_passthrough_non_commit_format(self) -> None:
        """Text that is not in commit format, including "", passes through."""
        for text in ("random text here", ""):
            assert correct_emoji(text) == text

    @pytest.mark.parametrize("commit_type", list(GITMOJI_MAP))
    def test_all_gitmoji_types_have_correction(self, commit_type: str) -> None:
        """Every type in GITMOJI_MAP should produce the correct emoji."""
        placeholder = "🤖"  # stand-in for a wrong emoji
        canonical = GITMOJI_MAP[commit_type]
        original = f"{commit_type}(test): {placeholder} Do something useful here"
        wanted = f"{commit_type}(test): {canonical} Do something useful here"

        fixed = correct_emoji(original)

        assert fixed == wanted, (
            f"Type '{commit_type}' should produce {canonical}, got: {fixed}"
        )


class TestValidatorAcceptsAllGitmojiTypes:
    """The validator must accept every commit type from the gitmoji table."""

    @pytest.mark.parametrize("commit_type", list(GITMOJI_MAP))
    def test_validator_recognizes_type(self, commit_type: str) -> None:
        """A message built from a GITMOJI_MAP type and its emoji validates.

        validate_commit_message comes from the module-level import; the
        function-scope re-import that used to live here was redundant.
        """
        emoji = GITMOJI_MAP[commit_type]
        message = f"{commit_type}(core): {emoji} add meaningful feature implementation"
        result = validate_commit_message(message)
        assert result.valid, (
            f"Type '{commit_type}' should be valid, violations: {result.violations}"
        )


class TestFormatStageValidatorRetry:
    """FormatCommitMessageStage retries the LLM once when validation rejects a reply."""

    # Dotted path of the client factory that every async test patches.
    _LLM_CLIENT_TARGET = "auto_commit_service.pipeline.init.get_llm_client"

    @pytest.fixture
    def analysis(self):
        """Provide the chore/config CommitAnalysis used as input by each test."""
        from auto_commit_service.pipeline.models import CommitAnalysis

        fields = {
            "files": ["src/config.py"],
            "change_type": "chore",
            "scope": "config",
            "reasoning": "Updated config settings",
            "impact_summary": "Configuration cleanup",
            "suggested_description": "Refactor settings for clarity",
        }
        return CommitAnalysis(**fields)

    @pytest.fixture
    def format_stage(self):
        """Provide a default-constructed FormatCommitMessageStage."""
        from auto_commit_service.pipeline.stages.format import FormatCommitMessageStage

        stage = FormatCommitMessageStage()
        return stage

    async def _run_with_client(self, stage, commit_analysis, llm_client):
        """Call _format_commit_message while get_llm_client is patched to return llm_client."""
        from unittest.mock import patch

        with patch(self._LLM_CLIENT_TARGET, return_value=llm_client):
            return await stage._format_commit_message(commit_analysis)

    @pytest.mark.asyncio
    async def test_valid_first_attempt_no_retry(self, format_stage, analysis) -> None:
        """A first reply that passes validation is used after a single LLM call."""
        from unittest.mock import AsyncMock

        accepted = "chore(config): 🔧 refactor settings for service isolation"
        llm = AsyncMock()
        llm.format_commit_message = AsyncMock(return_value=accepted)

        outcome = await self._run_with_client(format_stage, analysis, llm)

        assert outcome.message == accepted
        llm.format_commit_message.assert_called_once()

    @pytest.mark.asyncio
    async def test_retry_on_invalid_first_attempt(self, format_stage, analysis) -> None:
        """A rejected first reply triggers a second call whose valid reply wins."""
        from unittest.mock import AsyncMock

        rejected = "chore(config): 🔧 Update 5 py files"
        accepted = "chore(config): 🔧 refactor settings for service isolation"
        llm = AsyncMock()
        llm.format_commit_message = AsyncMock(side_effect=[rejected, accepted])

        outcome = await self._run_with_client(format_stage, analysis, llm)

        assert outcome.message == accepted
        assert llm.format_commit_message.call_count == 2

    @pytest.mark.asyncio
    async def test_uses_retry_when_both_fail(self, format_stage, analysis) -> None:
        """If both replies fail validation, the second (retry) reply is the one kept."""
        from unittest.mock import AsyncMock

        first_reply = "chore(config): 🔧 Update 5 py files"
        second_reply = "chore(config): 🔧 Update configuration files"
        llm = AsyncMock()
        llm.format_commit_message = AsyncMock(side_effect=[first_reply, second_reply])

        outcome = await self._run_with_client(format_stage, analysis, llm)

        # The retry is kept even though it did not pass validation either.
        assert outcome.message == second_reply
        assert llm.format_commit_message.call_count == 2

    def test_retry_prompt_includes_violations(self, format_stage, analysis) -> None:
        """The retry prompt echoes the rejected message, each violation and the analysis text."""
        rejected_message = "chore(config): 🔧 Update 5 py files"
        feedback = [
            "Contains banned phrase matching: ^update\\s+\\d+\\s+(files?|py\\s+files?)",
            "Missing action verb",
        ]

        prompt = format_stage._build_retry_prompt(analysis, rejected_message, feedback)

        required_fragments = (
            "rejected by quality checks",
            "Update 5 py files",
            "banned phrase",
            "Missing action verb",
            analysis.impact_summary,
            analysis.suggested_description,
        )
        for fragment in required_fragments:
            assert fragment in prompt


class TestSanitizeMessageScope:
    """sanitize_message_scope cleans the scope inside an already formatted message."""

    def test_cleans_reasoning_leak_in_scope(self) -> None:
        """Prose that leaked into the scope is reduced to the meaningful token."""
        leaky = "feat(the primary auth module): ✨ Add login"
        assert sanitize_message_scope(leaky) == "feat(auth): ✨ Add login"

    def test_cleans_hyphenated_reasoning_in_scope(self) -> None:
        """Hyphen-joined filler in the scope is reduced the same way."""
        leaky = "deps-upgrade(theme-primary-as-its-the): ⬆️ Bump vite"
        cleaned = sanitize_message_scope(leaky)
        assert cleaned == "deps-upgrade(theme): ⬆️ Bump vite"

    def test_preserves_clean_scope(self) -> None:
        """A single-word scope is left as it is."""
        original = "feat(auth): ✨ Add OAuth2 login support"
        cleaned = sanitize_message_scope(original)
        assert cleaned == original

    def test_preserves_clean_kebab_scope(self) -> None:
        """A clean kebab-case scope is left as it is."""
        original = "refactor(pipeline-stages): ♻️ Extract shared logic"
        cleaned = sanitize_message_scope(original)
        assert cleaned == original

    def test_removes_scope_if_empty_after_sanitization(self) -> None:
        """When nothing is left of the scope, the parentheses go too."""
        only_stop_words = "chore(the a an is): 🔧 Update something"
        cleaned = sanitize_message_scope(only_stop_words)
        assert cleaned == "chore: 🔧 Update something"

    def test_no_scope_message_unchanged(self) -> None:
        """A message that has no scope comes back unchanged."""
        original = "chore: 🔧 Update something"
        cleaned = sanitize_message_scope(original)
        assert cleaned == original

    def test_non_matching_message_unchanged(self) -> None:
        """Text that is not a commit message comes back unchanged."""
        original = "not a commit message"
        cleaned = sanitize_message_scope(original)
        assert cleaned == original


class TestAutoCorrectFormatSeparators:
    """auto_correct_format repairs ``type/scope:`` and ``type-scope:`` prefixes.

    The class name is deliberately distinct from ``TestAutoCorrectFormat``: a
    second class with that name is defined later in this module, and that later
    definition rebinds the module attribute, so tests in an earlier class of the
    same name are never collected by pytest.
    """

    def test_fixes_slash_format(self) -> None:
        """``feat/auth:`` is rewritten to ``feat(auth):``."""
        result = auto_correct_format("feat/auth: ✨ Add login")
        assert result == "feat(auth): ✨ Add login"

    def test_fixes_dash_format(self) -> None:
        """``fix-api:`` is rewritten to ``fix(api):``."""
        result = auto_correct_format("fix-api: 🐛 Fix timeout")
        assert result == "fix(api): 🐛 Fix timeout"

    def test_preserves_correct_format(self) -> None:
        """A message already in ``type(scope):`` form is returned unchanged."""
        msg = "feat(auth): ✨ Add login"
        assert auto_correct_format(msg) == msg

    def test_preserves_non_commit_line(self) -> None:
        """A line that is not a commit message is returned unchanged."""
        msg = "This is just a regular line"
        assert auto_correct_format(msg) == msg


class TestExtractCommitMessage:
    """extract_commit_message picks the commit line out of a raw LLM response."""

    def test_extracts_from_single_line(self) -> None:
        """A response that is exactly one commit line is returned as-is."""
        reply = "feat(auth): ✨ Add OAuth2 login support"
        extracted = extract_commit_message(reply)
        assert extracted == reply

    def test_extracts_from_multiline_with_reasoning(self) -> None:
        """The commit line is found between paragraphs of surrounding prose."""
        reply = "\n".join(
            [
                "Let me analyze the changes.",
                "",
                "Based on the file changes, this is a feature addition.",
                "",
                "feat(auth): ✨ Add OAuth2 login support",
                "",
                "This adds the login endpoint.",
            ]
        )
        extracted = extract_commit_message(reply)
        assert extracted == "feat(auth): ✨ Add OAuth2 login support"

    def test_skips_reasoning_lines(self) -> None:
        """A commit-shaped line holding a **Reasoning:** marker is passed over."""
        reply = (
            "feat(auth): ✨ **Reasoning:** Based on analysis\n"
            "feat(auth): ✨ Add OAuth2 login support"
        )
        extracted = extract_commit_message(reply)
        assert extracted == "feat(auth): ✨ Add OAuth2 login support"

    def test_auto_corrects_slash_in_response(self) -> None:
        """A ``type/scope:`` prefix in the response is returned as ``type(scope):``."""
        reply = "feat/auth: ✨ Add OAuth2 login support"
        extracted = extract_commit_message(reply)
        assert extracted == "feat(auth): ✨ Add OAuth2 login support"

    def test_raises_on_no_valid_message(self) -> None:
        """With no commit line present, a RuntimeError matching "Failed to extract" is raised."""
        unusable = "Just some random text with no commit format"
        with pytest.raises(RuntimeError, match="Failed to extract"):
            extract_commit_message(unusable)


class TestSanitizeMessage:
    """sanitize_message strips markdown artifacts from a message."""

    def test_removes_bold_markers(self) -> None:
        """``**`` bold markers are removed."""
        cleaned = sanitize_message("**feat**: add login")
        assert cleaned == "feat: add login"

    def test_removes_code_markers(self) -> None:
        """Backtick code markers are removed."""
        cleaned = sanitize_message("`feat`: add login")
        assert cleaned == "feat: add login"

    def test_collapses_multiple_spaces(self) -> None:
        """Runs of spaces shrink to a single space."""
        cleaned = sanitize_message("feat:  add   login")
        assert cleaned == "feat: add login"


class TestValidatorScopeChecks:
    """Scope-related checks performed by validate_commit_message."""

    @staticmethod
    def _mentions(violations, keyword: str) -> bool:
        """Return True if any violation, lower-cased, contains *keyword*."""
        return any(keyword in violation.lower() for violation in violations)

    def test_rejects_generic_src_scope(self) -> None:
        """``src`` is rejected and a violation mentions "generic"."""
        verdict = validate_commit_message("feat(src): ✨ Add new feature")
        assert not verdict.valid
        assert self._mentions(verdict.violations, "generic")

    def test_rejects_generic_lib_scope(self) -> None:
        """``lib`` is rejected as a scope."""
        verdict = validate_commit_message("feat(lib): ✨ Add new feature")
        assert not verdict.valid

    def test_rejects_generic_app_scope(self) -> None:
        """``app`` is rejected as a scope."""
        verdict = validate_commit_message("feat(app): ✨ Add new feature")
        assert not verdict.valid

    def test_accepts_specific_scope(self) -> None:
        """A specific single-word scope validates."""
        verdict = validate_commit_message("feat(auth): ✨ Add OAuth2 login support")
        assert verdict.valid

    def test_accepts_kebab_scope(self) -> None:
        """A kebab-case scope validates."""
        message = "refactor(api-routes): ♻️ Extract shared validation logic"
        verdict = validate_commit_message(message)
        assert verdict.valid

    def test_rejects_scope_with_spaces(self) -> None:
        """A scope containing a space is rejected and a violation mentions "spaces"."""
        verdict = validate_commit_message("feat(auth module): ✨ Add login support")
        assert not verdict.valid
        assert self._mentions(verdict.violations, "spaces")

    def test_rejects_overly_long_scope(self) -> None:
        """A 30-character scope is rejected and a violation mentions "long"."""
        oversized = "a" * 30
        verdict = validate_commit_message(f"feat({oversized}): ✨ Add new feature")
        assert not verdict.valid
        assert self._mentions(verdict.violations, "long")


class TestAutoCorrectFormat:
    """auto_correct_format: bare-deps normalization, separator repair, passthrough."""

    # Well-formed messages that must come back unchanged.
    _WELL_FORMED = [
        "feat(auth): ✨ Add OAuth2",
        "fix(api): 🐛 Resolve null pointer",
        "chore(config): 🔧 Update ESLint rules",
        "refactor(core): ♻️ Extract shared logic",
        "test(auth): ✅ Add integration tests",
    ]

    # --- bare deps normalization ---

    def test_bare_deps_no_scope(self) -> None:
        """A scopeless ``deps:`` prefix becomes ``deps-upgrade:``."""
        corrected = auto_correct_format("deps: 🔧 Rebuild debug dependencies")
        assert corrected.startswith("deps-upgrade:")

    def test_bare_deps_with_scope(self) -> None:
        """``deps(scope):`` becomes ``deps-upgrade(scope):``."""
        corrected = auto_correct_format("deps(simulator): 🔧 Rebuild things")
        assert corrected.startswith("deps-upgrade(simulator):")

    def test_bare_deps_actual_failing_log_message(self) -> None:
        """Reproduces the exact message from the activity log that was failing."""
        logged = (
            "deps: 🔧 Rebuild debug dependencies "
            "for simulator modules "
            "(physics, AI, city, climate, combat, core)"
        )
        corrected = auto_correct_format(logged)
        assert corrected.startswith("deps-upgrade:")
        assert "Rebuild debug dependencies" in corrected

    def test_deps_upgrade_passes_through_unchanged(self) -> None:
        """Valid deps-upgrade must not be mangled."""
        original = "deps-upgrade(api): ⬆️ bump httpx to 0.27"
        corrected = auto_correct_format(original)
        assert corrected == original

    def test_deps_upgrade_with_scope_passes_through(self) -> None:
        """A second scoped deps-upgrade message is likewise left untouched."""
        original = "deps-upgrade(config): ⬆️ Update dependency versions"
        corrected = auto_correct_format(original)
        assert corrected == original

    def test_deps_add_passes_through_unchanged(self) -> None:
        """deps-add is not rewritten."""
        original = "deps-add(core): ➕ add aiohttp"
        corrected = auto_correct_format(original)
        assert corrected == original

    # --- slash scope correction ---

    def test_slash_scope_corrected(self) -> None:
        """``feat/auth:`` becomes ``feat(auth):``."""
        corrected = auto_correct_format("feat/auth: ✨ Add login")
        assert corrected == "feat(auth): ✨ Add login"

    def test_slash_scope_with_hyphen(self) -> None:
        """A hyphenated scope after the slash is kept whole."""
        corrected = auto_correct_format("fix/api-routes: 🐛 Fix timeout")
        assert corrected == "fix(api-routes): 🐛 Fix timeout"

    # --- dash scope correction ---

    def test_dash_scope_corrected(self) -> None:
        """``chore-config:`` becomes ``chore(config):``."""
        corrected = auto_correct_format("chore-config: 🔧 Update rules")
        assert corrected == "chore(config): 🔧 Update rules"

    # --- passthrough for valid messages ---

    @pytest.mark.parametrize("msg", _WELL_FORMED)
    def test_valid_messages_unchanged(self, msg: str) -> None:
        """Messages already in ``type(scope):`` form come back unchanged."""
        corrected = auto_correct_format(msg)
        assert corrected == msg

    def test_unrelated_text_unchanged(self) -> None:
        """Text that is not a commit message comes back unchanged."""
        original = "some random text"
        assert auto_correct_format(original) == "some random text"

    def test_empty_string_unchanged(self) -> None:
        """The empty string comes back as the empty string."""
        corrected = auto_correct_format("")
        assert corrected == ""

    # --- end-to-end through extract_commit_message ---

    def test_bare_deps_extractable(self) -> None:
        """bare deps: must produce a valid extractable message."""
        reply = "deps: 🔧 Rebuild simulator debug modules"
        extracted = extract_commit_message(reply)
        assert extracted.startswith("deps-upgrade:")

    def test_corrections_table_is_open_for_extension(self) -> None:
        """_CORRECTIONS is a list holding at least three entries."""
        from auto_commit_service.pipeline.format_utils import _CORRECTIONS

        table = _CORRECTIONS
        assert isinstance(table, list)
        assert len(table) >= 3