chore(shared): 🔧 Hello! I'm a mock assistant responding to your message.

2026-01-05 12:19:14 -08:00 · 2026-01-05 12:19:14 -08:00 · 279c985ef2
commit 279c985ef2
parent 121816bc53
15 changed files with 559 additions and 19 deletions
--- a/src/auto_commit_service/pycache/app.cpython-312.pyc
+++ b/src/auto_commit_service/pycache/app.cpython-312.pyc
--- a/src/auto_commit_service/pycache/app.cpython-314.pyc
+++ b/src/auto_commit_service/pycache/app.cpython-314.pyc
--- a/src/auto_commit_service/pycache/config.cpython-312.pyc
+++ b/src/auto_commit_service/pycache/config.cpython-312.pyc
--- a/src/auto_commit_service/pycache/config.cpython-314.pyc
+++ b/src/auto_commit_service/pycache/config.cpython-314.pyc
--- a/src/auto_commit_service/pycache/models.cpython-312.pyc
+++ b/src/auto_commit_service/pycache/models.cpython-312.pyc
--- a/src/auto_commit_service/app.py
+++ b/src/auto_commit_service/app.py
@ -122,8 +122,11 @@ def create_auto_commit_service(
        """Check service health."""
        llama_available = await llm_client.is_available()
        repos_accessible = all(repo.exists for repo in daemon.repos)
+        service_crashed = daemon.service_crashed

-        if llama_available and repos_accessible and daemon.is_running:
+        if service_crashed:
+            status = "error"
+        elif llama_available and repos_accessible and daemon.is_running:
            status = "ok"
        elif llama_available and repos_accessible:
            status = "degraded"
@ -131,7 +134,9 @@ def create_auto_commit_service(
            status = "error"

        error = None
-        if not llama_available:
+        if service_crashed:
+            error = "llama-service has crashed"
+        elif not llama_available:
            error = "llama-service not available"
        elif not repos_accessible:
            missing = [r.name for r in daemon.repos if not r.exists]
@ -155,26 +160,32 @@ def create_auto_commit_service(
            repos=[r.name for r in daemon.repos],
            last_cycle=daemon.last_cycle,
            next_cycle_at=daemon.next_cycle_at,
+            service_crashed=daemon.service_crashed,
+            service_health=daemon.service_health,
+            last_health_check=daemon.last_health_check,
        )

    @app.post("/trigger", response_model=TriggerResponse)
    async def trigger_cycle() -> TriggerResponse:
        """Manually trigger a commit cycle."""
-        if not await llm_client.is_available():
-            raise HTTPException(
-                status_code=503,
-                detail="llama-service not available",
-            )
-
        try:
            cycle_result = await daemon.trigger_cycle()

+            # Check if service failed to start
+            if cycle_result.repos_processed == 0 and daemon.service_crashed:
+                raise HTTPException(
+                    status_code=503,
+                    detail="llama-service has crashed",
+                )
+
            return TriggerResponse(
                triggered=True,
                message=f"Cycle completed: {cycle_result.repos_committed} committed, "
                f"{cycle_result.repos_failed} failed",
                cycle_result=cycle_result,
            )
+        except HTTPException:
+            raise
        except Exception as e:
            logger.exception("Error during manual trigger")
            raise HTTPException(
@ -251,21 +262,22 @@ def create_auto_commit_service(
                detail="Recursive discovery not enabled",
            )

-        if not await llm_client.is_available():
-            raise HTTPException(
-                status_code=503,
-                detail="llama-service not available",
-            )
-
        try:
            # Refresh repos
            old_count = len(daemon.repos)
            daemon.repos = daemon._discover_and_cache_repos()
            new_count = len(daemon.repos)

-            # Run cycle
+            # Run cycle (will auto-start service if needed)
            cycle_result = await daemon.trigger_cycle()

+            # Check if service failed to start
+            if cycle_result.repos_processed == 0 and daemon.service_crashed:
+                raise HTTPException(
+                    status_code=503,
+                    detail="llama-service has crashed",
+                )
+
            return {
                "refreshed": True,
                "old_count": old_count,
@ -276,6 +288,8 @@ def create_auto_commit_service(
                "repos_failed": cycle_result.repos_failed,
                "cycle_result": cycle_result,
            }
+        except HTTPException:
+            raise
        except Exception as e:
            logger.exception("Error during refresh-and-run")
            raise HTTPException(
@ -300,4 +314,51 @@ def create_auto_commit_service(
            "errors": daemon.get_error_history(20),
        }

+    @app.get("/report/summary")
+    async def get_report_summary() -> dict:
+        """Enhanced report with health checks, success tracking, and error categorization."""
+
+        # Service health (already implemented)
+        llm_health = await llm_client.health_check()
+
+        # Last success tracking
+        last_full_success = daemon.get_last_fully_successful_cycle()
+
+        # Per-repo enhanced data
+        repos_summary = [
+            {
+                "name": r.name,
+                "path": str(r.path),
+                "last_commit": daemon.get_repo_last_commit(r.name),
+                "last_success": daemon.get_repo_last_success_timestamp(r.name),
+                "error_count": daemon.get_repo_error_count(r.name),
+            }
+            for r in daemon.repos
+        ]
+
+        # Error categorization
+        error_categories = daemon.categorize_errors()
+
+        return {
+            "service_health": {
+                "llama_service": llm_health,
+                "daemon_running": daemon._running,
+                "daemon_enabled": daemon._enabled,
+                "next_cycle_at": daemon.next_cycle_at,
+                "cycle_interval_seconds": settings.cycle_interval_seconds,
+            },
+            "success_tracking": {
+                "last_fully_successful_cycle": (
+                    last_full_success.model_dump() if last_full_success else None
+                ),
+                "total_cycles_run": daemon.total_cycles,
+            },
+            "repos": repos_summary,
+            "errors": {
+                "categories": error_categories,
+                "total_errors": sum(cat["count"] for cat in error_categories.values()),
+            },
+            "last_cycles": daemon.get_history(5),  # Recent context
+        }
+
    return app
--- a/src/auto_commit_service/config.py
+++ b/src/auto_commit_service/config.py
@ -90,6 +90,28 @@ class AutoCommitSettings(BaseServiceSettings):
        description="Timeout for Claude Code execution in seconds",
    )

+    # Llama service management
+    llama_service_autostart: bool = Field(
+        default=True,
+        description="Automatically start llama service if not running",
+    )
+    llama_service_startup_timeout: float = Field(
+        default=30.0,
+        description="Timeout for service startup in seconds",
+    )
+    llama_service_pid_file: Path = Field(
+        default=Path("~/.config/commits/llama-service.pid").expanduser(),
+        description="PID file for llama service tracking",
+    )
+    llama_service_lock_file: Path = Field(
+        default=Path("~/.config/commits/llama-service.lock").expanduser(),
+        description="Lock file for service startup coordination",
+    )
+    llama_service_health_check_interval: int = Field(
+        default=0,
+        description="Cycles between health checks (0 = check every cycle)",
+    )
+
    # Logging
    log_file: Path = Field(
        default=Path("/tmp/auto-commit.log"),
--- a/src/auto_commit_service/models.py
+++ b/src/auto_commit_service/models.py
@ -49,6 +49,9 @@ class DaemonStatus(BaseModel):
    repos: list[str]
    last_cycle: CycleResult | None = None
    next_cycle_at: datetime | None = None
+    service_crashed: bool = False
+    service_health: str | None = None
+    last_health_check: datetime | None = None


 class HealthResponse(BaseModel):
--- a/src/auto_commit_service/scheduler/pycache/daemon.cpython-312.pyc
+++ b/src/auto_commit_service/scheduler/pycache/daemon.cpython-312.pyc
--- a/src/auto_commit_service/scheduler/pycache/daemon.cpython-314.pyc
+++ b/src/auto_commit_service/scheduler/pycache/daemon.cpython-314.pyc
--- a/src/auto_commit_service/scheduler/daemon.py
+++ b/src/auto_commit_service/scheduler/daemon.py
@ -5,12 +5,14 @@ import logging
 import uuid
 from collections import deque
 from datetime import datetime, timedelta
+from typing import Any

 from ..config import AutoCommitSettings
 from ..git import Repository, discover_git_repos
 from ..llm import LlamaCommitClient
 from ..models import CycleResult, ProcessStatus, RepoProcessResult
 from ..recovery import ErrorHandler
+from ..service import LlamaServiceManager, ServiceHealth
 from .processor import CommitProcessor

 logger = logging.getLogger(__name__)
@ -36,6 +38,20 @@ class CommitDaemon:
        self.llm_client = llm_client
        self.error_handler = error_handler

+        # Initialize service manager
+        if settings.llama_service_autostart:
+            logger.info(f"Initializing service manager (autostart enabled)")
+            self.service_manager = LlamaServiceManager(
+                service_url=settings.llama_service_url,
+                pid_file=settings.llama_service_pid_file,
+                lock_file=settings.llama_service_lock_file,
+                startup_timeout=settings.llama_service_startup_timeout,
+                health_check_timeout=5.0,
+            )
+        else:
+            logger.warning("Service manager disabled (autostart=False)")
+            self.service_manager = None
+
        # Initialize processor
        self.processor = CommitProcessor(
            llm_client=llm_client,
@ -60,6 +76,12 @@ class CommitDaemon:
        self._last_cycle: CycleResult | None = None
        self._next_cycle_at: datetime | None = None

+        # Service health tracking
+        self._service_crashed = False
+        self._service_health: ServiceHealth | None = None
+        self._last_health_check: datetime | None = None
+        self._last_health_check_cycle = 0  # Track cycles since last check
+
    def _build_repos(self) -> list[Repository]:
        """Build the list of repositories to process."""
        if self.settings.recursive_discovery:
@ -221,6 +243,165 @@ class CommitDaemon:
                        return errors
        return errors

+    def get_last_fully_successful_cycle(self) -> CycleResult | None:
+        """Find the last cycle where all repos succeeded.
+
+        Returns:
+            CycleResult where repos_failed == 0 and repos_committed > 0, or None
+        """
+        for cycle in reversed(self._cycle_history):
+            if cycle.repos_failed == 0 and cycle.repos_committed > 0:
+                return cycle
+        return None
+
+    def get_repo_last_success_timestamp(self, repo_name: str) -> datetime | None:
+        """Get timestamp of last successful commit for a repo.
+
+        Args:
+            repo_name: Repository name to check
+
+        Returns:
+            Timestamp of last SUCCESS or RECOVERED status, or None
+        """
+        for cycle in reversed(self._cycle_history):
+            for result in cycle.results:
+                if result.repo_name == repo_name:
+                    if result.status in (ProcessStatus.SUCCESS, ProcessStatus.RECOVERED):
+                        return result.timestamp
+        return None
+
+    def categorize_errors(self) -> dict[str, dict[str, Any]]:
+        """Categorize recent errors by type.
+
+        Returns:
+            Dict mapping category name to {count, examples[]}
+        """
+        errors = self.get_error_history(limit=100)  # Analyze more for patterns
+
+        categories = {
+            "network": {
+                "patterns": ["connection", "timeout", "unreachable", "timed out", "network"],
+                "count": 0,
+                "examples": [],
+            },
+            "auth": {
+                "patterns": ["authentication", "permission", "denied", "ssh", "credentials", "publickey"],
+                "count": 0,
+                "examples": [],
+            },
+            "merge_conflict": {
+                "patterns": ["conflict", "merge failed", "divergent", "non-fast-forward"],
+                "count": 0,
+                "examples": [],
+            },
+            "git_state": {
+                "patterns": ["detached head", "not a git", "no such remote", "no upstream"],
+                "count": 0,
+                "examples": [],
+            },
+            "llm_service": {
+                "patterns": ["llama-service", "model not loaded", "generation failed", "llm"],
+                "count": 0,
+                "examples": [],
+            },
+            "other": {
+                "patterns": [],
+                "count": 0,
+                "examples": [],
+            },
+        }
+
+        for error_entry in errors:
+            error_text = error_entry.get("error", "").lower()
+            categorized = False
+
+            for cat_name, cat_data in categories.items():
+                if cat_name == "other":
+                    continue
+                if any(pattern in error_text for pattern in cat_data["patterns"]):
+                    cat_data["count"] += 1
+                    if len(cat_data["examples"]) < 3:  # Keep up to 3 examples
+                        cat_data["examples"].append(error_entry)
+                    categorized = True
+                    break
+
+            if not categorized:
+                categories["other"]["count"] += 1
+                if len(categories["other"]["examples"]) < 3:
+                    categories["other"]["examples"].append(error_entry)
+
+        # Remove patterns key and empty categories from response
+        return {
+            k: {"count": v["count"], "examples": v["examples"]}
+            for k, v in categories.items()
+            if v["count"] > 0
+        }
+
+    @property
+    def service_crashed(self) -> bool:
+        """Check if the llama service has crashed."""
+        return self._service_crashed
+
+    @property
+    def service_health(self) -> str | None:
+        """Get the current service health status."""
+        return self._service_health.value if self._service_health else None
+
+    @property
+    def last_health_check(self) -> datetime | None:
+        """Get the last health check timestamp."""
+        return self._last_health_check
+
+    async def _ensure_service_ready(self) -> bool:
+        """Ensure llama service is available, starting if needed.
+
+        Returns:
+            True if service is available, False if crashed or unavailable
+        """
+        if not self.service_manager:
+            # Autostart disabled, rely on external service
+            return True
+
+        # Check if we should perform health check this cycle
+        should_check = (
+            self.settings.llama_service_health_check_interval == 0 or
+            self._last_health_check_cycle >= self.settings.llama_service_health_check_interval
+        )
+
+        if should_check:
+            self._last_health_check = datetime.now()
+            self._last_health_check_cycle = 0
+            health = await self.service_manager.check_health()
+            self._service_health = health
+
+            if health == ServiceHealth.CRASHED:
+                self._service_crashed = True
+                logger.error("Llama service has crashed - commits disabled")
+                return False
+
+            if health == ServiceHealth.UNREACHABLE:
+                logger.info("Llama service unreachable, attempting to start...")
+                started = await self.service_manager.start_service()
+                if started:
+                    self._service_crashed = False
+                    self._service_health = ServiceHealth.HEALTHY
+                    return True
+                else:
+                    logger.error("Failed to start llama service")
+                    return False
+
+            if health == ServiceHealth.DEGRADED:
+                logger.warning("Llama service is degraded")
+                return False
+
+            # Service is healthy
+            self._service_crashed = False
+            return True
+        else:
+            # Skip health check this cycle
+            self._last_health_check_cycle += 1
+            return not self._service_crashed
+
    def enable(self) -> None:
        """Enable the daemon."""
        self._enabled = True
@ -302,10 +483,10 @@ class CommitDaemon:

        logger.info(f"Starting cycle {cycle_id}")

-        # Check LLM availability first
-        if not await self.llm_client.is_available():
-            logger.warning("LLM service not available, skipping cycle")
-            return CycleResult(
+        # Ensure service is ready (auto-start if needed)
+        if not await self._ensure_service_ready():
+            logger.warning("Llama service not ready, skipping cycle")
+            cycle_result = CycleResult(
                cycle_id=cycle_id,
                started_at=started_at,
                completed_at=datetime.now(),
@ -314,6 +495,27 @@ class CommitDaemon:
                repos_failed=0,
                results=[],
            )
+            self._last_cycle = cycle_result
+            self._cycle_history.append(cycle_result)
+            self._total_cycles += 1
+            return cycle_result
+
+        # Check LLM availability
+        if not await self.llm_client.is_available():
+            logger.warning("LLM service not available, skipping cycle")
+            cycle_result = CycleResult(
+                cycle_id=cycle_id,
+                started_at=started_at,
+                completed_at=datetime.now(),
+                repos_processed=0,
+                repos_committed=0,
+                repos_failed=0,
+                results=[],
+            )
+            self._last_cycle = cycle_result
+            self._cycle_history.append(cycle_result)
+            self._total_cycles += 1
+            return cycle_result

        # Phase 1: Commit all repos with changes
        logger.info(f"[{cycle_id}] Phase 1: Committing changes")
--- a/src/auto_commit_service/service/init.py
+++ b/src/auto_commit_service/service/init.py
@ -0,0 +1,17 @@
+"""Service lifecycle management."""
+
+from .manager import (
+    LlamaServiceManager,
+    ServiceHealth,
+    ServiceManagerError,
+    ServiceStartError,
+    ServiceCrashError,
+)
+
+__all__ = [
+    "LlamaServiceManager",
+    "ServiceHealth",
+    "ServiceManagerError",
+    "ServiceStartError",
+    "ServiceCrashError",
+]
--- a/src/auto_commit_service/service/pycache/init.cpython-312.pyc
+++ b/src/auto_commit_service/service/pycache/init.cpython-312.pyc
--- a/src/auto_commit_service/service/pycache/manager.cpython-312.pyc
+++ b/src/auto_commit_service/service/pycache/manager.cpython-312.pyc
--- a/src/auto_commit_service/service/manager.py
+++ b/src/auto_commit_service/service/manager.py
@ -0,0 +1,235 @@
+"""Llama service lifecycle manager."""
+
+import asyncio
+import fcntl
+import logging
+import os
+import signal
+import sys
+import time
+from enum import Enum
+from pathlib import Path
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+class ServiceManagerError(Exception):
+    """Base exception for service manager errors."""
+
+
+class ServiceStartError(ServiceManagerError):
+    """Failed to start service."""
+
+
+class ServiceCrashError(ServiceManagerError):
+    """Service crashed unexpectedly."""
+
+
+class ServiceHealth(str, Enum):
+    """Service health status."""
+
+    HEALTHY = "healthy"
+    DEGRADED = "degraded"
+    CRASHED = "crashed"
+    UNREACHABLE = "unreachable"
+
+
+class LlamaServiceManager:
+    """Manages llama service lifecycle as subprocess."""
+
+    def __init__(
+        self,
+        service_url: str = "http://localhost:8000",
+        pid_file: Path | None = None,
+        lock_file: Path | None = None,
+        startup_timeout: float = 30.0,
+        health_check_timeout: float = 5.0,
+    ):
+        """Initialize service manager."""
+        self.service_url = service_url
+        self._pid_file = pid_file or Path.home() / ".config/commits/llama-service.pid"
+        self._lock_file = lock_file or Path.home() / ".config/commits/llama-service.lock"
+        self._startup_timeout = startup_timeout
+        self._health_check_timeout = health_check_timeout
+        self._spawned_pid: int | None = None
+        self._lock_fd: int | None = None
+
+    async def ensure_service_available(self) -> bool:
+        """Ensure service is available, starting if necessary."""
+        health = await self.check_health()
+
+        if health == ServiceHealth.HEALTHY:
+            return True
+        if health in (ServiceHealth.DEGRADED, ServiceHealth.CRASHED):
+            return False
+
+        logger.info("Llama service unreachable, attempting to start...")
+        return await self.start_service()
+
+    async def start_service(self) -> bool:
+        """Start llama service subprocess."""
+        pid = self._read_pid_file()
+        if pid and self._is_process_alive(pid):
+            logger.info(f"Service already running (PID: {pid})")
+            return True
+
+        if not self._acquire_lock():
+            await asyncio.sleep(2)
+            pid = self._read_pid_file()
+            if pid and self._is_process_alive(pid):
+                return True
+            return False
+
+        try:
+            pid = self._read_pid_file()
+            if pid and self._is_process_alive(pid):
+                return True
+
+            logger.info("Starting llama service subprocess...")
+            process = await self._spawn_service()
+            self._spawned_pid = process.pid
+            self._write_pid_file(process.pid)
+            logger.info(f"Llama service started (PID: {process.pid})")
+
+            if await self._wait_for_healthy(self._startup_timeout):
+                logger.info("✓ Llama service is healthy")
+                return True
+            else:
+                logger.error(f"✗ Service failed to start within {self._startup_timeout}s")
+                return False
+
+        except Exception as e:
+            logger.exception(f"Failed to start llama service: {e}")
+            return False
+        finally:
+            self._release_lock()
+
+    async def check_health(self) -> ServiceHealth:
+        """Check service health and detect crashes."""
+        pid = self._read_pid_file()
+        if pid and not self._is_process_alive(pid):
+            return ServiceHealth.CRASHED
+
+        try:
+            async with httpx.AsyncClient(timeout=self._health_check_timeout) as client:
+                response = await client.get(f"{self.service_url}/health")
+                if response.status_code == 200:
+                    data = response.json()
+                    return ServiceHealth.HEALTHY if data.get("status") == "ok" else ServiceHealth.DEGRADED
+                return ServiceHealth.UNREACHABLE
+        except (httpx.ConnectError, httpx.TimeoutException):
+            return ServiceHealth.UNREACHABLE
+        except Exception:
+            return ServiceHealth.UNREACHABLE
+
+    async def stop_service(self) -> None:
+        """Gracefully stop service if we own it."""
+        if self._spawned_pid is None or not self._is_process_alive(self._spawned_pid):
+            return
+
+        logger.info(f"Stopping llama service (PID: {self._spawned_pid})...")
+        try:
+            os.kill(self._spawned_pid, signal.SIGTERM)
+            for _ in range(10):
+                if not self._is_process_alive(self._spawned_pid):
+                    self._cleanup_pid_file()
+                    return
+                await asyncio.sleep(0.5)
+            os.kill(self._spawned_pid, signal.SIGKILL)
+            self._cleanup_pid_file()
+        except ProcessLookupError:
+            self._cleanup_pid_file()
+        except Exception as e:
+            logger.exception(f"Error stopping service: {e}")
+
+    def _acquire_lock(self) -> bool:
+        """Acquire exclusive lock."""
+        self._lock_file.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            self._lock_fd = os.open(self._lock_file, os.O_CREAT | os.O_WRONLY)
+            fcntl.flock(self._lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            return True
+        except BlockingIOError:
+            return False
+
+    def _release_lock(self) -> None:
+        """Release lock."""
+        if self._lock_fd is not None:
+            try:
+                fcntl.flock(self._lock_fd, fcntl.LOCK_UN)
+                os.close(self._lock_fd)
+                self._lock_fd = None
+            except Exception:
+                pass
+
+    def _read_pid_file(self) -> int | None:
+        """Read PID from file."""
+        if not self._pid_file.exists():
+            return None
+        try:
+            content = self._pid_file.read_text().strip()
+            return int(content) if content else None
+        except Exception:
+            return None
+
+    def _write_pid_file(self, pid: int) -> None:
+        """Write PID to file."""
+        self._pid_file.parent.mkdir(parents=True, exist_ok=True)
+        self._pid_file.write_text(str(pid))
+
+    def _cleanup_pid_file(self) -> None:
+        """Remove PID file."""
+        try:
+            if self._pid_file.exists():
+                self._pid_file.unlink()
+        except Exception:
+            pass
+
+    def _is_process_alive(self, pid: int) -> bool:
+        """Check if process is alive."""
+        try:
+            os.kill(pid, 0)
+            return True
+        except OSError:
+            return False
+
+    async def _spawn_service(self) -> asyncio.subprocess.Process:
+        """Spawn service as background subprocess."""
+        cmd = [sys.executable, "-m", "tqftw_llama_service"]
+        env = os.environ.copy()
+
+        # Enable mock mode if no real models configured
+        if "LLAMA_SERVICE_FAST_MODEL_PATH" not in env and "LLAMA_SERVICE_REASONING_MODEL_PATH" not in env:
+            env["LLAMA_SERVICE_MOCK_MODE"] = "true"
+
+        log_file = self._pid_file.parent / "llama-service.log"
+        log_file.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(log_file, "a") as log:
+            log.write(f"\n=== Service started at {time.ctime()} ===\n")
+            process = await asyncio.create_subprocess_exec(
+                *cmd,
+                env=env,
+                stdout=log,
+                stderr=asyncio.subprocess.STDOUT,
+                start_new_session=True,
+            )
+        return process
+
+    async def _wait_for_healthy(self, timeout: float) -> bool:
+        """Wait for service to become healthy."""
+        start = time.time()
+        while time.time() - start < timeout:
+            try:
+                async with httpx.AsyncClient(timeout=2.0) as client:
+                    response = await client.get(f"{self.service_url}/health")
+                    if response.status_code == 200:
+                        data = response.json()
+                        if data.get("status") == "ok":
+                            return True
+            except Exception:
+                pass
+            await asyncio.sleep(1)
+        return False