diff --git a/src/auto_commit_service/__pycache__/app.cpython-312.pyc b/src/auto_commit_service/__pycache__/app.cpython-312.pyc index 6f8ee0f..0112451 100644 Binary files a/src/auto_commit_service/__pycache__/app.cpython-312.pyc and b/src/auto_commit_service/__pycache__/app.cpython-312.pyc differ diff --git a/src/auto_commit_service/__pycache__/app.cpython-314.pyc b/src/auto_commit_service/__pycache__/app.cpython-314.pyc index 307d6db..c96b925 100644 Binary files a/src/auto_commit_service/__pycache__/app.cpython-314.pyc and b/src/auto_commit_service/__pycache__/app.cpython-314.pyc differ diff --git a/src/auto_commit_service/__pycache__/config.cpython-312.pyc b/src/auto_commit_service/__pycache__/config.cpython-312.pyc index 625d03c..6481f01 100644 Binary files a/src/auto_commit_service/__pycache__/config.cpython-312.pyc and b/src/auto_commit_service/__pycache__/config.cpython-312.pyc differ diff --git a/src/auto_commit_service/__pycache__/config.cpython-314.pyc b/src/auto_commit_service/__pycache__/config.cpython-314.pyc index e6f6672..11348b5 100644 Binary files a/src/auto_commit_service/__pycache__/config.cpython-314.pyc and b/src/auto_commit_service/__pycache__/config.cpython-314.pyc differ diff --git a/src/auto_commit_service/__pycache__/models.cpython-312.pyc b/src/auto_commit_service/__pycache__/models.cpython-312.pyc index 8c2221e..0e888af 100644 Binary files a/src/auto_commit_service/__pycache__/models.cpython-312.pyc and b/src/auto_commit_service/__pycache__/models.cpython-312.pyc differ diff --git a/src/auto_commit_service/app.py b/src/auto_commit_service/app.py index be8eaf8..62dba14 100644 --- a/src/auto_commit_service/app.py +++ b/src/auto_commit_service/app.py @@ -122,8 +122,11 @@ def create_auto_commit_service( """Check service health.""" llama_available = await llm_client.is_available() repos_accessible = all(repo.exists for repo in daemon.repos) + service_crashed = daemon.service_crashed - if llama_available and repos_accessible and daemon.is_running: + if service_crashed: + status = "error" + elif llama_available and repos_accessible and daemon.is_running: status = "ok" elif llama_available and repos_accessible: status = "degraded" @@ -131,7 +134,9 @@ def create_auto_commit_service( status = "error" error = None - if not llama_available: + if service_crashed: + error = "llama-service has crashed" + elif not llama_available: error = "llama-service not available" elif not repos_accessible: missing = [r.name for r in daemon.repos if not r.exists] @@ -155,26 +160,32 @@ def create_auto_commit_service( repos=[r.name for r in daemon.repos], last_cycle=daemon.last_cycle, next_cycle_at=daemon.next_cycle_at, + service_crashed=daemon.service_crashed, + service_health=daemon.service_health, + last_health_check=daemon.last_health_check, ) @app.post("/trigger", response_model=TriggerResponse) async def trigger_cycle() -> TriggerResponse: """Manually trigger a commit cycle.""" - if not await llm_client.is_available(): - raise HTTPException( - status_code=503, - detail="llama-service not available", - ) - try: cycle_result = await daemon.trigger_cycle() + # Check if service failed to start + if cycle_result.repos_processed == 0 and daemon.service_crashed: + raise HTTPException( + status_code=503, + detail="llama-service has crashed", + ) + return TriggerResponse( triggered=True, message=f"Cycle completed: {cycle_result.repos_committed} committed, " f"{cycle_result.repos_failed} failed", cycle_result=cycle_result, ) + except HTTPException: + raise except Exception as e: logger.exception("Error during manual trigger") raise HTTPException( @@ -251,21 +262,22 @@ def create_auto_commit_service( detail="Recursive discovery not enabled", ) - if not await llm_client.is_available(): - raise HTTPException( - status_code=503, - detail="llama-service not available", - ) - try: # Refresh repos old_count = len(daemon.repos) daemon.repos = daemon._discover_and_cache_repos() new_count = len(daemon.repos) - # Run cycle + # Run cycle (will auto-start service if needed) cycle_result = await daemon.trigger_cycle() + # Check if service failed to start + if cycle_result.repos_processed == 0 and daemon.service_crashed: + raise HTTPException( + status_code=503, + detail="llama-service has crashed", + ) + return { "refreshed": True, "old_count": old_count, @@ -276,6 +288,8 @@ def create_auto_commit_service( "repos_failed": cycle_result.repos_failed, "cycle_result": cycle_result, } + except HTTPException: + raise except Exception as e: logger.exception("Error during refresh-and-run") raise HTTPException( @@ -300,4 +314,51 @@ def create_auto_commit_service( "errors": daemon.get_error_history(20), } + @app.get("/report/summary") + async def get_report_summary() -> dict: + """Enhanced report with health checks, success tracking, and error categorization.""" + + # Service health (already implemented) + llm_health = await llm_client.health_check() + + # Last success tracking + last_full_success = daemon.get_last_fully_successful_cycle() + + # Per-repo enhanced data + repos_summary = [ + { + "name": r.name, + "path": str(r.path), + "last_commit": daemon.get_repo_last_commit(r.name), + "last_success": daemon.get_repo_last_success_timestamp(r.name), + "error_count": daemon.get_repo_error_count(r.name), + } + for r in daemon.repos + ] + + # Error categorization + error_categories = daemon.categorize_errors() + + return { + "service_health": { + "llama_service": llm_health, + "daemon_running": daemon._running, + "daemon_enabled": daemon._enabled, + "next_cycle_at": daemon.next_cycle_at, + "cycle_interval_seconds": settings.cycle_interval_seconds, + }, + "success_tracking": { + "last_fully_successful_cycle": ( + last_full_success.model_dump() if last_full_success else None + ), + "total_cycles_run": daemon.total_cycles, + }, + "repos": repos_summary, + "errors": { + "categories": error_categories, + "total_errors": sum(cat["count"] for cat in error_categories.values()), + }, + "last_cycles": daemon.get_history(5), # Recent context + } + return app diff --git a/src/auto_commit_service/config.py b/src/auto_commit_service/config.py index 6250120..ffdfd12 100644 --- a/src/auto_commit_service/config.py +++ b/src/auto_commit_service/config.py @@ -90,6 +90,28 @@ class AutoCommitSettings(BaseServiceSettings): description="Timeout for Claude Code execution in seconds", ) + # Llama service management + llama_service_autostart: bool = Field( + default=True, + description="Automatically start llama service if not running", + ) + llama_service_startup_timeout: float = Field( + default=30.0, + description="Timeout for service startup in seconds", + ) + llama_service_pid_file: Path = Field( + default=Path("~/.config/commits/llama-service.pid").expanduser(), + description="PID file for llama service tracking", + ) + llama_service_lock_file: Path = Field( + default=Path("~/.config/commits/llama-service.lock").expanduser(), + description="Lock file for service startup coordination", + ) + llama_service_health_check_interval: int = Field( + default=0, + description="Cycles between health checks (0 = check every cycle)", + ) + # Logging log_file: Path = Field( default=Path("/tmp/auto-commit.log"), diff --git a/src/auto_commit_service/models.py b/src/auto_commit_service/models.py index ab465e1..5ab515f 100644 --- a/src/auto_commit_service/models.py +++ b/src/auto_commit_service/models.py @@ -49,6 +49,9 @@ class DaemonStatus(BaseModel): repos: list[str] last_cycle: CycleResult | None = None next_cycle_at: datetime | None = None + service_crashed: bool = False + service_health: str | None = None + last_health_check: datetime | None = None class HealthResponse(BaseModel): diff --git a/src/auto_commit_service/scheduler/__pycache__/daemon.cpython-312.pyc b/src/auto_commit_service/scheduler/__pycache__/daemon.cpython-312.pyc index 94dc027..2b97337 100644 Binary files a/src/auto_commit_service/scheduler/__pycache__/daemon.cpython-312.pyc and b/src/auto_commit_service/scheduler/__pycache__/daemon.cpython-312.pyc differ diff --git a/src/auto_commit_service/scheduler/__pycache__/daemon.cpython-314.pyc b/src/auto_commit_service/scheduler/__pycache__/daemon.cpython-314.pyc index d30e439..19ef353 100644 Binary files a/src/auto_commit_service/scheduler/__pycache__/daemon.cpython-314.pyc and b/src/auto_commit_service/scheduler/__pycache__/daemon.cpython-314.pyc differ diff --git a/src/auto_commit_service/scheduler/daemon.py b/src/auto_commit_service/scheduler/daemon.py index 7e31a61..1eac7b1 100644 --- a/src/auto_commit_service/scheduler/daemon.py +++ b/src/auto_commit_service/scheduler/daemon.py @@ -5,12 +5,14 @@ import logging import uuid from collections import deque from datetime import datetime, timedelta +from typing import Any from ..config import AutoCommitSettings from ..git import Repository, discover_git_repos from ..llm import LlamaCommitClient from ..models import CycleResult, ProcessStatus, RepoProcessResult from ..recovery import ErrorHandler +from ..service import LlamaServiceManager, ServiceHealth from .processor import CommitProcessor logger = logging.getLogger(__name__) @@ -36,6 +38,20 @@ class CommitDaemon: self.llm_client = llm_client self.error_handler = error_handler + # Initialize service manager + if settings.llama_service_autostart: + logger.info(f"Initializing service manager (autostart enabled)") + self.service_manager = LlamaServiceManager( + service_url=settings.llama_service_url, + pid_file=settings.llama_service_pid_file, + lock_file=settings.llama_service_lock_file, + startup_timeout=settings.llama_service_startup_timeout, + health_check_timeout=5.0, + ) + else: + logger.warning("Service manager disabled (autostart=False)") + self.service_manager = None + # Initialize processor self.processor = CommitProcessor( llm_client=llm_client, @@ -60,6 +76,12 @@ class CommitDaemon: self._last_cycle: CycleResult | None = None self._next_cycle_at: datetime | None = None + # Service health tracking + self._service_crashed = False + self._service_health: ServiceHealth | None = None + self._last_health_check: datetime | None = None + self._last_health_check_cycle = 0 # Track cycles since last check + def _build_repos(self) -> list[Repository]: """Build the list of repositories to process.""" if self.settings.recursive_discovery: @@ -221,6 +243,165 @@ class CommitDaemon: return errors return errors + def get_last_fully_successful_cycle(self) -> CycleResult | None: + """Find the last cycle where all repos succeeded. + + Returns: + CycleResult where repos_failed == 0 and repos_committed > 0, or None + """ + for cycle in reversed(self._cycle_history): + if cycle.repos_failed == 0 and cycle.repos_committed > 0: + return cycle + return None + + def get_repo_last_success_timestamp(self, repo_name: str) -> datetime | None: + """Get timestamp of last successful commit for a repo. + + Args: + repo_name: Repository name to check + + Returns: + Timestamp of last SUCCESS or RECOVERED status, or None + """ + for cycle in reversed(self._cycle_history): + for result in cycle.results: + if result.repo_name == repo_name: + if result.status in (ProcessStatus.SUCCESS, ProcessStatus.RECOVERED): + return result.timestamp + return None + + def categorize_errors(self) -> dict[str, dict[str, Any]]: + """Categorize recent errors by type. + + Returns: + Dict mapping category name to {count, examples[]} + """ + errors = self.get_error_history(limit=100) # Analyze more for patterns + + categories = { + "network": { + "patterns": ["connection", "timeout", "unreachable", "timed out", "network"], + "count": 0, + "examples": [], + }, + "auth": { + "patterns": ["authentication", "permission", "denied", "ssh", "credentials", "publickey"], + "count": 0, + "examples": [], + }, + "merge_conflict": { + "patterns": ["conflict", "merge failed", "divergent", "non-fast-forward"], + "count": 0, + "examples": [], + }, + "git_state": { + "patterns": ["detached head", "not a git", "no such remote", "no upstream"], + "count": 0, + "examples": [], + }, + "llm_service": { + "patterns": ["llama-service", "model not loaded", "generation failed", "llm"], + "count": 0, + "examples": [], + }, + "other": { + "patterns": [], + "count": 0, + "examples": [], + }, + } + + for error_entry in errors: + error_text = error_entry.get("error", "").lower() + categorized = False + + for cat_name, cat_data in categories.items(): + if cat_name == "other": + continue + if any(pattern in error_text for pattern in cat_data["patterns"]): + cat_data["count"] += 1 + if len(cat_data["examples"]) < 3: # Keep up to 3 examples + cat_data["examples"].append(error_entry) + categorized = True + break + + if not categorized: + categories["other"]["count"] += 1 + if len(categories["other"]["examples"]) < 3: + categories["other"]["examples"].append(error_entry) + + # Remove patterns key and empty categories from response + return { + k: {"count": v["count"], "examples": v["examples"]} + for k, v in categories.items() + if v["count"] > 0 + } + + @property + def service_crashed(self) -> bool: + """Check if the llama service has crashed.""" + return self._service_crashed + + @property + def service_health(self) -> str | None: + """Get the current service health status.""" + return self._service_health.value if self._service_health else None + + @property + def last_health_check(self) -> datetime | None: + """Get the last health check timestamp.""" + return self._last_health_check + + async def _ensure_service_ready(self) -> bool: + """Ensure llama service is available, starting if needed. + + Returns: + True if service is available, False if crashed or unavailable + """ + if not self.service_manager: + # Autostart disabled, rely on external service + return True + + # Check if we should perform health check this cycle + should_check = ( + self.settings.llama_service_health_check_interval == 0 or + self._last_health_check_cycle >= self.settings.llama_service_health_check_interval + ) + + if should_check: + self._last_health_check = datetime.now() + self._last_health_check_cycle = 0 + health = await self.service_manager.check_health() + self._service_health = health + + if health == ServiceHealth.CRASHED: + self._service_crashed = True + logger.error("Llama service has crashed - commits disabled") + return False + + if health == ServiceHealth.UNREACHABLE: + logger.info("Llama service unreachable, attempting to start...") + started = await self.service_manager.start_service() + if started: + self._service_crashed = False + self._service_health = ServiceHealth.HEALTHY + return True + else: + logger.error("Failed to start llama service") + return False + + if health == ServiceHealth.DEGRADED: + logger.warning("Llama service is degraded") + return False + + # Service is healthy + self._service_crashed = False + return True + else: + # Skip health check this cycle + self._last_health_check_cycle += 1 + return not self._service_crashed + def enable(self) -> None: """Enable the daemon.""" self._enabled = True @@ -302,10 +483,10 @@ class CommitDaemon: logger.info(f"Starting cycle {cycle_id}") - # Check LLM availability first - if not await self.llm_client.is_available(): - logger.warning("LLM service not available, skipping cycle") - return CycleResult( + # Ensure service is ready (auto-start if needed) + if not await self._ensure_service_ready(): + logger.warning("Llama service not ready, skipping cycle") + cycle_result = CycleResult( cycle_id=cycle_id, started_at=started_at, completed_at=datetime.now(), @@ -314,6 +495,27 @@ class CommitDaemon: repos_failed=0, results=[], ) + self._last_cycle = cycle_result + self._cycle_history.append(cycle_result) + self._total_cycles += 1 + return cycle_result + + # Check LLM availability + if not await self.llm_client.is_available(): + logger.warning("LLM service not available, skipping cycle") + cycle_result = CycleResult( + cycle_id=cycle_id, + started_at=started_at, + completed_at=datetime.now(), + repos_processed=0, + repos_committed=0, + repos_failed=0, + results=[], + ) + self._last_cycle = cycle_result + self._cycle_history.append(cycle_result) + self._total_cycles += 1 + return cycle_result # Phase 1: Commit all repos with changes logger.info(f"[{cycle_id}] Phase 1: Committing changes") diff --git a/src/auto_commit_service/service/__init__.py b/src/auto_commit_service/service/__init__.py new file mode 100644 index 0000000..4c02b17 --- /dev/null +++ b/src/auto_commit_service/service/__init__.py @@ -0,0 +1,17 @@ +"""Service lifecycle management.""" + +from .manager import ( + LlamaServiceManager, + ServiceHealth, + ServiceManagerError, + ServiceStartError, + ServiceCrashError, +) + +__all__ = [ + "LlamaServiceManager", + "ServiceHealth", + "ServiceManagerError", + "ServiceStartError", + "ServiceCrashError", +] diff --git a/src/auto_commit_service/service/__pycache__/__init__.cpython-312.pyc b/src/auto_commit_service/service/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..30ee7a9 Binary files /dev/null and b/src/auto_commit_service/service/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/auto_commit_service/service/__pycache__/manager.cpython-312.pyc b/src/auto_commit_service/service/__pycache__/manager.cpython-312.pyc new file mode 100644 index 0000000..1d00dc0 Binary files /dev/null and b/src/auto_commit_service/service/__pycache__/manager.cpython-312.pyc differ diff --git a/src/auto_commit_service/service/manager.py b/src/auto_commit_service/service/manager.py new file mode 100644 index 0000000..cfbc226 --- /dev/null +++ b/src/auto_commit_service/service/manager.py @@ -0,0 +1,235 @@ +"""Llama service lifecycle manager.""" + +import asyncio +import fcntl +import logging +import os +import signal +import sys +import time +from enum import Enum +from pathlib import Path + +import httpx + +logger = logging.getLogger(__name__) + + +class ServiceManagerError(Exception): + """Base exception for service manager errors.""" + + +class ServiceStartError(ServiceManagerError): + """Failed to start service.""" + + +class ServiceCrashError(ServiceManagerError): + """Service crashed unexpectedly.""" + + +class ServiceHealth(str, Enum): + """Service health status.""" + + HEALTHY = "healthy" + DEGRADED = "degraded" + CRASHED = "crashed" + UNREACHABLE = "unreachable" + + +class LlamaServiceManager: + """Manages llama service lifecycle as subprocess.""" + + def __init__( + self, + service_url: str = "http://localhost:8000", + pid_file: Path | None = None, + lock_file: Path | None = None, + startup_timeout: float = 30.0, + health_check_timeout: float = 5.0, + ): + """Initialize service manager.""" + self.service_url = service_url + self._pid_file = pid_file or Path.home() / ".config/commits/llama-service.pid" + self._lock_file = lock_file or Path.home() / ".config/commits/llama-service.lock" + self._startup_timeout = startup_timeout + self._health_check_timeout = health_check_timeout + self._spawned_pid: int | None = None + self._lock_fd: int | None = None + + async def ensure_service_available(self) -> bool: + """Ensure service is available, starting if necessary.""" + health = await self.check_health() + + if health == ServiceHealth.HEALTHY: + return True + if health in (ServiceHealth.DEGRADED, ServiceHealth.CRASHED): + return False + + logger.info("Llama service unreachable, attempting to start...") + return await self.start_service() + + async def start_service(self) -> bool: + """Start llama service subprocess.""" + pid = self._read_pid_file() + if pid and self._is_process_alive(pid): + logger.info(f"Service already running (PID: {pid})") + return True + + if not self._acquire_lock(): + await asyncio.sleep(2) + pid = self._read_pid_file() + if pid and self._is_process_alive(pid): + return True + return False + + try: + pid = self._read_pid_file() + if pid and self._is_process_alive(pid): + return True + + logger.info("Starting llama service subprocess...") + process = await self._spawn_service() + self._spawned_pid = process.pid + self._write_pid_file(process.pid) + logger.info(f"Llama service started (PID: {process.pid})") + + if await self._wait_for_healthy(self._startup_timeout): + logger.info("✓ Llama service is healthy") + return True + else: + logger.error(f"✗ Service failed to start within {self._startup_timeout}s") + return False + + except Exception as e: + logger.exception(f"Failed to start llama service: {e}") + return False + finally: + self._release_lock() + + async def check_health(self) -> ServiceHealth: + """Check service health and detect crashes.""" + pid = self._read_pid_file() + if pid and not self._is_process_alive(pid): + return ServiceHealth.CRASHED + + try: + async with httpx.AsyncClient(timeout=self._health_check_timeout) as client: + response = await client.get(f"{self.service_url}/health") + if response.status_code == 200: + data = response.json() + return ServiceHealth.HEALTHY if data.get("status") == "ok" else ServiceHealth.DEGRADED + return ServiceHealth.UNREACHABLE + except (httpx.ConnectError, httpx.TimeoutException): + return ServiceHealth.UNREACHABLE + except Exception: + return ServiceHealth.UNREACHABLE + + async def stop_service(self) -> None: + """Gracefully stop service if we own it.""" + if self._spawned_pid is None or not self._is_process_alive(self._spawned_pid): + return + + logger.info(f"Stopping llama service (PID: {self._spawned_pid})...") + try: + os.kill(self._spawned_pid, signal.SIGTERM) + for _ in range(10): + if not self._is_process_alive(self._spawned_pid): + self._cleanup_pid_file() + return + await asyncio.sleep(0.5) + os.kill(self._spawned_pid, signal.SIGKILL) + self._cleanup_pid_file() + except ProcessLookupError: + self._cleanup_pid_file() + except Exception as e: + logger.exception(f"Error stopping service: {e}") + + def _acquire_lock(self) -> bool: + """Acquire exclusive lock.""" + self._lock_file.parent.mkdir(parents=True, exist_ok=True) + try: + self._lock_fd = os.open(self._lock_file, os.O_CREAT | os.O_WRONLY) + fcntl.flock(self._lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + return True + except BlockingIOError: + return False + + def _release_lock(self) -> None: + """Release lock.""" + if self._lock_fd is not None: + try: + fcntl.flock(self._lock_fd, fcntl.LOCK_UN) + os.close(self._lock_fd) + self._lock_fd = None + except Exception: + pass + + def _read_pid_file(self) -> int | None: + """Read PID from file.""" + if not self._pid_file.exists(): + return None + try: + content = self._pid_file.read_text().strip() + return int(content) if content else None + except Exception: + return None + + def _write_pid_file(self, pid: int) -> None: + """Write PID to file.""" + self._pid_file.parent.mkdir(parents=True, exist_ok=True) + self._pid_file.write_text(str(pid)) + + def _cleanup_pid_file(self) -> None: + """Remove PID file.""" + try: + if self._pid_file.exists(): + self._pid_file.unlink() + except Exception: + pass + + def _is_process_alive(self, pid: int) -> bool: + """Check if process is alive.""" + try: + os.kill(pid, 0) + return True + except OSError: + return False + + async def _spawn_service(self) -> asyncio.subprocess.Process: + """Spawn service as background subprocess.""" + cmd = [sys.executable, "-m", "tqftw_llama_service"] + env = os.environ.copy() + + # Enable mock mode if no real models configured + if "LLAMA_SERVICE_FAST_MODEL_PATH" not in env and "LLAMA_SERVICE_REASONING_MODEL_PATH" not in env: + env["LLAMA_SERVICE_MOCK_MODE"] = "true" + + log_file = self._pid_file.parent / "llama-service.log" + log_file.parent.mkdir(parents=True, exist_ok=True) + + with open(log_file, "a") as log: + log.write(f"\n=== Service started at {time.ctime()} ===\n") + process = await asyncio.create_subprocess_exec( + *cmd, + env=env, + stdout=log, + stderr=asyncio.subprocess.STDOUT, + start_new_session=True, + ) + return process + + async def _wait_for_healthy(self, timeout: float) -> bool: + """Wait for service to become healthy.""" + start = time.time() + while time.time() - start < timeout: + try: + async with httpx.AsyncClient(timeout=2.0) as client: + response = await client.get(f"{self.service_url}/health") + if response.status_code == 200: + data = response.json() + if data.get("status") == "ok": + return True + except Exception: + pass + await asyncio.sleep(1) + return False