chore(shared): 🔧 Hello! I'm a mock assistant responding to your message.

This commit is contained in:
Lilith 2026-01-05 12:19:14 -08:00
parent 121816bc53
commit 279c985ef2
15 changed files with 559 additions and 19 deletions

View file

@ -122,8 +122,11 @@ def create_auto_commit_service(
"""Check service health."""
llama_available = await llm_client.is_available()
repos_accessible = all(repo.exists for repo in daemon.repos)
service_crashed = daemon.service_crashed
if llama_available and repos_accessible and daemon.is_running:
if service_crashed:
status = "error"
elif llama_available and repos_accessible and daemon.is_running:
status = "ok"
elif llama_available and repos_accessible:
status = "degraded"
@ -131,7 +134,9 @@ def create_auto_commit_service(
status = "error"
error = None
if not llama_available:
if service_crashed:
error = "llama-service has crashed"
elif not llama_available:
error = "llama-service not available"
elif not repos_accessible:
missing = [r.name for r in daemon.repos if not r.exists]
@ -155,26 +160,32 @@ def create_auto_commit_service(
repos=[r.name for r in daemon.repos],
last_cycle=daemon.last_cycle,
next_cycle_at=daemon.next_cycle_at,
service_crashed=daemon.service_crashed,
service_health=daemon.service_health,
last_health_check=daemon.last_health_check,
)
@app.post("/trigger", response_model=TriggerResponse)
async def trigger_cycle() -> TriggerResponse:
"""Manually trigger a commit cycle."""
if not await llm_client.is_available():
raise HTTPException(
status_code=503,
detail="llama-service not available",
)
try:
cycle_result = await daemon.trigger_cycle()
# Check if service failed to start
if cycle_result.repos_processed == 0 and daemon.service_crashed:
raise HTTPException(
status_code=503,
detail="llama-service has crashed",
)
return TriggerResponse(
triggered=True,
message=f"Cycle completed: {cycle_result.repos_committed} committed, "
f"{cycle_result.repos_failed} failed",
cycle_result=cycle_result,
)
except HTTPException:
raise
except Exception as e:
logger.exception("Error during manual trigger")
raise HTTPException(
@ -251,21 +262,22 @@ def create_auto_commit_service(
detail="Recursive discovery not enabled",
)
if not await llm_client.is_available():
raise HTTPException(
status_code=503,
detail="llama-service not available",
)
try:
# Refresh repos
old_count = len(daemon.repos)
daemon.repos = daemon._discover_and_cache_repos()
new_count = len(daemon.repos)
# Run cycle
# Run cycle (will auto-start service if needed)
cycle_result = await daemon.trigger_cycle()
# Check if service failed to start
if cycle_result.repos_processed == 0 and daemon.service_crashed:
raise HTTPException(
status_code=503,
detail="llama-service has crashed",
)
return {
"refreshed": True,
"old_count": old_count,
@ -276,6 +288,8 @@ def create_auto_commit_service(
"repos_failed": cycle_result.repos_failed,
"cycle_result": cycle_result,
}
except HTTPException:
raise
except Exception as e:
logger.exception("Error during refresh-and-run")
raise HTTPException(
@ -300,4 +314,51 @@ def create_auto_commit_service(
"errors": daemon.get_error_history(20),
}
@app.get("/report/summary")
async def get_report_summary() -> dict:
"""Enhanced report with health checks, success tracking, and error categorization."""
# Service health (already implemented)
llm_health = await llm_client.health_check()
# Last success tracking
last_full_success = daemon.get_last_fully_successful_cycle()
# Per-repo enhanced data
repos_summary = [
{
"name": r.name,
"path": str(r.path),
"last_commit": daemon.get_repo_last_commit(r.name),
"last_success": daemon.get_repo_last_success_timestamp(r.name),
"error_count": daemon.get_repo_error_count(r.name),
}
for r in daemon.repos
]
# Error categorization
error_categories = daemon.categorize_errors()
return {
"service_health": {
"llama_service": llm_health,
"daemon_running": daemon._running,
"daemon_enabled": daemon._enabled,
"next_cycle_at": daemon.next_cycle_at,
"cycle_interval_seconds": settings.cycle_interval_seconds,
},
"success_tracking": {
"last_fully_successful_cycle": (
last_full_success.model_dump() if last_full_success else None
),
"total_cycles_run": daemon.total_cycles,
},
"repos": repos_summary,
"errors": {
"categories": error_categories,
"total_errors": sum(cat["count"] for cat in error_categories.values()),
},
"last_cycles": daemon.get_history(5), # Recent context
}
return app

View file

@ -90,6 +90,28 @@ class AutoCommitSettings(BaseServiceSettings):
description="Timeout for Claude Code execution in seconds",
)
# Llama service management
llama_service_autostart: bool = Field(
default=True,
description="Automatically start llama service if not running",
)
llama_service_startup_timeout: float = Field(
default=30.0,
description="Timeout for service startup in seconds",
)
llama_service_pid_file: Path = Field(
default=Path("~/.config/commits/llama-service.pid").expanduser(),
description="PID file for llama service tracking",
)
llama_service_lock_file: Path = Field(
default=Path("~/.config/commits/llama-service.lock").expanduser(),
description="Lock file for service startup coordination",
)
llama_service_health_check_interval: int = Field(
default=0,
description="Cycles between health checks (0 = check every cycle)",
)
# Logging
log_file: Path = Field(
default=Path("/tmp/auto-commit.log"),

View file

@ -49,6 +49,9 @@ class DaemonStatus(BaseModel):
repos: list[str]
last_cycle: CycleResult | None = None
next_cycle_at: datetime | None = None
service_crashed: bool = False
service_health: str | None = None
last_health_check: datetime | None = None
class HealthResponse(BaseModel):

View file

@ -5,12 +5,14 @@ import logging
import uuid
from collections import deque
from datetime import datetime, timedelta
from typing import Any
from ..config import AutoCommitSettings
from ..git import Repository, discover_git_repos
from ..llm import LlamaCommitClient
from ..models import CycleResult, ProcessStatus, RepoProcessResult
from ..recovery import ErrorHandler
from ..service import LlamaServiceManager, ServiceHealth
from .processor import CommitProcessor
logger = logging.getLogger(__name__)
@ -36,6 +38,20 @@ class CommitDaemon:
self.llm_client = llm_client
self.error_handler = error_handler
# Initialize service manager
if settings.llama_service_autostart:
logger.info(f"Initializing service manager (autostart enabled)")
self.service_manager = LlamaServiceManager(
service_url=settings.llama_service_url,
pid_file=settings.llama_service_pid_file,
lock_file=settings.llama_service_lock_file,
startup_timeout=settings.llama_service_startup_timeout,
health_check_timeout=5.0,
)
else:
logger.warning("Service manager disabled (autostart=False)")
self.service_manager = None
# Initialize processor
self.processor = CommitProcessor(
llm_client=llm_client,
@ -60,6 +76,12 @@ class CommitDaemon:
self._last_cycle: CycleResult | None = None
self._next_cycle_at: datetime | None = None
# Service health tracking
self._service_crashed = False
self._service_health: ServiceHealth | None = None
self._last_health_check: datetime | None = None
self._last_health_check_cycle = 0 # Track cycles since last check
def _build_repos(self) -> list[Repository]:
"""Build the list of repositories to process."""
if self.settings.recursive_discovery:
@ -221,6 +243,165 @@ class CommitDaemon:
return errors
return errors
def get_last_fully_successful_cycle(self) -> CycleResult | None:
"""Find the last cycle where all repos succeeded.
Returns:
CycleResult where repos_failed == 0 and repos_committed > 0, or None
"""
for cycle in reversed(self._cycle_history):
if cycle.repos_failed == 0 and cycle.repos_committed > 0:
return cycle
return None
def get_repo_last_success_timestamp(self, repo_name: str) -> datetime | None:
"""Get timestamp of last successful commit for a repo.
Args:
repo_name: Repository name to check
Returns:
Timestamp of last SUCCESS or RECOVERED status, or None
"""
for cycle in reversed(self._cycle_history):
for result in cycle.results:
if result.repo_name == repo_name:
if result.status in (ProcessStatus.SUCCESS, ProcessStatus.RECOVERED):
return result.timestamp
return None
def categorize_errors(self) -> dict[str, dict[str, Any]]:
"""Categorize recent errors by type.
Returns:
Dict mapping category name to {count, examples[]}
"""
errors = self.get_error_history(limit=100) # Analyze more for patterns
categories = {
"network": {
"patterns": ["connection", "timeout", "unreachable", "timed out", "network"],
"count": 0,
"examples": [],
},
"auth": {
"patterns": ["authentication", "permission", "denied", "ssh", "credentials", "publickey"],
"count": 0,
"examples": [],
},
"merge_conflict": {
"patterns": ["conflict", "merge failed", "divergent", "non-fast-forward"],
"count": 0,
"examples": [],
},
"git_state": {
"patterns": ["detached head", "not a git", "no such remote", "no upstream"],
"count": 0,
"examples": [],
},
"llm_service": {
"patterns": ["llama-service", "model not loaded", "generation failed", "llm"],
"count": 0,
"examples": [],
},
"other": {
"patterns": [],
"count": 0,
"examples": [],
},
}
for error_entry in errors:
error_text = error_entry.get("error", "").lower()
categorized = False
for cat_name, cat_data in categories.items():
if cat_name == "other":
continue
if any(pattern in error_text for pattern in cat_data["patterns"]):
cat_data["count"] += 1
if len(cat_data["examples"]) < 3: # Keep up to 3 examples
cat_data["examples"].append(error_entry)
categorized = True
break
if not categorized:
categories["other"]["count"] += 1
if len(categories["other"]["examples"]) < 3:
categories["other"]["examples"].append(error_entry)
# Remove patterns key and empty categories from response
return {
k: {"count": v["count"], "examples": v["examples"]}
for k, v in categories.items()
if v["count"] > 0
}
@property
def service_crashed(self) -> bool:
"""Check if the llama service has crashed."""
return self._service_crashed
@property
def service_health(self) -> str | None:
"""Get the current service health status."""
return self._service_health.value if self._service_health else None
@property
def last_health_check(self) -> datetime | None:
"""Get the last health check timestamp."""
return self._last_health_check
async def _ensure_service_ready(self) -> bool:
"""Ensure llama service is available, starting if needed.
Returns:
True if service is available, False if crashed or unavailable
"""
if not self.service_manager:
# Autostart disabled, rely on external service
return True
# Check if we should perform health check this cycle
should_check = (
self.settings.llama_service_health_check_interval == 0 or
self._last_health_check_cycle >= self.settings.llama_service_health_check_interval
)
if should_check:
self._last_health_check = datetime.now()
self._last_health_check_cycle = 0
health = await self.service_manager.check_health()
self._service_health = health
if health == ServiceHealth.CRASHED:
self._service_crashed = True
logger.error("Llama service has crashed - commits disabled")
return False
if health == ServiceHealth.UNREACHABLE:
logger.info("Llama service unreachable, attempting to start...")
started = await self.service_manager.start_service()
if started:
self._service_crashed = False
self._service_health = ServiceHealth.HEALTHY
return True
else:
logger.error("Failed to start llama service")
return False
if health == ServiceHealth.DEGRADED:
logger.warning("Llama service is degraded")
return False
# Service is healthy
self._service_crashed = False
return True
else:
# Skip health check this cycle
self._last_health_check_cycle += 1
return not self._service_crashed
def enable(self) -> None:
"""Enable the daemon."""
self._enabled = True
@ -302,10 +483,10 @@ class CommitDaemon:
logger.info(f"Starting cycle {cycle_id}")
# Check LLM availability first
if not await self.llm_client.is_available():
logger.warning("LLM service not available, skipping cycle")
return CycleResult(
# Ensure service is ready (auto-start if needed)
if not await self._ensure_service_ready():
logger.warning("Llama service not ready, skipping cycle")
cycle_result = CycleResult(
cycle_id=cycle_id,
started_at=started_at,
completed_at=datetime.now(),
@ -314,6 +495,27 @@ class CommitDaemon:
repos_failed=0,
results=[],
)
self._last_cycle = cycle_result
self._cycle_history.append(cycle_result)
self._total_cycles += 1
return cycle_result
# Check LLM availability
if not await self.llm_client.is_available():
logger.warning("LLM service not available, skipping cycle")
cycle_result = CycleResult(
cycle_id=cycle_id,
started_at=started_at,
completed_at=datetime.now(),
repos_processed=0,
repos_committed=0,
repos_failed=0,
results=[],
)
self._last_cycle = cycle_result
self._cycle_history.append(cycle_result)
self._total_cycles += 1
return cycle_result
# Phase 1: Commit all repos with changes
logger.info(f"[{cycle_id}] Phase 1: Committing changes")

View file

@ -0,0 +1,17 @@
"""Service lifecycle management."""
from .manager import (
LlamaServiceManager,
ServiceHealth,
ServiceManagerError,
ServiceStartError,
ServiceCrashError,
)
__all__ = [
"LlamaServiceManager",
"ServiceHealth",
"ServiceManagerError",
"ServiceStartError",
"ServiceCrashError",
]

View file

@ -0,0 +1,235 @@
"""Llama service lifecycle manager."""
import asyncio
import fcntl
import logging
import os
import signal
import sys
import time
from enum import Enum
from pathlib import Path
import httpx
logger = logging.getLogger(__name__)
class ServiceManagerError(Exception):
"""Base exception for service manager errors."""
class ServiceStartError(ServiceManagerError):
"""Failed to start service."""
class ServiceCrashError(ServiceManagerError):
"""Service crashed unexpectedly."""
class ServiceHealth(str, Enum):
"""Service health status."""
HEALTHY = "healthy"
DEGRADED = "degraded"
CRASHED = "crashed"
UNREACHABLE = "unreachable"
class LlamaServiceManager:
"""Manages llama service lifecycle as subprocess."""
def __init__(
self,
service_url: str = "http://localhost:8000",
pid_file: Path | None = None,
lock_file: Path | None = None,
startup_timeout: float = 30.0,
health_check_timeout: float = 5.0,
):
"""Initialize service manager."""
self.service_url = service_url
self._pid_file = pid_file or Path.home() / ".config/commits/llama-service.pid"
self._lock_file = lock_file or Path.home() / ".config/commits/llama-service.lock"
self._startup_timeout = startup_timeout
self._health_check_timeout = health_check_timeout
self._spawned_pid: int | None = None
self._lock_fd: int | None = None
async def ensure_service_available(self) -> bool:
"""Ensure service is available, starting if necessary."""
health = await self.check_health()
if health == ServiceHealth.HEALTHY:
return True
if health in (ServiceHealth.DEGRADED, ServiceHealth.CRASHED):
return False
logger.info("Llama service unreachable, attempting to start...")
return await self.start_service()
async def start_service(self) -> bool:
"""Start llama service subprocess."""
pid = self._read_pid_file()
if pid and self._is_process_alive(pid):
logger.info(f"Service already running (PID: {pid})")
return True
if not self._acquire_lock():
await asyncio.sleep(2)
pid = self._read_pid_file()
if pid and self._is_process_alive(pid):
return True
return False
try:
pid = self._read_pid_file()
if pid and self._is_process_alive(pid):
return True
logger.info("Starting llama service subprocess...")
process = await self._spawn_service()
self._spawned_pid = process.pid
self._write_pid_file(process.pid)
logger.info(f"Llama service started (PID: {process.pid})")
if await self._wait_for_healthy(self._startup_timeout):
logger.info("✓ Llama service is healthy")
return True
else:
logger.error(f"✗ Service failed to start within {self._startup_timeout}s")
return False
except Exception as e:
logger.exception(f"Failed to start llama service: {e}")
return False
finally:
self._release_lock()
async def check_health(self) -> ServiceHealth:
"""Check service health and detect crashes."""
pid = self._read_pid_file()
if pid and not self._is_process_alive(pid):
return ServiceHealth.CRASHED
try:
async with httpx.AsyncClient(timeout=self._health_check_timeout) as client:
response = await client.get(f"{self.service_url}/health")
if response.status_code == 200:
data = response.json()
return ServiceHealth.HEALTHY if data.get("status") == "ok" else ServiceHealth.DEGRADED
return ServiceHealth.UNREACHABLE
except (httpx.ConnectError, httpx.TimeoutException):
return ServiceHealth.UNREACHABLE
except Exception:
return ServiceHealth.UNREACHABLE
async def stop_service(self) -> None:
"""Gracefully stop service if we own it."""
if self._spawned_pid is None or not self._is_process_alive(self._spawned_pid):
return
logger.info(f"Stopping llama service (PID: {self._spawned_pid})...")
try:
os.kill(self._spawned_pid, signal.SIGTERM)
for _ in range(10):
if not self._is_process_alive(self._spawned_pid):
self._cleanup_pid_file()
return
await asyncio.sleep(0.5)
os.kill(self._spawned_pid, signal.SIGKILL)
self._cleanup_pid_file()
except ProcessLookupError:
self._cleanup_pid_file()
except Exception as e:
logger.exception(f"Error stopping service: {e}")
def _acquire_lock(self) -> bool:
"""Acquire exclusive lock."""
self._lock_file.parent.mkdir(parents=True, exist_ok=True)
try:
self._lock_fd = os.open(self._lock_file, os.O_CREAT | os.O_WRONLY)
fcntl.flock(self._lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
return True
except BlockingIOError:
return False
def _release_lock(self) -> None:
"""Release lock."""
if self._lock_fd is not None:
try:
fcntl.flock(self._lock_fd, fcntl.LOCK_UN)
os.close(self._lock_fd)
self._lock_fd = None
except Exception:
pass
def _read_pid_file(self) -> int | None:
"""Read PID from file."""
if not self._pid_file.exists():
return None
try:
content = self._pid_file.read_text().strip()
return int(content) if content else None
except Exception:
return None
def _write_pid_file(self, pid: int) -> None:
"""Write PID to file."""
self._pid_file.parent.mkdir(parents=True, exist_ok=True)
self._pid_file.write_text(str(pid))
def _cleanup_pid_file(self) -> None:
"""Remove PID file."""
try:
if self._pid_file.exists():
self._pid_file.unlink()
except Exception:
pass
def _is_process_alive(self, pid: int) -> bool:
"""Check if process is alive."""
try:
os.kill(pid, 0)
return True
except OSError:
return False
async def _spawn_service(self) -> asyncio.subprocess.Process:
"""Spawn service as background subprocess."""
cmd = [sys.executable, "-m", "tqftw_llama_service"]
env = os.environ.copy()
# Enable mock mode if no real models configured
if "LLAMA_SERVICE_FAST_MODEL_PATH" not in env and "LLAMA_SERVICE_REASONING_MODEL_PATH" not in env:
env["LLAMA_SERVICE_MOCK_MODE"] = "true"
log_file = self._pid_file.parent / "llama-service.log"
log_file.parent.mkdir(parents=True, exist_ok=True)
with open(log_file, "a") as log:
log.write(f"\n=== Service started at {time.ctime()} ===\n")
process = await asyncio.create_subprocess_exec(
*cmd,
env=env,
stdout=log,
stderr=asyncio.subprocess.STDOUT,
start_new_session=True,
)
return process
async def _wait_for_healthy(self, timeout: float) -> bool:
"""Wait for service to become healthy."""
start = time.time()
while time.time() - start < timeout:
try:
async with httpx.AsyncClient(timeout=2.0) as client:
response = await client.get(f"{self.service_url}/health")
if response.status_code == 200:
data = response.json()
if data.get("status") == "ok":
return True
except Exception:
pass
await asyncio.sleep(1)
return False