feat(auto-commit): add crash detection and multi-retry restart

- Add llama_service_max_restart_attempts (default: 3)
- Add llama_service_restart_backoff_seconds (default: 5.0s)
- Implement retry logic with linearly increasing backoff (delay = backoff × attempt number; 5s, then 10s between the 3 default attempts)
- Track crash timestamps and restart attempts
- Expose crash state in /health and /status endpoints:
  - llama_service_crashed
  - llama_service_restart_attempts
  - llama_service_last_crash
  - llama_service_last_successful_restart
- Enhanced logging with ✓/✗ symbols for restart outcomes

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Lilith 2026-01-10 23:27:58 -08:00
parent f0bf1dc859
commit a6a7e96889
4 changed files with 78 additions and 10 deletions

View file

@ -147,6 +147,10 @@ def create_auto_commit_service(
llama_service_available=llama_available,
repos_accessible=repos_accessible,
error=error,
llama_service_crashed=service_crashed,
llama_service_restart_attempts=daemon.service_restart_attempts,
llama_service_last_crash=daemon.service_last_crash_time,
llama_service_last_successful_restart=daemon.service_last_successful_restart,
)
@app.get("/status", response_model=DaemonStatus)
@ -162,6 +166,9 @@ def create_auto_commit_service(
service_crashed=daemon.service_crashed,
service_health=daemon.service_health,
last_health_check=daemon.last_health_check,
service_restart_attempts=daemon.service_restart_attempts,
service_last_crash_time=daemon.service_last_crash_time,
service_last_successful_restart=daemon.service_last_successful_restart,
)
@app.post("/trigger", response_model=TriggerResponse)

View file

@ -121,6 +121,14 @@ class AutoCommitSettings(BaseServiceSettings):
default=0,
description="Cycles between health checks (0 = check every cycle)",
)
llama_service_max_restart_attempts: int = Field(
default=3,
description="Maximum restart attempts before giving up",
)
llama_service_restart_backoff_seconds: float = Field(
default=5.0,
description="Delay between restart attempts (seconds)",
)
# Model-boss integration for auto-loading LLM
llama_model_id: str = Field(

View file

@ -52,6 +52,9 @@ class DaemonStatus(BaseModel):
service_crashed: bool = False
service_health: str | None = None
last_health_check: datetime | None = None
service_restart_attempts: int = 0
service_last_crash_time: datetime | None = None
service_last_successful_restart: datetime | None = None
class HealthResponse(BaseModel):
@ -62,6 +65,10 @@ class HealthResponse(BaseModel):
llama_service_available: bool
repos_accessible: bool
error: str | None = None
llama_service_crashed: bool = False
llama_service_restart_attempts: int = 0
llama_service_last_crash: datetime | None = None
llama_service_last_successful_restart: datetime | None = None
class TriggerResponse(BaseModel):

View file

@ -91,6 +91,9 @@ class CommitDaemon:
self._service_health: ServiceHealth | None = None
self._last_health_check: datetime | None = None
self._last_health_check_cycle = 0 # Track cycles since last check
self._service_restart_attempts = 0
self._last_crash_time: datetime | None = None
self._last_successful_restart: datetime | None = None
def _build_repos(self) -> list[Repository]:
"""Build the list of repositories to process."""
@ -380,6 +383,21 @@ class CommitDaemon:
"""Get the last health check timestamp."""
return self._last_health_check
@property
def service_restart_attempts(self) -> int:
"""Get the number of restart attempts.

Set to the current attempt number while _ensure_service_ready is
retrying, reset to 0 after a successful restart or when the service is
found healthy, and left at the configured maximum when every attempt
has failed.
"""
return self._service_restart_attempts
@property
def service_last_crash_time(self) -> datetime | None:
"""Get the last crash timestamp.

None until _ensure_service_ready has started a restart sequence; it is
then set from datetime.now() (a naive timestamp, no tzinfo) just before
the first restart attempt.
"""
return self._last_crash_time
@property
def service_last_successful_restart(self) -> datetime | None:
"""Get the last successful restart timestamp.

None until a restart attempt has succeeded; it is then set from
datetime.now() (a naive timestamp, no tzinfo) at the moment the service
manager reports the service as started.
"""
return self._last_successful_restart
async def _ensure_service_ready(self) -> bool:
"""Ensure llama service is available, starting if needed.
@ -404,19 +422,47 @@ class CommitDaemon:
if health == ServiceHealth.CRASHED or health == ServiceHealth.UNREACHABLE:
status_msg = "crashed (stale PID)" if health == ServiceHealth.CRASHED else "unreachable"
logger.info(f"Llama service {status_msg}, attempting to start...")
started = await self.service_manager.ensure_service_available()
if started:
self._service_crashed = False
self._service_health = ServiceHealth.HEALTHY
return True
else:
self._service_crashed = True
logger.error("Failed to start llama service")
return False
self._last_crash_time = datetime.now()
logger.warning(
f"Llama service {status_msg}, attempting restart "
f"(attempt 1/{self.settings.llama_service_max_restart_attempts})"
)
# Retry logic with linear backoff (delay = backoff_seconds * attempt)
for attempt in range(1, self.settings.llama_service_max_restart_attempts + 1):
self._service_restart_attempts = attempt
started = await self.service_manager.ensure_service_available()
if started:
self._service_crashed = False
self._service_health = ServiceHealth.HEALTHY
self._service_restart_attempts = 0
self._last_successful_restart = datetime.now()
logger.info(f"✓ Llama service restarted successfully (attempt {attempt}/{self.settings.llama_service_max_restart_attempts})")
return True
# Failed this attempt
if attempt < self.settings.llama_service_max_restart_attempts:
backoff = self.settings.llama_service_restart_backoff_seconds * attempt
logger.warning(
f"✗ Restart attempt {attempt}/{self.settings.llama_service_max_restart_attempts} failed, "
f"retrying in {backoff:.1f}s..."
)
await asyncio.sleep(backoff)
else:
logger.error(
f"✗ Failed to restart llama service after {self.settings.llama_service_max_restart_attempts} attempts"
)
# All attempts failed
self._service_crashed = True
self._service_restart_attempts = self.settings.llama_service_max_restart_attempts
return False
# Service is healthy
self._service_crashed = False
self._service_restart_attempts = 0
return True
else:
# Skip health check this cycle