feat(auto-commit): add crash detection and multi-retry restart
- Add llama_service_max_restart_attempts (default: 3)
- Add llama_service_restart_backoff_seconds (default: 5.0s)
- Implement retry logic with linearly increasing backoff (base delay × attempt number: 5s, 10s, 15s, …)
- Track crash timestamps and restart attempts
- Expose crash state in /health and /status endpoints:
  - llama_service_crashed
  - llama_service_restart_attempts
  - llama_service_last_crash
  - llama_service_last_successful_restart
- Enhanced logging with ✓/✗ symbols for restart outcomes

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
f0bf1dc859
commit
a6a7e96889
4 changed files with 78 additions and 10 deletions
|
|
@ -147,6 +147,10 @@ def create_auto_commit_service(
|
|||
llama_service_available=llama_available,
|
||||
repos_accessible=repos_accessible,
|
||||
error=error,
|
||||
llama_service_crashed=service_crashed,
|
||||
llama_service_restart_attempts=daemon.service_restart_attempts,
|
||||
llama_service_last_crash=daemon.service_last_crash_time,
|
||||
llama_service_last_successful_restart=daemon.service_last_successful_restart,
|
||||
)
|
||||
|
||||
@app.get("/status", response_model=DaemonStatus)
|
||||
|
|
@ -162,6 +166,9 @@ def create_auto_commit_service(
|
|||
service_crashed=daemon.service_crashed,
|
||||
service_health=daemon.service_health,
|
||||
last_health_check=daemon.last_health_check,
|
||||
service_restart_attempts=daemon.service_restart_attempts,
|
||||
service_last_crash_time=daemon.service_last_crash_time,
|
||||
service_last_successful_restart=daemon.service_last_successful_restart,
|
||||
)
|
||||
|
||||
@app.post("/trigger", response_model=TriggerResponse)
|
||||
|
|
|
|||
|
|
@ -121,6 +121,14 @@ class AutoCommitSettings(BaseServiceSettings):
|
|||
default=0,
|
||||
description="Cycles between health checks (0 = check every cycle)",
|
||||
)
|
||||
llama_service_max_restart_attempts: int = Field(
|
||||
default=3,
|
||||
description="Maximum restart attempts before giving up",
|
||||
)
|
||||
llama_service_restart_backoff_seconds: float = Field(
|
||||
default=5.0,
|
||||
description="Delay between restart attempts (seconds)",
|
||||
)
|
||||
|
||||
# Model-boss integration for auto-loading LLM
|
||||
llama_model_id: str = Field(
|
||||
|
|
|
|||
|
|
@ -52,6 +52,9 @@ class DaemonStatus(BaseModel):
|
|||
service_crashed: bool = False
|
||||
service_health: str | None = None
|
||||
last_health_check: datetime | None = None
|
||||
service_restart_attempts: int = 0
|
||||
service_last_crash_time: datetime | None = None
|
||||
service_last_successful_restart: datetime | None = None
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
|
||||
|
|
@ -62,6 +65,10 @@ class HealthResponse(BaseModel):
|
|||
llama_service_available: bool
|
||||
repos_accessible: bool
|
||||
error: str | None = None
|
||||
llama_service_crashed: bool = False
|
||||
llama_service_restart_attempts: int = 0
|
||||
llama_service_last_crash: datetime | None = None
|
||||
llama_service_last_successful_restart: datetime | None = None
|
||||
|
||||
|
||||
class TriggerResponse(BaseModel):
|
||||
|
|
|
|||
|
|
@ -91,6 +91,9 @@ class CommitDaemon:
|
|||
self._service_health: ServiceHealth | None = None
|
||||
self._last_health_check: datetime | None = None
|
||||
self._last_health_check_cycle = 0 # Track cycles since last check
|
||||
self._service_restart_attempts = 0
|
||||
self._last_crash_time: datetime | None = None
|
||||
self._last_successful_restart: datetime | None = None
|
||||
|
||||
def _build_repos(self) -> list[Repository]:
|
||||
"""Build the list of repositories to process."""
|
||||
|
|
@ -380,6 +383,21 @@ class CommitDaemon:
|
|||
"""Get the last health check timestamp."""
|
||||
return self._last_health_check
|
||||
|
||||
@property
|
||||
def service_restart_attempts(self) -> int:
|
||||
"""Get the number of restart attempts."""
|
||||
return self._service_restart_attempts
|
||||
|
||||
@property
|
||||
def service_last_crash_time(self) -> datetime | None:
|
||||
"""Get the last crash timestamp."""
|
||||
return self._last_crash_time
|
||||
|
||||
@property
|
||||
def service_last_successful_restart(self) -> datetime | None:
|
||||
"""Get the last successful restart timestamp."""
|
||||
return self._last_successful_restart
|
||||
|
||||
async def _ensure_service_ready(self) -> bool:
|
||||
"""Ensure llama service is available, starting if needed.
|
||||
|
||||
|
|
@ -404,19 +422,47 @@ class CommitDaemon:
|
|||
|
||||
if health == ServiceHealth.CRASHED or health == ServiceHealth.UNREACHABLE:
|
||||
status_msg = "crashed (stale PID)" if health == ServiceHealth.CRASHED else "unreachable"
|
||||
logger.info(f"Llama service {status_msg}, attempting to start...")
|
||||
started = await self.service_manager.ensure_service_available()
|
||||
if started:
|
||||
self._service_crashed = False
|
||||
self._service_health = ServiceHealth.HEALTHY
|
||||
return True
|
||||
else:
|
||||
self._service_crashed = True
|
||||
logger.error("Failed to start llama service")
|
||||
return False
|
||||
self._last_crash_time = datetime.now()
|
||||
|
||||
logger.warning(
|
||||
f"Llama service {status_msg}, attempting restart "
|
||||
f"(attempt 1/{self.settings.llama_service_max_restart_attempts})"
|
||||
)
|
||||
|
||||
# Retry logic with linearly increasing backoff (base delay × attempt number)
|
||||
for attempt in range(1, self.settings.llama_service_max_restart_attempts + 1):
|
||||
self._service_restart_attempts = attempt
|
||||
|
||||
started = await self.service_manager.ensure_service_available()
|
||||
if started:
|
||||
self._service_crashed = False
|
||||
self._service_health = ServiceHealth.HEALTHY
|
||||
self._service_restart_attempts = 0
|
||||
self._last_successful_restart = datetime.now()
|
||||
logger.info(f"✓ Llama service restarted successfully (attempt {attempt}/{self.settings.llama_service_max_restart_attempts})")
|
||||
return True
|
||||
|
||||
# Failed this attempt
|
||||
if attempt < self.settings.llama_service_max_restart_attempts:
|
||||
backoff = self.settings.llama_service_restart_backoff_seconds * attempt
|
||||
logger.warning(
|
||||
f"✗ Restart attempt {attempt}/{self.settings.llama_service_max_restart_attempts} failed, "
|
||||
f"retrying in {backoff:.1f}s..."
|
||||
)
|
||||
await asyncio.sleep(backoff)
|
||||
else:
|
||||
logger.error(
|
||||
f"✗ Failed to restart llama service after {self.settings.llama_service_max_restart_attempts} attempts"
|
||||
)
|
||||
|
||||
# All attempts failed
|
||||
self._service_crashed = True
|
||||
self._service_restart_attempts = self.settings.llama_service_max_restart_attempts
|
||||
return False
|
||||
|
||||
# Service is healthy
|
||||
self._service_crashed = False
|
||||
self._service_restart_attempts = 0
|
||||
return True
|
||||
else:
|
||||
# Skip health check this cycle
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue