diff --git a/src/auto_commit_service/app.py b/src/auto_commit_service/app.py
index 71b90c0..862ae1b 100644
--- a/src/auto_commit_service/app.py
+++ b/src/auto_commit_service/app.py
@@ -147,6 +147,10 @@ def create_auto_commit_service(
             llama_service_available=llama_available,
             repos_accessible=repos_accessible,
             error=error,
+            llama_service_crashed=service_crashed,
+            llama_service_restart_attempts=daemon.service_restart_attempts,
+            llama_service_last_crash=daemon.service_last_crash_time,
+            llama_service_last_successful_restart=daemon.service_last_successful_restart,
         )
 
     @app.get("/status", response_model=DaemonStatus)
@@ -162,6 +166,9 @@ def create_auto_commit_service(
             service_crashed=daemon.service_crashed,
             service_health=daemon.service_health,
             last_health_check=daemon.last_health_check,
+            service_restart_attempts=daemon.service_restart_attempts,
+            service_last_crash_time=daemon.service_last_crash_time,
+            service_last_successful_restart=daemon.service_last_successful_restart,
         )
 
     @app.post("/trigger", response_model=TriggerResponse)
diff --git a/src/auto_commit_service/config.py b/src/auto_commit_service/config.py
index 278ebd2..48330a7 100644
--- a/src/auto_commit_service/config.py
+++ b/src/auto_commit_service/config.py
@@ -121,6 +121,16 @@ class AutoCommitSettings(BaseServiceSettings):
         default=0,
         description="Cycles between health checks (0 = check every cycle)",
     )
+    llama_service_max_restart_attempts: int = Field(
+        default=3,
+        ge=0,
+        description="Maximum restart attempts before giving up (0 = never attempt a restart)",
+    )
+    llama_service_restart_backoff_seconds: float = Field(
+        default=5.0,
+        ge=0.0,
+        description="Base delay between restart attempts in seconds (multiplied by the attempt number)",
+    )
 
     # Model-boss integration for auto-loading LLM
     llama_model_id: str = Field(
diff --git a/src/auto_commit_service/models.py b/src/auto_commit_service/models.py
index 5ab515f..8ab583b 100644
--- a/src/auto_commit_service/models.py
+++ b/src/auto_commit_service/models.py
@@ -52,6 +52,9 @@ class DaemonStatus(BaseModel):
     service_crashed: bool = False
     service_health: str | None = None
     last_health_check: datetime | None = None
+    service_restart_attempts: int = 0
+    service_last_crash_time: datetime | None = None
+    service_last_successful_restart: datetime | None = None
 
 
 class HealthResponse(BaseModel):
@@ -62,6 +65,10 @@ class HealthResponse(BaseModel):
     llama_service_available: bool
     repos_accessible: bool
     error: str | None = None
+    llama_service_crashed: bool = False
+    llama_service_restart_attempts: int = 0
+    llama_service_last_crash: datetime | None = None
+    llama_service_last_successful_restart: datetime | None = None
 
 
 class TriggerResponse(BaseModel):
diff --git a/src/auto_commit_service/scheduler/daemon.py b/src/auto_commit_service/scheduler/daemon.py
index 1f28de6..7886387 100644
--- a/src/auto_commit_service/scheduler/daemon.py
+++ b/src/auto_commit_service/scheduler/daemon.py
@@ -91,6 +91,9 @@ class CommitDaemon:
         self._service_health: ServiceHealth | None = None
         self._last_health_check: datetime | None = None
         self._last_health_check_cycle = 0  # Track cycles since last check
+        self._service_restart_attempts = 0
+        self._last_crash_time: datetime | None = None
+        self._last_successful_restart: datetime | None = None
 
     def _build_repos(self) -> list[Repository]:
         """Build the list of repositories to process."""
@@ -380,6 +383,21 @@ class CommitDaemon:
         """Get the last health check timestamp."""
         return self._last_health_check
 
+    @property
+    def service_restart_attempts(self) -> int:
+        """Get the restart attempts made in the current recovery (0 once healthy)."""
+        return self._service_restart_attempts
+
+    @property
+    def service_last_crash_time(self) -> datetime | None:
+        """Get when a health check last found the service crashed or unreachable."""
+        return self._last_crash_time
+
+    @property
+    def service_last_successful_restart(self) -> datetime | None:
+        """Get the last successful restart timestamp."""
+        return self._last_successful_restart
+
     async def _ensure_service_ready(self) -> bool:
         """Ensure llama service is available, starting if needed.
 
@@ -404,19 +422,50 @@ class CommitDaemon:
 
             if health == ServiceHealth.CRASHED or health == ServiceHealth.UNREACHABLE:
                 status_msg = "crashed (stale PID)" if health == ServiceHealth.CRASHED else "unreachable"
-                logger.info(f"Llama service {status_msg}, attempting to start...")
-                started = await self.service_manager.ensure_service_available()
-                if started:
-                    self._service_crashed = False
-                    self._service_health = ServiceHealth.HEALTHY
-                    return True
-                else:
-                    self._service_crashed = True
-                    logger.error("Failed to start llama service")
-                    return False
+                self._last_crash_time = datetime.now()
+                max_attempts = self.settings.llama_service_max_restart_attempts
+
+                # Retry with a linearly increasing delay: the configured
+                # backoff is multiplied by the attempt number (not exponential).
+                for attempt in range(1, max_attempts + 1):
+                    self._service_restart_attempts = attempt
+                    logger.warning(
+                        f"Llama service {status_msg}, attempting restart "
+                        f"(attempt {attempt}/{max_attempts})"
+                    )
+
+                    started = await self.service_manager.ensure_service_available()
+                    if started:
+                        self._service_crashed = False
+                        self._service_health = ServiceHealth.HEALTHY
+                        self._service_restart_attempts = 0
+                        self._last_successful_restart = datetime.now()
+                        logger.info(
+                            "✓ Llama service restarted successfully "
+                            f"(attempt {attempt}/{max_attempts})"
+                        )
+                        return True
+
+                    # Sleep only between attempts, never after the last one.
+                    if attempt < max_attempts:
+                        backoff = self.settings.llama_service_restart_backoff_seconds * attempt
+                        logger.warning(
+                            f"✗ Restart attempt {attempt}/{max_attempts} failed, "
+                            f"retrying in {backoff:.1f}s..."
+                        )
+                        await asyncio.sleep(backoff)
+
+                # All attempts failed
+                logger.error(
+                    f"✗ Failed to restart llama service after {max_attempts} attempts"
+                )
+                self._service_crashed = True
+                self._service_restart_attempts = max_attempts
+                return False
 
             # Service is healthy
             self._service_crashed = False
+            self._service_restart_attempts = 0
             return True
         else:
             # Skip health check this cycle