feat(auto-commit): add crash detection and multi-retry restart

- Add llama_service_max_restart_attempts (default: 3)
- Add llama_service_restart_backoff_seconds (default: 5.0s)
- Implement retry logic with linearly increasing backoff (base delay × attempt number: 5s, then 10s with the defaults; no delay after the final attempt)
- Track crash timestamps and restart attempts
- Expose crash state in /health and /status endpoints:
  - llama_service_crashed
  - llama_service_restart_attempts
  - llama_service_last_crash
  - llama_service_last_successful_restart
- Enhanced logging with ✓/✗ symbols for restart outcomes

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-10 23:27:58 -08:00 · 2026-01-10 23:27:58 -08:00 · a6a7e96889
commit a6a7e96889
parent f0bf1dc859
4 changed files with 78 additions and 10 deletions
--- a/src/auto_commit_service/app.py
+++ b/src/auto_commit_service/app.py
@ -147,6 +147,10 @@ def create_auto_commit_service(
            llama_service_available=llama_available,
            repos_accessible=repos_accessible,
            error=error,
+            llama_service_crashed=service_crashed,
+            llama_service_restart_attempts=daemon.service_restart_attempts,
+            llama_service_last_crash=daemon.service_last_crash_time,
+            llama_service_last_successful_restart=daemon.service_last_successful_restart,
        )

    @app.get("/status", response_model=DaemonStatus)
@ -162,6 +166,9 @@ def create_auto_commit_service(
            service_crashed=daemon.service_crashed,
            service_health=daemon.service_health,
            last_health_check=daemon.last_health_check,
+            service_restart_attempts=daemon.service_restart_attempts,
+            service_last_crash_time=daemon.service_last_crash_time,
+            service_last_successful_restart=daemon.service_last_successful_restart,
        )

    @app.post("/trigger", response_model=TriggerResponse)
--- a/src/auto_commit_service/config.py
+++ b/src/auto_commit_service/config.py
@ -121,6 +121,14 @@ class AutoCommitSettings(BaseServiceSettings):
        default=0,
        description="Cycles between health checks (0 = check every cycle)",
    )
+    llama_service_max_restart_attempts: int = Field(
+        default=3,
+        description="Maximum restart attempts before giving up",
+    )
+    llama_service_restart_backoff_seconds: float = Field(
+        default=5.0,
+        description="Delay between restart attempts (seconds)",
+    )

    # Model-boss integration for auto-loading LLM
    llama_model_id: str = Field(
--- a/src/auto_commit_service/models.py
+++ b/src/auto_commit_service/models.py
@ -52,6 +52,9 @@ class DaemonStatus(BaseModel):
    service_crashed: bool = False
    service_health: str | None = None
    last_health_check: datetime | None = None
+    service_restart_attempts: int = 0
+    service_last_crash_time: datetime | None = None
+    service_last_successful_restart: datetime | None = None


 class HealthResponse(BaseModel):
@ -62,6 +65,10 @@ class HealthResponse(BaseModel):
    llama_service_available: bool
    repos_accessible: bool
    error: str | None = None
+    llama_service_crashed: bool = False
+    llama_service_restart_attempts: int = 0
+    llama_service_last_crash: datetime | None = None
+    llama_service_last_successful_restart: datetime | None = None


 class TriggerResponse(BaseModel):
--- a/src/auto_commit_service/scheduler/daemon.py
+++ b/src/auto_commit_service/scheduler/daemon.py
@ -91,6 +91,9 @@ class CommitDaemon:
        self._service_health: ServiceHealth | None = None
        self._last_health_check: datetime | None = None
        self._last_health_check_cycle = 0  # Track cycles since last check
+        self._service_restart_attempts = 0
+        self._last_crash_time: datetime | None = None
+        self._last_successful_restart: datetime | None = None

    def _build_repos(self) -> list[Repository]:
        """Build the list of repositories to process."""
@ -380,6 +383,21 @@ class CommitDaemon:
        """Get the last health check timestamp."""
        return self._last_health_check

+    @property
+    def service_restart_attempts(self) -> int:
+        """Get the number of restart attempts."""
+        return self._service_restart_attempts
+
+    @property
+    def service_last_crash_time(self) -> datetime | None:
+        """Get the last crash timestamp."""
+        return self._last_crash_time
+
+    @property
+    def service_last_successful_restart(self) -> datetime | None:
+        """Get the last successful restart timestamp."""
+        return self._last_successful_restart
+
    async def _ensure_service_ready(self) -> bool:
        """Ensure llama service is available, starting if needed.

@ -404,19 +422,47 @@ class CommitDaemon:

            if health == ServiceHealth.CRASHED or health == ServiceHealth.UNREACHABLE:
                status_msg = "crashed (stale PID)" if health == ServiceHealth.CRASHED else "unreachable"
-                logger.info(f"Llama service {status_msg}, attempting to start...")
-                started = await self.service_manager.ensure_service_available()
-                if started:
-                    self._service_crashed = False
-                    self._service_health = ServiceHealth.HEALTHY
-                    return True
-                else:
-                    self._service_crashed = True
-                    logger.error("Failed to start llama service")
-                    return False
+                self._last_crash_time = datetime.now()
+
+                logger.warning(
+                    f"Llama service {status_msg}, attempting restart "
+                    f"(attempt 1/{self.settings.llama_service_max_restart_attempts})"
+                )
+
+                # Retry with linearly increasing backoff (base delay * attempt number)
+                for attempt in range(1, self.settings.llama_service_max_restart_attempts + 1):
+                    self._service_restart_attempts = attempt
+
+                    started = await self.service_manager.ensure_service_available()
+                    if started:
+                        self._service_crashed = False
+                        self._service_health = ServiceHealth.HEALTHY
+                        self._service_restart_attempts = 0
+                        self._last_successful_restart = datetime.now()
+                        logger.info(f"✓ Llama service restarted successfully (attempt {attempt}/{self.settings.llama_service_max_restart_attempts})")
+                        return True
+
+                    # Failed this attempt
+                    if attempt < self.settings.llama_service_max_restart_attempts:
+                        backoff = self.settings.llama_service_restart_backoff_seconds * attempt
+                        logger.warning(
+                            f"✗ Restart attempt {attempt}/{self.settings.llama_service_max_restart_attempts} failed, "
+                            f"retrying in {backoff:.1f}s..."
+                        )
+                        await asyncio.sleep(backoff)
+                    else:
+                        logger.error(
+                            f"✗ Failed to restart llama service after {self.settings.llama_service_max_restart_attempts} attempts"
+                        )
+
+                # All attempts failed
+                self._service_crashed = True
+                self._service_restart_attempts = self.settings.llama_service_max_restart_attempts
+                return False

            # Service is healthy
            self._service_crashed = False
+            self._service_restart_attempts = 0
            return True
        else:
            # Skip health check this cycle