feat(@ml/auto-commit-service): add stalled repo notifications

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Natalie 2026-04-15 22:23:55 -07:00
parent 3b7be20bce
commit a4a2f8dd82
2 changed files with 99 additions and 3 deletions

View file

@ -73,6 +73,9 @@ class CommitsTrayApp(rumps.App):
self._local_repos: list[Path] = []
self._repos_discovered = False
# Track stalled-repo transitions so we notify once per edge
self._last_stalled_set: set[str] = set()
# Build menu
self._status_item = rumps.MenuItem("Status: starting...", callback=None)
self._status_item.set_callback(None)
@ -132,6 +135,37 @@ class CommitsTrayApp(rumps.App):
last = self.agent.last_cycle
stalled = list(last.stalled_repos) if last else []
# Notify on transitions into/out of stalled state (once per edge).
new_stalled = {e["repo_name"] for e in stalled}
newly_stalled = new_stalled - self._last_stalled_set
recovered = self._last_stalled_set - new_stalled
for repo_name in sorted(newly_stalled):
entry = next((e for e in stalled if e["repo_name"] == repo_name), {})
ahead = entry.get("ahead", 0)
behind = entry.get("behind", 0)
reason = entry.get("reason", "diverged")
try:
rumps.notification(
title="ACS stalled",
subtitle=repo_name,
message=f"{ahead}{behind}↓ ({reason}) — needs human attention",
)
except Exception as exc:
logger.warning("Failed to post stalled notification for %s: %s", repo_name, exc)
for repo_name in sorted(recovered):
try:
rumps.notification(
title="ACS recovered",
subtitle=repo_name,
message="back in sync",
)
except Exception as exc:
logger.warning("Failed to post recovery notification for %s: %s", repo_name, exc)
self._last_stalled_set = new_stalled
# Icon
if not agent_running:
icon = ICON_ERROR

View file

@ -9,7 +9,9 @@ Runs as a background thread managed by the tray app.
from __future__ import annotations
import json
import logging
import os
import socket
import subprocess
import threading
@ -75,6 +77,7 @@ class LocalCommitAgent:
repos_paths: list[Path],
cycle_seconds: int = 300,
co_author: str = "Lilith Autocommit <noreply@atlilith.com>",
recovery_state_path: Path | None = None,
):
self.acs_url = acs_url.rstrip("/")
self.repos_paths = repos_paths
@ -89,10 +92,62 @@ class LocalCommitAgent:
self._repos: list[Path] = []
self._last_cycle: CycleResult | None = None
self._total_cycles = 0
self._last_recovery_at: dict[str, float] = {} # repo_name -> monotonic ts
self._recovery_state_path_override = recovery_state_path
self._last_recovery_at: dict[str, float] = {} # repo_name -> wall-clock UNIX ts
# repo_name -> required cooldown for *next* attempt. Starts at 1h, bumps
# to 24h after Claude couldn't/wouldn't resolve. Resets on clean success.
self._recovery_cooldown: dict[str, float] = {}
self._load_recovery_state()
def _recovery_state_path(self) -> Path:
if self._recovery_state_path_override is not None:
return self._recovery_state_path_override
return Path.home() / "Library" / "Application Support" / "Commits" / "recovery.json"
def _load_recovery_state(self) -> None:
path = self._recovery_state_path()
try:
with path.open("r", encoding="utf-8") as fh:
data = json.load(fh)
except FileNotFoundError:
logger.debug(f"Recovery state file not found at {path}; starting fresh")
return
except (OSError, json.JSONDecodeError) as e:
logger.debug(f"Recovery state unreadable at {path}: {e}; starting fresh")
return
if not isinstance(data, dict):
logger.debug(f"Recovery state at {path} is not a dict; starting fresh")
return
for repo_name, entry in data.items():
if not isinstance(entry, dict):
continue
last_attempt = entry.get("last_attempt")
cooldown_sec = entry.get("cooldown_sec")
if isinstance(last_attempt, (int, float)):
self._last_recovery_at[repo_name] = float(last_attempt)
if isinstance(cooldown_sec, (int, float)):
self._recovery_cooldown[repo_name] = float(cooldown_sec)
def _save_recovery_state(self) -> None:
path = self._recovery_state_path()
payload: dict[str, dict[str, float]] = {}
for repo_name, last_attempt in self._last_recovery_at.items():
payload[repo_name] = {
"last_attempt": last_attempt,
"cooldown_sec": self._recovery_cooldown.get(
repo_name, CLAUDE_RECOVERY_COOLDOWN_SEC
),
}
try:
path.parent.mkdir(parents=True, exist_ok=True)
tmp_path = path.with_suffix(path.suffix + ".tmp")
with tmp_path.open("w", encoding="utf-8") as fh:
json.dump(payload, fh, indent=2, sort_keys=True)
os.replace(tmp_path, path)
except OSError as e:
logger.warning(f"Failed to persist recovery state to {path}: {e}")
@property
def is_running(self) -> bool:
@ -327,7 +382,7 @@ class LocalCommitAgent:
result: CycleResult,
) -> None:
"""Invoke Claude Code (rate-limited) to reconcile divergence."""
now = time.monotonic()
now = time.time()
last = self._last_recovery_at.get(repo_name, 0.0)
required = self._recovery_cooldown.get(repo_name, CLAUDE_RECOVERY_COOLDOWN_SEC)
cooled_down = (now - last) >= required
@ -338,7 +393,7 @@ class LocalCommitAgent:
"ahead": ahead,
"behind": behind,
"last_attempt": (
datetime.fromtimestamp(time.time() - (now - last), timezone.utc).isoformat()
datetime.fromtimestamp(last, timezone.utc).isoformat()
if last else None
),
}
@ -352,6 +407,7 @@ class LocalCommitAgent:
return
self._last_recovery_at[repo_name] = now
self._save_recovery_state()
logger.warning(
f"Diverged {repo_name}: {ahead}{behind}↓ — invoking claude-code for recovery"
)
@ -362,12 +418,16 @@ class LocalCommitAgent:
if ahead2 == 0 and behind2 == 0:
logger.info(f"Claude resolved {repo_name} cleanly")
self._recovery_cooldown.pop(repo_name, None)
self._last_recovery_at.pop(repo_name, None)
self._save_recovery_state()
return
if behind2 == 0 and ahead2 > 0:
try:
_git(repo_path, "push")
logger.info(f"Claude rebased {repo_name}; push succeeded")
self._recovery_cooldown.pop(repo_name, None)
self._last_recovery_at.pop(repo_name, None)
self._save_recovery_state()
return
except Exception as e:
logger.error(f"Claude rebased {repo_name} but push failed: {e}")
@ -377,10 +437,12 @@ class LocalCommitAgent:
)
stall_entry.update(ahead=ahead2, behind=behind2, reason="claude_partial")
self._recovery_cooldown[repo_name] = CLAUDE_STUCK_COOLDOWN_SEC
self._save_recovery_state()
else:
logger.warning(f"Claude could not recover {repo_name} — marking stalled for 24h")
stall_entry["reason"] = "claude_failed"
self._recovery_cooldown[repo_name] = CLAUDE_STUCK_COOLDOWN_SEC
self._save_recovery_state()
result.stalled_repos.append(stall_entry)