feat(@ml/auto-commit-service): ✨ add stalled repo notifications
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
3b7be20bce
commit
a4a2f8dd82
2 changed files with 99 additions and 3 deletions
|
|
@ -73,6 +73,9 @@ class CommitsTrayApp(rumps.App):
|
||||||
self._local_repos: list[Path] = []
|
self._local_repos: list[Path] = []
|
||||||
self._repos_discovered = False
|
self._repos_discovered = False
|
||||||
|
|
||||||
|
# Track stalled-repo transitions so we notify once per edge
|
||||||
|
self._last_stalled_set: set[str] = set()
|
||||||
|
|
||||||
# Build menu
|
# Build menu
|
||||||
self._status_item = rumps.MenuItem("Status: starting...", callback=None)
|
self._status_item = rumps.MenuItem("Status: starting...", callback=None)
|
||||||
self._status_item.set_callback(None)
|
self._status_item.set_callback(None)
|
||||||
|
|
@ -132,6 +135,37 @@ class CommitsTrayApp(rumps.App):
|
||||||
last = self.agent.last_cycle
|
last = self.agent.last_cycle
|
||||||
stalled = list(last.stalled_repos) if last else []
|
stalled = list(last.stalled_repos) if last else []
|
||||||
|
|
||||||
|
# Notify on transitions into/out of stalled state (once per edge).
|
||||||
|
new_stalled = {e["repo_name"] for e in stalled}
|
||||||
|
newly_stalled = new_stalled - self._last_stalled_set
|
||||||
|
recovered = self._last_stalled_set - new_stalled
|
||||||
|
|
||||||
|
for repo_name in sorted(newly_stalled):
|
||||||
|
entry = next((e for e in stalled if e["repo_name"] == repo_name), {})
|
||||||
|
ahead = entry.get("ahead", 0)
|
||||||
|
behind = entry.get("behind", 0)
|
||||||
|
reason = entry.get("reason", "diverged")
|
||||||
|
try:
|
||||||
|
rumps.notification(
|
||||||
|
title="ACS stalled",
|
||||||
|
subtitle=repo_name,
|
||||||
|
message=f"{ahead}↑ {behind}↓ ({reason}) — needs human attention",
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Failed to post stalled notification for %s: %s", repo_name, exc)
|
||||||
|
|
||||||
|
for repo_name in sorted(recovered):
|
||||||
|
try:
|
||||||
|
rumps.notification(
|
||||||
|
title="ACS recovered",
|
||||||
|
subtitle=repo_name,
|
||||||
|
message="back in sync",
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Failed to post recovery notification for %s: %s", repo_name, exc)
|
||||||
|
|
||||||
|
self._last_stalled_set = new_stalled
|
||||||
|
|
||||||
# Icon
|
# Icon
|
||||||
if not agent_running:
|
if not agent_running:
|
||||||
icon = ICON_ERROR
|
icon = ICON_ERROR
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,9 @@ Runs as a background thread managed by the tray app.
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import socket
|
import socket
|
||||||
import subprocess
|
import subprocess
|
||||||
import threading
|
import threading
|
||||||
|
|
@ -75,6 +77,7 @@ class LocalCommitAgent:
|
||||||
repos_paths: list[Path],
|
repos_paths: list[Path],
|
||||||
cycle_seconds: int = 300,
|
cycle_seconds: int = 300,
|
||||||
co_author: str = "Lilith Autocommit <noreply@atlilith.com>",
|
co_author: str = "Lilith Autocommit <noreply@atlilith.com>",
|
||||||
|
recovery_state_path: Path | None = None,
|
||||||
):
|
):
|
||||||
self.acs_url = acs_url.rstrip("/")
|
self.acs_url = acs_url.rstrip("/")
|
||||||
self.repos_paths = repos_paths
|
self.repos_paths = repos_paths
|
||||||
|
|
@ -89,10 +92,62 @@ class LocalCommitAgent:
|
||||||
self._repos: list[Path] = []
|
self._repos: list[Path] = []
|
||||||
self._last_cycle: CycleResult | None = None
|
self._last_cycle: CycleResult | None = None
|
||||||
self._total_cycles = 0
|
self._total_cycles = 0
|
||||||
self._last_recovery_at: dict[str, float] = {} # repo_name -> monotonic ts
|
self._recovery_state_path_override = recovery_state_path
|
||||||
|
self._last_recovery_at: dict[str, float] = {} # repo_name -> wall-clock UNIX ts
|
||||||
# repo_name -> required cooldown for *next* attempt. Starts at 1h, bumps
|
# repo_name -> required cooldown for *next* attempt. Starts at 1h, bumps
|
||||||
# to 24h after Claude couldn't/wouldn't resolve. Resets on clean success.
|
# to 24h after Claude couldn't/wouldn't resolve. Resets on clean success.
|
||||||
self._recovery_cooldown: dict[str, float] = {}
|
self._recovery_cooldown: dict[str, float] = {}
|
||||||
|
self._load_recovery_state()
|
||||||
|
|
||||||
|
def _recovery_state_path(self) -> Path:
|
||||||
|
if self._recovery_state_path_override is not None:
|
||||||
|
return self._recovery_state_path_override
|
||||||
|
return Path.home() / "Library" / "Application Support" / "Commits" / "recovery.json"
|
||||||
|
|
||||||
|
def _load_recovery_state(self) -> None:
|
||||||
|
path = self._recovery_state_path()
|
||||||
|
try:
|
||||||
|
with path.open("r", encoding="utf-8") as fh:
|
||||||
|
data = json.load(fh)
|
||||||
|
except FileNotFoundError:
|
||||||
|
logger.debug(f"Recovery state file not found at {path}; starting fresh")
|
||||||
|
return
|
||||||
|
except (OSError, json.JSONDecodeError) as e:
|
||||||
|
logger.debug(f"Recovery state unreadable at {path}: {e}; starting fresh")
|
||||||
|
return
|
||||||
|
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
logger.debug(f"Recovery state at {path} is not a dict; starting fresh")
|
||||||
|
return
|
||||||
|
|
||||||
|
for repo_name, entry in data.items():
|
||||||
|
if not isinstance(entry, dict):
|
||||||
|
continue
|
||||||
|
last_attempt = entry.get("last_attempt")
|
||||||
|
cooldown_sec = entry.get("cooldown_sec")
|
||||||
|
if isinstance(last_attempt, (int, float)):
|
||||||
|
self._last_recovery_at[repo_name] = float(last_attempt)
|
||||||
|
if isinstance(cooldown_sec, (int, float)):
|
||||||
|
self._recovery_cooldown[repo_name] = float(cooldown_sec)
|
||||||
|
|
||||||
|
def _save_recovery_state(self) -> None:
|
||||||
|
path = self._recovery_state_path()
|
||||||
|
payload: dict[str, dict[str, float]] = {}
|
||||||
|
for repo_name, last_attempt in self._last_recovery_at.items():
|
||||||
|
payload[repo_name] = {
|
||||||
|
"last_attempt": last_attempt,
|
||||||
|
"cooldown_sec": self._recovery_cooldown.get(
|
||||||
|
repo_name, CLAUDE_RECOVERY_COOLDOWN_SEC
|
||||||
|
),
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
tmp_path = path.with_suffix(path.suffix + ".tmp")
|
||||||
|
with tmp_path.open("w", encoding="utf-8") as fh:
|
||||||
|
json.dump(payload, fh, indent=2, sort_keys=True)
|
||||||
|
os.replace(tmp_path, path)
|
||||||
|
except OSError as e:
|
||||||
|
logger.warning(f"Failed to persist recovery state to {path}: {e}")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_running(self) -> bool:
|
def is_running(self) -> bool:
|
||||||
|
|
@ -327,7 +382,7 @@ class LocalCommitAgent:
|
||||||
result: CycleResult,
|
result: CycleResult,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Invoke Claude Code (rate-limited) to reconcile divergence."""
|
"""Invoke Claude Code (rate-limited) to reconcile divergence."""
|
||||||
now = time.monotonic()
|
now = time.time()
|
||||||
last = self._last_recovery_at.get(repo_name, 0.0)
|
last = self._last_recovery_at.get(repo_name, 0.0)
|
||||||
required = self._recovery_cooldown.get(repo_name, CLAUDE_RECOVERY_COOLDOWN_SEC)
|
required = self._recovery_cooldown.get(repo_name, CLAUDE_RECOVERY_COOLDOWN_SEC)
|
||||||
cooled_down = (now - last) >= required
|
cooled_down = (now - last) >= required
|
||||||
|
|
@ -338,7 +393,7 @@ class LocalCommitAgent:
|
||||||
"ahead": ahead,
|
"ahead": ahead,
|
||||||
"behind": behind,
|
"behind": behind,
|
||||||
"last_attempt": (
|
"last_attempt": (
|
||||||
datetime.fromtimestamp(time.time() - (now - last), timezone.utc).isoformat()
|
datetime.fromtimestamp(last, timezone.utc).isoformat()
|
||||||
if last else None
|
if last else None
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
@ -352,6 +407,7 @@ class LocalCommitAgent:
|
||||||
return
|
return
|
||||||
|
|
||||||
self._last_recovery_at[repo_name] = now
|
self._last_recovery_at[repo_name] = now
|
||||||
|
self._save_recovery_state()
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Diverged {repo_name}: {ahead}↑ {behind}↓ — invoking claude-code for recovery"
|
f"Diverged {repo_name}: {ahead}↑ {behind}↓ — invoking claude-code for recovery"
|
||||||
)
|
)
|
||||||
|
|
@ -362,12 +418,16 @@ class LocalCommitAgent:
|
||||||
if ahead2 == 0 and behind2 == 0:
|
if ahead2 == 0 and behind2 == 0:
|
||||||
logger.info(f"Claude resolved {repo_name} cleanly")
|
logger.info(f"Claude resolved {repo_name} cleanly")
|
||||||
self._recovery_cooldown.pop(repo_name, None)
|
self._recovery_cooldown.pop(repo_name, None)
|
||||||
|
self._last_recovery_at.pop(repo_name, None)
|
||||||
|
self._save_recovery_state()
|
||||||
return
|
return
|
||||||
if behind2 == 0 and ahead2 > 0:
|
if behind2 == 0 and ahead2 > 0:
|
||||||
try:
|
try:
|
||||||
_git(repo_path, "push")
|
_git(repo_path, "push")
|
||||||
logger.info(f"Claude rebased {repo_name}; push succeeded")
|
logger.info(f"Claude rebased {repo_name}; push succeeded")
|
||||||
self._recovery_cooldown.pop(repo_name, None)
|
self._recovery_cooldown.pop(repo_name, None)
|
||||||
|
self._last_recovery_at.pop(repo_name, None)
|
||||||
|
self._save_recovery_state()
|
||||||
return
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Claude rebased {repo_name} but push failed: {e}")
|
logger.error(f"Claude rebased {repo_name} but push failed: {e}")
|
||||||
|
|
@ -377,10 +437,12 @@ class LocalCommitAgent:
|
||||||
)
|
)
|
||||||
stall_entry.update(ahead=ahead2, behind=behind2, reason="claude_partial")
|
stall_entry.update(ahead=ahead2, behind=behind2, reason="claude_partial")
|
||||||
self._recovery_cooldown[repo_name] = CLAUDE_STUCK_COOLDOWN_SEC
|
self._recovery_cooldown[repo_name] = CLAUDE_STUCK_COOLDOWN_SEC
|
||||||
|
self._save_recovery_state()
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Claude could not recover {repo_name} — marking stalled for 24h")
|
logger.warning(f"Claude could not recover {repo_name} — marking stalled for 24h")
|
||||||
stall_entry["reason"] = "claude_failed"
|
stall_entry["reason"] = "claude_failed"
|
||||||
self._recovery_cooldown[repo_name] = CLAUDE_STUCK_COOLDOWN_SEC
|
self._recovery_cooldown[repo_name] = CLAUDE_STUCK_COOLDOWN_SEC
|
||||||
|
self._save_recovery_state()
|
||||||
|
|
||||||
result.stalled_repos.append(stall_entry)
|
result.stalled_repos.append(stall_entry)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue