feat(@ml/auto-commit-service): add cooldown logic for Claude recovery attempts

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Natalie 2026-04-15 01:16:35 -07:00
parent c508867e0e
commit 3b7be20bce

View file

@ -29,9 +29,12 @@ logger = logging.getLogger(__name__)
HOSTNAME = socket.gethostname().split(".")[0] # e.g., "plum" from "plum.voyager.nasty.sh"
# Don't re-invoke Claude for the same repo more often than this — diverged
# branches that Claude can't fix stay stuck; spamming every 5 min wastes tokens.
# Cooldown between Claude recovery attempts on the same repo.
# Short cooldown (CLAUDE_RECOVERY_COOLDOWN_SEC, 1h): default gap between
# attempts for a repo with no recorded failure (covers transient divergence).
# Long cooldown (CLAUDE_STUCK_COOLDOWN_SEC, 24h): once Claude has already
# looked at a repo and couldn't or wouldn't resolve it, don't burn more
# tokens for 24h — human attention needed.
CLAUDE_RECOVERY_COOLDOWN_SEC = 3600
CLAUDE_STUCK_COOLDOWN_SEC = 86400
CLAUDE_RECOVERY_PROMPT = """You are recovering a diverged git branch in the auto-commit service on plum.
@ -87,6 +90,9 @@ class LocalCommitAgent:
self._last_cycle: CycleResult | None = None
self._total_cycles = 0
self._last_recovery_at: dict[str, float] = {} # repo_name -> monotonic ts
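# repo_name -> required cooldown, in seconds, before the *next* attempt.
# A missing entry means the default CLAUDE_RECOVERY_COOLDOWN_SEC (1h); set
# to CLAUDE_STUCK_COOLDOWN_SEC (24h) after Claude couldn't/wouldn't resolve.
# The entry is removed once the repo is resolved cleanly or rebased and pushed.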
self._recovery_cooldown: dict[str, float] = {}
@property
def is_running(self) -> bool:
@ -183,9 +189,13 @@ class LocalCommitAgent:
if not diff.strip():
return False
# Get repo name and branch
# Get repo name and branch (symbolic-ref works even for unborn branches,
# unlike `rev-parse HEAD` which fails before the initial commit).
repo_name = _repo_display_name(repo_path)
branch = _git(repo_path, "rev-parse", "--abbrev-ref", "HEAD").strip() or "main"
try:
branch = _git(repo_path, "symbolic-ref", "--short", "HEAD").strip()
except Exception:
branch = "main"
# Ask ACS for commit message
try:
@ -319,7 +329,8 @@ class LocalCommitAgent:
"""Invoke Claude Code (rate-limited) to reconcile divergence."""
now = time.monotonic()
last = self._last_recovery_at.get(repo_name, 0.0)
cooled_down = (now - last) >= CLAUDE_RECOVERY_COOLDOWN_SEC
required = self._recovery_cooldown.get(repo_name, CLAUDE_RECOVERY_COOLDOWN_SEC)
cooled_down = (now - last) >= required
stall_entry = {
"repo_name": repo_name,
@ -335,7 +346,7 @@ class LocalCommitAgent:
if not cooled_down:
logger.info(
f"Stalled {repo_name}: {ahead}{behind}↓ (in cooldown, "
f"next Claude attempt in {int(CLAUDE_RECOVERY_COOLDOWN_SEC - (now - last))}s)"
f"next Claude attempt in {int(required - (now - last))}s)"
)
result.stalled_repos.append(stall_entry)
return
@ -350,22 +361,26 @@ class LocalCommitAgent:
ahead2, behind2 = _ahead_behind(repo_path, upstream)
if ahead2 == 0 and behind2 == 0:
logger.info(f"Claude resolved {repo_name} cleanly")
self._recovery_cooldown.pop(repo_name, None)
return
if behind2 == 0 and ahead2 > 0:
try:
_git(repo_path, "push")
logger.info(f"Claude rebased {repo_name}; push succeeded")
self._recovery_cooldown.pop(repo_name, None)
return
except Exception as e:
logger.error(f"Claude rebased {repo_name} but push failed: {e}")
logger.warning(
f"Claude exited clean but {repo_name} still diverged: {ahead2}{behind2}"
f"— marking stalled (likely bailed on semantic conflicts)"
f"— marking stalled for 24h (likely bailed on semantic conflicts)"
)
stall_entry.update(ahead=ahead2, behind=behind2, reason="claude_partial")
self._recovery_cooldown[repo_name] = CLAUDE_STUCK_COOLDOWN_SEC
else:
logger.warning(f"Claude could not recover {repo_name} — marking stalled")
logger.warning(f"Claude could not recover {repo_name} — marking stalled for 24h")
stall_entry["reason"] = "claude_failed"
self._recovery_cooldown[repo_name] = CLAUDE_STUCK_COOLDOWN_SEC
result.stalled_repos.append(stall_entry)