lilith-platform.live/infrastructure/forge-ci-doctor.sh

#!/bin/bash
# forge-ci-doctor.sh — ground-truth CI/CD health for ct-forge (cocotte tech Forgejo on DO).
#
# WHY DB-backed: the Forgejo REST API (/actions/tasks) only surfaces runs that
# produced a dispatched task. Runs that fail at parse/dispatch time (e.g. NO
# RUNNER available, invalid workflow YAML) never create a task, so the API
# reports them as 0 — hiding the real failures. The canonical source is the
# Forgejo sqlite DB inside the container. We read it read-only over ssh.
#
# Usage:
#   infrastructure/forge-ci-doctor.sh [--repo <name>] [--failures N] [--json]
#
# Exit code: 0 = healthy (runners present, no stuck/failed runs),
#            1 = unhealthy (no runners, or failed/waiting runs present).
set -euo pipefail

FORGE_SSH_HOST="${FORGE_SSH_HOST:-ct-forge}"
FORGE_URL="${FORGE_URL:-http://134.199.243.61:3000}"
DB="/data/gitea/gitea.db"
CONTAINER="forgejo"
REPO_FILTER=""
FAIL_N=8

while [[ $# -gt 0 ]]; do
  case "$1" in
    --repo) REPO_FILTER="$2"; shift 2 ;;
    --failures) FAIL_N="$2"; shift 2 ;;
    -h|--help) grep '^#' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;;
    *) echo "unknown arg: $1" >&2; exit 2 ;;
  esac
done

# Run a SQL query inside the forgejo container's sqlite3 against a read-only copy.
fq() {
  ssh -o ConnectTimeout=12 "$FORGE_SSH_HOST" \
    "docker exec -i $CONTAINER sqlite3 -header -column 'file:${DB}?mode=ro' \"\$(cat)\""
}
fq_raw() {
  ssh -o ConnectTimeout=12 "$FORGE_SSH_HOST" \
    "docker exec -i $CONTAINER sqlite3 'file:${DB}?mode=ro' \"\$(cat)\""
}

STATUS_CASE="CASE ar.status \
WHEN 0 THEN 'unknown' WHEN 1 THEN 'success' WHEN 2 THEN 'failure' \
WHEN 3 THEN 'cancelled' WHEN 4 THEN 'skipped' WHEN 5 THEN 'waiting' \
WHEN 6 THEN 'running' WHEN 7 THEN 'blocked' ELSE CAST(ar.status AS TEXT) END"

echo "═══════════════════════════════════════════════════════════════════════"
echo " ct-forge CI/CD doctor — ${FORGE_URL}  (host: ${FORGE_SSH_HOST})"
echo "═══════════════════════════════════════════════════════════════════════"

# 1. Reachability
if ! ver=$(curl -sf -m 10 "${FORGE_URL}/api/v1/version" 2>/dev/null); then
  echo "✖ forge API unreachable at ${FORGE_URL}"; exit 1
fi
echo "✔ forge reachable — ${ver}"
echo

# 2. Runners — the usual root cause. A job can only go green if a runner with
#    matching labels is registered AND online.
echo "── Runners (action_runner) ──────────────────────────────────────────"
RUNNER_COUNT=$(echo "SELECT count(*) FROM action_runner;" | fq_raw)
if [[ "$RUNNER_COUNT" -eq 0 ]]; then
  echo "✖ NO RUNNERS REGISTERED. Every job will hang or fail — nothing can run."
  echo "  Workflows expect labels: self-hosted, linux, do, ct-forge"
  echo "  Provision via: @cocottetech/infra/terraform/ci-runners (terraform apply)"
else
  echo "  Registered: ${RUNNER_COUNT}"
  echo "SELECT id, name, version, \
    datetime(last_online,'unixepoch') AS last_online, \
    (SELECT group_concat(label) FROM action_runner_label l WHERE l.runner_id=r.id) AS labels \
    FROM action_runner r;" | fq 2>/dev/null || \
  echo "SELECT id, name, version, datetime(last_online,'unixepoch') AS last_online, agent_labels FROM action_runner r;" | fq
fi
echo

# 3. Run status breakdown per repo (canonical — includes parse/dispatch failures)
echo "── Run status by repo (action_run) ──────────────────────────────────"
WHERE_REPO=""
AND_REPO=""
if [[ -n "$REPO_FILTER" ]]; then
  WHERE_REPO="WHERE r.name='${REPO_FILTER}'"
  AND_REPO="AND r.name='${REPO_FILTER}'"
fi
echo "SELECT r.owner_name||'/'||r.name AS repo, ${STATUS_CASE} AS status, count(*) AS n \
  FROM action_run ar JOIN repository r ON r.id=ar.repo_id ${WHERE_REPO} \
  GROUP BY repo, ar.status ORDER BY repo, n DESC;" | fq
echo

# 4. Recent failures with detail
echo "── Recent failures / stuck runs (latest ${FAIL_N}) ──────────────────"
echo "SELECT ar.id, r.name AS repo, ar.workflow_id AS workflow, ar.event, \
  ${STATUS_CASE} AS status, substr(ar.commit_sha,1,8) AS sha, \
  datetime(ar.created,'unixepoch') AS created \
  FROM action_run ar JOIN repository r ON r.id=ar.repo_id \
  WHERE ar.status IN (2,5,7) ${AND_REPO} \
  ORDER BY ar.created DESC LIMIT ${FAIL_N};" | fq
echo

# 5. Dispatched tasks (proves whether ANY job ever reached a runner)
TASK_COUNT=$(echo "SELECT count(*) FROM action_task;" | fq_raw)
echo "── Dispatched tasks (action_task): ${TASK_COUNT} ────────────────────"
if [[ "$TASK_COUNT" -eq 0 ]]; then
  echo "  0 tasks ever dispatched → no job has executed a single step."
  echo "  This is the fingerprint of the missing-runner condition above."
fi
echo

# 6. Verdict
FAILED=$(echo "SELECT count(*) FROM action_run WHERE status IN (2,5,7);" | fq_raw)
echo "═══════════════════════════════════════════════════════════════════════"
if [[ "$RUNNER_COUNT" -eq 0 ]]; then
  echo " VERDICT: ✖ RED — no runners. Provision a runner, then re-trigger."
  exit 1
elif [[ "$FAILED" -gt 0 ]]; then
  echo " VERDICT: ✖ RED — ${FAILED} failed/stuck run(s). Inspect logs above."
  exit 1
else
  echo " VERDICT: ✔ GREEN — runners online, no failed/stuck runs."
fi
echo "═══════════════════════════════════════════════════════════════════════"