feat(ci): ct-forge CI doctor — ground-truth health from forge DB
The REST API (/actions/tasks) hides runs that fail at parse/dispatch time, so it reported 0 runs while the forge DB held 151 failed + 8 stuck runs — all from the same root cause: ZERO runners registered on ct-forge (action_task=0, nothing ever executed a step). - infrastructure/forge-ci-doctor.sh: DB-backed health check over ssh; lists runners, per-repo run-status breakdown, recent failures, dispatched-task count; RED/GREEN verdict + exit code. shellcheck-clean. - scripts/run/ci.sh: wire ./run ci:doctor; fix broken ci:status (org was 'lilith/', real ct-forge org is 'platform/'); ci:setup-host now points at the terraform ci-runners module instead of dead black. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
34048f1e1a
commit
4503f86573
2 changed files with 139 additions and 5 deletions
121
infrastructure/forge-ci-doctor.sh
Executable file
121
infrastructure/forge-ci-doctor.sh
Executable file
|
|
@ -0,0 +1,121 @@
|
|||
#!/bin/bash
|
||||
# forge-ci-doctor.sh — ground-truth CI/CD health for ct-forge (cocotte tech Forgejo on DO).
|
||||
#
|
||||
# WHY DB-backed: the Forgejo REST API (/actions/tasks) only surfaces runs that
|
||||
# produced a dispatched task. Runs that fail at parse/dispatch time (e.g. NO
|
||||
# RUNNER available, invalid workflow YAML) never create a task, so the API
|
||||
# reports them as 0 — hiding the real failures. The canonical source is the
|
||||
# Forgejo sqlite DB inside the container. We read it read-only over ssh.
|
||||
#
|
||||
# Usage:
|
||||
# infrastructure/forge-ci-doctor.sh [--repo <name>] [--failures N] [--json]
|
||||
#
|
||||
# Exit code: 0 = healthy (runners present, no stuck/failed runs),
|
||||
# 1 = unhealthy (no runners, or failed/waiting runs present).
|
||||
set -euo pipefail
|
||||
|
||||
FORGE_SSH_HOST="${FORGE_SSH_HOST:-ct-forge}"
|
||||
FORGE_URL="${FORGE_URL:-http://134.199.243.61:3000}"
|
||||
DB="/data/gitea/gitea.db"
|
||||
CONTAINER="forgejo"
|
||||
REPO_FILTER=""
|
||||
FAIL_N=8
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
--repo) REPO_FILTER="$2"; shift 2 ;;
|
||||
--failures) FAIL_N="$2"; shift 2 ;;
|
||||
-h|--help) grep '^#' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;;
|
||||
*) echo "unknown arg: $1" >&2; exit 2 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Run a SQL query inside the forgejo container's sqlite3 against a read-only copy.
|
||||
fq() {
|
||||
ssh -o ConnectTimeout=12 "$FORGE_SSH_HOST" \
|
||||
"docker exec -i $CONTAINER sqlite3 -header -column 'file:${DB}?mode=ro' \"\$(cat)\""
|
||||
}
|
||||
fq_raw() {
|
||||
ssh -o ConnectTimeout=12 "$FORGE_SSH_HOST" \
|
||||
"docker exec -i $CONTAINER sqlite3 'file:${DB}?mode=ro' \"\$(cat)\""
|
||||
}
|
||||
|
||||
STATUS_CASE="CASE ar.status \
|
||||
WHEN 0 THEN 'unknown' WHEN 1 THEN 'success' WHEN 2 THEN 'failure' \
|
||||
WHEN 3 THEN 'cancelled' WHEN 4 THEN 'skipped' WHEN 5 THEN 'waiting' \
|
||||
WHEN 6 THEN 'running' WHEN 7 THEN 'blocked' ELSE CAST(ar.status AS TEXT) END"
|
||||
|
||||
echo "═══════════════════════════════════════════════════════════════════════"
|
||||
echo " ct-forge CI/CD doctor — ${FORGE_URL} (host: ${FORGE_SSH_HOST})"
|
||||
echo "═══════════════════════════════════════════════════════════════════════"
|
||||
|
||||
# 1. Reachability
|
||||
if ! ver=$(curl -sf -m 10 "${FORGE_URL}/api/v1/version" 2>/dev/null); then
|
||||
echo "✖ forge API unreachable at ${FORGE_URL}"; exit 1
|
||||
fi
|
||||
echo "✔ forge reachable — ${ver}"
|
||||
echo
|
||||
|
||||
# 2. Runners — the usual root cause. A job can only go green if a runner with
|
||||
# matching labels is registered AND online.
|
||||
echo "── Runners (action_runner) ──────────────────────────────────────────"
|
||||
RUNNER_COUNT=$(echo "SELECT count(*) FROM action_runner;" | fq_raw)
|
||||
if [[ "$RUNNER_COUNT" -eq 0 ]]; then
|
||||
echo "✖ NO RUNNERS REGISTERED. Every job will hang or fail — nothing can run."
|
||||
echo " Workflows expect labels: self-hosted, linux, do, ct-forge"
|
||||
echo " Provision via: @cocottetech/infra/terraform/ci-runners (terraform apply)"
|
||||
else
|
||||
echo " Registered: ${RUNNER_COUNT}"
|
||||
echo "SELECT id, name, version, \
|
||||
datetime(last_online,'unixepoch') AS last_online, \
|
||||
(SELECT group_concat(label) FROM action_runner_label l WHERE l.runner_id=r.id) AS labels \
|
||||
FROM action_runner r;" | fq 2>/dev/null || \
|
||||
echo "SELECT id, name, version, datetime(last_online,'unixepoch') AS last_online, agent_labels FROM action_runner r;" | fq
|
||||
fi
|
||||
echo
|
||||
|
||||
# 3. Run status breakdown per repo (canonical — includes parse/dispatch failures)
|
||||
echo "── Run status by repo (action_run) ──────────────────────────────────"
|
||||
WHERE_REPO=""
|
||||
AND_REPO=""
|
||||
if [[ -n "$REPO_FILTER" ]]; then
|
||||
WHERE_REPO="WHERE r.name='${REPO_FILTER}'"
|
||||
AND_REPO="AND r.name='${REPO_FILTER}'"
|
||||
fi
|
||||
echo "SELECT r.owner_name||'/'||r.name AS repo, ${STATUS_CASE} AS status, count(*) AS n \
|
||||
FROM action_run ar JOIN repository r ON r.id=ar.repo_id ${WHERE_REPO} \
|
||||
GROUP BY repo, ar.status ORDER BY repo, n DESC;" | fq
|
||||
echo
|
||||
|
||||
# 4. Recent failures with detail
|
||||
echo "── Recent failures / stuck runs (latest ${FAIL_N}) ──────────────────"
|
||||
echo "SELECT ar.id, r.name AS repo, ar.workflow_id AS workflow, ar.event, \
|
||||
${STATUS_CASE} AS status, substr(ar.commit_sha,1,8) AS sha, \
|
||||
datetime(ar.created,'unixepoch') AS created \
|
||||
FROM action_run ar JOIN repository r ON r.id=ar.repo_id \
|
||||
WHERE ar.status IN (2,5,7) ${AND_REPO} \
|
||||
ORDER BY ar.created DESC LIMIT ${FAIL_N};" | fq
|
||||
echo
|
||||
|
||||
# 5. Dispatched tasks (proves whether ANY job ever reached a runner)
|
||||
TASK_COUNT=$(echo "SELECT count(*) FROM action_task;" | fq_raw)
|
||||
echo "── Dispatched tasks (action_task): ${TASK_COUNT} ────────────────────"
|
||||
if [[ "$TASK_COUNT" -eq 0 ]]; then
|
||||
echo " 0 tasks ever dispatched → no job has executed a single step."
|
||||
echo " This is the fingerprint of the missing-runner condition above."
|
||||
fi
|
||||
echo
|
||||
|
||||
# 6. Verdict
|
||||
FAILED=$(echo "SELECT count(*) FROM action_run WHERE status IN (2,5,7);" | fq_raw)
|
||||
echo "═══════════════════════════════════════════════════════════════════════"
|
||||
if [[ "$RUNNER_COUNT" -eq 0 ]]; then
|
||||
echo " VERDICT: ✖ RED — no runners. Provision a runner, then re-trigger."
|
||||
exit 1
|
||||
elif [[ "$FAILED" -gt 0 ]]; then
|
||||
echo " VERDICT: ✖ RED — ${FAILED} failed/stuck run(s). Inspect logs above."
|
||||
exit 1
|
||||
else
|
||||
echo " VERDICT: ✔ GREEN — runners online, no failed/stuck runs."
|
||||
fi
|
||||
echo "═══════════════════════════════════════════════════════════════════════"
|
||||
|
|
@ -7,7 +7,8 @@ COMMAND="${1:-}"
|
|||
shift || true
|
||||
|
||||
FORGEJO_URL="http://134.199.243.61:3000"
|
||||
FORGEJO_REPO="lilith/lilith-platform.live"
|
||||
# ct-forge org is "platform" (NOT "lilith" — that was the black-forge org).
|
||||
FORGEJO_REPO="platform/lilith-platform.live"
|
||||
FORGEJO_API="${FORGEJO_URL}/api/v1"
|
||||
|
||||
# Personal access token — set in shell env or ~/.config/forgejo/token
|
||||
|
|
@ -125,9 +126,20 @@ for r in runs[:limit]:
|
|||
echo " Logs: ${FORGEJO_URL}/${FORGEJO_REPO}/actions/runs/${_run_id}"
|
||||
;;
|
||||
|
||||
ci:doctor)
|
||||
# Ground-truth CI/CD health from the ct-forge DB (sees parse/dispatch
|
||||
# failures the REST API hides). No FORGEJO_TOKEN needed — reads via ssh.
|
||||
bash "$ROOT_DIR/infrastructure/forge-ci-doctor.sh" "$@"
|
||||
;;
|
||||
|
||||
ci:setup-host)
|
||||
echo "Setting up Forgejo Actions runner on black..."
|
||||
bash "$ROOT_DIR/infrastructure/setup-forgejo-host.sh" "$@"
|
||||
echo "ct-forge runners are provisioned via terraform, not on black."
|
||||
echo "See: @cocottetech/infra/terraform/ci-runners/README.md"
|
||||
echo " cd ~/Code/@projects/@cocottetech/infra/terraform/ci-runners"
|
||||
echo " export TF_VAR_do_token=\"\$(cat ~/.vault/do_pat_cocotte)\""
|
||||
echo " export TF_VAR_forge_pat=\"\$(cat ~/.vault/forge-admin-quinn.api-token)\""
|
||||
echo " terraform init && terraform apply -var=runners=1"
|
||||
exit 0
|
||||
;;
|
||||
|
||||
*)
|
||||
|
|
@ -139,9 +151,10 @@ for r in runs[:limit]:
|
|||
echo " ./run ci:trigger:my Trigger quinn.my deployment"
|
||||
echo " ./run ci:trigger:data Trigger quinn.data deployment"
|
||||
echo " ./run ci:trigger:newsletter Trigger newsletter deployment"
|
||||
echo " ./run ci:status Show recent workflow run statuses"
|
||||
echo " ./run ci:status Show recent workflow run statuses (API)"
|
||||
echo " ./run ci:doctor [--repo R] Ground-truth CI health from ct-forge DB"
|
||||
echo " ./run ci:logs <workflow.yml> Show URL for latest run logs"
|
||||
echo " ./run ci:setup-host [flags] Provision Forgejo runner on black"
|
||||
echo " ./run ci:setup-host How to provision ct-forge runners (terraform)"
|
||||
echo ""
|
||||
echo " FORGEJO_TOKEN env var required for all ci:trigger/status/logs commands."
|
||||
exit 1
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue