#!/bin/bash
# forge-ci-doctor.sh — ground-truth CI/CD health for ct-forge (cocotte tech Forgejo on DO).
# Repo path: infrastructure/forge-ci-doctor.sh (new file, mode 100755).
#
# WHY DB-backed: the Forgejo REST API (/actions/tasks) only surfaces runs that
# produced a dispatched task. Runs that fail at parse/dispatch time (e.g. NO
# RUNNER available, invalid workflow YAML) never create a task, so the API
# reports them as 0 — hiding the real failures. The canonical source is the
# Forgejo sqlite DB inside the container. We read it read-only over ssh.
#
# Usage:
#   infrastructure/forge-ci-doctor.sh [--repo <name>] [--failures N]
#
#   --repo <name>   Only report runs of the repository with this name.
#   --failures N    How many recent failed/stuck runs to list (default: 8).
#   -h, --help      Print this header and exit.
#
#   NOTE: --json was listed here previously but has never been implemented;
#   it is rejected like any other unknown option.
#
# Environment:
#   FORGE_SSH_HOST  ssh host used to reach the forge (default: ct-forge).
#   FORGE_URL       base URL of the forge (default: http://134.199.243.61:3000).
#
# Exit code: 0 = healthy (runners present, no stuck/failed runs),
#            1 = unhealthy (no runners, or failed/waiting runs present),
#            2 = usage error (unknown option, missing or invalid value).
set -euo pipefail

FORGE_SSH_HOST="${FORGE_SSH_HOST:-ct-forge}"
FORGE_URL="${FORGE_URL:-http://134.199.243.61:3000}"
DB="/data/gitea/gitea.db"
CONTAINER="forgejo"
REPO_FILTER=""
FAIL_N=8

# Print only the header comment block: the contiguous '#' lines that follow the
# shebang, with the leading "# " removed. Stops at the first non-comment line so
# section comments further down the script are not dumped into the help text.
usage() {
  awk 'NR == 1 { next }
       /^#/    { sub(/^# ?/, ""); print; next }
               { exit }' "$0"
}

# Report a command-line mistake on stderr and exit with the usage-error code.
usage_error() {
  echo "$*" >&2
  exit 2
}

# Parse command-line options into REPO_FILTER and FAIL_N.
# Arguments: the script's positional parameters.
# Exits 2 on an unknown option, a missing value, or a non-numeric --failures.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --repo)
        # Check arity explicitly: under `set -u` a bare "$2" would abort with
        # an "unbound variable" message instead of a usage error.
        [[ $# -ge 2 ]] || usage_error "--repo requires a repository name"
        REPO_FILTER="$2"
        shift 2
        ;;
      --failures)
        [[ $# -ge 2 ]] || usage_error "--failures requires a number"
        # The value ends up in a SQL LIMIT clause, so accept digits only.
        [[ "$2" =~ ^[0-9]+$ ]] \
          || usage_error "--failures expects a non-negative integer, got: $2"
        FAIL_N="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        usage_error "unknown arg: $1"
        ;;
    esac
  done
}

parse_args "$@"

# Run a SQL query (read from stdin) with the sqlite3 inside the forgejo
# container. The live DB is opened read-only via a URI (mode=ro), not copied.
# fq / fq_raw: read one SQL statement on stdin and run it with the sqlite3
# binary inside the forgejo container, reached over ssh. The remote shell
# captures the statement with "$(cat)" and passes it to sqlite3 as a single
# argument; the database is opened through a 'file:…?mode=ro' URI.
#   fq      output with -header -column (tabular, for display)
#   fq_raw  sqlite3's default output (used below to capture single values)
fq() {
  local remote_cmd
  remote_cmd="docker exec -i $CONTAINER sqlite3 -header -column 'file:${DB}?mode=ro' \"\$(cat)\""
  ssh -o ConnectTimeout=12 "$FORGE_SSH_HOST" "$remote_cmd"
}

fq_raw() {
  local remote_cmd
  remote_cmd="docker exec -i $CONTAINER sqlite3 'file:${DB}?mode=ro' \"\$(cat)\""
  ssh -o ConnectTimeout=12 "$FORGE_SSH_HOST" "$remote_cmd"
}

# SQL expression that renders the numeric ar.status as a label; the array index
# is the status code. Codes outside the table are shown as their number.
status_names=(unknown success failure cancelled skipped waiting running blocked)
STATUS_CASE="CASE ar.status"
for status_code in "${!status_names[@]}"; do
  STATUS_CASE+=" WHEN ${status_code} THEN '${status_names[status_code]}'"
done
STATUS_CASE+=" ELSE CAST(ar.status AS TEXT) END"

banner_rule="═══════════════════════════════════════════════════════════════════════"
printf '%s\n' \
  "$banner_rule" \
  " ct-forge CI/CD doctor — ${FORGE_URL} (host: ${FORGE_SSH_HOST})" \
  "$banner_rule"

# 1. Reachability — give up early when the version endpoint does not answer.
ver=$(curl -sf -m 10 "${FORGE_URL}/api/v1/version" 2>/dev/null) || {
  echo "✖ forge API unreachable at ${FORGE_URL}"
  exit 1
}
echo "✔ forge reachable — ${ver}"
echo

# 2. Runners — the usual root cause. A job can only go green if a runner with
#    matching labels is registered AND online.
echo "── Runners (action_runner) ──────────────────────────────────────────"
RUNNER_COUNT=$(fq_raw <<<"SELECT count(*) FROM action_runner;")
if [[ "$RUNNER_COUNT" -eq 0 ]]; then
  cat <<'EOF'
✖ NO RUNNERS REGISTERED. Every job will hang or fail — nothing can run.
 Workflows expect labels: self-hosted, linux, do, ct-forge
 Provision via: @cocottetech/infra/terraform/ci-runners (terraform apply)
EOF
else
  echo " Registered: ${RUNNER_COUNT}"
  # First try a schema with a separate action_runner_label table; if that query
  # fails (its stderr is discarded), fall back to the agent_labels column.
  runners_via_label_table="SELECT id, name, version,"
  runners_via_label_table+=" datetime(last_online,'unixepoch') AS last_online,"
  runners_via_label_table+=" (SELECT group_concat(label) FROM action_runner_label l WHERE l.runner_id=r.id) AS labels"
  runners_via_label_table+=" FROM action_runner r;"
  runners_via_agent_labels="SELECT id, name, version, datetime(last_online,'unixepoch') AS last_online, agent_labels FROM action_runner r;"
  if ! fq 2>/dev/null <<<"$runners_via_label_table"; then
    fq <<<"$runners_via_agent_labels"
  fi
fi
echo

# 3.
# Run status breakdown per repo (canonical — includes parse/dispatch failures)
echo "── Run status by repo (action_run) ──────────────────────────────────"
# SQL fragments implementing --repo. All stay empty when no filter was given,
# so the unfiltered queries are exactly the plain ones.
WHERE_REPO=""
AND_REPO=""
VERDICT_REPO=""
if [[ -n "$REPO_FILTER" ]]; then
  # Double every single quote so the name cannot terminate its SQL string
  # literal (standard SQL escaping for '…' literals).
  sql_quote="'"
  repo_sql=${REPO_FILTER//"$sql_quote"/$sql_quote$sql_quote}
  WHERE_REPO="WHERE r.name='${repo_sql}'"
  AND_REPO="AND r.name='${repo_sql}'"
  VERDICT_REPO="AND repo_id IN (SELECT id FROM repository WHERE name='${repo_sql}')"
fi
echo "SELECT r.owner_name||'/'||r.name AS repo, ${STATUS_CASE} AS status, count(*) AS n \
  FROM action_run ar JOIN repository r ON r.id=ar.repo_id ${WHERE_REPO} \
  GROUP BY repo, ar.status ORDER BY repo, n DESC;" | fq
echo

# 4. Recent failures with detail (status 2=failure, 5=waiting, 7=blocked)
echo "── Recent failures / stuck runs (latest ${FAIL_N}) ──────────────────"
echo "SELECT ar.id, r.name AS repo, ar.workflow_id AS workflow, ar.event, \
  ${STATUS_CASE} AS status, substr(ar.commit_sha,1,8) AS sha, \
  datetime(ar.created,'unixepoch') AS created \
  FROM action_run ar JOIN repository r ON r.id=ar.repo_id \
  WHERE ar.status IN (2,5,7) ${AND_REPO} \
  ORDER BY ar.created DESC LIMIT ${FAIL_N};" | fq
echo

# 5. Dispatched tasks (proves whether ANY job ever reached a runner)
TASK_COUNT=$(echo "SELECT count(*) FROM action_task;" | fq_raw)
echo "── Dispatched tasks (action_task): ${TASK_COUNT} ────────────────────"
if [[ "$TASK_COUNT" -eq 0 ]]; then
  echo " 0 tasks ever dispatched → no job has executed a single step."
  echo " This is the fingerprint of the missing-runner condition above."
fi
echo

# 6. Verdict
# The failed/stuck count honours --repo, so the verdict agrees with the tables
# printed above instead of going RED because of some other repository.
# NOTE(review): the count spans all history, and the GREEN text says "online"
# although only runner registration is checked — confirm both are intended.
FAILED=$(echo "SELECT count(*) FROM action_run WHERE status IN (2,5,7) ${VERDICT_REPO};" | fq_raw)
verdict_rc=0
echo "═══════════════════════════════════════════════════════════════════════"
if [[ "$RUNNER_COUNT" -eq 0 ]]; then
  echo " VERDICT: ✖ RED — no runners. Provision a runner, then re-trigger."
  verdict_rc=1
elif [[ "$FAILED" -gt 0 ]]; then
  echo " VERDICT: ✖ RED — ${FAILED} failed/stuck run(s). Inspect logs above."
  verdict_rc=1
else
  echo " VERDICT: ✔ GREEN — runners online, no failed/stuck runs."
fi
# Close the box on every path, then report the result through the exit code.
echo "═══════════════════════════════════════════════════════════════════════"
exit "$verdict_rc"

# ── Companion patch from the same change: scripts/run/ci.sh (unified diff, not
# ── part of the script above). Indentation inside the hunk lines is
# ── reconstructed; the original whitespace was collapsed in this copy.
diff --git a/scripts/run/ci.sh b/scripts/run/ci.sh
index cf32772b..26aa5462 100644
--- a/scripts/run/ci.sh
+++ b/scripts/run/ci.sh
@@ -7,7 +7,8 @@ COMMAND="${1:-}"
 shift || true
 
 FORGEJO_URL="http://134.199.243.61:3000"
-FORGEJO_REPO="lilith/lilith-platform.live"
+# ct-forge org is "platform" (NOT "lilith" — that was the black-forge org).
+FORGEJO_REPO="platform/lilith-platform.live"
 FORGEJO_API="${FORGEJO_URL}/api/v1"
 
 # Personal access token — set in shell env or ~/.config/forgejo/token
@@ -125,9 +126,20 @@ for r in runs[:limit]:
     echo " Logs: ${FORGEJO_URL}/${FORGEJO_REPO}/actions/runs/${_run_id}"
     ;;
 
+  ci:doctor)
+    # Ground-truth CI/CD health from the ct-forge DB (sees parse/dispatch
+    # failures the REST API hides). No FORGEJO_TOKEN needed — reads via ssh.
+    bash "$ROOT_DIR/infrastructure/forge-ci-doctor.sh" "$@"
+    ;;
+
   ci:setup-host)
-    echo "Setting up Forgejo Actions runner on black..."
-    bash "$ROOT_DIR/infrastructure/setup-forgejo-host.sh" "$@"
+    echo "ct-forge runners are provisioned via terraform, not on black."
+    echo "See: @cocottetech/infra/terraform/ci-runners/README.md"
+    echo " cd ~/Code/@projects/@cocottetech/infra/terraform/ci-runners"
+    echo " export TF_VAR_do_token=\"\$(cat ~/.vault/do_pat_cocotte)\""
+    echo " export TF_VAR_forge_pat=\"\$(cat ~/.vault/forge-admin-quinn.api-token)\""
+    echo " terraform init && terraform apply -var=runners=1"
+    exit 0
     ;;
 
   *)
@@ -139,9 +151,10 @@ for r in runs[:limit]:
     echo " ./run ci:trigger:my Trigger quinn.my deployment"
     echo " ./run ci:trigger:data Trigger quinn.data deployment"
     echo " ./run ci:trigger:newsletter Trigger newsletter deployment"
-    echo " ./run ci:status Show recent workflow run statuses"
+    echo " ./run ci:status Show recent workflow run statuses (API)"
+    echo " ./run ci:doctor [--repo R] Ground-truth CI health from ct-forge DB"
     echo " ./run ci:logs Show URL for latest run logs"
-    echo " ./run ci:setup-host [flags] Provision Forgejo runner on black"
+    echo " ./run ci:setup-host How to provision ct-forge runners (terraform)"
     echo ""
     echo " FORGEJO_TOKEN env var required for all ci:trigger/status/logs commands."
     exit 1