lilith-platform.live/deployments/@domains/quinn.data/scripts/sanity.sh
2026-05-16 04:42:58 -07:00

83 lines
3.2 KiB
Bash
Executable file

#!/usr/bin/env bash
# =============================================================================
# quinn-data-sanity — Out-of-band liveness probe for data.transquinnftw.com
# =============================================================================
# Runs from black.lan on a 5-minute systemd timer (quinn-data-sanity.timer)
# to catch outages *between* deploys. The forgejo smoke step only runs on
# push to main; without this timer, a backend crash or config drift can go
# unnoticed until someone manually hits the dashboard.
#
# Exit codes:
# 0 — all checks passed
# 1 — at least one check failed; see stderr for which
#
# Observability:
# All output (stdout + stderr) goes to the systemd journal via
# StandardOutput=journal in the service unit. Inspect with:
# ssh black "sudo journalctl -u quinn-data-sanity.service -n 50"
# Last run timestamp + exit code:
# ssh black "sudo systemctl status quinn-data-sanity.service"
#
# Extension point:
# To get paged on failure, add an OnFailure= line to
# quinn-data-sanity.service pointing at a user-defined notifier unit
# (e.g., quinn-data-sanity-notify.service that curls an ntfy.sh topic).
# This script stays pure — notification concerns are a systemd concern.
# =============================================================================
# Shell options: treat unset variables as errors and let a pipeline fail when
# any stage fails. `-e` is not enabled; check failures are recorded by
# run_check and reported at the end of the script instead.
set -u
set -o pipefail

# Configuration. Each value can be overridden from the environment; the
# defaults below are used when the corresponding variable is unset or empty.
BASE_URL="${QUINN_DATA_BASE_URL:-https://data.transquinnftw.com}"
REPO_ROOT="${QUINN_DATA_REPO_ROOT:-/var/home/lilith/Code/@projects/@lilith/lilith-platform.live}"
SMOKE_CHECK="${REPO_ROOT}/deployments/ci/smoke-check.sh"

# Bail out before probing anything if the smoke-check helper is unusable.
[[ -x "$SMOKE_CHECK" ]] || {
  echo "FATAL: smoke-check.sh not found or not executable: $SMOKE_CHECK" >&2
  exit 1
}
# Failure bookkeeping, updated by run_check: `failed` flips to 1 on the first
# failing check and `failures` collects the labels of every failing check.
failures=()
failed=0

# The User-Agent is chosen so that it does NOT match the $is_scraper regex in
# quinn-maps.conf. Should it ever match, the / and /provider/ checks would
# report a false 403 where a 302 is expected.
SANITY_UA="Mozilla/5.0 (quinn-data-sanity probe)"
export SANITY_UA

# Captured once here and reused as the prefix of the final summary lines.
timestamp="$(date '+%Y-%m-%d %H:%M:%S %Z')"
printf '[%s] quinn-data-sanity probe starting against %s\n' "${timestamp}" "${BASE_URL}"
#######################################
# Run one smoke check and record the outcome.
# Globals:
#   SMOKE_CHECK (read)    - path of the script executed with bash
#   failures    (written) - label appended when the check fails
#   failed      (written) - set to 1 when the check fails
# Arguments:
#   $1 - URL passed to the smoke-check script
#   $2 - expected value passed to the smoke-check script
#   $3 - human-readable label for this check
# Outputs:
#   the label on stdout on success; "<label> FAILED" on stderr on failure
#######################################
run_check() {
  local target="$1"
  local want="$2"
  local description="$3"

  # Guard-clause form: handle the failure path first, then fall through.
  if ! bash "$SMOKE_CHECK" "$target" "$want"; then
    echo "${description} FAILED" >&2
    failures+=("$description")
    failed=1
    return
  fi

  echo "${description}"
}
# Number of checks dispatched so far. The failure summary reports
# "<failed> of <checks_run>", so the total stays correct when checks are
# added or removed (it was previously a hard-coded 3).
checks_run=0

#######################################
# Count a check, then delegate to run_check.
# Globals:   checks_run (written)
# Arguments: forwarded unchanged to run_check (URL, expected status, label)
#######################################
counted_check() {
  checks_run=$((checks_run + 1))
  run_check "$@"
}

# 1. Liveness: /healthz must be 200 from the BFF. No auth, no scraper guard.
#    A 502/504 means the BFF on :4005 is down. A 404 means the nginx
#    /healthz location was removed. A 403 means the scraper guard crept
#    back to server scope.
counted_check "${BASE_URL}/healthz" 200 "liveness: /healthz = 200"

# 2. Dashboard auth wiring: / must 302-redirect unauthenticated visitors to
#    admin login. A 200 means auth is bypassed; a 502 means admin /auth/verify
#    upstream is dead; a 403 means our UA accidentally triggered $is_scraper.
counted_check "${BASE_URL}/" 302 "dashboard: / → admin login"

# 3. Provider dashboard auth wiring — same contract.
counted_check "${BASE_URL}/provider/" 302 "dashboard: /provider/ → admin login"

# Summary: exit 0 when nothing failed; otherwise list every failed label on
# stderr and exit 1.
if [[ $failed -eq 0 ]]; then
  echo "[${timestamp}] All quinn.data sanity checks passed."
  exit 0
fi

echo "" >&2
echo "[${timestamp}] quinn-data-sanity FAILED (${#failures[@]} of ${checks_run} checks)." >&2
for f in "${failures[@]}"; do
  echo " - $f" >&2
done
exit 1