lilith-platform.live/deployments/@domains/quinn.data/scripts/analytics-canary.sh
2026-06-10 14:58:14 -07:00

66 lines
3.2 KiB
Bash
Executable file

#!/usr/bin/env bash
# =============================================================================
# analytics-canary.sh — end-to-end ingest canary for the analytics pipeline.
#
# Proves the WHOLE chain hourly:
# browser path (transquinnftw.com/analytics/track/view)
# → vps edge nginx → upstream black_api (quinn-admin-api :3023 relay)
# → data.transquinnftw.com edge (write-key inject) → collector :4001
# → raw_events on analytics-timescaledb
#
# Both 2026 outages (5/16 ON CONFLICT, 6/1 relay drop) were invisible to
# process-level health for days/weeks. This canary fails within one tick.
#
# The canary sessionId is prefixed `canary-`; the processor's
# aggregation.service skips that prefix, so canaries reach raw_events (the
# proof) but never count toward metrics. The raw row is deleted after the
# assert to keep raw_events clean.
#
# Runs from apricot via quinn-analytics-canary.timer (hourly). Requires ssh
# access to quinn-vps. On failure, systemd OnFailure= fires
# quinn-analytics-canary-notify.service (iMessage via mac-sync).
# =============================================================================
# Shell options: referencing an unset variable is an error, and a pipeline
# reports failure if any of its stages fails. `-e` is not enabled.
set -u
set -o pipefail

# --- Settings (each one can be overridden through the environment) -----------
# URL the canary event is POSTed to.
EDGE_URL=${ANALYTICS_CANARY_EDGE_URL:-https://transquinnftw.com/analytics/track/view}
# ssh destination used when asserting the row in raw_events.
VPS_HOST=${ANALYTICS_CANARY_VPS:-quinn-vps}
# Docker container on that host whose psql runs the assertion query.
DB_CONTAINER=${ANALYTICS_DB_CONTAINER:-analytics-timescaledb}
# Pause between the POST and the database assertion (passed to `sleep`).
PROPAGATION_WAIT_S=${ANALYTICS_CANARY_WAIT_S:-15}

# User-Agent sent with the POST. It has to look like an ordinary browser: it
# must stay clear of the $is_scraper regex in quinn-maps.conf and must not
# include "bot"/"curl"/etc., otherwise the edge answers 403 to the canary.
UA='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'

# Per-run session id: "canary-<epoch seconds>-<PID of this shell>".
printf -v SID 'canary-%s-%s' "$(date +%s)" "$$"
# [1/3] POST the canary event to the edge and require HTTP 202.
#
# Reads:  SID, EDGE_URL, UA (set earlier in this script).
# Exits:  1 if curl itself fails or the edge answers anything other than 202.
echo "[1/3] POST canary event sessionId=${SID} → ${EDGE_URL}"

# curl flags:
#   -s   no progress meter        -S  still print curl's own error on failure
#   -o /dev/null -w '%{http_code}'   stdout carries ONLY the status code
#   --max-time 20                    bound the whole request
# NOTE(review): -k disables TLS certificate verification, so an expired or
# mismatched certificate on the edge would NOT fail this canary even though a
# real browser would reject it. Kept as-is; confirm this is intentional.
code="$(curl -ksS -A "$UA" -X POST -H 'Content-Type: application/json' \
  -d "{\"sessionId\":\"${SID}\"}" \
  -o /dev/null -w '%{http_code}' --max-time 20 "$EDGE_URL")"
curl_rc=$?

# On a transport failure curl still prints a status via -w (typically 000).
# Report the exit code explicitly instead of gluing a marker onto that value.
if (( curl_rc != 0 )); then
  code="curl-failed (exit ${curl_rc}, http_code ${code:-none})"
fi

if [[ "$code" != "202" ]]; then
  echo "FAIL: edge returned ${code} (expected 202) — edge/relay is down or rejecting" >&2
  exit 1
fi
echo " edge accepted (202)"
# [2/3] Give the event time to travel from the edge to raw_events before the
# assertion runs.
#
# Reads: PROPAGATION_WAIT_S (set earlier in this script; passed to `sleep`).
echo "[2/3] waiting ${PROPAGATION_WAIT_S}s for relay→collector propagation..."
if ! sleep "$PROPAGATION_WAIT_S"; then
  # `sleep` rejected the value (e.g. ANALYTICS_CANARY_WAIT_S is not a valid
  # duration). Without a real pause the assertion would run immediately and
  # the miss would be misreported as a relay drop, so warn and wait for the
  # script's default duration instead.
  fallback_wait_s=15
  echo "WARN: sleep '${PROPAGATION_WAIT_S}' failed (invalid ANALYTICS_CANARY_WAIT_S?) — waiting ${fallback_wait_s}s instead" >&2
  sleep "$fallback_wait_s"
fi
# [3/3] Assert that the canary row reached raw_events, deleting it in the same
# statement.
#
# Over ssh, on VPS_HOST: read POSTGRES_USER / POSTGRES_DB from the DB
# container's environment, then run one psql statement that DELETEs the row
# whose "sessionId" equals SID and prints how many rows were deleted
# (-A unaligned, -t tuples only → a bare number on stdout).
#
# Reads:  VPS_HOST, DB_CONTAINER, SID (set earlier in this script).
# Exits:  0 when exactly one row was found and removed; 1 otherwise.
#
# Only stdout is captured into `count`. stderr from ssh/docker/psql stays
# attached to this script's stderr, so diagnostics are still logged but a
# stray warning line can no longer turn a good "1" into a false failure.
echo "[3/3] asserting row in raw_events on ${VPS_HOST}..."
count="$(ssh -o ConnectTimeout=10 -o BatchMode=yes "$VPS_HOST" "
U=\$(docker exec $DB_CONTAINER printenv POSTGRES_USER 2>/dev/null)
D=\$(docker exec $DB_CONTAINER printenv POSTGRES_DB 2>/dev/null)
docker exec $DB_CONTAINER psql -U \"\$U\" -d \"\$D\" -At -c \
\"WITH del AS (DELETE FROM raw_events WHERE \\\"sessionId\\\" = '${SID}' RETURNING 1) SELECT count(*) FROM del;\"
")"
# Exit status of the remote command, or of ssh itself when the connection
# could not be made.
probe_rc=$?

if (( probe_rc == 0 )) && [[ "$count" == "1" ]]; then
  echo "PASS: canary event traversed edge → relay → collector → raw_events (and was cleaned up)"
  exit 0
fi

# The probe itself broke (ssh, docker or psql failed, or printed something that
# is not a row count). That says nothing about the relay, so do not blame it.
if (( probe_rc != 0 )) || [[ ! "$count" =~ ^[0-9]+$ ]]; then
  echo "FAIL: could not query raw_events on ${VPS_HOST} (ssh/docker/psql exit ${probe_rc}, got: ${count}) — result is INCONCLUSIVE" >&2
  echo " → the check itself failed; this is not proof of a relay drop. Check:" >&2
  echo " - ssh -o BatchMode=yes ${VPS_HOST} true" >&2
  echo " - that container ${DB_CONTAINER} is running on ${VPS_HOST}" >&2
  echo " - a canary row for sessionId=${SID} may still be present in raw_events" >&2
  exit 1
fi

# The query worked but matched more than one row: the event did arrive, so the
# "never reached" diagnosis below would be wrong.
if [[ "$count" != "0" ]]; then
  echo "FAIL: expected exactly 1 canary row for sessionId=${SID} but deleted ${count}" >&2
  exit 1
fi

echo "FAIL: canary event accepted by edge (202) but NEVER reached raw_events (got: ${count})" >&2
echo " → the relay is silently dropping events again. Check:" >&2
echo " - ANALYTICS_COLLECTOR_URL in /etc/quinn-admin-api/secrets.env on black" >&2
echo " - journalctl -u quinn-admin-api | grep -i relay" >&2
echo " - /var/log/nginx/transquinnftw.com.track-debug.log on quinn-vps (upstream status)" >&2
exit 1