#!/usr/bin/env bash
# =============================================================================
# analytics-canary.sh — end-to-end ingest canary for the analytics pipeline.
#
# Proves the WHOLE chain hourly:
#   browser path (transquinnftw.com/analytics/track/view)
#     → vps edge nginx → upstream black_api (quinn-admin-api :3023 relay)
#     → data.transquinnftw.com edge (write-key inject) → collector :4001
#     → raw_events on analytics-timescaledb
#
# Both 2026 outages (5/16 ON CONFLICT, 6/1 relay drop) were invisible to
# process-level health for days/weeks. This canary fails within one tick.
#
# The canary sessionId is prefixed `canary-`; the processor's
# aggregation.service skips that prefix, so canaries reach raw_events (the
# proof) but never count toward metrics. The raw row is deleted by the same
# statement that asserts it, to keep raw_events clean.
#
# Runs from apricot via quinn-analytics-canary.timer (hourly). Requires ssh
# access to quinn-vps. On failure, systemd OnFailure= fires
# quinn-analytics-canary-notify.service (iMessage via mac-sync).
#
# Environment overrides (all optional):
#   ANALYTICS_CANARY_EDGE_URL  URL the canary event is POSTed to
#   ANALYTICS_CANARY_VPS       ssh host on which `docker exec` is run
#   ANALYTICS_DB_CONTAINER     name of the database container on that host
#   ANALYTICS_CANARY_WAIT_S    seconds to sleep between the POST and the assert
#
# Exit status:
#   0  exactly one canary row was found in raw_events (and deleted)
#   1  anything else; the message on stderr says which stage failed
# =============================================================================

# errexit (-e) is intentionally left off: every step below checks its own
# status so it can print a stage-specific diagnosis before exiting.
set -uo pipefail

readonly EDGE_URL="${ANALYTICS_CANARY_EDGE_URL:-https://transquinnftw.com/analytics/track/view}"
readonly VPS_HOST="${ANALYTICS_CANARY_VPS:-quinn-vps}"
readonly DB_CONTAINER="${ANALYTICS_DB_CONTAINER:-analytics-timescaledb}"
readonly PROPAGATION_WAIT_S="${ANALYTICS_CANARY_WAIT_S:-15}"

# Plain browser UA — must NOT match the $is_scraper regex in quinn-maps.conf,
# and must not contain "bot"/"curl"/etc. or the edge 403s the canary itself.
readonly UA="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"

# Unique per run: epoch seconds plus this shell's PID.
readonly SID="canary-$(date +%s)-$$"

# Print each argument as its own line on stderr, then exit 1.
fail() {
  printf '%s\n' "$@" >&2
  exit 1
}

# -----------------------------------------------------------------------------
# [1/3] POST the canary event through the public edge.
# -----------------------------------------------------------------------------
echo "[1/3] POST canary event sessionId=${SID} → ${EDGE_URL}"

# NOTE(review): -k disables TLS certificate verification, so an expired or
# mismatched certificate on the edge will NOT fail this canary even though real
# browsers would be blocked. Kept as-is; confirm whether that is intended.
#
# curl's exit status is captured separately from the HTTP code: on a transport
# failure curl still prints "000" for %{http_code}, which must not be reported
# as if the edge had answered.
curl_rc=0
code="$(curl -ks -A "$UA" -X POST -H 'Content-Type: application/json' \
  -d "{\"sessionId\":\"${SID}\"}" \
  -o /dev/null -w '%{http_code}' --max-time 20 "$EDGE_URL")" || curl_rc=$?

if ((curl_rc != 0)); then
  fail "FAIL: curl could not complete the POST (curl exit ${curl_rc}, http_code=${code:-none}) — edge unreachable or timed out"
fi
if [[ "$code" != "202" ]]; then
  fail "FAIL: edge returned ${code} (expected 202) — edge/relay is down or rejecting"
fi
echo " edge accepted (202)"

# -----------------------------------------------------------------------------
# [2/3] Give the relay → collector hop time to write the row.
# -----------------------------------------------------------------------------
echo "[2/3] waiting ${PROPAGATION_WAIT_S}s for relay→collector propagation..."
sleep "$PROPAGATION_WAIT_S" \
  || fail "FAIL: sleep rejected ANALYTICS_CANARY_WAIT_S='${PROPAGATION_WAIT_S}' — canary aborted before the assert"

# -----------------------------------------------------------------------------
# [3/3] Assert (and clean up) the row on the database host.
# -----------------------------------------------------------------------------
echo "[3/3] asserting row in raw_events on ${VPS_HOST}..."

# Script executed by the remote shell. \$ and \" are escaped so they are
# evaluated remotely; ${DB_CONTAINER} and ${SID} are expanded locally.
# The DELETE … RETURNING inside the CTE removes the canary row and reports how
# many rows it removed, so "found" and "cleaned up" are a single statement.
# A failing printenv aborts with status 3 instead of running psql with an
# empty user/database.
remote_script="
U=\$(docker exec \"${DB_CONTAINER}\" printenv POSTGRES_USER) || exit 3
D=\$(docker exec \"${DB_CONTAINER}\" printenv POSTGRES_DB) || exit 3
docker exec \"${DB_CONTAINER}\" psql -U \"\$U\" -d \"\$D\" -At -c \
  \"WITH del AS (DELETE FROM raw_events WHERE \\\"sessionId\\\" = '${SID}' RETURNING 1) SELECT count(*) FROM del;\"
"

# Only stdout is captured. stderr (ssh warnings, docker/psql errors) flows
# straight to this script's stderr so it can never corrupt the count.
ssh_rc=0
count="$(ssh -o ConnectTimeout=10 -o BatchMode=yes "$VPS_HOST" "$remote_script")" || ssh_rc=$?

# ssh reserves exit status 255 for its own errors (connect/auth failure).
if ((ssh_rc == 255)); then
  fail "FAIL: could not verify raw_events — ssh to ${VPS_HOST} failed (exit 255)" \
    " → edge accepted the event (202); this is a canary-infrastructure problem, not proof of dropped events." \
    " → canary row ${SID} may have been left behind in raw_events."
fi

# Any other non-zero status, or non-numeric output, means the remote
# docker/psql query itself did not run to completion.
if ((ssh_rc != 0)) || [[ ! "$count" =~ ^[0-9]+$ ]]; then
  fail "FAIL: could not verify raw_events — query on ${VPS_HOST} failed (exit ${ssh_rc}, output: ${count:-<empty>})" \
    " → check that container ${DB_CONTAINER} is running and psql can connect." \
    " → canary row ${SID} may have been left behind in raw_events."
fi

case "$count" in
  1)
    echo "PASS: canary event traversed edge → relay → collector → raw_events (and was cleaned up)"
    exit 0
    ;;
  0)
    fail "FAIL: canary event accepted by edge (202) but NEVER reached raw_events (got: ${count})" \
      " → the relay is silently dropping events again. Check:" \
      " - ANALYTICS_COLLECTOR_URL in /etc/quinn-admin-api/secrets.env on black" \
      " - journalctl -u quinn-admin-api | grep -i relay" \
      " - /var/log/nginx/transquinnftw.com.track-debug.log on quinn-vps (upstream status)"
    ;;
  *)
    fail "FAIL: canary sessionId ${SID} matched ${count} rows in raw_events (expected exactly 1)" \
      " → the event was stored more than once; all matching rows were deleted."
    ;;
esac