lilith-platform.live/deployments/@domains/quinn.data/deploy.sh
Natalie ab43784b33 fix(ci/data): skip broken external provider-analytics build in deploy
platform-analytics lives in lilith-platform (not .live) and its vite build
currently fails on black. Website analytics deploy must not block on it —
stage .skip-provider-dist and leave VPS /provider/ dist unchanged.
2026-06-25 00:29:57 -04:00

284 lines
14 KiB
Bash
Executable file

#!/usr/bin/env bash
# =============================================================================
# quinn.data — Deploy analytics cluster to vps-0
# =============================================================================
# Deploys:
# 1. website analytics SPA (website-frontend-users) → /var/www/quinn.data/website/dist/
# 2. provider analytics SPA (frontend-provider) → /var/www/quinn.data/dist/
# 3. website analytics BFF (website-backend-users) → /opt/quinn-website-bff/ (systemd)
# 4. nginx prod.conf → /etc/nginx/sites-available/data.transquinnftw.com
# 5. @analytics collector + infra → via @analytics ./run deploy
#
# Queries from the BFF now go to quinn-api (prod lilith_analytics DB).
# Both dev (data.quinn.apricot.lan) and prod use quinn-api + the prod DB (no dev DB).
# When running the staged website-bff, ensure QUERY_API_URL points at the INTERNAL quinn-api
# (e.g. the black instance for prod deploys).
#
# Note: /provider/api/ (:4110) requires platform-analytics/backend-api from
# lilith-platform repo — deploy that separately.
# =============================================================================
set -euo pipefail

# Local layout: the directory holding this script and the repository root
# three levels above it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"

# Source trees used by this deploy. The two paths under $HOME live outside
# this repository.
WEBSITE_FRONTEND="${REPO_ROOT}/codebase/@features/user-data/website-frontend-users"
WEBSITE_BFF="${REPO_ROOT}/codebase/@features/user-data/website-backend-users"
PLATFORM_ANALYTICS="${HOME}/Code/@projects/@lilith/lilith-platform/codebase/features/platform-analytics"
ANALYTICS_APP="${HOME}/Code/@applications/@analytics"

# Host name handed to ssh/rsync/scp, and the paths used on that host.
REMOTE="quinn-vps"
REMOTE_DIST="/var/www/quinn.data/dist"
REMOTE_WEBSITE_DIST="/var/www/quinn.data/website/dist"
# NOTE(review): REMOTE_BFF_DIR is not referenced in the visible part of this script.
REMOTE_BFF_DIR="/opt/quinn-website-bff"
REMOTE_BACKUPS="/var/www/quinn.data/.deploy-backups"

# Each run backs up into its own timestamped directory (YYYYMMDD_HHMMSS).
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
BACKUP_PATH="${REMOTE_BACKUPS}/${TIMESTAMP}"
# ---------------------------------------------------------------------------
# --rollback flag: restore the most recent backup and reload nginx
# ---------------------------------------------------------------------------
if [[ "${1:-}" == "--rollback" ]]; then
  # Manual rollback mode: restore the newest backup over the provider dist on
  # the remote host, reload nginx, then exit without deploying anything.
  echo "==> [ROLLBACK] Restoring previous dashboard dist on ${REMOTE}..."
  # Quoted heredoc delimiter: nothing below is expanded locally; the remote
  # bash receives the script verbatim on stdin.
  ssh "$REMOTE" bash -euo pipefail <<'ENDSSH'
REMOTE_BACKUPS="/var/www/quinn.data/.deploy-backups"
REMOTE_DIST="/var/www/quinn.data/dist"
# Backup directories are named YYYYMMDD_HHMMSS, so the last match of a sorted
# glob is the newest one. Selecting by name instead of by mtime matters because
# the backups are written with rsync -a, which copies the source directory's
# mtime onto the backup directory. It also avoids aborting silently under
# pipefail when the backups directory does not exist yet.
latest=""
for candidate in "$REMOTE_BACKUPS"/*/; do
  [[ -d "$candidate" ]] || continue
  candidate="${candidate%/}"
  latest="${candidate##*/}"
done
if [[ -z "$latest" ]]; then
  echo "ERROR: no backups found in $REMOTE_BACKUPS" >&2
  exit 1
fi
echo " Restoring from $REMOTE_BACKUPS/$latest ..."
rsync -a --delete "$REMOTE_BACKUPS/$latest/" "$REMOTE_DIST/"
echo " Restored successfully."
ENDSSH
  echo "==> Reloading nginx..."
  ssh "$REMOTE" "nginx -t && systemctl reload nginx"
  echo ""
  echo "Rollback completed at $(date '+%Y-%m-%d %H:%M:%S %Z')"
  echo "Note: @analytics collector rollback must be performed via: cd $ANALYTICS_APP && ./run rollback"
  exit 0
fi
# ---------------------------------------------------------------------------
# --skip-build flag: artifacts pre-built in CI; skip local builds + typecheck
# Provider analytics SPA comes from _provider-analytics-dist/ in the workspace
# ---------------------------------------------------------------------------
# Scan every argument; SKIP_BUILD flips to true if --skip-build appears
# anywhere. All other arguments are ignored here.
SKIP_BUILD=false
for arg in "$@"; do
  case "$arg" in
    --skip-build) SKIP_BUILD=true ;;
  esac
done
# ---------------------------------------------------------------------------
# Rollback trap — runs when a command fails (ERR); it restores the dist only if
# a backup was created. Explicit `exit` calls do not trigger it.
# ---------------------------------------------------------------------------
# Becomes true once the remote backup step has run; the handler below only
# attempts a restore when it is true.
BACKUP_CREATED=false

#######################################
# ERR-trap handler: report the failed step and, when a backup exists, restore
# it over the remote provider dist and reload nginx.
# Globals:   BACKUP_CREATED, BACKUP_PATH, REMOTE_DIST, REMOTE (read)
# Arguments: none — reads $? of the failing command on entry
# Outputs:   progress on stdout; a warning on stderr if the restore fails
# Returns:   never returns; exits with the failing command's status
#######################################
rollback_on_error() {
  # Must be the first statement: any other command would overwrite $?.
  local exit_code=$?
  # Disarm the trap so nothing inside the handler can re-enter it.
  trap - ERR
  echo ""
  echo "✖ Deploy step failed (exit ${exit_code})."
  if [[ "$BACKUP_CREATED" == "true" ]]; then
    echo "==> [AUTO-ROLLBACK] Restoring ${BACKUP_PATH} → ${REMOTE_DIST} ..."
    # if/else instead of `a && b || c`, so the warning is tied only to the
    # outcome of the remote restore command.
    if ssh "$REMOTE" "rsync -a --delete '${BACKUP_PATH}/' '${REMOTE_DIST}/' && nginx -t && systemctl reload nginx"; then
      echo " Rollback complete — dashboard is on the previous release."
    else
      echo " WARNING: rollback rsync also failed. Manual intervention required." >&2
    fi
  else
    echo " No backup was created — nothing to roll back."
  fi
  exit "$exit_code"
}
trap rollback_on_error ERR
# ---------------------------------------------------------------------------
# [0/6] Pre-deploy checks: ensure source directories exist
# ---------------------------------------------------------------------------
echo "==> [0/6] Pre-deploy checks..."
# Fail fast if a required source tree is missing. platform-analytics is only
# needed when building locally; @analytics is needed in every mode.
if [[ "$SKIP_BUILD" == false && ! -d "$PLATFORM_ANALYTICS" ]]; then
  echo "ERROR: platform-analytics not found at $PLATFORM_ANALYTICS" >&2
  exit 1
fi
if [[ ! -d "$ANALYTICS_APP" ]]; then
  echo "ERROR: @analytics app not found at $ANALYTICS_APP" >&2
  exit 1
fi
if [[ "$SKIP_BUILD" == false ]]; then
  # Best-effort type-check: try the `typecheck` script, then `check`. A failure
  # of either is not fatal and, because stderr is discarded, cannot be told
  # apart from a missing script — the fallback message says so.
  if cd "$PLATFORM_ANALYTICS/frontend-provider" && bun run --silent typecheck 2>/dev/null; then
    echo " Type-check passed."
  elif cd "$PLATFORM_ANALYTICS/frontend-provider" && bun run --silent check 2>/dev/null; then
    echo " Type-check passed."
  else
    echo " Type-check failed or no typecheck script found — skipping type-check pre-flight."
  fi
  cd "$SCRIPT_DIR"
  # -------------------------------------------------------------------------
  # [1/6] Build SPAs + BFF
  # -------------------------------------------------------------------------
  # Auto-bump patch version with deploy timestamp: 0.1.2-20260412_223433
  _ver=$(head -n 1 "${REPO_ROOT}/VERSION.txt")
  _base=${_ver%%-*}
  _major=${_base%%.*}; _rest=${_base#*.}; _minor=${_rest%%.*}; _patch=${_rest#*.}
  echo "${_major}.${_minor}.$((_patch + 1))-${TIMESTAMP}" > "${REPO_ROOT}/VERSION.txt"

  # Each `cd` below is a statement of its own. In `cd X && build`, a failing
  # cd is not the last command of the && list, so neither `set -e` nor the ERR
  # trap reacts and the script would carry on with a stale dist. In
  # `cd X && install || install`, a failing cd would run the fallback install
  # in the wrong directory.
  echo "==> [1/6] Building website analytics SPA (website-frontend-users)..."
  cd "$WEBSITE_FRONTEND"
  bun install --frozen-lockfile 2>/dev/null || bun install
  NODE_ENV=production bun run build
  # beacon.js is now served by the @analytics collector (see
  # @analytics/services/collector/src/beacon/), not staged into the website dist.
  cd "$SCRIPT_DIR"

  echo "==> [1b/6] Building provider analytics SPA (frontend-provider)..."
  cd "$PLATFORM_ANALYTICS/frontend-provider"
  bun run build
  cd "$SCRIPT_DIR"

  echo "==> [1c/6] Building website analytics BFF (website-backend-users)..."
  cd "$WEBSITE_BFF"
  bun install --frozen-lockfile 2>/dev/null || bun install
  bun run build
  cd "$SCRIPT_DIR"
fi
# Resolve provider analytics dist: CI artifact or local build
# Default to the locally built dist; with --skip-build, use the pre-built
# artifact directory that sits next to this script instead.
PROVIDER_ANALYTICS_DIST="${PLATFORM_ANALYTICS}/frontend-provider/dist"
if [[ "$SKIP_BUILD" == true ]]; then
  PROVIDER_ANALYTICS_DIST="${SCRIPT_DIR}/_provider-analytics-dist"
fi
# ---------------------------------------------------------------------------
# [1.5/6] Backup current dist on VPS before touching anything
# ---------------------------------------------------------------------------
echo "==> [1.5/6] Backing up current dist on ${REMOTE}..."
# Unquoted heredoc: the ${...} values are expanded locally before sending;
# only the escaped command substitution runs on the remote host.
ssh "$REMOTE" bash -euo pipefail <<ENDSSH
mkdir -p "${REMOTE_BACKUPS}"
if [[ -d "${REMOTE_DIST}" && -n "\$(ls -A '${REMOTE_DIST}' 2>/dev/null)" ]]; then
  rsync -a "${REMOTE_DIST}/" "${BACKUP_PATH}/"
  # rsync -a copies the mtime of the source directory onto the backup directory.
  # Reset it to now, otherwise a dist untouched for over a week would make the
  # prune below delete the backup that was just taken.
  touch "${BACKUP_PATH}"
  echo " Backup created: ${BACKUP_PATH}"
  find "${REMOTE_BACKUPS}" -maxdepth 1 -mindepth 1 -type d -mtime +7 -exec rm -rf {} + 2>/dev/null || true
else
  echo " No existing dist to back up — first deploy."
fi
ENDSSH
# Arm the auto-rollback only when a backup directory really exists. On a first
# deploy there is nothing to restore, and a restore attempt would only fail.
if ssh "$REMOTE" "test -d '${BACKUP_PATH}'"; then
  BACKUP_CREATED=true
else
  BACKUP_CREATED=false
fi
# ---------------------------------------------------------------------------
# [2/6] Deploy SPAs
# ---------------------------------------------------------------------------
#######################################
# Abort the deploy if a build output directory is missing or empty.
# `rsync --delete` from an empty directory would erase the live site.
# Arguments: $1 - label used in the error message
#            $2 - local directory that must exist and contain at least one entry
# Outputs:   error message on stderr when the check fails
# Returns:   0 when populated; otherwise exits the script with status 1
#######################################
_require_populated_dist() {
  local label=$1
  local dir=$2
  if [[ ! -d "$dir" || -z "$(ls -A "$dir" 2>/dev/null)" ]]; then
    echo "ERROR: ${label} dist is missing or empty: ${dir}" >&2
    exit 1
  fi
}

# A .skip-provider-dist marker in the provider dist means "no fresh build":
# leave the remote provider dist exactly as it is.
_skip_provider=false
if [[ -f "$PROVIDER_ANALYTICS_DIST/.skip-provider-dist" ]]; then
  _skip_provider=true
fi

# Validate every dist that is about to be published before touching the
# remote, so a bad build cannot leave one SPA updated and the other wiped.
_require_populated_dist "website analytics" "$WEBSITE_FRONTEND/dist"
if [[ "$_skip_provider" == false ]]; then
  _require_populated_dist "provider analytics" "$PROVIDER_ANALYTICS_DIST"
fi

echo "==> [2/6] Deploying website analytics SPA to ${REMOTE}..."
ssh "$REMOTE" "mkdir -p ${REMOTE_WEBSITE_DIST}"
rsync -avz --delete "$WEBSITE_FRONTEND/dist/" "${REMOTE}:${REMOTE_WEBSITE_DIST}/"
if [[ "$_skip_provider" == true ]]; then
  echo "==> [2b/6] Skipping provider analytics SPA (no fresh build; VPS /provider/ dist unchanged)"
else
  echo "==> [2b/6] Deploying provider analytics SPA to ${REMOTE}..."
  ssh "$REMOTE" "mkdir -p ${REMOTE_DIST}"
  rsync -avz --delete "$PROVIDER_ANALYTICS_DIST/" "${REMOTE}:${REMOTE_DIST}/"
fi
# ---------------------------------------------------------------------------
# [3/6] Stage BFF for Docker deploy (copies dist into @analytics services tree)
# ---------------------------------------------------------------------------
echo "==> [3/6] Staging website analytics BFF for Docker deploy..."
# Copy the compiled server bundle into the @analytics services tree.
ANALYTICS_BFF_DIR="${ANALYTICS_APP}/services/website-bff"
mkdir -p "${ANALYTICS_BFF_DIR}/dist"
cp "${WEBSITE_BFF}/dist/server.js" "${ANALYTICS_BFF_DIR}/dist/server.js"
# Strip scripts/devDependencies so turbo doesn't try to build this pre-compiled service.
# The two paths reach node through the environment instead of being spliced
# into the JavaScript source, so a quote or backslash in a path cannot break
# or alter the script.
BFF_PKG_SRC="${WEBSITE_BFF}/package.json" \
BFF_PKG_DEST="${ANALYTICS_BFF_DIR}/package.json" \
node -e '
const fs = require("fs");
const pkg = JSON.parse(fs.readFileSync(process.env.BFF_PKG_SRC, "utf8"));
delete pkg.scripts;
delete pkg.devDependencies;
fs.writeFileSync(process.env.BFF_PKG_DEST, JSON.stringify(pkg, null, 2));
'
echo " BFF dist staged at ${ANALYTICS_BFF_DIR}/dist/server.js"
# ---------------------------------------------------------------------------
# [4/6] nginx: prod.conf + symlink + reload
# ---------------------------------------------------------------------------
echo "==> [4/6] Syncing nginx prod.conf (data.transquinnftw.com)..."
# Remote locations of the site config and of the symlink that enables it.
_site_available="/etc/nginx/sites-available/data.transquinnftw.com"
_site_enabled="/etc/nginx/sites-enabled/data.transquinnftw.com"
# Upload the config, then (re)create the symlink; a symlink failure is ignored.
scp "${SCRIPT_DIR}/nginx/prod.conf" "${REMOTE}:${_site_available}"
ssh "$REMOTE" "ln -sf ${_site_available} ${_site_enabled} 2>/dev/null || true"
# Reload only if the configuration test passes; a failure here fails the deploy.
ssh "$REMOTE" "nginx -t && systemctl reload nginx"
# ---------------------------------------------------------------------------
# [5/6] Post-deploy health check for dashboard
# ---------------------------------------------------------------------------
# /healthz is the single source of truth for liveness: nginx → BFF (:4005)/health.
# A 200 here proves both nginx is serving and the BFF is alive. Hitting / would
# return 302 (auth redirect), which curl -sf treats as success — a useless check.
echo "==> [5/6] Verifying cluster is live via /healthz..."
# Response body goes to a private temp file rather than a fixed /tmp name.
HEALTHZ_BODY="$(mktemp)"
# curl prints the -w status code even when the transfer fails ("000" when no
# HTTP response was received). Appending another "000" on failure would yield
# "000000", so a failure is tolerated here and the default is applied only
# when nothing was printed at all.
HEALTHZ_STATUS="$(curl -sS -o "$HEALTHZ_BODY" -w '%{http_code}' \
  -A 'Mozilla/5.0 (quinn.data deploy healthcheck)' \
  --max-time 15 \
  https://data.transquinnftw.com/healthz 2>/dev/null)" || true
HEALTHZ_STATUS="${HEALTHZ_STATUS:-000}"
if [[ "$HEALTHZ_STATUS" != "200" ]]; then
  echo "ERROR: /healthz returned ${HEALTHZ_STATUS} — cluster is not responding." >&2
  echo " Body:" >&2
  cat "$HEALTHZ_BODY" >&2 2>/dev/null || true
  echo "" >&2
  rm -f -- "$HEALTHZ_BODY"
  # NOTE(review): an explicit `exit` does not fire the ERR trap, so this path
  # performs no auto-rollback — confirm that is the intended behaviour.
  exit 1
fi
echo " /healthz OK: $(cat "$HEALTHZ_BODY")"
rm -f -- "$HEALTHZ_BODY"
# ---------------------------------------------------------------------------
# [6/6] Deploy @analytics collector + infra
# ---------------------------------------------------------------------------
echo "==> [6/6] Deploying @analytics collector + infra..."
# Best-effort step: a failed collector deploy is reported as a warning and the
# script carries on. Running inside an `if` condition keeps the ERR trap quiet.
if ! { cd "$ANALYTICS_APP" && ./run deploy; }; then
  echo " WARNING: @analytics collector deploy failed." >&2
  echo " Ensure Docker is installed on the VPS: ./run deploy:data:install-docker" >&2
else
  echo " Collector stack deployed."
fi
# Return to the script directory whatever the outcome above.
cd "$SCRIPT_DIR"
# ---------------------------------------------------------------------------
# [7/7] Install/refresh the quinn-data-sanity systemd timer on THIS host
# ---------------------------------------------------------------------------
# The timer runs on the host executing deploy.sh (black.lan in CI) because:
# - it only needs outbound HTTPS to data.transquinnftw.com, no VPS access
# - black is already the long-lived ops host with systemd + journalctl
# - keeping it off the VPS means a VPS outage doesn't kill the very probe
# that's supposed to detect VPS outages
#
# Requires passwordless sudo for systemctl + /etc/systemd/system writes.
# If sudo is not available (e.g., running deploy.sh from a laptop for a
# hotfix), the step logs a warning and continues — CI is the authoritative
# install path.
echo "==> [7/7] Installing quinn-data-sanity systemd timer on $(hostname -s)..."
# `sudo -n` never prompts, so this probe tells whether passwordless sudo works.
if sudo -n systemctl --version >/dev/null 2>&1; then
  # Install the probe scripts under /opt and the unit files under /etc/systemd.
  SANITY_OPT="/opt/quinn-data-sanity"
  sudo mkdir -p "${SANITY_OPT}/scripts" "${SANITY_OPT}/deployments/ci"
  sudo cp "$SCRIPT_DIR/scripts/sanity.sh" "${SANITY_OPT}/scripts/sanity.sh"
  sudo cp "$REPO_ROOT/deployments/ci/smoke-check.sh" "${SANITY_OPT}/deployments/ci/smoke-check.sh"
  sudo chmod +x "${SANITY_OPT}/scripts/sanity.sh" "${SANITY_OPT}/deployments/ci/smoke-check.sh"
  sudo cp "$SCRIPT_DIR/systemd/quinn-data-sanity.service" /etc/systemd/system/quinn-data-sanity.service
  sudo cp "$SCRIPT_DIR/systemd/quinn-data-sanity.timer" /etc/systemd/system/quinn-data-sanity.timer
  sudo systemctl daemon-reload
  sudo systemctl enable --now quinn-data-sanity.timer
  # Fire one immediate run so any regression shows up in the deploy logs
  # rather than waiting for the next 5-min tick.
  # `systemctl start` may itself return non-zero when the run fails (the unit
  # type is not visible from this script), so its status is captured rather
  # than allowed to skip the report: a failed run always prints the warning
  # and the recent journal lines.
  SANITY_START_RC=0
  sudo systemctl start quinn-data-sanity.service || SANITY_START_RC=$?
  STATUS="$(sudo systemctl show -p ExecMainStatus --value quinn-data-sanity.service)"
  if [[ "$SANITY_START_RC" -eq 0 && "$STATUS" == "0" ]]; then
    echo " quinn-data-sanity: immediate run PASSED."
  else
    echo " WARNING: quinn-data-sanity: immediate run exited ${STATUS} (systemctl start returned ${SANITY_START_RC})." >&2
    sudo journalctl -u quinn-data-sanity.service -n 20 --no-pager >&2 || true
  fi
  # First two columns of the timer listing; empty output falls back to "unknown".
  NEXT="$(sudo systemctl list-timers quinn-data-sanity.timer --no-pager --no-legend 2>/dev/null | awk '{print $1, $2}')"
  echo " Timer installed. Next run: ${NEXT:-unknown}"
else
  echo " WARNING: sudo not available non-interactively — systemd timer NOT installed on $(hostname -s)." >&2
  echo " Run manually: sudo bash $SCRIPT_DIR/deploy.sh (or install the units by hand)." >&2
fi
# Final summary: a blank line, the completion time, then the useful URLs and
# the manual rollback command — one printf argument per output line.
printf '%s\n' \
  "" \
  "Deployed at $(date '+%Y-%m-%d %H:%M:%S %Z')" \
  "Dashboard: https://data.transquinnftw.com/" \
  "Healthz: https://data.transquinnftw.com/healthz (public liveness probe)" \
  "To roll back dashboard: bash $SCRIPT_DIR/deploy.sh --rollback"