diff --git a/deployments/@domains/quinn.www/scripts/deploy-edge-watcher.sh b/deployments/@domains/quinn.www/scripts/deploy-edge-watcher.sh new file mode 100755 index 00000000..4f625fda --- /dev/null +++ b/deployments/@domains/quinn.www/scripts/deploy-edge-watcher.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# +# deploy-edge-watcher.sh — install/update the public-edge health watcher on vps-0. +# +# Idempotent. Ships edge-watcher.sh to /opt/quinn-edge-watcher, installs the +# systemd oneshot + minute timer, seeds /etc/quinn-edge-watcher/watcher.env on +# first run (never clobbers an existing one), validates with a --dry-run, then +# enables the timer. +# +# vps-0 runs deploys as root (no sudo). See docs/EDGE_ISLAND_MODE.md. +# +# Usage: +# ./deploy-edge-watcher.sh # deploy + enable timer +# ./deploy-edge-watcher.sh --verify # ship + dry-run only; do NOT enable timer / send mail + +set -euo pipefail + +REMOTE="${EDGE_WATCHER_REMOTE:-quinn-vps}" +SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VERIFY_ONLY=0 +[[ "${1:-}" == "--verify" ]] && VERIFY_ONLY=1 + +echo "==> [1/4] Shipping watcher to ${REMOTE}:/opt/quinn-edge-watcher" +ssh "$REMOTE" 'mkdir -p /opt/quinn-edge-watcher/state /etc/quinn-edge-watcher' +scp -q "$SRC_DIR/edge-watcher.sh" "$REMOTE:/opt/quinn-edge-watcher/edge-watcher.sh" +ssh "$REMOTE" 'chmod +x /opt/quinn-edge-watcher/edge-watcher.sh' + +echo "==> [2/4] Seeding watcher.env (only if absent)" +ssh "$REMOTE" 'test -f /etc/quinn-edge-watcher/watcher.env || cat > /etc/quinn-edge-watcher/watcher.env < [3/4] Validating with --dry-run (no email, no state writes)" +ssh "$REMOTE" 'set -a; . /etc/quinn-edge-watcher/watcher.env 2>/dev/null; set +a; /opt/quinn-edge-watcher/edge-watcher.sh --dry-run' + +if [[ "$VERIFY_ONLY" == 1 ]]; then + echo "==> [verify] units NOT installed, timer NOT enabled. Re-run without --verify to go live." 
+ exit 0 +fi + +echo "==> [4/4] Installing systemd units + enabling minute timer" +scp -q "$SRC_DIR/quinn-edge-watcher.service" "$REMOTE:/etc/systemd/system/quinn-edge-watcher.service" +scp -q "$SRC_DIR/quinn-edge-watcher.timer" "$REMOTE:/etc/systemd/system/quinn-edge-watcher.timer" +ssh "$REMOTE" 'systemctl daemon-reload && systemctl enable --now quinn-edge-watcher.timer && systemctl list-timers quinn-edge-watcher.timer --no-pager' + +echo "==> Done. First run sends an ACTIVE notice. Tail with:" +echo " ssh ${REMOTE} 'journalctl -u quinn-edge-watcher.service -n 30 --no-pager'" diff --git a/deployments/@domains/quinn.www/scripts/edge-watcher.sh b/deployments/@domains/quinn.www/scripts/edge-watcher.sh new file mode 100755 index 00000000..2062bb36 --- /dev/null +++ b/deployments/@domains/quinn.www/scripts/edge-watcher.sh @@ -0,0 +1,261 @@ +#!/usr/bin/env bash +# +# edge-watcher.sh — vps-0 public-edge health watcher. +# +# Probes every backend the public site depends on, writes a per-form status JSON +# (the island-mode kill-switch oracle that the SPA reads), and emails alerts: +# - IMMEDIATE on a confirmed UP->DOWN transition (anti-flap: N consecutive fails) +# - ESCALATION reminders at +1h / +4h / +6h while a target stays down +# - RECOVERY on DOWN->UP +# - WEEKLY "watcher active" heartbeat while everything is healthy +# +# Runs as a systemd oneshot fired by quinn-edge-watcher.timer (every minute). +# Email goes through the local DMS relay (127.0.0.1:25, permit_mynetworks) so the +# alert path does NOT depend on black — essential, since black being down is the +# very thing it alerts on. +# +# Decision-independent: read-only probes + email only. Touches no data path, so it +# is safe regardless of the canonical-DB question (see docs/EDGE_ISLAND_MODE.md §6). 
+# +# Usage: +# edge-watcher.sh # one cycle: probe, persist state, send due alerts +# edge-watcher.sh --dry-run # probe + print status JSON + would-be alerts; no email, no state writes + +# NB: errexit (set -e) is deliberately OFF. A monitor must not abort mid-cycle when +# a probe fails — `curl` exits non-zero on connection-refused, which under set -e +# would kill the very run that needs to raise the alarm. Failures are handled +# explicitly instead. nounset + pipefail stay on. +set -uo pipefail + +# --------------------------------------------------------------------------- +# Config (override via /etc/quinn-edge-watcher/watcher.env) +# --------------------------------------------------------------------------- +WATCHER_DIR="${EDGE_WATCHER_DIR:-/opt/quinn-edge-watcher}" +STATE_DIR="${EDGE_WATCHER_STATE_DIR:-${WATCHER_DIR}/state}" +STATUS_JSON="${EDGE_WATCHER_STATUS_JSON:-${STATE_DIR}/status.json}" +ALERT_TO="${EDGE_WATCHER_ALERT_TO:-transquinnftw@pm.me}" +ALERT_FROM="${EDGE_WATCHER_ALERT_FROM:-noreply@transquinnftw.com}" +SMTP_SERVER="${EDGE_WATCHER_SMTP:-127.0.0.1:25}" +PROBE_TIMEOUT="${EDGE_WATCHER_TIMEOUT:-3}" +FAIL_THRESHOLD="${EDGE_WATCHER_FAIL_THRESHOLD:-2}" # consecutive fails before DOWN (anti-flap) +HEARTBEAT_SECONDS="${EDGE_WATCHER_HEARTBEAT_SECONDS:-604800}" # 7 days +HOSTLABEL="${EDGE_WATCHER_HOSTLABEL:-vps-0 (transquinnftw.com edge)}" + +# Escalation reminder offsets (seconds after down_since). 0 == immediate. +ESCALATIONS=(0 3600 14400 21600) # immediate, +1h, +4h, +6h + +DRY_RUN=0 +[[ "${1:-}" == "--dry-run" ]] && DRY_RUN=1 +# NO_MAIL: run live (persist state) but suppress actual sends — for ops verification. +NO_MAIL="${EDGE_WATCHER_NO_MAIL:-0}" + +# --------------------------------------------------------------------------- +# Targets: name|probe_url|forms_disabled_when_down (csv) +# A target is UP when it answers HTTP < 500; DOWN on connection failure/timeout +# (http_code 000) or a 5xx. Photos origin legitimately 404s at root => UP. 
+# --------------------------------------------------------------------------- +TARGETS=( + "black_api|http://10.0.0.11:3023/health|contact,touring,waitlist" + "black_data_api|http://10.0.0.11:3022/health|" + "local_my_api|http://127.0.0.1:3024/health|booking,roster" + "local_newsletter|http://127.0.0.1:3026/health|newsletter" + "black_photos|http://10.0.0.11:8081/|" +) + +# Surfaces reported in status.json (form/read -> the target it depends on). +declare -A FORM_DEP=( + [contact]=black_api [touring]=black_api [waitlist]=black_api + [booking]=local_my_api [roster]=local_my_api + [newsletter]=local_newsletter + [read_www]=black_api [read_provider_data]=black_data_api [read_photos]=black_photos +) + +now() { date +%s; } +iso() { date -u +%Y-%m-%dT%H:%M:%SZ; } + +log() { printf '[edge-watcher] %s\n' "$*" >&2; } + +# --------------------------------------------------------------------------- +# Email +# --------------------------------------------------------------------------- +send_email() { + local subject="$1" body="$2" + if [[ "$DRY_RUN" == 1 ]]; then + printf '\n--- WOULD SEND EMAIL ---\nTo: %s\nSubject: %s\n\n%s\n------------------------\n' \ + "$ALERT_TO" "$subject" "$body" >&2 + return 0 + fi + if [[ "$NO_MAIL" == 1 ]]; then + log "[no-mail] suppressed send: $subject" + return 0 + fi + if ! 
command -v swaks >/dev/null 2>&1; then + log "swaks not installed; cannot send: $subject" + return 1 + fi + swaks --server "$SMTP_SERVER" --from "$ALERT_FROM" --to "$ALERT_TO" \ + --header "Subject: $subject" --body "$body" --silent 2>/dev/null \ + || log "swaks send failed: $subject" +} + +# --------------------------------------------------------------------------- +# Per-target state file helpers (key=value lines) +# --------------------------------------------------------------------------- +state_file() { printf '%s/%s.state' "$STATE_DIR" "$1"; } + +state_get() { # name key default + local f; f="$(state_file "$1")" + [[ -f "$f" ]] || { printf '%s' "$3"; return; } + local v; v="$(grep -m1 "^$2=" "$f" 2>/dev/null | cut -d= -f2-)" + [[ -n "$v" ]] && printf '%s' "$v" || printf '%s' "$3" +} + +state_write() { # name k=v k=v ... + [[ "$DRY_RUN" == 1 ]] && return 0 + local name="$1"; shift + local f tmp; f="$(state_file "$name")"; tmp="${f}.tmp.$$" + printf '%s\n' "$@" > "$tmp" + mv -f "$tmp" "$f" +} + +# --------------------------------------------------------------------------- +# Probe one target -> sets globals: P_CODE P_LATENCY P_UP +# --------------------------------------------------------------------------- +probe() { + local url="$1" t0 t1 n + t0="$(date +%s%3N)" + # curl -w prints the http_code even on failure ("000"), so do NOT add `|| echo` + # — that double-appends and corrupts the value. + P_CODE="$(curl -s -o /dev/null -w '%{http_code}' --max-time "$PROBE_TIMEOUT" "$url" 2>/dev/null)" + t1="$(date +%s%3N)" + P_LATENCY=$(( t1 - t0 )) + [[ "$P_CODE" =~ ^[0-9]+$ ]] || P_CODE=000 + n=$(( 10#$P_CODE )) # 10# guards against octal parsing of leading-zero codes + # UP = process answered with a non-server-error status (100..499). Photos origin + # legitimately 404s at root. DOWN = no connection (000) or 5xx. 
+ if (( n >= 100 && n < 500 )); then P_UP=1; else P_UP=0; fi +} + +# --------------------------------------------------------------------------- +# Main cycle +# --------------------------------------------------------------------------- +# In dry-run, redirect all state to a throwaway dir BEFORE any mkdir, so a bare +# verification run writes nothing under /opt. +[[ "$DRY_RUN" == 1 ]] && { STATE_DIR="$(mktemp -d)"; STATUS_JSON="${STATE_DIR}/status.json"; } +mkdir -p "$STATE_DIR" + +NOW="$(now)" +declare -A TARGET_UP=() # name -> 0/1 +json_targets="" + +for spec in "${TARGETS[@]}"; do + IFS='|' read -r name url _forms <<< "$spec" + probe "$url" + TARGET_UP["$name"]="$P_UP" + + prev_status="$(state_get "$name" status up)" + fails="$(state_get "$name" consecutive_fails 0)" + down_since="$(state_get "$name" down_since 0)" + alerts_sent="$(state_get "$name" alerts_sent '')" + + if [[ "$P_UP" == 1 ]]; then + if [[ "$prev_status" == down ]]; then + local_dur=$(( NOW - down_since )) + send_email "[edge-watcher] RECOVERED: ${name}" \ + "Target ${name} on ${HOSTLABEL} is back UP (HTTP ${P_CODE}). +Was down for $(( local_dur / 60 )) min. +Probe: ${url} +Time: $(iso)" + fi + state_write "$name" "status=up" "consecutive_fails=0" "down_since=0" "alerts_sent=" + reason="" + else + fails=$(( fails + 1 )) + if [[ "$prev_status" == up ]]; then + if (( fails >= FAIL_THRESHOLD )); then + # Confirmed transition UP -> DOWN + send_email "[edge-watcher] DOWN: ${name}" \ + "Target ${name} on ${HOSTLABEL} is DOWN (HTTP ${P_CODE}) after ${fails} consecutive failed probes. +Disables forms: ${_forms:-} +Probe: ${url} +Time: $(iso) +Escalation reminders will follow at +1h / +4h / +6h if it stays down." 
+ state_write "$name" "status=down" "consecutive_fails=${fails}" "down_since=${NOW}" "alerts_sent=0" + else + # Flap guard: not yet confirmed down, do not alert + state_write "$name" "status=up" "consecutive_fails=${fails}" "down_since=0" "alerts_sent=" + fi + else + # Already down: send any due escalation reminders + local_elapsed=$(( NOW - down_since )) + for off in "${ESCALATIONS[@]}"; do + [[ "$off" == 0 ]] && continue + if (( local_elapsed >= off )) && [[ ",${alerts_sent}," != *",${off},"* ]]; then + send_email "[edge-watcher] STILL DOWN (+$(( off / 3600 ))h): ${name}" \ + "Target ${name} on ${HOSTLABEL} has been DOWN for $(( local_elapsed / 60 )) min. +Disables forms: ${_forms:-} +Probe: ${url} (HTTP ${P_CODE}) +Time: $(iso)" + alerts_sent="${alerts_sent},${off}" + fi + done + state_write "$name" "status=down" "consecutive_fails=${fails}" "down_since=${down_since}" "alerts_sent=${alerts_sent}" + fi + reason="backend_unreachable" + fi + + json_targets+=$(printf '{"name":"%s","up":%s,"httpCode":"%s","latencyMs":%s},' \ + "$name" "$([[ "$P_UP" == 1 ]] && echo true || echo false)" "$P_CODE" "$P_LATENCY") +done + +# --------------------------------------------------------------------------- +# Derive per-form status + write status.json (atomic) +# --------------------------------------------------------------------------- +json_forms="" +for form in "${!FORM_DEP[@]}"; do + dep="${FORM_DEP[$form]}" + up="${TARGET_UP[$dep]:-0}" + json_forms+=$(printf '"%s":{"enabled":%s,"dependsOn":"%s"},' \ + "$form" "$([[ "$up" == 1 ]] && echo true || echo false)" "$dep") +done + +black_reachable=false +{ [[ "${TARGET_UP[black_api]:-0}" == 1 ]] || [[ "${TARGET_UP[black_data_api]:-0}" == 1 ]]; } && black_reachable=true + +tmp="${STATUS_JSON}.tmp.$$" +printf '{"ts":"%s","host":"%s","blackReachable":%s,"targets":[%s],"forms":{%s}}\n' \ + "$(iso)" "$HOSTLABEL" "$black_reachable" "${json_targets%,}" "${json_forms%,}" > "$tmp" +mv -f "$tmp" "$STATUS_JSON" + +if [[ "$DRY_RUN" == 1 ]]; 
then + echo "=== status.json ==="; cat "$STATUS_JSON"; echo + rm -rf "$STATE_DIR" + exit 0 +fi + +# --------------------------------------------------------------------------- +# Weekly heartbeat (only while fully healthy) + first-run activation notice +# --------------------------------------------------------------------------- +HB_FILE="${STATE_DIR}/heartbeat.last" +all_up=1 +for spec in "${TARGETS[@]}"; do IFS='|' read -r name _ _ <<< "$spec"; [[ "${TARGET_UP[$name]}" == 1 ]] || all_up=0; done + +if [[ ! -f "$HB_FILE" ]]; then + send_email "[edge-watcher] ACTIVE: monitoring started on ${HOSTLABEL}" \ + "edge-watcher is now running on ${HOSTLABEL}. +Probing: black_api, black_data_api, local_my_api, local_newsletter, black_photos every minute. +Weekly active heartbeats + immediate/1h/4h/6h down alerts enabled. +Time: $(iso)" + echo "$NOW" > "$HB_FILE" +elif (( all_up == 1 )); then + last_hb="$(cat "$HB_FILE" 2>/dev/null || echo 0)" + if (( NOW - last_hb >= HEARTBEAT_SECONDS )); then + send_email "[edge-watcher] weekly heartbeat — all healthy on ${HOSTLABEL}" \ + "All edge backends healthy. Weekly active heartbeat. 
+$(cat "$STATUS_JSON") +Time: $(iso)" + echo "$NOW" > "$HB_FILE" + fi +fi + +exit 0 diff --git a/deployments/@domains/quinn.www/scripts/quinn-edge-watcher.service b/deployments/@domains/quinn.www/scripts/quinn-edge-watcher.service new file mode 100644 index 00000000..920b4c90 --- /dev/null +++ b/deployments/@domains/quinn.www/scripts/quinn-edge-watcher.service @@ -0,0 +1,18 @@ +[Unit] +Description=Quinn public-edge health watcher (probes backends, writes status oracle, emails alerts) +Documentation=https://github.com/lilith/lilith-platform.live/blob/main/docs/EDGE_ISLAND_MODE.md +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +EnvironmentFile=-/etc/quinn-edge-watcher/watcher.env +ExecStart=/opt/quinn-edge-watcher/edge-watcher.sh +Nice=10 +# Hardening: the watcher only needs to read configs, curl backends, write its +# own state dir, and invoke the local mail relay. +ProtectSystem=strict +ProtectHome=true +ReadWritePaths=/opt/quinn-edge-watcher/state +NoNewPrivileges=true +PrivateTmp=true diff --git a/deployments/@domains/quinn.www/scripts/quinn-edge-watcher.timer b/deployments/@domains/quinn.www/scripts/quinn-edge-watcher.timer new file mode 100644 index 00000000..2e981e06 --- /dev/null +++ b/deployments/@domains/quinn.www/scripts/quinn-edge-watcher.timer @@ -0,0 +1,12 @@ +[Unit] +Description=Run quinn-edge-watcher every minute +Documentation=https://github.com/lilith/lilith-platform.live/blob/main/docs/EDGE_ISLAND_MODE.md + +[Timer] +OnBootSec=30s +OnUnitActiveSec=60s +AccuracySec=10s +Unit=quinn-edge-watcher.service + +[Install] +WantedBy=timers.target diff --git a/docs/EDGE_ISLAND_MODE.md b/docs/EDGE_ISLAND_MODE.md new file mode 100644 index 00000000..a7734137 --- /dev/null +++ b/docs/EDGE_ISLAND_MODE.md @@ -0,0 +1,174 @@ +# Edge Resilience & Island Mode — Verified Topology + Design + +**Status:** Investigation + design. **No runtime changes made** (read-only probes only). 
+**Verified:** 2026-06-21, via read-only `ssh quinn-vps` / `ssh black`, live nginx config, live `quinn-upstreams.conf`, live HTTP + DB probes. +**Current-state facts here supersede:** the SQLite-era inventory in [`PROD_DB_UNIFICATION_PLAN.md`](PROD_DB_UNIFICATION_PLAN.md) (the platform has since moved to PostgreSQL) and the "dead forms" verdict in [`FORMS_AUDIT.md`](FORMS_AUDIT.md) (2026-06-03 — the edge `location` blocks have since been added; forms are now routed). +**Direction docs (target, not current state):** [`migration-vps-to-black.md`](migration-vps-to-black.md), [`PROD_DB_UNIFICATION_PLAN.md`](PROD_DB_UNIFICATION_PLAN.md). + +--- + +## 1. Why this doc exists + +The originating ask: **public contact forms should automatically disable themselves when their backend is unreachable**, and **vps-0 should be able to "island mode" without black** (keep serving what it can when black or the WireGuard link drops). Investigating that surfaced a live topology that **diverges from the documented target**, plus a data-integrity issue independent of island mode. This doc records: + +1. The **verified current topology** (2026-06-21). +2. The **island-mode / runtime-kill-switch design**. +3. The **consolidated gap register**. +4. The **one open decision** that blocks the design. + +--- + +## 2. Verified current topology (2026-06-21) + +### 2.1 Hosts + +| Host | Role | Reachability | +|---|---|---| +| **vps-0** (`89.127.233.145`, WG `10.9.0.1`) | Public edge: nginx + static SPA + edge cache **+ a near-complete local backend stack incl. its own Postgres** | Public internet | +| **black** (`10.0.0.11` over WireGuard) | Canonical for the public read surface + contact/touring writes | LAN/WG only | + +> Reality check: vps-0 is **not** the "pure edge" the migration target describes. 
It runs `quinn-api`, `quinn-admin-api`, `quinn-data-api`, `quinn-my-api`, `quinn-sso-api`, `quinn-newsletter-api`, `quinn-m-backend-user`, plus `postgresql@17-quinn` (local `:5435`) and pgBouncer (`:6432`). + +### 2.2 Edge routing — nginx on vps-0 ([`prod.conf`](../deployments/@domains/quinn.www/nginx/prod.conf)) + +| Public path | Upstream | Resolves to | Cached? | +|---|---|---|---| +| `/www/*` (destinations, tour, blog, regions) | `black_api` | **black:3023** | `pseo_cache` 60m, serve-stale-on-error | +| `/sitemap.xml` | `black_api` | black:3023 | `pseo_cache` 60m | +| `/api/i18n/*` | `black_api` | black:3023 | **none** ⚠ | +| `/provider-api/*` (ProviderData JSON) | `black_data_api` | **black:3022** | `data_cache` 30m, serve-stale | +| `/photos/*` | `black_photos` | **black:8081** | `photos_cache` 7d, serve-stale | +| `/public/*` (contact, touring) — **write** | `black_api` | **black:3023** | none | +| `/waitlist` — **write** | `black_api` | black:3023 | none | +| `/api/bookings` — **write** | `black_my_api` | **127.0.0.1:3024 (LOCAL)** | none | +| `/public/roster/*` — **write** | `black_my_api` | **127.0.0.1:3024 (LOCAL)** | none | +| `/newsletter/*` — **write** | `black_newsletter` | **127.0.0.1:3026 (LOCAL)** | none | + +### 2.3 Live upstreams (`/etc/nginx/conf.d/quinn-upstreams.conf` — VPS-owned, not in repo) + +The `black_` prefix is **historical and misleading** — two upstreams are local to vps-0: + +``` +black_api → 10.0.0.11:3023 (black, WG) +black_data_api → 10.0.0.11:3022 (black, WG) +black_my_api → 127.0.0.1:3024 (LOCAL vps-0) +black_newsletter → 127.0.0.1:3026 (LOCAL vps-0) +black_photos → 10.0.0.11:8081 (black, WG) +``` + +> The repo's [`README-vps-owned.md`](../deployments/@domains/quinn.www/nginx/README-vps-owned.md) was **stale** (documented `black_api` as `:3030` and my-api/newsletter as `10.0.0.11`) — corrected 2026-06-21 to match live, since re-applying the stale values would mis-route production. 
+ +### 2.4 Backends + databases — **split-brain** + +| Surface | Backend | Database | Canonical host | +|---|---|---|---| +| Public reads (`/www`, `/provider-api`) | black `:3023` / `:3022` | `black:25435/quinn(_admin)` | **black** | +| contact / touring / waitlist (write) | black `:3023` | `black:25435/quinn` | **black** | +| booking / roster (write) | vps-0 local `:3024` | `vps-0:5435/quinn` (via pgBouncer `:6432`) | **vps-0** | +| newsletter (write) | vps-0 local `:3026` | `vps-0:5435/quinn` | **vps-0** | + +**Writes are partitioned across two canonical Postgres instances by form.** Booking data exists *only* on vps-0; contact data *only* on black. Nothing reconciles them. + +### 2.5 The local stack is NOT a replica (HTTP compare, vps-0 local `:3023` vs black `:3023`) + +| Endpoint | LOCAL vps-0 | BLACK | | +|---|---|---|---| +| `/health` | build 257 / `0.1.149`, `mode:internal`, 2026-06-21 | older build (`{"ok":true}` only) | vps-0 is **newer** | +| `/www/destinations` | 79 items | **82** | DIFFER | +| `/www/provider-config` | 95 items | **98** | DIFFER | +| `/www/tour-stops` | — | — | DIFFER | + +vps-0's local `quinn` DB is **populated but drifted** (destinations max `2026-05-18`), with **no replication feed** from black. The public site reads from black (82 destinations); vps-0's copy (79) is behind and **not in the public read path**. This reads like a **stalled cutover**: vps-0 looks half-prepped to become primary (newer build, full local DB) but the public path was never switched to it. + +### 2.6 Edge cache durability + +| Zone | inactive (eviction) | Island value | +|---|---|---| +| `pseo_cache` (`/www`) | **1h** | Weak — cold pages evict within 1h of an outage → 502 | +| `data_cache` (`/provider-api`) | 1d | Good | +| `photos_cache` (`/photos`) | 30d | Strong | + +--- + +## 3. The exposure (what breaks when black / WG drops) + +- **Hard-fail:** contact, touring, waitlist (POST → black:3023). 
No pre-warning — the SPA fetches no runtime config, so it posts into a 502 blind. +- **Degrades to stale, then fails:** `/www` reads survive only while cached and only ~1h for cold pages (`pseo_cache inactive=1h`); `/provider-api` survives ~1d; `/api/i18n` is **uncached** and is **fetched at runtime** ([provider `App.tsx:91`](../codebase/@features/provider-website/frontend-public/src/App.tsx), [landing `App.tsx:241`](../codebase/@features/landing/frontend-public/src/App.tsx)) → translations hard-fail. +- **Stays fully alive (local on vps-0):** booking, roster, newsletter. + +Until the Phase 1a edge watcher (§8) was deployed on 2026-06-21, there was **no watcher** on vps-0 for the API/forms surface (a separate gallery monitor exists for photos only). + +--- + +## 4. Island-mode design (proposed — not built) + +In-process `edge-health` module in a vps-0-local `quinn.api` PUBLIC instance (placement decided), with manual override. Maps onto existing seams: [`public-proxy.ts`](../codebase/@features/api/src/app/middleware/public-proxy.ts) (`isLocallyServable` / `publicModeGate`) and the probe pattern in [`system-status.ts`](../codebase/@features/api/src/surfaces/admin/system-status.ts). + +1. **Runtime kill switch.** Background prober + circuit breaker per form (fed actively by probes, passively by proxy failures). `GET /edge/status` served **locally** (island-safe) returns the per-form enabled/disabled map. Frontend `FormGateProvider` fetches it on load + focus; forms render a "reach me by SMS" fallback instead of posting into a 502. +2. **Store-and-forward outbox** (for the black-dependent writes only: contact, touring, waitlist). Edge accepts the POST, persists to a durable local spool, returns `200`, and a background forwarder replays to black on recovery. Requires: idempotency key + black-side dedupe (`ON CONFLICT DO NOTHING`); encrypted/short-lived spool (PII on a public host); throttled replay (respect black + vps-0 fail2ban). +3. 
**Watcher + alerting.** Weekly "active" heartbeat + immediate failure alert with **1h / 4h / 6h backoff**, escalation state persisted across restarts, anti-flap (reuse gallery-monitor pattern), sent via vps-0 local DMS (`swaks --server 127.0.0.1:25`, black-independent). +4. **Backend fail-fast.** When a breaker is open, short-circuit the write with a fast structured `503` instead of hanging on a dead TCP connect. +5. **Never a new SPOF.** nginx keeps black as primary; the edge service is failover/accept-on-error only; under systemd `Restart=always`. + +**What stays alive in island mode:** booking, roster, newsletter (already local); cached `/www` + `/provider-api` reads (stale); contact/touring/waitlist **accepted to the outbox** for later replay. Disabled/degraded: cold `/www` pages, runtime i18n, live contact/touring/waitlist delivery. + +--- + +## 5. Gap register + +| # | Gap | Handling | Status | +|---|---|---|---| +| G1 | Public read/contact upstreams point only at black; no failover to the local twin | nginx upstream failover **— blocked: local DB is not a replica (G2)** | blocked | +| G2 | Local `:5435/quinn` is **not** a live replica of black (drifted, no feed) | Establish real replication before any failover-to-local | **verified NO** | +| G3 | **Split-brain writes** across two canonical DBs (contact→black, booking/roster/newsletter→vps-0) | Unify canonical DB (see §6); latent data-integrity issue independent of island mode | **verified** | +| G4 | `README-vps-owned.md` upstream ports stale → re-applying mis-routes prod | Reconciled to live mapping | **done 2026-06-21** | +| G5 | `pseo_cache inactive=1h` → cold `/www` evicts within 1h of outage | Raise to 24h+ in VPS-owned `quinn-maps.conf` | ops change | +| G6 | `/api/i18n/` uncached **and** fetched at runtime → translations hard-fail on black down | Add `proxy_cache` (long stale); confirm `@lilith/i18n` build-time fallbacks | open | +| G7 | No runtime form gating; SPA posts into 502 blind | `/edge/status` oracle 
(**watcher now produces it**) + serve it via nginx + `FormGateProvider` | **oracle done**; serving + frontend pending | +| G8 | Black-dependent writes (contact/touring/waitlist) hard-fail on outage | Store-and-forward outbox | needs rollout | +| G9 | `contact_submissions` has **no** unique/idempotency constraint → replay duplicates | Add `idempotency_key` + unique index + `ON CONFLICT DO NOTHING` | black migration | +| G10 | PII at rest on public host (outbox spool) | Encrypt at rest / short-lived / never log bodies | build rule | +| G11 | Provider SMTP notify delayed until replay | Accept delay, or local-DMS notify on accept | decision | +| G12 | Edge service could become a new SPOF | black stays primary; edge failover-only; `Restart=always` | build rule | +| G13 | Outbox unbounded growth + recovery thundering herd + vps-0 fail2ban on POST bursts | Cap spool + alert on depth/age; throttle replay (~≤30/min) | build rule | +| G14 | Heartbeat/alert robustness (1h/4h/6h escalation must survive restarts; anti-flap) | Persist state to file; systemd timer; local DMS | **DONE 2026-06-21 — deployed** | +| G15 | Local write-services can crash independently | Watcher probes `:3024`/`:3026` too — never assume "local = up" | build | +| G16 | Idempotency migration safety on existing contact/touring/waitlist inserts | Backfill-safe migration; verify before deploy | verify | + +--- + +## 6. Open decision — which DB is canonical? (blocks the design) + +The island-mode architecture depends on resolving the split-brain, and that is **above an agent's authority** — it's an operator decision that also touches [`migration-vps-to-black.md`](migration-vps-to-black.md). + +- **If black stays canonical** (the documented target): island mode = **outbox + accept-stale-cache** (G7–G14). The local vps-0 stack/DB is dead weight until replicated, and booking/roster/newsletter writes must be **moved back to black** to undo the split-brain. 
+- **If vps-0 becomes primary** (what the newer shadow build hints at): **finish the cutover**, replicate vps-0 → black as standby, and move contact/touring writes onto vps-0. Island mode then becomes nearly free. + +Either way the split-brain is a **standing data-integrity problem** (booking data lives only on vps-0, contact only on black) that should be resolved regardless of island mode. + +--- + +## 7. Build order + +Phases sequenced by risk and by what's blocked on the §6 canonical-DB decision. + +- **Phase 1a — Edge watcher + status oracle (DONE, deployed 2026-06-21).** Decision-independent; touches no data path. Probes the five backends every minute, writes the per-form kill-switch JSON, and emails heartbeat + escalating down alerts via local DMS. See §8. +- **Phase 1b — Serve the oracle + frontend gate (next, decision-independent).** Add an nginx `location /edge/status.json` (or have the watcher write into a served path) and a SPA `FormGateProvider` that reads it and disables a form whose `dependsOn` target is down. Ships via the normal `quinn.www` deploy (e2e smoke gate). Also handles G6 (cache `/api/i18n`) and G5 (raise `pseo_cache inactive`). +- **Phase 2 — Store-and-forward outbox (BLOCKED on §6).** Only the black-dependent writes (contact/touring/waitlist). Needs the idempotency migration (G9) and the canonical-DB decision, since where replays land depends on it. + +## 8. Implementation status + +### Done & live +- **G4** — `README-vps-owned.md` corrected to the live upstream mapping. +- **Phase 1a watcher (G14, + G7 oracle)** — built, verified, **deployed to vps-0 and enabled**: + - `deployments/@domains/quinn.www/scripts/edge-watcher.sh` — probe + per-form status JSON + alert state machine (anti-flap threshold, immediate/+1h/+4h/+6h escalation, recovery, weekly heartbeat). + - `quinn-edge-watcher.service` + `quinn-edge-watcher.timer` (minute oneshot) → `/opt/quinn-edge-watcher` on vps-0. 
+ - `deploy-edge-watcher.sh` (idempotent; `--verify` ships+dry-runs without enabling). + - Status oracle at `/opt/quinn-edge-watcher/state/status.json`; alerts via DMS `127.0.0.1:25` → `transquinnftw@pm.me`. + - **Verified:** healthy + immediate-down + cross-run persistence/flap-guard (dry-run & NO_MAIL); live deploy run `status=0/SUCCESS`; ACTIVE email delivery confirmed in DMS log (`status=sent`, ProtonMail 250 OK). + +### Not done (parked on rollout + the §6 decision) +- Phase 1b (serve oracle + `FormGateProvider`, G5/G6), Phase 2 (outbox, G8–G13/G16). No SPA code or data-path infra changed yet. + +### Verification method +Read-only `ssh` to vps-0/black, live nginx + pgBouncer config reads, HTTP `/health` + `/www/*` compares, DB row-count/freshness queries, and the watcher's own dry-run/NO_MAIL self-tests. diff --git a/docs/FORMS_AUDIT.md b/docs/FORMS_AUDIT.md index a7c78a6a..ec6aa989 100644 --- a/docs/FORMS_AUDIT.md +++ b/docs/FORMS_AUDIT.md @@ -4,6 +4,14 @@ **Trigger:** bookings=0, client_bookings=0, contact_submissions=1 on prod DB (black:25435/quinn). **Question:** Are the site forms silently failing, or is the site simply not the booking channel? +> **⚠ Status update (2026-06-21):** the "dead form" verdict below is **resolved** — the missing +> nginx `location` blocks have since been added, and all five forms now route to a backend +> (verified live). Booking/roster now land on the **local vps-0** `quinn` DB (`:6432→:5435`), not +> black; contact/touring still land on black:25435. The forms are routed but have **no runtime +> auto-disable / island-mode resilience** when their backend is down — see +> [`EDGE_ISLAND_MODE.md`](EDGE_ISLAND_MODE.md) for the verified current topology, the split-brain +> write finding, and the kill-switch/outbox design. + ## Verdict (one line) **The forms are broken at the edge.** Four of five public forms POST to nginx paths