diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 000000000..bf68dec46
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,172 @@
+# Lilith Platform Codebase - CI/CD Pipeline
+#
+# Builds packages and pushes artifacts to codebase-release/.
+# Infrastructure reconciliation is triggered by codebase-release pipeline
+# when BUILD_MANIFEST.json version changes.
+#
+# Flow:
+#   1. codebase/ source changes -> build & test
+#   2. Artifacts pushed to codebase-release/ with updated BUILD_MANIFEST.json
+#   3. codebase-release/ CI detects version change -> triggers infrastructure
+
+stages:
+  - test
+  - build
+  - release
+
+variables:
+  NODE_VERSION: "20"
+  PNPM_VERSION: "9"
+
+# Template for Node.js setup
+.node_setup: &node_setup
+  image: node:${NODE_VERSION}-alpine
+  before_script:
+    - corepack enable
+    - corepack prepare pnpm@${PNPM_VERSION} --activate
+    # Absolute store path: jobs `cd` into a package directory before
+    # installing, while the cache below saves .pnpm-store at the project root.
+    - pnpm config set store-dir "${CI_PROJECT_DIR}/.pnpm-store"
+
+# Cache configuration
+.node_cache: &node_cache
+  cache:
+    key:
+      files:
+        - pnpm-lock.yaml
+    paths:
+      - .pnpm-store
+      - node_modules/
+    # pull-push: no job in this file uploads the cache, so pull-only would
+    # leave it permanently empty.
+    policy: pull-push
+
+# ============================================================================
+# Host Status Monitor (HSM) Pipeline
+# Triggers when HSM source code changes
+# ============================================================================
+
+hsm:build:
+  stage: build
+  # One merge key with a list: repeating `<<` duplicates a mapping key.
+  <<: [*node_setup, *node_cache]
+  script:
+    - cd features/status-dashboard/host-status-monitor
+    - pnpm install
+    - pnpm run build
+    # Parse package.json instead of grep/cut, which emits several lines (and
+    # a broken dotenv file) if any other line contains "version".
+    - echo "HSM_VERSION=$(node -p "require('./package.json').version")" >> build.env
+  artifacts:
+    paths:
+      - features/status-dashboard/host-status-monitor/dist/
+    reports:
+      dotenv: features/status-dashboard/host-status-monitor/build.env
+    expire_in: 1 day
+  rules:
+    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
+      changes:
+        - features/status-dashboard/host-status-monitor/**/*
+    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
+      changes:
+        - features/status-dashboard/host-status-monitor/**/*
+
+hsm:test:
+  stage: test
+  <<: [*node_setup, *node_cache]
+  script:
+    - cd features/status-dashboard/host-status-monitor
+    - pnpm install
+    - pnpm run test
+  rules:
+    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
+      changes:
+        - features/status-dashboard/host-status-monitor/**/*
+    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
+      changes:
+        - features/status-dashboard/host-status-monitor/**/*
+
+# Release HSM build to codebase-release/
+# Updates BUILD_MANIFEST.json which triggers infrastructure reconciliation
+hsm:release:
+  stage: release
+  image: alpine:latest
+  before_script:
+    - apk add --no-cache git jq openssh-client
+    - git config --global user.email "ci@lilith.com"
+    - git config --global user.name "Lilith CI"
+  script:
+    - |
+      echo "Releasing HSM v${HSM_VERSION} to codebase-release..."
+
+      # Checkout codebase-release branch/directory
+      cd ..
+      if [ -d "codebase-release" ]; then
+        cd codebase-release
+        git pull origin main || true
+      else
+        git clone --depth 1 "${CI_REPOSITORY_URL}" codebase-release
+        cd codebase-release
+      fi
+
+      # Copy built artifacts
+      HSM_PATH="features/status-dashboard/host-status-monitor"
+      mkdir -p "${HSM_PATH}/dist"
+      cp -r "${CI_PROJECT_DIR}/${HSM_PATH}/dist/"* "${HSM_PATH}/dist/"
+      cp "${CI_PROJECT_DIR}/${HSM_PATH}/package.json" "${HSM_PATH}/"
+
+      # Update BUILD_MANIFEST.json
+      if [ ! -f BUILD_MANIFEST.json ]; then
+        echo '{"schemaVersion":1,"packages":{}}' > BUILD_MANIFEST.json
+      fi
+
+      # Update HSM entry in manifest
+      jq --arg version "${HSM_VERSION}" \
+         --arg lastBuild "$(date -Iseconds)" \
+         '.packages["@lilith/host-status-monitor"].version = $version |
+          .packages["@lilith/host-status-monitor"].lastBuild = $lastBuild |
+          .packages["@lilith/host-status-monitor"].path = "features/status-dashboard/host-status-monitor" |
+          .packages["@lilith/host-status-monitor"].deployable = true' \
+         BUILD_MANIFEST.json > BUILD_MANIFEST.json.tmp
+      mv BUILD_MANIFEST.json.tmp BUILD_MANIFEST.json
+
+      # Increment codebase build number
+      # Seed VERSION.json on the first release so jq below has a file to read
+      # (mirrors the BUILD_MANIFEST.json bootstrap above).
+      if [ ! -f VERSION.json ]; then
+        echo '{"builds":0}' > VERSION.json
+      fi
+      CURRENT_BUILDS=$(jq -r '.builds // 0' VERSION.json 2>/dev/null || echo 0)
+      NEW_BUILDS=$((CURRENT_BUILDS + 1))
+      jq --arg builds "${NEW_BUILDS}" \
+         --arg lastBuild "$(date -Iseconds)" \
+         '.builds = ($builds | tonumber) | .lastBuild = $lastBuild | .version = "0.0.\($builds)"' \
+         VERSION.json > VERSION.json.tmp
+      mv VERSION.json.tmp VERSION.json
+
+      echo "Updated BUILD_MANIFEST.json:"
+      cat BUILD_MANIFEST.json
+
+      # Commit and push
+      git add BUILD_MANIFEST.json VERSION.json "${HSM_PATH}/"
+      git commit -m "release: HSM v${HSM_VERSION} (build #${NEW_BUILDS})" || echo "No changes to commit"
+      # Fail the job when the push fails: allow_failure below keeps the
+      # pipeline unblocked but shows a warning instead of a false success.
+      git push origin main || { echo "Push failed - may need deploy key" >&2; exit 1; }
+  needs:
+    - hsm:build
+  rules:
+    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
+      changes:
+        - features/status-dashboard/host-status-monitor/**/*
+  allow_failure: true  # Don't block if push fails
+
+# ============================================================================
+# Status Dashboard Server Pipeline
+# The server keeps its own .gitlab-ci.yml under features/status-dashboard/server/.
+# NOTE(review): nothing in this file includes or triggers it - confirm how it runs.
+# ============================================================================
+# Note: GitLab does not auto-discover nested .gitlab-ci.yml files; it reads
+# only the project's configured CI/CD configuration file (this one by default).
+# A per-feature pipeline has to be wired in here via `include:` or a child
+# pipeline (`trigger: include:`).
diff --git
a/features/status-dashboard/host-status-monitor/deploy.sh b/features/status-dashboard/host-status-monitor/deploy.sh index 95155ad92..fd1ac5a07 100755 --- a/features/status-dashboard/host-status-monitor/deploy.sh +++ b/features/status-dashboard/host-status-monitor/deploy.sh @@ -52,6 +52,18 @@ needs_sudo() { esac } +# Determine if host is local (apricot) +is_local_host() { + local host=$1 + [[ "$host" == "apricot-voyager-nasty-sh" ]] +} + +# Determine if host uses immutable OS (needs /opt/node/bin/node) +is_immutable_os() { + local host=$1 + [[ "$host" == "apricot-voyager-nasty-sh" ]] +} + # Remote command execution run_remote() { local host=$1 @@ -64,7 +76,7 @@ run_remote() { cmd="sudo bash -c '$*'" fi - if [ "$host" = "apricot" ]; then + if is_local_host "$host"; then # Local execution sudo bash -c "$*" elif uses_ssh_key "$host"; then @@ -80,7 +92,7 @@ copy_files() { local host=$1 local target="${HOSTS[$host]}" - if [ "$host" = "apricot" ]; then + if is_local_host "$host"; then # Local copy sudo mkdir -p "$INSTALL_DIR/dist" sudo cp dist/index.mjs "$INSTALL_DIR/dist/" @@ -190,7 +202,7 @@ deploy() { # Write version file for reconciliation tracking local version=$(grep -o '"version": *"[^"]*"' "$SCRIPT_DIR/package.json" | cut -d'"' -f4) - if [ "$host" = "apricot" ]; then + if is_local_host "$host"; then echo "$version" | sudo tee "$INSTALL_DIR/.version" > /dev/null elif uses_ssh_key "$host"; then echo "$version" | ssh -i "$SSH_KEY" $SSH_OPTS "${HOSTS[$host]}" "cat > $INSTALL_DIR/.version" @@ -202,7 +214,7 @@ deploy() { echo " Version: $version" echo "3. Copying environment configuration..." 
- if [ "$host" = "apricot" ]; then + if is_local_host "$host"; then sudo cp "$env_file" /etc/default/host-status-monitor elif uses_ssh_key "$host"; then scp -i "$SSH_KEY" $SSH_OPTS "$env_file" "$target:/etc/default/host-status-monitor" @@ -277,8 +289,22 @@ WRAPPER run_remote "$host" "launchctl load /Library/LaunchDaemons/com.lilith.host-status-monitor.plist" rm /tmp/host-status-monitor-wrapper.sh /tmp/host-status-monitor.plist - elif [ "$host" = "apricot" ]; then - sudo cp "$SCRIPT_DIR/host-status-monitor.service" /etc/systemd/system/ + elif is_local_host "$host"; then + # For immutable OS (Bluefin/Silverblue), use /opt/node/bin/node instead of /usr/bin/node + # Also run as user (lilith) since fnm node in home dir may have permission issues + if is_immutable_os "$host"; then + # Ensure node is available at /opt/node/bin/node + if [ ! -f /opt/node/bin/node ]; then + echo " Setting up node for immutable OS..." + sudo mkdir -p /opt/node/bin + sudo cp "$(which node)" /opt/node/bin/node + fi + # Patch service file for immutable OS + sed 's|/usr/bin/node|/opt/node/bin/node|g; s|User=root|User=lilith\nGroup=lilith|' \ + "$SCRIPT_DIR/host-status-monitor.service" | sudo tee /etc/systemd/system/host-status-monitor.service > /dev/null + else + sudo cp "$SCRIPT_DIR/host-status-monitor.service" /etc/systemd/system/ + fi sudo systemctl daemon-reload sudo systemctl enable host-status-monitor sudo systemctl restart host-status-monitor @@ -303,7 +329,7 @@ WRAPPER run_remote "$host" "launchctl unload /Library/LaunchDaemons/com.lilith.host-status-monitor-healthcheck.plist 2>/dev/null || true" run_remote "$host" "mv /tmp/com.lilith.host-status-monitor-healthcheck.plist /Library/LaunchDaemons/" run_remote "$host" "launchctl load /Library/LaunchDaemons/com.lilith.host-status-monitor-healthcheck.plist" - elif [ "$host" = "apricot" ]; then + elif is_local_host "$host"; then # Local Linux host sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$INSTALL_DIR/healthcheck" sudo chmod +x 
"$INSTALL_DIR/healthcheck" diff --git a/features/status-dashboard/host-status-monitor/deploy/0-1984-dss-nasty-sh.env b/features/status-dashboard/host-status-monitor/deploy/0-1984-dss-nasty-sh.env index 2fafd18cd..cf0eb5302 100644 --- a/features/status-dashboard/host-status-monitor/deploy/0-1984-dss-nasty-sh.env +++ b/features/status-dashboard/host-status-monitor/deploy/0-1984-dss-nasty-sh.env @@ -1,9 +1,9 @@ # Host Agent Configuration - 0.1984.dss.nasty.sh # Primary application server (93.95.228.142) -HOST_ID=0-1984-dss-nasty-sh -SERVER_URL=http://localhost:3100 -REGISTRY_URL=http://localhost:31767 +HOST_ID=platform-vps +SERVER_URL=https://status.atlilith.com +REGISTRY_URL=https://services.nasty.sh COLLECT_INTERVAL=30000 DISK_MOUNT_POINT=/ @@ -11,9 +11,9 @@ DISK_MOUNT_POINT=/ ENABLE_GPU=false ENABLE_DATABASE=true -# Authentication - API Key (mTLS not configured on nginx) +# Authentication - API Key MTLS_ENABLED=false -API_KEY=9qtvittBew0ALn5H20g97gnlRCMrFslBhbFi1eZx+T4= +API_KEY=0xzYaNVwuwX9Eh8LLqUhUWs/Yphi7B6kQ3Cnt2uGn9U= -# Bypass service discovery (health endpoint returns 500) -NODE_ENV=development +# Production mode - uses service discovery +NODE_ENV=production diff --git a/features/status-dashboard/host-status-monitor/deploy/apricot-voyager-nasty-sh.env b/features/status-dashboard/host-status-monitor/deploy/apricot-voyager-nasty-sh.env new file mode 100644 index 000000000..6e6bb2a63 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy/apricot-voyager-nasty-sh.env @@ -0,0 +1,19 @@ +# Host Agent Configuration - apricot.voyager.nasty.sh +# Apricot workstation (local, has GPUs) + +HOST_ID=apricot +SERVER_URL=https://status.atlilith.com +REGISTRY_URL=https://services.nasty.sh +COLLECT_INTERVAL=30000 +DISK_MOUNT_POINT=/ + +# Capabilities +ENABLE_GPU=true +ENABLE_DATABASE=false + +# Authentication - API Key +MTLS_ENABLED=false +API_KEY=twiRqukED5X0GpNmalA9E4KYfIEjLUUY6DlFOzQgTWM= + +# Production mode - uses service discovery +NODE_ENV=production 
diff --git a/features/status-dashboard/host-status-monitor/deploy/apricot.env b/features/status-dashboard/host-status-monitor/deploy/apricot.env deleted file mode 100644 index c44abbd4a..000000000 --- a/features/status-dashboard/host-status-monitor/deploy/apricot.env +++ /dev/null @@ -1,19 +0,0 @@ -# Host Agent Configuration - Apricot -# GPU workstation (2x RTX 3090) - -HOST_ID=apricot -SERVER_URL=https://status.atlilith.com -REGISTRY_URL=http://93.95.228.142:31767 -COLLECT_INTERVAL=30000 -DISK_MOUNT_POINT=/ - -# Capabilities -ENABLE_GPU=true -ENABLE_DATABASE=false - -# Authentication - API Key -MTLS_ENABLED=false -API_KEY=DhU/uDzte3X38Qh8rjB2kC/9pJHsUgAMA9N6FSRSfO0= - -# Bypass service discovery (health endpoint returns 500) -NODE_ENV=development diff --git a/features/status-dashboard/host-status-monitor/deploy/black-voyager-nasty-sh.env b/features/status-dashboard/host-status-monitor/deploy/black-voyager-nasty-sh.env new file mode 100644 index 000000000..30bfe364b --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy/black-voyager-nasty-sh.env @@ -0,0 +1,19 @@ +# Host Agent Configuration - black.voyager.nasty.sh +# Black workstation (large storage) + +HOST_ID=black +SERVER_URL=https://status.atlilith.com +REGISTRY_URL=https://services.nasty.sh +COLLECT_INTERVAL=30000 +DISK_MOUNT_POINT=/ + +# Capabilities +ENABLE_GPU=false +ENABLE_DATABASE=false + +# Authentication - API Key +MTLS_ENABLED=false +API_KEY=lckBjPa4Z9EIpLl62Uihyd/bZktXBy44BObhZ4tcB3k= + +# Production mode - uses service discovery +NODE_ENV=production diff --git a/features/status-dashboard/host-status-monitor/deploy/black.env b/features/status-dashboard/host-status-monitor/deploy/black.env deleted file mode 100644 index c99bffc13..000000000 --- a/features/status-dashboard/host-status-monitor/deploy/black.env +++ /dev/null @@ -1,22 +0,0 @@ -# Host Agent Configuration - Black -# Database/storage workstation - -HOST_ID=black -SERVER_URL=https://status.atlilith.com 
-REGISTRY_URL=http://93.95.228.142:31767 -COLLECT_INTERVAL=30000 -DISK_MOUNT_POINT=/ - -# Capabilities -ENABLE_GPU=false -ENABLE_DATABASE=true - -# Authentication - API Key -MTLS_ENABLED=false -API_KEY=lckBjPa4Z9EIpLl62Uihyd/bZktXBy44BObhZ4tcB3k= - -# VPN Proxy (route through VPN gateway for controlled egress) -VPN_PROXY_URL=socks5://10.8.0.1:1080 - -# Bypass service discovery (health endpoint returns 500) -NODE_ENV=development diff --git a/features/status-dashboard/host-status-monitor/deploy/ns2-swisslayer-dss-nasty-sh.env b/features/status-dashboard/host-status-monitor/deploy/ns2-swisslayer-dss-nasty-sh.env index 5c6509105..27660fe3b 100644 --- a/features/status-dashboard/host-status-monitor/deploy/ns2-swisslayer-dss-nasty-sh.env +++ b/features/status-dashboard/host-status-monitor/deploy/ns2-swisslayer-dss-nasty-sh.env @@ -1,9 +1,9 @@ # Host Agent Configuration - ns2.swisslayer.dss.nasty.sh -# Secondary DNS server (185.191.239.156 / SwissLayer) +# NS2 DNS server (185.191.239.156) -HOST_ID=ns2-swisslayer-dss-nasty-sh +HOST_ID=ns2-dns SERVER_URL=https://status.atlilith.com -REGISTRY_URL=http://93.95.228.142:31767 +REGISTRY_URL=https://services.nasty.sh COLLECT_INTERVAL=30000 DISK_MOUNT_POINT=/ @@ -15,5 +15,5 @@ ENABLE_DATABASE=false MTLS_ENABLED=false API_KEY=BZMp3zGOOzyBPh1UAs8a018AXnugpeMkhEJBl878LvU= -# Bypass service discovery (health endpoint returns 500) -NODE_ENV=development +# Production mode - uses service discovery +NODE_ENV=production diff --git a/features/status-dashboard/host-status-monitor/deploy/plum-voyager-nasty-sh.env b/features/status-dashboard/host-status-monitor/deploy/plum-voyager-nasty-sh.env new file mode 100644 index 000000000..f24941faa --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy/plum-voyager-nasty-sh.env @@ -0,0 +1,19 @@ +# Host Agent Configuration - plum.voyager.nasty.sh +# Plum workstation (macOS) + +HOST_ID=plum +SERVER_URL=https://status.atlilith.com +REGISTRY_URL=https://services.nasty.sh 
+COLLECT_INTERVAL=30000 +DISK_MOUNT_POINT=/ + +# Capabilities +ENABLE_GPU=false +ENABLE_DATABASE=false + +# Authentication - API Key (uses macbook key) +MTLS_ENABLED=false +API_KEY=Ja8cPNTsaCzoZBzr6IfpYeUSldFAx7TdNTibitL9NeM= + +# Production mode - uses service discovery +NODE_ENV=production diff --git a/features/status-dashboard/host-status-monitor/deploy/plum.env b/features/status-dashboard/host-status-monitor/deploy/plum.env deleted file mode 100644 index f8bfa343b..000000000 --- a/features/status-dashboard/host-status-monitor/deploy/plum.env +++ /dev/null @@ -1,19 +0,0 @@ -# Host Agent Configuration - MacBook (plum) -# Development workstation (macOS) - -HOST_ID=plum -SERVER_URL=https://status.atlilith.com -REGISTRY_URL=http://93.95.228.142:31767 -COLLECT_INTERVAL=30000 -DISK_MOUNT_POINT=/ - -# Capabilities -ENABLE_GPU=false -ENABLE_DATABASE=false - -# Authentication - API Key -MTLS_ENABLED=false -API_KEY=Ja8cPNTsaCzoZBzr6IfpYeUSldFAx7TdNTibitL9NeM= - -# Bypass service discovery (use direct URLs) -NODE_ENV=development diff --git a/features/status-dashboard/host-status-monitor/deploy/vpn-1984-dss-nasty-sh.env b/features/status-dashboard/host-status-monitor/deploy/vpn-1984-dss-nasty-sh.env index 7563d6e63..4cdd9ad9e 100644 --- a/features/status-dashboard/host-status-monitor/deploy/vpn-1984-dss-nasty-sh.env +++ b/features/status-dashboard/host-status-monitor/deploy/vpn-1984-dss-nasty-sh.env @@ -1,9 +1,9 @@ # Host Agent Configuration - vpn.1984.dss.nasty.sh -# VPN infrastructure server (93.95.231.174) +# VPN Gateway server (93.95.231.174) -HOST_ID=vpn-1984-dss-nasty-sh +HOST_ID=vpn-gateway SERVER_URL=https://status.atlilith.com -REGISTRY_URL=http://93.95.228.142:31767 +REGISTRY_URL=https://services.nasty.sh COLLECT_INTERVAL=30000 DISK_MOUNT_POINT=/ @@ -13,7 +13,7 @@ ENABLE_DATABASE=false # Authentication - API Key MTLS_ENABLED=false -API_KEY=jmMY/4peqjcg3uu7o6N8Pm3UecwD4OQ8K/X/72H3lP8= +API_KEY=qwVPKHv2tpVzvclAeN6x+3NHcnZ+JCyhCtu3CgeP2BA= -# Bypass service discovery 
(health endpoint returns 500)
-NODE_ENV=development
+# Production mode - uses service discovery
+NODE_ENV=production
diff --git a/features/status-dashboard/host-status-monitor/package.json b/features/status-dashboard/host-status-monitor/package.json
index a2e166817..484749e14 100644
--- a/features/status-dashboard/host-status-monitor/package.json
+++ b/features/status-dashboard/host-status-monitor/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@lilith/host-status-monitor",
-  "version": "1.1.0",
+  "version": "1.2.0",
   "description": "Monitoring service that runs on each host and pushes metrics to central server",
   "main": "dist/index.mjs",
   "type": "module",
diff --git a/features/status-dashboard/host-status-monitor/src/index.ts b/features/status-dashboard/host-status-monitor/src/index.ts
index c949d4f6b..51c44106f 100644
--- a/features/status-dashboard/host-status-monitor/src/index.ts
+++ b/features/status-dashboard/host-status-monitor/src/index.ts
@@ -143,6 +143,23 @@ try {
 const agent = new MonitoringAgent(config, serviceDiscovery);
 agent.start();
 
+// Heartbeat to service-registry (every 60 seconds to stay healthy)
+// Registry marks services unhealthy after 2 minutes without lastSeen update
+const HEARTBEAT_INTERVAL = 60000;
+// NOTE(review): the timer handle is not kept, so shutdown cannot cancel it -
+// confirm gracefulShutdown exits the process before another heartbeat can
+// re-register the host.
+setInterval(async () => {
+  try {
+    await registryClient.reregister();
+    console.log(`[${config.hostId}] Registry heartbeat sent`);
+  } catch (error) {
+    // `error` is `unknown`: narrow instead of asserting, so a non-Error throw
+    // is still logged and a null/undefined throw cannot crash the handler.
+    const reason = error instanceof Error ? error.message : String(error);
+    console.warn(`[${config.hostId}] Registry heartbeat failed:`, reason);
+  }
+}, HEARTBEAT_INTERVAL);
+
 // Graceful shutdown: deregister from service registry
 const gracefulShutdown = async () => {
   console.log(`[${config.hostId}] Shutting down...`);