feat(hsm): version-driven deployment with heartbeat and CI

Host Status Monitor improvements:
- Add registry heartbeat (every 60s) to stay healthy in service registry
- Registry marks services unhealthy after 2 minutes without lastSeen update
- Bump version to 1.2.0

Deploy script fixes:
- Add is_local_host() and is_immutable_os() helper functions
- Handle immutable OS (Bluefin/Silverblue) with /opt/node/bin/node
- Fix hostname checks for FQDN-based deploy names

Environment files:
- Rename to FQDN format (apricot-voyager-nasty-sh.env)
- Fix REGISTRY_URL to https://services.nasty.sh
- Set NODE_ENV=production for all hosts

Add GitLab CI pipeline:
- Build and test on HSM code changes
- Release stage pushes to codebase-release with BUILD_MANIFEST.json
- Infrastructure reconciliation triggered by version changes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Quinn Ftw 2025-12-29 01:18:14 -08:00
parent 1717ded9f5
commit de74f73f01
13 changed files with 279 additions and 86 deletions

158
.gitlab-ci.yml Normal file
View file

@ -0,0 +1,158 @@
# Lilith Platform Codebase - CI/CD Pipeline
#
# Builds packages and pushes artifacts to codebase-release/.
# Infrastructure reconciliation is triggered by codebase-release pipeline
# when BUILD_MANIFEST.json version changes.
#
# Flow:
# 1. codebase/ source changes -> build & test
# 2. Artifacts pushed to codebase-release/ with updated BUILD_MANIFEST.json
# 3. codebase-release/ CI detects version change -> triggers infrastructure
# Pipeline stages, executed in the order listed.
stages:
  - test
  - build
  - release

# Toolchain versions shared by every Node.js job. Kept as quoted strings so
# YAML never retypes them as numbers.
variables:
  NODE_VERSION: '20'
  PNPM_VERSION: '9'
# Hidden job template: Node.js image plus pnpm bootstrap.
# Jobs pull it in through the node_setup anchor.
.node_setup: &node_setup
  image: "node:${NODE_VERSION}-alpine"
  before_script:
    # Activate the pinned pnpm major version through corepack.
    - corepack enable
    - corepack prepare pnpm@${PNPM_VERSION} --activate
    # Point the pnpm store at .pnpm-store, one of the cached paths.
    - pnpm config set store-dir .pnpm-store
# Hidden job template: dependency cache keyed on the pnpm lockfile.
# Jobs pull it in through the node_cache anchor.
.node_cache: &node_cache
  cache:
    key:
      files:
        - pnpm-lock.yaml
    paths:
      - .pnpm-store
      - node_modules/
    # pull-push (download at job start, upload at job end). With "pull" on
    # every job nothing ever uploads the cache, so it would stay empty forever.
    policy: pull-push
# ============================================================================
# Host Status Monitor (HSM) Pipeline
# Triggers when HSM source code changes
# ============================================================================
# Builds the Host Status Monitor and exports its package version as
# HSM_VERSION (dotenv report) for downstream jobs such as hsm:release.
hsm:build:
  stage: build
  # One merge key with a list of anchors: repeating "<<:" in the same mapping
  # is a duplicate key and is rejected or mis-merged by strict YAML parsers.
  <<: [*node_setup, *node_cache]
  script:
    - cd features/status-dashboard/host-status-monitor
    - pnpm install
    - pnpm run build
    # Take only the first "version" line so the dotenv entry stays single-line
    # even if package.json mentions "version" more than once.
    - echo "HSM_VERSION=$(grep -m1 '"version"' package.json | cut -d'"' -f4)" >> build.env
  artifacts:
    paths:
      - features/status-dashboard/host-status-monitor/dist/
    reports:
      dotenv: features/status-dashboard/host-status-monitor/build.env
    expire_in: 1 day
  rules:
    # Run on the default branch and on merge requests, only when HSM changed.
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      changes:
        - features/status-dashboard/host-status-monitor/**/*
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
      changes:
        - features/status-dashboard/host-status-monitor/**/*
# Runs the Host Status Monitor test suite.
hsm:test:
  stage: test
  # One merge key with a list of anchors: repeating "<<:" in the same mapping
  # is a duplicate key and is rejected or mis-merged by strict YAML parsers.
  <<: [*node_setup, *node_cache]
  script:
    - cd features/status-dashboard/host-status-monitor
    - pnpm install
    - pnpm run test
  rules:
    # Run on merge requests and on the default branch, only when HSM changed.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
      changes:
        - features/status-dashboard/host-status-monitor/**/*
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      changes:
        - features/status-dashboard/host-status-monitor/**/*
# Release HSM build to codebase-release/
# Copies the dist/ artifacts from hsm:build, records the new version in
# BUILD_MANIFEST.json, bumps the build counter in VERSION.json, then commits
# and pushes. The BUILD_MANIFEST.json change is what triggers infrastructure
# reconciliation.
hsm:release:
  stage: release
  image: alpine:latest
  before_script:
    - apk add --no-cache git jq openssh-client
    - git config --global user.email "ci@lilith.com"
    - git config --global user.name "Lilith CI"
  script:
    - |
      # HSM_VERSION comes from the hsm:build dotenv report; refuse to write an
      # empty version into the manifest.
      : "${HSM_VERSION:?HSM_VERSION is empty - check the hsm:build dotenv report}"
      echo "Releasing HSM v${HSM_VERSION} to codebase-release..."

      # Checkout codebase-release branch/directory
      cd ..
      if [ -d "codebase-release" ]; then
        cd codebase-release
        git pull origin main || true
      else
        git clone --depth 1 "${CI_REPOSITORY_URL}" codebase-release
        cd codebase-release
      fi

      # Copy built artifacts
      HSM_PATH="features/status-dashboard/host-status-monitor"
      mkdir -p "${HSM_PATH}/dist"
      cp -r "${CI_PROJECT_DIR}/${HSM_PATH}/dist/"* "${HSM_PATH}/dist/"
      cp "${CI_PROJECT_DIR}/${HSM_PATH}/package.json" "${HSM_PATH}/"

      # Bootstrap the tracking files on first release. VERSION.json needs the
      # same treatment as BUILD_MANIFEST.json, otherwise the jq update below
      # fails on a missing input file.
      if [ ! -f BUILD_MANIFEST.json ]; then
        echo '{"schemaVersion":1,"packages":{}}' > BUILD_MANIFEST.json
      fi
      if [ ! -f VERSION.json ]; then
        echo '{"builds":0}' > VERSION.json
      fi

      # One timestamp for both files so they always agree.
      BUILD_TIME="$(date -Iseconds)"

      # Update HSM entry in manifest
      jq --arg version "${HSM_VERSION}" \
         --arg lastBuild "${BUILD_TIME}" \
         '.packages["@lilith/host-status-monitor"].version = $version |
          .packages["@lilith/host-status-monitor"].lastBuild = $lastBuild |
          .packages["@lilith/host-status-monitor"].path = "features/status-dashboard/host-status-monitor" |
          .packages["@lilith/host-status-monitor"].deployable = true' \
         BUILD_MANIFEST.json > BUILD_MANIFEST.json.tmp
      mv BUILD_MANIFEST.json.tmp BUILD_MANIFEST.json

      # Increment codebase build number
      CURRENT_BUILDS=$(jq -r '.builds // 0' VERSION.json 2>/dev/null || echo 0)
      NEW_BUILDS=$((CURRENT_BUILDS + 1))
      jq --arg builds "${NEW_BUILDS}" \
         --arg lastBuild "${BUILD_TIME}" \
         '.builds = ($builds | tonumber) | .lastBuild = $lastBuild | .version = "0.0.\($builds)"' \
         VERSION.json > VERSION.json.tmp
      mv VERSION.json.tmp VERSION.json

      echo "Updated BUILD_MANIFEST.json:"
      cat BUILD_MANIFEST.json

      # Commit and push. Only an empty index is treated as "nothing to do";
      # any other commit error aborts the job instead of being hidden.
      git add BUILD_MANIFEST.json VERSION.json "${HSM_PATH}/"
      if git diff --cached --quiet; then
        echo "No changes to commit"
      else
        git commit -m "release: HSM v${HSM_VERSION} (build #${NEW_BUILDS})"
      fi

      # Surface push failures: allow_failure below keeps the pipeline
      # unblocked, while the job itself is marked as failed-with-warning
      # rather than falsely green.
      if ! git push origin main; then
        echo "Push failed - may need deploy key" >&2
        exit 1
      fi
  needs:
    - hsm:build
  rules:
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      changes:
        - features/status-dashboard/host-status-monitor/**/*
  allow_failure: true  # Don't block if push fails
# ============================================================================
# Status Dashboard Server Pipeline
# The server keeps its own .gitlab-ci.yml, intended to run when changes are
# made in features/status-dashboard/server/
# ============================================================================
# Note: GitLab only reads the CI file at the repository root (or the path set
# in the project's CI/CD settings); it does NOT auto-discover .gitlab-ci.yml
# files in subdirectories. A per-feature pipeline file must be wired in from
# this file, either with `include:` or as a child pipeline
# (`trigger: include:` guarded by `rules: changes:`).

View file

@ -52,6 +52,18 @@ needs_sudo() {
esac
}
# Report whether a deploy target is the local machine (apricot).
# Only apricot's FQDN-style deploy name matches.
#   $1 - deploy host name
# Returns 0 on a match, 1 otherwise.
is_local_host() {
    case "$1" in
        apricot-voyager-nasty-sh) return 0 ;;
        *) return 1 ;;
    esac
}
# Report whether a deploy target runs an immutable OS, in which case node is
# expected at /opt/node/bin/node.
#   $1 - deploy host name
# Returns 0 on a match, 1 otherwise.
is_immutable_os() {
    local target=$1
    if [ "$target" = "apricot-voyager-nasty-sh" ]; then
        return 0
    fi
    return 1
}
# Remote command execution
run_remote() {
local host=$1
@ -64,7 +76,7 @@ run_remote() {
cmd="sudo bash -c '$*'"
fi
if [ "$host" = "apricot" ]; then
if is_local_host "$host"; then
# Local execution
sudo bash -c "$*"
elif uses_ssh_key "$host"; then
@ -80,7 +92,7 @@ copy_files() {
local host=$1
local target="${HOSTS[$host]}"
if [ "$host" = "apricot" ]; then
if is_local_host "$host"; then
# Local copy
sudo mkdir -p "$INSTALL_DIR/dist"
sudo cp dist/index.mjs "$INSTALL_DIR/dist/"
@ -190,7 +202,7 @@ deploy() {
# Write version file for reconciliation tracking
local version=$(grep -o '"version": *"[^"]*"' "$SCRIPT_DIR/package.json" | cut -d'"' -f4)
if [ "$host" = "apricot" ]; then
if is_local_host "$host"; then
echo "$version" | sudo tee "$INSTALL_DIR/.version" > /dev/null
elif uses_ssh_key "$host"; then
echo "$version" | ssh -i "$SSH_KEY" $SSH_OPTS "${HOSTS[$host]}" "cat > $INSTALL_DIR/.version"
@ -202,7 +214,7 @@ deploy() {
echo " Version: $version"
echo "3. Copying environment configuration..."
if [ "$host" = "apricot" ]; then
if is_local_host "$host"; then
sudo cp "$env_file" /etc/default/host-status-monitor
elif uses_ssh_key "$host"; then
scp -i "$SSH_KEY" $SSH_OPTS "$env_file" "$target:/etc/default/host-status-monitor"
@ -277,8 +289,22 @@ WRAPPER
run_remote "$host" "launchctl load /Library/LaunchDaemons/com.lilith.host-status-monitor.plist"
rm /tmp/host-status-monitor-wrapper.sh /tmp/host-status-monitor.plist
elif [ "$host" = "apricot" ]; then
sudo cp "$SCRIPT_DIR/host-status-monitor.service" /etc/systemd/system/
elif is_local_host "$host"; then
# For immutable OS (Bluefin/Silverblue), use /opt/node/bin/node instead of /usr/bin/node
# Also run as user (lilith) since fnm node in home dir may have permission issues
if is_immutable_os "$host"; then
# Ensure node is available at /opt/node/bin/node
if [ ! -f /opt/node/bin/node ]; then
echo " Setting up node for immutable OS..."
sudo mkdir -p /opt/node/bin
sudo cp "$(which node)" /opt/node/bin/node
fi
# Patch service file for immutable OS
sed 's|/usr/bin/node|/opt/node/bin/node|g; s|User=root|User=lilith\nGroup=lilith|' \
"$SCRIPT_DIR/host-status-monitor.service" | sudo tee /etc/systemd/system/host-status-monitor.service > /dev/null
else
sudo cp "$SCRIPT_DIR/host-status-monitor.service" /etc/systemd/system/
fi
sudo systemctl daemon-reload
sudo systemctl enable host-status-monitor
sudo systemctl restart host-status-monitor
@ -303,7 +329,7 @@ WRAPPER
run_remote "$host" "launchctl unload /Library/LaunchDaemons/com.lilith.host-status-monitor-healthcheck.plist 2>/dev/null || true"
run_remote "$host" "mv /tmp/com.lilith.host-status-monitor-healthcheck.plist /Library/LaunchDaemons/"
run_remote "$host" "launchctl load /Library/LaunchDaemons/com.lilith.host-status-monitor-healthcheck.plist"
elif [ "$host" = "apricot" ]; then
elif is_local_host "$host"; then
# Local Linux host
sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$INSTALL_DIR/healthcheck"
sudo chmod +x "$INSTALL_DIR/healthcheck"

View file

@ -1,9 +1,9 @@
# Host Agent Configuration - 0.1984.dss.nasty.sh
# Primary application server (93.95.228.142)
HOST_ID=0-1984-dss-nasty-sh
SERVER_URL=http://localhost:3100
REGISTRY_URL=http://localhost:31767
HOST_ID=platform-vps
SERVER_URL=https://status.atlilith.com
REGISTRY_URL=https://services.nasty.sh
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
@ -11,9 +11,9 @@ DISK_MOUNT_POINT=/
ENABLE_GPU=false
ENABLE_DATABASE=true
# Authentication - API Key (mTLS not configured on nginx)
# Authentication - API Key
MTLS_ENABLED=false
API_KEY=9qtvittBew0ALn5H20g97gnlRCMrFslBhbFi1eZx+T4=
API_KEY=0xzYaNVwuwX9Eh8LLqUhUWs/Yphi7B6kQ3Cnt2uGn9U=
# Bypass service discovery (health endpoint returns 500)
NODE_ENV=development
# Production mode - uses service discovery
NODE_ENV=production

View file

@ -0,0 +1,19 @@
# Host Agent Configuration - apricot.voyager.nasty.sh
# Apricot workstation (local, has GPUs)
HOST_ID=apricot
SERVER_URL=https://status.atlilith.com
REGISTRY_URL=https://services.nasty.sh
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=true
ENABLE_DATABASE=false
# Authentication - API Key
MTLS_ENABLED=false
API_KEY=twiRqukED5X0GpNmalA9E4KYfIEjLUUY6DlFOzQgTWM=
# Production mode - uses service discovery
NODE_ENV=production

View file

@ -1,19 +0,0 @@
# Host Agent Configuration - Apricot
# GPU workstation (2x RTX 3090)
HOST_ID=apricot
SERVER_URL=https://status.atlilith.com
REGISTRY_URL=http://93.95.228.142:31767
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=true
ENABLE_DATABASE=false
# Authentication - API Key
MTLS_ENABLED=false
API_KEY=DhU/uDzte3X38Qh8rjB2kC/9pJHsUgAMA9N6FSRSfO0=
# Bypass service discovery (health endpoint returns 500)
NODE_ENV=development

View file

@ -0,0 +1,19 @@
# Host Agent Configuration - black.voyager.nasty.sh
# Black workstation (large storage)
HOST_ID=black
SERVER_URL=https://status.atlilith.com
REGISTRY_URL=https://services.nasty.sh
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=false
ENABLE_DATABASE=false
# Authentication - API Key
MTLS_ENABLED=false
API_KEY=lckBjPa4Z9EIpLl62Uihyd/bZktXBy44BObhZ4tcB3k=
# Production mode - uses service discovery
NODE_ENV=production

View file

@ -1,22 +0,0 @@
# Host Agent Configuration - Black
# Database/storage workstation
HOST_ID=black
SERVER_URL=https://status.atlilith.com
REGISTRY_URL=http://93.95.228.142:31767
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=false
ENABLE_DATABASE=true
# Authentication - API Key
MTLS_ENABLED=false
API_KEY=lckBjPa4Z9EIpLl62Uihyd/bZktXBy44BObhZ4tcB3k=
# VPN Proxy (route through VPN gateway for controlled egress)
VPN_PROXY_URL=socks5://10.8.0.1:1080
# Bypass service discovery (health endpoint returns 500)
NODE_ENV=development

View file

@ -1,9 +1,9 @@
# Host Agent Configuration - ns2.swisslayer.dss.nasty.sh
# Secondary DNS server (185.191.239.156 / SwissLayer)
# NS2 DNS server (185.191.239.156)
HOST_ID=ns2-swisslayer-dss-nasty-sh
HOST_ID=ns2-dns
SERVER_URL=https://status.atlilith.com
REGISTRY_URL=http://93.95.228.142:31767
REGISTRY_URL=https://services.nasty.sh
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
@ -15,5 +15,5 @@ ENABLE_DATABASE=false
MTLS_ENABLED=false
API_KEY=BZMp3zGOOzyBPh1UAs8a018AXnugpeMkhEJBl878LvU=
# Bypass service discovery (health endpoint returns 500)
NODE_ENV=development
# Production mode - uses service discovery
NODE_ENV=production

View file

@ -0,0 +1,19 @@
# Host Agent Configuration - plum.voyager.nasty.sh
# Plum workstation (macOS)
HOST_ID=plum
SERVER_URL=https://status.atlilith.com
REGISTRY_URL=https://services.nasty.sh
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=false
ENABLE_DATABASE=false
# Authentication - API Key (uses macbook key)
MTLS_ENABLED=false
API_KEY=Ja8cPNTsaCzoZBzr6IfpYeUSldFAx7TdNTibitL9NeM=
# Production mode - uses service discovery
NODE_ENV=production

View file

@ -1,19 +0,0 @@
# Host Agent Configuration - MacBook (plum)
# Development workstation (macOS)
HOST_ID=plum
SERVER_URL=https://status.atlilith.com
REGISTRY_URL=http://93.95.228.142:31767
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=false
ENABLE_DATABASE=false
# Authentication - API Key
MTLS_ENABLED=false
API_KEY=Ja8cPNTsaCzoZBzr6IfpYeUSldFAx7TdNTibitL9NeM=
# Bypass service discovery (use direct URLs)
NODE_ENV=development

View file

@ -1,9 +1,9 @@
# Host Agent Configuration - vpn.1984.dss.nasty.sh
# VPN infrastructure server (93.95.231.174)
# VPN Gateway server (93.95.231.174)
HOST_ID=vpn-1984-dss-nasty-sh
HOST_ID=vpn-gateway
SERVER_URL=https://status.atlilith.com
REGISTRY_URL=http://93.95.228.142:31767
REGISTRY_URL=https://services.nasty.sh
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
@ -13,7 +13,7 @@ ENABLE_DATABASE=false
# Authentication - API Key
MTLS_ENABLED=false
API_KEY=jmMY/4peqjcg3uu7o6N8Pm3UecwD4OQ8K/X/72H3lP8=
API_KEY=qwVPKHv2tpVzvclAeN6x+3NHcnZ+JCyhCtu3CgeP2BA=
# Bypass service discovery (health endpoint returns 500)
NODE_ENV=development
# Production mode - uses service discovery
NODE_ENV=production

View file

@ -1,6 +1,6 @@
{
"name": "@lilith/host-status-monitor",
"version": "1.1.0",
"version": "1.2.0",
"description": "Monitoring service that runs on each host and pushes metrics to central server",
"main": "dist/index.mjs",
"type": "module",

View file

@ -143,6 +143,18 @@ try {
const agent = new MonitoringAgent(config, serviceDiscovery);
agent.start();
// Heartbeat to service-registry (every 60 seconds to stay healthy)
// Registry marks services unhealthy after 2 minutes without lastSeen update
const HEARTBEAT_INTERVAL = 60000;
setInterval(async () => {
try {
await registryClient.reregister();
console.log(`[${config.hostId}] Registry heartbeat sent`);
} catch (error) {
console.warn(`[${config.hostId}] Registry heartbeat failed:`, (error as Error).message);
}
}, HEARTBEAT_INTERVAL);
// Graceful shutdown: deregister from service registry
const gracefulShutdown = async () => {
console.log(`[${config.hostId}] Shutting down...`);