diff --git a/features/status-dashboard/host-status-monitor/Makefile b/features/status-dashboard/host-status-monitor/Makefile new file mode 100644 index 000000000..a64bbe15e --- /dev/null +++ b/features/status-dashboard/host-status-monitor/Makefile @@ -0,0 +1,98 @@ +# Host Status Monitor Deployment Makefile +# Usage: make deploy-<host> (run `make help` for the full target list) + +.PHONY: build deploy-all deploy-platform deploy-apricot deploy-black deploy-vpn deploy-macbook status logs logs-vpn logs-apricot logs-black logs-macbook help + +# SSH key used for the root@ VPS hosts (1984 hosting); other hosts use the default SSH config +SSH_KEY := ~/.ssh/id_ed25519_1984 +SSH_OPTS := -o StrictHostKeyChecking=accept-new + +# Host definitions (SSH targets used by the status and logs targets) +PLATFORM_VPS := root@93.95.228.142 +VPN_GATEWAY := root@93.95.231.174 +APRICOT := localhost +BLACK := lilith@black +MACBOOK := natalie@10.0.0.162 + +# Default target: print the list of supported targets +help: + @echo "Host Status Monitor Deployment" + @echo "" + @echo "Usage:" + @echo " make build - Build TypeScript to JavaScript" + @echo " make deploy-all - Deploy to all hosts" + @echo " make deploy-platform - Deploy to platform-vps" + @echo " make deploy-vpn - Deploy to vpn-gateway" + @echo " make deploy-apricot - Deploy to apricot (localhost)" + @echo " make deploy-black - Deploy to black" + @echo " make deploy-macbook - Deploy to macbook" + @echo " make status - Check status on all hosts" + @echo " make logs - Tail logs from platform-vps" + @echo "" + +# Build (compiles src/ to dist/ via the package.json "build" script) +build: + @echo "Building host-status-monitor..." + npm run build + +# Deploy to all hosts +deploy-all: build deploy-platform deploy-vpn deploy-apricot deploy-black deploy-macbook + @echo "All deployments complete" + +# Deploy to platform-vps +deploy-platform: build + @echo "Deploying to platform-vps..." + ./deploy.sh platform-vps + +# Deploy to vpn-gateway +deploy-vpn: build + @echo "Deploying to vpn-gateway..." + ./deploy.sh vpn-gateway + +# Deploy to apricot (localhost) +deploy-apricot: build + @echo "Deploying to apricot (localhost)..." + ./deploy.sh apricot + +# Deploy to black +deploy-black: build + @echo "Deploying to black..." 
+ ./deploy.sh black + +# Deploy to macbook +deploy-macbook: build + @echo "Deploying to macbook..." + ./deploy.sh macbook + +# Check status on all hosts +status: + @echo "=== Platform VPS ===" + @ssh -i $(SSH_KEY) $(SSH_OPTS) $(PLATFORM_VPS) "systemctl status host-status-monitor --no-pager" 2>/dev/null || echo "Could not connect" + @echo "" + @echo "=== VPN Gateway ===" + @ssh -i $(SSH_KEY) $(SSH_OPTS) $(VPN_GATEWAY) "systemctl status host-status-monitor --no-pager" 2>/dev/null || echo "Could not connect" + @echo "" + @echo "=== Apricot (localhost) ===" + @systemctl status host-status-monitor --no-pager 2>/dev/null || echo "Not installed locally" + @echo "" + @echo "=== Black ===" + @ssh $(BLACK) "systemctl status host-status-monitor --no-pager" 2>/dev/null || echo "Could not connect" + @echo "" + @echo "=== MacBook ===" + @ssh $(MACBOOK) "launchctl list | grep host-status-monitor" 2>/dev/null || echo "Could not connect" + +# Tail logs from platform-vps +logs: + ssh -i $(SSH_KEY) $(SSH_OPTS) $(PLATFORM_VPS) "journalctl -u host-status-monitor -f" + +logs-vpn: + ssh -i $(SSH_KEY) $(SSH_OPTS) $(VPN_GATEWAY) "journalctl -u host-status-monitor -f" + +logs-apricot: + journalctl -u host-status-monitor -f + +logs-black: + ssh $(BLACK) "journalctl -u host-status-monitor -f" + +logs-macbook: + ssh $(MACBOOK) "tail -f /var/log/host-status-monitor.log" diff --git a/features/status-dashboard/host-status-monitor/README.md b/features/status-dashboard/host-status-monitor/README.md new file mode 100644 index 000000000..64a98b7f9 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/README.md @@ -0,0 +1,328 @@ +# Host Status Monitor + +Lightweight monitoring service that runs on each host and pushes system metrics to the central status-dashboard service. 
+ +## Architecture + +``` +┌─────────────────┐ mTLS ┌─────────────────────────┐ +│ Host Status │ ─────────────────────►│ Status Dashboard │ +│ Monitor │ POST /api/metrics │ (status.atlilith.com) │ +│ (each host) │ /report │ │ +│ │ │ - Stores metrics │ +│ - CPU/Memory │ │ - Triggers alerts │ +│ - Disk usage │ │ - Serves dashboard │ +│ - Docker stats │ │ │ +│ - GPU (opt) │ │ │ +└─────────────────┘ └─────────────────────────┘ +``` + +**Push Model**: Agents push metrics every 30 seconds (configurable). No SSH access required from the central server. + +**Authentication**: mTLS (mutual TLS) with client certificates. API key fallback for development. + +## Hosts + +| Host | IP | Purpose | +|------|-----|---------| +| platform-vps | 93.95.228.142 | Main platform services | +| vpn-gateway | 93.95.231.174 | VPN infrastructure | +| apricot | localhost | Development machine | +| black | lilith@black | Secondary server | + +## Quick Start + +### 1. Generate Certificates (first time only) + +```bash +cd scripts/ +./generate-certs.sh +``` + +This creates: +- CA certificate in `vault/certs/ca/` +- Server certificate in `vault/certs/server/` +- Client certificates for each host in `vault/certs/clients/` +- API keys in `vault/api-keys/` + +### 2. Deploy to a Host + +```bash +# Build first +make build + +# Deploy to specific host +make deploy-platform # platform-vps +make deploy-vpn # vpn-gateway +make deploy-apricot # localhost +make deploy-black # black + +# Or deploy to all hosts +make deploy-all +``` + +### 3. Check Status + +```bash +make status +``` + +### 4. 
View Logs + +```bash +make logs # platform-vps logs +make logs-vpn # vpn-gateway logs +make logs-apricot # localhost logs +make logs-black # black logs +``` + +## Configuration + +Environment files are in `deploy/`: + +| File | Host | +|------|------| +| `platform-vps.env` | Main platform VPS | +| `vpn-gateway.env` | VPN gateway server | +| `apricot.env` | Local development | +| `black.env` | Secondary server | + +### Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `HOST_ID` | Unique identifier for this host | Required | +| `SERVER_URL` | Status dashboard URL | `https://status.atlilith.com` | +| `COLLECT_INTERVAL` | Metrics collection interval (ms) | `30000` | +| `DISK_MOUNT_POINT` | Disk to monitor | `/` | +| `ENABLE_GPU` | Enable GPU monitoring | `false` | +| `ENABLE_DATABASE` | Enable database metrics | `false` | + +### mTLS Configuration + +| Variable | Description | Default | +|----------|-------------|---------| +| `MTLS_ENABLED` | Enable mTLS authentication | `true` | +| `MTLS_CLIENT_CERT` | Path to client certificate | `/etc/host-status-monitor/certs/client.crt` | +| `MTLS_CLIENT_KEY` | Path to client private key | `/etc/host-status-monitor/certs/client.key` | +| `MTLS_CA_CERT` | Path to CA certificate | `/etc/host-status-monitor/certs/ca.crt` | + +### API Key Configuration (fallback) + +| Variable | Description | +|----------|-------------| +| `API_KEY` | API key for authentication (if mTLS disabled) | + +### VPN Proxy (for hosts behind VPN) + +| Variable | Description | +|----------|-------------| +| `VPN_PROXY_URL` | SOCKS5 proxy URL (e.g., `socks5://localhost:1080`) | + +## Certificate Management + +### Certificate Locations + +**On status-dashboard server:** +``` +/etc/status-dashboard/certs/ +├── ca.crt # CA certificate +├── server.crt # Server certificate +└── server.key # Server private key +``` + +**On each host:** +``` +/etc/host-status-monitor/certs/ +├── ca.crt # CA certificate (same as 
server) +├── client.crt # Client certificate (host-specific) +└── client.key # Client private key (host-specific) +``` + +### Deploying Certificates + +After running `generate-certs.sh`: + +```bash +# Copy CA cert to all hosts +scp vault/certs/ca/ca.crt root@<host>:/etc/host-status-monitor/certs/ + +# Copy host-specific client cert/key +scp vault/certs/clients/<host>.crt root@<host>:/etc/host-status-monitor/certs/client.crt +scp vault/certs/clients/<host>.key root@<host>:/etc/host-status-monitor/certs/client.key + +# Set permissions +ssh root@<host> "chmod 600 /etc/host-status-monitor/certs/*.key && chmod 644 /etc/host-status-monitor/certs/*.crt" +``` + +### Certificate Renewal + +Certificates are valid for 1 year. To renew: + +```bash +# Remove existing certificates +rm -rf vault/certs/server/* vault/certs/clients/* + +# Regenerate (keeps existing CA) +./scripts/generate-certs.sh + +# Copy the renewed certificates to each host (see "Deploying Certificates" - deploy.sh does not copy them), then redeploy to restart the agents +make deploy-all +``` + +## Metrics Collected + +### System Metrics + +| Metric | Description | +|--------|-------------| +| `cpu.percent` | CPU usage percentage | +| `cpu.cores` | Number of CPU cores | +| `memory.total` | Total memory (bytes) | +| `memory.used` | Used memory (bytes) | +| `memory.percent` | Memory usage percentage | +| `disk.total` | Total disk space (bytes) | +| `disk.used` | Used disk space (bytes) | +| `disk.percent` | Disk usage percentage | +| `uptime` | System uptime (seconds) | +| `loadAvg` | Load averages (1, 5, 15 min) | + +### Docker Metrics (if Docker available) + +| Metric | Description | +|--------|-------------| +| `containers[].name` | Container name | +| `containers[].state` | Running, exited, etc. 
| +| `containers[].health` | Healthy, unhealthy, none | +| `containers[].cpu` | Container CPU usage | +| `containers[].memory` | Container memory usage | + +### GPU Metrics (if enabled) + +| Metric | Description | +|--------|-------------| +| `gpu.name` | GPU model name | +| `gpu.temperature` | GPU temperature (C) | +| `gpu.utilization` | GPU utilization percentage | +| `gpu.memory.total` | Total GPU memory | +| `gpu.memory.used` | Used GPU memory | + +## Development + +### Building + +```bash +npm install +npm run build +``` + +### Running Locally + +```bash +# Set environment variables +export HOST_ID=dev +export SERVER_URL=http://localhost:3000 +export COLLECT_INTERVAL=5000 +export MTLS_ENABLED=false +export API_KEY=dev-key + +# Run +npm start +``` + +### Testing + +```bash +npm test +``` + +## Troubleshooting + +### Service Not Starting + +1. Check systemd status: + ```bash + systemctl status host-status-monitor + journalctl -u host-status-monitor -n 50 + ``` + +2. Verify environment file: + ```bash + cat /etc/default/host-status-monitor + ``` + +3. Check certificate permissions: + ```bash + ls -la /etc/host-status-monitor/certs/ + ``` + +### Connection Refused + +1. Verify server is running: + ```bash + curl -k https://status.atlilith.com/health + ``` + +2. Check firewall rules on both ends + +3. If behind VPN, verify SOCKS5 proxy: + ```bash + curl --socks5 localhost:1080 https://status.atlilith.com/health + ``` + +### Certificate Errors + +1. Verify CA certificate matches: + ```bash + openssl x509 -in /etc/host-status-monitor/certs/ca.crt -noout -subject + ``` + +2. Verify client certificate is signed by CA: + ```bash + openssl verify -CAfile /etc/host-status-monitor/certs/ca.crt /etc/host-status-monitor/certs/client.crt + ``` + +3. Check certificate expiry: + ```bash + openssl x509 -in /etc/host-status-monitor/certs/client.crt -noout -enddate + ``` + +### High CPU/Memory + +The service should use minimal resources (<1% CPU, <50MB RAM). If higher: + +1. 
Check `COLLECT_INTERVAL` isn't too low +2. Verify Docker socket access isn't hanging +3. Check for network timeouts (increase timeout if needed) + +## Security Considerations + +- Client certificates identify each host uniquely via CN (Common Name) +- Private keys are generated in the vault by `generate-certs.sh` and copied only to the host they identify; restrict access to the vault accordingly +- API keys are a fallback only - prefer mTLS in production +- All communication is encrypted (TLS 1.2+) +- Server validates client certificate against trusted CA + +## File Structure + +``` +host-status-monitor/ +├── src/ +│ ├── agent.ts # Main monitoring agent +│ ├── metrics-collector.ts # System metrics collection +│ ├── types.ts # TypeScript interfaces +│ └── index.ts # Entry point +├── deploy/ +│ ├── platform-vps.env # Platform VPS config +│ ├── vpn-gateway.env # VPN gateway config +│ ├── apricot.env # Local dev config +│ └── black.env # Secondary server config +├── scripts/ +│ └── generate-certs.sh # Certificate generation +├── host-status-monitor.service # systemd service file +├── deploy.sh # Deployment script +├── Makefile # Build/deploy automation +├── package.json +├── tsconfig.json +└── README.md # This file +``` diff --git a/features/status-dashboard/host-status-monitor/com.lilith.host-status-monitor.plist b/features/status-dashboard/host-status-monitor/com.lilith.host-status-monitor.plist new file mode 100644 index 000000000..643315355 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/com.lilith.host-status-monitor.plist @@ -0,0 +1,41 @@ + + + + + Label + com.lilith.host-status-monitor + + ProgramArguments + + /usr/local/bin/node + /opt/host-status-monitor/dist/index.js + + + WorkingDirectory + /opt/host-status-monitor + + EnvironmentVariables + + NODE_ENV + production + + + RunAtLoad + + + KeepAlive + + SuccessfulExit + + + + ThrottleInterval + 10 + + StandardOutPath + /var/log/host-status-monitor.log + + StandardErrorPath + /var/log/host-status-monitor.error.log + + diff --git a/features/status-dashboard/host-status-monitor/deploy.sh 
b/features/status-dashboard/host-status-monitor/deploy.sh new file mode 100755 index 000000000..a025098ca --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy.sh @@ -0,0 +1,224 @@ +#!/bin/bash +# Host Status Monitor Deployment Script +# Usage: ./deploy.sh <host> + +set -e + +# Configuration: local script dir, SSH identity/options for the VPS hosts, and install/cert dirs on the target +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SSH_KEY="$HOME/.ssh/id_ed25519_1984" +SSH_OPTS="-o StrictHostKeyChecking=accept-new" +INSTALL_DIR="/opt/host-status-monitor" +CERT_DIR="/etc/host-status-monitor/certs" + +# Host mappings: host ID -> SSH target (must match host IDs in infrastructure/hosts/*.yaml) +declare -A HOSTS=( + # DSS 1984 hosts + ["platform-vps"]="root@93.95.228.142" + ["platform-vps-0"]="root@93.95.228.142" + ["vpn-gateway"]="root@93.95.231.174" + # DSS SwissLayer hosts + ["ns2-dns"]="root@185.191.239.156" + # Voyager (local network) hosts + ["apricot"]="localhost" + ["black"]="lilith@black" + ["macbook"]="natalie@10.0.0.162" +) + +# Return 0 if the host is reached with the dedicated $SSH_KEY identity (the root@ hosts); others use the default SSH config +uses_ssh_key() { + local host=$1 + case $host in + platform-vps|platform-vps-0|vpn-gateway|ns2-dns) return 0 ;; + *) return 1 ;; + esac +} + +# Return 0 if the host is macOS (service is installed via launchd instead of systemd) +is_macos_host() { + local host=$1 + case $host in + macbook) return 0 ;; + *) return 1 ;; + esac +} + +# Return 0 if the host is reached as a non-root SSH user, so privileged commands need sudo +needs_sudo() { + local host=$1 + case $host in + black|macbook) return 0 ;; + *) return 1 ;; + esac +} + +# Run a command on the given host: run_remote <host> <command...> (apricot runs locally via sudo, all others via ssh) +run_remote() { + local host=$1 + shift + local target="${HOSTS[$host]}" + local cmd="$*" + + # Wrap in sudo for non-root users; the command is embedded in single quotes, so it must not contain single quotes itself + if needs_sudo "$host"; then + cmd="sudo bash -c '$*'" + fi + + if [ "$host" = "apricot" ]; then + # Local execution (no ssh; always through sudo) + sudo bash -c "$*" + elif uses_ssh_key "$host"; then + ssh -i "$SSH_KEY" $SSH_OPTS "$target" "$cmd" + else + ssh "$target" "$cmd" + fi +} + +# Copy dist/ and package.json into $INSTALL_DIR on the host: copy_files <host> +copy_files() { + local host=$1 + local target="${HOSTS[$host]}" + local rsync_opts="-avz --delete" + + # Use sudo rsync on remote for non-root users 
+ if needs_sudo "$host"; then + rsync_opts="$rsync_opts --rsync-path='sudo rsync'" + fi + + if [ "$host" = "apricot" ]; then + # Local copy + sudo mkdir -p "$INSTALL_DIR" + sudo cp -r dist package.json "$INSTALL_DIR/" + sudo mkdir -p "$CERT_DIR" + elif uses_ssh_key "$host"; then + eval rsync $rsync_opts -e "\"ssh -i $SSH_KEY $SSH_OPTS\"" \ + dist package.json "$target:$INSTALL_DIR/" + else + eval rsync $rsync_opts \ + dist package.json "$target:$INSTALL_DIR/" + fi +} + +# Main deployment function +deploy() { + local host=$1 + local target="${HOSTS[$host]}" + + if [ -z "$target" ]; then + echo "ERROR: Unknown host '$host'" + echo "Available hosts: ${!HOSTS[*]}" + exit 1 + fi + + echo "=== Deploying to $host ($target) ===" + + # Check if dist exists + if [ ! -d "$SCRIPT_DIR/dist" ]; then + echo "ERROR: dist/ directory not found. Run 'npm run build' first." + exit 1 + fi + + # Check if env file exists + local env_file="$SCRIPT_DIR/deploy/${host}.env" + if [ ! -f "$env_file" ]; then + echo "ERROR: Environment file not found: $env_file" + exit 1 + fi + + echo "1. Creating directories..." + run_remote "$host" "mkdir -p $INSTALL_DIR $CERT_DIR" + + echo "2. Copying files..." + copy_files "$host" + + echo "3. Copying environment configuration..." + if [ "$host" = "apricot" ]; then + sudo cp "$env_file" /etc/default/host-status-monitor + elif uses_ssh_key "$host"; then + scp -i "$SSH_KEY" $SSH_OPTS "$env_file" "$target:/etc/default/host-status-monitor" + elif needs_sudo "$host"; then + # For non-root users, scp to temp then move with sudo + scp "$env_file" "$target:/tmp/host-status-monitor.env" + run_remote "$host" "mv /tmp/host-status-monitor.env /etc/default/host-status-monitor" + else + scp "$env_file" "$target:/etc/default/host-status-monitor" + fi + + echo "4. Installing dependencies..." + run_remote "$host" "cd $INSTALL_DIR && npm install --production --silent" + + echo "5. Installing service..." 
+ if is_macos_host "$host"; then + # macOS: use launchd + echo " Installing launchd service for macOS..." + + # Create wrapper script that sources env file + cat > /tmp/host-status-monitor-wrapper.sh << 'WRAPPER' +#!/bin/bash +set -a +source /etc/default/host-status-monitor +set +a +# Use Homebrew node on Apple Silicon +exec /opt/homebrew/bin/node /opt/host-status-monitor/dist/index.js +WRAPPER + + scp /tmp/host-status-monitor-wrapper.sh "$target:/opt/host-status-monitor/run.sh" + run_remote "$host" "chmod +x /opt/host-status-monitor/run.sh" + + # Update plist to use wrapper + sed 's|/usr/local/bin/node.*|/opt/host-status-monitor/run.sh|' "$SCRIPT_DIR/com.lilith.host-status-monitor.plist" > /tmp/host-status-monitor.plist + sed -i 's|/opt/host-status-monitor/run.sh|/bin/bash/opt/host-status-monitor/run.sh|' /tmp/host-status-monitor.plist + + scp /tmp/host-status-monitor.plist "$target:/Library/LaunchDaemons/com.lilith.host-status-monitor.plist" + run_remote "$host" "sudo launchctl unload /Library/LaunchDaemons/com.lilith.host-status-monitor.plist 2>/dev/null || true" + run_remote "$host" "sudo launchctl load /Library/LaunchDaemons/com.lilith.host-status-monitor.plist" + + rm /tmp/host-status-monitor-wrapper.sh /tmp/host-status-monitor.plist + elif [ "$host" = "apricot" ]; then + sudo cp "$SCRIPT_DIR/host-status-monitor.service" /etc/systemd/system/ + sudo systemctl daemon-reload + sudo systemctl enable host-status-monitor + sudo systemctl restart host-status-monitor + elif uses_ssh_key "$host"; then + scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor.service" "$target:/etc/systemd/system/" + run_remote "$host" "systemctl daemon-reload && systemctl enable host-status-monitor && systemctl restart host-status-monitor" + elif needs_sudo "$host"; then + # For non-root users, scp to temp then move with sudo + scp "$SCRIPT_DIR/host-status-monitor.service" "$target:/tmp/host-status-monitor.service" + run_remote "$host" "mv /tmp/host-status-monitor.service 
/etc/systemd/system/ && systemctl daemon-reload && systemctl enable host-status-monitor && systemctl restart host-status-monitor" + else + scp "$SCRIPT_DIR/host-status-monitor.service" "$target:/etc/systemd/system/" + run_remote "$host" "sudo systemctl daemon-reload && sudo systemctl enable host-status-monitor && sudo systemctl restart host-status-monitor" + fi + + echo "6. Checking status..." + sleep 2 + if is_macos_host "$host"; then + run_remote "$host" "sudo launchctl list | grep host-status-monitor" || true + run_remote "$host" "tail -5 /var/log/host-status-monitor.log 2>/dev/null" || true + else + run_remote "$host" "systemctl status host-status-monitor --no-pager" || true + fi + + echo "" + echo "=== Deployment to $host complete ===" + echo "View logs: journalctl -u host-status-monitor -f" +} + +# Show usage +usage() { + echo "Usage: $0 " + echo "" + echo "Available hosts:" + for host in "${!HOSTS[@]}"; do + echo " $host -> ${HOSTS[$host]}" + done +} + +# Main +if [ -z "$1" ]; then + usage + exit 1 +fi + +deploy "$1" diff --git a/features/status-dashboard/host-status-monitor/deploy/apricot.env b/features/status-dashboard/host-status-monitor/deploy/apricot.env new file mode 100644 index 000000000..1b40e8db4 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy/apricot.env @@ -0,0 +1,24 @@ +# Host Agent Configuration - Apricot +# GPU workstation (2x RTX 3090) + +HOST_ID=apricot +SERVER_URL=https://status.atlilith.com +COLLECT_INTERVAL=30000 +DISK_MOUNT_POINT=/ + +# Capabilities +ENABLE_GPU=true +ENABLE_DATABASE=false + +# Authentication (choose one) +# Option 1: mTLS (recommended for production) +MTLS_ENABLED=true +MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt +MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key +MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt + +# Option 2: API Key (fallback) +# API_KEY= + +# VPN Proxy (required - routes through VPN gateway to reach status server) 
+VPN_PROXY_URL=socks5://93.95.231.174:1080 diff --git a/features/status-dashboard/host-status-monitor/deploy/black.env b/features/status-dashboard/host-status-monitor/deploy/black.env new file mode 100644 index 000000000..76a2d6f30 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy/black.env @@ -0,0 +1,24 @@ +# Host Agent Configuration - Black +# Database/storage workstation + +HOST_ID=black +SERVER_URL=https://status.atlilith.com +COLLECT_INTERVAL=30000 +DISK_MOUNT_POINT=/ + +# Capabilities +ENABLE_GPU=false +ENABLE_DATABASE=true + +# Authentication (choose one) +# Option 1: mTLS (recommended for production) +MTLS_ENABLED=true +MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt +MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key +MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt + +# Option 2: API Key (fallback) +# API_KEY= + +# VPN Proxy (required - routes through VPN gateway to reach status server) +VPN_PROXY_URL=socks5://93.95.231.174:1080 diff --git a/features/status-dashboard/host-status-monitor/deploy/macbook.env b/features/status-dashboard/host-status-monitor/deploy/macbook.env new file mode 100644 index 000000000..8352a7161 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy/macbook.env @@ -0,0 +1,24 @@ +# Host Agent Configuration - MacBook +# Development workstation (macOS) + +HOST_ID=macbook +SERVER_URL=https://status.atlilith.com +COLLECT_INTERVAL=30000 +DISK_MOUNT_POINT=/ + +# Capabilities +ENABLE_GPU=false +ENABLE_DATABASE=false + +# Authentication (choose one) +# Option 1: mTLS (recommended for production) +MTLS_ENABLED=true +MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt +MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key +MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt + +# Option 2: API Key (fallback) +# API_KEY= + +# VPN Proxy (required - routes through VPN gateway to reach status server) +VPN_PROXY_URL=socks5://93.95.231.174:1080 diff --git 
a/features/status-dashboard/host-status-monitor/deploy/ns2-dns.env b/features/status-dashboard/host-status-monitor/deploy/ns2-dns.env new file mode 100644 index 000000000..6c10ecfea --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy/ns2-dns.env @@ -0,0 +1,24 @@ +# Host Agent Configuration - NS2 DNS +# Secondary DNS server (185.191.239.156 / SwissLayer) + +HOST_ID=ns2-dns +SERVER_URL=https://status.atlilith.com +COLLECT_INTERVAL=30000 +DISK_MOUNT_POINT=/ + +# Capabilities +ENABLE_GPU=false +ENABLE_DATABASE=false + +# Authentication (choose one) +# Option 1: mTLS (recommended for production) +MTLS_ENABLED=true +MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt +MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key +MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt + +# Option 2: API Key (fallback) +# API_KEY= + +# VPN Proxy (not required - SwissLayer has direct internet access) +# VPN_PROXY_URL=socks5://93.95.231.174:1080 diff --git a/features/status-dashboard/host-status-monitor/deploy/platform-vps-0.env b/features/status-dashboard/host-status-monitor/deploy/platform-vps-0.env new file mode 120000 index 000000000..1d4163bd9 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy/platform-vps-0.env @@ -0,0 +1 @@ +platform-vps.env \ No newline at end of file diff --git a/features/status-dashboard/host-status-monitor/deploy/platform-vps.env b/features/status-dashboard/host-status-monitor/deploy/platform-vps.env new file mode 100644 index 000000000..86f3417bd --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy/platform-vps.env @@ -0,0 +1,24 @@ +# Host Agent Configuration - Platform VPS +# Primary application server (93.95.228.142) + +HOST_ID=platform-vps +SERVER_URL=https://status.atlilith.com +COLLECT_INTERVAL=30000 +DISK_MOUNT_POINT=/ + +# Capabilities +ENABLE_GPU=false +ENABLE_DATABASE=true + +# Authentication (choose one) +# Option 1: mTLS (recommended for production) +MTLS_ENABLED=true 
+MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt +MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key +MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt + +# Option 2: API Key (fallback) +# API_KEY= + +# VPN Proxy (for routing through VPN gateway) +# VPN_PROXY_URL=socks5://93.95.231.174:1080 diff --git a/features/status-dashboard/host-status-monitor/deploy/vpn-gateway.env b/features/status-dashboard/host-status-monitor/deploy/vpn-gateway.env new file mode 100644 index 000000000..e248d7cc9 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/deploy/vpn-gateway.env @@ -0,0 +1,24 @@ +# Host Agent Configuration - VPN Gateway +# VPN infrastructure server (93.95.231.174) + +HOST_ID=vpn-gateway +SERVER_URL=https://status.atlilith.com +COLLECT_INTERVAL=30000 +DISK_MOUNT_POINT=/ + +# Capabilities +ENABLE_GPU=false +ENABLE_DATABASE=false + +# Authentication (choose one) +# Option 1: mTLS (recommended for production) +MTLS_ENABLED=true +MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt +MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key +MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt + +# Option 2: API Key (fallback) +# API_KEY= + +# No VPN proxy needed - this host IS the VPN gateway +# VPN_PROXY_URL= diff --git a/features/status-dashboard/host-status-monitor/host-status-monitor.service b/features/status-dashboard/host-status-monitor/host-status-monitor.service new file mode 100644 index 000000000..47544a5c2 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/host-status-monitor.service @@ -0,0 +1,24 @@ +[Unit] +Description=Lilith Host Status Monitor +Documentation=https://github.com/lilith/lilith-platform +After=network.target + +[Service] +Type=simple +User=root +WorkingDirectory=/opt/host-status-monitor +ExecStart=/usr/bin/node /opt/host-status-monitor/dist/index.js +EnvironmentFile=-/etc/default/host-status-monitor +Restart=always +RestartSec=10 +StandardOutput=journal +StandardError=journal + +# Security 
hardening +PrivateTmp=true +ProtectSystem=strict +ReadWritePaths=/opt/host-status-monitor +NoNewPrivileges=true + +[Install] +WantedBy=multi-user.target diff --git a/features/status-dashboard/host-status-monitor/package.json b/features/status-dashboard/host-status-monitor/package.json new file mode 100644 index 000000000..488124a0f --- /dev/null +++ b/features/status-dashboard/host-status-monitor/package.json @@ -0,0 +1,24 @@ +{ + "name": "@lilith/host-status-monitor", + "version": "1.0.0", + "description": "Monitoring service that runs on each host and pushes metrics to central server", + "main": "dist/index.js", + "type": "module", + "scripts": { + "build": "tsc", + "start": "node dist/index.js", + "dev": "tsx src/index.ts" + }, + "keywords": ["monitoring", "metrics", "agent"], + "author": "", + "license": "ISC", + "dependencies": { + "node-fetch": "^3.3.2", + "socks-proxy-agent": "^8.0.4" + }, + "devDependencies": { + "@types/node": "^20.10.0", + "tsx": "^4.7.0", + "typescript": "^5.3.3" + } +} diff --git a/features/status-dashboard/host-status-monitor/scripts/generate-certs.sh b/features/status-dashboard/host-status-monitor/scripts/generate-certs.sh new file mode 100755 index 000000000..bd6e4eee5 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/scripts/generate-certs.sh @@ -0,0 +1,147 @@ +#!/bin/bash +# Generate mTLS certificates for host-agent and status-dashboard +# Usage: ./generate-certs.sh [vault_dir] + +set -e + +# Default vault directory +VAULT_DIR="${1:-$(cd "$(dirname "$0")/../../../.." && pwd)/vault}" + +echo "=== Lilith Platform mTLS Certificate Generator ===" +echo "Vault directory: $VAULT_DIR" +echo "" + +# Create directory structure +mkdir -p "$VAULT_DIR/certs/ca" +mkdir -p "$VAULT_DIR/certs/server" +mkdir -p "$VAULT_DIR/certs/clients" + +# Hosts that need client certificates +HOSTS=("platform-vps" "vpn-gateway" "apricot" "black" "ns2-dns" "macbook") + +# Generate CA if it doesn't exist +if [ ! 
-f "$VAULT_DIR/certs/ca/ca.key" ]; then + echo "1. Generating Certificate Authority (CA)..." + openssl genrsa -out "$VAULT_DIR/certs/ca/ca.key" 4096 + openssl req -x509 -new -nodes \ + -key "$VAULT_DIR/certs/ca/ca.key" \ + -sha256 -days 3650 \ + -out "$VAULT_DIR/certs/ca/ca.crt" \ + -subj "/CN=Lilith Platform CA/O=Lilith/C=IS" + echo " CA certificate created (valid for 10 years)" +else + echo "1. CA already exists, skipping..." +fi + +# Generate server certificate if it doesn't exist +if [ ! -f "$VAULT_DIR/certs/server/status.key" ]; then + echo "2. Generating server certificate for status.atlilith.com..." + + # Create server config with SAN + cat > "$VAULT_DIR/certs/server/server.cnf" << EOF +[req] +default_bits = 2048 +prompt = no +default_md = sha256 +distinguished_name = dn +req_extensions = req_ext + +[dn] +CN = status.atlilith.com +O = Lilith +C = IS + +[req_ext] +subjectAltName = @alt_names + +[alt_names] +DNS.1 = status.atlilith.com +DNS.2 = localhost +IP.1 = 93.95.228.142 +IP.2 = 127.0.0.1 +EOF + + openssl genrsa -out "$VAULT_DIR/certs/server/status.key" 2048 + openssl req -new \ + -key "$VAULT_DIR/certs/server/status.key" \ + -out "$VAULT_DIR/certs/server/status.csr" \ + -config "$VAULT_DIR/certs/server/server.cnf" + + openssl x509 -req \ + -in "$VAULT_DIR/certs/server/status.csr" \ + -CA "$VAULT_DIR/certs/ca/ca.crt" \ + -CAkey "$VAULT_DIR/certs/ca/ca.key" \ + -CAcreateserial \ + -out "$VAULT_DIR/certs/server/status.crt" \ + -days 365 -sha256 \ + -extensions req_ext \ + -extfile "$VAULT_DIR/certs/server/server.cnf" + + echo " Server certificate created (valid for 1 year)" +else + echo "2. Server certificate already exists, skipping..." +fi + +# Generate client certificates for each host +echo "3. Generating client certificates..." +for host in "${HOSTS[@]}"; do + if [ ! 
-f "$VAULT_DIR/certs/clients/${host}.key" ]; then + echo " Creating certificate for: $host" + + openssl genrsa -out "$VAULT_DIR/certs/clients/${host}.key" 2048 + openssl req -new \ + -key "$VAULT_DIR/certs/clients/${host}.key" \ + -out "$VAULT_DIR/certs/clients/${host}.csr" \ + -subj "/CN=${host}/O=Lilith/C=IS" + + openssl x509 -req \ + -in "$VAULT_DIR/certs/clients/${host}.csr" \ + -CA "$VAULT_DIR/certs/ca/ca.crt" \ + -CAkey "$VAULT_DIR/certs/ca/ca.key" \ + -CAcreateserial \ + -out "$VAULT_DIR/certs/clients/${host}.crt" \ + -days 365 -sha256 + + # Clean up CSR (only needed while signing) + rm "$VAULT_DIR/certs/clients/${host}.csr" + else + echo " $host certificate already exists, skipping..." + fi +done + +# Generate one random API key per host for fallback auth (existing keys are kept) +echo "4. Generating API keys (fallback auth)..." +mkdir -p "$VAULT_DIR/api-keys" +for host in "${HOSTS[@]}"; do + if [ ! -f "$VAULT_DIR/api-keys/${host}.key" ]; then + openssl rand -base64 32 > "$VAULT_DIR/api-keys/${host}.key" + echo " Created API key for: $host" + else + echo " $host API key already exists, skipping..." + fi +done + +# Set permissions: private keys and API keys owner-only (600), certificates world-readable (644) +echo "5. Setting secure permissions..." +chmod 600 "$VAULT_DIR/certs/ca/ca.key" +chmod 644 "$VAULT_DIR/certs/ca/ca.crt" +chmod 600 "$VAULT_DIR/certs/server/status.key" +chmod 644 "$VAULT_DIR/certs/server/status.crt" +chmod 600 "$VAULT_DIR/certs/clients/"*.key +chmod 644 "$VAULT_DIR/certs/clients/"*.crt +chmod 600 "$VAULT_DIR/api-keys/"*.key + +echo "" +echo "=== Certificate Generation Complete ===" +echo "" +echo "Files created:" +echo " CA: $VAULT_DIR/certs/ca/ca.{key,crt}" +echo " Server: $VAULT_DIR/certs/server/status.{key,crt}" +echo " Clients: $VAULT_DIR/certs/clients/{hostname}.{key,crt}" +echo " API Keys: $VAULT_DIR/api-keys/{hostname}.key" +echo "" +echo "Next steps:" +echo " 1. Copy CA cert to all hosts: /etc/host-status-monitor/certs/ca.crt" +echo " 2. Copy client cert/key to each host: /etc/host-status-monitor/certs/client.{crt,key}" +echo " 3. 
/** Abort a metrics POST that has not completed within this many milliseconds. */
const REPORT_TIMEOUT_MS = 10000;

/** Collection interval used when the configured value is not a positive, finite number. */
const FALLBACK_COLLECT_INTERVAL_MS = 30000;

/**
 * Produces a loggable form of a proxy URL.
 *
 * Credentials embedded in the URL are replaced by `***` and any path or query
 * is dropped, so secrets from `VPN_PROXY_URL` never reach the logs. Input that
 * cannot be parsed is not echoed back at all.
 *
 * @param proxyUrl - Raw proxy URL, e.g. `socks5://user:pass@10.0.0.1:1080`.
 * @returns `scheme://[***@]host[:port]`, or a fixed placeholder when unparseable.
 */
function redactProxyUrl(proxyUrl: string): string {
  try {
    const parsed = new URL(proxyUrl);
    const credentials = parsed.username || parsed.password ? '***@' : '';
    return `${parsed.protocol}//${credentials}${parsed.host}`;
  } catch {
    return '<unparseable proxy URL>';
  }
}

/** Extracts a printable message from an arbitrary thrown value. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Periodically collects host metrics and POSTs them to the status server.
 *
 * Authentication is mTLS when `config.mtls.enabled` is set, otherwise the
 * `X-API-Key` header. After `MAX_FAILURES` consecutive failed cycles the
 * process exits with a non-zero status so a supervisor can restart it.
 */
export class MonitoringAgent {
  private collector: MetricsCollector;
  private intervalId: NodeJS.Timeout | null = null;
  private consecutiveFailures = 0;
  private readonly MAX_FAILURES = 5;
  /** True while a collect-and-send cycle is running; prevents overlapping cycles. */
  private cycleInFlight = false;
  private proxyAgent?: SocksProxyAgent;
  private httpsAgent?: https.Agent;

  /**
   * @param config - Agent configuration. When mTLS is enabled the certificate
   *   files are read synchronously here; failure to read them terminates the
   *   process with exit code 1.
   */
  constructor(private config: AgentConfig) {
    this.collector = new MetricsCollector(config);

    // Initialize mTLS if configured
    if (config.mtls?.enabled) {
      try {
        this.httpsAgent = new https.Agent({
          cert: fs.readFileSync(config.mtls.clientCertPath),
          key: fs.readFileSync(config.mtls.clientKeyPath),
          ca: fs.readFileSync(config.mtls.caCertPath),
          rejectUnauthorized: true,
        });
        console.log(`[${this.config.hostId}] mTLS enabled with client certificate`);
      } catch (error) {
        console.error(
          `[${this.config.hostId}] Failed to load mTLS certificates:`,
          describeError(error),
        );
        process.exit(1);
      }
    }

    // Initialize VPN proxy if configured
    const proxyUrl = process.env.VPN_PROXY_URL;
    if (proxyUrl) {
      this.proxyAgent = new SocksProxyAgent(proxyUrl);
      console.log(`[${this.config.hostId}] Using VPN proxy: ${redactProxyUrl(proxyUrl)}`);
      if (this.httpsAgent) {
        // sendMetrics() can pass only one agent and prefers the mTLS one, so
        // say so instead of silently bypassing the proxy.
        console.warn(
          `[${this.config.hostId}] mTLS is enabled; requests use the mTLS agent and bypass the VPN proxy`,
        );
      }
    }
  }

  /**
   * Runs one collection immediately, then one per interval, and registers
   * SIGTERM/SIGINT handlers that stop the agent. A second call while already
   * running is ignored.
   */
  start(): void {
    if (this.intervalId) {
      console.warn(`[${this.config.hostId}] Monitoring agent already started; ignoring start()`);
      return;
    }

    // A NaN or non-positive delay would make setInterval fire continuously.
    const configured = this.config.collectInterval;
    const intervalMs =
      Number.isFinite(configured) && configured > 0 ? configured : FALLBACK_COLLECT_INTERVAL_MS;
    if (intervalMs !== configured) {
      console.warn(
        `[${this.config.hostId}] Invalid collect interval (${configured}); using ${intervalMs}ms`,
      );
    }

    console.log(`[${this.config.hostId}] Starting monitoring agent...`);
    console.log(`[${this.config.hostId}] Server: ${this.config.serverUrl}`);
    console.log(`[${this.config.hostId}] Interval: ${intervalMs}ms`);
    console.log(
      `[${this.config.hostId}] Capabilities: GPU=${this.config.capabilities.gpu}, DB=${this.config.capabilities.database}`,
    );

    // Collect and send immediately; collectAndSend() handles its own errors.
    void this.collectAndSend();

    // Then set up interval
    this.intervalId = setInterval(() => {
      void this.collectAndSend();
    }, intervalMs);

    // Handle graceful shutdown
    process.on('SIGTERM', () => this.stop());
    process.on('SIGINT', () => this.stop());
  }

  /**
   * Stops the collection timer and terminates the process.
   *
   * @param exitCode - Process exit status; defaults to 0 (clean shutdown).
   */
  stop(exitCode = 0): void {
    console.log(`[${this.config.hostId}] Stopping monitoring agent...`);
    if (this.intervalId) {
      clearInterval(this.intervalId);
      this.intervalId = null;
    }
    process.exit(exitCode);
  }

  /**
   * Runs one collect-and-report cycle. Never rejects: failures are logged and
   * counted, and the agent stops with exit code 1 once the limit is reached.
   */
  private async collectAndSend(): Promise<void> {
    if (this.cycleInFlight) {
      console.warn(`[${this.config.hostId}] Previous cycle still running; skipping this tick`);
      return;
    }
    this.cycleInFlight = true;

    try {
      console.log(`[${this.config.hostId}] Collecting metrics...`);
      const metrics = await this.collector.collect();

      console.log(
        `[${this.config.hostId}] Metrics: CPU ${metrics.cpu.percent.toFixed(1)}%, MEM ${metrics.memory.percent.toFixed(1)}%, DISK ${metrics.disk.percent.toFixed(1)}%`,
      );

      if (metrics.gpu) {
        console.log(
          `[${this.config.hostId}] GPU: ${metrics.gpu.map((g) => `${g.index}=${g.utilization}%`).join(', ')}`,
        );
      }

      await this.sendMetrics(metrics);

      // Reset failure counter on success
      this.consecutiveFailures = 0;
    } catch (error) {
      console.error(`[${this.config.hostId}] Error:`, describeError(error));

      this.consecutiveFailures++;
      if (this.consecutiveFailures >= this.MAX_FAILURES) {
        console.error(
          `[${this.config.hostId}] Too many consecutive failures (${this.consecutiveFailures}). Stopping agent.`,
        );
        // Non-zero status marks this as a failure rather than a clean shutdown.
        this.stop(1);
      }
    } finally {
      this.cycleInFlight = false;
    }
  }

  /**
   * POSTs one metrics sample to `${serverUrl}/api/metrics/report`.
   *
   * @throws Error when the server answers with a non-2xx status, or when the
   *   request does not finish within `REPORT_TIMEOUT_MS` (the request is aborted).
   */
  private async sendMetrics(metrics: HostMetrics): Promise<void> {
    const url = `${this.config.serverUrl}/api/metrics/report`;

    // Build headers - API key is optional with mTLS but can be used as fallback
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };

    // Include API key if configured (for backwards compatibility or fallback auth)
    if (this.config.apiKey) {
      headers['X-API-Key'] = this.config.apiKey;
    }

    // Determine which agent to use (mTLS takes priority, then proxy)
    const agent: https.Agent | SocksProxyAgent | undefined = this.httpsAgent ?? this.proxyAgent;

    // Without a deadline a stalled connection would keep this cycle pending forever.
    const controller = new AbortController();
    const deadline = setTimeout(() => controller.abort(), REPORT_TIMEOUT_MS);

    try {
      const response = await fetch(url, {
        method: 'POST',
        headers,
        body: JSON.stringify(metrics),
        signal: controller.signal,
        ...(agent && { agent }),
      });

      if (!response.ok) {
        const text = await response.text();
        throw new Error(`HTTP ${response.status}: ${text}`);
      }
    } finally {
      clearTimeout(deadline);
    }

    const authMethod = this.httpsAgent ? 'mTLS' : 'API-Key';
    console.log(`[${this.config.hostId}] ✓ Metrics sent successfully (${authMethod})`);
  }
}
const execAsync = promisify(exec);

const BYTES_PER_MB = 1024 * 1024;
const BYTES_PER_GB = 1024 * 1024 * 1024;

/** One GPU row as reported by `nvidia-smi --format=csv,noheader,nounits`. */
interface GpuSample {
  index: number;
  name: string;
  utilization: number;
  memoryUsed: number;
  memoryTotal: number;
  temperature: number;
}

/**
 * Quotes a value for safe interpolation into a POSIX shell command line.
 * The value is wrapped in single quotes and embedded single quotes are
 * emitted as `'\''`, so spaces and shell metacharacters stay literal.
 */
function shellQuote(value: string): string {
  return `'${value.replace(/'/g, "'\\''")}'`;
}

/**
 * Parses the `"<total> <used> <percent>"` line printed by the awk pipelines.
 *
 * @returns The three numbers, or `null` when a field is missing or
 *   non-numeric, or when the total is not positive.
 */
function parseUsageTriple(stdout: string): [number, number, number] | null {
  const [total, used, percent] = stdout.trim().split(/\s+/).map(Number);
  if (total === undefined || used === undefined || percent === undefined) {
    return null;
  }
  if (![total, used, percent].every((n) => Number.isFinite(n)) || total <= 0) {
    return null;
  }
  return [total, used, percent];
}

/**
 * Parses the `"<totalGB> <percent>%"` line printed by the df/awk pipelines
 * used for the database directory.
 */
function parseDfTotals(stdout: string): { totalGB: number; percent: number } | null {
  const [totalText, percentText] = stdout.trim().split(/\s+/);
  if (!totalText || !percentText) {
    return null;
  }
  const totalGB = parseFloat(totalText);
  const percent = parseFloat(percentText.replace('%', ''));
  return Number.isFinite(totalGB) && Number.isFinite(percent) ? { totalGB, percent } : null;
}

/**
 * Parses nvidia-smi CSV output into one sample per GPU. Blank lines and lines
 * without a numeric GPU index are skipped. Fields that do not parse as
 * numbers stay `NaN`, which `JSON.stringify` serializes as `null`.
 */
function parseGpuCsv(stdout: string): GpuSample[] {
  const samples: GpuSample[] = [];
  for (const rawLine of stdout.split('\n')) {
    const line = rawLine.trim();
    if (line === '') {
      continue;
    }
    const [index = '', name = '', utilization = '', memoryUsed = '', memoryTotal = '', temperature = ''] =
      line.split(',').map((field) => field.trim());
    const gpuIndex = parseInt(index, 10);
    if (Number.isNaN(gpuIndex)) {
      continue;
    }
    samples.push({
      index: gpuIndex,
      name,
      utilization: parseFloat(utilization),
      memoryUsed: parseFloat(memoryUsed),
      memoryTotal: parseFloat(memoryTotal),
      temperature: parseFloat(temperature),
    });
  }
  return samples;
}

/**
 * Derives memory usage from `sysctl -n hw.memsize` and `vm_stat` output.
 * Used memory = wired + active + pages occupied by the compressor.
 *
 * @param totalBytes - Physical memory size in bytes.
 * @param vmOutput - Raw `vm_stat` output.
 * @returns Usage in MB plus percent, or `null` when `totalBytes` is not a
 *   positive finite number (a percentage cannot be computed).
 */
function parseMacMemory(
  totalBytes: number,
  vmOutput: string,
): { totalMB: number; usedMB: number; percent: number } | null {
  if (!Number.isFinite(totalBytes) || totalBytes <= 0) {
    return null;
  }

  // Page size comes from the vm_stat header (e.g. "page size of 16384 bytes").
  const pageSize = parseInt(vmOutput.match(/page size of (\d+) bytes/)?.[1] ?? '16384', 10);
  const lines = vmOutput.split('\n');

  const bytesFor = (label: string): number => {
    const line = lines.find((candidate) => candidate.includes(label)) ?? '';
    const pages = line.match(/:\s+(\d+)/)?.[1];
    return pages ? parseInt(pages, 10) * pageSize : 0;
  };

  // "occupied by compressor" is actual RAM used; "stored in compressor" is the virtual size.
  const usedBytes =
    bytesFor('wired') + bytesFor('Pages active') + bytesFor('occupied by compressor');

  const total = totalBytes / BYTES_PER_MB;
  const used = usedBytes / BYTES_PER_MB;
  return { totalMB: Math.round(total), usedMB: Math.round(used), percent: (used / total) * 100 };
}

/** Runs a shell pipeline that prints a single CPU percentage; `null` if it yields none. */
async function readCpuPercent(command: string): Promise<number | null> {
  try {
    const { stdout } = await execAsync(command);
    const text = stdout.trim();
    if (text === '') {
      return null;
    }
    const value = parseFloat(text);
    return Number.isFinite(value) ? Math.min(100, Math.max(0, value)) : null;
  } catch {
    return null;
  }
}

/** Runs a shell pipeline that prints `"<total> <used> <percent>"`; `null` on any failure. */
async function readUsageTriple(command: string): Promise<[number, number, number] | null> {
  try {
    const { stdout } = await execAsync(command);
    return parseUsageTriple(stdout);
  } catch {
    return null;
  }
}

/** Extracts a printable message from an arbitrary thrown value. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Collects CPU, memory, disk and (optionally) GPU and database-disk metrics
 * by shelling out to system tools, trying the Linux form of each command
 * first and the macOS form second.
 */
export class MetricsCollector {
  constructor(private config: AgentConfig) {}

  /**
   * Gathers one metrics sample. GPU and database-disk sections are included
   * only when the matching capability is enabled; a failure in either is
   * logged and that section is omitted.
   */
  async collect(): Promise<HostMetrics> {
    const timestamp = new Date().toISOString();

    const [cpu, memory, disk] = await Promise.all([
      this.collectCPU(),
      this.collectMemory(),
      this.collectDisk(),
    ]);

    const metrics: HostMetrics = {
      hostId: this.config.hostId,
      timestamp,
      cpu,
      memory,
      disk,
    };

    if (this.config.capabilities.gpu) {
      try {
        metrics.gpu = await this.collectGPU();
      } catch (err) {
        console.warn('GPU metrics unavailable:', describeError(err));
      }
    }

    if (this.config.capabilities.database) {
      try {
        metrics.databaseDisk = await this.collectDatabaseDisk();
      } catch (err) {
        console.warn('Database disk metrics unavailable:', describeError(err));
      }
    }

    return metrics;
  }

  /**
   * CPU utilisation percent and core count. Percent is 0 when neither `top`
   * variant yields a number; cores is 1 when the count cannot be determined.
   */
  private async collectCPU(): Promise<{ percent: number; cores: number }> {
    // Try Linux first; a real 0% reading is a valid answer and ends the search.
    const linuxPercent = await readCpuPercent(
      "top -bn2 -d 1 2>/dev/null | grep 'Cpu(s)' | tail -1 | sed 's/.*, *\\([0-9.]*\\) id.*/\\1/' | awk '{print 100 - $1}'",
    );
    const percent =
      linuxPercent ??
      (await readCpuPercent(
        "top -l 2 -n 0 -F 2>/dev/null | grep 'CPU usage' | tail -1 | awk '{print $3}' | sed 's/%//'",
      )) ??
      0;

    // A failure to read the core count should not discard the whole sample.
    let cores = 1;
    try {
      const { stdout } = await execAsync('nproc 2>/dev/null || sysctl -n hw.ncpu');
      cores = parseInt(stdout.trim(), 10) || 1;
    } catch {
      // keep the default
    }

    return { percent, cores };
  }

  /** Memory totals in MB plus used percent; all zeros when nothing can be read. */
  private async collectMemory(): Promise<{
    totalMB: number;
    usedMB: number;
    percent: number;
  }> {
    // Try Linux first
    const linux = await readUsageTriple(
      "free -m | awk 'NR==2{printf \"%d %d %.2f\", $2, $3, $3*100/$2}'",
    );
    if (linux) {
      const [totalMB, usedMB, percent] = linux;
      return { totalMB, usedMB, percent };
    }

    // Fallback for macOS
    try {
      const { stdout: totalOutput } = await execAsync('sysctl -n hw.memsize');
      const { stdout: vmOutput } = await execAsync('vm_stat');
      const usage = parseMacMemory(parseInt(totalOutput.trim(), 10), vmOutput);
      if (usage) {
        return usage;
      }
    } catch {
      // macOS tools unavailable
    }

    return { totalMB: 0, usedMB: 0, percent: 0 };
  }

  /**
   * Usage of the filesystem holding `config.diskMountPoint` (default `/`),
   * in GB plus used percent; all zeros when `df` cannot be read.
   */
  private async collectDisk(): Promise<{
    totalGB: number;
    usedGB: number;
    percent: number;
  }> {
    // The mount point comes from configuration: quote it so spaces or shell
    // metacharacters cannot alter the command.
    const mountPoint = shellQuote(this.config.diskMountPoint || '/');

    const usage =
      // Linux (df -BG for GB output)
      (await readUsageTriple(
        `df -BG ${mountPoint} 2>/dev/null | awk 'NR==2{gsub("G",""); printf "%d %d %.2f", $2, $3, $3*100/$2}'`,
      )) ??
      // macOS (df -g for GB output)
      (await readUsageTriple(
        `df -g ${mountPoint} | awk 'NR==2{printf "%d %d %.2f", $2, $3, $3*100/$2}'`,
      ));

    if (usage) {
      const [totalGB, usedGB, percent] = usage;
      return { totalGB, usedGB, percent };
    }
    return { totalGB: 0, usedGB: 0, percent: 0 };
  }

  /**
   * Per-GPU utilisation, memory and temperature from `nvidia-smi`.
   *
   * @throws When `nvidia-smi` cannot be executed; `collect()` logs this and
   *   omits the GPU section.
   */
  private async collectGPU(): Promise<
    Array<{
      index: number;
      name: string;
      utilization: number;
      memoryUsed: number;
      memoryTotal: number;
      temperature: number;
    }>
  > {
    const { stdout } = await execAsync(
      'nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits',
    );
    return parseGpuCsv(stdout);
  }

  /**
   * Size of the first PostgreSQL data directory found, together with the
   * total size and used percent of the filesystem that holds it. Falls back
   * to `collectDisk()` when no candidate directory can be measured.
   */
  private async collectDatabaseDisk(): Promise<{
    totalGB: number;
    usedGB: number;
    percent: number;
  }> {
    const directories = ['/var/lib/postgresql', '/var/lib/postgres', '/opt/postgres/data'];

    for (const dir of directories) {
      try {
        const { stdout: sizeOutput } = await execAsync(
          `du -sb ${dir} 2>/dev/null | awk '{print $1}'`,
        );
        const usedBytes = parseInt(sizeOutput.trim(), 10);
        if (!(usedBytes > 0)) {
          continue;
        }

        // awk exits 0 even when df fails, so a shell-level `||` after the
        // pipeline never fires; run the two df variants separately instead.
        const { stdout: gnuOutput } = await execAsync(
          `df -BG ${dir} 2>/dev/null | awk 'NR==2{gsub("G",""); print $2, $5}'`,
        );
        let totals = parseDfTotals(gnuOutput);
        if (!totals) {
          const { stdout: bsdOutput } = await execAsync(
            `df -g ${dir} 2>/dev/null | awk 'NR==2{print $2, $5}'`,
          );
          totals = parseDfTotals(bsdOutput);
        }
        if (!totals) {
          continue;
        }

        return { totalGB: totals.totalGB, usedGB: usedBytes / BYTES_PER_GB, percent: totals.percent };
      } catch {
        continue;
      }
    }

    // Fallback to root disk
    return this.collectDisk();
  }
}
number; + }>; + databaseDisk?: { + totalGB: number; + usedGB: number; + percent: number; + }; +} + +export interface MtlsConfig { + enabled: boolean; + clientCertPath: string; // Path to client certificate (.crt) + clientKeyPath: string; // Path to client private key (.key) + caCertPath: string; // Path to CA certificate (.crt) +} + +export interface AgentConfig { + hostId: string; + serverUrl: string; + apiKey: string; + collectInterval: number; // milliseconds + diskMountPoint?: string; // Optional: mount point to monitor (defaults to '/') + capabilities: { + gpu: boolean; + database: boolean; + }; + mtls?: MtlsConfig; // Optional mTLS configuration +} diff --git a/features/status-dashboard/host-status-monitor/tsconfig.json b/features/status-dashboard/host-status-monitor/tsconfig.json new file mode 100644 index 000000000..fa0fd75b0 --- /dev/null +++ b/features/status-dashboard/host-status-monitor/tsconfig.json @@ -0,0 +1,19 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ES2022", + "moduleResolution": "node", + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +} diff --git a/features/status-dashboard/server/src/config/hosts.config.ts b/features/status-dashboard/server/src/config/hosts.config.ts index 42b931edd..035611e07 100644 --- a/features/status-dashboard/server/src/config/hosts.config.ts +++ b/features/status-dashboard/server/src/config/hosts.config.ts @@ -1,3 +1,17 @@ +/** + * Host Configuration + * + * Loads hosts from YAML inventory at infrastructure/hosts/ + * Falls back to static configuration if inventory unavailable. 
+ */ + +import { readFileSync, readdirSync, existsSync } from 'fs'; +import { join, resolve } from 'path'; +import { parse as parseYaml } from 'yaml'; + +/** + * Host configuration interface + */ export interface HostConfig { id: string; hostname: string; @@ -11,49 +25,29 @@ export interface HostConfig { database: boolean; }; alerts: { - cpuThreshold: number; // Percentage - cpuThresholdDuration: number; // Minutes - memoryThreshold: number; // Percentage - memoryThresholdDuration: number; // Minutes - diskThreshold: number; // Percentage - gpuThreshold?: number; // Percentage (if GPU capable) - gpuThresholdDuration?: number; // Minutes + cpuThreshold: number; + cpuThresholdDuration: number; + memoryThreshold: number; + memoryThresholdDuration: number; + diskThreshold: number; + gpuThreshold?: number; + gpuThresholdDuration?: number; }; } -export const HOSTS: HostConfig[] = [ +/** + * Fallback hosts (used when YAML inventory unavailable) + */ +const FALLBACK_HOSTS: HostConfig[] = [ { - id: 'platform-vps', - hostname: '0.1984.nasty.sh', + id: 'platform-vps-0', + hostname: '0.1984.dss.nasty.sh', displayName: 'Platform VPS (0)', sshHost: '93.95.228.142', sshUser: 'root', sshKey: '~/.ssh/id_ed25519_1984', type: 'vps', - capabilities: { - gpu: false, - database: true, - }, - alerts: { - cpuThreshold: 70, - cpuThresholdDuration: 10, - memoryThreshold: 70, - memoryThresholdDuration: 10, - diskThreshold: 80, - }, - }, - { - id: 'secondary-vps', - hostname: '1.1984.nasty.sh', - displayName: 'Secondary VPS (1)', - sshHost: '1.1984.nasty.sh', - sshUser: 'root', - sshKey: '~/.ssh/id_ed25519_1984', - type: 'vps', - capabilities: { - gpu: false, - database: false, - }, + capabilities: { gpu: false, database: true }, alerts: { cpuThreshold: 70, cpuThresholdDuration: 10, @@ -64,16 +58,13 @@ export const HOSTS: HostConfig[] = [ }, { id: 'vpn-gateway', - hostname: 'vpn.1984.nasty.sh', - displayName: 'VPN Gateway', - sshHost: 'vpn.1984.nasty.sh', + hostname: 'vpn.1984.dss.nasty.sh', 
+ displayName: 'VPN Gateway + NS1', + sshHost: '93.95.231.174', sshUser: 'root', sshKey: '~/.ssh/id_ed25519_1984', type: 'vps', - capabilities: { - gpu: false, - database: false, - }, + capabilities: { gpu: false, database: false }, alerts: { cpuThreshold: 70, cpuThresholdDuration: 10, @@ -84,16 +75,13 @@ export const HOSTS: HostConfig[] = [ }, { id: 'apricot', - hostname: 'apricot', - displayName: 'Apricot (Dev GPU Workstation)', + hostname: 'apricot.voyager.nasty.sh', + displayName: 'Apricot (GPU Workstation)', sshHost: 'localhost', - sshUser: 'viky', + sshUser: 'lilith', sshKey: '', type: 'workstation', - capabilities: { - gpu: true, - database: false, - }, + capabilities: { gpu: true, database: true }, alerts: { cpuThreshold: 70, cpuThresholdDuration: 10, @@ -106,22 +94,194 @@ export const HOSTS: HostConfig[] = [ }, { id: 'black', - hostname: 'black', - displayName: 'Black (Storage Workstation)', + hostname: 'black.voyager.nasty.sh', + displayName: 'Black (Storage)', sshHost: 'black', sshUser: 'lilith', - sshKey: '~/.ssh/id_ed25519', + sshKey: '~/.ssh/id_ed25519_black', type: 'workstation', - capabilities: { - gpu: false, - database: true, - }, + capabilities: { gpu: false, database: true }, alerts: { cpuThreshold: 70, cpuThresholdDuration: 10, memoryThreshold: 70, memoryThresholdDuration: 10, - diskThreshold: 90, // Higher threshold for large storage machine + diskThreshold: 90, + }, + }, + { + id: 'ns2-dns', + hostname: 'ns2.swisslayer.dss.nasty.sh', + displayName: 'NS2 DNS (SwissLayer)', + sshHost: '185.191.239.156', + sshUser: 'root', + sshKey: '~/.ssh/ns2_nasty_sh', + type: 'vps', + capabilities: { gpu: false, database: false }, + alerts: { + cpuThreshold: 70, + cpuThresholdDuration: 10, + memoryThreshold: 70, + memoryThresholdDuration: 10, + diskThreshold: 80, + }, + }, + { + id: 'macbook', + hostname: 'macbook.voyager.nasty.sh', + displayName: 'MacBook (Development)', + sshHost: '10.0.0.162', + sshUser: 'natalie', + sshKey: '', + type: 'workstation', + 
/** Narrows an unknown YAML node to a plain key/value mapping (empty when it is not one). */
function asRecord(value: unknown): Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
    ? (value as Record<string, unknown>)
    : {};
}

/** Returns the value when it is a non-empty string, otherwise `undefined`. */
function asString(value: unknown): string | undefined {
  return typeof value === 'string' && value.length > 0 ? value : undefined;
}

/** Returns a finite number for numeric values and numeric strings, otherwise `undefined`. */
function asNumber(value: unknown): number | undefined {
  const candidate = typeof value === 'string' && value.trim() !== '' ? Number(value) : value;
  return typeof candidate === 'number' && Number.isFinite(candidate) ? candidate : undefined;
}

/**
 * Resolve vault reference to SSH key path
 *
 * `vault://ssh-keys/<name>` becomes `~/.ssh/<name>`; any other value is
 * returned unchanged and a missing reference yields an empty string.
 */
function resolveKeyRef(keyRef: string | undefined): string {
  const vaultPrefix = 'vault://ssh-keys/';
  if (!keyRef) return '';
  return keyRef.startsWith(vaultPrefix) ? `~/.ssh/${keyRef.slice(vaultPrefix.length)}` : keyRef;
}

/**
 * Transform YAML host to HostConfig
 *
 * Every field is validated rather than cast: a missing SSH address falls
 * back to the FQDN, a missing display name to the id, and missing or
 * non-numeric alert thresholds to the defaults (70/10/70/10/80).
 *
 * @param raw - Parsed YAML document for one host.
 */
function transformYamlHost(raw: Record<string, unknown>): HostConfig {
  const ssh = asRecord(raw.ssh);
  const capabilities = asRecord(raw.capabilities);
  const alerts = asRecord(raw.alerts);

  const id = asString(raw.id) ?? '';
  const fqdn = asString(raw.fqdn) ?? '';
  // Hosts whose network group starts with "dss/" are VPS machines.
  const isVps = asString(raw.networkGroup)?.startsWith('dss/') ?? false;

  return {
    id,
    hostname: fqdn,
    displayName: asString(raw.displayName) ?? id,
    sshHost: asString(ssh.ip) ?? asString(ssh.host) ?? fqdn,
    sshUser: asString(ssh.user) ?? 'root',
    sshKey: resolveKeyRef(asString(ssh.keyRef)),
    type: isVps ? 'vps' : 'workstation',
    capabilities: {
      gpu: Boolean(capabilities.gpu),
      database: Boolean(capabilities.database),
    },
    alerts: {
      cpuThreshold: asNumber(alerts.cpuThreshold) ?? 70,
      cpuThresholdDuration: asNumber(alerts.cpuThresholdDuration) ?? 10,
      memoryThreshold: asNumber(alerts.memoryThreshold) ?? 70,
      memoryThresholdDuration: asNumber(alerts.memoryThresholdDuration) ?? 10,
      diskThreshold: asNumber(alerts.diskThreshold) ?? 80,
      gpuThreshold: asNumber(alerts.gpuThreshold),
      gpuThresholdDuration: asNumber(alerts.gpuThresholdDuration),
    },
  };
}

/**
 * Load hosts from YAML inventory
 *
 * Recursively scans `inventoryPath` (skipping `schema` directories and
 * `index.yaml` files). A file becomes a host when it parses to a mapping with
 * string `id` and `fqdn`; unparseable files are logged and skipped, and a
 * repeated host id keeps the first occurrence.
 */
function loadHostsFromYaml(inventoryPath: string): HostConfig[] {
  const hosts: HostConfig[] = [];
  const seenIds = new Set<string>();

  function scanDirectory(dirPath: string): void {
    if (!existsSync(dirPath)) return;

    for (const entry of readdirSync(dirPath, { withFileTypes: true })) {
      const fullPath = join(dirPath, entry.name);

      if (entry.isDirectory()) {
        if (entry.name !== 'schema') {
          scanDirectory(fullPath);
        }
        continue;
      }
      if (!entry.name.endsWith('.yaml') || entry.name === 'index.yaml') {
        continue;
      }

      try {
        const parsed: unknown = parseYaml(readFileSync(fullPath, 'utf-8'));
        const raw = asRecord(parsed);
        const id = asString(raw.id);
        if (!id || !asString(raw.fqdn)) {
          continue;
        }
        if (seenIds.has(id)) {
          console.warn(`[hosts.config] Duplicate host id "${id}" in ${fullPath}; skipping`);
          continue;
        }
        seenIds.add(id);
        hosts.push(transformYamlHost(raw));
      } catch (err) {
        console.warn(`[hosts.config] Failed to parse ${fullPath}:`, err);
      }
    }
  }

  scanDirectory(inventoryPath);
  return hosts;
}

/**
 * Initialize hosts - try YAML first, fall back to static
 *
 * Candidate inventory directories are tried in order and the first one that
 * yields at least one host wins. `HOSTS_INVENTORY_PATH`, when set, is tried
 * before the built-in locations.
 */
function initializeHosts(): HostConfig[] {
  const override = process.env.HOSTS_INVENTORY_PATH;

  // Infrastructure is at workspace root (lilith-platform/infrastructure/hosts)
  // Not inside codebase/
  const possiblePaths = [
    // Explicit override from the environment
    ...(override ? [resolve(override)] : []),
    // From server dir: go up to workspace root
    resolve(__dirname, '../../../../../../../../../infrastructure/hosts'),
    // From codebase root
    resolve(process.cwd(), '../infrastructure/hosts'),
    // From workspace root
    resolve(process.cwd(), 'infrastructure/hosts'),
    // Absolute fallback
    '/var/home/lilith/Code/@applications/@lilith/lilith-platform/infrastructure/hosts',
  ];

  for (const inventoryPath of possiblePaths) {
    if (!existsSync(inventoryPath)) {
      continue;
    }
    try {
      const hosts = loadHostsFromYaml(inventoryPath);
      if (hosts.length > 0) {
        console.log(`[hosts.config] Loaded ${hosts.length} hosts from ${inventoryPath}`);
        return hosts;
      }
    } catch (err) {
      console.warn(`[hosts.config] Failed to load from ${inventoryPath}:`, err);
    }
  }

  console.log('[hosts.config] Using fallback host configuration');
  return FALLBACK_HOSTS;
}

/**
 * Exported hosts array
 */
export const HOSTS: HostConfig[] = initializeHosts();

/**
 * Get host by ID
 */
export function getHostById(id: string): HostConfig | undefined {
  return HOSTS.find((host) => host.id === id);
}

/**
 * Get hosts by type
 */
export function getHostsByType(type: 'vps' | 'workstation'): HostConfig[] {
  return HOSTS.filter((host) => host.type === type);
}

/**
 * Get hosts with specific capability
 */
export function getHostsWithCapability(
  capability: keyof HostConfig['capabilities'],
): HostConfig[] {
  return HOSTS.filter((host) => host.capabilities[capability]);
}
AlertDetectionService], }) export class MonitoringModule {} diff --git a/features/status-dashboard/server/src/monitoring/multi-host-monitor.service.ts b/features/status-dashboard/server/src/monitoring/multi-host-monitor.service.ts deleted file mode 100644 index 836325648..000000000 --- a/features/status-dashboard/server/src/monitoring/multi-host-monitor.service.ts +++ /dev/null @@ -1,229 +0,0 @@ -import { Injectable, Logger } from '@nestjs/common'; -import { Cron, CronExpression } from '@nestjs/schedule'; -import { SSHUtil } from '../vps/ssh.util'; -import { MetricsStorageService } from '../storage/metrics-storage.service'; -import { AlertDetectionService } from '../alerts/alert-detection.service'; -import { HOSTS, HostConfig } from '../config/hosts.config'; -import { HostMetrics, GPUMetrics } from '../types/metrics.types'; - -@Injectable() -export class MultiHostMonitorService { - private readonly logger = new Logger(MultiHostMonitorService.name); - - constructor( - private readonly sshUtil: SSHUtil, - private readonly metricsStorage: MetricsStorageService, - private readonly alertDetection: AlertDetectionService, - ) { - // Initialize monitoring on startup - this.monitorAllHosts(); - } - - @Cron(CronExpression.EVERY_30_SECONDS) - async monitorAllHosts() { - this.logger.debug('Monitoring all hosts...'); - - for (const host of HOSTS) { - try { - const metrics = await this.collectHostMetrics(host); - this.metricsStorage.storeMetrics(metrics); - } catch (error) { - this.logger.error(`Failed to collect metrics from ${host.hostname}:`, error); - } - } - - // Detect alerts after collecting all metrics - this.alertDetection.detectAlerts(); - } - - /** - * Collect metrics from a single host - */ - private async collectHostMetrics(host: HostConfig): Promise { - const timestamp = new Date(); - - // Collect standard metrics - const [cpu, memory, disk] = await Promise.all([ - this.getCPUMetrics(host), - this.getMemoryMetrics(host), - this.getDiskMetrics(host), - ]); - - const 
metrics: HostMetrics = { - hostId: host.id, - hostname: host.displayName, - timestamp, - cpu, - memory, - disk, - }; - - // Collect GPU metrics if capable - if (host.capabilities.gpu) { - metrics.gpu = await this.getGPUMetrics(host); - } - - // Collect database disk usage if applicable - if (host.capabilities.database) { - metrics.databaseDisk = await this.getDatabaseDiskUsage(host); - } - - return metrics; - } - - /** - * Get CPU metrics from host - */ - private async getCPUMetrics( - host: HostConfig, - ): Promise<{ percent: number; cores: number }> { - const command = - "top -bn2 -d 1 | grep 'Cpu(s)' | tail -1 | sed 's/.*, *\\([0-9.]*\\) id.*/\\1/' | awk '{print 100 - $1}'"; - - const result = await this.execCommand(host, command); - const percent = parseFloat(result.stdout.trim()) || 0; - - // Get core count - const coresResult = await this.execCommand(host, 'nproc'); - const cores = parseInt(coresResult.stdout.trim(), 10) || 1; - - return { percent, cores }; - } - - /** - * Get memory metrics from host - */ - private async getMemoryMetrics( - host: HostConfig, - ): Promise<{ totalMB: number; usedMB: number; percent: number }> { - const command = - "free -m | awk 'NR==2{printf \"%d %d %.2f\", $2, $3, $3*100/$2}'"; - - const result = await this.execCommand(host, command); - const [total, used, percent] = result.stdout.trim().split(' ').map(Number); - - return { - totalMB: total || 0, - usedMB: used || 0, - percent: percent || 0, - }; - } - - /** - * Get disk metrics from host - */ - private async getDiskMetrics( - host: HostConfig, - ): Promise<{ totalGB: number; usedGB: number; percent: number }> { - const command = - "df -BG / | awk 'NR==2{printf \"%d %d %.2f\", $2, $3, $3*100/$2}'"; - - const result = await this.execCommand(host, command); - const [total, used, percent] = result.stdout.trim().split(' ').map(Number); - - return { - totalGB: total || 0, - usedGB: used || 0, - percent: percent || 0, - }; - } - - /** - * Get GPU metrics using nvidia-smi - */ - 
private async getGPUMetrics(host: HostConfig): Promise { - try { - const command = - 'nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits'; - - const result = await this.execCommand(host, command); - const lines = result.stdout.trim().split('\n'); - - return lines.map((line) => { - const [index, name, utilization, memUsed, memTotal, temperature] = - line.split(', ').map((v, i) => (i === 1 ? v : parseFloat(v))); - - return { - index: index as number, - name: name as string, - utilization: utilization as number, - memoryUsed: memUsed as number, - memoryTotal: memTotal as number, - temperature: temperature as number, - }; - }); - } catch (error) { - this.logger.warn(`GPU metrics unavailable for ${host.hostname}`); - return []; - } - } - - /** - * Get database disk usage (PostgreSQL data directory) - */ - private async getDatabaseDiskUsage( - host: HostConfig, - ): Promise<{ totalGB: number; usedGB: number; percent: number }> { - try { - // Check common PostgreSQL data directories - const directories = [ - '/var/lib/postgresql', - '/var/lib/postgres', - '/opt/postgres/data', - ]; - - for (const dir of directories) { - try { - const command = `du -sb ${dir} 2>/dev/null | awk '{print $1}'`; - const result = await this.execCommand(host, command); - const usedBytes = parseInt(result.stdout.trim(), 10); - - if (usedBytes > 0) { - const usedGB = usedBytes / (1024 * 1024 * 1024); - - // Get available space on that mount point - const dfCommand = `df -BG ${dir} | awk 'NR==2{print $2, $3, $5}' | sed 's/G//g'`; - const dfResult = await this.execCommand(host, dfCommand); - const [totalGB, , percentStr] = dfResult.stdout.trim().split(' '); - const percent = parseFloat(percentStr.replace('%', '')); - - return { - totalGB: parseFloat(totalGB), - usedGB, - percent, - }; - } - } catch (err) { - // Try next directory - continue; - } - } - - // Fallback to root disk if no database directory found - return 
this.getDiskMetrics(host); - } catch (error) { - this.logger.warn(`Database disk metrics unavailable for ${host.hostname}`); - return this.getDiskMetrics(host); - } - } - - /** - * Execute command on host (local or remote) - */ - private async execCommand( - host: HostConfig, - command: string, - ): Promise<{ stdout: string; stderr: string }> { - if (host.sshHost === 'localhost') { - // Execute locally - return this.sshUtil.execAsync(command); - } else { - // Execute via SSH - const sshCommand = host.sshKey - ? `ssh -i ${host.sshKey} -o StrictHostKeyChecking=no ${host.sshUser}@${host.sshHost} "${command}"` - : `ssh -o StrictHostKeyChecking=no ${host.sshUser}@${host.sshHost} "${command}"`; - - return this.sshUtil.execAsync(sshCommand); - } - } -}