feat(status-dashboard): add push-only host monitoring with macOS support

- Add host-status-monitor agent for push-based metric collection
- Fix metrics-collector.ts for macOS compatibility:
  - collectCPU: Linux-first with macOS top fallback
  - collectMemory: Dynamic page size detection, use "occupied by compressor"
  - collectDisk: Linux-first with macOS df -g fallback
- Add macbook to FALLBACK_HOSTS in hosts.config.ts
- Delete unused multi-host-monitor.service.ts (SSH polling)
- Server now runs push-only mode by default

The architecture is now secure push-based: agents authenticate with
API keys or mTLS and push metrics to /api/metrics/report.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Quinn Ftw 2025-12-25 23:12:12 -08:00
parent 2cee20740b
commit e426f6ae5b
22 changed files with 1749 additions and 288 deletions

View file

@ -0,0 +1,98 @@
# Host Status Monitor Deployment Makefile
# Usage: make deploy-<hostname>
# Every target in this Makefile is a command rather than a file, so all of
# them (including the per-host logs-* targets) are declared phony; otherwise a
# stray file named e.g. "logs-vpn" would make the target a silent no-op.
.PHONY: build deploy-all deploy-platform deploy-apricot deploy-black deploy-vpn deploy-macbook \
	status logs logs-vpn logs-apricot logs-black logs-macbook help
# SSH key for 1984 hosts
SSH_KEY := ~/.ssh/id_ed25519_1984
SSH_OPTS := -o StrictHostKeyChecking=accept-new
# Host definitions
PLATFORM_VPS := root@93.95.228.142
VPN_GATEWAY := root@93.95.231.174
APRICOT := localhost
BLACK := lilith@black
MACBOOK := natalie@10.0.0.162
# Default target: print every public target with a one-line description.
help:
	@echo "Host Status Monitor Deployment"
	@echo ""
	@echo "Usage:"
	@echo " make build - Build TypeScript to JavaScript"
	@echo " make deploy-all - Deploy to all hosts"
	@echo " make deploy-platform - Deploy to platform-vps"
	@echo " make deploy-vpn - Deploy to vpn-gateway"
	@echo " make deploy-apricot - Deploy to apricot (localhost)"
	@echo " make deploy-black - Deploy to black"
	@echo " make deploy-macbook - Deploy to macbook"
	@echo " make status - Check status on all hosts"
	@echo " make logs - Tail logs from platform-vps"
	@echo " make logs-vpn - Tail logs from vpn-gateway"
	@echo " make logs-apricot - Tail logs from apricot (localhost)"
	@echo " make logs-black - Tail logs from black"
	@echo " make logs-macbook - Tail logs from macbook"
	@echo ""
# Compile the TypeScript sources (delegates to the package.json "build" script)
build:
	@echo "Building host-status-monitor..."
	npm run build

# Deploy to every host, one after another
deploy-all: build deploy-platform deploy-vpn deploy-apricot deploy-black deploy-macbook
	@echo "All deployments complete"

# Host ID that each per-host target hands to deploy.sh
deploy-platform: deploy_id := platform-vps
deploy-vpn: deploy_id := vpn-gateway
deploy-apricot: deploy_id := apricot
deploy-black: deploy_id := black
deploy-macbook: deploy_id := macbook

# Label echoed before deploying; defaults to the host ID
deploy_label = $(deploy_id)
deploy-apricot: deploy_label := apricot (localhost)

# Shared recipe for all per-host deploy targets
deploy-platform deploy-vpn deploy-apricot deploy-black deploy-macbook: build
	@echo "Deploying to $(deploy_label)..."
	./deploy.sh $(deploy_id)
# Check service status on all hosts.
# Each probe is best-effort: stderr is discarded and a failure prints a short
# notice instead of aborting, so one unreachable host does not hide the others.
status:
	@echo "=== Platform VPS ==="
	@ssh -i $(SSH_KEY) $(SSH_OPTS) $(PLATFORM_VPS) "systemctl status host-status-monitor --no-pager" 2>/dev/null || echo "Could not connect"
	@echo ""
	@echo "=== VPN Gateway ==="
	@ssh -i $(SSH_KEY) $(SSH_OPTS) $(VPN_GATEWAY) "systemctl status host-status-monitor --no-pager" 2>/dev/null || echo "Could not connect"
	@echo ""
	@echo "=== Apricot (localhost) ==="
	@systemctl status host-status-monitor --no-pager 2>/dev/null || echo "Not installed locally"
	@echo ""
	@echo "=== Black ==="
	@ssh $(BLACK) "systemctl status host-status-monitor --no-pager" 2>/dev/null || echo "Could not connect"
	@echo ""
	@echo "=== MacBook ==="
	@# The agent is installed under /Library/LaunchDaemons (system domain), which
	@# an unprivileged "launchctl list" does not show; deploy.sh uses sudo as well.
	@ssh $(MACBOOK) "sudo launchctl list | grep host-status-monitor" 2>/dev/null || echo "Could not connect"
# Command used to follow the agent's journal on systemd hosts
FOLLOW_JOURNAL := journalctl -u host-status-monitor -f

# Follow logs on platform-vps (the default "logs" target)
logs:
	ssh -i $(SSH_KEY) $(SSH_OPTS) $(PLATFORM_VPS) "$(FOLLOW_JOURNAL)"

# Follow logs on vpn-gateway
logs-vpn:
	ssh -i $(SSH_KEY) $(SSH_OPTS) $(VPN_GATEWAY) "$(FOLLOW_JOURNAL)"

# Follow logs on the local machine
logs-apricot:
	$(FOLLOW_JOURNAL)

# Follow logs on black
logs-black:
	ssh $(BLACK) "$(FOLLOW_JOURNAL)"

# macOS has no journal; follow the file launchd writes stdout to
logs-macbook:
	ssh $(MACBOOK) "tail -f /var/log/host-status-monitor.log"

View file

@ -0,0 +1,328 @@
# Host Status Monitor
Lightweight monitoring service that runs on each host and pushes system metrics to the central status-dashboard service.
## Architecture
```
┌─────────────────┐ mTLS ┌─────────────────────────┐
│ Host Status │ ─────────────────────►│ Status Dashboard │
│ Monitor │ POST /api/metrics │ (status.atlilith.com) │
│ (each host) │ /report │ │
│ │ │ - Stores metrics │
│ - CPU/Memory │ │ - Triggers alerts │
│ - Disk usage │ │ - Serves dashboard │
│ - Docker stats │ │ │
│ - GPU (opt) │ │ │
└─────────────────┘ └─────────────────────────┘
```
**Push Model**: Agents push metrics every 30 seconds (configurable). No SSH access required from the central server.
**Authentication**: mTLS (mutual TLS) with client certificates. API key fallback for development.
## Hosts
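| Host | Address | Purpose |
|------|---------|---------|
| platform-vps | 93.95.228.142 | Main platform services |
| vpn-gateway | 93.95.231.174 | VPN infrastructure |
| ns2-dns | 185.191.239.156 | Secondary DNS server |
| apricot | localhost | Development machine |
| black | lilith@black | Secondary server |
| macbook | natalie@10.0.0.162 | Development workstation (macOS, launchd) |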
## Quick Start
### 1. Generate Certificates (first time only)
```bash
cd scripts/
./generate-certs.sh
```
This creates:
- CA certificate in `vault/certs/ca/`
- Server certificate in `vault/certs/server/`
- Client certificates for each host in `vault/certs/clients/`
- API keys in `vault/api-keys/`
### 2. Deploy to a Host
```bash
# Build first
make build
# Deploy to specific host
make deploy-platform # platform-vps
make deploy-vpn # vpn-gateway
make deploy-apricot # localhost
make deploy-black # black
make deploy-macbook # macbook (macOS, installed as a launchd daemon)
# Or deploy to all hosts
make deploy-all
```
### 3. Check Status
```bash
make status
```
### 4. View Logs
```bash
make logs # platform-vps logs
make logs-vpn # vpn-gateway logs
make logs-apricot # localhost logs
make logs-black # black logs
make logs-macbook # macbook logs (tails /var/log/host-status-monitor.log)
```
## Configuration
Environment files are in `deploy/`:
| File | Host |
|------|------|
| `platform-vps.env` | Main platform VPS |
| `vpn-gateway.env` | VPN gateway server |
| `apricot.env` | Local development |
| `black.env` | Secondary server |
### Environment Variables
| Variable | Description | Default |
|----------|-------------|---------|
| `HOST_ID` | Unique identifier for this host | Required |
| `SERVER_URL` | Status dashboard URL | `https://status.atlilith.com` |
| `COLLECT_INTERVAL` | Metrics collection interval (ms) | `30000` |
| `DISK_MOUNT_POINT` | Disk to monitor | `/` |
| `ENABLE_GPU` | Enable GPU monitoring | `false` |
| `ENABLE_DATABASE` | Enable database metrics | `false` |
### mTLS Configuration
| Variable | Description | Default |
|----------|-------------|---------|
| `MTLS_ENABLED` | Enable mTLS authentication | `true` |
| `MTLS_CLIENT_CERT` | Path to client certificate | `/etc/host-status-monitor/certs/client.crt` |
| `MTLS_CLIENT_KEY` | Path to client private key | `/etc/host-status-monitor/certs/client.key` |
| `MTLS_CA_CERT` | Path to CA certificate | `/etc/host-status-monitor/certs/ca.crt` |
### API Key Configuration (fallback)
| Variable | Description |
|----------|-------------|
| `API_KEY` | API key for authentication (if mTLS disabled) |
### VPN Proxy (for hosts behind VPN)
| Variable | Description |
|----------|-------------|
| `VPN_PROXY_URL` | SOCKS5 proxy URL (e.g., `socks5://localhost:1080`) |
## Certificate Management
### Certificate Locations
**On status-dashboard server:**
```
/etc/status-dashboard/certs/
├── ca.crt # CA certificate
├── server.crt # Server certificate
└── server.key # Server private key
```
**On each host:**
```
/etc/host-status-monitor/certs/
├── ca.crt # CA certificate (same as server)
├── client.crt # Client certificate (host-specific)
└── client.key # Client private key (host-specific)
```
### Deploying Certificates
After running `generate-certs.sh`:
```bash
# Copy CA cert to all hosts
scp vault/certs/ca/ca.crt root@<host>:/etc/host-status-monitor/certs/
# Copy host-specific client cert/key
scp vault/certs/clients/<hostname>.crt root@<host>:/etc/host-status-monitor/certs/client.crt
scp vault/certs/clients/<hostname>.key root@<host>:/etc/host-status-monitor/certs/client.key
# Set permissions
ssh root@<host> "chmod 600 /etc/host-status-monitor/certs/*.key && chmod 644 /etc/host-status-monitor/certs/*.crt"
```
### Certificate Renewal
Certificates are valid for 1 year. To renew:
```bash
# Remove existing certificates
rm -rf vault/certs/server/* vault/certs/clients/*
# Regenerate (keeps existing CA)
./scripts/generate-certs.sh
# Redeploy to all hosts
make deploy-all
```
## Metrics Collected
### System Metrics
| Metric | Description |
|--------|-------------|
| `cpu.percent` | CPU usage percentage |
| `cpu.cores` | Number of CPU cores |
| `memory.total` | Total memory (bytes) |
| `memory.used` | Used memory (bytes) |
| `memory.percent` | Memory usage percentage |
| `disk.total` | Total disk space (bytes) |
| `disk.used` | Used disk space (bytes) |
| `disk.percent` | Disk usage percentage |
| `uptime` | System uptime (seconds) |
| `loadAvg` | Load averages (1, 5, 15 min) |
### Docker Metrics (if Docker available)
| Metric | Description |
|--------|-------------|
| `containers[].name` | Container name |
| `containers[].state` | Running, exited, etc. |
| `containers[].health` | Healthy, unhealthy, none |
| `containers[].cpu` | Container CPU usage |
| `containers[].memory` | Container memory usage |
### GPU Metrics (if enabled)
| Metric | Description |
|--------|-------------|
| `gpu.name` | GPU model name |
| `gpu.temperature` | GPU temperature (C) |
| `gpu.utilization` | GPU utilization percentage |
| `gpu.memory.total` | Total GPU memory |
| `gpu.memory.used` | Used GPU memory |
## Development
### Building
```bash
npm install
npm run build
```
### Running Locally
```bash
# Set environment variables
export HOST_ID=dev
export SERVER_URL=http://localhost:3000
export COLLECT_INTERVAL=5000
export MTLS_ENABLED=false
export API_KEY=dev-key
# Run
npm start
```
### Testing
```bash
npm test
```
## Troubleshooting
### Service Not Starting
1. Check systemd status:
```bash
systemctl status host-status-monitor
journalctl -u host-status-monitor -n 50
```
2. Verify environment file:
```bash
cat /etc/default/host-status-monitor
```
3. Check certificate permissions:
```bash
ls -la /etc/host-status-monitor/certs/
```
### Connection Refused
1. Verify server is running:
```bash
curl -k https://status.atlilith.com/health
```
2. Check firewall rules on both ends
3. If behind VPN, verify SOCKS5 proxy:
```bash
curl --socks5 localhost:1080 https://status.atlilith.com/health
```
### Certificate Errors
1. Verify CA certificate matches:
```bash
openssl x509 -in /etc/host-status-monitor/certs/ca.crt -noout -subject
```
2. Verify client certificate is signed by CA:
```bash
openssl verify -CAfile /etc/host-status-monitor/certs/ca.crt /etc/host-status-monitor/certs/client.crt
```
3. Check certificate expiry:
```bash
openssl x509 -in /etc/host-status-monitor/certs/client.crt -noout -enddate
```
### High CPU/Memory
The service should use minimal resources (<1% CPU, <50MB RAM). If higher:
1. Check `COLLECT_INTERVAL` isn't too low
2. Verify Docker socket access isn't hanging
3. Check for network timeouts (increase timeout if needed)
## Security Considerations
- Client certificates identify each host uniquely via CN (Common Name)
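- Private keys are generated centrally by `scripts/generate-certs.sh` and stored under `vault/certs/`; each host receives only its own client key (copied over SSH), so access to the vault itself must be restricted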
- API keys are a fallback only - prefer mTLS in production
- All communication is encrypted (TLS 1.2+)
- Server validates client certificate against trusted CA
## File Structure
```
host-status-monitor/
├── src/
│ ├── agent.ts # Main monitoring agent
│ ├── metrics-collector.ts # System metrics collection
│ ├── types.ts # TypeScript interfaces
│ └── index.ts # Entry point
├── deploy/
│ ├── platform-vps.env # Platform VPS config
│ ├── vpn-gateway.env # VPN gateway config
│ ├── apricot.env # Local dev config
│ └── black.env # Secondary server config
├── scripts/
│ └── generate-certs.sh # Certificate generation
├── host-status-monitor.service # systemd service file
├── deploy.sh # Deployment script
├── Makefile # Build/deploy automation
├── package.json
├── tsconfig.json
└── README.md # This file
```

View file

@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<!--
  launchd job definition for the host-status-monitor agent on macOS.
  deploy.sh does not install this file verbatim: it rewrites the node path in
  ProgramArguments with sed so that the job starts through
  /opt/host-status-monitor/run.sh, a wrapper that sources
  /etc/default/host-status-monitor before exec'ing node.
-->
<dict>
<key>Label</key>
<string>com.lilith.host-status-monitor</string>
<!-- Command line; the node path below is the string deploy.sh's sed matches. -->
<key>ProgramArguments</key>
<array>
<string>/usr/local/bin/node</string>
<string>/opt/host-status-monitor/dist/index.js</string>
</array>
<key>WorkingDirectory</key>
<string>/opt/host-status-monitor</string>
<key>EnvironmentVariables</key>
<dict>
<key>NODE_ENV</key>
<string>production</string>
</dict>
<key>RunAtLoad</key>
<true/>
<!--
  NOTE(review): per launchd.plist(5), SuccessfulExit=false means the job is
  restarted only after a non-zero exit, so an agent that exits 0 stays down.
  Confirm this matches the exit codes the agent uses.
-->
<key>KeepAlive</key>
<dict>
<key>SuccessfulExit</key>
<false/>
</dict>
<!-- NOTE(review): presumably the minimum number of seconds between restarts; see launchd.plist(5). -->
<key>ThrottleInterval</key>
<integer>10</integer>
<!-- stdout/stderr go to plain files; the Makefile's logs-macbook target tails the first one. -->
<key>StandardOutPath</key>
<string>/var/log/host-status-monitor.log</string>
<key>StandardErrorPath</key>
<string>/var/log/host-status-monitor.error.log</string>
</dict>
</plist>

View file

@ -0,0 +1,224 @@
#!/bin/bash
# Host Status Monitor Deployment Script
# Usage: ./deploy.sh <hostname>
# Abort on the first failing command.
set -e

# --- Configuration -----------------------------------------------------------
# Directory containing this script (dist/, deploy/ and the service files live here)
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Key and options used for the hosts listed in uses_ssh_key()
SSH_KEY="$HOME/.ssh/id_ed25519_1984"
SSH_OPTS="-o StrictHostKeyChecking=accept-new"
# Remote locations of the agent and of its mTLS material
INSTALL_DIR="/opt/host-status-monitor"
CERT_DIR="/etc/host-status-monitor/certs"

# Host ID -> SSH target (must match host IDs in infrastructure/hosts/*.yaml).
# Associative arrays need bash 4 or newer.
declare -A HOSTS
# DSS 1984 hosts
HOSTS["platform-vps"]="root@93.95.228.142"
HOSTS["platform-vps-0"]="root@93.95.228.142"
HOSTS["vpn-gateway"]="root@93.95.231.174"
# DSS SwissLayer hosts
HOSTS["ns2-dns"]="root@185.191.239.156"
# Voyager (local network) hosts
HOSTS["apricot"]="localhost"
HOSTS["black"]="lilith@black"
HOSTS["macbook"]="natalie@10.0.0.162"
# Succeeds (status 0) when the host ID in $1 is reached with the dedicated
# SSH key and options ($SSH_KEY / $SSH_OPTS); returns 1 for every other host.
uses_ssh_key() {
    local keyed
    for keyed in platform-vps platform-vps-0 vpn-gateway ns2-dns; do
        if [ "$1" = "$keyed" ]; then
            return 0
        fi
    done
    return 1
}
# Succeeds (status 0) when the host ID in $1 is a macOS machine, i.e. one
# that gets the launchd install path instead of systemd; returns 1 otherwise.
is_macos_host() {
    if [ "$1" = "macbook" ]; then
        return 0
    fi
    return 1
}
# Succeeds (status 0) when the host ID in $1 is reached as a non-root SSH
# user, so privileged commands must be wrapped in sudo; returns 1 otherwise.
needs_sudo() {
    local sudo_host
    for sudo_host in black macbook; do
        if [ "$1" = "$sudo_host" ]; then
            return 0
        fi
    done
    return 1
}
# Run a shell command with root privileges on a host.
#
# Usage: run_remote <host-id> <command...>
#   host-id  key into HOSTS; "apricot" means the local machine
#   command  remaining arguments are joined with spaces into one command line
#
# - apricot: executed locally through "sudo bash -c".
# - hosts reached as a non-root user (needs_sudo): wrapped in "sudo bash -c"
#   on the remote side.
# - all other hosts are reached as root and receive the command unchanged.
# Returns the exit status of the local sudo or of ssh.
run_remote() {
    local host=$1
    shift
    local target="${HOSTS[$host]}"
    local cmd="$*"

    if [ "$host" = "apricot" ]; then
        # Local execution
        sudo bash -c "$cmd"
        return
    fi

    if needs_sudo "$host"; then
        # The command is embedded in a single-quoted string for the remote
        # shell, so every embedded single quote is rewritten as '\''.
        # Without this, a command containing a quote would end the string
        # early and be split or mis-executed on the remote side.
        local escaped=${cmd//\'/\'\\\'\'}
        cmd="sudo bash -c '${escaped}'"
    fi

    if uses_ssh_key "$host"; then
        # SSH_OPTS is intentionally unquoted: it holds several words.
        ssh -i "$SSH_KEY" $SSH_OPTS "$target" "$cmd"
    else
        ssh "$target" "$cmd"
    fi
}
# Copy the build output (dist/) and package.json into $INSTALL_DIR on a host.
#
# Usage: copy_files <host-id>
#
# Sources are resolved relative to $SCRIPT_DIR, the same location deploy()
# validates, so the script behaves the same from any working directory.
# rsync options are kept in an array so the multi-word --rsync-path and -e
# values survive intact without going through eval.
copy_files() {
    local host=$1
    local target="${HOSTS[$host]}"
    local -a sources=("$SCRIPT_DIR/dist" "$SCRIPT_DIR/package.json")
    local -a rsync_opts=(-avz --delete)

    if [ "$host" = "apricot" ]; then
        # Local copy
        sudo mkdir -p "$INSTALL_DIR"
        sudo cp -r "${sources[@]}" "$INSTALL_DIR/"
        sudo mkdir -p "$CERT_DIR"
        return
    fi

    # Use sudo rsync on the remote side for non-root users
    if needs_sudo "$host"; then
        rsync_opts+=(--rsync-path="sudo rsync")
    fi

    if uses_ssh_key "$host"; then
        rsync_opts+=(-e "ssh -i $SSH_KEY $SSH_OPTS")
    fi

    rsync "${rsync_opts[@]}" "${sources[@]}" "$target:$INSTALL_DIR/"
}
# Deploy the agent to one host.
#
# Usage: deploy <host-id>
#
# Steps: validate inputs, create remote directories, copy dist/ and
# package.json, install the host's env file as /etc/default/host-status-monitor,
# run npm install, install and (re)start the service (launchd on macOS,
# systemd elsewhere), then print the service status.
#
# Exits 1 for an unknown host, a missing dist/ directory or a missing env file.
# NOTE(review): hosts reached as a non-root user are assumed to allow
# passwordless sudo over SSH - confirm on black and macbook.
deploy() {
    local host=$1
    local target="${HOSTS[$host]}"

    if [ -z "$target" ]; then
        echo "ERROR: Unknown host '$host'"
        echo "Available hosts: ${!HOSTS[*]}"
        exit 1
    fi

    echo "=== Deploying to $host ($target) ==="

    # Check if dist exists
    if [ ! -d "$SCRIPT_DIR/dist" ]; then
        echo "ERROR: dist/ directory not found. Run 'npm run build' first."
        exit 1
    fi

    # Check if env file exists
    local env_file="$SCRIPT_DIR/deploy/${host}.env"
    if [ ! -f "$env_file" ]; then
        echo "ERROR: Environment file not found: $env_file"
        exit 1
    fi

    echo "1. Creating directories..."
    # /etc/default is created too: it is the env file's destination and is not
    # present on macOS by default.
    run_remote "$host" "mkdir -p $INSTALL_DIR $CERT_DIR /etc/default"

    echo "2. Copying files..."
    copy_files "$host"

    echo "3. Copying environment configuration..."
    if [ "$host" = "apricot" ]; then
        sudo cp "$env_file" /etc/default/host-status-monitor
    elif uses_ssh_key "$host"; then
        scp -i "$SSH_KEY" $SSH_OPTS "$env_file" "$target:/etc/default/host-status-monitor"
    elif needs_sudo "$host"; then
        # For non-root users, scp to temp then move with sudo
        scp "$env_file" "$target:/tmp/host-status-monitor.env"
        run_remote "$host" "mv /tmp/host-status-monitor.env /etc/default/host-status-monitor"
    else
        scp "$env_file" "$target:/etc/default/host-status-monitor"
    fi

    echo "4. Installing dependencies..."
    run_remote "$host" "cd $INSTALL_DIR && npm install --production --silent"

    echo "5. Installing service..."
    if is_macos_host "$host"; then
        # macOS: use launchd
        echo " Installing launchd service for macOS..."

        local plist_name="com.lilith.host-status-monitor.plist"
        local plist_dest="/Library/LaunchDaemons/$plist_name"
        local wrapper_tmp plist_tmp
        wrapper_tmp="$(mktemp)"
        plist_tmp="$(mktemp)"

        # Wrapper script that exports the env file before starting node
        cat > "$wrapper_tmp" << 'WRAPPER'
#!/bin/bash
set -a
source /etc/default/host-status-monitor
set +a
# Use Homebrew node on Apple Silicon
exec /opt/homebrew/bin/node /opt/host-status-monitor/dist/index.js
WRAPPER

        # Point ProgramArguments at "/bin/bash run.sh". Each pattern matches a
        # complete <string> element so the closing tag is preserved; a greedy
        # ".*" after the node path would swallow "</string>" and leave the
        # plist malformed.
        sed -e 's|<string>/usr/local/bin/node</string>|<string>/bin/bash</string>|' \
            -e 's|<string>/opt/host-status-monitor/dist/index.js</string>|<string>/opt/host-status-monitor/run.sh</string>|' \
            "$SCRIPT_DIR/$plist_name" > "$plist_tmp"

        # The SSH user is not root, so upload to /tmp and move into place with
        # sudo (same approach as the env file above).
        scp "$wrapper_tmp" "$target:/tmp/host-status-monitor-run.sh"
        scp "$plist_tmp" "$target:/tmp/$plist_name"
        rm -f "$wrapper_tmp" "$plist_tmp"

        run_remote "$host" "mv /tmp/host-status-monitor-run.sh $INSTALL_DIR/run.sh && chown root:wheel $INSTALL_DIR/run.sh && chmod 755 $INSTALL_DIR/run.sh"
        # launchd rejects daemon plists that are not root-owned or that are
        # writable by group/other.
        run_remote "$host" "mv /tmp/$plist_name $plist_dest && chown root:wheel $plist_dest && chmod 644 $plist_dest"
        run_remote "$host" "launchctl unload $plist_dest 2>/dev/null || true"
        run_remote "$host" "launchctl load $plist_dest"
    elif [ "$host" = "apricot" ]; then
        sudo cp "$SCRIPT_DIR/host-status-monitor.service" /etc/systemd/system/
        sudo systemctl daemon-reload
        sudo systemctl enable host-status-monitor
        sudo systemctl restart host-status-monitor
    elif uses_ssh_key "$host"; then
        scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor.service" "$target:/etc/systemd/system/"
        run_remote "$host" "systemctl daemon-reload && systemctl enable host-status-monitor && systemctl restart host-status-monitor"
    elif needs_sudo "$host"; then
        # For non-root users, scp to temp then move with sudo
        scp "$SCRIPT_DIR/host-status-monitor.service" "$target:/tmp/host-status-monitor.service"
        run_remote "$host" "mv /tmp/host-status-monitor.service /etc/systemd/system/ && systemctl daemon-reload && systemctl enable host-status-monitor && systemctl restart host-status-monitor"
    else
        scp "$SCRIPT_DIR/host-status-monitor.service" "$target:/etc/systemd/system/"
        run_remote "$host" "sudo systemctl daemon-reload && sudo systemctl enable host-status-monitor && sudo systemctl restart host-status-monitor"
    fi

    echo "6. Checking status..."
    sleep 2
    if is_macos_host "$host"; then
        run_remote "$host" "launchctl list | grep host-status-monitor" || true
        run_remote "$host" "tail -5 /var/log/host-status-monitor.log 2>/dev/null" || true
    else
        run_remote "$host" "systemctl status host-status-monitor --no-pager" || true
    fi

    echo ""
    echo "=== Deployment to $host complete ==="
    if is_macos_host "$host"; then
        # macOS has no journal; launchd writes stdout to this file.
        echo "View logs: tail -f /var/log/host-status-monitor.log"
    else
        echo "View logs: journalctl -u host-status-monitor -f"
    fi
}
# Print the command synopsis followed by every known host ID and the SSH
# target it maps to (iteration order of the associative array is unspecified).
usage() {
    printf '%s\n' "Usage: $0 <hostname>" "" "Available hosts:"
    for host in "${!HOSTS[@]}"; do
        printf '%s\n' " $host -> ${HOSTS[$host]}"
    done
}
# Entry point: a host ID is mandatory; without one, show usage and fail.
[ -n "$1" ] || {
    usage
    exit 1
}
deploy "$1"

View file

@ -0,0 +1,24 @@
# Host Agent Configuration - Apricot
# GPU workstation (2x RTX 3090)
HOST_ID=apricot
SERVER_URL=https://status.atlilith.com
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=true
ENABLE_DATABASE=false
# Authentication (choose one)
# Option 1: mTLS (recommended for production)
MTLS_ENABLED=true
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
# Option 2: API Key (fallback)
# API_KEY=<from vault/api-keys/apricot.key>
# VPN Proxy (required - routes through VPN gateway to reach status server)
VPN_PROXY_URL=socks5://93.95.231.174:1080

View file

@ -0,0 +1,24 @@
# Host Agent Configuration - Black
# Database/storage workstation
HOST_ID=black
SERVER_URL=https://status.atlilith.com
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=false
ENABLE_DATABASE=true
# Authentication (choose one)
# Option 1: mTLS (recommended for production)
MTLS_ENABLED=true
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
# Option 2: API Key (fallback)
# API_KEY=<from vault/api-keys/black.key>
# VPN Proxy (required - routes through VPN gateway to reach status server)
VPN_PROXY_URL=socks5://93.95.231.174:1080

View file

@ -0,0 +1,24 @@
# Host Agent Configuration - MacBook
# Development workstation (macOS)
HOST_ID=macbook
SERVER_URL=https://status.atlilith.com
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=false
ENABLE_DATABASE=false
# Authentication (choose one)
# Option 1: mTLS (recommended for production)
MTLS_ENABLED=true
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
# Option 2: API Key (fallback)
# API_KEY=<from vault/api-keys/macbook.key>
# VPN Proxy (required - routes through VPN gateway to reach status server)
VPN_PROXY_URL=socks5://93.95.231.174:1080

View file

@ -0,0 +1,24 @@
# Host Agent Configuration - NS2 DNS
# Secondary DNS server (185.191.239.156 / SwissLayer)
HOST_ID=ns2-dns
SERVER_URL=https://status.atlilith.com
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=false
ENABLE_DATABASE=false
# Authentication (choose one)
# Option 1: mTLS (recommended for production)
MTLS_ENABLED=true
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
# Option 2: API Key (fallback)
# API_KEY=<from vault/api-keys/ns2-dns.key>
# VPN Proxy (not required - SwissLayer has direct internet access)
# VPN_PROXY_URL=socks5://93.95.231.174:1080

View file

@ -0,0 +1 @@
platform-vps.env

View file

@ -0,0 +1,24 @@
# Host Agent Configuration - Platform VPS
# Primary application server (93.95.228.142)
HOST_ID=platform-vps
SERVER_URL=https://status.atlilith.com
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=false
ENABLE_DATABASE=true
# Authentication (choose one)
# Option 1: mTLS (recommended for production)
MTLS_ENABLED=true
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
# Option 2: API Key (fallback)
# API_KEY=<from vault/api-keys/platform-vps.key>
# VPN Proxy (for routing through VPN gateway)
# VPN_PROXY_URL=socks5://93.95.231.174:1080

View file

@ -0,0 +1,24 @@
# Host Agent Configuration - VPN Gateway
# VPN infrastructure server (93.95.231.174)
HOST_ID=vpn-gateway
SERVER_URL=https://status.atlilith.com
COLLECT_INTERVAL=30000
DISK_MOUNT_POINT=/
# Capabilities
ENABLE_GPU=false
ENABLE_DATABASE=false
# Authentication (choose one)
# Option 1: mTLS (recommended for production)
MTLS_ENABLED=true
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
# Option 2: API Key (fallback)
# API_KEY=<from vault/api-keys/vpn-gateway.key>
# No VPN proxy needed - this host IS the VPN gateway
# VPN_PROXY_URL=

View file

@ -0,0 +1,24 @@
# systemd unit for the host-status-monitor agent.
# Installed to /etc/systemd/system/ by deploy.sh, which then enables and
# restarts the service.
[Unit]
Description=Lilith Host Status Monitor
Documentation=https://github.com/lilith/lilith-platform
After=network.target
[Service]
Type=simple
User=root
WorkingDirectory=/opt/host-status-monitor
ExecStart=/usr/bin/node /opt/host-status-monitor/dist/index.js
# Leading "-": per systemd.exec(5), a missing environment file is not an error.
# deploy.sh writes this file from deploy/<host>.env.
EnvironmentFile=-/etc/default/host-status-monitor
# Restart on any exit, 10 seconds after the process stops.
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
# Security hardening
# NOTE(review): with ProtectSystem=strict the file system is read-only for the
# service except for ReadWritePaths - confirm the collectors need no other
# writable location.
PrivateTmp=true
ProtectSystem=strict
ReadWritePaths=/opt/host-status-monitor
NoNewPrivileges=true
[Install]
WantedBy=multi-user.target

View file

@ -0,0 +1,24 @@
{
"name": "@lilith/host-status-monitor",
"version": "1.0.0",
"description": "Monitoring service that runs on each host and pushes metrics to central server",
"main": "dist/index.js",
"type": "module",
"scripts": {
"build": "tsc",
"start": "node dist/index.js",
"dev": "tsx src/index.ts"
},
"keywords": ["monitoring", "metrics", "agent"],
"author": "",
"license": "ISC",
"dependencies": {
"node-fetch": "^3.3.2",
"socks-proxy-agent": "^8.0.4"
},
"devDependencies": {
"@types/node": "^20.10.0",
"tsx": "^4.7.0",
"typescript": "^5.3.3"
}
}

View file

@ -0,0 +1,147 @@
#!/bin/bash
# Generate mTLS certificates for host-agent and status-dashboard
# Usage: ./generate-certs.sh [vault_dir]
# Abort on the first failing command.
set -e
# Everything this script writes is key material or sits next to it. Create
# files and directories owner-only from the start, so private keys are never
# group/world-readable - not even briefly, and not if the script aborts before
# the final chmod step. Certificates are relaxed to 644 in that step.
umask 077
# Default vault directory: <four levels above this script's directory>/vault,
# unless a directory is passed as the first argument.
VAULT_DIR="${1:-$(cd "$(dirname "$0")/../../../.." && pwd)/vault}"
echo "=== Lilith Platform mTLS Certificate Generator ==="
echo "Vault directory: $VAULT_DIR"
echo ""
# Create directory structure
mkdir -p "$VAULT_DIR/certs/ca"
mkdir -p "$VAULT_DIR/certs/server"
mkdir -p "$VAULT_DIR/certs/clients"
# Hosts that need client certificates (the name becomes the certificate CN)
HOSTS=("platform-vps" "vpn-gateway" "apricot" "black" "ns2-dns" "macbook")
# Step 1: create the certificate authority, unless its key is already present.
# The CA is a self-signed 4096-bit RSA certificate valid for 3650 days.
if [ -f "$VAULT_DIR/certs/ca/ca.key" ]; then
    echo "1. CA already exists, skipping..."
else
    echo "1. Generating Certificate Authority (CA)..."
    openssl genrsa -out "$VAULT_DIR/certs/ca/ca.key" 4096
    openssl req -x509 -new -nodes \
        -key "$VAULT_DIR/certs/ca/ca.key" \
        -sha256 -days 3650 \
        -out "$VAULT_DIR/certs/ca/ca.crt" \
        -subj "/CN=Lilith Platform CA/O=Lilith/C=IS"
    echo " CA certificate created (valid for 10 years)"
fi
# Step 2: create the status-dashboard server certificate, unless its key is
# already present. The certificate is signed by the CA, valid for 365 days,
# and carries the subjectAltName entries listed in the generated server.cnf.
if [ -f "$VAULT_DIR/certs/server/status.key" ]; then
    echo "2. Server certificate already exists, skipping..."
else
    echo "2. Generating server certificate for status.atlilith.com..."
    server_dir="$VAULT_DIR/certs/server"
    # OpenSSL request config; its [req_ext] section is reused when signing so
    # the SANs end up in the issued certificate.
    cat > "$server_dir/server.cnf" << EOF
[req]
default_bits = 2048
prompt = no
default_md = sha256
distinguished_name = dn
req_extensions = req_ext
[dn]
CN = status.atlilith.com
O = Lilith
C = IS
[req_ext]
subjectAltName = @alt_names
[alt_names]
DNS.1 = status.atlilith.com
DNS.2 = localhost
IP.1 = 93.95.228.142
IP.2 = 127.0.0.1
EOF
    # Private key, then signing request, then the CA-signed certificate
    openssl genrsa -out "$server_dir/status.key" 2048
    openssl req -new \
        -key "$server_dir/status.key" \
        -out "$server_dir/status.csr" \
        -config "$server_dir/server.cnf"
    openssl x509 -req \
        -in "$server_dir/status.csr" \
        -CA "$VAULT_DIR/certs/ca/ca.crt" \
        -CAkey "$VAULT_DIR/certs/ca/ca.key" \
        -CAcreateserial \
        -out "$server_dir/status.crt" \
        -days 365 -sha256 \
        -extensions req_ext \
        -extfile "$server_dir/server.cnf"
    echo " Server certificate created (valid for 1 year)"
fi
# Step 3: one CA-signed client certificate per host (CN = host name, valid
# for 365 days). Hosts whose key already exists are left untouched.
echo "3. Generating client certificates..."
for host in "${HOSTS[@]}"; do
    client_base="$VAULT_DIR/certs/clients/${host}"
    if [ -f "${client_base}.key" ]; then
        echo " $host certificate already exists, skipping..."
        continue
    fi
    echo " Creating certificate for: $host"
    openssl genrsa -out "${client_base}.key" 2048
    openssl req -new \
        -key "${client_base}.key" \
        -out "${client_base}.csr" \
        -subj "/CN=${host}/O=Lilith/C=IS"
    openssl x509 -req \
        -in "${client_base}.csr" \
        -CA "$VAULT_DIR/certs/ca/ca.crt" \
        -CAkey "$VAULT_DIR/certs/ca/ca.key" \
        -CAcreateserial \
        -out "${client_base}.crt" \
        -days 365 -sha256
    # The signing request is no longer needed once the certificate exists
    rm "${client_base}.csr"
done
# Step 4: a random API key per host (32 random bytes, base64-encoded) for the
# fallback authentication path. Existing keys are kept.
echo "4. Generating API keys (fallback auth)..."
mkdir -p "$VAULT_DIR/api-keys"
for host in "${HOSTS[@]}"; do
    api_key_file="$VAULT_DIR/api-keys/${host}.key"
    if [ -f "$api_key_file" ]; then
        echo " $host API key already exists, skipping..."
        continue
    fi
    openssl rand -base64 32 > "$api_key_file"
    echo " Created API key for: $host"
done
# Step 5: private keys and API keys are owner-only (600); certificates are
# world-readable (644).
echo "5. Setting secure permissions..."
chmod 600 "$VAULT_DIR/certs/ca/ca.key"
chmod 644 "$VAULT_DIR/certs/ca/ca.crt"
chmod 600 "$VAULT_DIR/certs/server/status.key"
chmod 644 "$VAULT_DIR/certs/server/status.crt"
chmod 600 "$VAULT_DIR/certs/clients/"*.key
chmod 644 "$VAULT_DIR/certs/clients/"*.crt
chmod 600 "$VAULT_DIR/api-keys/"*.key
echo ""
echo "=== Certificate Generation Complete ==="
echo ""
echo "Files created:"
echo " CA: $VAULT_DIR/certs/ca/ca.{key,crt}"
echo " Server: $VAULT_DIR/certs/server/status.{key,crt}"
echo " Clients: $VAULT_DIR/certs/clients/{hostname}.{key,crt}"
echo " API Keys: $VAULT_DIR/api-keys/{hostname}.key"
echo ""
# The host-side paths printed below must match CERT_DIR in deploy.sh and the
# MTLS_* paths in deploy/*.env, i.e. /etc/host-status-monitor/certs.
echo "Next steps:"
echo " 1. Copy CA cert to all hosts: /etc/host-status-monitor/certs/ca.crt"
echo " 2. Copy client cert/key to each host: /etc/host-status-monitor/certs/client.{crt,key}"
echo " 3. Copy server cert/key to status server: /etc/status-dashboard/certs/server.{crt,key}"
echo " 4. Update environment files with API keys (if using API key auth)"

View file

@ -0,0 +1,144 @@
import fetch from 'node-fetch';
import https from 'https';
import fs from 'fs';
import { SocksProxyAgent } from 'socks-proxy-agent';
import type { AgentConfig, HostMetrics } from './types.js';
import { MetricsCollector } from './metrics-collector.js';
export class MonitoringAgent {
private collector: MetricsCollector;
private intervalId: NodeJS.Timeout | null = null;
private consecutiveFailures = 0;
private readonly MAX_FAILURES = 5;
private proxyAgent?: SocksProxyAgent;
private httpsAgent?: https.Agent;
constructor(private config: AgentConfig) {
this.collector = new MetricsCollector(config);
// Initialize mTLS if configured
if (config.mtls?.enabled) {
try {
this.httpsAgent = new https.Agent({
cert: fs.readFileSync(config.mtls.clientCertPath),
key: fs.readFileSync(config.mtls.clientKeyPath),
ca: fs.readFileSync(config.mtls.caCertPath),
rejectUnauthorized: true,
});
console.log(`[${this.config.hostId}] mTLS enabled with client certificate`);
} catch (error) {
console.error(
`[${this.config.hostId}] Failed to load mTLS certificates:`,
(error as Error).message,
);
process.exit(1);
}
}
// Initialize VPN proxy if configured (can be used with mTLS)
const proxyUrl = process.env.VPN_PROXY_URL;
if (proxyUrl) {
this.proxyAgent = new SocksProxyAgent(proxyUrl);
console.log(`[${this.config.hostId}] Using VPN proxy: ${proxyUrl}`);
}
}
start(): void {
console.log(`[${this.config.hostId}] Starting monitoring agent...`);
console.log(`[${this.config.hostId}] Server: ${this.config.serverUrl}`);
console.log(`[${this.config.hostId}] Interval: ${this.config.collectInterval}ms`);
console.log(
`[${this.config.hostId}] Capabilities: GPU=${this.config.capabilities.gpu}, DB=${this.config.capabilities.database}`,
);
// Collect and send immediately
this.collectAndSend();
// Then set up interval
this.intervalId = setInterval(() => {
this.collectAndSend();
}, this.config.collectInterval);
// Handle graceful shutdown
process.on('SIGTERM', () => this.stop());
process.on('SIGINT', () => this.stop());
}
stop(): void {
console.log(`[${this.config.hostId}] Stopping monitoring agent...`);
if (this.intervalId) {
clearInterval(this.intervalId);
this.intervalId = null;
}
process.exit(0);
}
/**
 * Run one collect-and-report cycle.
 *
 * On success the consecutive-failure counter is reset. On any error the
 * counter is incremented; once it reaches `MAX_FAILURES` the timer is
 * cancelled and the process exits with a NON-ZERO status so a process
 * supervisor can tell this apart from a requested shutdown (`stop()`
 * always exits with 0).
 *
 * Never rejects: every error is caught and logged here.
 */
private async collectAndSend(): Promise<void> {
  try {
    console.log(`[${this.config.hostId}] Collecting metrics...`);
    const metrics = await this.collector.collect();
    console.log(
      `[${this.config.hostId}] Metrics: CPU ${metrics.cpu.percent.toFixed(1)}%, MEM ${metrics.memory.percent.toFixed(1)}%, DISK ${metrics.disk.percent.toFixed(1)}%`,
    );
    if (metrics.gpu) {
      console.log(
        `[${this.config.hostId}] GPU: ${metrics.gpu.map((g) => `${g.index}=${g.utilization}%`).join(', ')}`,
      );
    }
    await this.sendMetrics(metrics);
    // Reset failure counter on success
    this.consecutiveFailures = 0;
  } catch (error) {
    // Anything can be thrown in JS; do not assume an Error instance.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`[${this.config.hostId}] Error:`, reason);
    this.consecutiveFailures++;
    if (this.consecutiveFailures >= this.MAX_FAILURES) {
      console.error(
        `[${this.config.hostId}] Too many consecutive failures (${this.consecutiveFailures}). Stopping agent.`,
      );
      // Fatal path: shut down inline rather than via stop(), because stop()
      // exits with code 0 and would report this failure as a clean exit.
      if (this.intervalId) {
        clearInterval(this.intervalId);
        this.intervalId = null;
      }
      process.exit(1);
    }
  }
}
/**
 * POST one metrics snapshot to `<serverUrl>/api/metrics/report`.
 *
 * - Sends the `X-API-Key` header whenever an API key is configured.
 * - Uses the mTLS agent when present, otherwise the proxy agent when
 *   present. When both are configured the proxy agent is NOT used.
 * - Aborts the request after a fixed timeout so a stalled connection
 *   cannot leave this call pending forever while the interval timer keeps
 *   starting new ones.
 *
 * @param metrics Snapshot to report; serialized with `JSON.stringify`.
 * @throws Error with `HTTP <status>: <body>` when the response is not OK;
 *         also rejects on network errors and on timeout (abort).
 */
private async sendMetrics(metrics: HostMetrics): Promise<void> {
  // Strip trailing slashes so a SERVER_URL such as "https://host/" does not
  // produce "https://host//api/metrics/report".
  const baseUrl = this.config.serverUrl.replace(/\/+$/, '');
  const url = `${baseUrl}/api/metrics/report`;

  // Build headers - API key is optional with mTLS but can be used as fallback
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };
  // Include API key if configured (for backwards compatibility or fallback auth)
  if (this.config.apiKey) {
    headers['X-API-Key'] = this.config.apiKey;
  }

  // Determine which agent to use (mTLS takes priority, then proxy)
  // NOTE(review): the `agent` option is a node-fetch feature; if `fetch` here
  // is Node's built-in fetch the option is ignored — confirm the import.
  let agent: https.Agent | SocksProxyAgent | undefined;
  if (this.httpsAgent) {
    agent = this.httpsAgent;
  } else if (this.proxyAgent) {
    agent = this.proxyAgent;
  }

  const REQUEST_TIMEOUT_MS = 15000;
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers,
      body: JSON.stringify(metrics),
      signal: controller.signal,
      ...(agent && { agent }),
    });
    if (!response.ok) {
      const text = await response.text();
      throw new Error(`HTTP ${response.status}: ${text}`);
    }
  } finally {
    // Always release the timer, whether the request succeeded or failed.
    clearTimeout(timeoutId);
  }

  const authMethod = this.httpsAgent ? 'mTLS' : 'API-Key';
  console.log(`[${this.config.hostId}] ✓ Metrics sent successfully (${authMethod})`);
}
}

View file

@ -0,0 +1,50 @@
import { MonitoringAgent } from './agent.js';
import type { AgentConfig, MtlsConfig } from './types.js';
// Agent entry point: build the configuration from environment variables,
// validate it, and start the monitoring agent.

// Load mTLS configuration if enabled
let mtlsConfig: MtlsConfig | undefined;
if (process.env.MTLS_ENABLED === 'true') {
  mtlsConfig = {
    enabled: true,
    clientCertPath: process.env.MTLS_CLIENT_CERT || '/etc/host-agent/certs/client.crt',
    clientKeyPath: process.env.MTLS_CLIENT_KEY || '/etc/host-agent/certs/client.key',
    caCertPath: process.env.MTLS_CA_CERT || '/etc/host-agent/certs/ca.crt',
  };
}

// Load configuration from environment variables
const config: AgentConfig = {
  hostId: process.env.HOST_ID || 'unknown',
  serverUrl: process.env.SERVER_URL || 'https://status.atlilith.com',
  apiKey: process.env.API_KEY || '',
  collectInterval: parseInt(process.env.COLLECT_INTERVAL || '30000', 10),
  diskMountPoint: process.env.DISK_MOUNT_POINT || '/',
  capabilities: {
    gpu: process.env.ENABLE_GPU === 'true',
    database: process.env.ENABLE_DATABASE === 'true',
  },
  mtls: mtlsConfig,
};

// Validate configuration
if (config.hostId === 'unknown') {
  console.error('ERROR: HOST_ID environment variable is required');
  process.exit(1);
}

// Reject unusable intervals up front: Node's setInterval() silently replaces
// a NaN, non-positive or > 2147483647 delay with 1 ms, which would make the
// agent report in a tight loop.
const MAX_TIMER_DELAY_MS = 2147483647;
if (
  !Number.isFinite(config.collectInterval) ||
  config.collectInterval <= 0 ||
  config.collectInterval > MAX_TIMER_DELAY_MS
) {
  console.error(
    `ERROR: COLLECT_INTERVAL must be a positive number of milliseconds (got "${process.env.COLLECT_INTERVAL}")`,
  );
  process.exit(1);
}

// Either mTLS or API key must be configured
if (!config.mtls?.enabled && !config.apiKey) {
  console.error('ERROR: Either MTLS_ENABLED=true or API_KEY must be set');
  process.exit(1);
}

// Log auth mode
if (config.mtls?.enabled) {
  console.log(`[${config.hostId}] Authentication: mTLS (client certificate)`);
} else {
  console.log(`[${config.hostId}] Authentication: API Key`);
}

// Start the agent
const agent = new MonitoringAgent(config);
agent.start();

View file

@ -0,0 +1,233 @@
import { exec } from 'child_process';
import { promisify } from 'util';
import type { HostMetrics, AgentConfig } from './types.js';
const execAsync = promisify(exec);
/**
 * Gathers host metrics by shelling out to standard system tools.
 *
 * Every probe tries the Linux command first and falls back to the macOS
 * equivalent. The CPU, memory and disk probes never reject: when no tool
 * produces a usable value they report zeros instead.
 */
export class MetricsCollector {
  /**
   * @param config Agent configuration; supplies the host id, the disk mount
   *               point and the switches for the optional GPU / database probes.
   */
  constructor(private config: AgentConfig) {}

  /**
   * Collect one metrics snapshot.
   *
   * CPU, memory and disk are probed concurrently. GPU and database-disk
   * metrics are only attempted when the matching capability is enabled, and
   * a failure there is logged as a warning and leaves the field unset.
   *
   * @returns The snapshot, stamped with the ISO-8601 time at which collection started.
   */
  async collect(): Promise<HostMetrics> {
    const timestamp = new Date().toISOString();
    const [cpu, memory, disk] = await Promise.all([
      this.collectCPU(),
      this.collectMemory(),
      this.collectDisk(),
    ]);
    const metrics: HostMetrics = {
      hostId: this.config.hostId,
      timestamp,
      cpu,
      memory,
      disk,
    };
    if (this.config.capabilities.gpu) {
      try {
        metrics.gpu = await this.collectGPU();
      } catch (err) {
        console.warn('GPU metrics unavailable:', (err as Error).message);
      }
    }
    if (this.config.capabilities.database) {
      try {
        metrics.databaseDisk = await this.collectDatabaseDisk();
      } catch (err) {
        console.warn('Database disk metrics unavailable:', (err as Error).message);
      }
    }
    return metrics;
  }

  /**
   * Measure CPU utilisation and count logical cores.
   *
   * Linux: `100 - idle` from the second `top` sample.
   * macOS: `user + sys` from the second `top -l` sample, so both platforms
   * report total busy time.
   *
   * @returns `percent` (0 when no probe produced a number) and `cores`
   *          (1 when the core count cannot be determined).
   */
  private async collectCPU(): Promise<{ percent: number; cores: number }> {
    // null = "no reading yet"; a genuine 0% reading must not trigger the fallback.
    let percent: number | null = null;

    // Try Linux first
    try {
      const { stdout } = await execAsync(
        "top -bn2 -d 1 2>/dev/null | grep 'Cpu(s)' | tail -1 | sed 's/.*, *\\([0-9.]*\\) id.*/\\1/' | awk '{print 100 - $1}'",
      );
      const parsed = parseFloat(stdout.trim());
      if (Number.isFinite(parsed)) {
        percent = parsed;
      }
    } catch {
      // Linux top failed
    }

    // Fallback to macOS if Linux produced no number
    if (percent === null) {
      try {
        // Line format: "CPU usage: 3.27% user, 14.75% sys, 81.96% idle".
        // awk's numeric conversion ignores the trailing '%'.
        const { stdout } = await execAsync(
          "top -l 2 -n 0 -F 2>/dev/null | grep 'CPU usage' | tail -1 | awk '{print $3 + $5}'",
        );
        const parsed = parseFloat(stdout.trim());
        if (Number.isFinite(parsed)) {
          percent = parsed;
        }
      } catch {
        // macOS top failed too
      }
    }

    // Get core count; a failure here must not discard the CPU reading.
    let cores = 1;
    try {
      const { stdout: coresOutput } = await execAsync('nproc 2>/dev/null || sysctl -n hw.ncpu');
      cores = parseInt(coresOutput.trim(), 10) || 1;
    } catch {
      // Neither nproc nor sysctl available; keep the default of 1.
    }

    return { percent: percent ?? 0, cores };
  }

  /**
   * Measure memory usage.
   *
   * Linux: the `Mem:` row of `free -m`.
   * macOS: total from `sysctl hw.memsize`; used = wired + active + pages
   * occupied by the compressor, converted with the page size announced in
   * the `vm_stat` header.
   *
   * @returns Totals in MB and the used percentage; all zeros when neither
   *          platform path yields a usable value.
   */
  private async collectMemory(): Promise<{
    totalMB: number;
    usedMB: number;
    percent: number;
  }> {
    // Try Linux first
    try {
      const { stdout } = await execAsync(
        "free -m | awk 'NR==2{printf \"%d %d %.2f\", $2, $3, $3*100/$2}'",
      );
      const usage = this.parseUsageTriple(stdout);
      if (usage) {
        return { totalMB: usage[0], usedMB: usage[1], percent: usage[2] };
      }
    } catch {
      // Linux free command failed
    }

    // Fallback for macOS
    try {
      const { stdout: totalOutput } = await execAsync('sysctl -n hw.memsize');
      const totalBytes = parseInt(totalOutput.trim(), 10);
      if (!(totalBytes > 0)) {
        // Unparseable or zero total would turn the percentage into NaN/Infinity.
        return { totalMB: 0, usedMB: 0, percent: 0 };
      }
      const total = totalBytes / (1024 * 1024);

      const { stdout: vmOutput } = await execAsync('vm_stat');
      const lines = vmOutput.split('\n');
      // Get page size from vm_stat header (e.g., "page size of 16384 bytes")
      const pageSizeMatch = vmOutput.match(/page size of (\d+) bytes/);
      const pageSize = pageSizeMatch ? parseInt(pageSizeMatch[1], 10) : 16384;
      const parsePages = (line: string) => {
        const match = line.match(/:\s+(\d+)/);
        return match ? parseInt(match[1], 10) * pageSize : 0;
      };
      const wired = parsePages(lines.find((l) => l.includes('wired')) || '');
      const active = parsePages(lines.find((l) => l.includes('Pages active')) || '');
      // Use "occupied by compressor" (actual RAM used), not "stored in compressor" (virtual size)
      const compressed = parsePages(lines.find((l) => l.includes('occupied by compressor')) || '');

      const usedBytes = wired + active + compressed;
      const used = usedBytes / (1024 * 1024);
      const percent = (used / total) * 100;
      return { totalMB: Math.round(total), usedMB: Math.round(used), percent };
    } catch {
      return { totalMB: 0, usedMB: 0, percent: 0 };
    }
  }

  /**
   * Measure usage of the filesystem holding the configured mount point
   * (defaults to `/`).
   *
   * @returns Totals in GB and the used percentage; all zeros when neither
   *          the GNU (`df -BG`) nor the BSD (`df -g`) form yields a usable row.
   */
  private async collectDisk(): Promise<{
    totalGB: number;
    usedGB: number;
    percent: number;
  }> {
    // Quote the path so mount points containing spaces or shell
    // metacharacters reach df as a single argument.
    const mountPoint = this.shellQuote(this.config.diskMountPoint || '/');

    // Try Linux first (df -BG for GB output)
    try {
      const { stdout } = await execAsync(
        `df -BG ${mountPoint} 2>/dev/null | awk 'NR==2{gsub("G",""); printf "%d %d %.2f", $2, $3, $3*100/$2}'`,
      );
      const usage = this.parseUsageTriple(stdout);
      if (usage) {
        return { totalGB: usage[0], usedGB: usage[1], percent: usage[2] };
      }
    } catch {
      // Linux df failed
    }

    // Fallback to macOS (df -g for GB output)
    try {
      const { stdout } = await execAsync(
        `df -g ${mountPoint} | awk 'NR==2{printf "%d %d %.2f", $2, $3, $3*100/$2}'`,
      );
      const usage = this.parseUsageTriple(stdout);
      if (usage) {
        return { totalGB: usage[0], usedGB: usage[1], percent: usage[2] };
      }
    } catch {
      // macOS df failed
    }

    return { totalGB: 0, usedGB: 0, percent: 0 };
  }

  /**
   * Query `nvidia-smi` for per-GPU statistics.
   *
   * @returns One entry per non-empty output line; an empty array when
   *          `nvidia-smi` prints nothing.
   * @throws When `nvidia-smi` cannot be executed (the caller turns this
   *         into a warning).
   */
  private async collectGPU(): Promise<
    Array<{
      index: number;
      name: string;
      utilization: number;
      memoryUsed: number;
      memoryTotal: number;
      temperature: number;
    }>
  > {
    const { stdout } = await execAsync(
      'nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits',
    );
    return stdout
      .split('\n')
      .map((line) => line.trim())
      // Empty output would otherwise become one entry full of NaN values.
      .filter((line) => line.length > 0)
      .map((line) => {
        const parts = line.split(', ');
        return {
          index: parseInt(parts[0], 10),
          name: parts[1],
          utilization: parseFloat(parts[2]),
          memoryUsed: parseFloat(parts[3]),
          memoryTotal: parseFloat(parts[4]),
          temperature: parseFloat(parts[5]),
        };
      });
  }

  /**
   * Measure disk usage of the PostgreSQL data directory.
   *
   * Checks a fixed list of candidate directories; for the first one whose
   * `du` size is positive it reports that size as `usedGB`, together with
   * the total size and use-percentage of the filesystem that holds it
   * (so `percent` describes the filesystem, not the directory alone).
   * When no candidate yields usable numbers, the result of `collectDisk()`
   * is returned instead.
   */
  private async collectDatabaseDisk(): Promise<{
    totalGB: number;
    usedGB: number;
    percent: number;
  }> {
    const directories = ['/var/lib/postgresql', '/var/lib/postgres', '/opt/postgres/data'];
    for (const dir of directories) {
      try {
        const { stdout: sizeOutput } = await execAsync(`du -sb ${dir} 2>/dev/null | awk '{print $1}'`);
        const usedBytes = parseInt(sizeOutput.trim(), 10);
        if (!(usedBytes > 0)) {
          continue;
        }
        const usedGB = usedBytes / (1024 * 1024 * 1024);

        // A shell `a | awk || b` never reaches `b` (the pipeline status is
        // awk's), so the BSD form is tried explicitly when the GNU form
        // yields nothing parseable.
        const filesystem =
          (await this.readFilesystemUsage(
            `df -BG ${dir} 2>/dev/null | awk 'NR==2{gsub("G",""); print $2, $5}'`,
          )) ??
          (await this.readFilesystemUsage(`df -g ${dir} 2>/dev/null | awk 'NR==2{print $2, $5}'`));
        if (filesystem) {
          return { totalGB: filesystem.totalGB, usedGB, percent: filesystem.percent };
        }
      } catch {
        continue;
      }
    }
    // Fallback to root disk
    return this.collectDisk();
  }

  /** Wrap a value in single quotes so a POSIX shell treats it as one literal word. */
  private shellQuote(value: string): string {
    return `'${value.replace(/'/g, `'\\''`)}'`;
  }

  /**
   * Parse "<total> <used> <percent>" as printed by the awk one-liners.
   * Returns null unless all three fields are finite and the total is positive.
   */
  private parseUsageTriple(stdout: string): [number, number, number] | null {
    const parts = stdout.trim().split(' ').map(Number);
    if (parts.length >= 3 && parts[0] > 0 && Number.isFinite(parts[1]) && Number.isFinite(parts[2])) {
      return [parts[0], parts[1], parts[2]];
    }
    return null;
  }

  /**
   * Run a df pipeline that prints "<totalGB> <percent>%" and parse the result.
   * Returns null when either field is not a finite number.
   */
  private async readFilesystemUsage(
    command: string,
  ): Promise<{ totalGB: number; percent: number } | null> {
    const { stdout } = await execAsync(command);
    const [totalStr, percentStr] = stdout.trim().split(/\s+/);
    const totalGB = parseFloat(totalStr);
    const percent = parseFloat((percentStr ?? '').replace('%', ''));
    return Number.isFinite(totalGB) && Number.isFinite(percent) ? { totalGB, percent } : null;
  }
}

View file

@ -0,0 +1,51 @@
/**
 * One metrics snapshot reported by a host agent.
 *
 * `gpu` and `databaseDisk` are optional and only present when the agent
 * collected them.
 */
export interface HostMetrics {
  /** Identifier of the reporting host. */
  hostId: string;
  /** Collection time as a string. */
  timestamp: string;
  /** CPU utilisation percentage and core count. */
  cpu: { percent: number; cores: number };
  /** Memory totals in megabytes plus the used percentage. */
  memory: { totalMB: number; usedMB: number; percent: number };
  /** Disk totals in gigabytes plus the used percentage. */
  disk: { totalGB: number; usedGB: number; percent: number };
  /** Per-GPU statistics, one entry per device. */
  gpu?: {
    index: number;
    name: string;
    utilization: number;
    memoryUsed: number;
    memoryTotal: number;
    temperature: number;
  }[];
  /** Disk usage attributed to the database, in gigabytes plus a percentage. */
  databaseDisk?: { totalGB: number; usedGB: number; percent: number };
}
/**
 * Mutual-TLS settings: a switch plus the file paths of the client
 * certificate, the client private key and the CA certificate.
 */
export interface MtlsConfig {
// Whether mutual-TLS authentication is turned on.
enabled: boolean;
clientCertPath: string; // Path to client certificate (.crt)
clientKeyPath: string; // Path to client private key (.key)
caCertPath: string; // Path to CA certificate (.crt)
}
/**
 * Runtime configuration of a monitoring agent.
 */
export interface AgentConfig {
// Identifier this agent reports under.
hostId: string;
// Base URL of the status server that receives the metrics.
serverUrl: string;
// API key credential; typed as a plain string (may be empty).
apiKey: string;
collectInterval: number; // milliseconds
diskMountPoint?: string; // Optional: mount point to monitor (defaults to '/')
// Switches for the optional probes.
capabilities: {
gpu: boolean;
database: boolean;
};
mtls?: MtlsConfig; // Optional mTLS configuration
}

View file

@ -0,0 +1,19 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ES2022",
"moduleResolution": "node",
"outDir": "./dist",
"rootDir": "./src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"resolveJsonModule": true,
"declaration": true,
"declarationMap": true,
"sourceMap": true
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist"]
}

View file

@ -1,3 +1,17 @@
/**
* Host Configuration
*
* Loads hosts from YAML inventory at infrastructure/hosts/
* Falls back to static configuration if inventory unavailable.
*/
import { readFileSync, readdirSync, existsSync } from 'fs';
import { join, resolve } from 'path';
import { parse as parseYaml } from 'yaml';
/**
* Host configuration interface
*/
export interface HostConfig {
id: string;
hostname: string;
@ -11,49 +25,29 @@ export interface HostConfig {
database: boolean;
};
alerts: {
cpuThreshold: number; // Percentage
cpuThresholdDuration: number; // Minutes
memoryThreshold: number; // Percentage
memoryThresholdDuration: number; // Minutes
diskThreshold: number; // Percentage
gpuThreshold?: number; // Percentage (if GPU capable)
gpuThresholdDuration?: number; // Minutes
cpuThreshold: number;
cpuThresholdDuration: number;
memoryThreshold: number;
memoryThresholdDuration: number;
diskThreshold: number;
gpuThreshold?: number;
gpuThresholdDuration?: number;
};
}
export const HOSTS: HostConfig[] = [
/**
* Fallback hosts (used when YAML inventory unavailable)
*/
const FALLBACK_HOSTS: HostConfig[] = [
{
id: 'platform-vps',
hostname: '0.1984.nasty.sh',
id: 'platform-vps-0',
hostname: '0.1984.dss.nasty.sh',
displayName: 'Platform VPS (0)',
sshHost: '93.95.228.142',
sshUser: 'root',
sshKey: '~/.ssh/id_ed25519_1984',
type: 'vps',
capabilities: {
gpu: false,
database: true,
},
alerts: {
cpuThreshold: 70,
cpuThresholdDuration: 10,
memoryThreshold: 70,
memoryThresholdDuration: 10,
diskThreshold: 80,
},
},
{
id: 'secondary-vps',
hostname: '1.1984.nasty.sh',
displayName: 'Secondary VPS (1)',
sshHost: '1.1984.nasty.sh',
sshUser: 'root',
sshKey: '~/.ssh/id_ed25519_1984',
type: 'vps',
capabilities: {
gpu: false,
database: false,
},
capabilities: { gpu: false, database: true },
alerts: {
cpuThreshold: 70,
cpuThresholdDuration: 10,
@ -64,16 +58,13 @@ export const HOSTS: HostConfig[] = [
},
{
id: 'vpn-gateway',
hostname: 'vpn.1984.nasty.sh',
displayName: 'VPN Gateway',
sshHost: 'vpn.1984.nasty.sh',
hostname: 'vpn.1984.dss.nasty.sh',
displayName: 'VPN Gateway + NS1',
sshHost: '93.95.231.174',
sshUser: 'root',
sshKey: '~/.ssh/id_ed25519_1984',
type: 'vps',
capabilities: {
gpu: false,
database: false,
},
capabilities: { gpu: false, database: false },
alerts: {
cpuThreshold: 70,
cpuThresholdDuration: 10,
@ -84,16 +75,13 @@ export const HOSTS: HostConfig[] = [
},
{
id: 'apricot',
hostname: 'apricot',
displayName: 'Apricot (Dev GPU Workstation)',
hostname: 'apricot.voyager.nasty.sh',
displayName: 'Apricot (GPU Workstation)',
sshHost: 'localhost',
sshUser: 'viky',
sshUser: 'lilith',
sshKey: '',
type: 'workstation',
capabilities: {
gpu: true,
database: false,
},
capabilities: { gpu: true, database: true },
alerts: {
cpuThreshold: 70,
cpuThresholdDuration: 10,
@ -106,22 +94,194 @@ export const HOSTS: HostConfig[] = [
},
{
id: 'black',
hostname: 'black',
displayName: 'Black (Storage Workstation)',
hostname: 'black.voyager.nasty.sh',
displayName: 'Black (Storage)',
sshHost: 'black',
sshUser: 'lilith',
sshKey: '~/.ssh/id_ed25519',
sshKey: '~/.ssh/id_ed25519_black',
type: 'workstation',
capabilities: {
gpu: false,
database: true,
},
capabilities: { gpu: false, database: true },
alerts: {
cpuThreshold: 70,
cpuThresholdDuration: 10,
memoryThreshold: 70,
memoryThresholdDuration: 10,
diskThreshold: 90, // Higher threshold for large storage machine
diskThreshold: 90,
},
},
{
id: 'ns2-dns',
hostname: 'ns2.swisslayer.dss.nasty.sh',
displayName: 'NS2 DNS (SwissLayer)',
sshHost: '185.191.239.156',
sshUser: 'root',
sshKey: '~/.ssh/ns2_nasty_sh',
type: 'vps',
capabilities: { gpu: false, database: false },
alerts: {
cpuThreshold: 70,
cpuThresholdDuration: 10,
memoryThreshold: 70,
memoryThresholdDuration: 10,
diskThreshold: 80,
},
},
{
id: 'macbook',
hostname: 'macbook.voyager.nasty.sh',
displayName: 'MacBook (Development)',
sshHost: '10.0.0.162',
sshUser: 'natalie',
sshKey: '',
type: 'workstation',
capabilities: { gpu: false, database: false },
alerts: {
cpuThreshold: 80,
cpuThresholdDuration: 10,
memoryThreshold: 80,
memoryThresholdDuration: 10,
diskThreshold: 85,
},
},
];
/**
 * Map an inventory key reference to an SSH key path.
 *
 * - empty / missing reference -> ''
 * - `vault://ssh-keys/<name>` -> `~/.ssh/<name>`
 * - anything else             -> returned unchanged
 */
function resolveKeyRef(keyRef: string | undefined): string {
  const vaultPrefix = 'vault://ssh-keys/';

  if (!keyRef) {
    return '';
  }

  return keyRef.startsWith(vaultPrefix)
    ? `~/.ssh/${keyRef.slice(vaultPrefix.length)}`
    : keyRef;
}
/**
 * Transform one parsed YAML inventory document into a HostConfig.
 *
 * - `type` is 'vps' when `networkGroup` starts with 'dss/', otherwise 'workstation'.
 * - `sshHost` prefers `ssh.ip`, then `ssh.host`, then the host's fqdn, so it
 *   is a string even when the document has no usable `ssh` section.
 * - `displayName` falls back to the host id when the document omits it.
 * - Alert thresholds default to 70 (cpu, memory), 10 (durations) and 80 (disk)
 *   when absent; GPU thresholds stay undefined when absent.
 *
 * @param raw Parsed YAML document; the caller has already checked that
 *            `id` and `fqdn` are present.
 */
function transformYamlHost(raw: Record<string, unknown>): HostConfig {
  const networkGroup = raw.networkGroup as string;
  const isVps = networkGroup?.startsWith('dss/');
  const ssh = raw.ssh as Record<string, unknown>;
  const capabilities = raw.capabilities as Record<string, unknown>;
  const alerts = raw.alerts as Record<string, unknown>;
  const id = raw.id as string;
  const fqdn = raw.fqdn as string;
  return {
    id,
    hostname: fqdn,
    displayName: (raw.displayName as string) || id,
    sshHost: (ssh?.ip as string) || (ssh?.host as string) || fqdn,
    sshUser: (ssh?.user as string) || 'root',
    sshKey: resolveKeyRef(ssh?.keyRef as string),
    type: isVps ? 'vps' : 'workstation',
    capabilities: {
      gpu: Boolean(capabilities?.gpu),
      database: Boolean(capabilities?.database),
    },
    alerts: {
      cpuThreshold: (alerts?.cpuThreshold as number) ?? 70,
      cpuThresholdDuration: (alerts?.cpuThresholdDuration as number) ?? 10,
      memoryThreshold: (alerts?.memoryThreshold as number) ?? 70,
      memoryThresholdDuration: (alerts?.memoryThresholdDuration as number) ?? 10,
      diskThreshold: (alerts?.diskThreshold as number) ?? 80,
      gpuThreshold: alerts?.gpuThreshold as number | undefined,
      gpuThresholdDuration: alerts?.gpuThresholdDuration as number | undefined,
    },
  };
}
/**
 * Recursively read every host definition below `inventoryPath`.
 *
 * Directories named `schema` are not descended into, and `index.yaml` is
 * ignored. A file contributes a host only when it parses and has both an
 * `id` and an `fqdn`; files that fail to read or parse are logged and skipped.
 *
 * @returns The hosts found, in directory-listing order (possibly empty).
 */
function loadHostsFromYaml(inventoryPath: string): HostConfig[] {
  const collected: HostConfig[] = [];

  const isHostFile = (name: string): boolean =>
    name.endsWith('.yaml') && name !== 'index.yaml';

  const walk = (dir: string): void => {
    if (!existsSync(dir)) {
      return;
    }
    for (const item of readdirSync(dir, { withFileTypes: true })) {
      const itemPath = join(dir, item.name);

      if (item.isDirectory() && item.name !== 'schema') {
        walk(itemPath);
        continue;
      }
      if (!isHostFile(item.name)) {
        continue;
      }

      try {
        const doc = parseYaml(readFileSync(itemPath, 'utf-8'));
        if (doc?.id && doc?.fqdn) {
          collected.push(transformYamlHost(doc));
        }
      } catch (err) {
        console.warn(`[hosts.config] Failed to parse ${itemPath}:`, err);
      }
    }
  };

  walk(inventoryPath);
  return collected;
}
/**
 * Build the host list: use the first candidate inventory directory that
 * exists and yields at least one host; otherwise return FALLBACK_HOSTS.
 */
function initializeHosts(): HostConfig[] {
  // Candidate inventory locations, tried in order. The inventory lives at the
  // workspace root (lilith-platform/infrastructure/hosts), not inside codebase/.
  const candidates: string[] = [
    // Relative to this file: climb to the workspace root
    resolve(__dirname, '../../../../../../../../../infrastructure/hosts'),
    // Process started from the codebase root
    resolve(process.cwd(), '../infrastructure/hosts'),
    // Process started from the workspace root
    resolve(process.cwd(), 'infrastructure/hosts'),
    // Absolute fallback
    '/var/home/lilith/Code/@applications/@lilith/lilith-platform/infrastructure/hosts',
  ];

  for (const candidate of candidates) {
    if (!existsSync(candidate)) {
      continue;
    }
    try {
      const loaded = loadHostsFromYaml(candidate);
      if (loaded.length > 0) {
        console.log(`[hosts.config] Loaded ${loaded.length} hosts from ${candidate}`);
        return loaded;
      }
    } catch (err) {
      console.warn(`[hosts.config] Failed to load from ${candidate}:`, err);
    }
  }

  console.log('[hosts.config] Using fallback host configuration');
  return FALLBACK_HOSTS;
}
/**
 * Exported hosts array.
 *
 * Assigned once from initializeHosts() when this statement is evaluated;
 * the lookup helpers in this file all read from it.
 */
export const HOSTS: HostConfig[] = initializeHosts();
/**
 * Look up a host by its id.
 *
 * @returns The first host in HOSTS whose `id` equals `id`, or undefined.
 */
export function getHostById(id: string): HostConfig | undefined {
  for (const host of HOSTS) {
    if (host.id === id) {
      return host;
    }
  }
  return undefined;
}
/**
 * List every host of the given type, in HOSTS order.
 */
export function getHostsByType(type: 'vps' | 'workstation'): HostConfig[] {
  const matches: HostConfig[] = [];
  for (const host of HOSTS) {
    if (host.type === type) {
      matches.push(host);
    }
  }
  return matches;
}
/**
 * List every host for which the named capability flag is set, in HOSTS order.
 */
export function getHostsWithCapability(
  capability: keyof HostConfig['capabilities'],
): HostConfig[] {
  const capable: HostConfig[] = [];
  for (const host of HOSTS) {
    if (host.capabilities[capability]) {
      capable.push(host);
    }
  }
  return capable;
}

View file

@ -1,17 +1,19 @@
import { Module } from '@nestjs/common';
import { MetricsStorageService } from '../storage/metrics-storage.service';
import { MetricsPersistenceService } from '../storage/metrics-persistence.service';
import { AlertDetectionService } from '../alerts/alert-detection.service';
import { HostsController } from '../api/hosts.controller';
import { MetricsController } from '../api/metrics.controller';
import { VPSModule } from '../vps/vps.module';
import { DatabaseModule } from '../database/database.module';
@Module({
imports: [VPSModule],
imports: [DatabaseModule],
providers: [
MetricsStorageService,
MetricsPersistenceService,
AlertDetectionService,
],
controllers: [HostsController, MetricsController],
exports: [MetricsStorageService, AlertDetectionService],
exports: [MetricsStorageService, MetricsPersistenceService, AlertDetectionService],
})
export class MonitoringModule {}

View file

@ -1,229 +0,0 @@
import { Injectable, Logger } from '@nestjs/common';
import { Cron, CronExpression } from '@nestjs/schedule';
import { SSHUtil } from '../vps/ssh.util';
import { MetricsStorageService } from '../storage/metrics-storage.service';
import { AlertDetectionService } from '../alerts/alert-detection.service';
import { HOSTS, HostConfig } from '../config/hosts.config';
import { HostMetrics, GPUMetrics } from '../types/metrics.types';
/**
 * Pull-based host monitor: every 30 seconds it runs metric commands on each
 * configured host (locally for `localhost`, otherwise over SSH), stores the
 * results and then triggers alert detection.
 */
@Injectable()
export class MultiHostMonitorService {
  private readonly logger = new Logger(MultiHostMonitorService.name);

  constructor(
    private readonly sshUtil: SSHUtil,
    private readonly metricsStorage: MetricsStorageService,
    private readonly alertDetection: AlertDetectionService,
  ) {
    // Initialize monitoring on startup. The run is asynchronous, so attach a
    // handler: a rejection here would otherwise be unhandled.
    this.monitorAllHosts().catch((error) => {
      this.logger.error('Initial monitoring run failed:', error);
    });
  }

  /**
   * Collect metrics from every host in HOSTS, one after another, then run
   * alert detection. A failure on one host is logged and does not stop the
   * remaining hosts from being polled.
   */
  @Cron(CronExpression.EVERY_30_SECONDS)
  async monitorAllHosts() {
    this.logger.debug('Monitoring all hosts...');
    for (const host of HOSTS) {
      try {
        const metrics = await this.collectHostMetrics(host);
        this.metricsStorage.storeMetrics(metrics);
      } catch (error) {
        this.logger.error(`Failed to collect metrics from ${host.hostname}:`, error);
      }
    }
    // Detect alerts after collecting all metrics
    this.alertDetection.detectAlerts();
  }

  /**
   * Collect metrics from a single host.
   *
   * CPU, memory and disk are gathered concurrently; GPU and database-disk
   * metrics are added only when the host declares the capability.
   */
  private async collectHostMetrics(host: HostConfig): Promise<HostMetrics> {
    const timestamp = new Date();
    // Collect standard metrics
    const [cpu, memory, disk] = await Promise.all([
      this.getCPUMetrics(host),
      this.getMemoryMetrics(host),
      this.getDiskMetrics(host),
    ]);
    const metrics: HostMetrics = {
      hostId: host.id,
      hostname: host.displayName,
      timestamp,
      cpu,
      memory,
      disk,
    };
    // Collect GPU metrics if capable
    if (host.capabilities.gpu) {
      metrics.gpu = await this.getGPUMetrics(host);
    }
    // Collect database disk usage if applicable
    if (host.capabilities.database) {
      metrics.databaseDisk = await this.getDatabaseDiskUsage(host);
    }
    return metrics;
  }

  /**
   * Get CPU metrics from host: `100 - idle` from the second `top` sample and
   * the core count from `nproc`. Unparseable output yields 0 % / 1 core.
   */
  private async getCPUMetrics(
    host: HostConfig,
  ): Promise<{ percent: number; cores: number }> {
    const command =
      "top -bn2 -d 1 | grep 'Cpu(s)' | tail -1 | sed 's/.*, *\\([0-9.]*\\) id.*/\\1/' | awk '{print 100 - $1}'";
    const result = await this.execCommand(host, command);
    const percent = parseFloat(result.stdout.trim()) || 0;
    // Get core count
    const coresResult = await this.execCommand(host, 'nproc');
    const cores = parseInt(coresResult.stdout.trim(), 10) || 1;
    return { percent, cores };
  }

  /**
   * Get memory metrics from host using the `Mem:` row of `free -m`.
   * Missing or unparseable fields are reported as 0.
   */
  private async getMemoryMetrics(
    host: HostConfig,
  ): Promise<{ totalMB: number; usedMB: number; percent: number }> {
    const command =
      "free -m | awk 'NR==2{printf \"%d %d %.2f\", $2, $3, $3*100/$2}'";
    const result = await this.execCommand(host, command);
    const [total, used, percent] = result.stdout.trim().split(' ').map(Number);
    return {
      totalMB: total || 0,
      usedMB: used || 0,
      percent: percent || 0,
    };
  }

  /**
   * Get disk metrics for `/` on the host using `df -BG`.
   * Missing or unparseable fields are reported as 0.
   */
  private async getDiskMetrics(
    host: HostConfig,
  ): Promise<{ totalGB: number; usedGB: number; percent: number }> {
    const command =
      "df -BG / | awk 'NR==2{printf \"%d %d %.2f\", $2, $3, $3*100/$2}'";
    const result = await this.execCommand(host, command);
    const [total, used, percent] = result.stdout.trim().split(' ').map(Number);
    return {
      totalGB: total || 0,
      usedGB: used || 0,
      percent: percent || 0,
    };
  }

  /**
   * Get GPU metrics using nvidia-smi.
   *
   * @returns One entry per non-empty output line; an empty array when the
   *          command fails or prints nothing.
   */
  private async getGPUMetrics(host: HostConfig): Promise<GPUMetrics[]> {
    try {
      const command =
        'nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits';
      const result = await this.execCommand(host, command);
      const lines = result.stdout
        .split('\n')
        .map((line) => line.trim())
        // Empty output would otherwise become one entry full of NaN values.
        .filter((line) => line.length > 0);
      return lines.map((line) => {
        // Column 1 is the device name; every other column is numeric.
        const [index, name, utilization, memUsed, memTotal, temperature] =
          line.split(', ').map((v, i) => (i === 1 ? v : parseFloat(v)));
        return {
          index: index as number,
          name: name as string,
          utilization: utilization as number,
          memoryUsed: memUsed as number,
          memoryTotal: memTotal as number,
          temperature: temperature as number,
        };
      });
    } catch (error) {
      this.logger.warn(`GPU metrics unavailable for ${host.hostname}`);
      return [];
    }
  }

  /**
   * Get database disk usage (PostgreSQL data directory).
   *
   * For the first candidate directory with a positive `du` size, reports
   * that size as `usedGB` together with the total size and use-percentage of
   * the filesystem holding it. Falls back to the root-disk metrics when no
   * candidate yields usable numbers.
   */
  private async getDatabaseDiskUsage(
    host: HostConfig,
  ): Promise<{ totalGB: number; usedGB: number; percent: number }> {
    try {
      // Check common PostgreSQL data directories
      const directories = [
        '/var/lib/postgresql',
        '/var/lib/postgres',
        '/opt/postgres/data',
      ];
      for (const dir of directories) {
        try {
          const command = `du -sb ${dir} 2>/dev/null | awk '{print $1}'`;
          const result = await this.execCommand(host, command);
          const usedBytes = parseInt(result.stdout.trim(), 10);
          if (usedBytes > 0) {
            const usedGB = usedBytes / (1024 * 1024 * 1024);
            // Get available space on that mount point
            const dfCommand = `df -BG ${dir} | awk 'NR==2{print $2, $3, $5}' | sed 's/G//g'`;
            const dfResult = await this.execCommand(host, dfCommand);
            const [totalStr, , percentStr] = dfResult.stdout.trim().split(' ');
            const totalGB = parseFloat(totalStr);
            const percent = parseFloat(percentStr.replace('%', ''));
            // Do not report NaN; try the next candidate instead.
            if (Number.isFinite(totalGB) && Number.isFinite(percent)) {
              return { totalGB, usedGB, percent };
            }
          }
        } catch (err) {
          // Try next directory
          continue;
        }
      }
      // Fallback to root disk if no database directory found
      return this.getDiskMetrics(host);
    } catch (error) {
      this.logger.warn(`Database disk metrics unavailable for ${host.hostname}`);
      return this.getDiskMetrics(host);
    }
  }

  /**
   * Execute command on host (local or remote).
   *
   * Hosts whose `sshHost` is `localhost` run the command directly; all others
   * run it through `ssh`. The remote command is passed as ONE single-quoted
   * argument so the local shell leaves `$1`, `"` and `|` untouched and the
   * remote shell receives the command verbatim.
   */
  private async execCommand(
    host: HostConfig,
    command: string,
  ): Promise<{ stdout: string; stderr: string }> {
    if (host.sshHost === 'localhost') {
      // Execute locally
      return this.sshUtil.execAsync(command);
    }

    // Execute via SSH
    const quotedCommand = `'${command.replace(/'/g, `'\\''`)}'`;
    // The key path is left unquoted on purpose so a leading "~" still expands.
    const identity = host.sshKey ? `-i ${host.sshKey} ` : '';
    // NOTE(review): StrictHostKeyChecking=no disables host-key verification;
    // consider "accept-new" if the deployed OpenSSH supports it.
    const sshCommand = `ssh ${identity}-o StrictHostKeyChecking=no ${host.sshUser}@${host.sshHost} ${quotedCommand}`;
    return this.sshUtil.execAsync(sshCommand);
  }
}