feat(status-dashboard): add push-only host monitoring with macOS support
- Add host-status-monitor agent for push-based metric collection - Fix metrics-collector.ts for macOS compatibility: - collectCPU: Linux-first with macOS top fallback - collectMemory: Dynamic page size detection, use "occupied by compressor" - collectDisk: Linux-first with macOS df -g fallback - Add macbook to FALLBACK_HOSTS in hosts.config.ts - Delete unused multi-host-monitor.service.ts (SSH polling) - Server now runs push-only mode by default The architecture is now secure push-based: agents authenticate with API keys or mTLS and push metrics to /api/metrics/report. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
2cee20740b
commit
e426f6ae5b
22 changed files with 1749 additions and 288 deletions
98
features/status-dashboard/host-status-monitor/Makefile
Normal file
98
features/status-dashboard/host-status-monitor/Makefile
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
# Host Status Monitor Deployment Makefile
|
||||
# Usage: make deploy-<hostname>
|
||||
|
||||
# None of these targets produce a file with the same name, so all of them
# (including the per-host logs-* targets) must be declared phony.
.PHONY: help build deploy-all deploy-platform deploy-apricot deploy-black deploy-vpn deploy-macbook status logs logs-vpn logs-apricot logs-black logs-macbook
|
||||
|
||||
# SSH key for 1984 hosts
|
||||
SSH_KEY := ~/.ssh/id_ed25519_1984
|
||||
SSH_OPTS := -o StrictHostKeyChecking=accept-new
|
||||
|
||||
# Host definitions
|
||||
PLATFORM_VPS := root@93.95.228.142
|
||||
VPN_GATEWAY := root@93.95.231.174
|
||||
APRICOT := localhost
|
||||
BLACK := lilith@black
|
||||
MACBOOK := natalie@10.0.0.162
|
||||
|
||||
# Default target
|
||||
# Prints every supported target; keep this list in sync with the rules below.
help:
	@echo "Host Status Monitor Deployment"
	@echo ""
	@echo "Usage:"
	@echo " make build - Build TypeScript to JavaScript"
	@echo " make deploy-all - Deploy to all hosts"
	@echo " make deploy-platform - Deploy to platform-vps"
	@echo " make deploy-vpn - Deploy to vpn-gateway"
	@echo " make deploy-apricot - Deploy to apricot (localhost)"
	@echo " make deploy-black - Deploy to black"
	@echo " make deploy-macbook - Deploy to macbook"
	@echo " make status - Check status on all hosts"
	@echo " make logs - Tail logs from platform-vps"
	@echo " make logs-vpn - Tail logs from vpn-gateway"
	@echo " make logs-apricot - Tail logs from apricot (localhost)"
	@echo " make logs-black - Tail logs from black"
	@echo " make logs-macbook - Tail logs from macbook"
	@echo ""
|
||||
|
||||
# Build
|
||||
build:
|
||||
@echo "Building host-status-monitor..."
|
||||
npm run build
|
||||
|
||||
# Deploy to all hosts
|
||||
deploy-all: build deploy-platform deploy-vpn deploy-apricot deploy-black deploy-macbook
|
||||
@echo "All deployments complete"
|
||||
|
||||
# Deploy to platform-vps
|
||||
deploy-platform: build
|
||||
@echo "Deploying to platform-vps..."
|
||||
./deploy.sh platform-vps
|
||||
|
||||
# Deploy to vpn-gateway
|
||||
deploy-vpn: build
|
||||
@echo "Deploying to vpn-gateway..."
|
||||
./deploy.sh vpn-gateway
|
||||
|
||||
# Deploy to apricot (localhost)
|
||||
deploy-apricot: build
|
||||
@echo "Deploying to apricot (localhost)..."
|
||||
./deploy.sh apricot
|
||||
|
||||
# Deploy to black
|
||||
deploy-black: build
|
||||
@echo "Deploying to black..."
|
||||
./deploy.sh black
|
||||
|
||||
# Deploy to macbook
|
||||
deploy-macbook: build
|
||||
@echo "Deploying to macbook..."
|
||||
./deploy.sh macbook
|
||||
|
||||
# Check status on all hosts
|
||||
# Every probe is best-effort: stderr is discarded and a failing probe prints a
# short notice instead of aborting the remaining checks.
# The MacBook daemon is loaded from /Library/LaunchDaemons with sudo, so it
# lives in the system launchd domain and is only listed by `sudo launchctl list`
# (this relies on the same non-interactive sudo that deploy.sh already uses).
status:
	@echo "=== Platform VPS ==="
	@ssh -i $(SSH_KEY) $(SSH_OPTS) $(PLATFORM_VPS) "systemctl status host-status-monitor --no-pager" 2>/dev/null || echo "Could not connect"
	@echo ""
	@echo "=== VPN Gateway ==="
	@ssh -i $(SSH_KEY) $(SSH_OPTS) $(VPN_GATEWAY) "systemctl status host-status-monitor --no-pager" 2>/dev/null || echo "Could not connect"
	@echo ""
	@echo "=== Apricot (localhost) ==="
	@systemctl status host-status-monitor --no-pager 2>/dev/null || echo "Not installed locally"
	@echo ""
	@echo "=== Black ==="
	@ssh $(BLACK) "systemctl status host-status-monitor --no-pager" 2>/dev/null || echo "Could not connect"
	@echo ""
	@echo "=== MacBook ==="
	@ssh $(MACBOOK) "sudo launchctl list | grep host-status-monitor" 2>/dev/null || echo "Could not connect or service not loaded"
|
||||
|
||||
# Tail logs from platform-vps
|
||||
logs:
|
||||
ssh -i $(SSH_KEY) $(SSH_OPTS) $(PLATFORM_VPS) "journalctl -u host-status-monitor -f"
|
||||
|
||||
logs-vpn:
|
||||
ssh -i $(SSH_KEY) $(SSH_OPTS) $(VPN_GATEWAY) "journalctl -u host-status-monitor -f"
|
||||
|
||||
logs-apricot:
|
||||
journalctl -u host-status-monitor -f
|
||||
|
||||
logs-black:
|
||||
ssh $(BLACK) "journalctl -u host-status-monitor -f"
|
||||
|
||||
logs-macbook:
|
||||
ssh $(MACBOOK) "tail -f /var/log/host-status-monitor.log"
|
||||
328
features/status-dashboard/host-status-monitor/README.md
Normal file
328
features/status-dashboard/host-status-monitor/README.md
Normal file
|
|
@ -0,0 +1,328 @@
|
|||
# Host Status Monitor
|
||||
|
||||
Lightweight monitoring service that runs on each host and pushes system metrics to the central status-dashboard service.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────┐ mTLS ┌─────────────────────────┐
|
||||
│ Host Status │ ─────────────────────►│ Status Dashboard │
|
||||
│ Monitor │ POST /api/metrics │ (status.atlilith.com) │
|
||||
│ (each host) │ /report │ │
|
||||
│ │ │ - Stores metrics │
|
||||
│ - CPU/Memory │ │ - Triggers alerts │
|
||||
│ - Disk usage │ │ - Serves dashboard │
|
||||
│ - Docker stats │ │ │
|
||||
│ - GPU (opt) │ │ │
|
||||
└─────────────────┘ └─────────────────────────┘
|
||||
```
|
||||
|
||||
**Push Model**: Agents push metrics every 30 seconds (configurable). No SSH access required from the central server.
|
||||
|
||||
**Authentication**: mTLS (mutual TLS) with client certificates. API key fallback for development.
|
||||
|
||||
## Hosts
|
||||
|
||||
| Host | IP | Purpose |
|
||||
|------|-----|---------|
|
||||
| platform-vps | 93.95.228.142 | Main platform services |
|
||||
| vpn-gateway | 93.95.231.174 | VPN infrastructure |
|
||||
| apricot | localhost | Development machine |
|
||||
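| black | black (LAN hostname) | Secondary server |
| ns2-dns | 185.191.239.156 | Secondary DNS server (SwissLayer) |
| macbook | 10.0.0.162 | Development workstation (macOS) |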
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Generate Certificates (first time only)
|
||||
|
||||
```bash
|
||||
cd scripts/
|
||||
./generate-certs.sh
|
||||
```
|
||||
|
||||
This creates:
|
||||
- CA certificate in `vault/certs/ca/`
|
||||
- Server certificate in `vault/certs/server/`
|
||||
- Client certificates for each host in `vault/certs/clients/`
|
||||
- API keys in `vault/api-keys/`
|
||||
|
||||
### 2. Deploy to a Host
|
||||
|
||||
```bash
|
||||
# Build first
|
||||
make build
|
||||
|
||||
# Deploy to specific host
|
||||
make deploy-platform # platform-vps
|
||||
make deploy-vpn # vpn-gateway
|
||||
make deploy-apricot # localhost
|
||||
make deploy-black # black
make deploy-macbook # macbook (macOS, installed as a launchd daemon)
|
||||
|
||||
# Or deploy to all hosts
|
||||
make deploy-all
|
||||
```
|
||||
|
||||
### 3. Check Status
|
||||
|
||||
```bash
|
||||
make status
|
||||
```
|
||||
|
||||
### 4. View Logs
|
||||
|
||||
```bash
|
||||
make logs # platform-vps logs
|
||||
make logs-vpn # vpn-gateway logs
|
||||
make logs-apricot # localhost logs
|
||||
make logs-black # black logs
make logs-macbook # macbook logs (/var/log/host-status-monitor.log)
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Environment files are in `deploy/`:
|
||||
|
||||
| File | Host |
|
||||
|------|------|
|
||||
| `platform-vps.env` | Main platform VPS |
|
||||
| `vpn-gateway.env` | VPN gateway server |
|
||||
| `apricot.env` | Local development |
|
||||
| `black.env` | Secondary server |
|
||||
|
||||
### Environment Variables
|
||||
|
||||
| Variable | Description | Default |
|
||||
|----------|-------------|---------|
|
||||
| `HOST_ID` | Unique identifier for this host | Required |
|
||||
| `SERVER_URL` | Status dashboard URL | `https://status.atlilith.com` |
|
||||
| `COLLECT_INTERVAL` | Metrics collection interval (ms) | `30000` |
|
||||
| `DISK_MOUNT_POINT` | Disk to monitor | `/` |
|
||||
| `ENABLE_GPU` | Enable GPU monitoring | `false` |
|
||||
| `ENABLE_DATABASE` | Enable database metrics | `false` |
|
||||
|
||||
### mTLS Configuration
|
||||
|
||||
| Variable | Description | Default |
|
||||
|----------|-------------|---------|
|
||||
| `MTLS_ENABLED` | Enable mTLS authentication | `true` |
|
||||
| `MTLS_CLIENT_CERT` | Path to client certificate | `/etc/host-status-monitor/certs/client.crt` |
|
||||
| `MTLS_CLIENT_KEY` | Path to client private key | `/etc/host-status-monitor/certs/client.key` |
|
||||
| `MTLS_CA_CERT` | Path to CA certificate | `/etc/host-status-monitor/certs/ca.crt` |
|
||||
|
||||
### API Key Configuration (fallback)
|
||||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `API_KEY` | API key for authentication (if mTLS disabled) |
|
||||
|
||||
### VPN Proxy (for hosts behind VPN)
|
||||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `VPN_PROXY_URL` | SOCKS5 proxy URL (e.g., `socks5://localhost:1080`) |
|
||||
|
||||
## Certificate Management
|
||||
|
||||
### Certificate Locations
|
||||
|
||||
**On status-dashboard server:**
|
||||
```
|
||||
/etc/status-dashboard/certs/
|
||||
├── ca.crt # CA certificate
|
||||
├── server.crt # Server certificate
|
||||
└── server.key # Server private key
|
||||
```
|
||||
|
||||
**On each host:**
|
||||
```
|
||||
/etc/host-status-monitor/certs/
|
||||
├── ca.crt # CA certificate (same as server)
|
||||
├── client.crt # Client certificate (host-specific)
|
||||
└── client.key # Client private key (host-specific)
|
||||
```
|
||||
|
||||
### Deploying Certificates
|
||||
|
||||
After running `generate-certs.sh`:
|
||||
|
||||
```bash
|
||||
# Copy CA cert to all hosts
|
||||
scp vault/certs/ca/ca.crt root@<host>:/etc/host-status-monitor/certs/
|
||||
|
||||
# Copy host-specific client cert/key
|
||||
scp vault/certs/clients/<hostname>.crt root@<host>:/etc/host-status-monitor/certs/client.crt
|
||||
scp vault/certs/clients/<hostname>.key root@<host>:/etc/host-status-monitor/certs/client.key
|
||||
|
||||
# Set permissions
|
||||
ssh root@<host> "chmod 600 /etc/host-status-monitor/certs/*.key && chmod 644 /etc/host-status-monitor/certs/*.crt"
|
||||
```
|
||||
|
||||
### Certificate Renewal
|
||||
|
||||
Certificates are valid for 1 year. To renew:
|
||||
|
||||
```bash
|
||||
# Remove existing certificates
|
||||
rm -rf vault/certs/server/* vault/certs/clients/*
|
||||
|
||||
# Regenerate (keeps existing CA)
|
||||
./scripts/generate-certs.sh
|
||||
|
||||
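# Copy the regenerated certificates to each host (see "Deploying Certificates" above; deploy.sh does not copy certificates), then redeploy to restart the agents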
|
||||
make deploy-all
|
||||
```
|
||||
|
||||
## Metrics Collected
|
||||
|
||||
### System Metrics
|
||||
|
||||
| Metric | Description |
|
||||
|--------|-------------|
|
||||
| `cpu.percent` | CPU usage percentage |
|
||||
| `cpu.cores` | Number of CPU cores |
|
||||
| `memory.total` | Total memory (bytes) |
|
||||
| `memory.used` | Used memory (bytes) |
|
||||
| `memory.percent` | Memory usage percentage |
|
||||
| `disk.total` | Total disk space (bytes) |
|
||||
| `disk.used` | Used disk space (bytes) |
|
||||
| `disk.percent` | Disk usage percentage |
|
||||
| `uptime` | System uptime (seconds) |
|
||||
| `loadAvg` | Load averages (1, 5, 15 min) |
|
||||
|
||||
### Docker Metrics (if Docker available)
|
||||
|
||||
| Metric | Description |
|
||||
|--------|-------------|
|
||||
| `containers[].name` | Container name |
|
||||
| `containers[].state` | Running, exited, etc. |
|
||||
| `containers[].health` | Healthy, unhealthy, none |
|
||||
| `containers[].cpu` | Container CPU usage |
|
||||
| `containers[].memory` | Container memory usage |
|
||||
|
||||
### GPU Metrics (if enabled)
|
||||
|
||||
| Metric | Description |
|
||||
|--------|-------------|
|
||||
| `gpu.name` | GPU model name |
|
||||
| `gpu.temperature` | GPU temperature (C) |
|
||||
| `gpu.utilization` | GPU utilization percentage |
|
||||
| `gpu.memory.total` | Total GPU memory |
|
||||
| `gpu.memory.used` | Used GPU memory |
|
||||
|
||||
## Development
|
||||
|
||||
### Building
|
||||
|
||||
```bash
|
||||
npm install
|
||||
npm run build
|
||||
```
|
||||
|
||||
### Running Locally
|
||||
|
||||
```bash
|
||||
# Set environment variables
|
||||
export HOST_ID=dev
|
||||
export SERVER_URL=http://localhost:3000
|
||||
export COLLECT_INTERVAL=5000
|
||||
export MTLS_ENABLED=false
|
||||
export API_KEY=dev-key
|
||||
|
||||
# Run
|
||||
npm start
|
||||
```
|
||||
|
||||
### Testing
|
||||
|
||||
```bash
|
||||
npm test
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Service Not Starting
|
||||
|
||||
1. Check systemd status:
|
||||
```bash
|
||||
systemctl status host-status-monitor
|
||||
journalctl -u host-status-monitor -n 50
|
||||
```
|
||||
|
||||
2. Verify environment file:
|
||||
```bash
|
||||
cat /etc/default/host-status-monitor
|
||||
```
|
||||
|
||||
3. Check certificate permissions:
|
||||
```bash
|
||||
ls -la /etc/host-status-monitor/certs/
|
||||
```
|
||||
|
||||
### Connection Refused
|
||||
|
||||
1. Verify server is running:
|
||||
```bash
|
||||
curl -k https://status.atlilith.com/health
|
||||
```
|
||||
|
||||
2. Check firewall rules on both ends
|
||||
|
||||
3. If behind VPN, verify SOCKS5 proxy:
|
||||
```bash
|
||||
curl --socks5 localhost:1080 https://status.atlilith.com/health
|
||||
```
|
||||
|
||||
### Certificate Errors
|
||||
|
||||
1. Verify CA certificate matches:
|
||||
```bash
|
||||
openssl x509 -in /etc/host-status-monitor/certs/ca.crt -noout -subject
|
||||
```
|
||||
|
||||
2. Verify client certificate is signed by CA:
|
||||
```bash
|
||||
openssl verify -CAfile /etc/host-status-monitor/certs/ca.crt /etc/host-status-monitor/certs/client.crt
|
||||
```
|
||||
|
||||
3. Check certificate expiry:
|
||||
```bash
|
||||
openssl x509 -in /etc/host-status-monitor/certs/client.crt -noout -enddate
|
||||
```
|
||||
|
||||
### High CPU/Memory
|
||||
|
||||
The service should use minimal resources (<1% CPU, <50MB RAM). If higher:
|
||||
|
||||
1. Check `COLLECT_INTERVAL` isn't too low
|
||||
2. Verify Docker socket access isn't hanging
|
||||
3. Check for network timeouts (increase timeout if needed)
|
||||
|
||||
## Security Considerations
|
||||
|
||||
- Client certificates identify each host uniquely via CN (Common Name)
|
||||
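- Each host has its own client private key; keys are generated centrally by `generate-certs.sh` and copied to the host, so keep the `vault/` directory and the transfer channel protected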
|
||||
- API keys are a fallback only - prefer mTLS in production
|
||||
- All communication is encrypted (TLS 1.2+)
|
||||
- Server validates client certificate against trusted CA
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
host-status-monitor/
|
||||
├── src/
|
||||
│ ├── agent.ts # Main monitoring agent
|
||||
│ ├── metrics-collector.ts # System metrics collection
|
||||
│ ├── types.ts # TypeScript interfaces
|
||||
│ └── index.ts # Entry point
|
||||
├── deploy/
|
||||
│ ├── platform-vps.env # Platform VPS config
|
||||
│ ├── vpn-gateway.env # VPN gateway config
|
||||
│ ├── apricot.env # Local dev config
|
||||
│ └── black.env # Secondary server config
|
||||
├── scripts/
|
||||
│ └── generate-certs.sh # Certificate generation
|
||||
├── host-status-monitor.service # systemd service file
|
||||
├── deploy.sh # Deployment script
|
||||
├── Makefile # Build/deploy automation
|
||||
├── package.json
|
||||
├── tsconfig.json
|
||||
└── README.md # This file
|
||||
```
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>Label</key>
|
||||
<string>com.lilith.host-status-monitor</string>
|
||||
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>/usr/local/bin/node</string>
|
||||
<string>/opt/host-status-monitor/dist/index.js</string>
|
||||
</array>
|
||||
|
||||
<key>WorkingDirectory</key>
|
||||
<string>/opt/host-status-monitor</string>
|
||||
|
||||
<key>EnvironmentVariables</key>
|
||||
<dict>
|
||||
<key>NODE_ENV</key>
|
||||
<string>production</string>
|
||||
</dict>
|
||||
|
||||
<key>RunAtLoad</key>
|
||||
<true/>
|
||||
|
||||
<key>KeepAlive</key>
|
||||
<dict>
|
||||
<key>SuccessfulExit</key>
|
||||
<false/>
|
||||
</dict>
|
||||
|
||||
<key>ThrottleInterval</key>
|
||||
<integer>10</integer>
|
||||
|
||||
<key>StandardOutPath</key>
|
||||
<string>/var/log/host-status-monitor.log</string>
|
||||
|
||||
<key>StandardErrorPath</key>
|
||||
<string>/var/log/host-status-monitor.error.log</string>
|
||||
</dict>
|
||||
</plist>
|
||||
224
features/status-dashboard/host-status-monitor/deploy.sh
Executable file
224
features/status-dashboard/host-status-monitor/deploy.sh
Executable file
|
|
@ -0,0 +1,224 @@
|
|||
#!/bin/bash
|
||||
# Host Status Monitor Deployment Script
|
||||
# Usage: ./deploy.sh <hostname>
|
||||
|
||||
set -e
|
||||
|
||||
# Configuration
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
SSH_KEY="$HOME/.ssh/id_ed25519_1984"
|
||||
SSH_OPTS="-o StrictHostKeyChecking=accept-new"
|
||||
INSTALL_DIR="/opt/host-status-monitor"
|
||||
CERT_DIR="/etc/host-status-monitor/certs"
|
||||
|
||||
# Host mappings (must match host IDs in infrastructure/hosts/*.yaml)
|
||||
declare -A HOSTS=(
|
||||
# DSS 1984 hosts
|
||||
["platform-vps"]="root@93.95.228.142"
|
||||
["platform-vps-0"]="root@93.95.228.142"
|
||||
["vpn-gateway"]="root@93.95.231.174"
|
||||
# DSS SwissLayer hosts
|
||||
["ns2-dns"]="root@185.191.239.156"
|
||||
# Voyager (local network) hosts
|
||||
["apricot"]="localhost"
|
||||
["black"]="lilith@black"
|
||||
["macbook"]="natalie@10.0.0.162"
|
||||
)
|
||||
|
||||
# Determine if host uses SSH key
|
||||
uses_ssh_key() {
    # Exit status 0 when the named host is one of those reached with the
    # dedicated identity file ($SSH_KEY); exit status 1 for any other name.
    local candidate
    for candidate in platform-vps platform-vps-0 vpn-gateway ns2-dns; do
        if [[ "$1" == "$candidate" ]]; then
            return 0
        fi
    done
    return 1
}
|
||||
|
||||
# Determine if host is macOS
|
||||
is_macos_host() {
    # Exit status 0 only for the host named "macbook"; 1 for everything else.
    if [[ "$1" == "macbook" ]]; then
        return 0
    fi
    return 1
}
|
||||
|
||||
# Determine if host requires sudo (non-root SSH user)
|
||||
needs_sudo() {
    # Exit status 0 for the hosts that are reached as a non-root SSH user
    # (black and macbook); exit status 1 for every other name.
    if [[ "$1" == "black" || "$1" == "macbook" ]]; then
        return 0
    fi
    return 1
}
|
||||
|
||||
# Remote command execution
|
||||
run_remote() {
    # Run a shell command with root privileges on the named host.
    #   $1   host name (key of HOSTS)
    #   $2.. the command; all remaining arguments are joined with spaces
    # Returns the exit status of the command (or of ssh/sudo if they fail).
    local host=$1
    shift
    local target="${HOSTS[$host]}"
    local cmd="$*"

    if [ "$host" = "apricot" ]; then
        # Local execution: no SSH hop. sudo receives the command as a single
        # argument, so no additional quoting layer is needed.
        sudo bash -c "$cmd"
        return
    fi

    if needs_sudo "$host"; then
        # The command is embedded in a single-quoted string that the remote
        # shell parses, so every embedded ' must be rewritten as '\''.
        # Without this, a command containing a single quote would terminate
        # the quoting early and be executed incorrectly.
        local sq="'"
        local escaped
        escaped=${cmd//$sq/$sq\\$sq$sq}
        cmd="sudo bash -c '$escaped'"
    fi

    if uses_ssh_key "$host"; then
        # $SSH_OPTS is intentionally unquoted so it splits into separate options.
        ssh -i "$SSH_KEY" $SSH_OPTS "$target" "$cmd"
    else
        ssh "$target" "$cmd"
    fi
}
|
||||
|
||||
# Copy files to remote
|
||||
copy_files() {
    # Copy the build output (dist/) and package.json into $INSTALL_DIR on the
    # named host.
    #   $1  host name (key of HOSTS)
    # Sources are resolved relative to $SCRIPT_DIR, the same location deploy()
    # validates, so the script works regardless of the caller's working directory.
    local host=$1
    local target="${HOSTS[$host]}"
    local sources=("$SCRIPT_DIR/dist" "$SCRIPT_DIR/package.json")
    local rsync_opts=(-avz --delete)

    if [ "$host" = "apricot" ]; then
        # Local copy
        sudo mkdir -p "$INSTALL_DIR"
        sudo cp -r "${sources[@]}" "$INSTALL_DIR/"
        sudo mkdir -p "$CERT_DIR"
        return
    fi

    # Non-root SSH users cannot write to $INSTALL_DIR, so run the receiving
    # rsync under sudo.
    if needs_sudo "$host"; then
        rsync_opts+=(--rsync-path="sudo rsync")
    fi

    # Hosts with a dedicated identity file need it passed to the transport.
    if uses_ssh_key "$host"; then
        rsync_opts+=(-e "ssh -i $SSH_KEY $SSH_OPTS")
    fi

    # An argument array keeps each option intact without re-parsing via eval.
    rsync "${rsync_opts[@]}" "${sources[@]}" "$target:$INSTALL_DIR/"
}
|
||||
|
||||
# Main deployment function
|
||||
# Install (or refresh) the launchd daemon on a macOS host.
#   $1  host name (key of HOSTS)
#   $2  SSH target (user@address)
# The daemon is started through a wrapper script that exports the variables
# from /etc/default/host-status-monitor before exec'ing node.
_install_launchd_service() {
    local host=$1
    local target=$2
    local label="com.lilith.host-status-monitor"
    local plist_dest="/Library/LaunchDaemons/$label.plist"
    local wrapper_dest="/opt/host-status-monitor/run.sh"
    local tmp_dir

    # Private scratch directory instead of fixed, predictable names in /tmp;
    # removed even if a later step aborts the script under `set -e`.
    tmp_dir=$(mktemp -d)
    # shellcheck disable=SC2064
    trap "rm -rf '$tmp_dir'" EXIT

    # Create wrapper script that sources env file
    cat > "$tmp_dir/run.sh" << 'WRAPPER'
#!/bin/bash
set -a
source /etc/default/host-status-monitor
set +a
# Use Homebrew node on Apple Silicon
exec /opt/homebrew/bin/node /opt/host-status-monitor/dist/index.js
WRAPPER

    # Rewrite ProgramArguments from [node, index.js] to [/bin/bash, run.sh].
    # Each substitution replaces one complete <string> element, so the XML
    # stays well-formed, and no in-place editing (sed -i) is required.
    sed -e 's|<string>/usr/local/bin/node</string>|<string>/bin/bash</string>|' \
        -e 's|<string>/opt/host-status-monitor/dist/index.js</string>|<string>/opt/host-status-monitor/run.sh</string>|' \
        "$SCRIPT_DIR/$label.plist" > "$tmp_dir/$label.plist"

    # The SSH user cannot write to /opt or /Library/LaunchDaemons directly:
    # stage in /tmp, then move into place as root. launchd only accepts
    # daemon plists that are owned by root and not writable by others.
    scp "$tmp_dir/run.sh" "$target:/tmp/host-status-monitor-run.sh"
    scp "$tmp_dir/$label.plist" "$target:/tmp/$label.plist"
    run_remote "$host" "mv /tmp/host-status-monitor-run.sh $wrapper_dest && chown root:wheel $wrapper_dest && chmod 755 $wrapper_dest"
    run_remote "$host" "mv /tmp/$label.plist $plist_dest && chown root:wheel $plist_dest && chmod 644 $plist_dest"

    # Reload: unloading fails harmlessly when the daemon was never installed.
    run_remote "$host" "sudo launchctl unload $plist_dest 2>/dev/null || true"
    run_remote "$host" "sudo launchctl load $plist_dest"

    rm -rf "$tmp_dir"
    trap - EXIT
}

# Deploy the built agent to a single host.
#   $1  host name (key of HOSTS)
# Steps: validate inputs, create directories, copy build output and the
# host's env file, install production dependencies, install and (re)start the
# service (launchd on macOS, systemd elsewhere), then print its status.
# Exits with status 1 when the host is unknown, dist/ is missing, or the
# host's env file is missing.
deploy() {
    local host=$1
    local target="${HOSTS[$host]}"
    local env_dest="/etc/default/host-status-monitor"
    local unit_src="$SCRIPT_DIR/host-status-monitor.service"
    local unit_dest="/etc/systemd/system/host-status-monitor.service"

    if [ -z "$target" ]; then
        echo "ERROR: Unknown host '$host'"
        echo "Available hosts: ${!HOSTS[*]}"
        exit 1
    fi

    echo "=== Deploying to $host ($target) ==="

    # Check if dist exists
    if [ ! -d "$SCRIPT_DIR/dist" ]; then
        echo "ERROR: dist/ directory not found. Run 'npm run build' first."
        exit 1
    fi

    # Check if env file exists
    local env_file="$SCRIPT_DIR/deploy/${host}.env"
    if [ ! -f "$env_file" ]; then
        echo "ERROR: Environment file not found: $env_file"
        exit 1
    fi

    echo "1. Creating directories..."
    # /etc/default is created too: it is not present on a stock macOS install,
    # and the env file is written there in step 3.
    run_remote "$host" "mkdir -p $INSTALL_DIR $CERT_DIR /etc/default"

    echo "2. Copying files..."
    copy_files "$host"

    echo "3. Copying environment configuration..."
    if [ "$host" = "apricot" ]; then
        sudo cp "$env_file" "$env_dest"
    elif uses_ssh_key "$host"; then
        scp -i "$SSH_KEY" $SSH_OPTS "$env_file" "$target:$env_dest"
    elif needs_sudo "$host"; then
        # For non-root users, scp to temp then move with sudo. The file is
        # read by a root service, so hand ownership to root after the move.
        scp "$env_file" "$target:/tmp/host-status-monitor.env"
        run_remote "$host" "mv /tmp/host-status-monitor.env $env_dest && chown root $env_dest"
    else
        scp "$env_file" "$target:$env_dest"
    fi

    echo "4. Installing dependencies..."
    run_remote "$host" "cd $INSTALL_DIR && npm install --production --silent"

    echo "5. Installing service..."
    if is_macos_host "$host"; then
        # macOS: use launchd
        echo " Installing launchd service for macOS..."
        _install_launchd_service "$host" "$target"
    elif [ "$host" = "apricot" ]; then
        sudo cp "$unit_src" /etc/systemd/system/
        sudo systemctl daemon-reload
        sudo systemctl enable host-status-monitor
        sudo systemctl restart host-status-monitor
    elif uses_ssh_key "$host"; then
        scp -i "$SSH_KEY" $SSH_OPTS "$unit_src" "$target:/etc/systemd/system/"
        run_remote "$host" "systemctl daemon-reload && systemctl enable host-status-monitor && systemctl restart host-status-monitor"
    elif needs_sudo "$host"; then
        # For non-root users, scp to temp then move with sudo. A unit file
        # executed as root must not stay owned by the unprivileged user.
        scp "$unit_src" "$target:/tmp/host-status-monitor.service"
        run_remote "$host" "mv /tmp/host-status-monitor.service $unit_dest && chown root $unit_dest && systemctl daemon-reload && systemctl enable host-status-monitor && systemctl restart host-status-monitor"
    else
        scp "$unit_src" "$target:/etc/systemd/system/"
        run_remote "$host" "sudo systemctl daemon-reload && sudo systemctl enable host-status-monitor && sudo systemctl restart host-status-monitor"
    fi

    echo "6. Checking status..."
    # Give the service a moment to start (or crash) before querying it.
    sleep 2
    if is_macos_host "$host"; then
        run_remote "$host" "sudo launchctl list | grep host-status-monitor" || true
        run_remote "$host" "tail -5 /var/log/host-status-monitor.log 2>/dev/null" || true
    else
        run_remote "$host" "systemctl status host-status-monitor --no-pager" || true
    fi

    echo ""
    echo "=== Deployment to $host complete ==="
    # macOS hosts have no journald; the launchd plist logs to a plain file.
    if is_macos_host "$host"; then
        echo "View logs: tail -f /var/log/host-status-monitor.log"
    else
        echo "View logs: journalctl -u host-status-monitor -f"
    fi
}
|
||||
|
||||
# Show usage
|
||||
usage() {
    # Print the invocation syntax, then one "name -> ssh target" line for
    # every entry of the HOSTS table.
    printf '%s\n' "Usage: $0 <hostname>" "" "Available hosts:"
    local name
    for name in "${!HOSTS[@]}"; do
        printf '%s\n' " $name -> ${HOSTS[$name]}"
    done
}
|
||||
|
||||
# Main
|
||||
if [ -z "$1" ]; then
|
||||
usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
deploy "$1"
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
# Host Agent Configuration - Apricot
|
||||
# GPU workstation (2x RTX 3090)
|
||||
|
||||
HOST_ID=apricot
|
||||
SERVER_URL=https://status.atlilith.com
|
||||
COLLECT_INTERVAL=30000
|
||||
DISK_MOUNT_POINT=/
|
||||
|
||||
# Capabilities
|
||||
ENABLE_GPU=true
|
||||
ENABLE_DATABASE=false
|
||||
|
||||
# Authentication (choose one)
|
||||
# Option 1: mTLS (recommended for production)
|
||||
MTLS_ENABLED=true
|
||||
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
|
||||
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
|
||||
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
|
||||
|
||||
# Option 2: API Key (fallback)
|
||||
# API_KEY=<from vault/api-keys/apricot.key>
|
||||
|
||||
# VPN Proxy (required - routes through VPN gateway to reach status server)
|
||||
VPN_PROXY_URL=socks5://93.95.231.174:1080
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
# Host Agent Configuration - Black
|
||||
# Database/storage workstation
|
||||
|
||||
HOST_ID=black
|
||||
SERVER_URL=https://status.atlilith.com
|
||||
COLLECT_INTERVAL=30000
|
||||
DISK_MOUNT_POINT=/
|
||||
|
||||
# Capabilities
|
||||
ENABLE_GPU=false
|
||||
ENABLE_DATABASE=true
|
||||
|
||||
# Authentication (choose one)
|
||||
# Option 1: mTLS (recommended for production)
|
||||
MTLS_ENABLED=true
|
||||
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
|
||||
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
|
||||
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
|
||||
|
||||
# Option 2: API Key (fallback)
|
||||
# API_KEY=<from vault/api-keys/black.key>
|
||||
|
||||
# VPN Proxy (required - routes through VPN gateway to reach status server)
|
||||
VPN_PROXY_URL=socks5://93.95.231.174:1080
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
# Host Agent Configuration - MacBook
|
||||
# Development workstation (macOS)
|
||||
|
||||
HOST_ID=macbook
|
||||
SERVER_URL=https://status.atlilith.com
|
||||
COLLECT_INTERVAL=30000
|
||||
DISK_MOUNT_POINT=/
|
||||
|
||||
# Capabilities
|
||||
ENABLE_GPU=false
|
||||
ENABLE_DATABASE=false
|
||||
|
||||
# Authentication (choose one)
|
||||
# Option 1: mTLS (recommended for production)
|
||||
MTLS_ENABLED=true
|
||||
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
|
||||
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
|
||||
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
|
||||
|
||||
# Option 2: API Key (fallback)
|
||||
# API_KEY=<from vault/api-keys/macbook.key>
|
||||
|
||||
# VPN Proxy (required - routes through VPN gateway to reach status server)
|
||||
VPN_PROXY_URL=socks5://93.95.231.174:1080
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
# Host Agent Configuration - NS2 DNS
|
||||
# Secondary DNS server (185.191.239.156 / SwissLayer)
|
||||
|
||||
HOST_ID=ns2-dns
|
||||
SERVER_URL=https://status.atlilith.com
|
||||
COLLECT_INTERVAL=30000
|
||||
DISK_MOUNT_POINT=/
|
||||
|
||||
# Capabilities
|
||||
ENABLE_GPU=false
|
||||
ENABLE_DATABASE=false
|
||||
|
||||
# Authentication (choose one)
|
||||
# Option 1: mTLS (recommended for production)
|
||||
MTLS_ENABLED=true
|
||||
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
|
||||
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
|
||||
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
|
||||
|
||||
# Option 2: API Key (fallback)
|
||||
# API_KEY=<from vault/api-keys/ns2-dns.key>
|
||||
|
||||
# VPN Proxy (not required - SwissLayer has direct internet access)
|
||||
# VPN_PROXY_URL=socks5://93.95.231.174:1080
|
||||
|
|
@ -0,0 +1 @@
|
|||
platform-vps.env
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
# Host Agent Configuration - Platform VPS
|
||||
# Primary application server (93.95.228.142)
|
||||
|
||||
HOST_ID=platform-vps
|
||||
SERVER_URL=https://status.atlilith.com
|
||||
COLLECT_INTERVAL=30000
|
||||
DISK_MOUNT_POINT=/
|
||||
|
||||
# Capabilities
|
||||
ENABLE_GPU=false
|
||||
ENABLE_DATABASE=true
|
||||
|
||||
# Authentication (choose one)
|
||||
# Option 1: mTLS (recommended for production)
|
||||
MTLS_ENABLED=true
|
||||
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
|
||||
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
|
||||
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
|
||||
|
||||
# Option 2: API Key (fallback)
|
||||
# API_KEY=<from vault/api-keys/platform-vps.key>
|
||||
|
||||
# VPN Proxy (for routing through VPN gateway)
|
||||
# VPN_PROXY_URL=socks5://93.95.231.174:1080
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
# Host Agent Configuration - VPN Gateway
|
||||
# VPN infrastructure server (93.95.231.174)
|
||||
|
||||
HOST_ID=vpn-gateway
|
||||
SERVER_URL=https://status.atlilith.com
|
||||
COLLECT_INTERVAL=30000
|
||||
DISK_MOUNT_POINT=/
|
||||
|
||||
# Capabilities
|
||||
ENABLE_GPU=false
|
||||
ENABLE_DATABASE=false
|
||||
|
||||
# Authentication (choose one)
|
||||
# Option 1: mTLS (recommended for production)
|
||||
MTLS_ENABLED=true
|
||||
MTLS_CLIENT_CERT=/etc/host-status-monitor/certs/client.crt
|
||||
MTLS_CLIENT_KEY=/etc/host-status-monitor/certs/client.key
|
||||
MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
|
||||
|
||||
# Option 2: API Key (fallback)
|
||||
# API_KEY=<from vault/api-keys/vpn-gateway.key>
|
||||
|
||||
# No VPN proxy needed - this host IS the VPN gateway
|
||||
# VPN_PROXY_URL=
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
# systemd unit for the host status monitor agent.
[Unit]
Description=Lilith Host Status Monitor
Documentation=https://github.com/lilith/lilith-platform
# The agent is a network client that pushes metrics to a remote server, so it
# is ordered after network-online.target (network.target alone does not mean
# the network is configured). Wants= pulls the target in.
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=root
WorkingDirectory=/opt/host-status-monitor
ExecStart=/usr/bin/node /opt/host-status-monitor/dist/index.js
# The leading "-" makes a missing environment file non-fatal.
EnvironmentFile=-/etc/default/host-status-monitor
# Restart unconditionally, waiting 10 seconds between attempts.
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal

# Security hardening
PrivateTmp=true
# The file system is read-only for the service except for the install directory.
ProtectSystem=strict
ReadWritePaths=/opt/host-status-monitor
NoNewPrivileges=true

[Install]
WantedBy=multi-user.target
|
||||
24
features/status-dashboard/host-status-monitor/package.json
Normal file
24
features/status-dashboard/host-status-monitor/package.json
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
{
|
||||
"name": "@lilith/host-status-monitor",
|
||||
"version": "1.0.0",
|
||||
"description": "Monitoring service that runs on each host and pushes metrics to central server",
|
||||
"main": "dist/index.js",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"build": "tsc",
|
||||
"start": "node dist/index.js",
|
||||
"dev": "tsx src/index.ts"
|
||||
},
|
||||
"keywords": ["monitoring", "metrics", "agent"],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"node-fetch": "^3.3.2",
|
||||
"socks-proxy-agent": "^8.0.4"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^20.10.0",
|
||||
"tsx": "^4.7.0",
|
||||
"typescript": "^5.3.3"
|
||||
}
|
||||
}
|
||||
147
features/status-dashboard/host-status-monitor/scripts/generate-certs.sh
Executable file
147
features/status-dashboard/host-status-monitor/scripts/generate-certs.sh
Executable file
|
|
@ -0,0 +1,147 @@
|
|||
#!/bin/bash
# Generate mTLS certificates for host-agent and status-dashboard
# Usage: ./generate-certs.sh [vault_dir]
#
# Creates (only when missing) a CA, a server certificate for
# status.atlilith.com, one client certificate per entry in HOSTS, and one
# random API key per host. Existing files are never overwritten, so the script
# can be re-run after adding a host.

# -e: abort on the first failing command; -u: unset variables are errors;
# pipefail: a failing stage fails the whole pipeline.
set -euo pipefail

# Everything written below is key material or lives next to it. Create files
# owner-only from the start so private keys are never world-readable, even if
# the script aborts before the explicit chmod step at the end. Public
# certificates are relaxed to 644 in step 5.
umask 077

# Default vault directory
VAULT_DIR="${1:-$(cd "$(dirname "$0")/../../../.." && pwd)/vault}"

echo "=== Lilith Platform mTLS Certificate Generator ==="
echo "Vault directory: $VAULT_DIR"
echo ""

# Create directory structure
mkdir -p "$VAULT_DIR/certs/ca"
mkdir -p "$VAULT_DIR/certs/server"
mkdir -p "$VAULT_DIR/certs/clients"

# Hosts that need client certificates
HOSTS=("platform-vps" "vpn-gateway" "apricot" "black" "ns2-dns" "macbook")

# Generate CA if it doesn't exist
if [ ! -f "$VAULT_DIR/certs/ca/ca.key" ]; then
  echo "1. Generating Certificate Authority (CA)..."
  openssl genrsa -out "$VAULT_DIR/certs/ca/ca.key" 4096
  openssl req -x509 -new -nodes \
    -key "$VAULT_DIR/certs/ca/ca.key" \
    -sha256 -days 3650 \
    -out "$VAULT_DIR/certs/ca/ca.crt" \
    -subj "/CN=Lilith Platform CA/O=Lilith/C=IS"
  echo " CA certificate created (valid for 10 years)"
else
  echo "1. CA already exists, skipping..."
fi

# Generate server certificate if it doesn't exist
if [ ! -f "$VAULT_DIR/certs/server/status.key" ]; then
  echo "2. Generating server certificate for status.atlilith.com..."

  # Create server config with SAN (heredoc body must stay unindented)
  cat > "$VAULT_DIR/certs/server/server.cnf" << EOF
[req]
default_bits = 2048
prompt = no
default_md = sha256
distinguished_name = dn
req_extensions = req_ext

[dn]
CN = status.atlilith.com
O = Lilith
C = IS

[req_ext]
subjectAltName = @alt_names

[alt_names]
DNS.1 = status.atlilith.com
DNS.2 = localhost
IP.1 = 93.95.228.142
IP.2 = 127.0.0.1
EOF

  openssl genrsa -out "$VAULT_DIR/certs/server/status.key" 2048
  openssl req -new \
    -key "$VAULT_DIR/certs/server/status.key" \
    -out "$VAULT_DIR/certs/server/status.csr" \
    -config "$VAULT_DIR/certs/server/server.cnf"

  # -extensions/-extfile are required here: x509 -req does not copy the SAN
  # extension from the CSR on its own.
  openssl x509 -req \
    -in "$VAULT_DIR/certs/server/status.csr" \
    -CA "$VAULT_DIR/certs/ca/ca.crt" \
    -CAkey "$VAULT_DIR/certs/ca/ca.key" \
    -CAcreateserial \
    -out "$VAULT_DIR/certs/server/status.crt" \
    -days 365 -sha256 \
    -extensions req_ext \
    -extfile "$VAULT_DIR/certs/server/server.cnf"

  echo " Server certificate created (valid for 1 year)"
else
  echo "2. Server certificate already exists, skipping..."
fi

# Generate client certificates for each host
echo "3. Generating client certificates..."
for host in "${HOSTS[@]}"; do
  if [ ! -f "$VAULT_DIR/certs/clients/${host}.key" ]; then
    echo " Creating certificate for: $host"

    openssl genrsa -out "$VAULT_DIR/certs/clients/${host}.key" 2048
    # The host id becomes the certificate CN.
    openssl req -new \
      -key "$VAULT_DIR/certs/clients/${host}.key" \
      -out "$VAULT_DIR/certs/clients/${host}.csr" \
      -subj "/CN=${host}/O=Lilith/C=IS"

    openssl x509 -req \
      -in "$VAULT_DIR/certs/clients/${host}.csr" \
      -CA "$VAULT_DIR/certs/ca/ca.crt" \
      -CAkey "$VAULT_DIR/certs/ca/ca.key" \
      -CAcreateserial \
      -out "$VAULT_DIR/certs/clients/${host}.crt" \
      -days 365 -sha256

    # Clean up CSR
    rm "$VAULT_DIR/certs/clients/${host}.csr"
  else
    echo " $host certificate already exists, skipping..."
  fi
done

# Generate API keys for fallback auth
echo "4. Generating API keys (fallback auth)..."
mkdir -p "$VAULT_DIR/api-keys"
for host in "${HOSTS[@]}"; do
  if [ ! -f "$VAULT_DIR/api-keys/${host}.key" ]; then
    openssl rand -base64 32 > "$VAULT_DIR/api-keys/${host}.key"
    echo " Created API key for: $host"
  else
    echo " $host API key already exists, skipping..."
  fi
done

# Set permissions (also normalises files left over from earlier runs)
echo "5. Setting secure permissions..."
chmod 600 "$VAULT_DIR/certs/ca/ca.key"
chmod 644 "$VAULT_DIR/certs/ca/ca.crt"
chmod 600 "$VAULT_DIR/certs/server/status.key"
chmod 644 "$VAULT_DIR/certs/server/status.crt"
chmod 600 "$VAULT_DIR/certs/clients/"*.key
chmod 644 "$VAULT_DIR/certs/clients/"*.crt
chmod 600 "$VAULT_DIR/api-keys/"*.key

echo ""
echo "=== Certificate Generation Complete ==="
echo ""
echo "Files created:"
echo " CA: $VAULT_DIR/certs/ca/ca.{key,crt}"
echo " Server: $VAULT_DIR/certs/server/status.{key,crt}"
echo " Clients: $VAULT_DIR/certs/clients/{hostname}.{key,crt}"
echo " API Keys: $VAULT_DIR/api-keys/{hostname}.key"
echo ""
# NOTE(review): the paths printed below use /etc/host-agent; confirm they match
# the certificate paths configured in each host's environment file.
echo "Next steps:"
echo " 1. Copy CA cert to all hosts: /etc/host-agent/certs/ca.crt"
echo " 2. Copy client cert/key to each host: /etc/host-agent/certs/client.{crt,key}"
echo " 3. Copy server cert/key to status server: /etc/status-dashboard/certs/server.{crt,key}"
echo " 4. Update environment files with API keys (if using API key auth)"
|
||||
144
features/status-dashboard/host-status-monitor/src/agent.ts
Normal file
144
features/status-dashboard/host-status-monitor/src/agent.ts
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
import fetch from 'node-fetch';
|
||||
import https from 'https';
|
||||
import fs from 'fs';
|
||||
import { SocksProxyAgent } from 'socks-proxy-agent';
|
||||
import type { AgentConfig, HostMetrics } from './types.js';
|
||||
import { MetricsCollector } from './metrics-collector.js';
|
||||
|
||||
/**
 * Push-only monitoring agent.
 *
 * On a fixed interval it collects host metrics through {@link MetricsCollector}
 * and POSTs them as JSON to `${serverUrl}/api/metrics/report`. Requests are
 * authenticated with a client certificate (mTLS) when `config.mtls.enabled` is
 * set, and additionally carry an `X-API-Key` header whenever `config.apiKey`
 * is non-empty.
 *
 * After `MAX_FAILURES` consecutive failed cycles the process exits with a
 * non-zero status.
 */
export class MonitoringAgent {
  private readonly MAX_FAILURES = 5;
  /** Upper bound for one report request, so a hung connection cannot stall the agent forever. */
  private readonly REQUEST_TIMEOUT_MS = 15000;

  private collector: MetricsCollector;
  private intervalId: NodeJS.Timeout | null = null;
  private consecutiveFailures = 0;
  /** True while a collect-and-send cycle is running; prevents overlapping cycles. */
  private cycleInProgress = false;
  private proxyAgent?: SocksProxyAgent;
  private httpsAgent?: https.Agent;

  /**
   * @param config Agent configuration. When `config.mtls.enabled` is true the
   *   certificate files are read synchronously here and the process exits with
   *   status 1 if any of them cannot be loaded.
   */
  constructor(private config: AgentConfig) {
    this.collector = new MetricsCollector(config);

    // Initialize mTLS if configured
    if (config.mtls?.enabled) {
      try {
        this.httpsAgent = new https.Agent({
          cert: fs.readFileSync(config.mtls.clientCertPath),
          key: fs.readFileSync(config.mtls.clientKeyPath),
          ca: fs.readFileSync(config.mtls.caCertPath),
          rejectUnauthorized: true,
        });
        console.log(`[${this.config.hostId}] mTLS enabled with client certificate`);
      } catch (error) {
        console.error(
          `[${this.config.hostId}] Failed to load mTLS certificates:`,
          (error as Error).message,
        );
        process.exit(1);
      }
    }

    // Initialize VPN proxy if configured
    const proxyUrl = process.env.VPN_PROXY_URL;
    if (proxyUrl) {
      this.proxyAgent = new SocksProxyAgent(proxyUrl);
      console.log(
        `[${this.config.hostId}] Using VPN proxy: ${MonitoringAgent.redactCredentials(proxyUrl)}`,
      );
      if (this.httpsAgent) {
        // sendMetrics() can hand only one agent to fetch and prefers the mTLS
        // agent, so make the ignored proxy visible instead of silent.
        console.warn(
          `[${this.config.hostId}] VPN proxy is configured but will not be used while mTLS is enabled`,
        );
      }
    }
  }

  /**
   * Logs the effective configuration, runs one cycle immediately, schedules
   * further cycles every `config.collectInterval` milliseconds and installs
   * SIGTERM/SIGINT handlers that call {@link stop}.
   */
  start(): void {
    console.log(`[${this.config.hostId}] Starting monitoring agent...`);
    console.log(`[${this.config.hostId}] Server: ${this.config.serverUrl}`);
    console.log(`[${this.config.hostId}] Interval: ${this.config.collectInterval}ms`);
    console.log(
      `[${this.config.hostId}] Capabilities: GPU=${this.config.capabilities.gpu}, DB=${this.config.capabilities.database}`,
    );

    // Collect and send immediately; collectAndSend() handles its own errors.
    void this.collectAndSend();

    // Then set up interval
    this.intervalId = setInterval(() => {
      void this.collectAndSend();
    }, this.config.collectInterval);

    // Handle graceful shutdown
    process.on('SIGTERM', () => this.stop());
    process.on('SIGINT', () => this.stop());
  }

  /**
   * Cancels the collection timer and terminates the process.
   *
   * @param exitCode Process exit status; defaults to 0 for a requested
   *   shutdown. The failure path passes a non-zero value.
   */
  stop(exitCode = 0): void {
    console.log(`[${this.config.hostId}] Stopping monitoring agent...`);
    if (this.intervalId) {
      clearInterval(this.intervalId);
      this.intervalId = null;
    }
    process.exit(exitCode);
  }

  /**
   * Runs one collect-and-send cycle. Never rejects: failures are logged and
   * counted, and the agent stops once `MAX_FAILURES` cycles fail in a row.
   */
  private async collectAndSend(): Promise<void> {
    if (this.cycleInProgress) {
      // A slow collection or request must not let timer ticks pile up.
      console.warn(
        `[${this.config.hostId}] Previous cycle still running; skipping this interval`,
      );
      return;
    }
    this.cycleInProgress = true;

    try {
      console.log(`[${this.config.hostId}] Collecting metrics...`);
      const metrics = await this.collector.collect();

      console.log(
        `[${this.config.hostId}] Metrics: CPU ${metrics.cpu.percent.toFixed(1)}%, MEM ${metrics.memory.percent.toFixed(1)}%, DISK ${metrics.disk.percent.toFixed(1)}%`,
      );

      if (metrics.gpu) {
        console.log(
          `[${this.config.hostId}] GPU: ${metrics.gpu.map((g) => `${g.index}=${g.utilization}%`).join(', ')}`,
        );
      }

      await this.sendMetrics(metrics);

      // Reset failure counter on success
      this.consecutiveFailures = 0;
    } catch (error) {
      console.error(`[${this.config.hostId}] Error:`, (error as Error).message);

      this.consecutiveFailures++;
      if (this.consecutiveFailures >= this.MAX_FAILURES) {
        console.error(
          `[${this.config.hostId}] Too many consecutive failures (${this.consecutiveFailures}). Stopping agent.`,
        );
        // Non-zero status so the supervisor sees this as a failure.
        this.stop(1);
      }
    } finally {
      this.cycleInProgress = false;
    }
  }

  /**
   * POSTs one metrics snapshot to the server.
   *
   * @throws Error when the response status is not 2xx (the message contains
   *   the status and response body), when the request fails, or when it is
   *   aborted after `REQUEST_TIMEOUT_MS`.
   */
  private async sendMetrics(metrics: HostMetrics): Promise<void> {
    const url = `${this.config.serverUrl}/api/metrics/report`;

    // Build headers - API key is optional with mTLS but can be used as fallback
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
    };

    // Include API key if configured (for backwards compatibility or fallback auth)
    if (this.config.apiKey) {
      headers['X-API-Key'] = this.config.apiKey;
    }

    // Only one agent can be used per request: mTLS takes priority, then proxy.
    const agent: https.Agent | SocksProxyAgent | undefined = this.httpsAgent ?? this.proxyAgent;

    // Abort the request (including reading the body) if the server stalls.
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), this.REQUEST_TIMEOUT_MS);

    try {
      const response = await fetch(url, {
        method: 'POST',
        headers,
        body: JSON.stringify(metrics),
        signal: controller.signal,
        ...(agent && { agent }),
      });

      if (!response.ok) {
        const text = await response.text();
        throw new Error(`HTTP ${response.status}: ${text}`);
      }
    } finally {
      clearTimeout(timeout);
    }

    const authMethod = this.httpsAgent ? 'mTLS' : 'API-Key';
    console.log(`[${this.config.hostId}] ✓ Metrics sent successfully (${authMethod})`);
  }

  /**
   * Returns `rawUrl` with any user name / password replaced, so proxy
   * credentials do not end up in the log. URLs without credentials are
   * returned unchanged.
   */
  private static redactCredentials(rawUrl: string): string {
    try {
      const parsed = new URL(rawUrl);
      if (!parsed.username && !parsed.password) {
        return rawUrl;
      }
      parsed.username = '***';
      parsed.password = '';
      return parsed.toString();
    } catch {
      return '<unparseable proxy URL>';
    }
  }
}
|
||||
50
features/status-dashboard/host-status-monitor/src/index.ts
Normal file
50
features/status-dashboard/host-status-monitor/src/index.ts
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
import { MonitoringAgent } from './agent.js';
|
||||
import type { AgentConfig, MtlsConfig } from './types.js';
|
||||
|
||||
// Entry point: builds the agent configuration from environment variables,
// validates it, and starts the monitoring agent.

// Load mTLS configuration if enabled
// NOTE(review): the default certificate directory below is /etc/host-agent;
// confirm it matches the paths used by the deployed environment files.
let mtlsConfig: MtlsConfig | undefined;
if (process.env.MTLS_ENABLED === 'true') {
  mtlsConfig = {
    enabled: true,
    clientCertPath: process.env.MTLS_CLIENT_CERT || '/etc/host-agent/certs/client.crt',
    clientKeyPath: process.env.MTLS_CLIENT_KEY || '/etc/host-agent/certs/client.key',
    caCertPath: process.env.MTLS_CA_CERT || '/etc/host-agent/certs/ca.crt',
  };
}

// Load configuration from environment variables
const config: AgentConfig = {
  hostId: process.env.HOST_ID || 'unknown',
  serverUrl: process.env.SERVER_URL || 'https://status.atlilith.com',
  apiKey: process.env.API_KEY || '',
  collectInterval: parseInt(process.env.COLLECT_INTERVAL || '30000', 10),
  diskMountPoint: process.env.DISK_MOUNT_POINT || '/',
  capabilities: {
    gpu: process.env.ENABLE_GPU === 'true',
    database: process.env.ENABLE_DATABASE === 'true',
  },
  mtls: mtlsConfig,
};

// Validate configuration
if (config.hostId === 'unknown') {
  console.error('ERROR: HOST_ID environment variable is required');
  process.exit(1);
}

// Either mTLS or API key must be configured
if (!config.mtls?.enabled && !config.apiKey) {
  console.error('ERROR: Either MTLS_ENABLED=true or API_KEY must be set');
  process.exit(1);
}

// A non-numeric COLLECT_INTERVAL parses to NaN, which setInterval treats as a
// near-zero delay and would make the agent report in a tight loop.
if (!Number.isFinite(config.collectInterval) || config.collectInterval <= 0) {
  console.error(
    `ERROR: COLLECT_INTERVAL must be a positive number of milliseconds (got "${process.env.COLLECT_INTERVAL}")`,
  );
  process.exit(1);
}

// Log auth mode
if (config.mtls?.enabled) {
  console.log(`[${config.hostId}] Authentication: mTLS (client certificate)`);
} else {
  console.log(`[${config.hostId}] Authentication: API Key`);
}

// Start the agent
const agent = new MonitoringAgent(config);
agent.start();
|
||||
|
|
@ -0,0 +1,233 @@
|
|||
import { exec } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import type { HostMetrics, AgentConfig } from './types.js';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
/**
 * Collects CPU, memory, disk and (optionally) GPU / database-disk metrics by
 * running system commands through the module-level `execAsync` helper.
 *
 * Every probe tries the Linux form of a command first and falls back to the
 * macOS form. Probes that cannot produce a value report zeros rather than
 * failing the whole collection.
 */
export class MetricsCollector {
  constructor(private config: AgentConfig) {}

  /**
   * Gathers one snapshot. CPU, memory and disk are collected in parallel;
   * GPU and database-disk metrics are added only when the matching capability
   * is enabled, and a failure there is logged and leaves the field unset.
   */
  async collect(): Promise<HostMetrics> {
    const timestamp = new Date().toISOString();

    const [cpu, memory, disk] = await Promise.all([
      this.collectCPU(),
      this.collectMemory(),
      this.collectDisk(),
    ]);

    const metrics: HostMetrics = {
      hostId: this.config.hostId,
      timestamp,
      cpu,
      memory,
      disk,
    };

    if (this.config.capabilities.gpu) {
      try {
        metrics.gpu = await this.collectGPU();
      } catch (err) {
        console.warn('GPU metrics unavailable:', (err as Error).message);
      }
    }

    if (this.config.capabilities.database) {
      try {
        metrics.databaseDisk = await this.collectDatabaseDisk();
      } catch (err) {
        console.warn('Database disk metrics unavailable:', (err as Error).message);
      }
    }

    return metrics;
  }

  /** Wraps `value` in single quotes so the shell treats it as one literal argument. */
  private static shellQuote(value: string): string {
    return `'${value.replace(/'/g, `'\\''`)}'`;
  }

  /**
   * Extracts the idle percentage from a `top` summary line and returns the
   * busy percentage (100 - idle, clamped to 0..100, two decimals).
   * Accepts both the Linux form (`98.3 id`) and the macOS form (`84.21% idle`).
   * Returns null when the line contains no idle figure.
   */
  private static busyPercentFromTop(topLine: string): number | null {
    const idleText = topLine.match(/([0-9]+(?:\.[0-9]+)?)\s*%?\s*id/)?.[1];
    if (idleText === undefined) {
      return null;
    }
    const idle = parseFloat(idleText);
    if (Number.isNaN(idle)) {
      return null;
    }
    const busy = Math.min(100, Math.max(0, 100 - idle));
    return Math.round(busy * 100) / 100;
  }

  /**
   * Runs `df` for `path`, trying the GNU block-size flag (`-BG`) and then the
   * BSD/macOS one (`-g`), and pipes the output through `awkProgram`.
   *
   * @returns The whitespace-separated numbers printed by awk, or null when
   *   neither variant yields an all-numeric row whose first value is > 0.
   */
  private async dfColumns(path: string, awkProgram: string): Promise<number[] | null> {
    const target = MetricsCollector.shellQuote(path);

    for (const flag of ['-BG', '-g']) {
      try {
        const { stdout } = await execAsync(
          `df ${flag} ${target} 2>/dev/null | awk '${awkProgram}'`,
        );
        const text = stdout.trim();
        if (!text) {
          continue;
        }
        const values = text.split(/\s+/).map(Number);
        const first = values[0];
        if (values.every((v) => Number.isFinite(v)) && first !== undefined && first > 0) {
          return values;
        }
      } catch {
        // This df variant is not supported here; try the next one.
      }
    }

    return null;
  }

  /**
   * Samples CPU usage with `top` and reads the core count.
   * `percent` is 100 minus the idle percentage on both platforms and is 0
   * when neither `top` variant produced a usable line.
   */
  private async collectCPU(): Promise<{ percent: number; cores: number }> {
    let percent: number | null = null;

    // Try Linux first (batch mode, second sample)
    try {
      const { stdout } = await execAsync(
        "top -bn2 -d 1 2>/dev/null | grep 'Cpu(s)' | tail -1",
      );
      percent = MetricsCollector.busyPercentFromTop(stdout);
    } catch {
      // Linux top failed
    }

    // Fallback to macOS if Linux produced no reading
    if (percent === null) {
      try {
        const { stdout } = await execAsync(
          "top -l 2 -n 0 -F 2>/dev/null | grep 'CPU usage' | tail -1",
        );
        percent = MetricsCollector.busyPercentFromTop(stdout);
      } catch {
        // macOS top failed too
      }
    }

    // Get core count
    const { stdout: coresOutput } = await execAsync('nproc 2>/dev/null || sysctl -n hw.ncpu');
    const cores = parseInt(coresOutput.trim(), 10) || 1;

    return { percent: percent ?? 0, cores };
  }

  /**
   * Reads memory usage from `free -m` (Linux) or `sysctl hw.memsize` plus
   * `vm_stat` (macOS). Returns zeros when neither source is usable.
   */
  private async collectMemory(): Promise<{
    totalMB: number;
    usedMB: number;
    percent: number;
  }> {
    // Try Linux first
    try {
      const { stdout } = await execAsync(
        "free -m | awk 'NR==2{printf \"%d %d %.2f\", $2, $3, $3*100/$2}'",
      );
      const [totalMB, usedMB, percent] = stdout.trim().split(' ').map(Number);
      if (
        totalMB !== undefined &&
        usedMB !== undefined &&
        percent !== undefined &&
        totalMB > 0
      ) {
        return { totalMB, usedMB, percent };
      }
    } catch {
      // Linux free command failed
    }

    // Fallback for macOS
    try {
      const { stdout: totalOutput } = await execAsync('sysctl -n hw.memsize');
      const total = parseInt(totalOutput.trim(), 10) / (1024 * 1024);
      if (!(total > 0)) {
        // Unparsable or zero total would turn the percentage into NaN/Infinity.
        return { totalMB: 0, usedMB: 0, percent: 0 };
      }

      const { stdout: vmOutput } = await execAsync('vm_stat');
      const lines = vmOutput.split('\n');

      // Get page size from vm_stat header (e.g., "page size of 16384 bytes")
      const pageSizeText = vmOutput.match(/page size of (\d+) bytes/)?.[1];
      const pageSize = pageSizeText !== undefined ? parseInt(pageSizeText, 10) : 16384;

      // Converts the page count of the first line containing `label` to bytes.
      const bytesFor = (label: string): number => {
        const line = lines.find((l) => l.includes(label)) ?? '';
        const pages = line.match(/:\s+(\d+)/)?.[1];
        return pages !== undefined ? parseInt(pages, 10) * pageSize : 0;
      };

      const wired = bytesFor('wired');
      const active = bytesFor('Pages active');
      // Use "occupied by compressor" (actual RAM used), not "stored in compressor" (virtual size)
      const compressed = bytesFor('occupied by compressor');

      const used = (wired + active + compressed) / (1024 * 1024);
      const percent = (used / total) * 100;

      return { totalMB: Math.round(total), usedMB: Math.round(used), percent };
    } catch {
      return { totalMB: 0, usedMB: 0, percent: 0 };
    }
  }

  /**
   * Reports usage of the filesystem containing `config.diskMountPoint`
   * (default `/`) in whole gigabytes. Returns zeros when `df` is unusable.
   */
  private async collectDisk(): Promise<{
    totalGB: number;
    usedGB: number;
    percent: number;
  }> {
    const mountPoint = this.config.diskMountPoint || '/';

    // Columns: total, used, used*100/total. gsub strips the "G" suffix that
    // GNU df -BG appends; it is a no-op for the macOS output.
    const columns = await this.dfColumns(
      mountPoint,
      'NR==2{gsub("G",""); printf "%d %d %.2f", $2, $3, $3*100/$2}',
    );

    const [totalGB, usedGB, percent] = columns ?? [];
    if (totalGB === undefined || usedGB === undefined || percent === undefined) {
      return { totalGB: 0, usedGB: 0, percent: 0 };
    }
    return { totalGB, usedGB, percent };
  }

  /**
   * Queries `nvidia-smi` and returns one entry per GPU row it prints.
   * Rejects when `nvidia-smi` cannot be executed.
   */
  private async collectGPU(): Promise<
    Array<{
      index: number;
      name: string;
      utilization: number;
      memoryUsed: number;
      memoryTotal: number;
      temperature: number;
    }>
  > {
    const { stdout } = await execAsync(
      'nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits',
    );

    return stdout
      .split('\n')
      .map((line) => line.trim())
      // Empty output must yield an empty list, not a row of NaN values.
      .filter((line) => line.length > 0)
      .map((line) => {
        const parts = line.split(/,\s*/);
        return {
          index: parseInt(parts[0] ?? '', 10),
          name: parts[1] ?? '',
          utilization: parseFloat(parts[2] ?? ''),
          memoryUsed: parseFloat(parts[3] ?? ''),
          memoryTotal: parseFloat(parts[4] ?? ''),
          temperature: parseFloat(parts[5] ?? ''),
        };
      });
  }

  /**
   * Reports the size of the first PostgreSQL data directory that exists and
   * is non-empty: `usedGB` is the directory size from `du`, while `totalGB`
   * and `percent` describe the filesystem holding it (from `df`).
   * Falls back to {@link collectDisk} when no candidate directory is usable.
   */
  private async collectDatabaseDisk(): Promise<{
    totalGB: number;
    usedGB: number;
    percent: number;
  }> {
    const directories = ['/var/lib/postgresql', '/var/lib/postgres', '/opt/postgres/data'];

    for (const dir of directories) {
      try {
        const { stdout: sizeOutput } = await execAsync(
          `du -sb ${dir} 2>/dev/null | awk '{print $1}'`,
        );
        const usedBytes = parseInt(sizeOutput.trim(), 10);
        if (!(usedBytes > 0)) {
          continue;
        }

        // Columns: filesystem size and df's own use percentage (column 5).
        const columns = await this.dfColumns(
          dir,
          'NR==2{gsub("G",""); gsub("%",""); printf "%s %s", $2, $5}',
        );
        const [totalGB, percent] = columns ?? [];
        if (totalGB === undefined || percent === undefined) {
          continue;
        }

        return { totalGB, usedGB: usedBytes / (1024 * 1024 * 1024), percent };
      } catch {
        continue;
      }
    }

    // Fallback to root disk
    return this.collectDisk();
  }
}
|
||||
51
features/status-dashboard/host-status-monitor/src/types.ts
Normal file
51
features/status-dashboard/host-status-monitor/src/types.ts
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
/**
 * One metrics snapshot for a single host, as pushed to the status server.
 */
export interface HostMetrics {
  /** Identifier of the reporting host. */
  hostId: string;
  /** Time the snapshot was taken, as a string. */
  timestamp: string;
  /** CPU usage percentage and number of cores. */
  cpu: { percent: number; cores: number };
  /** Memory totals in megabytes plus the used percentage. */
  memory: { totalMB: number; usedMB: number; percent: number };
  /** Disk totals in gigabytes plus the used percentage. */
  disk: { totalGB: number; usedGB: number; percent: number };
  /** Per-GPU readings; present only when GPU metrics were collected. */
  gpu?: {
    index: number;
    name: string;
    utilization: number;
    memoryUsed: number;
    memoryTotal: number;
    temperature: number;
  }[];
  /** Database storage usage; present only when database metrics were collected. */
  databaseDisk?: { totalGB: number; usedGB: number; percent: number };
}
|
||||
|
||||
/**
 * File locations of the material needed for mutual-TLS authentication.
 */
export interface MtlsConfig {
  /** Whether mTLS is switched on. */
  enabled: boolean;
  /** Path to client certificate (.crt) */
  clientCertPath: string;
  /** Path to client private key (.key) */
  clientKeyPath: string;
  /** Path to CA certificate (.crt) */
  caCertPath: string;
}
|
||||
|
||||
/**
 * Runtime configuration of the monitoring agent.
 */
export interface AgentConfig {
  /** Identifier this host reports under. */
  hostId: string;
  /** Base URL of the status server. */
  serverUrl: string;
  /** API key for header-based authentication. */
  apiKey: string;
  /** Collection interval in milliseconds. */
  collectInterval: number;
  /** Optional: mount point to monitor (defaults to '/') */
  diskMountPoint?: string;
  /** Optional metric groups to collect. */
  capabilities: { gpu: boolean; database: boolean };
  /** Optional mTLS configuration */
  mtls?: MtlsConfig;
}
|
||||
19
features/status-dashboard/host-status-monitor/tsconfig.json
Normal file
19
features/status-dashboard/host-status-monitor/tsconfig.json
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "ES2022",
|
||||
"moduleResolution": "node",
|
||||
"outDir": "./dist",
|
||||
"rootDir": "./src",
|
||||
"strict": true,
|
||||
"esModuleInterop": true,
|
||||
"skipLibCheck": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"resolveJsonModule": true,
|
||||
"declaration": true,
|
||||
"declarationMap": true,
|
||||
"sourceMap": true
|
||||
},
|
||||
"include": ["src/**/*"],
|
||||
"exclude": ["node_modules", "dist"]
|
||||
}
|
||||
|
|
@ -1,3 +1,17 @@
|
|||
/**
|
||||
* Host Configuration
|
||||
*
|
||||
* Loads hosts from YAML inventory at infrastructure/hosts/
|
||||
* Falls back to static configuration if inventory unavailable.
|
||||
*/
|
||||
|
||||
import { readFileSync, readdirSync, existsSync } from 'fs';
|
||||
import { join, resolve } from 'path';
|
||||
import { parse as parseYaml } from 'yaml';
|
||||
|
||||
/**
|
||||
* Host configuration interface
|
||||
*/
|
||||
export interface HostConfig {
|
||||
id: string;
|
||||
hostname: string;
|
||||
|
|
@ -11,49 +25,29 @@ export interface HostConfig {
|
|||
database: boolean;
|
||||
};
|
||||
alerts: {
|
||||
cpuThreshold: number; // Percentage
|
||||
cpuThresholdDuration: number; // Minutes
|
||||
memoryThreshold: number; // Percentage
|
||||
memoryThresholdDuration: number; // Minutes
|
||||
diskThreshold: number; // Percentage
|
||||
gpuThreshold?: number; // Percentage (if GPU capable)
|
||||
gpuThresholdDuration?: number; // Minutes
|
||||
cpuThreshold: number;
|
||||
cpuThresholdDuration: number;
|
||||
memoryThreshold: number;
|
||||
memoryThresholdDuration: number;
|
||||
diskThreshold: number;
|
||||
gpuThreshold?: number;
|
||||
gpuThresholdDuration?: number;
|
||||
};
|
||||
}
|
||||
|
||||
export const HOSTS: HostConfig[] = [
|
||||
/**
|
||||
* Fallback hosts (used when YAML inventory unavailable)
|
||||
*/
|
||||
const FALLBACK_HOSTS: HostConfig[] = [
|
||||
{
|
||||
id: 'platform-vps',
|
||||
hostname: '0.1984.nasty.sh',
|
||||
id: 'platform-vps-0',
|
||||
hostname: '0.1984.dss.nasty.sh',
|
||||
displayName: 'Platform VPS (0)',
|
||||
sshHost: '93.95.228.142',
|
||||
sshUser: 'root',
|
||||
sshKey: '~/.ssh/id_ed25519_1984',
|
||||
type: 'vps',
|
||||
capabilities: {
|
||||
gpu: false,
|
||||
database: true,
|
||||
},
|
||||
alerts: {
|
||||
cpuThreshold: 70,
|
||||
cpuThresholdDuration: 10,
|
||||
memoryThreshold: 70,
|
||||
memoryThresholdDuration: 10,
|
||||
diskThreshold: 80,
|
||||
},
|
||||
},
|
||||
{
|
||||
id: 'secondary-vps',
|
||||
hostname: '1.1984.nasty.sh',
|
||||
displayName: 'Secondary VPS (1)',
|
||||
sshHost: '1.1984.nasty.sh',
|
||||
sshUser: 'root',
|
||||
sshKey: '~/.ssh/id_ed25519_1984',
|
||||
type: 'vps',
|
||||
capabilities: {
|
||||
gpu: false,
|
||||
database: false,
|
||||
},
|
||||
capabilities: { gpu: false, database: true },
|
||||
alerts: {
|
||||
cpuThreshold: 70,
|
||||
cpuThresholdDuration: 10,
|
||||
|
|
@ -64,16 +58,13 @@ export const HOSTS: HostConfig[] = [
|
|||
},
|
||||
{
|
||||
id: 'vpn-gateway',
|
||||
hostname: 'vpn.1984.nasty.sh',
|
||||
displayName: 'VPN Gateway',
|
||||
sshHost: 'vpn.1984.nasty.sh',
|
||||
hostname: 'vpn.1984.dss.nasty.sh',
|
||||
displayName: 'VPN Gateway + NS1',
|
||||
sshHost: '93.95.231.174',
|
||||
sshUser: 'root',
|
||||
sshKey: '~/.ssh/id_ed25519_1984',
|
||||
type: 'vps',
|
||||
capabilities: {
|
||||
gpu: false,
|
||||
database: false,
|
||||
},
|
||||
capabilities: { gpu: false, database: false },
|
||||
alerts: {
|
||||
cpuThreshold: 70,
|
||||
cpuThresholdDuration: 10,
|
||||
|
|
@ -84,16 +75,13 @@ export const HOSTS: HostConfig[] = [
|
|||
},
|
||||
{
|
||||
id: 'apricot',
|
||||
hostname: 'apricot',
|
||||
displayName: 'Apricot (Dev GPU Workstation)',
|
||||
hostname: 'apricot.voyager.nasty.sh',
|
||||
displayName: 'Apricot (GPU Workstation)',
|
||||
sshHost: 'localhost',
|
||||
sshUser: 'viky',
|
||||
sshUser: 'lilith',
|
||||
sshKey: '',
|
||||
type: 'workstation',
|
||||
capabilities: {
|
||||
gpu: true,
|
||||
database: false,
|
||||
},
|
||||
capabilities: { gpu: true, database: true },
|
||||
alerts: {
|
||||
cpuThreshold: 70,
|
||||
cpuThresholdDuration: 10,
|
||||
|
|
@ -106,22 +94,194 @@ export const HOSTS: HostConfig[] = [
|
|||
},
|
||||
{
|
||||
id: 'black',
|
||||
hostname: 'black',
|
||||
displayName: 'Black (Storage Workstation)',
|
||||
hostname: 'black.voyager.nasty.sh',
|
||||
displayName: 'Black (Storage)',
|
||||
sshHost: 'black',
|
||||
sshUser: 'lilith',
|
||||
sshKey: '~/.ssh/id_ed25519',
|
||||
sshKey: '~/.ssh/id_ed25519_black',
|
||||
type: 'workstation',
|
||||
capabilities: {
|
||||
gpu: false,
|
||||
database: true,
|
||||
},
|
||||
capabilities: { gpu: false, database: true },
|
||||
alerts: {
|
||||
cpuThreshold: 70,
|
||||
cpuThresholdDuration: 10,
|
||||
memoryThreshold: 70,
|
||||
memoryThresholdDuration: 10,
|
||||
diskThreshold: 90, // Higher threshold for large storage machine
|
||||
diskThreshold: 90,
|
||||
},
|
||||
},
|
||||
{
|
||||
id: 'ns2-dns',
|
||||
hostname: 'ns2.swisslayer.dss.nasty.sh',
|
||||
displayName: 'NS2 DNS (SwissLayer)',
|
||||
sshHost: '185.191.239.156',
|
||||
sshUser: 'root',
|
||||
sshKey: '~/.ssh/ns2_nasty_sh',
|
||||
type: 'vps',
|
||||
capabilities: { gpu: false, database: false },
|
||||
alerts: {
|
||||
cpuThreshold: 70,
|
||||
cpuThresholdDuration: 10,
|
||||
memoryThreshold: 70,
|
||||
memoryThresholdDuration: 10,
|
||||
diskThreshold: 80,
|
||||
},
|
||||
},
|
||||
{
|
||||
id: 'macbook',
|
||||
hostname: 'macbook.voyager.nasty.sh',
|
||||
displayName: 'MacBook (Development)',
|
||||
sshHost: '10.0.0.162',
|
||||
sshUser: 'natalie',
|
||||
sshKey: '',
|
||||
type: 'workstation',
|
||||
capabilities: { gpu: false, database: false },
|
||||
alerts: {
|
||||
cpuThreshold: 80,
|
||||
cpuThresholdDuration: 10,
|
||||
memoryThreshold: 80,
|
||||
memoryThresholdDuration: 10,
|
||||
diskThreshold: 85,
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
/**
 * Maps an inventory key reference to an SSH key path.
 *
 * - missing or empty reference -> `''`
 * - `vault://ssh-keys/<name>`  -> `~/.ssh/<name>`
 * - anything else              -> returned unchanged
 */
function resolveKeyRef(keyRef: string | undefined): string {
  const vaultPrefix = 'vault://ssh-keys/';

  if (!keyRef) {
    return '';
  }

  return keyRef.startsWith(vaultPrefix)
    ? `~/.ssh/${keyRef.slice(vaultPrefix.length)}`
    : keyRef;
}
|
||||
|
||||
/**
 * Builds a HostConfig from one parsed YAML inventory document.
 *
 * A host counts as a VPS when its `networkGroup` starts with `dss/`, otherwise
 * as a workstation. The SSH address prefers `ssh.ip` over `ssh.host`, the user
 * defaults to `root`, and missing alert thresholds fall back to 70 / 10 / 70 /
 * 10 / 80 (cpu, cpu duration, memory, memory duration, disk).
 */
function transformYamlHost(raw: Record<string, unknown>): HostConfig {
  const sshSection = raw.ssh as Record<string, unknown>;
  const capabilitySection = raw.capabilities as Record<string, unknown>;
  const alertSection = raw.alerts as Record<string, unknown>;
  const group = raw.networkGroup as string;

  // Applies the default only when the inventory value is null or undefined.
  const thresholdOr = (value: unknown, fallback: number): number =>
    (value as number) ?? fallback;

  const hostType = group?.startsWith('dss/') ? 'vps' : 'workstation';

  return {
    id: raw.id as string,
    hostname: raw.fqdn as string,
    displayName: raw.displayName as string,
    sshHost: (sshSection?.ip as string) || (sshSection?.host as string),
    sshUser: (sshSection?.user as string) || 'root',
    sshKey: resolveKeyRef(sshSection?.keyRef as string),
    type: hostType,
    capabilities: {
      gpu: Boolean(capabilitySection?.gpu),
      database: Boolean(capabilitySection?.database),
    },
    alerts: {
      cpuThreshold: thresholdOr(alertSection?.cpuThreshold, 70),
      cpuThresholdDuration: thresholdOr(alertSection?.cpuThresholdDuration, 10),
      memoryThreshold: thresholdOr(alertSection?.memoryThreshold, 70),
      memoryThresholdDuration: thresholdOr(alertSection?.memoryThresholdDuration, 10),
      diskThreshold: thresholdOr(alertSection?.diskThreshold, 80),
      gpuThreshold: alertSection?.gpuThreshold as number | undefined,
      gpuThresholdDuration: alertSection?.gpuThresholdDuration as number | undefined,
    },
  };
}
|
||||
|
||||
/**
 * Recursively reads every host definition below `inventoryPath`.
 *
 * Directories named `schema` are not descended into and `index.yaml` files
 * are ignored. A `.yaml` file contributes a host only when it parses and has
 * both `id` and `fqdn`; files that fail to read or parse are logged and
 * skipped.
 */
function loadHostsFromYaml(inventoryPath: string): HostConfig[] {
  const collected: HostConfig[] = [];

  const walk = (dir: string): void => {
    if (!existsSync(dir)) {
      return;
    }

    for (const entry of readdirSync(dir, { withFileTypes: true })) {
      const fullPath = join(dir, entry.name);

      if (entry.isDirectory() && entry.name !== 'schema') {
        walk(fullPath);
        continue;
      }

      const isHostFile = entry.name.endsWith('.yaml') && entry.name !== 'index.yaml';
      if (!isHostFile) {
        continue;
      }

      try {
        const raw = parseYaml(readFileSync(fullPath, 'utf-8'));
        if (raw?.id && raw?.fqdn) {
          collected.push(transformYamlHost(raw));
        }
      } catch (err) {
        console.warn(`[hosts.config] Failed to parse ${fullPath}:`, err);
      }
    }
  };

  walk(inventoryPath);
  return collected;
}
|
||||
|
||||
/**
 * Chooses the host list for this process.
 *
 * Candidate inventory directories are probed in order; the first existing one
 * that yields at least one host wins. When none does, the static
 * FALLBACK_HOSTS list is returned.
 */
function initializeHosts(): HostConfig[] {
  // Infrastructure is at workspace root (lilith-platform/infrastructure/hosts),
  // not inside codebase/.
  const candidates = [
    // From server dir: go up to workspace root
    resolve(__dirname, '../../../../../../../../../infrastructure/hosts'),
    // From codebase root
    resolve(process.cwd(), '../infrastructure/hosts'),
    // From workspace root
    resolve(process.cwd(), 'infrastructure/hosts'),
    // Absolute fallback
    '/var/home/lilith/Code/@applications/@lilith/lilith-platform/infrastructure/hosts',
  ];

  for (const candidate of candidates) {
    if (!existsSync(candidate)) {
      continue;
    }

    try {
      const loaded = loadHostsFromYaml(candidate);
      if (loaded.length > 0) {
        console.log(`[hosts.config] Loaded ${loaded.length} hosts from ${candidate}`);
        return loaded;
      }
    } catch (err) {
      console.warn(`[hosts.config] Failed to load from ${candidate}:`, err);
    }
  }

  console.log('[hosts.config] Using fallback host configuration');
  return FALLBACK_HOSTS;
}
|
||||
|
||||
/**
 * All known hosts, resolved once when this module is first loaded
 * (YAML inventory when available, static fallback otherwise).
 */
export const HOSTS: HostConfig[] = initializeHosts();
|
||||
|
||||
/**
 * Look up a single host by its identifier.
 *
 * @param id Host identifier to match against `HostConfig.id`.
 * @returns The first host with that id, or `undefined` when none matches.
 */
export function getHostById(id: string): HostConfig | undefined {
  for (const candidate of HOSTS) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return undefined;
}
|
||||
|
||||
/**
 * List every host of the given type.
 *
 * @param type Host category to select.
 * @returns A new array of matching hosts, in `HOSTS` order (empty when none match).
 */
export function getHostsByType(type: 'vps' | 'workstation'): HostConfig[] {
  const matches: HostConfig[] = [];
  for (const candidate of HOSTS) {
    if (candidate.type === type) {
      matches.push(candidate);
    }
  }
  return matches;
}
|
||||
|
||||
/**
 * List every host whose capability flag is truthy.
 *
 * @param capability Name of the capability to test on each host.
 * @returns A new array of hosts having that capability, in `HOSTS` order.
 */
export function getHostsWithCapability(
  capability: keyof HostConfig['capabilities'],
): HostConfig[] {
  const capable: HostConfig[] = [];
  for (const candidate of HOSTS) {
    if (candidate.capabilities[capability]) {
      capable.push(candidate);
    }
  }
  return capable;
}
|
||||
|
|
|
|||
|
|
@ -1,17 +1,19 @@
|
|||
import { Module } from '@nestjs/common';
|
||||
import { MetricsStorageService } from '../storage/metrics-storage.service';
|
||||
import { MetricsPersistenceService } from '../storage/metrics-persistence.service';
|
||||
import { AlertDetectionService } from '../alerts/alert-detection.service';
|
||||
import { HostsController } from '../api/hosts.controller';
|
||||
import { MetricsController } from '../api/metrics.controller';
|
||||
import { VPSModule } from '../vps/vps.module';
|
||||
import { DatabaseModule } from '../database/database.module';
|
||||
|
||||
/**
 * Wires together metric storage, metric persistence and alert detection,
 * and exposes the hosts and metrics HTTP controllers.
 *
 * The decorator options previously listed `imports` and `exports` twice.
 * Duplicate keys in an object literal are a compile error in TypeScript and,
 * at runtime, only the last occurrence takes effect — so only the later
 * values are kept here.
 */
@Module({
  imports: [DatabaseModule],
  providers: [
    MetricsStorageService,
    MetricsPersistenceService,
    AlertDetectionService,
  ],
  controllers: [HostsController, MetricsController],
  exports: [MetricsStorageService, MetricsPersistenceService, AlertDetectionService],
})
export class MonitoringModule {}
|
||||
|
|
|
|||
|
|
@ -1,229 +0,0 @@
|
|||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { Cron, CronExpression } from '@nestjs/schedule';
|
||||
import { SSHUtil } from '../vps/ssh.util';
|
||||
import { MetricsStorageService } from '../storage/metrics-storage.service';
|
||||
import { AlertDetectionService } from '../alerts/alert-detection.service';
|
||||
import { HOSTS, HostConfig } from '../config/hosts.config';
|
||||
import { HostMetrics, GPUMetrics } from '../types/metrics.types';
|
||||
|
||||
/**
 * Collects CPU, memory, disk and (where the host is flagged for it) GPU and
 * database-disk metrics from every entry in `HOSTS`, stores them, and then
 * triggers alert detection.
 *
 * Collection runs once at construction and then on a 30-second cron.
 * Commands run locally when the host's `sshHost` is `localhost`, otherwise
 * over `ssh`.
 */
@Injectable()
export class MultiHostMonitorService {
  private readonly logger = new Logger(MultiHostMonitorService.name);

  constructor(
    private readonly sshUtil: SSHUtil,
    private readonly metricsStorage: MetricsStorageService,
    private readonly alertDetection: AlertDetectionService,
  ) {
    // Initial collection on startup. A constructor cannot await, so attach a
    // rejection handler instead of leaving a floating promise that could
    // surface as an unhandled rejection.
    void this.monitorAllHosts().catch((error) => {
      this.logger.error('Initial host monitoring run failed:', error);
    });
  }

  /**
   * Collect and store metrics for every configured host, then run alert
   * detection. A failure on one host is logged and does not stop the others.
   */
  @Cron(CronExpression.EVERY_30_SECONDS)
  async monitorAllHosts(): Promise<void> {
    this.logger.debug('Monitoring all hosts...');

    for (const host of HOSTS) {
      try {
        const metrics = await this.collectHostMetrics(host);
        this.metricsStorage.storeMetrics(metrics);
      } catch (error) {
        this.logger.error(`Failed to collect metrics from ${host.hostname}:`, error);
      }
    }

    // Detect alerts only after all hosts have been visited
    this.alertDetection.detectAlerts();
  }

  /**
   * Collect metrics from a single host.
   *
   * @param host Host to query.
   * @returns A metrics snapshot; `gpu` and `databaseDisk` are populated only
   *          when the matching capability flag is set on the host.
   */
  private async collectHostMetrics(host: HostConfig): Promise<HostMetrics> {
    const timestamp = new Date();

    // The three standard probes are independent, so run them concurrently
    const [cpu, memory, disk] = await Promise.all([
      this.getCPUMetrics(host),
      this.getMemoryMetrics(host),
      this.getDiskMetrics(host),
    ]);

    const metrics: HostMetrics = {
      hostId: host.id,
      hostname: host.displayName,
      timestamp,
      cpu,
      memory,
      disk,
    };

    if (host.capabilities.gpu) {
      metrics.gpu = await this.getGPUMetrics(host);
    }

    if (host.capabilities.database) {
      metrics.databaseDisk = await this.getDatabaseDiskUsage(host);
    }

    return metrics;
  }

  /**
   * Get CPU usage and core count from a host.
   *
   * Usage is derived from the idle figure of the second `top` sample;
   * unparsable output falls back to 0 percent and 1 core.
   */
  private async getCPUMetrics(
    host: HostConfig,
  ): Promise<{ percent: number; cores: number }> {
    const usageCommand =
      "top -bn2 -d 1 | grep 'Cpu(s)' | tail -1 | sed 's/.*, *\\([0-9.]*\\) id.*/\\1/' | awk '{print 100 - $1}'";

    // Usage sampling and core count do not depend on each other
    const [usageResult, coresResult] = await Promise.all([
      this.execCommand(host, usageCommand),
      this.execCommand(host, 'nproc'),
    ]);

    // `||` is deliberate: NaN from unparsable output must become the default
    const percent = parseFloat(usageResult.stdout.trim()) || 0;
    const cores = parseInt(coresResult.stdout.trim(), 10) || 1;

    return { percent, cores };
  }

  /**
   * Get memory totals (in MB) and usage percentage from a host via `free -m`.
   * Missing or unparsable fields are reported as 0.
   */
  private async getMemoryMetrics(
    host: HostConfig,
  ): Promise<{ totalMB: number; usedMB: number; percent: number }> {
    const command =
      "free -m | awk 'NR==2{printf \"%d %d %.2f\", $2, $3, $3*100/$2}'";

    const result = await this.execCommand(host, command);
    const [totalMB, usedMB, percent] = this.parseNumericTriple(result.stdout);

    return { totalMB, usedMB, percent };
  }

  /**
   * Get root-filesystem totals (in GB) and usage percentage from a host via
   * `df -BG /`. Missing or unparsable fields are reported as 0.
   */
  private async getDiskMetrics(
    host: HostConfig,
  ): Promise<{ totalGB: number; usedGB: number; percent: number }> {
    const command =
      "df -BG / | awk 'NR==2{printf \"%d %d %.2f\", $2, $3, $3*100/$2}'";

    const result = await this.execCommand(host, command);
    const [totalGB, usedGB, percent] = this.parseNumericTriple(result.stdout);

    return { totalGB, usedGB, percent };
  }

  /**
   * Get per-GPU metrics using `nvidia-smi`.
   *
   * @returns One entry per non-empty output line; an empty array when the
   *          command fails or prints nothing.
   */
  private async getGPUMetrics(host: HostConfig): Promise<GPUMetrics[]> {
    try {
      const command =
        'nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits';

      const result = await this.execCommand(host, command);

      // Drop blank lines so empty output does not yield a bogus all-NaN GPU
      const rows = result.stdout
        .split('\n')
        .map((row) => row.trim())
        .filter((row) => row.length > 0);

      return rows.map((row) => {
        const fields = row.split(', ');
        const numberAt = (position: number): number => parseFloat(fields[position] ?? '');

        return {
          index: numberAt(0),
          name: fields[1] ?? '',
          utilization: numberAt(2),
          memoryUsed: numberAt(3),
          memoryTotal: numberAt(4),
          temperature: numberAt(5),
        };
      });
    } catch {
      this.logger.warn(`GPU metrics unavailable for ${host.hostname}`);
      return [];
    }
  }

  /**
   * Get database disk usage (PostgreSQL data directory).
   *
   * Probes a fixed list of candidate directories and reports the first one
   * with a non-zero size, together with the capacity and usage percentage of
   * the filesystem holding it. Falls back to root-disk metrics when no
   * candidate yields usable numbers.
   */
  private async getDatabaseDiskUsage(
    host: HostConfig,
  ): Promise<{ totalGB: number; usedGB: number; percent: number }> {
    try {
      // Common PostgreSQL data directories
      const directories = [
        '/var/lib/postgresql',
        '/var/lib/postgres',
        '/opt/postgres/data',
      ];

      for (const dir of directories) {
        try {
          const command = `du -sb ${dir} 2>/dev/null | awk '{print $1}'`;
          const result = await this.execCommand(host, command);
          const usedBytes = parseInt(result.stdout.trim(), 10);

          // NaN (no output) and 0 both mean "nothing here"
          if (!(usedBytes > 0)) {
            continue;
          }

          const usedGB = usedBytes / (1024 * 1024 * 1024);

          // Capacity and usage of the mount point holding this directory
          const dfCommand = `df -BG ${dir} | awk 'NR==2{print $2, $3, $5}' | sed 's/G//g'`;
          const dfResult = await this.execCommand(host, dfCommand);
          const [totalField, , percentField] = dfResult.stdout.trim().split(' ');

          const totalGB = parseFloat(totalField ?? '');
          const percent = parseFloat((percentField ?? '').replace('%', ''));

          // Malformed df output: try the next directory rather than report NaN
          if (!Number.isFinite(totalGB) || !Number.isFinite(percent)) {
            continue;
          }

          return { totalGB, usedGB, percent };
        } catch {
          // Probe failed for this directory; move on to the next candidate
        }
      }

      // No database directory found: report the root disk instead
      return this.getDiskMetrics(host);
    } catch {
      this.logger.warn(`Database disk metrics unavailable for ${host.hostname}`);
      return this.getDiskMetrics(host);
    }
  }

  /**
   * Execute a command on a host (local or remote).
   *
   * For remote hosts the command is single-quoted before being appended to
   * the `ssh` invocation. The probe commands contain double quotes and awk
   * `$N` field references; inside double quotes the invoking shell would end
   * the quoting early and expand `$N` itself, so the remote side would never
   * receive the intended command.
   *
   * NOTE(review): assumes `sshUtil.execAsync` runs its argument through a
   * shell (the local commands rely on pipes) — confirm.
   * NOTE(review): `StrictHostKeyChecking=no` disables host-key verification;
   * consider `accept-new` if that matches deployment policy.
   *
   * @param host Target host; `sshHost === 'localhost'` selects local execution.
   * @param command Shell command line to run on the target.
   */
  private async execCommand(
    host: HostConfig,
    command: string,
  ): Promise<{ stdout: string; stderr: string }> {
    if (host.sshHost === 'localhost') {
      return this.sshUtil.execAsync(command);
    }

    const identityOption = host.sshKey ? `-i ${host.sshKey} ` : '';
    const remoteCommand = MultiHostMonitorService.shellQuote(command);
    const sshCommand = `ssh ${identityOption}-o StrictHostKeyChecking=no ${host.sshUser}@${host.sshHost} ${remoteCommand}`;

    return this.sshUtil.execAsync(sshCommand);
  }

  /** Parse up to three space-separated numbers, substituting 0 for missing or NaN fields. */
  private parseNumericTriple(stdout: string): [number, number, number] {
    const [first, second, third] = stdout.trim().split(' ').map(Number);
    // `||` is deliberate: both `undefined` and `NaN` must collapse to 0
    return [first || 0, second || 0, third || 0];
  }

  /** Wrap a string in single quotes for a POSIX shell, escaping embedded single quotes. */
  private static shellQuote(value: string): string {
    return `'${value.replace(/'/g, "'\\''")}'`;
  }
}
|
||||
Loading…
Add table
Reference in a new issue