feat(host-status-monitor): add cross-platform health check infrastructure

Add automatic service health monitoring with restart capability:

- Cross-platform health check script (Linux systemd + macOS launchd)
- Detects hung services by checking for recent success vs error logs
- Auto-restarts service after 3+ logged errors with no successes in the recent log window
- Runs every 2 minutes via systemd timer or launchd StartInterval

Deployment updates:
- deploy.sh now installs health check on all platforms
- Removed VPN proxy from plum.env (no WireGuard on macOS)

Files added:
- host-status-monitor-healthcheck (cross-platform bash script)
- host-status-monitor-healthcheck.service (systemd oneshot)
- host-status-monitor-healthcheck.timer (2-minute interval)
- com.lilith.host-status-monitor-healthcheck.plist (macOS launchd)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Quinn Ftw 2025-12-26 01:09:15 -08:00
parent 567d703cf6
commit ff6f4528ce
6 changed files with 183 additions and 3 deletions

View file

@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- launchd job definition that runs the host-status-monitor health check script on macOS. -->
<plist version="1.0">
<dict>
<!-- Job label; the same name is used for the plist file installed under /Library/LaunchDaemons. -->
<key>Label</key>
<string>com.lilith.host-status-monitor-healthcheck</string>
<!-- Run the script through /bin/bash explicitly rather than relying on its shebang. -->
<key>ProgramArguments</key>
<array>
<string>/bin/bash</string>
<string>/opt/host-status-monitor/healthcheck</string>
</array>
<!-- Run every 120 seconds (2 minutes). -->
<key>StartInterval</key>
<integer>120</integer>
<!-- Also run once as soon as the job is loaded. -->
<key>RunAtLoad</key>
<true/>
<!-- stdout and stderr are both written to the same log file. -->
<key>StandardOutPath</key>
<string>/var/log/host-status-monitor-healthcheck.log</string>
<key>StandardErrorPath</key>
<string>/var/log/host-status-monitor-healthcheck.log</string>
</dict>
</plist>

View file

@ -231,13 +231,55 @@ WRAPPER
run_remote "$host" "sudo systemctl daemon-reload && sudo systemctl enable host-status-monitor && sudo systemctl restart host-status-monitor"
fi
echo "7. Checking status..."
# Step 7: install the health check script and its scheduler on the target host.
# Each branch copies the script to $INSTALL_DIR/healthcheck, marks it executable,
# and registers either a launchd job (macOS) or a systemd timer (Linux).
echo "7. Installing health check..."
if is_macos_host "$host"; then
# macOS: use launchd
# Files are staged in /tmp on the remote host, then moved into place.
scp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:/tmp/healthcheck"
scp "$SCRIPT_DIR/com.lilith.host-status-monitor-healthcheck.plist" "$target:/tmp/"
run_remote "$host" "mv /tmp/healthcheck $INSTALL_DIR/healthcheck && chmod +x $INSTALL_DIR/healthcheck"
# Unload any previously loaded copy of the job; failures (e.g. nothing loaded yet) are ignored.
run_remote "$host" "launchctl unload /Library/LaunchDaemons/com.lilith.host-status-monitor-healthcheck.plist 2>/dev/null || true"
# NOTE(review): the mv into /Library/LaunchDaemons and the launchctl load/unload run
# without sudo, while the status check in step 8 uses `sudo launchctl` — confirm that
# run_remote has sufficient privileges on macOS hosts.
run_remote "$host" "mv /tmp/com.lilith.host-status-monitor-healthcheck.plist /Library/LaunchDaemons/"
run_remote "$host" "launchctl load /Library/LaunchDaemons/com.lilith.host-status-monitor-healthcheck.plist"
elif [ "$host" = "apricot" ]; then
# Local Linux host
# No scp/run_remote here: files are copied directly with sudo on this machine.
sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$INSTALL_DIR/healthcheck"
sudo chmod +x "$INSTALL_DIR/healthcheck"
sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck.service" /etc/systemd/system/
sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" /etc/systemd/system/
sudo systemctl daemon-reload
# Enable the timer at boot and start it immediately.
sudo systemctl enable --now host-status-monitor-healthcheck.timer
elif uses_ssh_key "$host"; then
# Remote Linux with SSH key
# Files are copied straight into their final locations, so the SSH user presumably
# has write access to $INSTALL_DIR and /etc/systemd/system — confirm.
# $SSH_OPTS is left unquoted, presumably so it splits into separate options — confirm.
scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:$INSTALL_DIR/healthcheck"
run_remote "$host" "chmod +x $INSTALL_DIR/healthcheck"
scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor-healthcheck.service" "$target:/etc/systemd/system/"
scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" "$target:/etc/systemd/system/"
run_remote "$host" "systemctl daemon-reload && systemctl enable --now host-status-monitor-healthcheck.timer"
elif needs_sudo "$host"; then
# Remote Linux with sudo
# Files are staged in /tmp, then moved into place on the remote side.
# NOTE(review): none of the remote commands in this branch contain `sudo`; whether
# run_remote adds it for these hosts is not visible here — confirm.
scp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:/tmp/healthcheck"
scp "$SCRIPT_DIR/host-status-monitor-healthcheck.service" "$target:/tmp/host-status-monitor-healthcheck.service"
scp "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" "$target:/tmp/host-status-monitor-healthcheck.timer"
run_remote "$host" "mv /tmp/healthcheck $INSTALL_DIR/healthcheck && chmod +x $INSTALL_DIR/healthcheck"
run_remote "$host" "mv /tmp/host-status-monitor-healthcheck.service /tmp/host-status-monitor-healthcheck.timer /etc/systemd/system/"
run_remote "$host" "systemctl daemon-reload && systemctl enable --now host-status-monitor-healthcheck.timer"
else
# Remote Linux without sudo
# NOTE(review): despite the label, the final command in this branch does use `sudo`,
# while the scp calls write directly to /etc/systemd/system — confirm the intended
# privilege model for these hosts.
scp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:$INSTALL_DIR/healthcheck"
run_remote "$host" "chmod +x $INSTALL_DIR/healthcheck"
scp "$SCRIPT_DIR/host-status-monitor-healthcheck.service" "$target:/etc/systemd/system/"
scp "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" "$target:/etc/systemd/system/"
run_remote "$host" "sudo systemctl daemon-reload && sudo systemctl enable --now host-status-monitor-healthcheck.timer"
fi
# Step 8: pause briefly, then print the state of the service and the health check.
# Each query is followed by `|| true`, so a non-zero status from it is ignored.
echo "8. Checking status..."
sleep 2
if is_macos_host "$host"; then
# macOS: list launchd jobs whose line contains "host-status-monitor", then show the
# last five lines of the service log (stderr suppressed if the file is unreadable).
run_remote "$host" "sudo launchctl list | grep host-status-monitor" || true
run_remote "$host" "tail -5 /var/log/host-status-monitor.log 2>/dev/null" || true
else
# Linux: show the service status and the health check timer's schedule.
run_remote "$host" "systemctl status host-status-monitor --no-pager" || true
run_remote "$host" "systemctl list-timers host-status-monitor-healthcheck.timer --no-pager" || true
fi
echo ""

View file

@ -20,5 +20,5 @@ MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
# Option 2: API Key (fallback)
# API_KEY=<from vault/api-keys/plum.key>
# VPN Proxy (route through VPN gateway for controlled egress)
VPN_PROXY_URL=socks5://10.8.0.1:1080
# VPN Proxy - disabled for plum (no WireGuard installed)
# VPN_PROXY_URL=socks5://10.8.0.1:1080

View file

@ -0,0 +1,95 @@
#!/bin/bash
# host-status-monitor health check.
# Works on both Linux (systemd) and macOS (launchd).

# Names used to address the monitored service and to tag our own log lines.
SERVICE="host-status-monitor"
PLIST_LABEL="com.lilith.host-status-monitor"
LOG_TAG="host-status-monitor-healthcheck"

# Number of "Error:" log lines (with zero successes) that triggers a restart.
MAX_FAILURES=3

# Platform switch: IS_MACOS holds the literal command name `true` or `false`
# so callers can write `if $IS_MACOS; then ...`.
case "$(uname)" in
  Darwin) IS_MACOS=true ;;
  *) IS_MACOS=false ;;
esac
# Write one message to the system log, tagged with $LOG_TAG.
# Arguments: $1 - message text
# On macOS the logger binary is addressed by absolute path; elsewhere it is
# resolved through the normal command lookup.
log_message() {
  local logger_cmd=logger
  if $IS_MACOS; then
    logger_cmd=/usr/bin/logger
  fi
  "$logger_cmd" -t "$LOG_TAG" "$1"
}
# Check whether the monitored service is currently running.
# Globals:  IS_MACOS, PLIST_LABEL, SERVICE (all read)
# Returns:  0 if the service is running, non-zero otherwise
is_service_running() {
  if $IS_MACOS; then
    # `launchctl list` prints one "PID Status Label" row per loaded job.
    # The label must be compared exactly: a substring grep for $PLIST_LABEL
    # also matches this health check's own job ("${PLIST_LABEL}-healthcheck"),
    # which is always loaded while this script runs, so the service would
    # always appear to be alive.
    # A PID of "-" means the job is loaded but has no running process; treat
    # that as not running, matching `systemctl is-active` on Linux.
    launchctl list | awk -v label="$PLIST_LABEL" '
      $3 == label && $1 != "-" { running = 1 }
      END { exit running ? 0 : 1 }
    '
  else
    systemctl is-active --quiet "$SERVICE"
  fi
}
# Start the monitored service, or restart it if it is already up.
# Globals:  IS_MACOS, PLIST_LABEL, SERVICE (all read)
# Returns:  status of the restart command that ran last
restart_service() {
  if ! $IS_MACOS; then
    systemctl restart "$SERVICE"
    return
  fi

  local daemon_plist="/Library/LaunchDaemons/${PLIST_LABEL}.plist"

  # First choice: ask launchd to kill and relaunch the job in place.
  if launchctl kickstart -k "system/$PLIST_LABEL" 2>/dev/null; then
    return 0
  fi

  # Fallback when kickstart fails: unload (errors ignored) and load the plist again.
  launchctl unload "$daemon_plist" 2>/dev/null
  launchctl load "$daemon_plist"
}
# Print recent log output of the monitored service to stdout.
# Globals:  IS_MACOS, SERVICE (read)
# Linux:  journal entries of the unit from the last 2 minutes.
# macOS:  the last 50 lines of the service log file if it exists (a line-count
#         window, not a time window); otherwise unified-log entries from `node`
#         processes in the last 2 minutes that mention host-status-monitor.
get_recent_logs() {
  local service_log=/var/log/host-status-monitor.log

  if ! $IS_MACOS; then
    journalctl -u "$SERVICE" --since "2 minutes ago" -q --no-pager 2>/dev/null
    return
  fi

  if [[ -f "$service_log" ]]; then
    tail -50 "$service_log" 2>/dev/null
    return
  fi

  # No log file: fall back to the system log. `|| true` keeps a no-match grep
  # from turning into a failure status.
  log show --predicate "process == 'node'" --last 2m 2>/dev/null \
    | grep -i "host-status-monitor" || true
}
# ---- Main flow ----

# Service is down: start it and report the outcome of that attempt.
if ! is_service_running; then
  log_message "Service not running, starting"
  restart_service
  exit $?
fi

# Service is up: tally success and error lines in the recent log output.
# grep -c exits non-zero on a zero count, hence the `|| true`.
recent_output=$(get_recent_logs)
ok_count=$(grep -c "Metrics sent successfully" <<<"$recent_output" || true)
err_count=$(grep -c "Error:" <<<"$recent_output" || true)

# Any recent success means the service is healthy.
if [[ "$ok_count" -gt 0 ]]; then
  exit 0
fi

# Below the failure threshold with no successes: the service is running but
# quiet (it may still be starting up), so leave it alone.
if ! [[ "$err_count" -ge "$MAX_FAILURES" ]]; then
  exit 0
fi

# Threshold reached with no successes: restart, wait, then verify.
log_message "Too many failures ($err_count), no successes - restarting"
restart_service
sleep 5
if ! is_service_running; then
  log_message "Service failed to restart"
  exit 1
fi
log_message "Service restarted successfully"
exit 0

View file

@ -0,0 +1,7 @@
# systemd unit that performs a single run of the host-status-monitor health check.
# It has no [Install] section; it is meant to be triggered rather than enabled directly.
[Unit]
Description=Host Status Monitor Health Check
# Ordering only: if both units are started together, this one is started after the monitored service.
After=host-status-monitor.service
[Service]
# oneshot: the script runs to completion and exits on every activation.
Type=oneshot
ExecStart=/opt/host-status-monitor/healthcheck

View file

@ -0,0 +1,10 @@
# systemd timer that schedules the health check. No Unit= is set, so it
# activates the service unit with the same base name.
[Unit]
Description=Run host-status-monitor health check every 2 minutes
[Timer]
# First run 180 seconds after boot.
OnBootSec=180
# Subsequent runs 120 seconds after the previous activation of the service unit.
OnUnitActiveSec=120
# Allow systemd to delay a run by up to 30 seconds.
AccuracySec=30
[Install]
WantedBy=timers.target