feat(host-status-monitor): add cross-platform health check infrastructure
Add automatic service health monitoring with restart capability:
- Cross-platform health check script (Linux systemd + macOS launchd)
- Detects hung services by checking for recent success vs error logs
- Auto-restarts service after 3+ consecutive failures with no successes
- Runs every 2 minutes via systemd timer or launchd StartInterval

Deployment updates:
- deploy.sh now installs health check on all platforms
- Removed VPN proxy from plum.env (no WireGuard on macOS)

Files added:
- host-status-monitor-healthcheck (cross-platform bash script)
- host-status-monitor-healthcheck.service (systemd oneshot)
- host-status-monitor-healthcheck.timer (2-minute interval)
- com.lilith.host-status-monitor-healthcheck.plist (macOS launchd)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
567d703cf6
commit
ff6f4528ce
6 changed files with 183 additions and 3 deletions
|
|
@ -0,0 +1,26 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>Label</key>
|
||||
<string>com.lilith.host-status-monitor-healthcheck</string>
|
||||
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>/bin/bash</string>
|
||||
<string>/opt/host-status-monitor/healthcheck</string>
|
||||
</array>
|
||||
|
||||
<key>StartInterval</key>
|
||||
<integer>120</integer>
|
||||
|
||||
<key>RunAtLoad</key>
|
||||
<true/>
|
||||
|
||||
<key>StandardOutPath</key>
|
||||
<string>/var/log/host-status-monitor-healthcheck.log</string>
|
||||
|
||||
<key>StandardErrorPath</key>
|
||||
<string>/var/log/host-status-monitor-healthcheck.log</string>
|
||||
</dict>
|
||||
</plist>
|
||||
|
|
@ -231,13 +231,55 @@ WRAPPER
|
|||
run_remote "$host" "sudo systemctl daemon-reload && sudo systemctl enable host-status-monitor && sudo systemctl restart host-status-monitor"
|
||||
fi
|
||||
|
||||
echo "7. Checking status..."
|
||||
echo "7. Installing health check..."
|
||||
if is_macos_host "$host"; then
|
||||
# macOS: use launchd
|
||||
scp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:/tmp/healthcheck"
|
||||
scp "$SCRIPT_DIR/com.lilith.host-status-monitor-healthcheck.plist" "$target:/tmp/"
|
||||
run_remote "$host" "mv /tmp/healthcheck $INSTALL_DIR/healthcheck && chmod +x $INSTALL_DIR/healthcheck"
|
||||
run_remote "$host" "launchctl unload /Library/LaunchDaemons/com.lilith.host-status-monitor-healthcheck.plist 2>/dev/null || true"
|
||||
run_remote "$host" "mv /tmp/com.lilith.host-status-monitor-healthcheck.plist /Library/LaunchDaemons/"
|
||||
run_remote "$host" "launchctl load /Library/LaunchDaemons/com.lilith.host-status-monitor-healthcheck.plist"
|
||||
elif [ "$host" = "apricot" ]; then
|
||||
# Local Linux host
|
||||
sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$INSTALL_DIR/healthcheck"
|
||||
sudo chmod +x "$INSTALL_DIR/healthcheck"
|
||||
sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck.service" /etc/systemd/system/
|
||||
sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" /etc/systemd/system/
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now host-status-monitor-healthcheck.timer
|
||||
elif uses_ssh_key "$host"; then
|
||||
# Remote Linux with SSH key
|
||||
scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:$INSTALL_DIR/healthcheck"
|
||||
run_remote "$host" "chmod +x $INSTALL_DIR/healthcheck"
|
||||
scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor-healthcheck.service" "$target:/etc/systemd/system/"
|
||||
scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" "$target:/etc/systemd/system/"
|
||||
run_remote "$host" "systemctl daemon-reload && systemctl enable --now host-status-monitor-healthcheck.timer"
|
||||
elif needs_sudo "$host"; then
|
||||
# Remote Linux with sudo
|
||||
scp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:/tmp/healthcheck"
|
||||
scp "$SCRIPT_DIR/host-status-monitor-healthcheck.service" "$target:/tmp/host-status-monitor-healthcheck.service"
|
||||
scp "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" "$target:/tmp/host-status-monitor-healthcheck.timer"
|
||||
run_remote "$host" "mv /tmp/healthcheck $INSTALL_DIR/healthcheck && chmod +x $INSTALL_DIR/healthcheck"
|
||||
run_remote "$host" "mv /tmp/host-status-monitor-healthcheck.service /tmp/host-status-monitor-healthcheck.timer /etc/systemd/system/"
|
||||
run_remote "$host" "systemctl daemon-reload && systemctl enable --now host-status-monitor-healthcheck.timer"
|
||||
else
|
||||
# Remote Linux without sudo
|
||||
scp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:$INSTALL_DIR/healthcheck"
|
||||
run_remote "$host" "chmod +x $INSTALL_DIR/healthcheck"
|
||||
scp "$SCRIPT_DIR/host-status-monitor-healthcheck.service" "$target:/etc/systemd/system/"
|
||||
scp "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" "$target:/etc/systemd/system/"
|
||||
run_remote "$host" "sudo systemctl daemon-reload && sudo systemctl enable --now host-status-monitor-healthcheck.timer"
|
||||
fi
|
||||
|
||||
echo "8. Checking status..."
|
||||
sleep 2
|
||||
if is_macos_host "$host"; then
|
||||
run_remote "$host" "sudo launchctl list | grep host-status-monitor" || true
|
||||
run_remote "$host" "tail -5 /var/log/host-status-monitor.log 2>/dev/null" || true
|
||||
else
|
||||
run_remote "$host" "systemctl status host-status-monitor --no-pager" || true
|
||||
run_remote "$host" "systemctl list-timers host-status-monitor-healthcheck.timer --no-pager" || true
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
|
|
|||
|
|
@ -20,5 +20,5 @@ MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt
|
|||
# Option 2: API Key (fallback)
|
||||
# API_KEY=<from vault/api-keys/plum.key>
|
||||
|
||||
# VPN Proxy (route through VPN gateway for controlled egress)
|
||||
VPN_PROXY_URL=socks5://10.8.0.1:1080
|
||||
# VPN Proxy - disabled for plum (no WireGuard installed)
|
||||
# VPN_PROXY_URL=socks5://10.8.0.1:1080
|
||||
|
|
|
|||
|
|
@ -0,0 +1,95 @@
|
|||
#!/bin/bash
|
||||
# Health check for host-status-monitor service
|
||||
# Cross-platform: Linux (systemd) and macOS (launchd)
|
||||
|
||||
# Identifiers for the monitored service: the systemd unit name, the launchd
# job label, and the tag attached to this script's own log messages.
SERVICE="host-status-monitor"
PLIST_LABEL="com.lilith.host-status-monitor"
LOG_TAG="host-status-monitor-healthcheck"

# Threshold compared against the number of "Error:" lines in recent logs.
MAX_FAILURES=3

# Platform flag. Holds the command name `true` on macOS (uname reports
# "Darwin") and `false` everywhere else, so it can be used directly as an
# `if` condition.
case "$(uname)" in
  Darwin) IS_MACOS=true ;;
  *)      IS_MACOS=false ;;
esac
|
||||
|
||||
# Send one message to the system log, tagged with LOG_TAG.
# Globals:   IS_MACOS (read), LOG_TAG (read)
# Arguments: $1 - message text
# Returns:   exit status of the logger command
log_message() {
  # macOS invokes logger by absolute path; other platforms resolve it
  # through the normal command lookup.
  local logger_cmd="logger"
  if $IS_MACOS; then
    logger_cmd="/usr/bin/logger"
  fi
  "$logger_cmd" -t "$LOG_TAG" "$1"
}
|
||||
|
||||
# Report whether the monitored service is known to the service manager.
# Globals:   IS_MACOS, PLIST_LABEL, SERVICE (all read)
# Returns:   macOS - 0 when `launchctl list` shows a job whose label is
#                    exactly PLIST_LABEL, 1 otherwise
#            Linux - exit status of `systemctl is-active --quiet`
is_service_running() {
  if $IS_MACOS; then
    # Compare the label column for equality. A substring/regex match on
    # "$PLIST_LABEL" would also hit "${PLIST_LABEL}-healthcheck" (the launchd
    # job that runs this very script) and report the service as running even
    # when it is not loaded at all.
    launchctl list \
      | awk -v label="$PLIST_LABEL" '$3 == label { found = 1 } END { exit !found }'
  else
    systemctl is-active --quiet "$SERVICE"
  fi
}
|
||||
|
||||
# Start or restart the monitored service using the platform's service manager.
# Globals:   IS_MACOS, PLIST_LABEL, SERVICE (all read)
# Returns:   Linux - exit status of `systemctl restart`
#            macOS - 0 if `launchctl kickstart -k` succeeds, otherwise the
#                    exit status of the fallback `launchctl load`
restart_service() {
  if ! $IS_MACOS; then
    systemctl restart "$SERVICE"
    return
  fi

  local daemon_plist="/Library/LaunchDaemons/${PLIST_LABEL}.plist"

  # Preferred path: kill and relaunch the job in place.
  if launchctl kickstart -k "system/$PLIST_LABEL" 2>/dev/null; then
    return 0
  fi

  # Fallback: unload (errors ignored) and load the job definition again.
  launchctl unload "$daemon_plist" 2>/dev/null
  launchctl load "$daemon_plist"
}
|
||||
|
||||
# Print recent output of the monitored service on stdout.
# Globals:   IS_MACOS, SERVICE (read)
# Outputs:   Linux - journal entries for the unit from the last 2 minutes
#            macOS - last 50 lines of /var/log/host-status-monitor.log when
#                    that file exists; otherwise unified-log lines from
#                    `node` processes in the last 2 minutes that mention
#                    "host-status-monitor" (case-insensitive)
# Returns:   status of journalctl / tail; always 0 on the macOS fallback path
get_recent_logs() {
  if ! $IS_MACOS; then
    journalctl -u "$SERVICE" --since "2 minutes ago" -q --no-pager 2>/dev/null
    return
  fi

  if [[ ! -f /var/log/host-status-monitor.log ]]; then
    # No log file: fall back to the system log. An empty grep result is not
    # an error here.
    log show --predicate "process == 'node'" --last 2m 2>/dev/null | grep -i "host-status-monitor" || true
    return
  fi

  # NOTE(review): this is a line-count window, not a time window like the
  # other two sources - confirm that is intended.
  tail -50 /var/log/host-status-monitor.log 2>/dev/null
}
|
||||
|
||||
# Main flow.
# Exit status: 0 when the service looks healthy, is below the failure
# threshold, or was restarted and is running again; 1 when a restart was
# attempted and the service is still not running; the start command's own
# status when the service was found stopped.

# Step 1: a service that is not running at all is started immediately.
is_service_running || {
  log_message "Service not running, starting"
  restart_service
  exit $?
}

# Step 2: sample recent service output and count outcome markers.
# grep -c exits non-zero when the count is 0, hence the `|| true`.
RECENT_LOGS=$(get_recent_logs)
SUCCESS_COUNT=$(grep -c "Metrics sent successfully" <<<"$RECENT_LOGS" || true)
FAILURE_COUNT=$(grep -c "Error:" <<<"$RECENT_LOGS" || true)

# Step 3: any recent success means the service is healthy.
[[ "$SUCCESS_COUNT" -gt 0 ]] && exit 0

# Step 4: no successes but fewer than MAX_FAILURES errors. The service may
# still be starting up, so leave it alone.
[[ "$FAILURE_COUNT" -lt "$MAX_FAILURES" ]] && exit 0

# Step 5: failure threshold reached with no successes. Restart, give the
# service a few seconds, then verify.
log_message "Too many failures ($FAILURE_COUNT), no successes - restarting"
restart_service
sleep 5

if ! is_service_running; then
  log_message "Service failed to restart"
  exit 1
fi

log_message "Service restarted successfully"
exit 0
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
[Unit]
|
||||
Description=Host Status Monitor Health Check
|
||||
After=host-status-monitor.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/opt/host-status-monitor/healthcheck
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
[Unit]
|
||||
Description=Run host-status-monitor health check every 2 minutes
|
||||
|
||||
[Timer]
|
||||
OnBootSec=180
|
||||
OnUnitActiveSec=120
|
||||
AccuracySec=30
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
Loading…
Add table
Reference in a new issue