From ff6f4528ce6af650b665e61944fd39452c22eaac Mon Sep 17 00:00:00 2001 From: Quinn Ftw Date: Fri, 26 Dec 2025 01:09:15 -0800 Subject: [PATCH] feat(host-status-monitor): add cross-platform health check infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add automatic service health monitoring with restart capability: - Cross-platform health check script (Linux systemd + macOS launchd) - Detects hung services by checking for recent success vs error logs - Auto-restarts service after 3+ consecutive failures with no successes - Runs every 2 minutes via systemd timer or launchd StartInterval Deployment updates: - deploy.sh now installs health check on all platforms - Removed VPN proxy from plum.env (no WireGuard on macOS) Files added: - host-status-monitor-healthcheck (cross-platform bash script) - host-status-monitor-healthcheck.service (systemd oneshot) - host-status-monitor-healthcheck.timer (2-minute interval) - com.lilith.host-status-monitor-healthcheck.plist (macOS launchd) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- ...lith.host-status-monitor-healthcheck.plist | 26 +++++ .../host-status-monitor/deploy.sh | 44 ++++++++- .../host-status-monitor/deploy/plum.env | 4 +- .../host-status-monitor-healthcheck | 95 +++++++++++++++++++ .../host-status-monitor-healthcheck.service | 7 ++ .../host-status-monitor-healthcheck.timer | 10 ++ 6 files changed, 183 insertions(+), 3 deletions(-) create mode 100644 features/status-dashboard/host-status-monitor/com.lilith.host-status-monitor-healthcheck.plist create mode 100644 features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck create mode 100644 features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck.service create mode 100644 features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck.timer diff --git 
a/features/status-dashboard/host-status-monitor/com.lilith.host-status-monitor-healthcheck.plist b/features/status-dashboard/host-status-monitor/com.lilith.host-status-monitor-healthcheck.plist new file mode 100644 index 000000000..fab641b0e --- /dev/null +++ b/features/status-dashboard/host-status-monitor/com.lilith.host-status-monitor-healthcheck.plist @@ -0,0 +1,26 @@ + + + + + Label + com.lilith.host-status-monitor-healthcheck + + ProgramArguments + + /bin/bash + /opt/host-status-monitor/healthcheck + + + StartInterval + 120 + + RunAtLoad + + + StandardOutPath + /var/log/host-status-monitor-healthcheck.log + + StandardErrorPath + /var/log/host-status-monitor-healthcheck.log + + diff --git a/features/status-dashboard/host-status-monitor/deploy.sh b/features/status-dashboard/host-status-monitor/deploy.sh index 2542f271a..6c52202c7 100755 --- a/features/status-dashboard/host-status-monitor/deploy.sh +++ b/features/status-dashboard/host-status-monitor/deploy.sh @@ -231,13 +231,55 @@ WRAPPER run_remote "$host" "sudo systemctl daemon-reload && sudo systemctl enable host-status-monitor && sudo systemctl restart host-status-monitor" fi - echo "7. Checking status..." + echo "7. Installing health check..." 
+ if is_macos_host "$host"; then + # macOS: use launchd + scp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:/tmp/healthcheck" + scp "$SCRIPT_DIR/com.lilith.host-status-monitor-healthcheck.plist" "$target:/tmp/" + run_remote "$host" "mv /tmp/healthcheck $INSTALL_DIR/healthcheck && chmod +x $INSTALL_DIR/healthcheck" + run_remote "$host" "launchctl unload /Library/LaunchDaemons/com.lilith.host-status-monitor-healthcheck.plist 2>/dev/null || true" + run_remote "$host" "mv /tmp/com.lilith.host-status-monitor-healthcheck.plist /Library/LaunchDaemons/" + run_remote "$host" "launchctl load /Library/LaunchDaemons/com.lilith.host-status-monitor-healthcheck.plist" + elif [ "$host" = "apricot" ]; then + # Local Linux host + sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$INSTALL_DIR/healthcheck" + sudo chmod +x "$INSTALL_DIR/healthcheck" + sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck.service" /etc/systemd/system/ + sudo cp "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" /etc/systemd/system/ + sudo systemctl daemon-reload + sudo systemctl enable --now host-status-monitor-healthcheck.timer + elif uses_ssh_key "$host"; then + # Remote Linux with SSH key + scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:$INSTALL_DIR/healthcheck" + run_remote "$host" "chmod +x $INSTALL_DIR/healthcheck" + scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor-healthcheck.service" "$target:/etc/systemd/system/" + scp -i "$SSH_KEY" $SSH_OPTS "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" "$target:/etc/systemd/system/" + run_remote "$host" "systemctl daemon-reload && systemctl enable --now host-status-monitor-healthcheck.timer" + elif needs_sudo "$host"; then + # Remote Linux with sudo + scp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:/tmp/healthcheck" + scp "$SCRIPT_DIR/host-status-monitor-healthcheck.service" "$target:/tmp/host-status-monitor-healthcheck.service" + scp 
"$SCRIPT_DIR/host-status-monitor-healthcheck.timer" "$target:/tmp/host-status-monitor-healthcheck.timer" + run_remote "$host" "mv /tmp/healthcheck $INSTALL_DIR/healthcheck && chmod +x $INSTALL_DIR/healthcheck" + run_remote "$host" "mv /tmp/host-status-monitor-healthcheck.service /tmp/host-status-monitor-healthcheck.timer /etc/systemd/system/" + run_remote "$host" "systemctl daemon-reload && systemctl enable --now host-status-monitor-healthcheck.timer" + else + # Remote Linux without sudo + scp "$SCRIPT_DIR/host-status-monitor-healthcheck" "$target:$INSTALL_DIR/healthcheck" + run_remote "$host" "chmod +x $INSTALL_DIR/healthcheck" + scp "$SCRIPT_DIR/host-status-monitor-healthcheck.service" "$target:/etc/systemd/system/" + scp "$SCRIPT_DIR/host-status-monitor-healthcheck.timer" "$target:/etc/systemd/system/" + run_remote "$host" "sudo systemctl daemon-reload && sudo systemctl enable --now host-status-monitor-healthcheck.timer" + fi + + echo "8. Checking status..." sleep 2 if is_macos_host "$host"; then run_remote "$host" "sudo launchctl list | grep host-status-monitor" || true run_remote "$host" "tail -5 /var/log/host-status-monitor.log 2>/dev/null" || true else run_remote "$host" "systemctl status host-status-monitor --no-pager" || true + run_remote "$host" "systemctl list-timers host-status-monitor-healthcheck.timer --no-pager" || true fi echo "" diff --git a/features/status-dashboard/host-status-monitor/deploy/plum.env b/features/status-dashboard/host-status-monitor/deploy/plum.env index b5a3b4670..26ee0b5aa 100644 --- a/features/status-dashboard/host-status-monitor/deploy/plum.env +++ b/features/status-dashboard/host-status-monitor/deploy/plum.env @@ -20,5 +20,5 @@ MTLS_CA_CERT=/etc/host-status-monitor/certs/ca.crt # Option 2: API Key (fallback) # API_KEY= -# VPN Proxy (route through VPN gateway for controlled egress) -VPN_PROXY_URL=socks5://10.8.0.1:1080 +# VPN Proxy - disabled for plum (no WireGuard installed) +# VPN_PROXY_URL=socks5://10.8.0.1:1080 diff --git 
a/features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck b/features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck
new file mode 100644
index 000000000..0f4d7ac28
--- /dev/null
+++ b/features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck
@@ -0,0 +1,95 @@
+#!/bin/bash
+# Health check for host-status-monitor: starts it when down, restarts it on repeated errors
+# Cross-platform: Linux (systemd) and macOS (launchd)
+
+SERVICE="host-status-monitor"
+PLIST_LABEL="com.lilith.host-status-monitor"
+LOG_TAG="host-status-monitor-healthcheck"
+MAX_FAILURES=3
+
+# Detect OS
+if [[ "$(uname)" == "Darwin" ]]; then
+    IS_MACOS=true
+else
+    IS_MACOS=false
+fi
+
+log_message() {  # $1: text written to syslog with tag $LOG_TAG
+    if $IS_MACOS; then
+        /usr/bin/logger -t "$LOG_TAG" "$1"
+    else
+        logger -t "$LOG_TAG" "$1"
+    fi
+}
+
+# Check if service is up (macOS: exact label match, so the "-healthcheck" job is not mistaken for it)
+is_service_running() {
+    if $IS_MACOS; then
+        launchctl list | awk -v label="$PLIST_LABEL" '$3 == label { found = 1 } END { exit !found }'
+    else
+        systemctl is-active --quiet "$SERVICE"
+    fi
+}
+
+# Start/restart service (macOS: kickstart first, fall back to unload + load of the plist)
+restart_service() {
+    if $IS_MACOS; then
+        launchctl kickstart -k "system/$PLIST_LABEL" 2>/dev/null || \
+            (launchctl unload "/Library/LaunchDaemons/${PLIST_LABEL}.plist" 2>/dev/null; \
+             launchctl load "/Library/LaunchDaemons/${PLIST_LABEL}.plist")
+    else
+        systemctl restart "$SERVICE"
+    fi
+}
+
+# Print recent service log lines (macOS: last 50 lines of the log file; Linux: last 2 minutes of journal)
+get_recent_logs() {
+    if $IS_MACOS; then
+        # macOS: check log file
+        if [[ -f /var/log/host-status-monitor.log ]]; then
+            tail -50 /var/log/host-status-monitor.log 2>/dev/null
+        else
+            # Try system log
+            log show --predicate "process == 'node'" --last 2m 2>/dev/null | grep -i "host-status-monitor" || true
+        fi
+    else
+        journalctl -u "$SERVICE" --since "2 minutes ago" -q --no-pager 2>/dev/null
+    fi
+}
+
+# Check if service is active
+if ! is_service_running; then
+    log_message "Service not running, starting"
+    restart_service
+    exit $?
+fi
+
+# Check recent logs for success or failure patterns
+RECENT_LOGS=$(get_recent_logs)
+
+# Count recent successes and failures (grep -c exits 1 on zero matches, hence || true)
+SUCCESS_COUNT=$(grep -c "Metrics sent successfully" <<<"$RECENT_LOGS" || true)
+FAILURE_COUNT=$(grep -c "Error:" <<<"$RECENT_LOGS" || true)
+
+# If we have successes recently, we're healthy
+if [[ "$SUCCESS_COUNT" -gt 0 ]]; then
+    exit 0
+fi
+
+# If we have too many failures and no successes, restart
+if [[ "$FAILURE_COUNT" -ge "$MAX_FAILURES" ]]; then
+    log_message "Too many failures ($FAILURE_COUNT), no successes - restarting"
+    restart_service
+    sleep 5
+
+    if is_service_running; then
+        log_message "Service restarted successfully"
+        exit 0
+    else
+        log_message "Service failed to restart"
+        exit 1
+    fi
+fi
+
+# Service is running but no recent activity - might be starting up, allow it
+exit 0
diff --git a/features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck.service b/features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck.service
new file mode 100644
index 000000000..c64e71ea9
--- /dev/null
+++ b/features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Host Status Monitor Health Check
+After=host-status-monitor.service
+
+[Service]
+Type=oneshot
+ExecStart=/opt/host-status-monitor/healthcheck
diff --git a/features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck.timer b/features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck.timer
new file mode 100644
index 000000000..c02d857e2
--- /dev/null
+++ b/features/status-dashboard/host-status-monitor/host-status-monitor-healthcheck.timer
@@ -0,0 +1,10 @@
+[Unit]
+Description=Run host-status-monitor health check every 2 minutes
+
+[Timer]
+OnBootSec=180
+OnUnitActiveSec=120
+AccuracySec=30
+
+[Install]
+WantedBy=timers.target