platform-deployments/hosts/check-hosts
Lilith b6ca567a75 feat: initialize infrastructure repo with verification system
Move infrastructure tooling to dedicated repository, separate from codebase.
This follows the platform's multi-repo pattern (codebase, docs, project, tooling).

Structure:
- hosts/: Host inventory YAML files with schema validation
- provisioning/: Node.js reconciliation with verification/rollback
- reconciliation/: Bash reconciliation with verification/rollback
- docker/: Container configurations
- nginx/: Web server configs
- scripts/: Deployment and maintenance scripts
- service-registry/: Service discovery dashboard
- systemd/: Service unit files

Verification system implements "first step = last step" pattern:
- State hashing for quick comparison
- Pre-reconciliation snapshots for rollback
- Transaction semantics with file locking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 02:31:31 -08:00

448 lines
15 KiB
Bash
Executable file

#!/bin/bash
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
#
# Host Inventory Check
#
# Reads the per-host YAML inventory files, reports system facts for every
# host, validates the services each host declares, and offers to install
# the ones that are missing.
#
# Absolute path of the directory this script lives in.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
# ANSI colour sequences. They are stored with a literal backslash and only
# turn into real escape characters when printed through `echo -e`.
NC="\033[0m"      # reset
BOLD="\033[1m"
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
CYAN="\033[0;36m"
# Capability definitions: service name -> shell snippet that exits 0 when the
# service is present on the target. Each snippet is passed as a string to
# ssh_to_host by check_service.
declare -A SERVICE_CHECKS=()
SERVICE_CHECKS["sshd"]="systemctl is-active sshd 2>/dev/null || systemctl is-active ssh 2>/dev/null || pgrep -x sshd >/dev/null || netstat -an 2>/dev/null | grep -q '\\.22.*LISTEN'"
SERVICE_CHECKS["nginx"]="systemctl is-active nginx 2>/dev/null || pgrep -x nginx >/dev/null"
SERVICE_CHECKS["docker"]="docker --version 2>/dev/null"
SERVICE_CHECKS["wireguard"]="wg show 2>/dev/null || systemctl is-active wg-quick@wg0 2>/dev/null"
# Database/cache services also count as present when a container whose name
# contains the service name is running under docker or podman.
SERVICE_CHECKS["postgresql"]="systemctl is-active postgresql 2>/dev/null || pg_isready 2>/dev/null || docker ps --format '{{.Names}}' 2>/dev/null | grep -qi postgres || podman ps --format '{{.Names}}' 2>/dev/null | grep -qi postgres"
SERVICE_CHECKS["redis"]="systemctl is-active redis 2>/dev/null || redis-cli ping 2>/dev/null || docker ps --format '{{.Names}}' 2>/dev/null | grep -qi redis || podman ps --format '{{.Names}}' 2>/dev/null | grep -qi redis"
SERVICE_CHECKS["nfs-server"]="systemctl is-active nfs-server 2>/dev/null"
SERVICE_CHECKS["pdns"]="systemctl is-active pdns 2>/dev/null"
# "powerdns" is a second name for the same check as "pdns".
SERVICE_CHECKS["powerdns"]="${SERVICE_CHECKS["pdns"]}"
# Install commands for Debian-family hosts: service name -> command line that
# check_host_file runs on the target as `sudo bash -c '<command>'`.
# The commands must not contain single quotes, because they are embedded in a
# single-quoted string when sent to the target.
declare -A SERVICE_INSTALL_DEBIAN=(
  # Debian/Ubuntu ship the OpenSSH unit as ssh.service; "sshd" is only an
  # alias there and `systemctl enable` refuses to operate on alias names.
  ["sshd"]="apt-get install -y openssh-server && systemctl enable --now ssh"
  ["nginx"]="apt-get install -y nginx && systemctl enable --now nginx"
  ["docker"]="curl -fsSL https://get.docker.com | sh"
  ["wireguard"]="apt-get install -y wireguard-tools"
  ["postgresql"]="apt-get install -y postgresql && systemctl enable --now postgresql"
  ["redis"]="apt-get install -y redis-server && systemctl enable --now redis-server"
  ["nfs-server"]="apt-get install -y nfs-kernel-server && systemctl enable --now nfs-server"
  ["pdns"]="apt-get install -y pdns-server && systemctl enable --now pdns"
  # SERVICE_CHECKS accepts "powerdns" as a second name for pdns, so it needs
  # an install entry as well.
  ["powerdns"]="apt-get install -y pdns-server && systemctl enable --now pdns"
)
# Install commands for Fedora-family hosts: service name -> command line that
# check_host_file runs on the target under sudo.
declare -A SERVICE_INSTALL_FEDORA=()
SERVICE_INSTALL_FEDORA["sshd"]="dnf install -y openssh-server && systemctl enable --now sshd"
SERVICE_INSTALL_FEDORA["nginx"]="dnf install -y nginx && systemctl enable --now nginx"
SERVICE_INSTALL_FEDORA["docker"]="dnf install -y docker && systemctl enable --now docker"
SERVICE_INSTALL_FEDORA["wireguard"]="dnf install -y wireguard-tools"
SERVICE_INSTALL_FEDORA["postgresql"]="dnf install -y postgresql-server && postgresql-setup --initdb && systemctl enable --now postgresql"
SERVICE_INSTALL_FEDORA["redis"]="dnf install -y redis && systemctl enable --now redis"
# Services that are flagged as CRITICAL in the report when they are missing.
declare -a CRITICAL_SERVICES=(sshd)
# Verify that the external tools this script relies on (yq, jq) are on PATH.
# Prints an install hint and exits with status 1 at the first missing tool.
check_deps() {
  local tool
  for tool in yq jq; do
    command -v "$tool" &>/dev/null && continue
    echo -e "${RED}${tool} not found. Install with: brew install ${tool}${NC}"
    exit 1
  done
}
# List every host inventory file below SCRIPT_DIR, one path per line, sorted.
# index.yaml files and anything under a schema/ directory are left out.
find_host_files() {
  local -a criteria=(
    -name "*.yaml"
    -type f
    ! -name "index.yaml"
    ! -path "*/schema/*"
  )
  find "$SCRIPT_DIR" "${criteria[@]}" 2>/dev/null | sort
}
# Print the host id for an inventory file: the value of its `.id` key, or the
# file name without the .yaml suffix when the key is null or cannot be read.
# Arguments: $1 - path to the host YAML file
get_host_id() {
  local file="$1"
  if ! yq e '.id' "$file" 2>/dev/null | grep -v '^null$'; then
    basename "$file" .yaml
  fi
}
# Print the value found at a dotted path in a host YAML file. Prints nothing
# when the value is null or yq fails; the exit status is always 0.
# Arguments: $1 - path to the host YAML file
#            $2 - property path without the leading dot (e.g. ssh.host)
get_prop() {
  local file="$1" prop="$2"
  local query=".${prop}"
  { yq e "$query" "$file" 2>/dev/null | grep -v '^null$'; } || true
}
# Print the entries of the capabilities.services array of a host file, one
# per line. A yq failure is swallowed: the exit status is always 0.
# Arguments: $1 - path to the host YAML file
get_required_services() {
  local file="$1"
  if ! yq e '.capabilities.services[]' "$file" 2>/dev/null; then
    return 0
  fi
}
# Tell whether a service name is listed in CRITICAL_SERVICES.
# Globals:   CRITICAL_SERVICES (read)
# Arguments: $1 - service name
# Returns:   0 when the service is critical, 1 otherwise
is_critical() {
  local svc="$1"
  # Keep the loop variable local so it cannot overwrite a caller's variable.
  local candidate
  # The ${arr[@]+...} form expands to nothing for an empty array instead of
  # raising an "unbound variable" error under `set -u` on older bash.
  for candidate in ${CRITICAL_SERVICES[@]+"${CRITICAL_SERVICES[@]}"}; do
    if [[ "$svc" == "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}
# Tell whether an SSH target refers to the machine this script runs on, so
# callers can run commands directly instead of SSH-ing to themselves.
# Arguments: $1 - host name or IP address from the inventory
# Returns:   0 when the target is "localhost", equals the output of
#            `hostname`, or equals one of the local addresses; 1 otherwise
#            (including for an empty target).
is_local_host() {
  local ssh_host="$1"

  # An empty target can never be "this host"; without this guard the caller
  # would run remote checks locally for a host that has no address at all.
  [[ -n "$ssh_host" ]] || return 1
  [[ "$ssh_host" == "localhost" ]] && return 0

  local current_hostname
  current_hostname=$(hostname 2>/dev/null) || true
  [[ "$ssh_host" == "$current_hostname" ]] && return 0

  # `hostname -I` is not available everywhere; fall back to parsing `ip`.
  local current_ips
  current_ips=$(hostname -I 2>/dev/null \
    || ip -4 addr show 2>/dev/null | grep inet | awk '{print $2}' | cut -d/ -f1) || true

  # Split on any whitespace (spaces or newlines) without globbing.
  local -a local_addrs=()
  read -r -d '' -a local_addrs <<< "$current_ips" || true

  # Compare whole addresses literally. A regex/word match would accept "1.1"
  # for 10.1.1.5 or "db8" for 2001:db8::1 and misclassify a remote host.
  local addr
  for addr in ${local_addrs[@]+"${local_addrs[@]}"}; do
    [[ "$ssh_host" == "$addr" ]] && return 0
  done
  return 1
}
# Run a command on a host: directly through `bash -c` when the target is this
# machine, otherwise over SSH. stderr of the command is discarded.
# Arguments: $1 - host name or IP
#            $2 - SSH user
#            $3 - path to the private key (may be empty; a leading ~ is
#                 replaced with $HOME)
#            $4.. - command to run
# Returns:   exit status of the command (or of ssh itself)
ssh_to_host() {
  local ssh_host="$1"
  local ssh_user="$2"
  local ssh_key="$3"
  shift 3

  if is_local_host "$ssh_host"; then
    bash -c "$*" 2>/dev/null
    return $?
  fi

  ssh_key="${ssh_key/#\~/$HOME}"

  # Options live in an array so a key path containing whitespace stays one
  # argument instead of being split by word expansion.
  local -a ssh_opts=(
    -o ConnectTimeout=10
    -o BatchMode=yes
    -o StrictHostKeyChecking=no
  )
  # Only pass -i when the key file really exists; otherwise ssh falls back to
  # its default identities.
  if [[ -n "$ssh_key" && -f "$ssh_key" ]]; then
    ssh_opts+=(-i "$ssh_key")
  fi

  ssh "${ssh_opts[@]}" "${ssh_user}@${ssh_host}" "$@" 2>/dev/null
}
# Turn an inventory key reference into a key path. A reference of the form
# vault://ssh-keys/<name> becomes ~/.ssh/<name> (the tilde is left literal);
# any other value is printed unchanged.
# Arguments: $1 - key reference or path
resolve_key_ref() {
  local ref="$1"
  local vault_prefix="vault://ssh-keys/"
  local key_name
  case "$ref" in
    "$vault_prefix"*)
      key_name="${ref#"$vault_prefix"}"
      echo "~/.ssh/${key_name}"
      ;;
    *)
      echo "$ref"
      ;;
  esac
}
# Gather system info (Linux + macOS compatible)
#
# Runs a probe script on the target through ssh_to_host and prints what the
# probe prints: a JSON object with the keys hostname, os, os_version,
# os_family, kernel, arch, cpus, ram_gb, disk_root_gb, disk_root_used_pct and
# uptime.
#
# Arguments: $1 - host name or IP
#            $2 - SSH user
#            $3 - path to the private key
#
# Maintenance notes:
# - The whole probe is ONE single-quoted argument of the form
#   bash -c '<script>'; the '\'' sequences splice the inner single quotes into
#   it. Everything between them is program text sent to the target, so the
#   "#" lines inside are part of the payload, not comments of this file.
# - Inside the probe, \$2 / \$4 / \$5 are escaped so the probe's shell hands
#   awk a literal $N field reference.
# - The heredoc uses `<<` (not `<<-`), so its EOF terminator has to stay at
#   the very start of its line.
# - cpus, ram_gb and disk_root_gb are interpolated without quotes; the output
#   is valid JSON only when each expands to a number.
#   NOTE(review): the `free -g | awk` branch has no default, so ram_gb looks
#   like it could be empty if awk matches nothing; confirm.
# - Other values are interpolated without JSON escaping.
gather_system_info() {
local ssh_host="$1"
local ssh_user="$2"
local ssh_key="$3"
# Single remote (or local) invocation; see the quoting notes above.
ssh_to_host "$ssh_host" "$ssh_user" "$ssh_key" 'bash -c '\''
hostname=$(hostname -s 2>/dev/null || hostname)
# Detect OS
if [ -f /etc/os-release ]; then
os=$(. /etc/os-release && echo $ID)
os_version=$(. /etc/os-release && echo $VERSION_ID)
os_family=$(. /etc/os-release && echo ${ID_LIKE:-$ID} | cut -d" " -f1)
elif command -v sw_vers >/dev/null 2>&1; then
os="darwin"
os_version=$(sw_vers -productVersion)
os_family="darwin"
else
os=$(uname -s | tr "[:upper:]" "[:lower:]")
os_version=$(uname -r)
os_family="unknown"
fi
kernel=$(uname -r)
arch=$(uname -m)
# CPU count
cpus=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
# RAM in GB
if command -v free >/dev/null 2>&1; then
ram_gb=$(free -g 2>/dev/null | awk "/Mem:/ {print \$2}")
elif command -v sysctl >/dev/null 2>&1; then
ram_bytes=$(sysctl -n hw.memsize 2>/dev/null || echo 0)
ram_gb=$((ram_bytes / 1073741824))
else
ram_gb=0
fi
# Disk free in GB
if df -BG / >/dev/null 2>&1; then
disk_root_gb=$(df -BG / | awk "NR==2 {gsub(/G/,\"\",\$4); print \$4}")
disk_pct=$(df / | awk "NR==2 {print \$5}")
elif df -g / >/dev/null 2>&1; then
disk_root_gb=$(df -g / | awk "NR==2 {print \$4}")
disk_pct=$(df / | awk "NR==2 {print \$5}")
else
disk_root_gb=0
disk_pct="0%"
fi
# Uptime
up=$(uptime -p 2>/dev/null || uptime | sed "s/.*up //" | sed "s/,.*//" | xargs)
cat << EOF
{
"hostname": "$hostname",
"os": "$os",
"os_version": "$os_version",
"os_family": "$os_family",
"kernel": "$kernel",
"arch": "$arch",
"cpus": $cpus,
"ram_gb": $ram_gb,
"disk_root_gb": $disk_root_gb,
"disk_root_used_pct": "$disk_pct",
"uptime": "$up"
}
EOF
'\'''
}
# Probe one service on a host and print its state on stdout:
#   unknown - SERVICE_CHECKS has no entry for the service
#   ok      - the check command succeeded on the target
#   missing - the check command failed
# Arguments: $1 - host, $2 - SSH user, $3 - key path, $4 - service name
check_service() {
  local ssh_host="$1" ssh_user="$2" ssh_key="$3" svc="$4"
  local probe="${SERVICE_CHECKS[$svc]:-}"

  if [[ -z "$probe" ]]; then
    echo "unknown"
    return
  fi

  local verdict="missing"
  ssh_to_host "$ssh_host" "$ssh_user" "$ssh_key" "$probe" &>/dev/null && verdict="ok"
  echo "$verdict"
}
# Print the report header for one host: a framed title followed by OS, CPU,
# RAM, disk and uptime lines taken from the gathered JSON.
# Arguments: $1 - display name
#            $2 - FQDN
#            $3 - network group
#            $4 - JSON document as printed by gather_system_info
print_host_report() {
  local displayName="$1" fqdn="$2" networkGroup="$3" info="$4"
  local rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # Each field has a jq-side default so a missing key still prints something.
  local os_name=$(jq -r '.os // "unknown"' <<< "$info")
  local os_release=$(jq -r '.os_version // ""' <<< "$info")
  local cpu_count=$(jq -r '.cpus // 0' <<< "$info")
  local ram_total=$(jq -r '.ram_gb // 0' <<< "$info")
  local disk_free=$(jq -r '.disk_root_gb // 0' <<< "$info")
  local disk_used=$(jq -r '.disk_root_used_pct // "0%"' <<< "$info")
  local up_since=$(jq -r '.uptime // "unknown"' <<< "$info")

  echo -e "${BOLD}${BLUE}${rule}${NC}"
  echo -e "${BOLD} ${displayName}${NC}"
  echo -e " ${CYAN}$fqdn${NC} ${YELLOW}[$networkGroup]${NC}"
  echo -e "${BLUE}${rule}${NC}"
  echo ""

  printf " %-12s %s %s\n" "OS:" "$os_name" "$os_release"
  printf " %-12s %s cores\n" "CPU:" "$cpu_count"
  printf " %-12s %s GB\n" "RAM:" "$ram_total"
  printf " %-12s %s GB free (%s used)\n" "Disk:" "$disk_free" "$disk_used"
  printf " %-12s %s\n" "Uptime:" "$up_since"
  echo ""
}
# Map os-release identifiers to the install-command table to use.
# Arguments: candidate identifiers in priority order (ID first, then ID_LIKE)
# Outputs:   "debian" or "fedora" for the first recognised identifier;
#            otherwise the first argument unchanged (used in the error text)
_install_family_for() {
  local candidate
  for candidate in "$@"; do
    case "$candidate" in
      debian|ubuntu)
        echo "debian"
        return 0
        ;;
      fedora|rhel|centos|rocky|alma|almalinux)
        echo "fedora"
        return 0
        ;;
    esac
  done
  echo "${1:-}"
}

# Check one host described by an inventory file: test connectivity, print the
# system report, check every required service and the optional disk
# threshold, and in --fix mode offer to install each missing service.
# Globals:   SERVICE_INSTALL_DEBIAN, SERVICE_INSTALL_FEDORA, colour
#            constants (read)
# Arguments: $1 - path to the host YAML file
#            $2 - "--fix" to offer installs; anything else only reports
#                 (default "check")
# Inputs:    reads y/N answers from stdin in --fix mode
# Returns:   1 when the host cannot be reached or the system info cannot be
#            gathered, 0 otherwise
check_host_file() {
  local file="$1"
  local fix_mode="${2:-check}"
  local svc   # loop variable; local so it cannot leak into the caller

  local host_id displayName fqdn networkGroup
  host_id=$(get_host_id "$file") || true
  displayName=$(get_prop "$file" "displayName") || true
  fqdn=$(get_prop "$file" "fqdn") || true
  networkGroup=$(get_prop "$file" "networkGroup") || true

  # SSH connection details
  local ssh_host ssh_ip ssh_user ssh_keyRef ssh_key
  ssh_host=$(get_prop "$file" "ssh.host") || true
  ssh_ip=$(get_prop "$file" "ssh.ip") || true
  ssh_user=$(get_prop "$file" "ssh.user") || true
  ssh_keyRef=$(get_prop "$file" "ssh.keyRef") || true
  ssh_key=$(resolve_key_ref "$ssh_keyRef") || true

  # Fall back to the IP when no host name is configured, and to root when no
  # user is configured.
  ssh_host="${ssh_host:-$ssh_ip}"
  ssh_user="${ssh_user:-root}"

  echo -e "${CYAN}Checking $host_id ($ssh_host)...${NC}"

  # Test connectivity
  if ! ssh_to_host "$ssh_host" "$ssh_user" "$ssh_key" "true" 2>/dev/null; then
    echo -e " ${RED}✗ Cannot connect to $ssh_host${NC}"
    echo ""
    return 1
  fi

  # Gather system info
  local info
  info=$(gather_system_info "$ssh_host" "$ssh_user" "$ssh_key") || {
    echo -e " ${RED}✗ Failed to gather system info${NC}"
    echo ""
    return 1
  }

  # os is the os-release ID; os_like is the first ID_LIKE entry, which lets
  # derivative distributions resolve to their parent's install commands.
  local os os_like
  os=$(jq -r '.os // "unknown"' <<< "$info") || os="unknown"
  os_like=$(jq -r '.os_family // ""' <<< "$info") || os_like=""

  print_host_report "${displayName:-$host_id}" "$fqdn" "$networkGroup" "$info"

  # Check required services from capabilities.services (one name per line)
  local services_raw
  services_raw=$(get_required_services "$file") || true
  local -a services=() missing=() critical_missing=()
  if [[ -n "$services_raw" ]]; then
    mapfile -t services <<< "$services_raw"
  fi

  if (( ${#services[@]} > 0 )); then
    echo -e " ${BOLD}Required Services:${NC}"
    local status
    for svc in "${services[@]}"; do
      [[ -n "$svc" ]] || continue
      status=$(check_service "$ssh_host" "$ssh_user" "$ssh_key" "$svc") || true
      # Every state gets a visible marker so the report stays readable
      # without colour support.
      case "$status" in
        ok)
          echo -e " ${GREEN}✓${NC} $svc"
          ;;
        missing)
          if is_critical "$svc"; then
            echo -e " ${RED}✗ $svc (CRITICAL)${NC}"
            critical_missing+=("$svc")
          else
            echo -e " ${YELLOW}✗ $svc${NC}"
          fi
          missing+=("$svc")
          ;;
        *)
          echo -e " ${YELLOW}? $svc (unknown check)${NC}"
          ;;
      esac
    done
    echo ""
  fi

  # Check disk threshold from alerts config
  local disk_threshold
  disk_threshold=$(get_prop "$file" "alerts.diskThreshold") || true
  if [[ -n "$disk_threshold" ]]; then
    local disk_pct_num
    disk_pct_num=$(jq -r '.disk_root_used_pct // "0%"' <<< "$info" | tr -d '%') || disk_pct_num=""
    # Compare only plain integers; anything else would raise an arithmetic
    # error. 10# forces base 10 so a value like "08" is not read as octal.
    if [[ "$disk_pct_num" =~ ^[0-9]+$ && "$disk_threshold" =~ ^[0-9]+$ ]]; then
      if (( 10#$disk_pct_num >= 10#$disk_threshold )); then
        echo -e " ${RED}⚠ Disk usage ${disk_pct_num}% exceeds threshold ${disk_threshold}%${NC}"
      fi
    else
      echo -e " ${YELLOW}? Disk threshold check skipped (usage '${disk_pct_num}', threshold '${disk_threshold}')${NC}"
    fi
  fi

  # Offer to fix missing services
  if [[ ${#missing[@]} -gt 0 && "$fix_mode" == "--fix" ]]; then
    echo ""

    # Loop-invariant setup: install family and SSH options.
    local install_family
    install_family=$(_install_family_for "$os" "$os_like")

    # No BatchMode here: the install runs with a TTY (-t), presumably so sudo
    # can prompt. Options are an array so a key path with spaces stays one
    # argument.
    local -a ssh_opts=(-o ConnectTimeout=10 -o StrictHostKeyChecking=no)
    ssh_key="${ssh_key/#\~/$HOME}"
    if [[ -n "$ssh_key" && -f "$ssh_key" ]]; then
      ssh_opts+=(-i "$ssh_key")
    fi

    local response install_cmd
    for svc in "${missing[@]}"; do
      echo -n -e "${YELLOW}Install $svc on $host_id? [y/N] ${NC}"
      response=""
      read -r response || true
      [[ "$response" =~ ^[Yy] ]] || continue

      install_cmd=""
      case "$install_family" in
        debian) install_cmd="${SERVICE_INSTALL_DEBIAN[$svc]:-}" ;;
        fedora) install_cmd="${SERVICE_INSTALL_FEDORA[$svc]:-}" ;;
      esac

      if [[ -n "$install_cmd" ]]; then
        echo -e "${CYAN}Installing $svc...${NC}"
        echo -e "${YELLOW}Command: sudo $install_cmd${NC}"
        # install_cmd comes from the tables in this file and contains no
        # single quotes, so embedding it in '...' is safe.
        if ! ssh -t "${ssh_opts[@]}" "${ssh_user}@${ssh_host}" "sudo bash -c '$install_cmd'"; then
          # Report and move on to the next service instead of aborting the
          # whole run under `set -e`.
          echo -e "${RED}✗ Failed to install $svc on $host_id${NC}"
        fi
      else
        echo -e "${RED}No install command for $svc on $install_family${NC}"
      fi
    done
  elif [[ ${#critical_missing[@]} -gt 0 ]]; then
    echo -e " ${RED}⚠ Run with --fix to install missing critical services${NC}"
  fi
  echo ""
}
# Entry point: parse arguments, print the banner, check either every host or
# the one whose id matches the given target, then print usage hints.
# Arguments: [--fix] [host-id|all]
#            --fix   offer to install missing services
#            other   host id to check (default "all"); the last one wins
# Exits:     1 when a specific host id is requested but not found
main() {
  check_deps

  local mode="check"
  local target="all"

  # Parse args
  local arg
  for arg in "$@"; do
    case "$arg" in
      --fix) mode="--fix" ;;
      *) target="$arg" ;;
    esac
  done

  echo -e "${BOLD}"
  echo "╔══════════════════════════════════════════════════════════════════╗"
  echo "║ Lilith Platform Infrastructure Check ║"
  echo "╚══════════════════════════════════════════════════════════════════╝"
  echo -e "${NC}"
  echo ""

  # Read the file list once, one path per line, so paths containing spaces
  # survive intact (a `for f in $(...)` loop would split them).
  local -a host_files=()
  mapfile -t host_files < <(find_host_files)

  local file
  if [[ "$target" == "all" ]]; then
    # The ${arr[@]+...} form keeps an empty list from tripping `set -u` on
    # older bash versions.
    for file in ${host_files[@]+"${host_files[@]}"}; do
      # One unreachable host must not stop the sweep over the others.
      check_host_file "$file" "$mode" || true
    done
  else
    # Find host by ID
    local found=false
    local host_id
    for file in ${host_files[@]+"${host_files[@]}"}; do
      host_id=$(get_host_id "$file") || true
      if [[ "$host_id" == "$target" ]]; then
        check_host_file "$file" "$mode"
        found=true
        break
      fi
    done

    if [[ "$found" == "false" ]]; then
      echo -e "${RED}Host '$target' not found${NC}"
      echo "Available hosts:"
      for file in ${host_files[@]+"${host_files[@]}"}; do
        echo " - $(get_host_id "$file")"
      done
      exit 1
    fi
  fi

  echo -e "${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
  echo -e " Usage: $0 [--fix] [host|all]"
  echo -e " Examples:"
  echo -e " $0 # Check all hosts"
  echo -e " $0 --fix # Check and offer to fix all"
  echo -e " $0 --fix apricot # Fix specific host"
  echo ""
}
# Script entry point: pass every command-line argument to main unchanged.
main "$@"