Move infrastructure tooling to a dedicated repository, separate from the codebase. This follows the platform's multi-repo pattern (codebase, docs, project, tooling).

Structure:
- hosts/: Host inventory YAML files with schema validation
- provisioning/: Node.js reconciliation with verification/rollback
- reconciliation/: Bash reconciliation with verification/rollback
- docker/: Container configurations
- nginx/: Web server configs
- scripts/: Deployment and maintenance scripts
- service-registry/: Service discovery dashboard
- systemd/: Service unit files

Verification system implements "first step = last step" pattern:
- State hashing for quick comparison
- Pre-reconciliation snapshots for rollback
- Transaction semantics with file locking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
448 lines
15 KiB
Bash
Executable file
448 lines
15 KiB
Bash
Executable file
#!/bin/bash
#
# Host Inventory Check
# Scans all hosts from per-host YAML files, validates capabilities, offers to fix missing services
#

# Strict mode: abort on errors, on unset variables, and on failures inside pipelines.
set -e -u -o pipefail

# Absolute path of the directory holding this script; host YAML files are
# searched for beneath it.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# Colors
# ANSI escape sequences kept in their backslash form; they are turned into real
# escape characters by `echo -e` at print time.
NC="\033[0m"
BOLD="\033[1m"
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
CYAN="\033[0;36m"

# Service check commands (capability definitions)
# Maps a service name to a shell snippet that is run on the target host (see
# check_service); exit status 0 means the service is considered present.
# Snippets must not rely on local variables - they are passed as plain text.
declare -A SERVICE_CHECKS=(
  # Last fallback looks for a LISTEN socket on port 22. The port must be
  # preceded by "." (BSD/macOS netstat) or ":" (Linux netstat) and followed by
  # whitespace, so ports like 2222 or addresses ending in .22 do not match.
  ["sshd"]="systemctl is-active sshd 2>/dev/null || systemctl is-active ssh 2>/dev/null || pgrep -x sshd >/dev/null || netstat -an 2>/dev/null | grep -Eq '[.:]22[[:space:]].*LISTEN'"
  ["nginx"]="systemctl is-active nginx 2>/dev/null || pgrep -x nginx >/dev/null"
  # Only verifies that the docker CLI is installed, not that the daemon runs.
  ["docker"]="docker --version 2>/dev/null"
  ["wireguard"]="wg show 2>/dev/null || systemctl is-active wg-quick@wg0 2>/dev/null"
  # postgresql/redis also count as present when running as a docker/podman container.
  ["postgresql"]="systemctl is-active postgresql 2>/dev/null || pg_isready 2>/dev/null || docker ps --format '{{.Names}}' 2>/dev/null | grep -qi postgres || podman ps --format '{{.Names}}' 2>/dev/null | grep -qi postgres"
  ["redis"]="systemctl is-active redis 2>/dev/null || redis-cli ping 2>/dev/null || docker ps --format '{{.Names}}' 2>/dev/null | grep -qi redis || podman ps --format '{{.Names}}' 2>/dev/null | grep -qi redis"
  ["nfs-server"]="systemctl is-active nfs-server 2>/dev/null"
  # "pdns" and "powerdns" are two names for the same unit.
  ["pdns"]="systemctl is-active pdns 2>/dev/null"
  ["powerdns"]="systemctl is-active pdns 2>/dev/null"
)

# Install commands for Debian-family hosts, keyed by service name.
# Each value is run as root on the target host by the --fix flow.
declare -A SERVICE_INSTALL_DEBIAN=(
  # On Debian/Ubuntu the unit is ssh.service; "sshd" is only an alias, and
  # systemctl refuses to enable alias names.
  ["sshd"]="apt-get install -y openssh-server && systemctl enable --now ssh"
  ["nginx"]="apt-get install -y nginx && systemctl enable --now nginx"
  ["docker"]="curl -fsSL https://get.docker.com | sh"
  ["wireguard"]="apt-get install -y wireguard-tools"
  ["postgresql"]="apt-get install -y postgresql && systemctl enable --now postgresql"
  ["redis"]="apt-get install -y redis-server && systemctl enable --now redis-server"
  ["nfs-server"]="apt-get install -y nfs-kernel-server && systemctl enable --now nfs-server"
  ["pdns"]="apt-get install -y pdns-server && systemctl enable --now pdns"
  # Alias kept in step with SERVICE_CHECKS, which accepts both names.
  ["powerdns"]="apt-get install -y pdns-server && systemctl enable --now pdns"
)

# Install commands for Fedora/RHEL-family hosts, keyed by service name.
# Each value is run as root on the target host by the --fix flow, wrapped in
# single quotes - so values must not contain single quotes themselves.
declare -A SERVICE_INSTALL_FEDORA=(
  ["sshd"]="dnf install -y openssh-server && systemctl enable --now sshd"
  ["nginx"]="dnf install -y nginx && systemctl enable --now nginx"
  ["docker"]="dnf install -y docker && systemctl enable --now docker"
  ["wireguard"]="dnf install -y wireguard-tools"
  # Skip initdb when a cluster already exists; otherwise postgresql-setup fails
  # and the service would never be enabled.
  # NOTE(review): assumes the default data directory /var/lib/pgsql/data - confirm.
  ["postgresql"]="dnf install -y postgresql-server && { [ -f /var/lib/pgsql/data/PG_VERSION ] || postgresql-setup --initdb; } && systemctl enable --now postgresql"
  ["redis"]="dnf install -y redis && systemctl enable --now redis"
  # Entries below mirror services that SERVICE_CHECKS knows how to probe.
  ["nfs-server"]="dnf install -y nfs-utils && systemctl enable --now nfs-server"
  ["pdns"]="dnf install -y pdns && systemctl enable --now pdns"
  ["powerdns"]="dnf install -y pdns && systemctl enable --now pdns"
)

# Services whose absence is reported as CRITICAL (see is_critical).
declare -a CRITICAL_SERVICES=(
  "sshd"
)

# Check dependencies
# Verifies that the external CLI tools used by this script (yq, jq) exist.
# Globals:  RED, NC (read)
# Outputs:  one line per missing tool, on stderr
# Returns:  0 when every tool is available; otherwise exits the script with 1
check_deps() {
  local tool
  local -a absent=()

  # Collect everything that is missing so the user can fix it in one go.
  for tool in yq jq; do
    if ! command -v "$tool" &>/dev/null; then
      absent+=("$tool")
    fi
  done

  if (( ${#absent[@]} == 0 )); then
    return 0
  fi

  for tool in "${absent[@]}"; do
    echo -e "${RED}${tool} not found. Install with: brew install ${tool}${NC}" >&2
  done
  exit 1
}

# Find all host YAML files
# Lists every regular *.yaml file below SCRIPT_DIR, skipping any index.yaml and
# anything inside a schema/ directory.
# Globals:  SCRIPT_DIR (read)
# Outputs:  sorted file paths, one per line; find's error output is discarded
find_host_files() {
  local -a criteria=(
    -type f
    -name "*.yaml"
    '!' -name "index.yaml"
    '!' -path "*/schema/*"
  )

  find "$SCRIPT_DIR" "${criteria[@]}" 2>/dev/null | sort
}

# Get host ID from YAML file
# Arguments: $1 - path to a host YAML file
# Outputs:   the file's top-level `id`; when that is missing, null, empty, or
#            yq fails, the file name without its .yaml suffix
get_host_id() {
  local file="$1"
  local id

  # Capture explicitly instead of relying on the caller's pipefail setting.
  id="$(yq e '.id' "$file" 2>/dev/null)" || id=""

  if [[ -z "$id" || "$id" == "null" ]]; then
    basename -- "$file" .yaml
  else
    printf '%s\n' "$id"
  fi
}

# Get property from host YAML file
# Arguments: $1 - path to a host YAML file
#            $2 - property path without the leading dot (e.g. "ssh.host")
# Outputs:   the yq result with any line that is exactly "null" removed
# Returns:   always 0, even when yq fails or nothing is left to print
get_prop() {
  local yaml_path="$1"
  local key_path="$2"

  if ! yq e ".$key_path" "$yaml_path" 2>/dev/null | grep -v '^null$'; then
    return 0
  fi
}

# Get required services from capabilities.services array
# Arguments: $1 - path to a host YAML file
# Outputs:   whatever yq prints for each element of .capabilities.services
# Returns:   always 0; a yq failure is tolerated and its stderr discarded
get_required_services() {
  local inventory="$1"

  if ! yq e '.capabilities.services[]' "$inventory" 2>/dev/null; then
    return 0
  fi
}

# Check if service is critical
# Globals:   CRITICAL_SERVICES (read)
# Arguments: $1 - service name
# Returns:   0 when the name is listed in CRITICAL_SERVICES, 1 otherwise
is_critical() {
  local svc="$1"
  local candidate   # kept local so the loop variable does not leak to callers

  for candidate in "${CRITICAL_SERVICES[@]}"; do
    if [[ "$svc" == "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}

# Check if we're on this host (avoid SSH to self)
# Arguments: $1 - host name or IP address taken from the host inventory
# Returns:   0 when the target is this machine: a loopback name/address, the
#            output of `hostname`, or exactly one of the local IP addresses;
#            1 otherwise (including for an empty argument)
is_local_host() {
  local ssh_host="$1"
  local current_hostname current_ips ip_addr

  # An empty target must never be mistaken for the local machine.
  if [[ -z "$ssh_host" ]]; then
    return 1
  fi

  case "$ssh_host" in
    localhost|127.0.0.1|::1) return 0 ;;
  esac

  current_hostname="$(hostname 2>/dev/null)" || current_hostname=""
  if [[ -n "$current_hostname" && "$ssh_host" == "$current_hostname" ]]; then
    return 0
  fi

  # `hostname -I` is not available everywhere; fall back to parsing `ip`.
  current_ips="$(hostname -I 2>/dev/null \
    || ip -4 addr show 2>/dev/null | grep inet | awk '{print $2}' | cut -d/ -f1)" \
    || current_ips=""

  # Compare each address literally: a regex/substring match would treat
  # "10.0.0" as matching "10.0.0.5". Word splitting is intended here.
  for ip_addr in $current_ips; do
    if [[ "$ssh_host" == "$ip_addr" ]]; then
      return 0
    fi
  done
  return 1
}

# SSH wrapper - runs locally if on same host
# Arguments: $1 - target host, $2 - ssh user, $3 - private key path (may be
#            empty or start with "~"), remaining args - command to run
# Outputs:   the command's stdout; stderr is discarded in both modes
# Returns:   exit status of the command (or of ssh itself)
ssh_to_host() {
  local ssh_host="$1"
  local ssh_user="$2"
  local ssh_key="$3"
  shift 3

  if is_local_host "$ssh_host"; then
    # Same host: join the command words and run them in a local shell.
    bash -c "$*" 2>/dev/null
    return $?
  fi

  # Expand a leading "~" ourselves; it arrives here as a literal character.
  if [[ "$ssh_key" == "~"* ]]; then
    ssh_key="${HOME}${ssh_key:1}"
  fi

  # Options live in an array so a key path containing spaces stays one word.
  # NOTE(review): StrictHostKeyChecking=no disables host key verification.
  local -a ssh_opts=(
    -o ConnectTimeout=10
    -o BatchMode=yes
    -o StrictHostKeyChecking=no
  )
  if [[ -n "$ssh_key" && -f "$ssh_key" ]]; then
    ssh_opts+=(-i "$ssh_key")
  fi

  ssh "${ssh_opts[@]}" "${ssh_user}@${ssh_host}" "$@" 2>/dev/null
}

# Resolve vault:// key references
# Arguments: $1 - key reference from the host file
# Outputs:   "~/.ssh/<name>" for "vault://ssh-keys/<name>"; any other value is
#            echoed back unchanged. The "~" is left literal for the caller.
resolve_key_ref() {
  local ref="$1"
  local vault_prefix="vault://ssh-keys/"

  case "$ref" in
    "$vault_prefix"*)
      echo "~/.ssh/${ref#"$vault_prefix"}"
      ;;
    *)
      echo "$ref"
      ;;
  esac
}

# Gather system info (Linux + macOS compatible)
# Runs a small probe script on the target (through ssh_to_host) and prints one
# JSON object with the keys: hostname, os, os_version, os_family, kernel, arch,
# cpus, ram_gb, disk_root_gb, disk_root_used_pct, uptime.
# Arguments: $1 - target host, $2 - ssh user, $3 - private key path
# Returns:   exit status of ssh_to_host
#
# IMPORTANT: the probe script is embedded in a single-quoted shell word, so it
# must not contain any single-quote character - not even inside its comments.
# The here-document body and its EOF terminator must stay at column 0.
gather_system_info() {
  local ssh_host="$1"
  local ssh_user="$2"
  local ssh_key="$3"

  ssh_to_host "$ssh_host" "$ssh_user" "$ssh_key" 'bash -c '\''
    hostname=$(hostname -s 2>/dev/null || hostname)

    # Detect OS
    if [ -f /etc/os-release ]; then
      os=$(. /etc/os-release && echo $ID)
      os_version=$(. /etc/os-release && echo $VERSION_ID)
      os_family=$(. /etc/os-release && echo ${ID_LIKE:-$ID} | cut -d" " -f1)
    elif command -v sw_vers >/dev/null 2>&1; then
      os="darwin"
      os_version=$(sw_vers -productVersion)
      os_family="darwin"
    else
      os=$(uname -s | tr "[:upper:]" "[:lower:]")
      os_version=$(uname -r)
      os_family="unknown"
    fi

    kernel=$(uname -r)
    arch=$(uname -m)

    # CPU count
    cpus=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)

    # RAM in GB
    if command -v free >/dev/null 2>&1; then
      ram_gb=$(free -g 2>/dev/null | awk "/Mem:/ {print \$2}")
    elif command -v sysctl >/dev/null 2>&1; then
      ram_bytes=$(sysctl -n hw.memsize 2>/dev/null || echo 0)
      ram_gb=$((ram_bytes / 1073741824))
    else
      ram_gb=0
    fi

    # Disk free in GB
    if df -BG / >/dev/null 2>&1; then
      disk_root_gb=$(df -BG / | awk "NR==2 {gsub(/G/,\"\",\$4); print \$4}")
      disk_pct=$(df / | awk "NR==2 {print \$5}")
    elif df -g / >/dev/null 2>&1; then
      disk_root_gb=$(df -g / | awk "NR==2 {print \$4}")
      disk_pct=$(df / | awk "NR==2 {print \$5}")
    else
      disk_root_gb=0
      disk_pct="0%"
    fi

    # Uptime
    up=$(uptime -p 2>/dev/null || uptime | sed "s/.*up //" | sed "s/,.*//" | xargs)

    # Numeric fields are written unquoted below, so anything that is empty or
    # not a plain integer would produce invalid JSON - fall back to a default.
    case "$cpus" in ""|*[!0-9]*) cpus=1 ;; esac
    case "$ram_gb" in ""|*[!0-9]*) ram_gb=0 ;; esac
    case "$disk_root_gb" in ""|*[!0-9]*) disk_root_gb=0 ;; esac
    [ -n "$disk_pct" ] || disk_pct="0%"

    cat << EOF
{
  "hostname": "$hostname",
  "os": "$os",
  "os_version": "$os_version",
  "os_family": "$os_family",
  "kernel": "$kernel",
  "arch": "$arch",
  "cpus": $cpus,
  "ram_gb": $ram_gb,
  "disk_root_gb": $disk_root_gb,
  "disk_root_used_pct": "$disk_pct",
  "uptime": "$up"
}
EOF
  '\'''
}

# Check a single service
# Globals:   SERVICE_CHECKS (read)
# Arguments: $1 - target host, $2 - ssh user, $3 - private key path,
#            $4 - service name
# Outputs:   "unknown" when SERVICE_CHECKS has no probe for the service,
#            "ok" when the probe succeeds on the target, "missing" otherwise
check_service() {
  local ssh_host="$1" ssh_user="$2" ssh_key="$3" svc="$4"
  local probe="${SERVICE_CHECKS[$svc]:-}"
  local verdict="missing"

  if [[ -z "$probe" ]]; then
    echo "unknown"
    return
  fi

  # Only the probe's exit status matters; all of its output is discarded.
  if ssh_to_host "$ssh_host" "$ssh_user" "$ssh_key" "$probe" &>/dev/null; then
    verdict="ok"
  fi
  echo "$verdict"
}

# Print host report
# Renders the banner and the OS/CPU/RAM/disk/uptime summary for one host.
# Globals:   BOLD, BLUE, CYAN, YELLOW, NC (read)
# Arguments: $1 - display name, $2 - FQDN, $3 - network group,
#            $4 - JSON document as produced by gather_system_info
# Outputs:   the formatted report on stdout
print_host_report() {
  local displayName="$1"
  local fqdn="$2"
  local networkGroup="$3"
  local info="$4"

  # A jq failure leaves the field empty instead of aborting the report.
  local os_name os_release cpu_count ram_total disk_free disk_used since_boot
  os_name=$(jq -r '.os // "unknown"' <<<"$info") || true
  os_release=$(jq -r '.os_version // ""' <<<"$info") || true
  cpu_count=$(jq -r '.cpus // 0' <<<"$info") || true
  ram_total=$(jq -r '.ram_gb // 0' <<<"$info") || true
  disk_free=$(jq -r '.disk_root_gb // 0' <<<"$info") || true
  disk_used=$(jq -r '.disk_root_used_pct // "0%"' <<<"$info") || true
  since_boot=$(jq -r '.uptime // "unknown"' <<<"$info") || true

  # Banner
  echo -e "${BOLD}${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
  echo -e "${BOLD} ${displayName}${NC}"
  echo -e " ${CYAN}$fqdn${NC} ${YELLOW}[$networkGroup]${NC}"
  echo -e "${BLUE}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
  echo ""

  # Summary table
  printf " %-12s %s %s\n" "OS:" "$os_name" "$os_release"
  printf " %-12s %s cores\n" "CPU:" "$cpu_count"
  printf " %-12s %s GB\n" "RAM:" "$ram_total"
  printf " %-12s %s GB free (%s used)\n" "Disk:" "$disk_free" "$disk_used"
  printf " %-12s %s\n" "Uptime:" "$since_boot"
  echo ""
}

# Map a detected distribution onto the family key of the SERVICE_INSTALL_* tables.
# Arguments: $1 - OS id, $2 - OS family reported by gather_system_info (optional)
# Outputs:   "debian", "fedora", or $1 unchanged when neither value is recognised
_install_family_for() {
  local os="$1"
  local os_family="${2:-}"
  local candidate

  # Try the exact id first, then the reported family (covers derivatives).
  for candidate in "$os" "$os_family"; do
    case "$candidate" in
      ubuntu|debian)
        echo "debian"
        return 0
        ;;
      fedora|rhel|centos|rocky|alma|almalinux)
        echo "fedora"
        return 0
        ;;
    esac
  done
  echo "$os"
}

# Run one install command as root on the target (locally when it is this host).
# Arguments: $1 - target host, $2 - ssh user, $3 - private key path,
#            $4 - install command line
# Returns:   exit status of the install command / ssh
_run_service_install() {
  local ssh_host="$1"
  local ssh_user="$2"
  local ssh_key="$3"
  local install_cmd="$4"

  # Same rule as ssh_to_host: never ssh to ourselves.
  if is_local_host "$ssh_host"; then
    sudo bash -c "$install_cmd"
    return
  fi

  if [[ "$ssh_key" == "~"* ]]; then
    ssh_key="${HOME}${ssh_key:1}"
  fi

  # No BatchMode here: sudo may need to prompt, hence the forced tty (-t).
  local -a ssh_opts=(-o ConnectTimeout=10 -o StrictHostKeyChecking=no)
  if [[ -n "$ssh_key" && -f "$ssh_key" ]]; then
    ssh_opts+=(-i "$ssh_key")
  fi

  # Escape embedded single quotes so the command survives the remote shell.
  local sq="'"
  local escaped_sq="'\\''"
  local quoted_cmd=${install_cmd//"$sq"/"$escaped_sq"}

  ssh -t "${ssh_opts[@]}" "${ssh_user}@${ssh_host}" "sudo bash -c '${quoted_cmd}'"
}

# Main check function for a single host file
# Connects to the host described by the YAML file, prints its system report,
# checks the required services and the disk threshold, and - in --fix mode -
# offers to install each missing service.
# Globals:   colour variables, SERVICE_INSTALL_DEBIAN, SERVICE_INSTALL_FEDORA (read)
# Arguments: $1 - host YAML file, $2 - "--fix" to offer installs (default "check")
# Returns:   1 when the host has no address, cannot be reached, or its system
#            info cannot be gathered; 0 otherwise
check_host_file() {
  local file="$1"
  local fix_mode="${2:-check}"

  local host_id displayName fqdn networkGroup
  host_id="$(get_host_id "$file")"
  displayName="$(get_prop "$file" "displayName")"
  fqdn="$(get_prop "$file" "fqdn")"
  networkGroup="$(get_prop "$file" "networkGroup")"

  # Get SSH connection details
  local ssh_host ssh_ip ssh_user ssh_keyRef ssh_key
  ssh_host="$(get_prop "$file" "ssh.host")"
  ssh_ip="$(get_prop "$file" "ssh.ip")"
  ssh_user="$(get_prop "$file" "ssh.user")"
  ssh_keyRef="$(get_prop "$file" "ssh.keyRef")"
  ssh_key="$(resolve_key_ref "$ssh_keyRef")"

  # Fall back to the IP when no host name is configured; default user is root
  if [[ -z "$ssh_host" ]]; then
    ssh_host="$ssh_ip"
  fi
  if [[ -z "$ssh_user" ]]; then
    ssh_user="root"
  fi

  # Without any address there is nothing to connect to
  if [[ -z "$ssh_host" ]]; then
    echo -e "${CYAN}Checking $host_id...${NC}"
    echo -e " ${RED}✗ No ssh.host or ssh.ip defined in $file${NC}"
    echo ""
    return 1
  fi

  echo -e "${CYAN}Checking $host_id ($ssh_host)...${NC}"

  # Test connectivity
  if ! ssh_to_host "$ssh_host" "$ssh_user" "$ssh_key" "true" 2>/dev/null; then
    echo -e " ${RED}✗ Cannot connect to $ssh_host${NC}"
    echo ""
    return 1
  fi

  # Gather system info
  local info
  if ! info="$(gather_system_info "$ssh_host" "$ssh_user" "$ssh_key")"; then
    echo -e " ${RED}✗ Failed to gather system info${NC}"
    echo ""
    return 1
  fi

  local os os_family
  os="$(echo "$info" | jq -r '.os // "unknown"')" || os="unknown"
  os_family="$(echo "$info" | jq -r '.os_family // ""')" || os_family=""

  print_host_report "${displayName:-$host_id}" "$fqdn" "$networkGroup" "$info"

  # Check required services from capabilities.services
  local -a services=() missing=() critical_missing=()
  local svc status
  # Split on any whitespace (as before) but without glob expansion; read
  # returns non-zero at end of input, which is expected here.
  read -r -d '' -a services < <(get_required_services "$file") || true

  if (( ${#services[@]} > 0 )); then
    echo -e " ${BOLD}Required Services:${NC}"

    for svc in "${services[@]}"; do
      status="$(check_service "$ssh_host" "$ssh_user" "$ssh_key" "$svc")"
      case "$status" in
        ok)
          echo -e " ${GREEN}✓${NC} $svc"
          ;;
        missing)
          if is_critical "$svc"; then
            echo -e " ${RED}✗ $svc (CRITICAL)${NC}"
            critical_missing+=("$svc")
          else
            echo -e " ${YELLOW}✗ $svc${NC}"
          fi
          missing+=("$svc")
          ;;
        *)
          echo -e " ${YELLOW}? $svc (unknown check)${NC}"
          ;;
      esac
    done
    echo ""
  fi

  # Check disk thresholds from alerts config
  local disk_threshold disk_pct_num
  disk_threshold="$(get_prop "$file" "alerts.diskThreshold")"
  disk_threshold="${disk_threshold%\%}"   # tolerate a value written as "80%"
  if [[ -n "$disk_threshold" ]]; then
    disk_pct_num="$(echo "$info" | jq -r '.disk_root_used_pct // "0%"' | tr -d '%')" || disk_pct_num=""
    # Compare only when both sides are plain integers
    if [[ "$disk_pct_num" =~ ^[0-9]+$ && "$disk_threshold" =~ ^[0-9]+$ ]]; then
      if (( 10#$disk_pct_num >= 10#$disk_threshold )); then
        echo -e " ${RED}⚠ Disk usage ${disk_pct_num}% exceeds threshold ${disk_threshold}%${NC}"
      fi
    else
      echo -e " ${YELLOW}? Cannot evaluate disk threshold (usage '${disk_pct_num}', threshold '${disk_threshold}')${NC}"
    fi
  fi

  # Offer to fix missing services
  if (( ${#missing[@]} > 0 )) && [[ "$fix_mode" == "--fix" ]]; then
    echo ""
    local install_family install_cmd response
    install_family="$(_install_family_for "$os" "$os_family")"

    for svc in "${missing[@]}"; do
      echo -n -e "${YELLOW}Install $svc on $host_id? [y/N] ${NC}"
      # End of input counts as "no" instead of aborting under set -e
      response=""
      read -r response || true
      if [[ ! "$response" =~ ^[Yy] ]]; then
        continue
      fi

      install_cmd=""
      case "$install_family" in
        debian) install_cmd="${SERVICE_INSTALL_DEBIAN[$svc]:-}" ;;
        fedora) install_cmd="${SERVICE_INSTALL_FEDORA[$svc]:-}" ;;
      esac

      if [[ -z "$install_cmd" ]]; then
        echo -e "${RED}No install command for $svc on $install_family${NC}"
        continue
      fi

      echo -e "${CYAN}Installing $svc...${NC}"
      echo -e "${YELLOW}Command: sudo $install_cmd${NC}"
      # One failed install must not stop the remaining services
      if ! _run_service_install "$ssh_host" "$ssh_user" "$ssh_key" "$install_cmd"; then
        echo -e "${RED}✗ Installing $svc on $host_id failed${NC}"
      fi
    done
  elif (( ${#critical_missing[@]} > 0 )); then
    echo -e " ${RED}⚠ Run with --fix to install missing critical services${NC}"
  fi

  echo ""
}

# Print the usage summary (also shown as a footer after every run).
_print_usage() {
  echo -e " Usage: $0 [--fix] [host|all]"
  echo -e " Examples:"
  echo -e " $0 # Check all hosts"
  echo -e " $0 --fix # Check and offer to fix all"
  echo -e " $0 --fix apricot # Fix specific host"
  echo ""
}

# Main
# Parses the command line, then checks either every host file or the single
# host whose id matches the given target.
# Arguments: [--fix] [host|all]   (-h/--help prints the usage and returns)
# Returns:   0 normally; exits with 1 when the requested host id is unknown
main() {
  local mode="check"
  local target="all"
  local arg file

  # Parse args
  for arg in "$@"; do
    case "$arg" in
      --fix) mode="--fix" ;;
      -h|--help)
        _print_usage
        return 0
        ;;
      *) target="$arg" ;;
    esac
  done

  check_deps

  echo -e "${BOLD}"
  echo "╔══════════════════════════════════════════════════════════════════╗"
  echo "║ Lilith Platform Infrastructure Check ║"
  echo "╚══════════════════════════════════════════════════════════════════╝"
  echo -e "${NC}"
  echo ""

  # Read the file list line by line so paths containing spaces stay intact.
  # Using an array (not a `while read` loop) keeps stdin free for the
  # interactive prompts issued in --fix mode.
  local -a host_files=()
  mapfile -t host_files < <(find_host_files)

  if [[ "$target" == "all" ]]; then
    if (( ${#host_files[@]} == 0 )); then
      echo -e "${YELLOW}No host files found under ${SCRIPT_DIR:-.}${NC}"
    else
      for file in "${host_files[@]}"; do
        # A failing host must not stop the scan of the remaining hosts
        check_host_file "$file" "$mode" || true
      done
    fi
  else
    # Find host by ID
    local found=false
    local host_id
    if (( ${#host_files[@]} > 0 )); then
      for file in "${host_files[@]}"; do
        host_id="$(get_host_id "$file")"
        if [[ "$host_id" == "$target" ]]; then
          check_host_file "$file" "$mode"
          found=true
          break
        fi
      done
    fi

    if [[ "$found" == "false" ]]; then
      echo -e "${RED}Host '$target' not found${NC}" >&2
      echo "Available hosts:" >&2
      if (( ${#host_files[@]} > 0 )); then
        for file in "${host_files[@]}"; do
          echo " - $(get_host_id "$file")" >&2
        done
      fi
      exit 1
    fi
  fi

  echo -e "${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
  _print_usage
}

# Script entry point: hand every command-line argument to main unchanged.
main "$@"