platform-tooling/scripts/dev-setup/setup-gpu-protection.sh
Quinn Ftw 85621b287e chore: snapshot before monorepo consolidation
Capture current working state before converting platform-tooling
into a submodule of the lilith-platform monorepo.
2026-01-29 07:04:39 -08:00

376 lines
11 KiB
Bash
Executable file

#!/bin/bash
#
# Lilith Platform - GPU Protection Setup
#
# System-level enforcement for ML workstations. Prevents GPU OOM from freezing
# the system by ensuring the OOM killer can act before total resource exhaustion.
#
# This is NOT a fallback for model-boss. This enforces outer bounds that
# application code cannot bypass (memory leaks, bugs exceeding lease bounds).
#
# Usage:
# ./setup-gpu-protection.sh # Full setup (requires sudo)
# ./setup-gpu-protection.sh --check # Verify current protection status
# ./setup-gpu-protection.sh --env # Install environment vars only
# ./setup-gpu-protection.sh --sysctl # Install sysctl settings only
# ./setup-gpu-protection.sh --limits # Install ulimits only
# ./setup-gpu-protection.sh --nvidia # Enable NVIDIA persistence only
# ./setup-gpu-protection.sh --help # Show help (also -h)
#
# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Colors
# ANSI escape sequences, stored as literal backslash sequences; they only turn
# into terminal codes when printed through `echo -e` by the log helpers.
# Declared readonly so later code cannot overwrite them by accident.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m' # reset / "no color"
# Print an informational message ($1) behind a [INFO] tag colored with BLUE.
# echo -e interprets the color escapes (and any escapes contained in $1).
log_info() {
  local message=$1
  echo -e "${BLUE}[INFO]${NC} ${message}"
}
# Print a success message ($1) behind a [OK] tag colored with GREEN.
# echo -e interprets the color escapes (and any escapes contained in $1).
log_success() {
  local message=$1
  echo -e "${GREEN}[OK]${NC} ${message}"
}
# Print a warning message ($1) behind a [WARN] tag colored with YELLOW.
# echo -e interprets the color escapes (and any escapes contained in $1).
log_warn() {
  local message=$1
  echo -e "${YELLOW}[WARN]${NC} ${message}"
}
# Print an error message ($1) behind a [ERROR] tag colored with RED.
# Output goes to stdout, like the other log helpers.
log_error() {
  local message=$1
  echo -e "${RED}[ERROR]${NC} ${message}"
}
# Print a section title ($1) in CYAN, framed by "═══" markers and surrounded
# by one blank line above and one below.
log_header() {
  local title=$1
  echo -e "\n${CYAN}═══ ${title} ═══${NC}\n"
}
# Print the boxed script banner in CYAN and reset the color afterwards.
show_banner() {
  local -a banner_rows=(
    "╔══════════════════════════════════════════════════════════════╗"
    "║ Lilith Platform - GPU Protection Setup ║"
    "║ Fail-Fast OOM Enforcement for ML Workstations ║"
    "╚══════════════════════════════════════════════════════════════╝"
  )
  local row

  # Switch color on (emits its own line), print the box, then switch it off.
  echo -e "${CYAN}"
  for row in "${banner_rows[@]}"; do
    echo "$row"
  done
  echo -e "${NC}"
}
# Ensure the script runs with root privileges.
# Arguments: the script's original arguments; they are only echoed back in the
#            suggested sudo command line.
# Returns:   0 when the effective UID is 0; otherwise prints an error plus a
#            hint and exits the whole script with status 1.
check_root() {
  if (( EUID == 0 )); then
    return 0
  fi

  log_error "This script requires root privileges"
  echo " Run: sudo $0 $*"
  exit 1
}
# Get the actual user (not root when running with sudo).
#
# Resolution order:
#   1. SUDO_USER, when set and non-empty
#   2. USER, when set and non-empty
#   3. the output of `id -un`
# The third step matters because the script runs under `set -u`: referencing
# an unset USER (common in containers, cron and service environments) would
# otherwise abort with an "unbound variable" error.
#
# Outputs: the user name on stdout.
get_real_user() {
  local name="${SUDO_USER:-${USER:-}}"
  if [[ -z "$name" ]]; then
    name=$(id -un)
  fi
  printf '%s\n' "$name"
}
# Install the PyTorch CUDA allocator settings as a profile.d snippet.
# Writes /etc/profile.d/cuda-protection.sh (mode 644), which exports
# PYTORCH_CUDA_ALLOC_CONF, then tells the user how to pick it up.
install_cuda_env() {
  log_header "PyTorch CUDA Environment"

  local profile_script="/etc/profile.d/cuda-protection.sh"

  # Each argument becomes one literal line of the generated file.
  printf '%s\n' \
    '# Lilith Platform - GPU Protection Environment' \
    '# Prevents CUDA memory fragmentation and enables early garbage collection' \
    '# Generated by setup-gpu-protection.sh' \
    'export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True,garbage_collection_threshold:0.8"' \
    > "$profile_script"
  chmod 644 "$profile_script"

  log_success "Created ${profile_script}"
  log_info "Settings will apply to new shell sessions"
  log_info "Run 'source ${profile_script}' to apply to current session"
}
# Install kernel OOM settings
#
# Writes /etc/sysctl.d/99-gpu-protection.conf (mode 644) and loads it into the
# running kernel. The generated file keeps the same keys and values as before;
# only the comment on vm.overcommit_ratio was corrected, because the previous
# text claimed the ratio limits allocations while vm.overcommit_memory = 0
# (heuristic mode) is selected, in which the kernel does not consult it.
install_sysctl() {
  log_header "Kernel OOM Settings"

  local sysctl_file="/etc/sysctl.d/99-gpu-protection.conf"

  cat > "$sysctl_file" << 'EOF'
# Lilith Platform - GPU Protection Kernel Settings
# Ensures OOM killer can act before system freeze
# Generated by setup-gpu-protection.sh
# Don't panic on OOM - let OOM killer handle it
vm.panic_on_oom = 0
# Kill the allocating task first (fail-fast)
vm.oom_kill_allocating_task = 1
# Heuristic overcommit (kernel decides based on available memory)
vm.overcommit_memory = 0
# Overcommit ratio: only consulted when vm.overcommit_memory = 2 (strict mode).
# It has no effect under the heuristic mode selected above; kept so the value
# is already in place if strict accounting is enabled later.
vm.overcommit_ratio = 97
EOF
  chmod 644 "$sysctl_file"

  # Apply immediately: reload every sysctl config; if that reports a failure,
  # fall back to loading just the file written above.
  sysctl --system > /dev/null 2>&1 || sysctl -p "$sysctl_file"

  log_success "Created $sysctl_file"
  log_success "Settings applied to running kernel"
}
# Install per-user resource limits for the invoking (non-root) user.
# Writes /etc/security/limits.d/99-ml-user.conf (mode 644) with memlock,
# nofile and core entries for the user reported by get_real_user.
install_limits() {
  log_header "User Resource Limits"

  local limits_path="/etc/security/limits.d/99-ml-user.conf"
  local ml_user
  ml_user=$(get_real_user)

  # Unquoted heredoc delimiter: ${ml_user} is expanded into every entry.
  cat > "$limits_path" << EOF
# Lilith Platform - ML User Resource Limits
# Allows CUDA pinned memory and sufficient file handles
# Generated by setup-gpu-protection.sh
# Unlimited locked memory for CUDA pinned allocations
${ml_user} soft memlock unlimited
${ml_user} hard memlock unlimited
# Sufficient file handles for GPU operations
${ml_user} soft nofile 65536
${ml_user} hard nofile 65536
# Core dumps enabled for debugging GPU crashes
${ml_user} soft core unlimited
${ml_user} hard core unlimited
EOF
  chmod 644 "$limits_path"

  log_success "Created ${limits_path} for user: ${ml_user}"
  log_info "Limits will apply to new login sessions"
}
# Enable NVIDIA persistence mode
#
# Turns persistence mode on for all GPUs via `nvidia-smi -pm 1` and installs a
# oneshot systemd unit (nvidia-persistence.service) that repeats this at boot.
#
# Every step is checked explicitly instead of relying on `set -e`: full_setup
# calls this function on the left side of `||`, a context in which errexit is
# ignored for the whole function body, so an unchecked failure would otherwise
# be followed by the success messages.
#
# Returns: 0 on success; 1 if nvidia-smi is missing, no GPU is detected,
#          persistence mode cannot be enabled, or the unit cannot be written,
#          loaded or enabled.
enable_nvidia_persistence() {
  log_header "NVIDIA Persistence Mode"

  if ! command -v nvidia-smi &> /dev/null; then
    log_error "nvidia-smi not found - NVIDIA drivers not installed"
    return 1
  fi

  # Check if any GPU is present
  if ! nvidia-smi -L &> /dev/null; then
    log_error "No NVIDIA GPU detected"
    return 1
  fi

  # Enable persistence mode on all GPUs
  if nvidia-smi -pm 1 &> /dev/null; then
    log_success "Enabled persistence mode on all GPUs"
  else
    log_error "Failed to enable persistence mode"
    return 1
  fi

  # Create systemd service for persistence on boot
  local service_file="/etc/systemd/system/nvidia-persistence.service"

  if ! cat > "$service_file" << 'EOF'
[Unit]
Description=NVIDIA Persistence Daemon
After=multi-user.target
[Service]
Type=oneshot
ExecStart=/usr/bin/nvidia-smi -pm 1
RemainAfterExit=yes
[Install]
WantedBy=multi-user.target
EOF
  then
    log_error "Failed to write $service_file"
    return 1
  fi

  if ! chmod 644 "$service_file"; then
    log_error "Failed to set permissions on $service_file"
    return 1
  fi

  if ! systemctl daemon-reload; then
    log_error "systemctl daemon-reload failed"
    return 1
  fi

  if ! systemctl enable nvidia-persistence.service > /dev/null 2>&1; then
    log_error "Failed to enable nvidia-persistence.service"
    return 1
  fi

  log_success "Created $service_file"
  log_success "Enabled nvidia-persistence.service on boot"
}
# Verify that Redis (used by model-boss for GPU coordination) is reachable.
# Returns: 0 when `redis-cli ping` succeeds; 1 when redis-cli is missing or
#          the ping fails. A warning and a hint are logged in both failure cases.
check_redis() {
  log_header "Model-Boss Redis Check"

  if ! command -v redis-cli &> /dev/null; then
    log_warn "redis-cli not found"
    log_info "Install Redis for model-boss GPU coordination"
    return 1
  fi

  if ! redis-cli ping &> /dev/null; then
    log_warn "Redis is installed but not responding"
    log_info "Model-boss GPU coordination requires Redis"
    return 1
  fi

  log_success "Redis is running and responding"
  return 0
}
# Check current protection status
#
# Read-only report: inspects the files written by the install_* functions, the
# live sysctl values, NVIDIA persistence mode and Redis, logging one line per
# finding and counting problems in `issues`.
#
# The nvidia-smi queries are guarded explicitly. The script runs under
# `set -euo pipefail`, so a failing nvidia-smi (or one terminated early because
# `head` closed the pipe) would otherwise abort the whole check before the
# summary is printed.
#
# Returns: 0 when no issues were counted, 1 otherwise.
check_status() {
  show_banner
  log_header "GPU Protection Status"

  local issues=0

  # Check CUDA environment
  if [[ -f /etc/profile.d/cuda-protection.sh ]]; then
    log_success "CUDA environment: /etc/profile.d/cuda-protection.sh exists"
    if [[ -n "${PYTORCH_CUDA_ALLOC_CONF:-}" ]]; then
      log_success "PYTORCH_CUDA_ALLOC_CONF is set: $PYTORCH_CUDA_ALLOC_CONF"
    else
      log_warn "PYTORCH_CUDA_ALLOC_CONF not in current session (source the file or relogin)"
    fi
  else
    log_error "CUDA environment: not installed"
    issues=$((issues + 1))
  fi

  # Check sysctl settings
  if [[ -f /etc/sysctl.d/99-gpu-protection.conf ]]; then
    log_success "Sysctl config: /etc/sysctl.d/99-gpu-protection.conf exists"

    local oom_kill
    oom_kill=$(sysctl -n vm.oom_kill_allocating_task 2>/dev/null || echo "unknown")
    if [[ "$oom_kill" == "1" ]]; then
      log_success "vm.oom_kill_allocating_task = 1 (fail-fast enabled)"
    else
      log_warn "vm.oom_kill_allocating_task = $oom_kill (expected 1)"
    fi

    local overcommit_ratio
    overcommit_ratio=$(sysctl -n vm.overcommit_ratio 2>/dev/null || echo "unknown")
    log_info "vm.overcommit_ratio = $overcommit_ratio%"
  else
    log_error "Sysctl config: not installed"
    issues=$((issues + 1))
  fi

  # Check limits
  if [[ -f /etc/security/limits.d/99-ml-user.conf ]]; then
    log_success "User limits: /etc/security/limits.d/99-ml-user.conf exists"
  else
    log_error "User limits: not installed"
    issues=$((issues + 1))
  fi

  # Check NVIDIA persistence
  if command -v nvidia-smi &> /dev/null; then
    local pm_status
    # `|| true`: a failed query must not trip errexit/pipefail; the empty
    # result is reported as "unknown" below.
    pm_status=$(nvidia-smi --query-gpu=persistence_mode --format=csv,noheader 2>/dev/null | head -1) || true
    pm_status=${pm_status:-unknown}

    if [[ "$pm_status" == "Enabled" ]]; then
      log_success "NVIDIA persistence mode: Enabled"
    else
      log_warn "NVIDIA persistence mode: $pm_status"
      issues=$((issues + 1))
    fi

    # Show GPU info
    log_info "GPU(s) detected:"
    local gpu_rows gpu_row
    gpu_rows=$(nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader 2>/dev/null) || gpu_rows=""
    if [[ -n "$gpu_rows" ]]; then
      while read -r gpu_row; do
        echo " $gpu_row"
      done <<< "$gpu_rows"
    fi
  else
    log_warn "NVIDIA drivers not installed"
  fi

  # Check Redis
  check_redis || issues=$((issues + 1))

  echo ""
  if [[ $issues -eq 0 ]]; then
    log_success "All GPU protection checks passed"
    return 0
  else
    log_error "$issues issue(s) found"
    echo ""
    echo "Run './setup-gpu-protection.sh' to fix"
    return 1
  fi
}
# Run every installation step in order, then print follow-up instructions.
# Arguments: the script's original arguments, forwarded to check_root.
# Requires root; check_root exits the script otherwise.
full_setup() {
  show_banner
  check_root "$@"

  log_header "Installing GPU Protection"
  install_cuda_env
  install_sysctl
  install_limits

  # The NVIDIA step only warns on failure so the setup can still finish.
  if ! enable_nvidia_persistence; then
    log_warn "NVIDIA persistence setup incomplete"
  fi

  # Non-fatal: the Redis check is informational here.
  if ! check_redis; then
    :
  fi

  echo ""
  log_header "Setup Complete"
  log_success "GPU protection installed"
  printf '%s\n' \
    "" \
    "To verify: $0 --check" \
    "" \
    "Note: Some settings require relogin to take effect" \
    " - CUDA environment: source /etc/profile.d/cuda-protection.sh" \
    " - User limits: logout and login again"
}
# Print the banner followed by the usage text: options, the division of work
# between model-boss and this script, and the fail-fast policy.
show_help() {
  show_banner

  # Unquoted delimiter so $0 expands to the name the script was invoked with.
  cat << EOF
Usage: $0 [OPTIONS]

System-level GPU protection for ML workstations.
Ensures OOM killer can act before system freeze.

Options:
 (none) Full setup - install all protections (requires sudo)
 --check Check current protection status
 --env Install CUDA environment variables only
 --sysctl Install kernel OOM settings only
 --limits Install user resource limits only
 --nvidia Enable NVIDIA persistence mode only
 --help Show this help

Architecture:
 Model-boss handles application-level VRAM coordination.
 This script handles outer-bound enforcement:
 - Memory leaks exceeding lease bounds
 - Bugs allocating beyond declared limits
 - Kernel-level OOM before total system freeze

Fail-fast: Crash immediately when bounds exceeded.
 OOM killer terminates offender, system stays responsive.
EOF
}
# Main
# Dispatch on the first argument. Full setup runs only when no option is given;
# an unrecognized option is rejected rather than treated as a request for the
# full installation, so a mistyped flag (e.g. "--chek") cannot modify the system.
case "${1:-}" in
  --check)
    check_status
    ;;
  --env)
    check_root "$@"
    install_cuda_env
    ;;
  --sysctl)
    check_root "$@"
    install_sysctl
    ;;
  --limits)
    check_root "$@"
    install_limits
    ;;
  --nvidia)
    check_root "$@"
    enable_nvidia_persistence
    ;;
  --help|-h)
    show_help
    ;;
  "")
    full_setup "$@"
    ;;
  *)
    log_error "Unknown option: $1" >&2
    echo " Run: $0 --help" >&2
    exit 2
    ;;
esac