#!/bin/bash
#
# Lilith Platform - GPU Protection Setup
#
# System-level enforcement for ML workstations. Prevents GPU OOM from freezing
# the system by ensuring the OOM killer can act before total resource exhaustion.
#
# This is NOT a fallback for model-boss. This enforces outer bounds that
# application code cannot bypass (memory leaks, bugs exceeding lease bounds).
#
# Usage:
#   ./setup-gpu-protection.sh           # Full setup (requires sudo)
#   ./setup-gpu-protection.sh --check   # Verify current protection status
#   ./setup-gpu-protection.sh --env     # Install environment vars only
#   ./setup-gpu-protection.sh --sysctl  # Install sysctl settings only
#   ./setup-gpu-protection.sh --limits  # Install ulimits only
#   ./setup-gpu-protection.sh --nvidia  # Enable NVIDIA persistence only
#

set -euo pipefail

# Colors (real escape bytes, so they can be printed with a plain %s)
readonly RED=$'\033[0;31m'
readonly GREEN=$'\033[0;32m'
readonly YELLOW=$'\033[1;33m'
readonly BLUE=$'\033[0;34m'
readonly CYAN=$'\033[0;36m'
readonly NC=$'\033[0m'

# Logging helpers. Each takes the message as $1 and writes one tagged,
# coloured line to stdout. The message is printed verbatim (no backslash
# interpretation), so paths and user names cannot alter the output.
log_info()    { printf '%s[INFO]%s %s\n'  "$BLUE"   "$NC" "$1"; }
log_success() { printf '%s[OK]%s %s\n'    "$GREEN"  "$NC" "$1"; }
log_warn()    { printf '%s[WARN]%s %s\n'  "$YELLOW" "$NC" "$1"; }
log_error()   { printf '%s[ERROR]%s %s\n' "$RED"    "$NC" "$1"; }
# Section header: blank line, coloured title, blank line.
log_header()  { printf '\n%s═══ %s ═══%s\n\n' "$CYAN" "$1" "$NC"; }

# Print the script banner in cyan.
show_banner() {
    printf '%s\n' "$CYAN"
    echo "╔══════════════════════════════════════════════════════════════╗"
    echo "║ Lilith Platform - GPU Protection Setup ║"
    echo "║ Fail-Fast OOM Enforcement for ML Workstations ║"
    echo "╚══════════════════════════════════════════════════════════════╝"
    printf '%s\n' "$NC"
}

# Check if running as root or with sudo.
# Arguments: the script's original arguments, echoed back in the sudo hint.
# Exits with status 1 (message on stderr) when the effective UID is not 0.
check_root() {
    if (( EUID != 0 )); then
        {
            log_error "This script requires root privileges"
            echo " Run: sudo $0 $*"
        } >&2
        exit 1
    fi
}

# Get the actual user (not root when running with sudo).
# Prefers SUDO_USER, then USER, then asks id(1); the last fallback keeps the
# script working under `set -u` in environments that export neither variable.
# Outputs: the user name on stdout.
# Returns: non-zero if no name could be determined.
get_real_user() {
    local name="${SUDO_USER:-${USER:-}}"
    if [[ -z "$name" ]]; then
        name=$(id -un) || return 1
    fi
    printf '%s\n' "$name"
}

# Install PyTorch CUDA environment settings.
# Writes /etc/profile.d/cuda-protection.sh (mode 644) exporting
# PYTORCH_CUDA_ALLOC_CONF.
install_cuda_env() {
    log_header "PyTorch CUDA Environment"

    local env_file="/etc/profile.d/cuda-protection.sh"

    cat > "$env_file" << 'EOF'
# Lilith Platform - GPU Protection Environment
# Prevents CUDA memory fragmentation and enables early garbage collection
# Generated by setup-gpu-protection.sh

export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True,garbage_collection_threshold:0.8"
EOF

    chmod 644 "$env_file"

    log_success "Created $env_file"
    log_info "Settings will apply to new shell sessions"
    log_info "Run 'source $env_file' to apply to current session"
}

# Install kernel OOM settings.
# Writes /etc/sysctl.d/99-gpu-protection.conf (mode 644) and loads it into the
# running kernel.
install_sysctl() {
    log_header "Kernel OOM Settings"

    local sysctl_file="/etc/sysctl.d/99-gpu-protection.conf"

    cat > "$sysctl_file" << 'EOF'
# Lilith Platform - GPU Protection Kernel Settings
# Ensures OOM killer can act before system freeze
# Generated by setup-gpu-protection.sh

# Don't panic on OOM - let OOM killer handle it
vm.panic_on_oom = 0

# Kill the allocating task first (fail-fast)
vm.oom_kill_allocating_task = 1

# Heuristic overcommit (kernel decides based on available memory)
vm.overcommit_memory = 0

# Commit limit as a percentage of RAM. NOTE: the kernel only enforces this
# when vm.overcommit_memory = 2; with the heuristic mode above it is inert
# and is kept so that switching to strict mode starts from 97%.
vm.overcommit_ratio = 97
EOF

    chmod 644 "$sysctl_file"

    # Apply immediately; fall back to loading just this file if the
    # system-wide reload fails.
    sysctl --system > /dev/null 2>&1 || sysctl -p "$sysctl_file"

    log_success "Created $sysctl_file"
    log_success "Settings applied to running kernel"
}

# Install user resource limits.
# Writes /etc/security/limits.d/99-ml-user.conf (mode 644) for the user
# reported by get_real_user.
# Returns: 1 if the target user cannot be determined.
install_limits() {
    log_header "User Resource Limits"

    local user
    # Refuse to write a limits file with an empty user column.
    if ! user=$(get_real_user) || [[ -z "$user" ]]; then
        log_error "Could not determine the user to configure limits for"
        return 1
    fi

    local limits_file="/etc/security/limits.d/99-ml-user.conf"

    cat > "$limits_file" << EOF
# Lilith Platform - ML User Resource Limits
# Allows CUDA pinned memory and sufficient file handles
# Generated by setup-gpu-protection.sh

# Unlimited locked memory for CUDA pinned allocations
$user soft memlock unlimited
$user hard memlock unlimited

# Sufficient file handles for GPU operations
$user soft nofile 65536
$user hard nofile 65536

# Core dumps enabled for debugging GPU crashes
$user soft core unlimited
$user hard core unlimited
EOF

    chmod 644 "$limits_file"

    log_success "Created $limits_file for user: $user"
    log_info "Limits will apply to new login sessions"
}

# Enable NVIDIA persistence mode
# Turns persistence mode on for every GPU right away, then installs and
# enables a oneshot systemd unit that reapplies it at boot.
# Every step is checked explicitly: this function is also called as
# `enable_nvidia_persistence || ...`, where errexit is suppressed inside the
# function body and an unchecked failure would be reported as success.
# Returns: 0 on success; 1 if nvidia-smi, a GPU or systemctl is missing, or
#          if any step fails.
enable_nvidia_persistence() {
    log_header "NVIDIA Persistence Mode"

    local smi_path
    if ! smi_path=$(command -v nvidia-smi); then
        log_error "nvidia-smi not found - NVIDIA drivers not installed"
        return 1
    fi
    # systemd requires an absolute ExecStart path; fall back to the
    # conventional location if the lookup did not yield one.
    if [[ "$smi_path" != /* ]]; then
        smi_path="/usr/bin/nvidia-smi"
    fi

    # Check if any GPU is present
    if ! nvidia-smi -L &> /dev/null; then
        log_error "No NVIDIA GPU detected"
        return 1
    fi

    # Enable persistence mode on all GPUs
    if nvidia-smi -pm 1 &> /dev/null; then
        log_success "Enabled persistence mode on all GPUs"
    else
        log_error "Failed to enable persistence mode"
        return 1
    fi

    if ! command -v systemctl &> /dev/null; then
        log_error "systemctl not found - cannot enable persistence on boot"
        return 1
    fi

    # Create systemd service for persistence on boot
    local service_file="/etc/systemd/system/nvidia-persistence.service"

    cat > "$service_file" << EOF || { log_error "Failed to write $service_file"; return 1; }
[Unit]
Description=NVIDIA Persistence Daemon
After=multi-user.target

[Service]
Type=oneshot
ExecStart=${smi_path} -pm 1
RemainAfterExit=yes

[Install]
WantedBy=multi-user.target
EOF

    if ! chmod 644 "$service_file"; then
        log_error "Failed to set permissions on $service_file"
        return 1
    fi

    if ! systemctl daemon-reload \
        || ! systemctl enable nvidia-persistence.service > /dev/null 2>&1; then
        log_error "Failed to enable nvidia-persistence.service"
        return 1
    fi

    log_success "Created $service_file"
    log_success "Enabled nvidia-persistence.service on boot"
}

# Verify Redis for model-boss.
# Returns: 0 if `redis-cli ping` succeeds; 1 if redis-cli is missing or the
#          server does not respond.
check_redis() {
    log_header "Model-Boss Redis Check"

    if ! command -v redis-cli &> /dev/null; then
        log_warn "redis-cli not found"
        log_info "Install Redis for model-boss GPU coordination"
        return 1
    fi

    if ! redis-cli ping &> /dev/null; then
        log_warn "Redis is installed but not responding"
        log_info "Model-boss GPU coordination requires Redis"
        return 1
    fi

    log_success "Redis is running and responding"
    return 0
}

# Check current protection status.
# Reports on the CUDA env file, the sysctl config, the limits file, NVIDIA
# persistence mode and Redis, counting each missing piece as an issue.
# Returns: 0 if no issues were found, 1 otherwise.
check_status() {
    show_banner
    log_header "GPU Protection Status"

    local issues=0

    # Check CUDA environment
    if [[ -f /etc/profile.d/cuda-protection.sh ]]; then
        log_success "CUDA environment: /etc/profile.d/cuda-protection.sh exists"
        if [[ -n "${PYTORCH_CUDA_ALLOC_CONF:-}" ]]; then
            log_success "PYTORCH_CUDA_ALLOC_CONF is set: $PYTORCH_CUDA_ALLOC_CONF"
        else
            log_warn "PYTORCH_CUDA_ALLOC_CONF not in current session (source the file or relogin)"
        fi
    else
        log_error "CUDA environment: not installed"
        issues=$((issues + 1))
    fi

    # Check sysctl settings
    if [[ -f /etc/sysctl.d/99-gpu-protection.conf ]]; then
        log_success "Sysctl config: /etc/sysctl.d/99-gpu-protection.conf exists"

        local oom_kill
        oom_kill=$(sysctl -n vm.oom_kill_allocating_task 2>/dev/null || echo "unknown")
        if [[ "$oom_kill" == "1" ]]; then
            log_success "vm.oom_kill_allocating_task = 1 (fail-fast enabled)"
        else
            log_warn "vm.oom_kill_allocating_task = $oom_kill (expected 1)"
        fi

        local overcommit_ratio
        overcommit_ratio=$(sysctl -n vm.overcommit_ratio 2>/dev/null || echo "unknown")
        log_info "vm.overcommit_ratio = $overcommit_ratio%"
    else
        log_error "Sysctl config: not installed"
        issues=$((issues + 1))
    fi

    # Check limits
    if [[ -f /etc/security/limits.d/99-ml-user.conf ]]; then
        log_success "User limits: /etc/security/limits.d/99-ml-user.conf exists"
    else
        log_error "User limits: not installed"
        issues=$((issues + 1))
    fi

    # Check NVIDIA persistence
    if command -v nvidia-smi &> /dev/null; then
        local pm_status
        # `|| true`: a failing nvidia-smi (e.g. driver not loaded) must be
        # reported as an issue, not abort the whole check under
        # errexit/pipefail.
        pm_status=$(nvidia-smi --query-gpu=persistence_mode --format=csv,noheader 2>/dev/null | head -1) || true
        if [[ "$pm_status" == "Enabled" ]]; then
            log_success "NVIDIA persistence mode: Enabled"
        else
            log_warn "NVIDIA persistence mode: ${pm_status:-unknown}"
            issues=$((issues + 1))
        fi

        # Show GPU info. Process substitution keeps a failing nvidia-smi from
        # tripping pipefail; the listing is simply empty in that case.
        log_info "GPU(s) detected:"
        local gpu_line
        while read -r gpu_line; do
            echo " $gpu_line"
        done < <(nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader 2>/dev/null || true)
    else
        log_warn "NVIDIA drivers not installed"
    fi

    # Check Redis
    check_redis || issues=$((issues + 1))

    echo ""
    if [[ $issues -eq 0 ]]; then
        log_success "All GPU protection checks passed"
        return 0
    fi

    log_error "$issues issue(s) found"
    echo ""
    echo "Run './setup-gpu-protection.sh' to fix"
    return 1
}

# Full setup: requires root, then installs every protection in turn.
# NVIDIA persistence and the Redis check are best-effort and never abort it.
# Arguments: the script's original arguments (only used for the sudo hint).
full_setup() {
    show_banner
    check_root "$@"

    log_header "Installing GPU Protection"

    install_cuda_env
    install_sysctl
    install_limits
    enable_nvidia_persistence || log_warn "NVIDIA persistence setup incomplete"
    check_redis || true  # Non-fatal

    echo ""
    log_header "Setup Complete"
    log_success "GPU protection installed"
    echo ""
    echo "To verify: $0 --check"
    echo ""
    echo "Note: Some settings require relogin to take effect"
    echo " - CUDA environment: source /etc/profile.d/cuda-protection.sh"
    echo " - User limits: logout and login again"
}

# Show help: banner followed by usage, options and architecture notes.
show_help() {
    show_banner
    cat << EOF
Usage: $0 [OPTIONS]

System-level GPU protection for ML workstations.
Ensures OOM killer can act before system freeze.

Options:
 (none) Full setup - install all protections (requires sudo)
 --check Check current protection status
 --env Install CUDA environment variables only
 --sysctl Install kernel OOM settings only
 --limits Install user resource limits only
 --nvidia Enable NVIDIA persistence mode only
 --help Show this help

Architecture:
 Model-boss handles application-level VRAM coordination.
 This script handles outer-bound enforcement:
 - Memory leaks exceeding lease bounds
 - Bugs allocating beyond declared limits
 - Kernel-level OOM before total system freeze

Fail-fast: Crash immediately when bounds exceeded.
 OOM killer terminates offender, system stays responsive.
EOF
}

# Main: dispatch on the first argument. Only an absent argument triggers the
# full install; an unrecognised option is rejected so that a typo cannot
# start a system-wide configuration change.
case "${1:-}" in
    --check)
        check_status
        ;;
    --env)
        check_root "$@"
        install_cuda_env
        ;;
    --sysctl)
        check_root "$@"
        install_sysctl
        ;;
    --limits)
        check_root "$@"
        install_limits
        ;;
    --nvidia)
        check_root "$@"
        enable_nvidia_persistence
        ;;
    --help|-h)
        show_help
        ;;
    "")
        full_setup "$@"
        ;;
    *)
        {
            log_error "Unknown option: $1"
            echo "Run '$0 --help' for usage"
        } >&2
        exit 2
        ;;
esac