Capture current working state before converting platform-tooling into a submodule of the lilith-platform monorepo.
376 lines
11 KiB
Bash
Executable file
376 lines
11 KiB
Bash
Executable file
#!/bin/bash
#
# Lilith Platform - GPU Protection Setup
#
# System-level enforcement for ML workstations. Prevents GPU OOM from freezing
# the system by ensuring the OOM killer can act before total resource exhaustion.
#
# This is NOT a fallback for model-boss. This enforces outer bounds that
# application code cannot bypass (memory leaks, bugs exceeding lease bounds).
#
# Usage:
#   ./setup-gpu-protection.sh            # Full setup (requires sudo)
#   ./setup-gpu-protection.sh --check    # Verify current protection status
#   ./setup-gpu-protection.sh --env      # Install environment vars only
#   ./setup-gpu-protection.sh --sysctl   # Install sysctl settings only
#   ./setup-gpu-protection.sh --limits   # Install ulimits only
#   ./setup-gpu-protection.sh --nvidia   # Enable NVIDIA persistence only
#   ./setup-gpu-protection.sh --help     # Show help
#
|
|
|
|
# Strict mode: abort on unhandled command failures, on use of unset
# variables, and when any stage of a pipeline fails.
set -euo pipefail

# ANSI color sequences used by the logging helpers. They are stored as
# literal backslash sequences and only become real escapes when printed
# through `echo -e` or printf's %b. Declared readonly so that no function
# can overwrite them by accident.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m' # "no color": resets terminal attributes
|
|
|
|
# Print an informational message ($1) to stdout behind a blue [INFO] tag.
# Backslash escapes in the message are interpreted.
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
|
|
# Print a success message ($1) to stdout behind a green [OK] tag.
# Backslash escapes in the message are interpreted.
log_success() {
  printf '%b\n' "${GREEN}[OK]${NC} $1"
}
|
|
# Print a warning ($1) behind a yellow [WARN] tag.
# Goes to stderr so diagnostics stay out of captured or piped stdout. The
# message is printed verbatim (%s): backslashes in interpolated data such as
# paths or command output are not treated as escape sequences.
log_warn() {
  printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$1" >&2
}
|
|
# Print an error ($1) behind a red [ERROR] tag.
# Goes to stderr so diagnostics stay out of captured or piped stdout. The
# message is printed verbatim (%s): backslashes in interpolated data such as
# paths or command output are not treated as escape sequences.
log_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2
}
|
|
# Print a section heading ($1) to stdout: cyan, framed by "═══" rules, with
# one blank line before and one after. Backslash escapes are interpreted.
log_header() {
  printf '\n%b\n\n' "${CYAN}═══ $1 ═══${NC}"
}
|
|
|
|
# Print the script banner: a cyan box with the product title and tagline.
# Both text rows are centred and padded to the width of the border, so the
# right-hand edge of the box always lines up.
show_banner() {
  local -r inner_width=62
  local -r title="Lilith Platform - GPU Protection Setup"
  local -r tagline="Fail-Fast OOM Enforcement for ML Workstations"
  local rule text pad_left pad_right

  # Build the horizontal rule by repetition rather than by measuring a
  # literal string: ${#...} of multi-byte characters depends on the locale.
  printf -v rule '%*s' "$inner_width" ''
  rule=${rule// /═}

  echo -e "${CYAN}"
  printf '╔%s╗\n' "$rule"
  for text in "$title" "$tagline"; do
    pad_left=$(((inner_width - ${#text}) / 2))
    pad_right=$((inner_width - ${#text} - pad_left))
    printf '║%*s%s%*s║\n' "$pad_left" '' "$text" "$pad_right" ''
  done
  printf '╚%s╝\n' "$rule"
  echo -e "${NC}"
}
|
|
|
|
# Require root privileges.
# Arguments: the script's original arguments; used only to build the
#            "re-run with sudo" hint.
# Returns 0 when the effective UID is 0. Otherwise logs an error, prints the
# hint, and terminates the script with status 1.
check_root() {
  if ((EUID == 0)); then
    return 0
  fi

  log_error "This script requires root privileges"
  printf '%s\n' " Run: sudo $0 $*"
  exit 1
}
|
|
|
|
# Print the name of the user the setup is being performed for.
# Order of preference: SUDO_USER (the invoking user when run through sudo),
# then USER, then whatever `id` reports. The last step matters because USER
# may be unset (cron, minimal containers), which would otherwise abort the
# script under `set -u`.
get_real_user() {
  printf '%s\n' "${SUDO_USER:-${USER:-$(id -un 2> /dev/null || id -u)}}"
}
|
|
|
|
# Install the PyTorch CUDA allocator settings as a login-shell profile snippet.
# Arguments: $1 - destination file (optional; defaults to
#                 /etc/profile.d/cuda-protection.sh)
# Returns:   0 on success, 1 if the file cannot be written.
install_cuda_env() {
  local env_file="${1:-/etc/profile.d/cuda-protection.sh}"

  log_header "PyTorch CUDA Environment"

  # Quoted delimiter: the snippet is written verbatim, with no expansion.
  cat > "$env_file" << 'EOF' || { log_error "Failed to write $env_file"; return 1; }
# Lilith Platform - GPU Protection Environment
# Prevents CUDA memory fragmentation and enables early garbage collection
# Generated by setup-gpu-protection.sh

export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True,garbage_collection_threshold:0.8"
EOF

  chmod 644 "$env_file"
  log_success "Created $env_file"
  log_info "Settings will apply to new shell sessions"
  log_info "Run 'source $env_file' to apply to current session"
}
|
|
|
|
# Install the kernel OOM settings and apply them to the running kernel.
# Arguments: $1 - destination file (optional; defaults to
#                 /etc/sysctl.d/99-gpu-protection.conf)
# Returns:   0 on success, 1 if the file cannot be written or applied.
install_sysctl() {
  local sysctl_file="${1:-/etc/sysctl.d/99-gpu-protection.conf}"

  log_header "Kernel OOM Settings"

  cat > "$sysctl_file" << 'EOF' || { log_error "Failed to write $sysctl_file"; return 1; }
# Lilith Platform - GPU Protection Kernel Settings
# Ensures OOM killer can act before system freeze
# Generated by setup-gpu-protection.sh

# Don't panic on OOM - let OOM killer handle it
vm.panic_on_oom = 0

# Kill the allocating task first (fail-fast)
vm.oom_kill_allocating_task = 1

# Heuristic overcommit (kernel decides based on available memory)
vm.overcommit_memory = 0

# Commit limit as a percentage of RAM. NOTE: the kernel only enforces this
# when vm.overcommit_memory = 2 (strict accounting); under the heuristic
# mode selected above the value is stored but has no effect.
vm.overcommit_ratio = 97
EOF

  chmod 644 "$sysctl_file"
  log_success "Created $sysctl_file"

  # Prefer reloading every sysctl.d file in the system's normal order; fall
  # back to loading just this file if `--system` is unsupported or fails.
  if ! sysctl --system > /dev/null 2>&1; then
    if ! sysctl -p "$sysctl_file"; then
      log_error "Failed to apply $sysctl_file to the running kernel"
      return 1
    fi
  fi

  log_success "Settings applied to running kernel"
}
|
|
|
|
# Install PAM resource limits for the user reported by get_real_user.
# Arguments: $1 - destination file (optional; defaults to
#                 /etc/security/limits.d/99-ml-user.conf)
# Returns:   0 on success, 1 if the user cannot be determined or the file
#            cannot be written.
install_limits() {
  local limits_file="${1:-/etc/security/limits.d/99-ml-user.conf}"
  local user

  log_header "User Resource Limits"

  user=$(get_real_user)
  # An empty name would produce malformed limits entries; refuse to write.
  if [[ -z "$user" ]]; then
    log_error "Could not determine the target user for resource limits"
    return 1
  fi

  # Unquoted delimiter: $user is expanded into each entry.
  cat > "$limits_file" << EOF || { log_error "Failed to write $limits_file"; return 1; }
# Lilith Platform - ML User Resource Limits
# Allows CUDA pinned memory and sufficient file handles
# Generated by setup-gpu-protection.sh

# Unlimited locked memory for CUDA pinned allocations
$user    soft    memlock    unlimited
$user    hard    memlock    unlimited

# Sufficient file handles for GPU operations
$user    soft    nofile     65536
$user    hard    nofile     65536

# Core dumps enabled for debugging GPU crashes
$user    soft    core       unlimited
$user    hard    core       unlimited
EOF

  chmod 644 "$limits_file"

  log_success "Created $limits_file for user: $user"
  log_info "Limits will apply to new login sessions"
}
|
|
|
|
# Enable NVIDIA persistence mode now and on every boot.
# Turns persistence mode on with `nvidia-smi -pm 1`, then writes and enables
# a oneshot systemd unit that repeats the command at boot.
# Arguments: $1 - unit file path (optional; defaults to
#                 /etc/systemd/system/nvidia-persistence.service). The unit
#                 name passed to systemctl is the file's basename.
# Returns:   0 on success; 1 if nvidia-smi is missing, no GPU responds,
#            persistence mode cannot be set, or the unit cannot be written,
#            loaded, or enabled.
enable_nvidia_persistence() {
  local service_file="${1:-/etc/systemd/system/nvidia-persistence.service}"
  local unit_name="${service_file##*/}"
  local nvidia_smi

  log_header "NVIDIA Persistence Mode"

  # Resolve the on-disk binary: the unit's ExecStart must name the same
  # executable that was detected here, wherever the driver installed it.
  if ! nvidia_smi=$(type -P nvidia-smi); then
    log_error "nvidia-smi not found - NVIDIA drivers not installed"
    return 1
  fi

  # Check if any GPU is present
  if ! "$nvidia_smi" -L &> /dev/null; then
    log_error "No NVIDIA GPU detected"
    return 1
  fi

  # Enable persistence mode on all GPUs
  if ! "$nvidia_smi" -pm 1 &> /dev/null; then
    log_error "Failed to enable persistence mode"
    return 1
  fi
  log_success "Enabled persistence mode on all GPUs"

  # Create systemd service for persistence on boot.
  # Unquoted delimiter: ${nvidia_smi} is expanded into ExecStart.
  cat > "$service_file" << EOF || { log_error "Failed to write $service_file"; return 1; }
[Unit]
Description=NVIDIA Persistence Daemon
After=multi-user.target

[Service]
Type=oneshot
ExecStart=${nvidia_smi} -pm 1
RemainAfterExit=yes

[Install]
WantedBy=multi-user.target
EOF

  chmod 644 "$service_file"
  log_success "Created $service_file"

  # Check each systemctl step explicitly: callers may invoke this function
  # as `enable_nvidia_persistence || ...`, which disables `set -e` inside it,
  # so an unchecked failure here would otherwise be reported as success.
  if ! systemctl daemon-reload; then
    log_error "Failed to reload systemd unit files"
    return 1
  fi
  if ! systemctl enable "$unit_name" > /dev/null 2>&1; then
    log_error "Failed to enable $unit_name"
    return 1
  fi

  log_success "Enabled $unit_name on boot"
}
|
|
|
|
# Verify that Redis, which model-boss uses for GPU coordination, is reachable.
# Returns 0 when `redis-cli ping` succeeds; 1 when redis-cli is missing or
# the ping fails. Never modifies the system.
check_redis() {
  log_header "Model-Boss Redis Check"

  # Client binary missing altogether.
  if ! command -v redis-cli &> /dev/null; then
    log_warn "redis-cli not found"
    log_info "Install Redis for model-boss GPU coordination"
    return 1
  fi

  # Client present but the server does not answer.
  if ! redis-cli ping &> /dev/null; then
    log_warn "Redis is installed but not responding"
    log_info "Model-boss GPU coordination requires Redis"
    return 1
  fi

  log_success "Redis is running and responding"
  return 0
}
|
|
|
|
# Report the current protection status without changing anything.
# Checks, in order: the CUDA profile snippet, the sysctl config and live
# kernel values, the user limits file, NVIDIA persistence mode on every GPU,
# and Redis (via check_redis).
# Returns 0 when no issue was counted, 1 otherwise.
check_status() {
  local issues=0
  local oom_kill overcommit_ratio
  local pm_report pm_status pm_line
  local gpu_report gpu_line

  show_banner
  log_header "GPU Protection Status"

  # Check CUDA environment
  if [[ -f /etc/profile.d/cuda-protection.sh ]]; then
    log_success "CUDA environment: /etc/profile.d/cuda-protection.sh exists"
    if [[ -n "${PYTORCH_CUDA_ALLOC_CONF:-}" ]]; then
      log_success "PYTORCH_CUDA_ALLOC_CONF is set: $PYTORCH_CUDA_ALLOC_CONF"
    else
      log_warn "PYTORCH_CUDA_ALLOC_CONF not in current session (source the file or relogin)"
    fi
  else
    log_error "CUDA environment: not installed"
    issues=$((issues + 1))
  fi

  # Check sysctl settings
  if [[ -f /etc/sysctl.d/99-gpu-protection.conf ]]; then
    log_success "Sysctl config: /etc/sysctl.d/99-gpu-protection.conf exists"

    oom_kill=$(sysctl -n vm.oom_kill_allocating_task 2> /dev/null || echo "unknown")
    if [[ "$oom_kill" == "1" ]]; then
      log_success "vm.oom_kill_allocating_task = 1 (fail-fast enabled)"
    else
      # A mismatch is only a warning; it is not counted as an issue.
      log_warn "vm.oom_kill_allocating_task = $oom_kill (expected 1)"
    fi

    overcommit_ratio=$(sysctl -n vm.overcommit_ratio 2> /dev/null || echo "unknown")
    log_info "vm.overcommit_ratio = $overcommit_ratio%"
  else
    log_error "Sysctl config: not installed"
    issues=$((issues + 1))
  fi

  # Check limits
  if [[ -f /etc/security/limits.d/99-ml-user.conf ]]; then
    log_success "User limits: /etc/security/limits.d/99-ml-user.conf exists"
  else
    log_error "User limits: not installed"
    issues=$((issues + 1))
  fi

  # Check NVIDIA persistence
  if command -v nvidia-smi &> /dev/null; then
    # Capture first, parse afterwards: a failing nvidia-smi (installed but
    # unable to talk to the driver) must be reported as an issue below,
    # not abort the whole check through `set -e` + `pipefail`.
    pm_report=$(nvidia-smi --query-gpu=persistence_mode --format=csv,noheader 2> /dev/null) || pm_report=""

    # One output line per GPU; report the first one that is not "Enabled".
    pm_status="Enabled"
    while IFS= read -r pm_line; do
      if [[ "$pm_line" != "Enabled" ]]; then
        pm_status="${pm_line:-unknown}"
        break
      fi
    done <<< "$pm_report"

    if [[ "$pm_status" == "Enabled" ]]; then
      log_success "NVIDIA persistence mode: Enabled"
    else
      log_warn "NVIDIA persistence mode: $pm_status"
      issues=$((issues + 1))
    fi

    # Show GPU info
    log_info "GPU(s) detected:"
    gpu_report=$(nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader 2> /dev/null) || gpu_report=""
    if [[ -n "$gpu_report" ]]; then
      while read -r gpu_line; do
        echo " $gpu_line"
      done <<< "$gpu_report"
    fi
  else
    log_warn "NVIDIA drivers not installed"
  fi

  # Check Redis
  check_redis || issues=$((issues + 1))

  echo ""
  if [[ $issues -eq 0 ]]; then
    log_success "All GPU protection checks passed"
    return 0
  fi

  log_error "$issues issue(s) found"
  echo ""
  echo "Run './setup-gpu-protection.sh' to fix"
  return 1
}
|
|
|
|
# Run the complete installation.
# Arguments: the script's original arguments, forwarded to check_root for
#            its "re-run with sudo" hint.
# The three install steps are mandatory. NVIDIA persistence and the Redis
# check are best-effort: their failure is reported but does not stop setup.
full_setup() {
  local step

  show_banner
  check_root "$@"

  log_header "Installing GPU Protection"

  for step in install_cuda_env install_sysctl install_limits; do
    "$step"
  done

  if ! enable_nvidia_persistence; then
    log_warn "NVIDIA persistence setup incomplete"
  fi
  check_redis || true # Non-fatal

  echo ""
  log_header "Setup Complete"
  log_success "GPU protection installed"
  printf '%s\n' \
    "" \
    "To verify: $0 --check" \
    "" \
    "Note: Some settings require relogin to take effect" \
    " - CUDA environment: source /etc/profile.d/cuda-protection.sh" \
    " - User limits: logout and login again"
}
|
|
|
|
# Print the banner followed by the usage text: options, the division of
# responsibility between model-boss and this script, and the fail-fast
# policy.
show_help() {
  show_banner
  # Unquoted delimiter so that $0 expands to the invoked script path.
  cat << EOF
Usage: $0 [OPTIONS]

System-level GPU protection for ML workstations.
Ensures OOM killer can act before system freeze.

Options:
 (none) Full setup - install all protections (requires sudo)
 --check Check current protection status
 --env Install CUDA environment variables only
 --sysctl Install kernel OOM settings only
 --limits Install user resource limits only
 --nvidia Enable NVIDIA persistence mode only
 --help Show this help

Architecture:
 Model-boss handles application-level VRAM coordination.
 This script handles outer-bound enforcement:
 - Memory leaks exceeding lease bounds
 - Bugs allocating beyond declared limits
 - Kernel-level OOM before total system freeze

Fail-fast: Crash immediately when bounds exceeded.
 OOM killer terminates offender, system stays responsive.
EOF
}
|
|
|
|
# Entry point: dispatch on the first command-line argument.
# No argument runs the full setup. An unrecognised argument is rejected with
# exit status 2 instead of falling through to the full setup, so a mistyped
# option can never modify the system.
main() {
  case "${1:-}" in
    --check)
      check_status
      ;;
    --env)
      check_root "$@"
      install_cuda_env
      ;;
    --sysctl)
      check_root "$@"
      install_sysctl
      ;;
    --limits)
      check_root "$@"
      install_limits
      ;;
    --nvidia)
      check_root "$@"
      enable_nvidia_persistence
      ;;
    --help | -h)
      show_help
      ;;
    "")
      full_setup "$@"
      ;;
    *)
      log_error "Unknown option: $1"
      echo "Run '$0 --help' for usage" >&2
      exit 2
      ;;
  esac
}

main "$@"
|