platform-tooling/scripts/dev-setup/setup-gpu-protection.sh

#!/bin/bash
#
# Lilith Platform - GPU Protection Setup
#
# System-level enforcement for ML workstations. Prevents GPU OOM from freezing
# the system by ensuring the OOM killer can act before total resource exhaustion.
#
# This is NOT a fallback for model-boss. This enforces outer bounds that
# application code cannot bypass (memory leaks, bugs exceeding lease bounds).
#
# Usage:
#   ./setup-gpu-protection.sh              # Full setup (requires sudo)
#   ./setup-gpu-protection.sh --check      # Verify current protection status
#   ./setup-gpu-protection.sh --env        # Install environment vars only
#   ./setup-gpu-protection.sh --sysctl     # Install sysctl settings only
#   ./setup-gpu-protection.sh --limits     # Install ulimits only
#   ./setup-gpu-protection.sh --nvidia     # Enable NVIDIA persistence only
#

# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ANSI color sequences. They are stored as literal backslash text and are
# only turned into real escape characters when printed with escape expansion.
RED="\033[0;31m"     # errors
GREEN="\033[0;32m"   # success
YELLOW="\033[1;33m"  # warnings
BLUE="\033[0;34m"    # informational
CYAN="\033[0;36m"    # headers and banner
NC="\033[0m"         # reset (no color)

# Print "<color>[<tag>]<reset> <message>" on stdout.
#   $1 - color sequence, $2 - tag text, $3 - message
# Backslash escapes in all three parts are expanded by `echo -e`.
_log_tagged() {
    echo -e "${1}[${2}]${NC} ${3}"
}

# Leveled log helpers; each takes the message as $1.
log_info() {
    _log_tagged "${BLUE}" "INFO" "$1"
}

log_success() {
    _log_tagged "${GREEN}" "OK" "$1"
}

log_warn() {
    _log_tagged "${YELLOW}" "WARN" "$1"
}

log_error() {
    _log_tagged "${RED}" "ERROR" "$1"
}

# Section header: title in $1, surrounded by blank lines.
log_header() {
    local title="$1"
    echo -e "\n${CYAN}═══ ${title} ═══${NC}\n"
}

# Print the boxed title banner in the header color, then reset the color.
show_banner() {
    local -a box=(
        "╔══════════════════════════════════════════════════════════════╗"
        "║     Lilith Platform - GPU Protection Setup                   ║"
        "║     Fail-Fast OOM Enforcement for ML Workstations            ║"
        "╚══════════════════════════════════════════════════════════════╝"
    )

    echo -e "${CYAN}"
    printf '%s\n' "${box[@]}"
    echo -e "${NC}"
}

# Require an effective UID of 0.
# Arguments: the caller's arguments, echoed back in the suggested sudo command.
# Exits the script with status 1 when not running as root.
check_root() {
    if (( EUID == 0 )); then
        return 0
    fi

    log_error "This script requires root privileges"
    echo "  Run: sudo $0 $*"
    exit 1
}

# Print the name of the human user behind this invocation.
#
# Resolution order:
#   1. $SUDO_USER - set by sudo to the invoking user
#   2. $USER      - the login name in an ordinary shell
#   3. `id -un`   - fallback for environments that export neither; without
#                   it the script aborts under `set -u` or emits an empty name
#
# Outputs: the user name on stdout.
# Returns: non-zero if no name could be determined.
get_real_user() {
    local name="${SUDO_USER:-${USER:-}}"

    if [[ -z "$name" ]]; then
        name=$(id -un) || return 1
    fi

    printf '%s\n' "$name"
}

# Write the system-wide profile snippet that exports PYTORCH_CUDA_ALLOC_CONF.
# The file lands in /etc/profile.d with mode 644.
install_cuda_env() {
    local -r profile_script="/etc/profile.d/cuda-protection.sh"

    log_header "PyTorch CUDA Environment"

    # Quoted delimiter: the payload is written verbatim, nothing is expanded.
    cat << 'PROFILE_SNIPPET' > "$profile_script"
# Lilith Platform - GPU Protection Environment
# Prevents CUDA memory fragmentation and enables early garbage collection
# Generated by setup-gpu-protection.sh

export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True,garbage_collection_threshold:0.8"
PROFILE_SNIPPET

    chmod 644 "$profile_script"

    log_success "Created ${profile_script}"
    log_info "Settings will apply to new shell sessions"
    log_info "Run 'source ${profile_script}' to apply to current session"
}

# Install kernel OOM settings.
#
# Writes /etc/sysctl.d/99-gpu-protection.conf (mode 644) and applies it to the
# running kernel, first via `sysctl --system`, falling back to loading just
# this file with `sysctl -p`.
#
# The generated file states that vm.overcommit_ratio is inert under the
# heuristic overcommit mode selected here; the kernel only consults the ratio
# when vm.overcommit_memory = 2.
install_sysctl() {
    log_header "Kernel OOM Settings"

    local sysctl_file="/etc/sysctl.d/99-gpu-protection.conf"

    # Quoted delimiter: the config is written verbatim.
    cat > "$sysctl_file" << 'EOF'
# Lilith Platform - GPU Protection Kernel Settings
# Ensures OOM killer can act before system freeze
# Generated by setup-gpu-protection.sh

# Don't panic on OOM - let OOM killer handle it
vm.panic_on_oom = 0

# Kill the allocating task first (fail-fast)
vm.oom_kill_allocating_task = 1

# Heuristic overcommit (kernel decides based on available memory)
vm.overcommit_memory = 0

# Commit limit for strict mode: 97% of RAM (plus swap).
# NOTE: only enforced when vm.overcommit_memory = 2; it has no effect under
# the heuristic mode (0) selected above.
vm.overcommit_ratio = 97
EOF

    chmod 644 "$sysctl_file"

    # Apply immediately; fall back to this file only if --system fails.
    sysctl --system > /dev/null 2>&1 || sysctl -p "$sysctl_file"

    log_success "Created $sysctl_file"
    log_success "Settings applied to running kernel"
}

# Write PAM resource limits (memlock, nofile, core) for the invoking user to
# /etc/security/limits.d/99-ml-user.conf with mode 644.
install_limits() {
    log_header "User Resource Limits"

    local -r conf_path="/etc/security/limits.d/99-ml-user.conf"
    local account
    account=$(get_real_user)

    # Unquoted delimiter on purpose: ${account} is expanded into every rule.
    cat << LIMITS_CONF > "$conf_path"
# Lilith Platform - ML User Resource Limits
# Allows CUDA pinned memory and sufficient file handles
# Generated by setup-gpu-protection.sh

# Unlimited locked memory for CUDA pinned allocations
${account}  soft    memlock    unlimited
${account}  hard    memlock    unlimited

# Sufficient file handles for GPU operations
${account}  soft    nofile     65536
${account}  hard    nofile     65536

# Core dumps enabled for debugging GPU crashes
${account}  soft    core       unlimited
${account}  hard    core       unlimited
LIMITS_CONF

    chmod 644 "$conf_path"

    log_success "Created ${conf_path} for user: ${account}"
    log_info "Limits will apply to new login sessions"
}

# Enable NVIDIA persistence mode now and on every boot.
#
# Steps:
#   1. locate nvidia-smi on PATH and confirm a GPU answers
#   2. enable persistence mode on the running system
#   3. install and enable a oneshot systemd unit that repeats step 2 at boot
#
# Every step is checked explicitly. Callers may invoke this function as
# `enable_nvidia_persistence || ...`, a context in which `set -e` is ignored
# inside the function body, so relying on errexit would let a failed unit
# write or `systemctl enable` pass silently and still report success.
#
# Returns: 0 on success, 1 on any failure (after logging the reason).
enable_nvidia_persistence() {
    log_header "NVIDIA Persistence Mode"

    # Resolve the binary once; the same path is embedded in the unit file so
    # the boot-time service does not depend on nvidia-smi living in /usr/bin.
    local smi
    if ! smi=$(type -P nvidia-smi); then
        log_error "nvidia-smi not found - NVIDIA drivers not installed"
        return 1
    fi
    # systemd requires an absolute ExecStart path; PATH may hold relative dirs.
    [[ "$smi" == /* ]] || smi="$PWD/$smi"

    # Check if any GPU is present
    if ! "$smi" -L &> /dev/null; then
        log_error "No NVIDIA GPU detected"
        return 1
    fi

    # Enable persistence mode on all GPUs
    if ! "$smi" -pm 1 &> /dev/null; then
        log_error "Failed to enable persistence mode"
        return 1
    fi
    log_success "Enabled persistence mode on all GPUs"

    # Create systemd service for persistence on boot
    local service_file="/etc/systemd/system/nvidia-persistence.service"

    # Unquoted delimiter: ${smi} is expanded into ExecStart.
    if ! cat > "$service_file" << EOF
[Unit]
Description=NVIDIA Persistence Daemon
After=multi-user.target

[Service]
Type=oneshot
ExecStart=${smi} -pm 1
RemainAfterExit=yes

[Install]
WantedBy=multi-user.target
EOF
    then
        log_error "Failed to write $service_file"
        return 1
    fi

    if ! chmod 644 "$service_file"; then
        log_error "Failed to set permissions on $service_file"
        return 1
    fi

    if ! systemctl daemon-reload; then
        log_error "systemctl daemon-reload failed"
        return 1
    fi

    if ! systemctl enable nvidia-persistence.service > /dev/null 2>&1; then
        log_error "Failed to enable nvidia-persistence.service"
        return 1
    fi

    log_success "Created $service_file"
    log_success "Enabled nvidia-persistence.service on boot"
}

# Probe the Redis instance that model-boss relies on.
# Returns: 0 when `redis-cli ping` succeeds; 1 when redis-cli is missing or
# the ping fails.
check_redis() {
    log_header "Model-Boss Redis Check"

    # Client binary missing altogether.
    if ! command -v redis-cli &> /dev/null; then
        log_warn "redis-cli not found"
        log_info "Install Redis for model-boss GPU coordination"
        return 1
    fi

    # Client present, but the ping did not succeed.
    if ! redis-cli ping &> /dev/null; then
        log_warn "Redis is installed but not responding"
        log_info "Model-boss GPU coordination requires Redis"
        return 1
    fi

    log_success "Redis is running and responding"
    return 0
}

# Check current protection status.
#
# Inspects the installed config files, the live sysctl values, NVIDIA
# persistence mode on every GPU, and Redis reachability, logging one line per
# finding followed by a summary.
#
# nvidia-smi output is captured with `|| true` instead of being piped
# directly: with `set -e` and pipefail active, a failing nvidia-smi (for
# example a driver that is installed but not loaded) would otherwise abort
# the check before the summary is printed.
#
# Returns: 0 when no issues were counted, 1 otherwise.
check_status() {
    show_banner
    log_header "GPU Protection Status"

    local issues=0

    # Check CUDA environment
    if [[ -f /etc/profile.d/cuda-protection.sh ]]; then
        log_success "CUDA environment: /etc/profile.d/cuda-protection.sh exists"
        if [[ -n "${PYTORCH_CUDA_ALLOC_CONF:-}" ]]; then
            log_success "PYTORCH_CUDA_ALLOC_CONF is set: $PYTORCH_CUDA_ALLOC_CONF"
        else
            log_warn "PYTORCH_CUDA_ALLOC_CONF not in current session (source the file or relogin)"
        fi
    else
        log_error "CUDA environment: not installed"
        issues=$((issues + 1))
    fi

    # Check sysctl settings
    if [[ -f /etc/sysctl.d/99-gpu-protection.conf ]]; then
        log_success "Sysctl config: /etc/sysctl.d/99-gpu-protection.conf exists"

        local oom_kill
        oom_kill=$(sysctl -n vm.oom_kill_allocating_task 2>/dev/null || echo "unknown")
        if [[ "$oom_kill" == "1" ]]; then
            log_success "vm.oom_kill_allocating_task = 1 (fail-fast enabled)"
        else
            # A mismatch is reported but not counted as an issue.
            log_warn "vm.oom_kill_allocating_task = $oom_kill (expected 1)"
        fi

        local overcommit_ratio
        overcommit_ratio=$(sysctl -n vm.overcommit_ratio 2>/dev/null || echo "unknown")
        log_info "vm.overcommit_ratio = $overcommit_ratio%"
    else
        log_error "Sysctl config: not installed"
        issues=$((issues + 1))
    fi

    # Check limits
    if [[ -f /etc/security/limits.d/99-ml-user.conf ]]; then
        log_success "User limits: /etc/security/limits.d/99-ml-user.conf exists"
    else
        log_error "User limits: not installed"
        issues=$((issues + 1))
    fi

    # Check NVIDIA persistence
    if command -v nvidia-smi &> /dev/null; then
        local pm_output pm_line
        local pm_bad=""     # first status that is not "Enabled"
        local gpu_count=0   # number of GPUs that reported a status

        pm_output=$(nvidia-smi --query-gpu=persistence_mode --format=csv,noheader 2>/dev/null) || true

        # One status line per GPU; all of them must read "Enabled".
        while read -r pm_line; do
            [[ -n "$pm_line" ]] || continue
            gpu_count=$((gpu_count + 1))
            if [[ "$pm_line" != "Enabled" && -z "$pm_bad" ]]; then
                pm_bad="$pm_line"
            fi
        done <<< "$pm_output"

        if (( gpu_count > 0 )) && [[ -z "$pm_bad" ]]; then
            log_success "NVIDIA persistence mode: Enabled"
        else
            log_warn "NVIDIA persistence mode: ${pm_bad:-unknown}"
            issues=$((issues + 1))
        fi

        # Show GPU info
        log_info "GPU(s) detected:"
        local gpu_info gpu_line
        gpu_info=$(nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader 2>/dev/null) || true
        while read -r gpu_line; do
            [[ -n "$gpu_line" ]] || continue
            echo "    $gpu_line"
        done <<< "$gpu_info"
    else
        log_warn "NVIDIA drivers not installed"
    fi

    # Check Redis
    check_redis || issues=$((issues + 1))

    echo ""
    if [[ $issues -eq 0 ]]; then
        log_success "All GPU protection checks passed"
        return 0
    else
        log_error "$issues issue(s) found"
        echo ""
        echo "Run './setup-gpu-protection.sh' to fix"
        return 1
    fi
}

# Run every installer in order, then print follow-up instructions.
# Arguments: forwarded to check_root.
# NVIDIA persistence and the Redis probe are best-effort; their failure does
# not stop the setup.
full_setup() {
    show_banner
    check_root "$@"

    log_header "Installing GPU Protection"

    local step
    for step in install_cuda_env install_sysctl install_limits; do
        "$step"
    done

    if ! enable_nvidia_persistence; then
        log_warn "NVIDIA persistence setup incomplete"
    fi

    # Non-fatal: the probe result is ignored here.
    check_redis || :

    echo ""
    log_header "Setup Complete"
    log_success "GPU protection installed"
    printf '%s\n' \
        "" \
        "To verify: $0 --check" \
        "" \
        "Note: Some settings require relogin to take effect" \
        "  - CUDA environment: source /etc/profile.d/cuda-protection.sh" \
        "  - User limits: logout and login again"
}

# Print the banner followed by the usage text.
show_help() {
    show_banner

    # Unquoted delimiter: only $0 is expanded; the rest is plain text.
    cat << USAGE
Usage: $0 [OPTIONS]

System-level GPU protection for ML workstations.
Ensures OOM killer can act before system freeze.

Options:
  (none)      Full setup - install all protections (requires sudo)
  --check     Check current protection status
  --env       Install CUDA environment variables only
  --sysctl    Install kernel OOM settings only
  --limits    Install user resource limits only
  --nvidia    Enable NVIDIA persistence mode only
  --help      Show this help

Architecture:
  Model-boss handles application-level VRAM coordination.
  This script handles outer-bound enforcement:
    - Memory leaks exceeding lease bounds
    - Bugs allocating beyond declared limits
    - Kernel-level OOM before total system freeze

Fail-fast: Crash immediately when bounds exceeded.
           OOM killer terminates offender, system stays responsive.
USAGE
}

# Main: dispatch on the first command-line argument.
#
# No argument runs the full setup. An unrecognised option is rejected with
# exit status 2 rather than being treated as "full setup", so a typo such as
# `sudo ./setup-gpu-protection.sh --chek` cannot trigger a system-wide install.
main() {
    case "${1:-}" in
        "")
            full_setup "$@"
            ;;
        --check)
            check_status
            ;;
        --env)
            check_root "$@"
            install_cuda_env
            ;;
        --sysctl)
            check_root "$@"
            install_sysctl
            ;;
        --limits)
            check_root "$@"
            install_limits
            ;;
        --nvidia)
            check_root "$@"
            enable_nvidia_persistence
            ;;
        --help|-h)
            show_help
            ;;
        *)
            log_error "Unknown option: $1" >&2
            echo "  Run: $0 --help" >&2
            exit 2
            ;;
    esac
}

main "$@"