nvidia-oc/docs/IMPLEMENTATION_GUIDE.md
2026-01-14 12:30:45 -08:00

16 KiB

Implementation Guide - How It Should Work

Core Workflow

Initialization Flow

# 1. Initialize NVML
# NOTE(review): GPUManager.__init__ as shown later in this guide also calls
# pynvml.nvmlInit(); confirm whether this explicit call is still required.
pynvml.nvmlInit()

# 2. Enumerate GPUs
gpu_manager = GPUManager()
gpus = gpu_manager.list_devices()
# Returns: [GPUDevice(index=0, name="RTX 3090", ...), GPUDevice(index=1, ...)]

# 3. Get a single device by index (raises ValueError if the index is out of range)
gpu0 = gpu_manager.get_device(0)

Clock Control Flow

# 1. Check current clocks
clock_controller = ClockController()
clocks = clock_controller.get_clocks(gpu0)
# Returns: ClockInfo(core=1815, memory=9501, shader=1815)  (values in MHz)

# 2. Validate offset
validate_clock_offset(offset=+100, domain="core")  # Raises if > 200MHz

# 3. Apply offset (set_clock_offset validates both offsets itself before applying)
clock_controller.set_clock_offset(gpu0, core=+100, memory=+500)

# 4. Verify applied by reading the clocks back
new_clocks = clock_controller.get_clocks(gpu0)
# Returns: ClockInfo(core=1915, memory=10001, ...)

Fan Control Flow

# 1. Check current fan speed
fan_controller = FanController()
current_speed = fan_controller.get_fan_speed(gpu0)
# Returns: 48 (percent)

# 2. Apply manual speed (percent, 0-100)
fan_controller.set_fan_speed(gpu0, speed=70)

# 3. Or apply temperature curve
# Each point is (temperature threshold in °C, fan speed in %).
fan_curve = [(60, 50), (70, 70), (75, 85), (80, 100)]
# apply_curve starts a background asyncio task rather than setting a speed directly.
fan_controller.apply_curve(gpu0, fan_curve)

# 4. Background task monitors temp and adjusts fan
async def fan_curve_daemon():
    """Poll the GPU temperature forever and drive the fan from the curve."""
    poll_seconds = 5  # Update every 5 seconds
    while True:
        current_temp = get_temperature(gpu0)
        set_fan_speed(gpu0, interpolate_curve(current_temp, fan_curve))
        await asyncio.sleep(poll_seconds)

Telemetry Streaming Flow

# 1. Create collector
telemetry = TelemetryCollector()

# 2. Collect one-time snapshot
metrics = telemetry.collect(gpu0)
# Returns: GPUMetrics(temperature=75, fan_speed=48, power_draw=367.99, ...)

# 3. Stream continuously
# `async for` must run inside a coroutine; stream() never ends on its own,
# so the consumer decides when to stop iterating.
async for metrics in telemetry.stream(gpu0, interval=1.0):
    print(f"Temp: {metrics.temperature}°C")
    # Yields every 1 second

# 4. WebSocket broadcasts to frontend
async def telemetry_websocket(websocket: WebSocket):
    """Accept the socket and push one combined telemetry message per second."""
    await websocket.accept()
    # stream_all() requires the list of devices to sample as its first argument.
    devices = gpu_manager.list_devices()
    async for metrics in telemetry.stream_all(devices, interval=1.0):
        await websocket.send_json(metrics)

Profile Management Flow

# 1. Load profile from YAML
profile_manager = ProfileManager()
profile = profile_manager.load(Path("configs/balanced.yaml"))
# Returns: ProfileConfig(name="Balanced", core_offset=100, ...)

# 2. Validate profile
# Pydantic automatically validates during load

# 3. Apply profile to GPU
profile_manager.apply(gpu0, profile)
# Internally calls:
#   - clock_controller.set_clock_offset(gpu0, profile.core_offset, profile.memory_offset)
#   - fan_controller.apply_curve(gpu0, profile.fan_curve)
#     (or fan_controller.enable_auto(gpu0) when the profile has no fan curve)

# 4. Save current settings as profile
current_profile = profile_manager.capture(gpu0)
# Path does not expand "~" by itself; expanduser() resolves it to the home directory.
profile_manager.save(current_profile, Path("~/.config/nvidia-oc/profiles/my-profile.yaml").expanduser())

Component Implementation Details

GPUManager (core/gpu.py)

Purpose: Enumerate and manage GPU device handles.

Key Methods:

class GPUManager:
    """Enumerate NVIDIA GPUs through NVML and hand out their device records.

    NVML is initialized when the manager is created and shut down when the
    manager is finalized, but only if that initialization succeeded.
    """

    def __init__(self):
        """Initialize NVML once at startup and cache the device list."""
        # Set before nvmlInit() so __del__ stays safe if initialization raises.
        self._nvml_ready = False
        pynvml.nvmlInit()
        self._nvml_ready = True
        self._devices: List[GPUDevice] = []
        self._refresh_devices()

    def list_devices(self) -> List[GPUDevice]:
        """Return all NVIDIA GPUs.

        A copy is returned so callers cannot mutate the manager's cache.
        """
        return list(self._devices)

    def get_device(self, index: int) -> GPUDevice:
        """Get specific GPU by index.

        Raises:
            ValueError: if ``index`` is negative or not below the device count.
        """
        # Reject negatives explicitly: Python would otherwise treat -1 as
        # "the last GPU" instead of reporting a bad index.
        if not 0 <= index < len(self._devices):
            raise ValueError(f"GPU index {index} out of range")
        return self._devices[index]

    def _refresh_devices(self) -> None:
        """Query NVML for all GPU devices and replace the cached list."""
        devices = []
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = self._to_text(pynvml.nvmlDeviceGetName(handle))
            uuid = self._to_text(pynvml.nvmlDeviceGetUUID(handle))
            devices.append(GPUDevice(index=i, name=name, uuid=uuid, handle=handle))
        # Assign in one step so a repeated refresh never accumulates duplicates.
        self._devices = devices

    @staticmethod
    def _to_text(value) -> str:
        """Decode ``bytes`` to ``str``; pass ``str`` through unchanged.

        Depending on the binding version, NVML string getters return either
        type, so both are accepted here.
        """
        return value.decode() if isinstance(value, bytes) else value

    def __del__(self):
        """Shutdown NVML on cleanup, if this instance initialized it."""
        if getattr(self, "_nvml_ready", False):
            self._nvml_ready = False
            try:
                pynvml.nvmlShutdown()
            except pynvml.NVMLError:
                # A finalizer must not raise; nothing useful can be done here.
                pass

Error Handling:

  • Catch pynvml.NVMLError and wrap in custom exceptions
  • Handle missing NVIDIA driver gracefully
  • Retry initialization when the failure is transient

ClockController (core/clock.py)

Purpose: Read and modify GPU clock offsets.

Key Methods:

class ClockController:
    """Reads clock speeds through NVML and writes offsets through nvidia-settings."""

    def get_clocks(self, device: GPUDevice) -> ClockInfo:
        """Return the current graphics, memory and SM clocks in MHz."""

        def read(clock_type):
            return pynvml.nvmlDeviceGetClockInfo(device.handle, clock_type)

        return ClockInfo(
            core=read(pynvml.NVML_CLOCK_GRAPHICS),
            memory=read(pynvml.NVML_CLOCK_MEM),
            shader=read(pynvml.NVML_CLOCK_SM),
        )

    def set_clock_offset(self, device: GPUDevice, core: int, memory: int) -> None:
        """Validate and apply core/memory clock offsets (requires Coolbits enabled).

        Raises whatever validate_clock_offset raises for a rejected offset, and
        subprocess.CalledProcessError when nvidia-settings exits non-zero.
        """
        for offset, domain in ((core, "core"), (memory, "memory")):
            validate_clock_offset(offset, domain)

        # Note: NVML doesn't provide direct offset API
        # Must use nvidia-settings via subprocess:
        core_assignment = f"[gpu:{device.index}]/GPUGraphicsClockOffsetAllPerformanceLevels={core}"
        memory_assignment = f"[gpu:{device.index}]/GPUMemoryTransferRateOffsetAllPerformanceLevels={memory}"
        command = ["nvidia-settings", "-a", core_assignment, "-a", memory_assignment]
        subprocess.run(command, check=True)

    def reset_clocks(self, device: GPUDevice) -> None:
        """Return both offsets to zero, i.e. the default clocks."""
        self.set_clock_offset(device, core=0, memory=0)

Important Notes:

  • NVML can READ clocks but cannot WRITE offsets
  • Writes must go through the nvidia-settings command, which requires a running X11 server (real or virtual)
  • Offsets persist across reboots if saved in nvidia-settings
  • Coolbits must be enabled in Xorg config

FanController (core/fan.py)

Purpose: Control GPU fan speeds.

Key Methods:

class FanController:
    """Read and control GPU fan speed, manually or from a temperature curve."""

    # Curve-monitor tasks keyed by GPU index. Held at class level so that:
    #   * the task keeps a strong reference (the event loop only holds a weak
    #     one, so an unreferenced task can be garbage-collected mid-run), and
    #   * a monitor started through one controller instance can be replaced or
    #     stopped through another instance.
    _curve_tasks = {}

    def get_fan_speed(self, device: GPUDevice) -> int:
        """Get current fan speed as percentage."""
        return pynvml.nvmlDeviceGetFanSpeed(device.handle)  # Returns 0-100

    def set_fan_speed(self, device: GPUDevice, speed: int) -> None:
        """Set manual fan speed (0-100%).

        Raises whatever validate_fan_speed raises for a rejected speed.
        """
        validate_fan_speed(speed)
        pynvml.nvmlDeviceSetFanSpeed_v2(device.handle, 0, speed)  # 0 = fan index

    def apply_curve(self, device: GPUDevice, curve: FanCurve) -> None:
        """Apply temperature-based fan curve.

        Starts a background task that re-evaluates the curve every 5 seconds.
        Any monitor already running for the same GPU is cancelled first, so
        two curves never fight over one fan.

        Raises:
            ValueError: if ``curve`` has no points.
            RuntimeError: if called without a running asyncio event loop.
        """
        points = self._sorted_points(curve)
        # Look the loop up before creating the coroutine so a missing loop
        # fails cleanly instead of leaving a never-awaited coroutine behind.
        loop = asyncio.get_running_loop()
        self._cancel_curve(device)

        task = loop.create_task(self._curve_monitor(device, points))
        self._curve_tasks[device.index] = task

        def _forget(finished, index=device.index):
            # Drop the registry entry only if it still refers to this task;
            # a newer monitor may already have taken the slot.
            if self._curve_tasks.get(index) is finished:
                del self._curve_tasks[index]

        task.add_done_callback(_forget)

    async def _curve_monitor(self, device: GPUDevice, curve: FanCurve) -> None:
        """Background task to apply fan curve."""
        while True:
            temp = pynvml.nvmlDeviceGetTemperature(device.handle, pynvml.NVML_TEMPERATURE_GPU)
            target_speed = self._interpolate_curve(temp, curve)
            self.set_fan_speed(device, target_speed)
            await asyncio.sleep(5)  # Update every 5 seconds

    def _interpolate_curve(self, temp: int, curve: FanCurve) -> int:
        """Linear interpolation between curve points.

        Points are (temperature threshold, fan speed) pairs and may be given
        in any order. Below the first threshold the first speed is returned;
        at or above the last threshold the last speed is returned.

        Raises:
            ValueError: if ``curve`` has no points.
        """
        points = self._sorted_points(curve)
        for i, (temp_threshold, fan_speed) in enumerate(points):
            if temp < temp_threshold:
                if i == 0:
                    return fan_speed
                prev_temp, prev_speed = points[i - 1]
                ratio = (temp - prev_temp) / (temp_threshold - prev_temp)
                # round() rather than int(): truncation would turn a float
                # result such as 56.67 into 56.
                return round(prev_speed + ratio * (fan_speed - prev_speed))
        return points[-1][1]  # Max speed if beyond all thresholds

    @staticmethod
    def _sorted_points(curve):
        """Return the curve's points ordered by temperature; reject an empty curve."""
        points = sorted(curve, key=lambda point: point[0])
        if not points:
            raise ValueError("fan curve must contain at least one point")
        return points

    def _cancel_curve(self, device: GPUDevice) -> None:
        """Stop the curve monitor registered for this GPU, if there is one."""
        task = self._curve_tasks.pop(device.index, None)
        if task is not None:
            task.cancel()

    def enable_auto(self, device: GPUDevice) -> None:
        """Re-enable automatic fan control."""
        # Stop any curve monitor first, otherwise its next tick would switch
        # the fan back to a manual speed.
        self._cancel_curve(device)
        pynvml.nvmlDeviceSetDefaultFanSpeed_v2(device.handle, 0)

Fan Curve Algorithm:

  • Linear interpolation between defined points
  • Example: Temp=67°C, curve=[(60,50), (70,70)] → speed = 50 + (67-60)/(70-60) * (70-50) = 64%

TelemetryCollector (core/telemetry.py)

Purpose: Collect and stream GPU metrics.

Key Methods:

class TelemetryCollector:
    """Collect GPU metrics from NVML, once or as an endless async stream."""

    def collect(self, device: GPUDevice) -> GPUMetrics:
        """Collect all metrics for a GPU.

        Power is reported in watts and memory in MiB (bytes // 1024 // 1024).
        """
        handle = device.handle
        # Query memory once so used/total come from the same NVML snapshot
        # (and to avoid a second driver call per sample).
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        bytes_per_mib = 1024 * 1024

        return GPUMetrics(
            timestamp=time.time(),
            temperature=pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU),
            fan_speed=pynvml.nvmlDeviceGetFanSpeed(handle),
            power_draw=pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0,  # mW to W
            core_clock=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),
            memory_clock=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
            utilization=pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
            memory_used=memory.used // bytes_per_mib,  # Bytes to MiB
            memory_total=memory.total // bytes_per_mib,
        )

    async def stream(self, device: GPUDevice, interval: float) -> AsyncGenerator[GPUMetrics, None]:
        """Stream metrics at specified interval.

        Yields a fresh snapshot, then sleeps ``interval`` seconds, forever;
        the consumer ends the stream by no longer iterating.
        """
        while True:
            yield self.collect(device)
            await asyncio.sleep(interval)

    async def stream_all(self, devices: List[GPUDevice], interval: float) -> AsyncGenerator[Dict, None]:
        """Stream metrics for all GPUs.

        Each item is a dict with a ``"timestamp"`` and a ``"gpus"`` list that
        holds one ``asdict`` snapshot per device, in the order of ``devices``.
        """
        while True:
            metrics = {
                "timestamp": time.time(),
                "gpus": [asdict(self.collect(device)) for device in devices]
            }
            yield metrics
            await asyncio.sleep(interval)

ProfileManager (core/profile.py)

Purpose: Load, validate, and apply profiles.

Key Methods:

class ProfileManager:
    """Load, save, apply and capture overclocking profiles."""

    def load(self, path: Path) -> ProfileConfig:
        """Load profile from YAML file.

        Raises:
            ValueError: if the file does not contain a YAML mapping (for
                example, when it is empty). Validation errors raised by
                ProfileConfig itself propagate unchanged.
        """
        with open(Path(path).expanduser(), "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        # safe_load returns None for an empty document; fail with a clear
        # message instead of a TypeError from ``**None``.
        if not isinstance(data, dict):
            raise ValueError(f"Profile {path} must contain a YAML mapping")
        return ProfileConfig(**data)  # Pydantic validation

    def save(self, profile: ProfileConfig, path: Path) -> None:
        """Save profile to YAML file.

        ``~`` in ``path`` is expanded and missing parent directories are
        created.
        """
        target = Path(path).expanduser()
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "w", encoding="utf-8") as f:
            # safe_dump keeps the file readable by load(): plain yaml.dump
            # tags tuples as !!python/tuple, which safe_load refuses to parse.
            yaml.safe_dump(profile.dict(), f)

    def apply(self, device: GPUDevice, profile: ProfileConfig) -> None:
        """Apply profile to GPU.

        Sets the clock offsets, then either starts the profile's fan curve or,
        when the profile has none, hands the fan back to automatic control.
        """
        # NOTE(review): profile.power_limit is not applied anywhere in this
        # method — confirm whether that is intentional.
        clock_controller = ClockController()
        fan_controller = FanController()

        # Apply clock offsets
        clock_controller.set_clock_offset(
            device,
            core=profile.core_offset,
            memory=profile.memory_offset
        )

        # Apply fan curve, or fall back to automatic fan control
        if profile.fan_curve is not None:
            fan_controller.apply_curve(device, profile.fan_curve)
        else:
            fan_controller.enable_auto(device)

    def capture(self, device: GPUDevice) -> ProfileConfig:
        """Capture current GPU settings as profile.

        The returned profile currently holds fixed placeholder values only;
        nothing is read from ``device``.
        """
        # NOTE(review): the fan speed and power limit are not captured yet —
        # decide how they should be stored in ProfileConfig.
        return ProfileConfig(
            name="Custom",
            core_offset=0,  # Note: NVML can't read offsets, only absolute clocks
            memory_offset=0,
            power_limit=100,
            fan_curve=None,
        )

CLI Implementation (cli/main.py)

Framework: Click

Structure:

# Root command group; subcommands attach themselves with @cli.command().
@click.group()
@click.version_option()
def cli():
    """NVIDIA GPU Overclocking Tool"""

@cli.command()
@click.option("--watch", is_flag=True, help="Live monitoring mode")
def status(watch: bool):
    """Show GPU status"""
    if not watch:
        # One-time status
        console.print(generate_table())
        return

    # Use Rich Live display. Live only re-renders the renderable it was last
    # given, so a fresh table has to be pushed on every tick.
    with Live(generate_table(), refresh_per_second=1) as live:
        try:
            while True:
                time.sleep(1)
                live.update(generate_table())
        except KeyboardInterrupt:
            # Ctrl+C is the normal way to leave watch mode; exit quietly.
            pass

@cli.command("set-clock")
@click.option("--gpu", type=int, required=True)
@click.option("--core", type=int, required=True)
@click.option("--memory", type=int, required=True)
def set_clock(gpu: int, core: int, memory: int):
    """Set clock offsets"""
    # Show a spinner while the device is looked up and the offsets are written.
    with console.status(f"Applying clocks to GPU {gpu}..."):
        # Keep the manager bound to a name so it outlives the offset call.
        gpu_manager = GPUManager()
        target = gpu_manager.get_device(gpu)
        ClockController().set_clock_offset(target, core, memory)
    console.print(f"[green]✓[/green] Clocks applied: core={core:+d} MHz, memory={memory:+d} MHz")

API Implementation (api/main.py)

Framework: FastAPI

Structure:

app = FastAPI(title="NVIDIA OC API", version="0.1.0")

# Initialize GPU manager at startup
# The handler publishes both objects as module-level globals so the route
# handlers can use them.
# NOTE(review): recent FastAPI releases favor lifespan handlers over
# on_event("startup") — confirm against the FastAPI version in use.
@app.on_event("startup")
async def startup():
    global gpu_manager, telemetry_collector
    gpu_manager = GPUManager()
    telemetry_collector = TelemetryCollector()

@app.get("/api/gpus")
async def list_gpus():
    """List all GPUs"""
    # Expose only the JSON-friendly identity fields of each device.
    summaries = []
    for device in gpu_manager.list_devices():
        summaries.append({"index": device.index, "name": device.name, "uuid": device.uuid})
    return summaries

@app.post("/api/gpus/{gpu_id}/clock")
async def set_clock(gpu_id: int, request: ClockRequest):
    """Set clock offsets"""
    from fastapi import HTTPException

    # get_device raises ValueError for an unknown index; report that as
    # "not found" rather than letting it surface as a 500 response.
    try:
        device = gpu_manager.get_device(gpu_id)
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    # NOTE(review): errors from offset validation and from nvidia-settings
    # still propagate as 500 responses — map them once their types are settled.
    controller = ClockController()
    controller.set_clock_offset(device, request.core, request.memory)
    return {"status": "success", "core": request.core, "memory": request.memory}

@app.websocket("/ws/telemetry")
async def telemetry_websocket(websocket: WebSocket):
    """Stream live telemetry"""
    from fastapi import WebSocketDisconnect

    await websocket.accept()
    devices = gpu_manager.list_devices()
    try:
        async for metrics in telemetry_collector.stream_all(devices, interval=1.0):
            await websocket.send_json(metrics)
    except WebSocketDisconnect:
        # The client closed the connection; ending the handler stops the stream.
        pass

Frontend Implementation (frontend/src/App.tsx)

Framework: React 19

Structure:

// Renders one card per GPU. This component sits below the providers so that
// useToast() runs inside <ToastProvider> (assuming the hook reads that
// provider's context — calling it in App itself would run outside the provider).
const Dashboard = () => {
  const { gpus, loading, updateClock } = useGPUData();
  const { metrics } = useWebSocket('ws://localhost:8000/ws/telemetry');
  const { showToast } = useToast();

  const handleClockUpdate = async (gpuId: number, core: number, memory: number) => {
    try {
      await updateClock(gpuId, core, memory);
      showToast('Clock offsets applied', 'success');
    } catch (error) {
      // A caught value is not guaranteed to be an Error instance.
      const message = error instanceof Error ? error.message : String(error);
      showToast(`Error: ${message}`, 'error');
    }
  };

  // NOTE(review): the /api/gpus endpoint in this guide returns `index`, not
  // `id` — confirm that useGPUData maps one onto the other.
  return (
    <Container size="2xl">
      {loading && <Skeleton />}
      {gpus.map(gpu => (
        <GPUCard
          key={gpu.id}
          gpu={gpu}
          metrics={metrics?.gpus?.[gpu.id]}
          onClockUpdate={handleClockUpdate}
        />
      ))}
    </Container>
  );
};

// Application root: sets up theming and toasts, then renders the dashboard.
export const App = () => (
  <ThemeProvider initialTheme={cyberpunkAdapter}>
    <ToastProvider>
      <Dashboard />
    </ToastProvider>
  </ThemeProvider>
);

Testing Strategy

Unit Tests (pytest)

Mock NVML:

@pytest.fixture
def mock_nvml(monkeypatch):
    """Replace the NVML calls GPUManager makes with canned responses.

    The functions are patched on the real ``pynvml`` module object: a bare
    string target such as ``"pynvml"`` is rejected by ``monkeypatch.setattr``,
    which requires a dotted ``"module.attribute"`` path.
    """
    import pynvml

    fakes = {
        "nvmlInit": lambda: None,
        "nvmlShutdown": lambda: None,
        "nvmlDeviceGetCount": lambda: 2,
        "nvmlDeviceGetHandleByIndex": lambda idx: f"handle_{idx}",
        "nvmlDeviceGetName": lambda handle: b"RTX 3090",
        "nvmlDeviceGetUUID": lambda handle: b"GPU-00000000-0000-0000-0000-000000000000",
        # ... mock other methods
    }
    for name, fake in fakes.items():
        monkeypatch.setattr(pynvml, name, fake)

def test_gpu_manager(mock_nvml):
    """GPUManager reports every device exposed by the mocked NVML."""
    manager = GPUManager()
    names = [device.name for device in manager.list_devices()]
    assert len(names) == 2
    assert names[0] == "RTX 3090"

Integration Tests

API Testing:

from fastapi.testclient import TestClient

def test_list_gpus():
    """GET /api/gpus answers 200 with at least one GPU."""
    # Using the client as a context manager runs the app's startup handlers,
    # which create the gpu_manager global the endpoint reads.
    with TestClient(app) as client:
        response = client.get("/api/gpus")
    assert response.status_code == 200
    assert len(response.json()) > 0

def test_set_clock():
    """POST /api/gpus/0/clock answers 200 and reports success."""
    # Using the client as a context manager runs the app's startup handlers,
    # which create the gpu_manager global the endpoint reads.
    with TestClient(app) as client:
        response = client.post("/api/gpus/0/clock", json={"core": 100, "memory": 500})
    assert response.status_code == 200
    assert response.json()["status"] == "success"

Stress Tests

24-Hour Burn-In:

# Apply performance profile
nvidia-oc profile apply performance

# Run ML training workload (the trailing & runs it in the background)
python train.py --epochs 100 &

# Monitor in separate terminal
nvidia-oc status --watch

# Check for crashes after 24 hours
# Acceptable: 0 crashes, 0 CUDA errors

Last Updated: 2026-01-14