16 KiB
16 KiB
Implementation Guide - How It Should Work
Core Workflow
Initialization Flow
# 1. Initialize NVML
# NOTE: GPUManager() below also calls pynvml.nvmlInit() in its constructor.
pynvml.nvmlInit()
# 2. Enumerate GPUs
gpu_manager = GPUManager()
gpus = gpu_manager.list_devices()
# Returns: [GPUDevice(index=0, name="RTX 3090", ...), GPUDevice(index=1, ...)]
# 3. Get device handles
# get_device() raises ValueError when the index is out of range.
gpu0 = gpu_manager.get_device(0)
Clock Control Flow
# 1. Check current clocks
clock_controller = ClockController()
clocks = clock_controller.get_clocks(gpu0)
# Returns: ClockInfo(core=1815, memory=9501, shader=1815)
# 2. Validate offset
validate_clock_offset(offset=+100, domain="core") # Raises if > 200MHz
# 3. Apply offset
# set_clock_offset() also runs validate_clock_offset() on both values itself.
clock_controller.set_clock_offset(gpu0, core=+100, memory=+500)
# 4. Verify applied
new_clocks = clock_controller.get_clocks(gpu0)
# Returns: ClockInfo(core=1915, memory=10001, ...)
Fan Control Flow
# 1. Check current fan speed
fan_controller = FanController()
current_speed = fan_controller.get_fan_speed(gpu0)
# Returns: 48 (percent)
# 2. Apply manual speed
fan_controller.set_fan_speed(gpu0, speed=70)
# 3. Or apply temperature curve
# Each point is (temperature in °C, fan speed in percent).
fan_curve = [(60, 50), (70, 70), (75, 85), (80, 100)]
fan_controller.apply_curve(gpu0, fan_curve)


# 4. Background task monitors temp and adjusts fan
async def fan_curve_daemon():
    """Re-evaluate the fan curve against the current temperature forever."""
    while True:
        temp = get_temperature(gpu0)
        target_speed = interpolate_curve(temp, fan_curve)
        set_fan_speed(gpu0, target_speed)
        await asyncio.sleep(5)  # Update every 5 seconds
Telemetry Streaming Flow
# 1. Create collector
telemetry = TelemetryCollector()
# 2. Collect one-time snapshot
metrics = telemetry.collect(gpu0)
# Returns: GPUMetrics(temp=75, fan=48, power=367.99, ...)


# 3. Stream continuously
# `async for` is only valid inside a coroutine, so the loop lives in one.
async def print_temperatures():
    """Print the temperature of gpu0 once per streamed sample."""
    async for metrics in telemetry.stream(gpu0, interval=1.0):
        print(f"Temp: {metrics.temperature}°C")
        # Yields every 1 second


# 4. WebSocket broadcasts to frontend
async def telemetry_websocket(websocket: WebSocket):
    """Forward combined telemetry for every GPU to one WebSocket client."""
    await websocket.accept()
    # stream_all() requires the device list as its first argument.
    devices = gpu_manager.list_devices()
    async for metrics in telemetry.stream_all(devices, interval=1.0):
        await websocket.send_json(metrics)
Profile Management Flow
# 1. Load profile from YAML
profile_manager = ProfileManager()
profile = profile_manager.load(Path("configs/balanced.yaml"))
# Returns: ProfileConfig(name="Balanced", core_offset=100, ...)
# 2. Validate profile
# Pydantic automatically validates during load
# 3. Apply profile to GPU
profile_manager.apply(gpu0, profile)
# Internally calls:
# - clock_controller.set_clock_offset(gpu0, profile.core_offset, profile.memory_offset)
# - fan_controller.apply_curve(gpu0, profile.fan_curve)
# 4. Save current settings as profile
current_profile = profile_manager.capture(gpu0)
# expanduser() is required: open() does not expand "~" and would otherwise
# look for a directory literally named "~" under the working directory.
profile_manager.save(
    current_profile,
    Path("~/.config/nvidia-oc/profiles/my-profile.yaml").expanduser(),
)
Component Implementation Details
GPUManager (core/gpu.py)
Purpose: Enumerate and manage GPU device handles.
Key Methods:
class GPUManager:
    """Enumerate NVIDIA GPUs through NVML and hand out device records.

    The constructor initializes NVML and caches one ``GPUDevice`` per GPU;
    NVML is shut down again when the manager is garbage-collected.
    """

    def __init__(self):
        """Initialize NVML once at startup and cache the device list."""
        pynvml.nvmlInit()
        self._devices: List[GPUDevice] = []
        self._refresh_devices()

    def list_devices(self) -> List[GPUDevice]:
        """Return all NVIDIA GPUs.

        A copy is returned so callers cannot mutate the manager's cache.
        """
        return list(self._devices)

    def get_device(self, index: int) -> GPUDevice:
        """Get specific GPU by index.

        Raises:
            ValueError: if ``index`` is negative or not below the GPU count.
        """
        # Reject negatives explicitly: Python would otherwise treat -1 as
        # "last GPU" and silently apply settings to the wrong device.
        if not 0 <= index < len(self._devices):
            raise ValueError(f"GPU index {index} out of range")
        return self._devices[index]

    @staticmethod
    def _as_text(value):
        """Return ``value`` as ``str``, decoding it first if it is ``bytes``."""
        return value.decode() if isinstance(value, bytes) else value

    def _refresh_devices(self) -> None:
        """Query NVML for all GPU devices and replace the cached list."""
        found = []
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = self._as_text(pynvml.nvmlDeviceGetName(handle))
            uuid = self._as_text(pynvml.nvmlDeviceGetUUID(handle))
            found.append(GPUDevice(index=i, name=name, uuid=uuid, handle=handle))
        # Assign in one step so a second refresh never duplicates entries and
        # a failure mid-enumeration leaves the previous list untouched.
        self._devices = found

    def __del__(self):
        """Shutdown NVML on cleanup (best effort)."""
        try:
            pynvml.nvmlShutdown()
        except Exception:
            # Finalizers must not raise: this also runs when nvmlInit() failed
            # in __init__ or during interpreter teardown.
            pass
Error Handling:
- Catch `pynvml.NVMLError` and wrap it in custom exceptions
- Handle a missing NVIDIA driver gracefully
- Retry initialization after a transient failure
ClockController (core/clock.py)
Purpose: Read and modify GPU clock offsets.
Key Methods:
class ClockController:
    """Read GPU clocks via NVML and write clock offsets via nvidia-settings."""

    def get_clocks(self, device: GPUDevice) -> ClockInfo:
        """Return the graphics, memory and SM clocks of ``device`` in MHz."""
        handle = device.handle
        return ClockInfo(
            core=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),
            memory=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
            shader=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM),
        )

    def set_clock_offset(self, device: GPUDevice, core: int, memory: int) -> None:
        """Apply core and memory clock offsets (requires Coolbits enabled).

        Both offsets are validated first. The write itself is delegated to
        the ``nvidia-settings`` CLI; a non-zero exit status raises
        ``subprocess.CalledProcessError`` because of ``check=True``.
        """
        for value, domain in ((core, "core"), (memory, "memory")):
            validate_clock_offset(value, domain)
        # Both offsets go into a single nvidia-settings invocation.
        command = [
            "nvidia-settings",
            "-a", f"[gpu:{device.index}]/GPUGraphicsClockOffsetAllPerformanceLevels={core}",
            "-a", f"[gpu:{device.index}]/GPUMemoryTransferRateOffsetAllPerformanceLevels={memory}",
        ]
        subprocess.run(command, check=True)

    def reset_clocks(self, device: GPUDevice) -> None:
        """Restore default clocks by writing a zero offset for both domains."""
        self.set_clock_offset(device, core=0, memory=0)
Important Notes:
- NVML can READ clocks but cannot WRITE offsets
- Must use the `nvidia-settings` command for writes (requires X11 or virtual X)
- Offsets persist across reboots if saved in nvidia-settings
- Coolbits must be enabled in Xorg config
FanController (core/fan.py)
Purpose: Control GPU fan speeds.
Key Methods:
class FanController:
    """Read and set GPU fan speed, manually or from a temperature curve.

    Only fan index 0 is read and written.
    """

    # Running curve monitors keyed by GPU index. Kept on the class so that
    # separately constructed controllers see (and can cancel) the same tasks,
    # and so the tasks stay strongly referenced while they run.
    _curve_tasks = {}

    def get_fan_speed(self, device: GPUDevice) -> int:
        """Get current fan speed as percentage."""
        speed = pynvml.nvmlDeviceGetFanSpeed(device.handle)
        return speed  # Returns 0-100

    def set_fan_speed(self, device: GPUDevice, speed: int) -> None:
        """Set manual fan speed (0-100%).

        If a curve monitor is running for this GPU it will overwrite the
        value on its next update.
        """
        validate_fan_speed(speed)
        pynvml.nvmlDeviceSetFanSpeed_v2(device.handle, 0, speed)  # 0 = fan index

    def apply_curve(self, device: GPUDevice, curve: FanCurve) -> None:
        """Apply temperature-based fan curve.

        Starts a background task that re-evaluates the curve every 5 seconds,
        replacing any monitor already running for this GPU. Must be called
        while an asyncio event loop is running.

        Raises:
            ValueError: if ``curve`` contains no points.
        """
        if not sorted(curve):
            raise ValueError("fan curve must contain at least one (temp, speed) point")
        # Stop the old monitor first so two tasks never fight over one fan.
        self._cancel_monitor(device)
        task = asyncio.create_task(self._curve_monitor(device, curve))
        self._curve_tasks[device.index] = task

    def _cancel_monitor(self, device: GPUDevice) -> None:
        """Cancel and forget the curve monitor for ``device``, if any."""
        task = self._curve_tasks.pop(device.index, None)
        if task is not None and not task.done():
            task.cancel()

    async def _curve_monitor(self, device: GPUDevice, curve: FanCurve) -> None:
        """Background task to apply fan curve."""
        while True:
            temp = pynvml.nvmlDeviceGetTemperature(device.handle, pynvml.NVML_TEMPERATURE_GPU)
            target_speed = self._interpolate_curve(temp, curve)
            self.set_fan_speed(device, target_speed)
            await asyncio.sleep(5)  # Update every 5 seconds

    def _interpolate_curve(self, temp: int, curve: FanCurve) -> int:
        """Linear interpolation between curve points.

        Points are ``(temperature, speed)`` pairs and are sorted by
        temperature before use. Below the first point the first speed is
        returned; at or above the last point the last speed is returned.

        Raises:
            ValueError: if ``curve`` contains no points.
        """
        points = sorted(curve)
        if not points:
            raise ValueError("fan curve must contain at least one (temp, speed) point")
        for i, (temp_threshold, fan_speed) in enumerate(points):
            if temp < temp_threshold:
                if i == 0:
                    return fan_speed
                prev_temp, prev_speed = points[i - 1]
                # span cannot be 0: equal temperatures would have matched on
                # the previous iteration. Multiply before dividing so exact
                # results are not truncated by floating-point error.
                span = temp_threshold - prev_temp
                return int(prev_speed + (temp - prev_temp) * (fan_speed - prev_speed) / span)
        return points[-1][1]  # Max speed if beyond all thresholds

    def enable_auto(self, device: GPUDevice) -> None:
        """Re-enable automatic fan control.

        Any running curve monitor for this GPU is cancelled first; otherwise
        it would switch the fan back to manual control on its next update.
        """
        self._cancel_monitor(device)
        pynvml.nvmlDeviceSetDefaultFanSpeed_v2(device.handle, 0)
Fan Curve Algorithm:
- Linear interpolation between defined points
- Example: Temp=67°C, curve=[(60,50), (70,70)] → speed = 50 + (67-60)/(70-60) * (70-50) = 64%
TelemetryCollector (core/telemetry.py)
Purpose: Collect and stream GPU metrics.
Key Methods:
class TelemetryCollector:
    """Collect NVML metrics for GPUs, once or as an endless async stream."""

    def collect(self, device: GPUDevice) -> GPUMetrics:
        """Collect all metrics for a GPU.

        Power is converted from milliwatts to watts and memory from bytes to
        whole mebibytes.
        """
        handle = device.handle
        # Query memory once so used and total come from the same reading.
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return GPUMetrics(
            timestamp=time.time(),
            temperature=pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU),
            fan_speed=pynvml.nvmlDeviceGetFanSpeed(handle),
            power_draw=pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0,  # mW to W
            core_clock=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),
            memory_clock=pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
            utilization=pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
            memory_used=memory.used // 1024 // 1024,  # Bytes to MB
            memory_total=memory.total // 1024 // 1024,
        )

    async def stream(self, device: GPUDevice, interval: float) -> AsyncGenerator[GPUMetrics, None]:
        """Stream metrics at specified interval.

        Yields one snapshot immediately, then sleeps ``interval`` seconds
        between snapshots. Never terminates on its own.
        """
        while True:
            yield self.collect(device)
            await asyncio.sleep(interval)

    async def stream_all(self, devices: List[GPUDevice], interval: float) -> AsyncGenerator[Dict, None]:
        """Stream metrics for all GPUs.

        Each item is a dict with a ``"timestamp"`` and a ``"gpus"`` list that
        holds one ``asdict()``-converted snapshot per device, in the order of
        ``devices``. Never terminates on its own.
        """
        while True:
            metrics = {
                "timestamp": time.time(),
                "gpus": [asdict(self.collect(device)) for device in devices],
            }
            yield metrics
            await asyncio.sleep(interval)
ProfileManager (core/profile.py)
Purpose: Load, validate, and apply profiles.
Key Methods:
class ProfileManager:
    """Load, save, apply and capture overclocking profiles."""

    def load(self, path: Path) -> ProfileConfig:
        """Load profile from YAML file.

        Raises:
            ValueError: if the file does not contain a YAML mapping
                (for example, if it is empty).
        """
        # open() does not expand "~", so do it here.
        with open(Path(path).expanduser(), "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        if not isinstance(data, dict):
            # safe_load returns None for an empty file; ProfileConfig(**None)
            # would fail with an unhelpful TypeError.
            raise ValueError(f"Profile {path} must contain a YAML mapping")
        return ProfileConfig(**data)  # Pydantic validation

    def save(self, profile: ProfileConfig, path: Path) -> None:
        """Save profile to YAML file, creating parent directories as needed."""
        target = Path(path).expanduser()
        # A fresh install has no profile directory yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        with open(target, "w", encoding="utf-8") as f:
            yaml.dump(profile.dict(), f)

    def apply(self, device: GPUDevice, profile: ProfileConfig) -> None:
        """Apply profile to GPU.

        Writes the profile's clock offsets, then starts its fan curve, or
        returns the fan to automatic control when the profile has no curve.
        """
        clock_controller = ClockController()
        fan_controller = FanController()
        # Apply clock offsets
        clock_controller.set_clock_offset(
            device,
            core=profile.core_offset,
            memory=profile.memory_offset,
        )
        # Apply fan curve, or fall back to automatic fan control
        if profile.fan_curve is not None:
            fan_controller.apply_curve(device, profile.fan_curve)
        else:
            fan_controller.enable_auto(device)

    def capture(self, device: GPUDevice) -> ProfileConfig:
        """Capture current GPU settings as profile.

        Currently returns a placeholder "Custom" profile with zero offsets
        and no fan curve; nothing is read from ``device``.
        """
        return ProfileConfig(
            name="Custom",
            core_offset=0,  # Note: NVML can't read offsets, only absolute clocks
            memory_offset=0,
            power_limit=100,
            fan_curve=None,
        )
CLI Implementation (cli/main.py)
Framework: Click
Structure:
@click.group()
@click.version_option()
def cli():
    """NVIDIA GPU Overclocking Tool"""
    # Command group only; the subcommands registered on it do the work.
@cli.command()
@click.option("--watch", is_flag=True, help="Live monitoring mode")
def status(watch: bool):
    """Show GPU status"""
    if not watch:
        # One-time status
        console.print(generate_table())
        return
    # Use Rich Live display
    try:
        with Live(generate_table(), refresh_per_second=1) as live:
            while True:
                time.sleep(1)
                # Live only redraws what it was given; push a fresh table
                # each tick or the display stays on the first snapshot.
                live.update(generate_table())
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to leave watch mode.
        pass
@cli.command("set-clock")
@click.option("--gpu", type=int, required=True)
@click.option("--core", type=int, required=True)
@click.option("--memory", type=int, required=True)
def set_clock(gpu: int, core: int, memory: int):
    """Set clock offsets"""
    # Show a spinner while the GPU is looked up and the offsets are written.
    with console.status(f"Applying clocks to GPU {gpu}..."):
        device = GPUManager().get_device(gpu)
        ClockController().set_clock_offset(device, core, memory)
    summary = f"[green]✓[/green] Clocks applied: core={core:+d} MHz, memory={memory:+d} MHz"
    console.print(summary)
API Implementation (api/main.py)
Framework: FastAPI
Structure:
app = FastAPI(title="NVIDIA OC API", version="0.1.0")


# Initialize GPU manager at startup
@app.on_event("startup")
async def startup():
    """Create the module-level GPU manager and telemetry collector."""
    # Route handlers read these as module globals.
    global gpu_manager, telemetry_collector
    gpu_manager = GPUManager()
    telemetry_collector = TelemetryCollector()
@app.get("/api/gpus")
async def list_gpus():
    """List all GPUs"""
    # Expose only serializable fields; the NVML handle stays server-side.
    payload = []
    for device in gpu_manager.list_devices():
        payload.append({"index": device.index, "name": device.name, "uuid": device.uuid})
    return payload
@app.post("/api/gpus/{gpu_id}/clock")
async def set_clock(gpu_id: int, request: ClockRequest):
    """Set clock offsets"""
    # Imported locally so this handler carries its own dependency.
    from fastapi import HTTPException

    try:
        device = gpu_manager.get_device(gpu_id)
    except ValueError as exc:
        # An unknown GPU index is a client error, not a server fault.
        raise HTTPException(status_code=404, detail=str(exc)) from exc
    controller = ClockController()
    controller.set_clock_offset(device, request.core, request.memory)
    return {"status": "success", "core": request.core, "memory": request.memory}
@app.websocket("/ws/telemetry")
async def telemetry_websocket(websocket: WebSocket):
    """Stream live telemetry"""
    # Imported locally so this handler carries its own dependency.
    from fastapi import WebSocketDisconnect

    await websocket.accept()
    devices = gpu_manager.list_devices()
    try:
        async for metrics in telemetry_collector.stream_all(devices, interval=1.0):
            await websocket.send_json(metrics)
    except WebSocketDisconnect:
        # The client closed the connection; stop streaming quietly.
        # NOTE(review): confirm the installed Starlette version raises
        # WebSocketDisconnect (not RuntimeError) from send_json on a closed socket.
        return
Frontend Implementation (frontend/src/App.tsx)
Framework: React 19
Structure:
// Inner component: rendered below ToastProvider so that useToast() can see
// the toast context. NOTE(review): assumes useToast reads context supplied
// by ToastProvider — confirm against the toast library.
const Dashboard = () => {
  const { gpus, loading, updateClock } = useGPUData();
  const { metrics } = useWebSocket('ws://localhost:8000/ws/telemetry');
  const { showToast } = useToast();

  // Apply clock offsets and report the outcome as a toast.
  const handleClockUpdate = async (gpuId: number, core: number, memory: number) => {
    try {
      await updateClock(gpuId, core, memory);
      showToast('Clock offsets applied', 'success');
    } catch (error) {
      // A caught value is not guaranteed to be an Error instance.
      const message = error instanceof Error ? error.message : String(error);
      showToast(`Error: ${message}`, 'error');
    }
  };

  return (
    <Container size="2xl">
      {loading && <Skeleton />}
      {gpus.map(gpu => (
        <GPUCard
          key={gpu.id}
          gpu={gpu}
          // Telemetry may not have arrived yet, so guard both levels.
          metrics={metrics?.gpus?.[gpu.id]}
          onClockUpdate={handleClockUpdate}
        />
      ))}
    </Container>
  );
};

// Root component: sets up theme and toast providers around the dashboard.
export const App = () => (
  <ThemeProvider initialTheme={cyberpunkAdapter}>
    <ToastProvider>
      <Dashboard />
    </ToastProvider>
  </ThemeProvider>
);
Testing Strategy
Unit Tests (pytest)
Mock NVML:
@pytest.fixture
def mock_nvml(monkeypatch):
    """Replace the pynvml module used by GPUManager with an in-memory fake."""

    class MockNVML:
        """Minimal stand-in covering the NVML calls GPUManager makes."""

        def nvmlInit(self): pass
        def nvmlShutdown(self): pass
        def nvmlDeviceGetCount(self): return 2
        def nvmlDeviceGetHandleByIndex(self, idx): return f"handle_{idx}"
        def nvmlDeviceGetName(self, handle): return b"RTX 3090"
        def nvmlDeviceGetUUID(self, handle): return b"GPU-00000000-0000-0000-0000-000000000000"
        # ... mock other methods

    # A string target must be a dotted "module.attribute" path; patch the
    # name where GPUManager looks it up.
    # NOTE(review): "core.gpu" follows the core/gpu.py path in this guide —
    # adjust to the package's real import path.
    monkeypatch.setattr("core.gpu.pynvml", MockNVML())
def test_gpu_manager(mock_nvml):
    """GPUManager reports every GPU exposed by the mocked NVML."""
    found = GPUManager().list_devices()
    assert len(found) == 2
    assert found[0].name == "RTX 3090"
Integration Tests
API Testing:
from fastapi.testclient import TestClient
def test_list_gpus():
    """GET /api/gpus returns a non-empty list of GPUs."""
    # Entering the client as a context manager runs the app's startup
    # handler, which is what creates gpu_manager.
    with TestClient(app) as client:
        response = client.get("/api/gpus")
    assert response.status_code == 200
    assert len(response.json()) > 0
def test_set_clock():
    """POST /api/gpus/0/clock applies the offsets and reports success."""
    # Entering the client as a context manager runs the app's startup
    # handler, which is what creates gpu_manager.
    with TestClient(app) as client:
        response = client.post("/api/gpus/0/clock", json={"core": 100, "memory": 500})
    assert response.status_code == 200
    assert response.json()["status"] == "success"
Stress Tests
24-Hour Burn-In:
# Apply performance profile
nvidia-oc profile apply performance
# Run ML training workload ('&' runs it in the background)
python train.py --epochs 100 &
# Monitor in separate terminal
nvidia-oc status --watch
# Check for crashes after 24 hours
# Acceptable: 0 crashes, 0 CUDA errors
Last Updated: 2026-01-14