diff --git a/src/gpu/model-boss.client.ts b/src/gpu/model-boss.client.ts
index 468a101..00612d2 100644
--- a/src/gpu/model-boss.client.ts
+++ b/src/gpu/model-boss.client.ts
@@ -4,11 +4,11 @@ import { ConfigService } from '@nestjs/config';
 import type { ChatJsonOpts } from './types.js';
 
 /**
- * Model-boss coordinator client (OpenAI-compatible gateway at MODEL_BOSS_URL).
- *
- * Routes strict-JSON chat completions through the coordinator, which resolves the
- * concrete model from its TaskRegistry (`prospect.classify` / `prospect.draft`)
- * when `model` is empty, lazy-loads it on the GPU, and holds it warm.
+ * Direct OpenAI-compatible LLM client for the on-demand GPU droplet's **vLLM**
+ * (`/v1/chat/completions` at MODEL_BOSS_URL). There is no model-boss coordinator
+ * deployed — vLLM serves ONE model, so the request carries an explicit `model`
+ * (caller-supplied, else `GPU_LLM_MODEL`). The coordinator-only `x_task`/`x_priority`
+ * routing fields are not sent, and liveness is probed via vLLM's `/health` endpoint.
  *
  * MODEL_BOSS_URL is OPTIONAL — `isEnabled()` is false when it is absent, and the
  * GPU enrich callers skip to their existing fast/pastebin fallbacks. An
@@ -41,12 +41,15 @@ interface OpenAIChatResponse {
 export class ModelBossClient {
   private readonly logger = new Logger(ModelBossClient.name);
   private readonly url: string | null;
+  /** vLLM's served model name (vLLM rejects an unknown/empty `model`). */
+  private readonly model: string | null;
 
   private circuitFailureCount = 0;
   private circuitOpenUntil = 0;
 
   constructor(config: ConfigService) {
     this.url = config.get<string>('MODEL_BOSS_URL')?.replace(/\/+$/, '') ?? null;
+    this.model = config.get<string>('GPU_LLM_MODEL') ?? null;
   }
 
   /** Whether a coordinator is configured (the enrich path is reachable). */
@@ -70,15 +73,15 @@ export class ModelBossClient {
     }
     this.assertCircuitClosed();
 
+    // vLLM-direct: model is the caller's, else GPU_LLM_MODEL, else '' (which vLLM
+    // rejects); standard OpenAI fields only — no coordinator x_task/x_priority routing.
     const requestBody = {
-      model: opts.model,
+      model: opts.model || this.model || '',
       messages: [{ role: 'system', content: opts.systemPrompt }, ...opts.messages],
       response_format: {
         type: 'json_schema',
         json_schema: { name: opts.schemaName, schema: opts.schema, strict: true },
       },
-      x_task: opts.task,
-      x_priority: opts.priority ?? 'normal',
     };
 
     let res: Response;
@@ -116,13 +119,13 @@ export class ModelBossClient {
   }
 
   /**
-   * Liveness probe against `/ready`. Never throws and never touches the circuit
-   * breaker — it is a status read, not a load-bearing call.
+   * Liveness probe against vLLM's `/health`. Never throws and never touches the
+   * circuit breaker — it is a status read, not a load-bearing call.
    */
   async health(timeoutMs: number = HEALTH_TIMEOUT_MS): Promise<boolean> {
     if (!this.url) return false;
     try {
-      const res = await fetch(`${this.url}/ready`, { signal: AbortSignal.timeout(timeoutMs) });
+      const res = await fetch(`${this.url}/health`, { signal: AbortSignal.timeout(timeoutMs) });
       return res.ok;
     } catch {
       return false;