gpu: call the GPU's vLLM directly (drop the unbuilt model-boss coordinator)

There is no model-boss coordinator deployed — the on-demand GPU droplet serves raw vLLM (/v1). So the client now sends a standard OpenAI request with an explicit model (the caller's, else GPU_LLM_MODEL) and drops the coordinator-only x_task/x_priority routing fields; the health check now probes vLLM's /health (was /ready). Set GPU_LLM_MODEL to vLLM's --served-model-name when the GPU is provisioned. (ChatJsonOpts.task/priority are still accepted but ignored; the symbol/env names ModelBossClient/MODEL_BOSS_URL are kept to avoid a sprawling rename — renaming them would be purely cosmetic cleanup.) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 03:36:09 -04:00 · 2026-06-30 03:36:09 -04:00 · d0824d7614
commit d0824d7614
parent 3adc3433d6
1 changed files with 14 additions and 11 deletions
--- a/src/gpu/model-boss.client.ts
+++ b/src/gpu/model-boss.client.ts
@@ -4,11 +4,11 @@ import { ConfigService } from '@nestjs/config';
 import type { ChatJsonOpts } from './types.js';

 /**
- * Model-boss coordinator client (OpenAI-compatible gateway at MODEL_BOSS_URL).
- *
- * Routes strict-JSON chat completions through the coordinator, which resolves the
- * concrete model from its TaskRegistry (`prospect.classify` / `prospect.draft`)
- * when `model` is empty, lazy-loads it on the GPU, and holds it warm.
+ * Direct OpenAI-compatible LLM client for the on-demand GPU droplet's **vLLM**
+ * (`/v1/chat/completions` at MODEL_BOSS_URL). There is no model-boss coordinator
+ * deployed — vLLM serves ONE model, so the request carries an explicit `model`
+ * (caller-supplied, else `GPU_LLM_MODEL`); the coordinator-only `x_task`/`x_priority`
+ * routing fields are not sent, and liveness is probed via vLLM's `/health` endpoint.
 *
 * MODEL_BOSS_URL is OPTIONAL — `isEnabled()` is false when it is absent, and the
 * GPU enrich callers skip to their existing fast/pastebin fallbacks. An
@@ -41,12 +41,15 @@ interface OpenAIChatResponse {
 export class ModelBossClient {
  private readonly logger = new Logger(ModelBossClient.name);
  private readonly url: string | null;
+  /** vLLM's served model name (vLLM rejects an unknown/empty `model`). */
+  private readonly model: string | null;

  private circuitFailureCount = 0;
  private circuitOpenUntil = 0;

  constructor(config: ConfigService) {
    this.url = config.get<string>('MODEL_BOSS_URL')?.replace(/\/+$/, '') ?? null;
+    this.model = config.get<string>('GPU_LLM_MODEL') ?? null;
  }

  /** Whether a coordinator is configured (the enrich path is reachable). */
@@ -70,15 +73,15 @@ export class ModelBossClient {
    }
    this.assertCircuitClosed();

+    // vLLM-direct: explicit model (caller's, else the configured served model),
+    // standard OpenAI fields only — no coordinator x_task/x_priority routing.
    const requestBody = {
-      model: opts.model,
+      model: opts.model || this.model || '',
      messages: [{ role: 'system', content: opts.systemPrompt }, ...opts.messages],
      response_format: {
        type: 'json_schema',
        json_schema: { name: opts.schemaName, schema: opts.schema, strict: true },
      },
-      x_task: opts.task,
-      x_priority: opts.priority ?? 'normal',
    };

    let res: Response;
@@ -116,13 +119,13 @@ export class ModelBossClient {
  }

  /**
-   * Liveness probe against `/ready`. Never throws and never touches the circuit
-   * breaker — it is a status read, not a load-bearing call.
+   * Liveness probe against vLLM's `/health`. Never throws and never touches the
+   * circuit breaker — it is a status read, not a load-bearing call.
   */
  async health(timeoutMs: number = HEALTH_TIMEOUT_MS): Promise<boolean> {
    if (!this.url) return false;
    try {
-      const res = await fetch(`${this.url}/ready`, { signal: AbortSignal.timeout(timeoutMs) });
+      const res = await fetch(`${this.url}/health`, { signal: AbortSignal.timeout(timeoutMs) });
      return res.ok;
    } catch {
      return false;