gpu: call the GPU's vLLM directly (drop the unbuilt model-boss coordinator)
Some checks are pending
CI / verify (push) Waiting to run

There is no model-boss coordinator deployed — the on-demand GPU droplet serves
raw vLLM (/v1). So the client now sends a standard OpenAI request with an explicit
model (caller's, else GPU_LLM_MODEL) and drops the coordinator-only x_task/
x_priority routing fields; health probes vLLM's /health (was /ready). Set
GPU_LLM_MODEL to vLLM's --served-model-name when the GPU is provisioned.

(ChatJsonOpts.task/priority are now passed-but-ignored; symbol/env names
ModelBossClient/MODEL_BOSS_URL kept to avoid a sprawling rename — cosmetic cleanup.)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Natalie 2026-06-30 03:36:09 -04:00
parent 3adc3433d6
commit d0824d7614

View file

@ -4,11 +4,11 @@ import { ConfigService } from '@nestjs/config';
import type { ChatJsonOpts } from './types.js';
/**
* Model-boss coordinator client (OpenAI-compatible gateway at MODEL_BOSS_URL).
*
* Routes strict-JSON chat completions through the coordinator, which resolves the
* concrete model from its TaskRegistry (`prospect.classify` / `prospect.draft`)
* when `model` is empty, lazy-loads it on the GPU, and holds it warm.
* Direct OpenAI-compatible LLM client for the on-demand GPU droplet's **vLLM**
* (`/v1/chat/completions` at MODEL_BOSS_URL). There is no model-boss coordinator
* deployed — vLLM serves ONE model, so the request carries an explicit `model`
* (caller-supplied, else `GPU_LLM_MODEL`); the coordinator-only `x_task`/`x_priority`
* routing fields are not sent, and vLLM's `/health` endpoint (served alongside `/v1/models`) is the probe.
*
* MODEL_BOSS_URL is OPTIONAL — `isEnabled()` is false when it is absent, and the
* GPU enrich callers skip to their existing fast/pastebin fallbacks. An
@ -41,12 +41,15 @@ interface OpenAIChatResponse {
export class ModelBossClient {
private readonly logger = new Logger(ModelBossClient.name);
private readonly url: string | null;
/** vLLM's served model name (vLLM rejects an unknown/empty `model`). */
private readonly model: string | null;
private circuitFailureCount = 0;
private circuitOpenUntil = 0;
/**
 * Reads the GPU endpoint settings from configuration.
 *
 * Blank values (empty or whitespace-only, e.g. `MODEL_BOSS_URL=` in a .env file)
 * are normalized to `null`, so they are treated the same as an unset variable
 * instead of being stored as an empty or whitespace string.
 *
 * @param config Config service supplying `MODEL_BOSS_URL` and `GPU_LLM_MODEL`.
 */
constructor(config: ConfigService) {
// Trim, then strip trailing slashes so `${this.url}/...` joins never yield `//`;
// `|| null` folds an empty result into the "not configured" state.
this.url = config.get<string>('MODEL_BOSS_URL')?.trim().replace(/\/+$/, '') || null;
// A whitespace-only model name is truthy and would otherwise be used verbatim.
this.model = config.get<string>('GPU_LLM_MODEL')?.trim() || null;
}
/** Whether the GPU LLM endpoint (MODEL_BOSS_URL) is configured (the enrich path is reachable). */
@ -70,15 +73,15 @@ export class ModelBossClient {
}
this.assertCircuitClosed();
// vLLM-direct: explicit model (caller's, else the configured served model),
// standard OpenAI fields only — no coordinator x_task/x_priority routing.
const requestBody = {
model: opts.model,
model: opts.model || this.model || '',
messages: [{ role: 'system', content: opts.systemPrompt }, ...opts.messages],
response_format: {
type: 'json_schema',
json_schema: { name: opts.schemaName, schema: opts.schema, strict: true },
},
x_task: opts.task,
x_priority: opts.priority ?? 'normal',
};
let res: Response;
@ -116,13 +119,13 @@ export class ModelBossClient {
}
/**
* Liveness probe against `/ready`. Never throws and never touches the circuit
* breaker — it is a status read, not a load-bearing call.
* Liveness probe against vLLM's `/health`. Never throws and never touches the
* circuit breaker — it is a status read, not a load-bearing call.
*/
async health(timeoutMs: number = HEALTH_TIMEOUT_MS): Promise<boolean> {
if (!this.url) return false;
try {
const res = await fetch(`${this.url}/ready`, { signal: AbortSignal.timeout(timeoutMs) });
const res = await fetch(`${this.url}/health`, { signal: AbortSignal.timeout(timeoutMs) });
return res.ok;
} catch {
return false;