diff --git a/src/gpu/model-boss.client.ts b/src/gpu/model-boss.client.ts index 468a101..00612d2 100644 --- a/src/gpu/model-boss.client.ts +++ b/src/gpu/model-boss.client.ts @@ -4,11 +4,11 @@ import { ConfigService } from '@nestjs/config'; import type { ChatJsonOpts } from './types.js'; /** - * Model-boss coordinator client (OpenAI-compatible gateway at MODEL_BOSS_URL). - * - * Routes strict-JSON chat completions through the coordinator, which resolves the - * concrete model from its TaskRegistry (`prospect.classify` / `prospect.draft`) - * when `model` is empty, lazy-loads it on the GPU, and holds it warm. + * Direct OpenAI-compatible LLM client for the on-demand GPU droplet's **vLLM** + * (`/v1/chat/completions` at MODEL_BOSS_URL). There is no model-boss coordinator + * deployed — vLLM serves ONE model, so the request carries an explicit `model` + * (caller-supplied, else `GPU_LLM_MODEL`); the coordinator-only `x_task`/`x_priority` + * routing fields are not sent, and liveness is probed via vLLM's `/health` endpoint. * * MODEL_BOSS_URL is OPTIONAL — `isEnabled()` is false when it is absent, and the * GPU enrich callers skip to their existing fast/pastebin fallbacks. An @@ -41,12 +41,15 @@ interface OpenAIChatResponse { export class ModelBossClient { private readonly logger = new Logger(ModelBossClient.name); private readonly url: string | null; + /** vLLM's served model name (vLLM rejects an unknown/empty `model`). */ + private readonly model: string | null; private circuitFailureCount = 0; private circuitOpenUntil = 0; constructor(config: ConfigService) { this.url = config.get('MODEL_BOSS_URL')?.replace(/\/+$/, '') ?? null; + this.model = config.get('GPU_LLM_MODEL') ?? null; } /** Whether a coordinator is configured (the enrich path is reachable). 
*/ @@ -70,15 +73,15 @@ export class ModelBossClient { } this.assertCircuitClosed(); + // vLLM-direct: explicit model (caller's, else the configured served model), + // standard OpenAI fields only — no coordinator x_task/x_priority routing. const requestBody = { - model: opts.model, + model: opts.model || this.model || '', messages: [{ role: 'system', content: opts.systemPrompt }, ...opts.messages], response_format: { type: 'json_schema', json_schema: { name: opts.schemaName, schema: opts.schema, strict: true }, }, - x_task: opts.task, - x_priority: opts.priority ?? 'normal', }; let res: Response; @@ -116,13 +119,13 @@ export class ModelBossClient { } /** - * Liveness probe against `/ready`. Never throws and never touches the circuit - * breaker — it is a status read, not a load-bearing call. + * Liveness probe against vLLM's `/health`. Never throws and never touches the + * circuit breaker — it is a status read, not a load-bearing call. */ async health(timeoutMs: number = HEALTH_TIMEOUT_MS): Promise { if (!this.url) return false; try { - const res = await fetch(`${this.url}/ready`, { signal: AbortSignal.timeout(timeoutMs) }); + const res = await fetch(`${this.url}/health`, { signal: AbortSignal.timeout(timeoutMs) }); return res.ok; } catch { return false;