gpu: call the GPU's vLLM directly (drop the unbuilt model-boss coordinator)
Some checks are pending
CI / verify (push) Waiting to run
Some checks are pending
CI / verify (push) Waiting to run
There is no model-boss coordinator deployed — the on-demand GPU droplet serves raw vLLM (/v1). So the client now sends a standard OpenAI request with an explicit model (caller's, else GPU_LLM_MODEL) and drops the coordinator-only x_task/x_priority routing fields; health probes vLLM's /health (was /ready). Set GPU_LLM_MODEL to vLLM's --served-model-name when the GPU is provisioned. (ChatJsonOpts.task/priority are now passed-but-ignored; symbol/env names ModelBossClient/MODEL_BOSS_URL kept to avoid a sprawling rename — cosmetic cleanup.) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3adc3433d6
commit
d0824d7614
1 changed files with 14 additions and 11 deletions
|
|
@ -4,11 +4,11 @@ import { ConfigService } from '@nestjs/config';
|
|||
import type { ChatJsonOpts } from './types.js';
|
||||
|
||||
/**
|
||||
* Model-boss coordinator client (OpenAI-compatible gateway at MODEL_BOSS_URL).
|
||||
*
|
||||
* Routes strict-JSON chat completions through the coordinator, which resolves the
|
||||
* concrete model from its TaskRegistry (`prospect.classify` / `prospect.draft`)
|
||||
* when `model` is empty, lazy-loads it on the GPU, and holds it warm.
|
||||
* Direct OpenAI-compatible LLM client for the on-demand GPU droplet's **vLLM**
|
||||
* (`/v1/chat/completions` at MODEL_BOSS_URL). There is no model-boss coordinator
|
||||
* deployed — vLLM serves ONE model, so the request carries an explicit `model`
|
||||
* (caller-supplied, else `GPU_LLM_MODEL`); the coordinator-only `x_task`/`x_priority`
|
||||
* routing fields are not sent, and vLLM's `/health` endpoint is the liveness probe.
|
||||
*
|
||||
* MODEL_BOSS_URL is OPTIONAL — `isEnabled()` is false when it is absent, and the
|
||||
* GPU enrich callers skip to their existing fast/pastebin fallbacks. An
|
||||
|
|
@ -41,12 +41,15 @@ interface OpenAIChatResponse {
|
|||
export class ModelBossClient {
|
||||
private readonly logger = new Logger(ModelBossClient.name);
|
||||
private readonly url: string | null;
|
||||
/** vLLM's served model name (vLLM rejects an unknown/empty `model`). */
|
||||
private readonly model: string | null;
|
||||
|
||||
private circuitFailureCount = 0;
|
||||
private circuitOpenUntil = 0;
|
||||
|
||||
constructor(config: ConfigService) {
|
||||
this.url = config.get<string>('MODEL_BOSS_URL')?.replace(/\/+$/, '') ?? null;
|
||||
this.model = config.get<string>('GPU_LLM_MODEL') ?? null;
|
||||
}
|
||||
|
||||
/** Whether MODEL_BOSS_URL is configured (the enrich path is reachable). */
|
||||
|
|
@ -70,15 +73,15 @@ export class ModelBossClient {
|
|||
}
|
||||
this.assertCircuitClosed();
|
||||
|
||||
// vLLM-direct: explicit model (caller's, else the configured served model),
|
||||
// standard OpenAI fields only — no coordinator x_task/x_priority routing.
|
||||
const requestBody = {
|
||||
model: opts.model,
|
||||
model: opts.model || this.model || '',
|
||||
messages: [{ role: 'system', content: opts.systemPrompt }, ...opts.messages],
|
||||
response_format: {
|
||||
type: 'json_schema',
|
||||
json_schema: { name: opts.schemaName, schema: opts.schema, strict: true },
|
||||
},
|
||||
x_task: opts.task,
|
||||
x_priority: opts.priority ?? 'normal',
|
||||
};
|
||||
|
||||
let res: Response;
|
||||
|
|
@ -116,13 +119,13 @@ export class ModelBossClient {
|
|||
}
|
||||
|
||||
/**
|
||||
* Liveness probe against `/ready`. Never throws and never touches the circuit
|
||||
* breaker — it is a status read, not a load-bearing call.
|
||||
* Liveness probe against vLLM's `/health`. Never throws and never touches the
|
||||
* circuit breaker — it is a status read, not a load-bearing call.
|
||||
*/
|
||||
async health(timeoutMs: number = HEALTH_TIMEOUT_MS): Promise<boolean> {
|
||||
if (!this.url) return false;
|
||||
try {
|
||||
const res = await fetch(`${this.url}/ready`, { signal: AbortSignal.timeout(timeoutMs) });
|
||||
const res = await fetch(`${this.url}/health`, { signal: AbortSignal.timeout(timeoutMs) });
|
||||
return res.ok;
|
||||
} catch {
|
||||
return false;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue