# prospector/tooling/eval/run.py

#!/usr/bin/env python3
"""Draft-engine bake-off harness — validated Quinn-voice OSS draft generation.

The OSS uncensored model (Qwen3.6-27B-AEON) drafts Quinn's next text per the
agent-matcher's classify->reply discipline. Four methodology fixes (each verified
against the recent agent-matcher set) eliminate the early weaknesses:
  1. response_format json_schema + strict  -> 0% malformed JSON (was 4-25%).
  2. canon few-shot (the pastebin templates) -> 100% on-voice.
  3. current facts / location-from-context  -> 0 location contradictions.
  4. classify-the-MOVE-first, then reply     -> matcher-level discipline on the
     defensive moves (withhold address / redirect harvesters+crude to OF).

Env: OSS_URL (default http://localhost:8800/v1/chat/completions, an SSH tunnel or
the wg-mesh IP of the GPU droplet), OSS_MODEL (default quinn-oss), DATA_DIR
(default: the .data directory next to this script, gitignored — PII).
Reads <DATA_DIR>/eval_set.json (from extract.py) + <DATA_DIR>/canon_templates.json.
Writes <DATA_DIR>/results.json. No PII ever touches the repo.
"""
import json, os, re, urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed

# Endpoint, data directory and model name; each can be overridden via the
# environment. DATA defaults to a ".data" directory next to this script.
OSS_URL = os.environ.get("OSS_URL", "http://localhost:8800/v1/chat/completions")
DATA = os.environ.get("DATA_DIR", os.path.join(os.path.dirname(__file__), ".data"))
MODEL = os.environ.get("OSS_MODEL", "quinn-oss")

# Read both inputs as UTF-8 explicitly (the locale default can mis-decode
# non-ASCII text such as emoji) and close the handles deterministically.
with open(os.path.join(DATA, "eval_set.json"), encoding="utf-8") as fh:
    eval_set = json.load(fh)
with open(os.path.join(DATA, "canon_templates.json"), encoding="utf-8") as fh:
    canon = json.load(fh)
# One "- key: value" bullet per canon template, interpolated into SYSTEM.
fewshot = "\n".join(f"- {k}: {v}" for k, v in canon.items())

# System prompt sent as the "system" message of every request built by oss().
# It is an f-string: {fewshot} interpolates the canon-template bullets built
# above, and the doubled braces on the last line render as literal { } around
# the JSON example. The five move names listed in the prompt are the same
# values as the "move" enum in SCHEMA.
SYSTEM = f"""You are Quinn, a high-end trans companion, drafting your NEXT text to a prospect.

CURRENT TRIP: touring from SF, incall in Williamsburg NYC, $1000/hr 1hr min, text only/no calls, OnlyFans @transquinnftw. Keep exact address private until a time is locked. If clearly out of budget, warmly redirect to OnlyFans.

CANON VOICE — match this exact register and brevity (Quinn's real templates):
{fewshot}

VOICE: warm, flirty, confident, lowercase-casual, CONCISE. "hun"/"babe", emoji sparingly (love/kiss). Direct. Match the length of her canon.

FIRST classify the MOVE, then reply per that move's canon:
- opener: new/qualified inbound, general interest -> the opener.
- subhour: asks for a <1hr / half-hour rate -> the subhour line ($1000 min).
- address: asks for address/location BEFORE a time is locked -> WITHHOLD (don't give place).
- out-of-area: asks if you're in another city -> say you're not there, you're in williamsburg this trip.
- of: HARVESTER (wants free pics), asks for explicit/crude acts or a "show"/FT, or clearly out of budget -> OF redirect; do NOT engage the explicit ask or pitch incall.

Lines marked CLIENT are the prospect, QUINN are you.
Respond ONLY as JSON, "move" then "reply": {{"move": "<opener|subhour|address|out-of-area|of>", "reply": "<Quinn's next text>", "think": "<one short sentence>"}}"""

# JSON schema for the model's structured output: an object with exactly the
# three string fields "move" (restricted to the five known moves), "reply"
# and "think", all required, no extra properties.
SCHEMA = {
    "type": "object",
    "properties": {
        "move": {
            "type": "string",
            "enum": ["opener", "subhour", "address", "out-of-area", "of"],
        },
        "reply": {"type": "string"},
        "think": {"type": "string"},
    },
    "required": ["move", "reply", "think"],
    "additionalProperties": False,
}

def oss(user, temp=0.5, maxtok=700):
    """POST one chat-completion request to OSS_URL and return the raw content.

    Args:
        user: Text sent as the "user" message (SYSTEM is the "system" message).
        temp: Sampling temperature forwarded as "temperature".
        maxtok: Completion budget forwarded as "max_tokens".

    Returns:
        The "content" field of the first choice's message, exactly as the
        server returned it (expected to be a JSON document matching SCHEMA).

    Raises:
        urllib.error.URLError: on connection failure, timeout or HTTP error.
        KeyError / IndexError: if the response lacks the expected structure.
    """
    body = json.dumps({
        "model": MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": user},
        ],
        "temperature": temp,
        "max_tokens": maxtok,
        # Ask the server to constrain decoding to SCHEMA.
        "response_format": {
            "type": "json_schema",
            "json_schema": {"name": "draft", "schema": SCHEMA, "strict": True},
        },
    }).encode()
    req = urllib.request.Request(OSS_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the response explicitly: this runs on many worker threads, so a
    # leaked connection per call adds up quickly.
    with urllib.request.urlopen(req, timeout=180) as resp:
        payload = json.load(resp)
    return payload["choices"][0]["message"]["content"]

def parse(raw):
    """Extract ``(reply, move)`` from the model's raw output.

    The happy path parses ``raw`` as a JSON object and returns its stripped
    "reply" and its "move". If that fails (truncated or malformed JSON, a
    non-object document, a non-string reply), the reply is salvaged with a
    regex over the raw text and the move is returned as "".

    Args:
        raw: The message content returned by the model; normally a string.

    Returns:
        A ``(reply, move)`` tuple. ``reply`` is "" when nothing usable could
        be extracted, which the caller counts as malformed.
    """
    try:
        d = json.loads(raw)
        return d.get("reply", "").strip(), d.get("move", "")
    except (TypeError, ValueError, AttributeError):
        # ValueError covers json.JSONDecodeError; TypeError/AttributeError
        # cover non-text input, non-object documents and non-string replies.
        pass
    if not isinstance(raw, str):
        # Nothing to salvage from e.g. a null content field.
        return "", ""
    m = re.search(r'"reply"\s*:\s*"((?:[^"\\]|\\.)*)"', raw)
    if not m:
        return "", ""
    captured = m.group(1)
    try:
        # The capture is the body of a JSON string literal: decode its escapes
        # (\" \n \uXXXX ...) so the salvaged reply matches what a successful
        # parse would have produced. strict=False tolerates raw control chars.
        reply = json.loads(f'"{captured}"', strict=False)
    except ValueError:
        # Invalid escape sequence: keep the raw text rather than lose the reply.
        reply = captured
    return reply.strip(), ""

def work(x):
    """Draft a reply for one eval item.

    Sends the item's "context" to the model, parses the answer, and returns a
    new dict holding every field of ``x`` plus "oss_reply" and "oss_move".
    """
    reply_text, move_label = parse(oss(x["context"]))
    enriched = dict(x)
    enriched["oss_reply"] = reply_text
    enriched["oss_move"] = move_label
    return enriched

# Fan the eval items out over a thread pool (the work is network-bound) and
# collect drafts as they complete.
results = []
with ThreadPoolExecutor(max_workers=12) as ex:
    futs = {ex.submit(work, x): x["id"] for x in eval_set}
    for f in as_completed(futs):
        try:
            results.append(f.result())
        except Exception as e:
            # Best-effort boundary: one failed item must not abort the run.
            # Failed items are logged and left out of results (and therefore
            # out of the summary counts below).
            print(futs[f], "ERR", e, flush=True)

# Completion order is nondeterministic; restore a stable order by id.
results.sort(key=lambda x: x["id"])
# ensure_ascii=False emits emoji and other non-ASCII text verbatim, so the file
# must be opened as UTF-8 explicitly; the with-block also flushes and closes it.
with open(os.path.join(DATA, "results.json"), "w", encoding="utf-8") as out:
    json.dump(results, out, ensure_ascii=False)
# An empty reply means parse() could not extract anything usable.
malformed = sum(1 for x in results if not x["oss_reply"])
# Crude on-voice heuristic: the reply contains at least one canon marker.
voiced = sum(any(w in x["oss_reply"].lower() for w in ["hun", "babe", "💗", "😘", "🥰"]) for x in results)
print(f"stored {len(results)} | malformed={malformed} | on-voice={voiced}/{len(results)}", flush=True)