prospector/tooling/eval/extract.py
Natalie 19c578bead
Some checks are pending
CI / verify (push) Waiting to run
feat(prospector): add tooling/eval draft-engine bake-off harness
Validated OSS (Qwen3.6-27B-AEON-Uncensored) Quinn-voice drafting against the
agent-matcher reply-queue baseline. Four methodology fixes eliminate the early
weaknesses: json_schema strict (0% malformed), canon few-shot (100% on-voice),
current-facts/location-from-context (0 location errors), and classify-move-first
then reply (matcher-level discipline on defensive moves: withhold address,
redirect harvesters+crude to OF). PII stays under gitignored .data/; scripts
only. Claude is the offline judge/advisor, never the runtime generator.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 01:47:56 -04:00

61 lines
3 KiB
Python

#!/usr/bin/env python3
"""Build the (pseudonymized) eval set from the agent-matcher reply-queue + chat.db.
Source of truth for the recent set = the Executor agent-matcher reply-queue
(handles + classified cat/tmpl + the matcher's drafted reply, the baseline we
score against). Full conversation context is pulled from the local Messages
chat.db. PII handling: phone-number handles are replaced with RQ_NN pseudonyms;
the pseudonym->handle map stays local (*.local.json, gitignored) and is NEVER
sent off the laptop. Conversation text is required (it's the model input) and is
written only under DATA_DIR (gitignored).
Env: REPLY_QUEUE (path to the matcher reply-queue json), DATA_DIR (default: a .data directory next to this script).
Writes <DATA_DIR>/eval_set.json, canon_templates.json, handle_map.local.json.
"""
import sqlite3, json, os
# Fallback locations used when the corresponding environment variable is unset.
_DEFAULT_DATA_DIR = os.path.join(os.path.dirname(__file__), ".data")
_DEFAULT_REPLY_QUEUE = os.path.expanduser(
    "~/Documents/Claude/Projects/Executor/prospecting/reply-queue-2026-06-28.json"
)

# Output directory for every artifact this script writes; created up front so
# the later writes cannot fail on a missing folder.
DATA = os.environ.get("DATA_DIR", _DEFAULT_DATA_DIR)
os.makedirs(DATA, exist_ok=True)

# Matcher reply-queue JSON that the script reads as its input.
REPLY_QUEUE = os.environ.get("REPLY_QUEUE", _DEFAULT_REPLY_QUEUE)

# Local Messages database the conversation history is read from.
CHATDB = os.path.expanduser("~/Library/Messages/chat.db")
def decode_attr(data):
    """Pull the text out of a Messages attributedBody blob (typedstream).

    Locates the first ``NSString`` class name, skips to the ``0x2b`` value
    marker that follows it, reads the length prefix and decodes that many
    bytes as UTF-8 (undecodable bytes are replaced).

    Args:
        data: Raw ``attributedBody`` bytes, or ``None``/empty.

    Returns:
        The stripped message text, or ``None`` when the blob is empty,
        truncated, has no ``NSString`` payload, or holds only whitespace.
    """
    if not data:
        return None
    try:
        idx = data.index(b"NSString")
        p = data.index(b"\x2b", idx) + 1  # the 0x2b value-marker after the class name
        length = data[p]  # IndexError when the blob ends right after the marker
    except (ValueError, IndexError):
        return None
    p += 1

    # A length byte of 0x81 / 0x82 is not the length itself: it announces a
    # 2-byte / 4-byte little-endian length that follows.
    width = {0x81: 2, 0x82: 4}.get(length)
    if width is not None:
        field = data[p:p + width]
        if len(field) < width:  # blob truncated inside the length field
            return None
        length = int.from_bytes(field, "little")
        p += width

    return data[p:p + length].decode("utf-8", "replace").strip() or None
# Load the matcher reply-queue and persist its canonical templates.
# Files are opened via context managers so handles are flushed and closed
# deterministically; UTF-8 is explicit so output does not depend on the locale.
with open(REPLY_QUEUE, encoding="utf-8") as fh:
    rq = json.load(fh)
with open(os.path.join(DATA, "canon_templates.json"), "w", encoding="utf-8") as fh:
    json.dump(rq.get("templates", {}), fh)

eval_set, mapping = [], {}

# Open chat.db read-only (mode=ro) so this script can never modify it.
db = sqlite3.connect(f"file:{CHATDB}?mode=ro", uri=True)
try:
    for i, item in enumerate(rq["queue"], 1):
        # Replace the real handle with a positional pseudonym (RQ_01, RQ_02, ...).
        handle, pid = item["to"], f"RQ_{i:02d}"
        rows = db.execute(
            "SELECT m.is_from_me,m.text,m.attributedBody FROM message m "
            "JOIN handle hd ON m.handle_id=hd.ROWID WHERE hd.id=? ORDER BY m.date ASC", (handle,)).fetchall()

        # Prefer the plain text column; fall back to decoding attributedBody.
        turns = [{"who": "quinn" if is_me else "client", "text": (text or "").strip() or decode_attr(ab)}
                 for is_me, text, ab in rows]

        # Used whenever chat.db yields no usable client-terminated history.
        fallback = [{"who": "client", "text": item["their_last"]}]
        turns = [t for t in turns if t["text"]] or fallback
        while turns and turns[-1]["who"] == "quinn":  # end on the client's turn
            turns.pop()
        # If every recovered turn was Quinn's, trimming leaves nothing; use the
        # queue's own record of the client's last message instead of an empty context.
        turns = turns or fallback

        # Keep only the most recent 14 turns as the model input.
        ctx = "\n".join(f"{t['who'].upper()}: {t['text']}" for t in turns[-14:])
        eval_set.append({"id": pid, "context": ctx, "their_last": item["their_last"],
                         "cat": item["cat"], "tmpl": item["tmpl"], "matcher_reply": item["body"]})
        mapping[pid] = handle
finally:
    db.close()

# eval_set.json keeps non-ASCII characters verbatim (ensure_ascii=False), so
# the file must be opened as UTF-8.
with open(os.path.join(DATA, "eval_set.json"), "w", encoding="utf-8") as fh:
    json.dump(eval_set, fh, ensure_ascii=False)
# Pseudonym -> real handle map, written alongside the other outputs in DATA.
with open(os.path.join(DATA, "handle_map.local.json"), "w", encoding="utf-8") as fh:
    json.dump(mapping, fh)
print(f"eval set: {len(eval_set)} convos -> {DATA}/eval_set.json (pseudonymized; map kept local)")