lilith-platform.live/codebase/@features/ad-watch/src/classify.ts
Natalie 769bfcd61d feat(ad-watch): plum stdio MCP — scrape ad-platform listings, diff vs canonical
quinn-adwatch: a stateless, plum-local stdio MCP that scrapes Quinn's live
listings on her 11 ad platforms (Eros/Tryst/TS4Rent/MegaPersonals/TSEscorts/
AdultLook/AdultSearch/SkipTheGames + OnlyFans/Fansly/ManyVids) and surfaces
discrepancies vs the canonical provider-config profile.

- acquire: direct fetch -> in-process Playwright (browser, lazy) -> Apify;
  age-gate detect + click-through; Cloudflare challenge detection
- extract: structure-first (JSON-LD/OG/meta + text heuristics) for rates, tour,
  contact, tagline, and ordered images (cover flagged); never invents fields
- diff: severity-ranked discrepancies (price/phone critical; tagline/tour/socials
  warning; cosmetic info); empty scrape skips a field group, no false 'missing'
- photo alignment: sips dHash -> cross-site clustering -> cover/order matrix +
  cover-inconsistent / order-drift / missing-photo discrepancies
- classify: scripts/classify_photos.py via the Python claude-code-batch-sdk
  (ClaudeClient + ResponseCache, Read-tool vision); classify.ts is a thin bridge

Black-independent by design (black + apricot expected to stay down): all deps are
public npm (SDK StdioServerTransport, no @lilith/mcp-common), classify uses the
on-disk Python SDK + local claude CLI, and ADWATCH_CANONICAL_FILE diffs against a
local provider-config snapshot. 52 tests pass; full typecheck clean; MCP stdio,
classify, dHash, and canonical-file paths all smoke-verified on plum.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-26 19:11:33 -04:00

123 lines
4.2 KiB
TypeScript

/**
* Semantic photo labels via the **claude-code-batch-sdk** (Python, at
* ~/Code/@applications/@ml/@packages/@py/claude-code-batch-sdk) — Quinn's batch
* SDK, NOT the TS @lilith/claude-code-sdk and NOT the official Agent SDK.
*
* It's Python (no API key, uses the `claude` CLI with content-addressable
* caching + concurrency), so the classify step is `scripts/classify_photos.py`
* and this module is a thin subprocess bridge: pipe the cluster representatives
* in as JSON, read JSON labels back, coerce, and attach to the report.
*
* Verified on plum (claude CLI + the SDK source on disk, black-independent): the
* script reads an image and returns a valid label.
*/
import { spawn } from 'node:child_process';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
import type { AlignmentReport, PhotoCluster } from './align.js';
import { defaultImageRoot } from './images.js';
import { coerceLabel, type PhotoLabel } from './classify-parse.js';
export { PHOTO_CATEGORIES, type PhotoCategory, type PhotoLabel } from './classify-parse.js';
// Interpreter command used to launch the classifier script. Override with the
// ADWATCH_PYTHON environment variable; `??` only falls back to 'python3' when
// the variable is unset, so an empty string would be used as-is.
const PYTHON = process.env['ADWATCH_PYTHON'] ?? 'python3';
// Value sent to the classifier script as the request's `model` field. Override
// with ADWATCH_CLASSIFY_LEVEL; defaults to 'haiku' when the variable is unset.
const CLASSIFY_LEVEL = process.env['ADWATCH_CLASSIFY_LEVEL'] ?? 'haiku';
/**
 * Absolute path of the Python classifier script, resolved relative to this
 * module's own location: `<module dir>/../scripts/classify_photos.py`.
 */
function scriptPath(): string {
  const thisModule = fileURLToPath(import.meta.url);
  const moduleDir = dirname(thisModule);
  // The scripts/ directory is a sibling of the directory holding this module.
  return join(moduleDir, '..', 'scripts', 'classify_photos.py');
}
/** One entry of the `photos` array sent to the classifier script. */
interface ClassifyRequestPhoto {
/** Id of the photo cluster this entry stands for. */
id: string;
/** Path of the representative image (taken from the cluster's first member). */
path: string;
/**
* Optional; never populated by the code in this module.
* NOTE(review): presumably a SHA-256 digest of the image file — confirm against
* what scripts/classify_photos.py expects.
*/
sha256?: string;
}
/**
 * Run the Python classifier over a request payload and return its JSON rows.
 *
 * Spawns `PYTHON scriptPath()`, writes `payload` to the child's stdin as JSON,
 * and parses the child's stdout as JSON once the child has closed.
 *
 * @param payload - JSON-serialisable request written to the child's stdin.
 * @returns The object entries of the JSON array printed by the script. Output
 *   that is valid JSON but not an array yields `[]`; array entries that are not
 *   plain objects (e.g. `null`, numbers, nested arrays) are dropped so callers
 *   can index every row safely.
 * @throws Rejects with an `Error` when the payload cannot be serialised, the
 *   interpreter cannot be spawned, the script exits non-zero or is killed by a
 *   signal, or its stdout is not valid JSON.
 */
function runPython(payload: unknown): Promise<Array<Record<string, unknown>>> {
  const isRow = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null && !Array.isArray(value);

  return new Promise((resolve, reject) => {
    // Serialise before spawning so an unserialisable payload never leaves a
    // child process waiting on stdin.
    const body = JSON.stringify(payload);
    const proc = spawn(PYTHON, [scriptPath()], { stdio: ['pipe', 'pipe', 'pipe'] });
    let out = '';
    let err = '';
    // Decode at the stream level: stringifying each Buffer chunk on its own
    // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');
    proc.stdout.on('data', (chunk: string) => (out += chunk));
    proc.stderr.on('data', (chunk: string) => (err += chunk));
    proc.stdin.on('error', () => {
      // The child went away before reading all of its input (EPIPE or similar).
      // Without a listener this would surface as an unhandled stream error and
      // take the host process down; the 'close'/'error' handlers below report
      // the real failure (exit code + stderr) instead.
    });
    proc.on('error', (e) => reject(new Error(`failed to spawn ${PYTHON}: ${e.message}`)));
    proc.on('close', (code, signal) => {
      if (code !== 0) {
        // `code` is null when the child was terminated by a signal.
        const how = code === null ? `on signal ${signal}` : String(code);
        reject(new Error(`classify_photos.py exited ${how}: ${err.trim().slice(0, 300)}`));
        return;
      }
      let parsed: unknown;
      try {
        parsed = JSON.parse(out);
      } catch {
        reject(new Error(`classify_photos.py returned non-JSON: ${out.trim().slice(0, 200)}`));
        return;
      }
      resolve(Array.isArray(parsed) ? parsed.filter(isRow) : []);
    });
    proc.stdin.end(body);
  });
}
/**
 * Label every photo cluster by classifying a single representative image.
 *
 * The first member of each cluster is sent to the Python classifier; clusters
 * without members are left out of the request. When nothing is left to
 * classify the classifier is not invoked and `[]` is returned.
 *
 * @param clusters - Photo clusters to label.
 * @param opts - `imageRoot` overrides `defaultImageRoot()`; `cacheDir` is only
 *   forwarded to the classifier when it is set to a non-empty value.
 * @returns One label per row returned by the classifier. A row carrying a
 *   string `error` becomes a placeholder label (category 'portrait', fitness 0,
 *   no face, empty note) with that `error` attached; every other row goes
 *   through `coerceLabel`. Rows without a string `photoId` get the id `''`.
 */
export async function classifyClusters(
  clusters: PhotoCluster[],
  opts: { imageRoot?: string; cacheDir?: string } = {},
): Promise<PhotoLabel[]> {
  const representatives: ClassifyRequestPhoto[] = [];
  for (const cluster of clusters) {
    const first = cluster.members[0];
    if (first) {
      representatives.push({ id: cluster.id, path: first.path });
    }
  }
  if (representatives.length === 0) {
    return [];
  }

  const request: Record<string, unknown> = {
    photos: representatives,
    imageRoot: opts.imageRoot ?? defaultImageRoot(),
    model: CLASSIFY_LEVEL,
  };
  if (opts.cacheDir) {
    request['cacheDir'] = opts.cacheDir;
  }

  const toLabel = (row: Record<string, unknown>): PhotoLabel => {
    const rawId = row['photoId'];
    const photoId = typeof rawId === 'string' ? rawId : '';
    const failure = row['error'];
    if (typeof failure !== 'string') {
      return coerceLabel(photoId, row);
    }
    // The classifier reported a failure for this photo: keep the row, but with
    // neutral placeholder values and the error message attached.
    return {
      photoId,
      category: 'portrait',
      thumbnailFitness: 0,
      faceVisible: false,
      note: '',
      error: failure,
    };
  };

  const rows = await runPython(request);
  return rows.map((row) => toLabel(row));
}
/**
 * Produce a copy of `report` in which each photo whose id matches a label's
 * `photoId` gains a `label` made of that label's `category`,
 * `thumbnailFitness`, `faceVisible` and `note`.
 *
 * Photos without a matching label are passed through untouched, other label
 * fields (such as `error`) are not copied, and when several labels share a
 * `photoId` the last one wins. Neither `report` nor `labels` is mutated.
 */
export function attachLabels(report: AlignmentReport, labels: PhotoLabel[]): AlignmentReport {
  const labelByPhotoId = new Map<string, PhotoLabel>();
  for (const entry of labels) {
    labelByPhotoId.set(entry.photoId, entry);
  }

  const photos = report.photos.map((photo) => {
    const match = labelByPhotoId.get(photo.id);
    if (!match) {
      return photo;
    }
    const { category, thumbnailFitness, faceVisible, note } = match;
    return { ...photo, label: { category, thumbnailFitness, faceVisible, note } };
  });

  return { ...report, photos };
}