lilith-platform.live/codebase/@features/ad-watch/src/phash.ts
Natalie 769bfcd61d feat(ad-watch): plum stdio MCP — scrape ad-platform listings, diff vs canonical
quinn-adwatch: a stateless, plum-local stdio MCP that scrapes Quinn's live
listings on her 11 ad platforms (Eros/Tryst/TS4Rent/MegaPersonals/TSEscorts/
AdultLook/AdultSearch/SkipTheGames + OnlyFans/Fansly/ManyVids) and surfaces
discrepancies vs the canonical provider-config profile.

- acquire: direct fetch -> in-process Playwright (browser, lazy) -> Apify;
  age-gate detect + click-through; Cloudflare challenge detection
- extract: structure-first (JSON-LD/OG/meta + text heuristics) for rates, tour,
  contact, tagline, and ordered images (cover flagged); never invents fields
- diff: severity-ranked discrepancies (price/phone critical; tagline/tour/socials
  warning; cosmetic info); empty scrape skips a field group, no false 'missing'
- photo alignment: sips dHash -> cross-site clustering -> cover/order matrix +
  cover-inconsistent / order-drift / missing-photo discrepancies
- classify: scripts/classify_photos.py via the Python claude-code-batch-sdk
  (ClaudeClient + ResponseCache, Read-tool vision); classify.ts is a thin bridge

Black-independent by design (black + apricot expected to stay down): all deps are
public npm (SDK StdioServerTransport, no @lilith/mcp-common), classify uses the
on-disk Python SDK + local claude CLI, and ADWATCH_CANONICAL_FILE diffs against a
local provider-config snapshot. 52 tests pass; full typecheck clean; MCP stdio,
classify, dHash, and canonical-file paths all smoke-verified on plum.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-26 19:11:33 -04:00

101 lines
3.6 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Perceptual hashing for cross-site photo identity.
*
* sha256 (in images.ts) only catches byte-exact dups; the SAME photo recompressed
* or resized on two platforms has a different sha256 but a near-identical
* **dHash**. dHash clustering by Hamming distance is what lets ad-watch say
* "the Tryst cover is photo-A, which sits at position 4 on Eros".
*
* Decode path uses macOS **sips** (house preference — no PIL, no extra deps):
* sips downscales to a 9×8 24-bit BMP, which we parse directly (uncompressed,
* trivial header) and reduce to a 64-bit difference hash.
*/
import { execFile } from 'node:child_process';
import { readFile, unlink } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import { randomUUID } from 'node:crypto';
// Grid fed to the difference hash. Each row compares horizontally adjacent
// samples, so a (W+1)×H grid produces W×H bits: 9 columns × 8 rows → 64 bits.
export const DHASH_W = 9;
export const DHASH_H = 8;

// Promise-returning wrapper around child_process.execFile.
const exec = promisify(execFile);
/**
 * Parse a 24-bit uncompressed BMP into row-major (top-down) grayscale luminance.
 *
 * BMP pixel rows are BGR, padded to 4-byte boundaries, and stored bottom-up when
 * the header height is positive (top-down when it is negative) — all of which is
 * normalized here. Header fields are read at fixed offsets: pixel-data offset at
 * byte 10, width at 18, height at 22, bits-per-pixel at 28. The compression
 * field is not inspected.
 *
 * @param buf - Entire BMP file contents.
 * @returns The pixel dimensions and `width * height` luminance values
 *   (`0.299 R + 0.587 G + 0.114 B`), top row first.
 * @throws Error if the buffer is shorter than 54 bytes or lacks the `BM` magic,
 *   if the image is not 24 bits per pixel, if either dimension is not positive,
 *   or if the pixel data described by the header extends past the end of `buf`.
 */
export function parseBmp24ToGray(buf: Buffer): { width: number; height: number; gray: number[] } {
  if (buf.length < 54 || buf[0] !== 0x42 || buf[1] !== 0x4d) {
    throw new Error('not a BMP');
  }
  const dataOffset = buf.readUInt32LE(10);
  const width = buf.readInt32LE(18);
  const rawHeight = buf.readInt32LE(22);
  const bpp = buf.readUInt16LE(28);
  if (bpp !== 24) throw new Error(`expected 24bpp BMP, got ${bpp}`);

  const height = Math.abs(rawHeight);
  if (width <= 0 || height === 0) {
    throw new Error(`invalid BMP dimensions ${width}x${rawHeight}`);
  }

  const bottomUp = rawHeight > 0;
  const rowBytes = width * 3;
  const rowPadded = Math.ceil(rowBytes / 4) * 4;

  // Every stored row must lie inside the buffer; otherwise out-of-range reads
  // would silently turn into NaN luminance. The final row's padding is allowed
  // to be absent, so only its pixel bytes are required.
  const pixelEnd = dataOffset + (height - 1) * rowPadded + rowBytes;
  if (pixelEnd > buf.length) {
    throw new Error('truncated BMP pixel data');
  }

  const gray = new Array<number>(width * height);
  for (let row = 0; row < height; row++) {
    const srcRow = bottomUp ? height - 1 - row : row; // normalize to top-down
    const base = dataOffset + srcRow * rowPadded;
    for (let x = 0; x < width; x++) {
      const p = base + x * 3;
      // Channels are stored blue, green, red.
      const b = buf.readUInt8(p);
      const g = buf.readUInt8(p + 1);
      const r = buf.readUInt8(p + 2);
      gray[row * width + x] = 0.299 * r + 0.587 * g + 0.114 * b;
    }
  }
  return { width, height, gray };
}
/**
 * Difference hash over a row-major luminance grid.
 *
 * Rows are scanned top to bottom and, within a row, left to right. Each pair of
 * horizontally adjacent samples contributes one bit — 1 when the left sample is
 * strictly darker than the right one — and bits are appended at the low end, so
 * the first comparison ends up as the most significant bit.
 *
 * @param gray - Luminance samples, `width` per row.
 * @param width - Samples per row; yields `width - 1` bits per row.
 * @param height - Number of rows.
 * @returns The accumulated bits as a non-negative bigint.
 */
export function dHashFromGray(gray: number[], width: number, height: number): bigint {
  let bits = 0n;
  for (let row = 0; row < height; row++) {
    const rowStart = row * width;
    for (let col = 1; col < width; col++) {
      const prev = gray[rowStart + col - 1]!;
      const next = gray[rowStart + col]!;
      bits <<= 1n;
      if (prev < next) {
        bits |= 1n;
      }
    }
  }
  return bits;
}
/**
 * Render a hash as lowercase hexadecimal, left-padded with zeros to at least
 * 16 characters (the width of a 64-bit value). Longer values are not truncated.
 */
export function dHashHex(hash: bigint): string {
  const hex = hash.toString(16);
  const missing = 16 - hex.length;
  return missing > 0 ? '0'.repeat(missing) + hex : hex;
}
/**
 * Hamming distance between two dHashes: the number of bit positions in which
 * they differ.
 *
 * Counting only proceeds while the XOR of the inputs is positive, so the
 * inputs are expected to be non-negative hashes.
 */
export function hamming(a: bigint, b: bigint): number {
  let diff = a ^ b;
  let distance = 0;
  while (diff > 0n) {
    // Clear the lowest set bit; one iteration per differing bit.
    diff &= diff - 1n;
    distance += 1;
  }
  return distance;
}
/**
 * Compute the dHash of an image file: sips → DHASH_W×DHASH_H BMP → difference hash.
 *
 * The image is converted by the `sips` command-line tool into a uniquely named
 * temporary BMP under the OS temp directory, which is then read, parsed and
 * hashed. The temporary file is removed afterwards whether or not hashing
 * succeeded; a failure to remove it is ignored.
 *
 * @param path - Path of the image to hash.
 * @returns The difference hash of the downscaled image.
 * @throws If `sips` fails, the temporary file cannot be read, or the output is
 *   rejected by the BMP parser.
 */
export async function dHashOfFile(path: string): Promise<bigint> {
  const bmpPath = join(tmpdir(), `adwatch-${randomUUID()}.bmp`);
  // -z takes <height> <width> (note the order); -s format bmp is expected to
  // produce the 24-bit BMP that parseBmp24ToGray requires.
  const sipsArgs = [
    '-s',
    'format',
    'bmp',
    '-z',
    String(DHASH_H),
    String(DHASH_W),
    path,
    '--out',
    bmpPath,
  ];
  try {
    await exec('sips', sipsArgs);
    const parsed = parseBmp24ToGray(await readFile(bmpPath));
    return dHashFromGray(parsed.gray, parsed.width, parsed.height);
  } finally {
    // Best-effort cleanup: the file may not exist if sips failed.
    await unlink(bmpPath).catch(() => undefined);
  }
}