/**
 * Perceptual hashing for cross-site photo identity.
 *
 * sha256 (in images.ts) only catches byte-exact dups; the SAME photo recompressed
 * or resized on two platforms has a different sha256 but a near-identical
 * **dHash**. dHash clustering by Hamming distance is what lets ad-watch say
 * "the Tryst cover is photo-A, which sits at position 4 on Eros".
 *
 * Decode path uses macOS **sips** (house preference — no PIL, no extra deps):
 * sips downscales to a 9×8 24-bit BMP, which we parse directly (uncompressed,
 * trivial header) and reduce to a 64-bit difference hash.
 */
import { execFile } from 'node:child_process';
import { readFile, unlink } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import { randomUUID } from 'node:crypto';

const exec = promisify(execFile);

// dHash works on a (W+1)×H grid; comparing horizontally adjacent columns yields
// W×H bits. 9×8 → 64 bits.
export const DHASH_W = 9;
export const DHASH_H = 8;

/**
 * Parse a 24-bit uncompressed BMP into row-major (top-down) grayscale luminance.
 * BMP rows are bottom-up and BGR, padded to 4-byte boundaries — normalized here.
 *
 * @param buf - Raw bytes of the BMP file.
 * @returns Image dimensions plus `width * height` luminance values
 *   (0.299 R + 0.587 G + 0.114 B), row-major with the top row first.
 * @throws Error if the buffer lacks the `BM` signature or is shorter than the
 *   54-byte header, is not 24 bits per pixel, declares non-positive
 *   dimensions, or is too short to hold the declared pixel data.
 */
export function parseBmp24ToGray(buf: Buffer): { width: number; height: number; gray: number[] } {
  if (buf.length < 54 || buf[0] !== 0x42 || buf[1] !== 0x4d) {
    throw new Error('not a BMP');
  }

  const dataOffset = buf.readUInt32LE(10);
  const width = buf.readInt32LE(18);
  const rawHeight = buf.readInt32LE(22);
  const bpp = buf.readUInt16LE(28);
  if (bpp !== 24) throw new Error(`expected 24bpp BMP, got ${bpp}`);

  // A negative height in the header marks a top-down bitmap.
  const height = Math.abs(rawHeight);
  const bottomUp = rawHeight > 0;
  if (width <= 0 || height === 0) {
    throw new Error(`invalid BMP dimensions ${width}x${rawHeight}`);
  }

  const rowBytes = width * 3;
  const rowPadded = Math.ceil(rowBytes / 4) * 4;

  // Validate before allocating: a truncated file would otherwise read
  // `undefined` bytes and silently produce NaN luminance. Padding after the
  // final stored row is not required to be present.
  const requiredEnd = dataOffset + (height - 1) * rowPadded + rowBytes;
  if (requiredEnd > buf.length) {
    throw new Error(`truncated BMP: need ${requiredEnd} bytes, got ${buf.length}`);
  }

  const gray = new Array<number>(width * height);
  for (let row = 0; row < height; row++) {
    const srcRow = bottomUp ? height - 1 - row : row; // normalize to top-down
    const base = dataOffset + srcRow * rowPadded;
    for (let x = 0; x < width; x++) {
      const p = base + x * 3;
      // Pixels are stored B, G, R. Indices are in range per the check above.
      const b = buf[p]!;
      const g = buf[p + 1]!;
      const r = buf[p + 2]!;
      gray[row * width + x] = 0.299 * r + 0.587 * g + 0.114 * b;
    }
  }
  return { width, height, gray };
}

/**
 * Difference hash: for each row, bit = (col < nextCol) brightness.
 *
 * Bits are appended most-significant first, scanning rows top to bottom and
 * columns left to right, giving `(width - 1) * height` bits in total.
 *
 * @param gray - Row-major luminance values.
 * @param width - Number of columns in `gray`.
 * @param height - Number of rows in `gray`.
 * @returns The hash as a non-negative bigint.
 */
export function dHashFromGray(gray: number[], width: number, height: number): bigint {
  let hash = 0n;
  for (let y = 0; y < height; y++) {
    const rowStart = y * width;
    for (let x = 0; x < width - 1; x++) {
      const left = gray[rowStart + x]!;
      const right = gray[rowStart + x + 1]!;
      hash = (hash << 1n) | (left < right ? 1n : 0n);
    }
  }
  return hash;
}

/** Format a dHash as lowercase hex, zero-padded to at least 16 digits (64 bits). */
export function dHashHex(hash: bigint): string {
  return hash.toString(16).padStart(16, '0');
}

/**
 * Hamming distance between two dHashes (number of differing bits).
 *
 * Both inputs are expected to be non-negative, as produced by `dHashFromGray`.
 */
export function hamming(a: bigint, b: bigint): number {
  let x = a ^ b;
  let count = 0;
  while (x > 0n) {
    x &= x - 1n; // clear the lowest set bit
    count++;
  }
  return count;
}

/**
 * dHash an image file via sips → 9×8 BMP → difference hash.
 *
 * @param path - Path of the image file to hash.
 * @returns The 64-bit difference hash.
 * @throws Error if `sips` fails, the output is not a parseable 24-bit BMP, or
 *   the decoded grid is not `DHASH_W`×`DHASH_H`.
 */
export async function dHashOfFile(path: string): Promise<bigint> {
  const tmp = join(tmpdir(), `adwatch-${randomUUID()}.bmp`);
  try {
    // -z takes HEIGHT then WIDTH (note order); -s format bmp emits 24-bit BMP.
    await exec('sips', ['-s', 'format', 'bmp', '-z', String(DHASH_H), String(DHASH_W), path, '--out', tmp]);
    const buf = await readFile(tmp);
    const { width, height, gray } = parseBmp24ToGray(buf);
    // Any other grid size would yield a hash that is not 64 bits wide.
    if (width !== DHASH_W || height !== DHASH_H) {
      throw new Error(`sips produced ${width}x${height}, expected ${DHASH_W}x${DHASH_H}`);
    }
    return dHashFromGray(gray, width, height);
  } finally {
    // Best-effort cleanup; the temp file may not exist if sips failed.
    await unlink(tmp).catch(() => undefined);
  }
}