platform-operations/content-strategy/scripts/parse-excerpts.ts

#!/usr/bin/env bun

import { readFileSync } from 'fs';
import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
import { Database } from 'bun:sqlite';

// ES modules have no built-in __dirname; derive this script's directory from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Data directory (relative to this script) that contains citations.db.
const dataDir = resolve(__dirname, '../src/data');
// Markdown file that main() reads and hands to parseExcerptsFile().
const excerptsPath = resolve(__dirname, '../docs/meta/citation-excerpts.md');
// SQLite database file that main() opens.
const dbPath = resolve(dataDir, 'citations.db');

/**
 * Writes one line to standard output.
 *
 * @param message Text to print; a trailing newline is appended.
 */
function log(message: string): void {
  const line = message + '\n';
  process.stdout.write(line);
}

/**
 * A general (non-claim) blockquote found in a citation's `## ` section,
 * before any `### Claim: ` subsection.
 */
interface ParsedExcerpt {
  /** Trimmed heading text of the enclosing `## ` section. */
  citationId: string;
  /** Blockquote lines with their `>` markers removed, joined with single spaces. */
  text: string;
  /** Value of a `- **Page**: …` line that follows the quote, if one was captured. */
  page?: string;
  /** Value of a `- **Context**: …` line that follows the quote, if one was captured. */
  context?: string;
}

/**
 * The blockquote text found under a `### Claim: ` subsection of a citation's
 * `## ` section.
 */
interface ParsedClaimExcerpt {
  /** Trimmed heading text of the enclosing `## ` section. */
  citationId: string;
  /** Trimmed text that follows `### Claim: ` on the subsection's heading line. */
  claimLabel: string;
  /** All blockquote lines in the subsection, `>` markers removed, joined with single spaces. */
  text: string;
}

/** Matches a `- **Page**: <value>` metadata line; group 1 is the value. */
const PAGE_LINE = /^-\s+\*\*Page\*\*:\s*(.+)/;
/** Matches a `- **Context**: <value>` metadata line; group 1 is the value. */
const CONTEXT_LINE = /^-\s+\*\*Context\*\*:\s*(.+)/;

/**
 * Extracts the content of a single Markdown blockquote line.
 *
 * @returns The trimmed text after `> `, an empty string for a bare `>` line
 *          (paragraph break inside a quote), or `null` when the line is not
 *          part of a blockquote.
 */
function readBlockquoteLine(line: string): string | null {
  if (line.startsWith('> ')) return line.slice(2).trim();
  if (line.startsWith('>') && line.trim() === '>') return '';
  return null;
}

/**
 * Collects the general excerpts of one citation section.
 *
 * Each run of blockquote lines becomes one excerpt. `- **Page**:` and
 * `- **Context**:` lines that follow a quote are attached to it; blank lines
 * between the quote and its metadata are tolerated. Any other line, or the
 * start of the next quote, closes the current excerpt.
 *
 * @param citationId Heading text of the section the lines belong to.
 * @param lines      Section lines after the heading, up to the first claim subsection.
 */
function parseGeneralExcerpts(citationId: string, lines: readonly string[]): ParsedExcerpt[] {
  const found: ParsedExcerpt[] = [];
  let buffer: string[] = [];
  let page: string | undefined;
  let context: string | undefined;
  // True once a non-quote line has been seen after the buffered quote, so a
  // later `>` line starts a new excerpt instead of extending the old one.
  let quoteEnded = false;

  const flush = (): void => {
    const text = buffer.join(' ').trim();
    // A quote made only of bare `>` lines carries no text; do not emit it.
    if (text) found.push({ citationId, text, page, context });
    buffer = [];
    page = undefined;
    context = undefined;
    quoteEnded = false;
  };

  for (const line of lines) {
    const quoted = readBlockquoteLine(line);
    if (quoted !== null) {
      if (quoteEnded) flush();
      buffer.push(quoted);
      continue;
    }

    // Lines outside a quote only matter while a quote is waiting for metadata.
    if (buffer.length === 0) continue;
    quoteEnded = true;

    // Blank lines may separate the quote from its Page/Context list.
    if (line.trim() === '') continue;

    const pageValue = PAGE_LINE.exec(line)?.[1];
    if (pageValue !== undefined) {
      page = pageValue.trim();
      continue;
    }
    const contextValue = CONTEXT_LINE.exec(line)?.[1];
    if (contextValue !== undefined) {
      context = contextValue.trim();
      continue;
    }

    // Any other content ends the excerpt.
    flush();
  }
  flush();

  return found;
}

/**
 * Parses one `### Claim: ` subsection (the text after the marker).
 *
 * @returns The claim excerpt, or `null` when the subsection holds no quoted text.
 */
function parseClaimExcerpt(citationId: string, claimPart: string): ParsedClaimExcerpt | null {
  const [labelLine = '', ...body] = claimPart.split('\n');
  const quoted: string[] = [];
  for (const line of body) {
    const piece = readBlockquoteLine(line);
    if (piece !== null) quoted.push(piece);
  }

  const text = quoted.join(' ').trim();
  if (!text) return null;
  return { citationId, claimLabel: labelLine.trim(), text };
}

/**
 * Parses the citation-excerpts Markdown document.
 *
 * Expected layout: every `## <citation-id>` heading opens a section. Blockquotes
 * in the section body are general excerpts, optionally followed by
 * `- **Page**: …` / `- **Context**: …` lines. `### Claim: <label>` subsections
 * hold claim-level blockquotes. Text before the first `## ` heading and
 * sections with an empty heading are ignored.
 *
 * @param content Full Markdown text.
 * @returns General excerpts and claim excerpts in document order.
 */
function parseExcerptsFile(content: string): {
  excerpts: ParsedExcerpt[];
  claimExcerpts: ParsedClaimExcerpt[];
} {
  const excerpts: ParsedExcerpt[] = [];
  const claimExcerpts: ParsedClaimExcerpt[] = [];

  // The first split element is whatever precedes the first `## ` heading.
  for (const section of content.split(/^## /m).slice(1)) {
    const citationId = (section.split('\n', 1)[0] ?? '').trim();
    if (!citationId) continue;

    const [generalPart = '', ...claimParts] = section.split(/^### Claim: /m);

    // Drop the heading line; the rest is the general body of the section.
    excerpts.push(...parseGeneralExcerpts(citationId, generalPart.split('\n').slice(1)));

    for (const claimPart of claimParts) {
      const claim = parseClaimExcerpt(citationId, claimPart);
      if (claim) claimExcerpts.push(claim);
    }
  }

  return { excerpts, claimExcerpts };
}

/**
 * Normalises text for fuzzy comparison: lower-cases it, removes every
 * character that is not an ASCII letter, digit or whitespace, collapses
 * whitespace runs to a single space and trims both ends.
 *
 * @param text Raw text.
 * @returns The normalised form.
 */
function normForFuzzy(text: string): string {
  const lowered = text.toLowerCase();
  const alnumOnly = lowered.replace(/[^a-z0-9\s]/g, '');
  const collapsed = alnumOnly.replace(/\s+/g, ' ');
  return collapsed.trim();
}

/** Minimum fuzzy score a claim must reach to receive a claim excerpt. */
const CLAIM_MATCH_THRESHOLD = 2;
/** Extra score when the claim text contains the start of the normalised label. */
const LABEL_PREFIX_BONUS = 5;
/** Number of leading characters of the normalised label used for the prefix bonus. */
const LABEL_PREFIX_LENGTH = 30;

/**
 * Picks the claim whose text best matches a claim label.
 *
 * Score = number of label words longer than three characters that occur in
 * the normalised claim text, plus a bonus when the claim text contains the
 * label's leading characters. The first claim with the highest score wins.
 *
 * @returns The best candidate with its score, or `null` when `claims` is empty.
 */
function pickBestClaim(
  claimLabel: string,
  claims: readonly { id: number; text: string }[],
): { id: number; score: number } | null {
  const labelNorm = normForFuzzy(claimLabel);
  const labelWords = labelNorm.split(' ').filter((w) => w.length > 3);
  const labelPrefix = labelNorm.slice(0, LABEL_PREFIX_LENGTH);

  let best: { id: number; score: number } | null = null;
  for (const claim of claims) {
    const claimNorm = normForFuzzy(claim.text);
    let score = 0;
    for (const word of labelWords) {
      if (claimNorm.includes(word)) score++;
    }
    // An empty prefix is contained in every string; without this guard a label
    // that normalises to '' would earn the bonus and match the first claim.
    if (labelPrefix.length > 0 && claimNorm.includes(labelPrefix)) {
      score += LABEL_PREFIX_BONUS;
    }
    if (!best || score > best.score) {
      best = { id: claim.id, score };
    }
  }
  return best;
}

/**
 * Script entry point: reads the excerpts Markdown file, inserts its general
 * excerpts into the `excerpts` table and stores claim-level excerpts on the
 * best-matching row of the `claims` table. All writes happen in a single
 * transaction. Does nothing (besides logging) when the Markdown file cannot
 * be read.
 */
function main(): void {
  let content: string;
  try {
    content = readFileSync(excerptsPath, 'utf-8');
  } catch (err: unknown) {
    const code =
      typeof err === 'object' && err !== null && 'code' in err
        ? (err as { code?: unknown }).code
        : undefined;
    if (code === 'ENOENT') {
      log('[parse-excerpts] No citation-excerpts.md found, skipping');
    } else {
      // Still best-effort, but report the real cause instead of "not found".
      const reason = err instanceof Error ? err.message : String(err);
      log(`[parse-excerpts] Could not read citation-excerpts.md (${reason}), skipping`);
    }
    return;
  }

  const db = new Database(dbPath);
  try {
    const { excerpts, claimExcerpts } = parseExcerptsFile(content);

    log(`[parse-excerpts] Parsed ${excerpts.length} general excerpts, ${claimExcerpts.length} claim excerpts`);

    const existingIds = new Set(
      (db.prepare('SELECT id FROM citations').all() as { id: string }[]).map((r) => r.id),
    );

    // Prepare every statement once instead of once per loop iteration.
    const insertExcerpt = db.prepare(
      'INSERT INTO excerpts (citation_id, text, page, context) VALUES (?, ?, ?, ?)',
    );
    const selectClaims = db.prepare(
      'SELECT id, text FROM claims WHERE citation_id = ?',
    );
    const updateClaimExcerpt = db.prepare(
      'UPDATE claims SET excerpt = ? WHERE id = ?',
    );

    let insertedExcerpts = 0;
    let skippedExcerpts = 0;
    let matchedClaims = 0;
    let unmatchedClaims = 0;

    const insertAll = db.transaction(() => {
      for (const excerpt of excerpts) {
        if (!existingIds.has(excerpt.citationId)) {
          log(`[parse-excerpts] Warning: citation ID "${excerpt.citationId}" not found in DB, skipping excerpt`);
          skippedExcerpts++;
          continue;
        }
        insertExcerpt.run(
          excerpt.citationId,
          excerpt.text,
          excerpt.page ?? null,
          excerpt.context ?? null,
        );
        insertedExcerpts++;
      }

      // Match claim excerpts to existing claims via fuzzy text matching
      for (const ce of claimExcerpts) {
        if (!existingIds.has(ce.citationId)) {
          log(`[parse-excerpts] Warning: citation ID "${ce.citationId}" not found in DB, skipping claim excerpt "${ce.claimLabel}"`);
          unmatchedClaims++;
          continue;
        }

        const claims = selectClaims.all(ce.citationId) as { id: number; text: string }[];
        const bestMatch = pickBestClaim(ce.claimLabel, claims);

        if (bestMatch && bestMatch.score >= CLAIM_MATCH_THRESHOLD) {
          updateClaimExcerpt.run(ce.text, bestMatch.id);
          matchedClaims++;
        } else {
          log(`[parse-excerpts] Warning: no claim match for "${ce.claimLabel}" in ${ce.citationId}`);
          unmatchedClaims++;
        }
      }
    });

    insertAll();

    log(`[parse-excerpts] Claim excerpts: ${matchedClaims} matched, ${unmatchedClaims} unmatched`);
    log(`[parse-excerpts] Inserted ${insertedExcerpts} excerpts (${skippedExcerpts} skipped)`);
  } finally {
    // Release the database handle even when parsing or a statement throws.
    db.close();
  }
}

// Script entry point: run immediately when this file is executed.
main();