platform-operations/content-strategy/scripts/parse-excerpts.ts

221 lines
6.3 KiB
TypeScript

#!/usr/bin/env bun
import { readFileSync } from 'fs';
import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
import { Database } from 'bun:sqlite';
// Directory containing this script, derived from the module's own URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Data directory, resolved relative to this script's location.
const dataDir = resolve(__dirname, '../src/data');
// Markdown file holding the citation excerpts that this script parses.
const excerptsPath = resolve(__dirname, '../docs/meta/citation-excerpts.md');
// SQLite database file inside the data directory; opened by main().
const dbPath = resolve(dataDir, 'citations.db');
/**
 * Writes a single line to standard output.
 *
 * @param message Text to print; a trailing newline is appended.
 */
function log(message: string): void {
  const line = message + '\n';
  process.stdout.write(line);
}
/** A citation-level excerpt parsed from the general part of a `## ` section. */
interface ParsedExcerpt {
/** Trimmed heading text of the enclosing `## ` section. */
citationId: string;
/** Blockquote lines joined with single spaces, then trimmed. */
text: string;
/** Value of a `- **Page**: ...` line following the blockquote, if any. */
page?: string;
/** Value of a `- **Context**: ...` line following the blockquote, if any. */
context?: string;
}
/** An excerpt parsed from a `### Claim: ` subsection of a citation section. */
interface ParsedClaimExcerpt {
/** Trimmed heading text of the enclosing `## ` section. */
citationId: string;
/** Trimmed text that follows `### Claim: ` on the subsection's heading line. */
claimLabel: string;
/** All blockquote lines of the subsection joined with single spaces, then trimmed. */
text: string;
}
/**
 * Extracts the content of a single Markdown blockquote line.
 *
 * @param line One line of the excerpts file, without its line terminator.
 * @returns The trimmed text after a leading `> `, an empty string for a bare
 *   `>` line, or `null` when the line is not a blockquote line.
 */
function blockquoteContent(line: string): string | null {
  if (line.startsWith('> ')) return line.slice(2).trim();
  if (line.startsWith('>') && line.trim() === '>') return '';
  return null;
}

/**
 * Parses the citation-excerpts Markdown document.
 *
 * Expected layout:
 * - Every `## <citationId>` heading opens a citation section; text before the
 *   first such heading is ignored, as are sections with an empty heading.
 * - Inside a section, each run of blockquote lines before the first
 *   `### Claim: ` heading is one general excerpt. `- **Page**: ...` and
 *   `- **Context**: ...` lines are attached to the excerpt only while its
 *   blockquote is still open, i.e. directly after the quote lines; any other
 *   non-quote line (including a blank line) closes the excerpt.
 * - Every `### Claim: <label>` subsection yields at most one claim excerpt
 *   built from all blockquote lines found in that subsection.
 *
 * Quote lines are joined with a single space. A bare `>` line contributes an
 * empty segment, so quoted paragraphs end up separated by two spaces.
 * Blockquotes whose joined text is empty (only bare `>` lines) are skipped
 * rather than emitted as empty excerpts.
 *
 * @param content Full text of the excerpts Markdown file.
 * @returns The general excerpts and the claim-level excerpts, in document order.
 */
function parseExcerptsFile(content: string): {
  excerpts: ParsedExcerpt[];
  claimExcerpts: ParsedClaimExcerpt[];
} {
  const pagePattern = /^-\s+\*\*Page\*\*:\s*(.+)/;
  const contextPattern = /^-\s+\*\*Context\*\*:\s*(.+)/;

  const excerpts: ParsedExcerpt[] = [];
  const claimExcerpts: ParsedClaimExcerpt[] = [];

  // Element 0 of the split is whatever precedes the first `## ` heading.
  for (const section of content.split(/^## /m).slice(1)) {
    const citationId = (section.split('\n')[0] ?? '').trim();
    if (!citationId) continue;

    // Everything before the first claim heading holds the general excerpts.
    const [generalPart = '', ...claimParts] = section.split(/^### Claim: /m);

    let quoteLines: string[] = [];
    let page: string | undefined;
    let context: string | undefined;

    // Emits the pending blockquote (if any) and resets the per-excerpt state.
    const flush = (): void => {
      if (quoteLines.length === 0) return;
      const text = quoteLines.join(' ').trim();
      // A quote made only of bare `>` lines carries no text; drop it.
      if (text) excerpts.push({ citationId, text, page, context });
      quoteLines = [];
      page = undefined;
      context = undefined;
    };

    // slice(1) skips the heading line that carries the citation ID.
    for (const line of generalPart.split('\n').slice(1)) {
      const quoted = blockquoteContent(line);
      if (quoted !== null) {
        quoteLines.push(quoted);
        continue;
      }
      // Metadata and terminators only matter while a blockquote is open.
      if (quoteLines.length === 0) continue;

      const pageMatch = pagePattern.exec(line);
      if (pageMatch) {
        page = pageMatch[1]?.trim();
        continue;
      }
      const contextMatch = contextPattern.exec(line);
      if (contextMatch) {
        context = contextMatch[1]?.trim();
        continue;
      }
      flush();
    }
    flush();

    for (const claimPart of claimParts) {
      const [labelLine = '', ...bodyLines] = claimPart.split('\n');
      const quoted = bodyLines
        .map((line) => blockquoteContent(line))
        .filter((piece): piece is string => piece !== null);
      const text = quoted.join(' ').trim();
      // Skip subsections without any quoted text so that an empty excerpt
      // can never overwrite a stored one downstream.
      if (!text) continue;
      claimExcerpts.push({ citationId, claimLabel: labelLine.trim(), text });
    }
  }

  return { excerpts, claimExcerpts };
}
/**
 * Normalizes text for fuzzy comparison.
 *
 * Lowercases the input, removes every character that is not `a-z`, `0-9` or
 * whitespace, collapses whitespace runs to a single space and trims the ends.
 *
 * @param text Raw text to normalize.
 * @returns The normalized text.
 */
function normForFuzzy(text: string): string {
  const lowered = text.toLowerCase();
  const alphanumericOnly = lowered.replace(/[^a-z0-9\s]/g, '');
  const singleSpaced = alphanumericOnly.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/**
 * Scores how well a claim's normalized text matches a normalized claim label.
 *
 * One point is awarded for every label word contained in the claim text, plus
 * a bonus of five when the claim text contains the first 30 characters of the
 * label.
 *
 * @param labelNorm Normalized claim label.
 * @param labelWords Words of `labelNorm` that take part in word matching.
 * @param claimNorm Normalized claim text.
 * @returns The match score; higher is better.
 */
function scoreClaimMatch(
  labelNorm: string,
  labelWords: readonly string[],
  claimNorm: string,
): number {
  let score = 0;
  for (const word of labelWords) {
    if (claimNorm.includes(word)) score++;
  }
  // `includes('')` is always true, so an empty label must not earn the prefix
  // bonus; otherwise it would match the first claim of any citation.
  const labelPrefix = labelNorm.slice(0, 30);
  if (labelPrefix.length > 0 && claimNorm.includes(labelPrefix)) score += 5;
  return score;
}

/**
 * Script entry point: loads the excerpts Markdown file into the database.
 *
 * Steps:
 * 1. Read the excerpts file; if it cannot be read, log a notice and return.
 * 2. Parse it into general excerpts and claim-level excerpts.
 * 3. In a single transaction, insert every general excerpt whose citation ID
 *    exists in `citations`, and store each claim excerpt on the best-matching
 *    row of `claims` for its citation (a score of at least 2 is required).
 *
 * The database handle is closed even when a query throws.
 */
function main(): void {
  let content: string;
  try {
    content = readFileSync(excerptsPath, 'utf-8');
  } catch {
    log('[parse-excerpts] No citation-excerpts.md found, skipping');
    return;
  }

  const db = new Database(dbPath);
  try {
    const { excerpts, claimExcerpts } = parseExcerptsFile(content);
    log(`[parse-excerpts] Parsed ${excerpts.length} general excerpts, ${claimExcerpts.length} claim excerpts`);

    const existingIds = new Set(
      (db.prepare('SELECT id FROM citations').all() as { id: string }[]).map((r) => r.id),
    );

    // Prepare every statement once instead of once per excerpt.
    const insertExcerpt = db.prepare(
      'INSERT INTO excerpts (citation_id, text, page, context) VALUES (?, ?, ?, ?)',
    );
    const selectClaims = db.prepare(
      'SELECT id, text FROM claims WHERE citation_id = ?',
    );
    const updateClaimExcerpt = db.prepare(
      'UPDATE claims SET excerpt = ? WHERE id = ?',
    );

    let insertedExcerpts = 0;
    let skippedExcerpts = 0;
    let matchedClaims = 0;
    let unmatchedClaims = 0;

    const insertAll = db.transaction(() => {
      for (const excerpt of excerpts) {
        if (!existingIds.has(excerpt.citationId)) {
          log(`[parse-excerpts] Warning: citation ID "${excerpt.citationId}" not found in DB, skipping excerpt`);
          skippedExcerpts++;
          continue;
        }
        insertExcerpt.run(
          excerpt.citationId,
          excerpt.text,
          excerpt.page ?? null,
          excerpt.context ?? null,
        );
        insertedExcerpts++;
      }

      // Match claim excerpts to existing claims via fuzzy text matching.
      for (const ce of claimExcerpts) {
        if (!existingIds.has(ce.citationId)) {
          unmatchedClaims++;
          continue;
        }
        const claims = selectClaims.all(ce.citationId) as { id: number; text: string }[];
        const labelNorm = normForFuzzy(ce.claimLabel);
        // Only words longer than three characters count towards the score.
        const labelWords = labelNorm.split(' ').filter((w) => w.length > 3);

        let bestMatch: { id: number; score: number } | null = null;
        for (const claim of claims) {
          const score = scoreClaimMatch(labelNorm, labelWords, normForFuzzy(claim.text));
          // Strict comparison keeps the earliest claim on ties.
          if (!bestMatch || score > bestMatch.score) {
            bestMatch = { id: claim.id, score };
          }
        }

        if (bestMatch && bestMatch.score >= 2) {
          updateClaimExcerpt.run(ce.text, bestMatch.id);
          matchedClaims++;
        } else {
          log(`[parse-excerpts] Warning: no claim match for "${ce.claimLabel}" in ${ce.citationId}`);
          unmatchedClaims++;
        }
      }
    });
    insertAll();

    log(`[parse-excerpts] Claim excerpts: ${matchedClaims} matched, ${unmatchedClaims} unmatched`);
    log(`[parse-excerpts] Inserted ${insertedExcerpts} excerpts (${skippedExcerpts} skipped)`);
  } finally {
    db.close();
  }
}
// Run the import as soon as this file is executed.
main();