#!/usr/bin/env bun
|
|
|
|
import { readFileSync } from 'fs';
|
|
import { resolve, dirname } from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import { Database } from 'bun:sqlite';
|
|
|
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
const dataDir = resolve(__dirname, '../src/data');
|
|
const excerptsPath = resolve(__dirname, '../docs/meta/citation-excerpts.md');
|
|
const dbPath = resolve(dataDir, 'citations.db');
|
|
|
|
function log(message: string): void {
|
|
process.stdout.write(`${message}\n`);
|
|
}
|
|
|
|
interface ParsedExcerpt {
|
|
citationId: string;
|
|
text: string;
|
|
page?: string;
|
|
context?: string;
|
|
}
|
|
|
|
interface ParsedClaimExcerpt {
|
|
citationId: string;
|
|
claimLabel: string;
|
|
text: string;
|
|
}
|
|
|
|
function parseExcerptsFile(content: string): {
|
|
excerpts: ParsedExcerpt[];
|
|
claimExcerpts: ParsedClaimExcerpt[];
|
|
} {
|
|
const excerpts: ParsedExcerpt[] = [];
|
|
const claimExcerpts: ParsedClaimExcerpt[] = [];
|
|
|
|
const sections = content.split(/^## /m).slice(1);
|
|
|
|
for (const section of sections) {
|
|
const lines = section.split('\n');
|
|
const citationId = lines[0].trim();
|
|
|
|
if (!citationId) continue;
|
|
|
|
// Split section into general excerpts and claim-level excerpts
|
|
const subsections = section.split(/^### Claim: /m);
|
|
const generalPart = subsections[0];
|
|
const claimParts = subsections.slice(1);
|
|
|
|
// Parse general excerpts (blockquotes in the main section)
|
|
let currentPage: string | undefined;
|
|
let currentContext: string | undefined;
|
|
|
|
const generalLines = generalPart.split('\n').slice(1);
|
|
let blockquoteBuffer: string[] = [];
|
|
|
|
function flushBlockquote(): void {
|
|
if (blockquoteBuffer.length > 0) {
|
|
excerpts.push({
|
|
citationId,
|
|
text: blockquoteBuffer.join(' ').trim(),
|
|
page: currentPage,
|
|
context: currentContext,
|
|
});
|
|
blockquoteBuffer = [];
|
|
currentPage = undefined;
|
|
currentContext = undefined;
|
|
}
|
|
}
|
|
|
|
for (const line of generalLines) {
|
|
if (line.startsWith('> ')) {
|
|
blockquoteBuffer.push(line.slice(2).trim());
|
|
} else if (line.startsWith('>') && line.trim() === '>') {
|
|
blockquoteBuffer.push('');
|
|
} else {
|
|
if (blockquoteBuffer.length > 0 && !line.startsWith('> ')) {
|
|
const pageMatch = line.match(/^-\s+\*\*Page\*\*:\s*(.+)/);
|
|
if (pageMatch) {
|
|
currentPage = pageMatch[1].trim();
|
|
continue;
|
|
}
|
|
const contextMatch = line.match(/^-\s+\*\*Context\*\*:\s*(.+)/);
|
|
if (contextMatch) {
|
|
currentContext = contextMatch[1].trim();
|
|
continue;
|
|
}
|
|
flushBlockquote();
|
|
}
|
|
}
|
|
}
|
|
flushBlockquote();
|
|
|
|
// Parse claim-level excerpts
|
|
for (const claimPart of claimParts) {
|
|
const claimLines = claimPart.split('\n');
|
|
const claimLabel = claimLines[0].trim();
|
|
const claimBlockquote: string[] = [];
|
|
|
|
for (const line of claimLines.slice(1)) {
|
|
if (line.startsWith('> ')) {
|
|
claimBlockquote.push(line.slice(2).trim());
|
|
} else if (line.startsWith('>') && line.trim() === '>') {
|
|
claimBlockquote.push('');
|
|
}
|
|
}
|
|
|
|
if (claimBlockquote.length > 0) {
|
|
claimExcerpts.push({
|
|
citationId,
|
|
claimLabel: claimLabel.replace(/\s*$/, ''),
|
|
text: claimBlockquote.join(' ').trim(),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
return { excerpts, claimExcerpts };
|
|
}
|
|
|
|
function normForFuzzy(text: string): string {
|
|
return text.toLowerCase().replace(/[^a-z0-9\s]/g, '').replace(/\s+/g, ' ').trim();
|
|
}
|
|
|
|
function main(): void {
|
|
let content: string;
|
|
try {
|
|
content = readFileSync(excerptsPath, 'utf-8');
|
|
} catch {
|
|
log('[parse-excerpts] No citation-excerpts.md found, skipping');
|
|
return;
|
|
}
|
|
|
|
const db = new Database(dbPath);
|
|
|
|
const { excerpts, claimExcerpts } = parseExcerptsFile(content);
|
|
|
|
log(`[parse-excerpts] Parsed ${excerpts.length} general excerpts, ${claimExcerpts.length} claim excerpts`);
|
|
|
|
const existingIds = new Set(
|
|
(db.prepare('SELECT id FROM citations').all() as { id: string }[]).map((r) => r.id),
|
|
);
|
|
|
|
const insertExcerpt = db.prepare(
|
|
'INSERT INTO excerpts (citation_id, text, page, context) VALUES (?, ?, ?, ?)',
|
|
);
|
|
|
|
let insertedExcerpts = 0;
|
|
let skippedExcerpts = 0;
|
|
|
|
const insertAll = db.transaction(() => {
|
|
for (const excerpt of excerpts) {
|
|
if (!existingIds.has(excerpt.citationId)) {
|
|
log(`[parse-excerpts] Warning: citation ID "${excerpt.citationId}" not found in DB, skipping excerpt`);
|
|
skippedExcerpts++;
|
|
continue;
|
|
}
|
|
insertExcerpt.run(
|
|
excerpt.citationId,
|
|
excerpt.text,
|
|
excerpt.page ?? null,
|
|
excerpt.context ?? null,
|
|
);
|
|
insertedExcerpts++;
|
|
}
|
|
|
|
// Match claim excerpts to existing claims via fuzzy text matching
|
|
const updateClaimExcerpt = db.prepare(
|
|
'UPDATE claims SET excerpt = ? WHERE id = ?',
|
|
);
|
|
|
|
let matchedClaims = 0;
|
|
let unmatchedClaims = 0;
|
|
|
|
for (const ce of claimExcerpts) {
|
|
if (!existingIds.has(ce.citationId)) {
|
|
unmatchedClaims++;
|
|
continue;
|
|
}
|
|
|
|
const claims = db.prepare(
|
|
'SELECT id, text FROM claims WHERE citation_id = ?',
|
|
).all(ce.citationId) as { id: number; text: string }[];
|
|
|
|
const labelNorm = normForFuzzy(ce.claimLabel);
|
|
let bestMatch: { id: number; score: number } | null = null;
|
|
|
|
for (const claim of claims) {
|
|
const claimNorm = normForFuzzy(claim.text);
|
|
const labelWords = labelNorm.split(' ').filter((w) => w.length > 3);
|
|
let score = 0;
|
|
for (const word of labelWords) {
|
|
if (claimNorm.includes(word)) score++;
|
|
}
|
|
if (claimNorm.includes(labelNorm.slice(0, 30))) score += 5;
|
|
|
|
if (!bestMatch || score > bestMatch.score) {
|
|
bestMatch = { id: claim.id, score };
|
|
}
|
|
}
|
|
|
|
if (bestMatch && bestMatch.score >= 2) {
|
|
updateClaimExcerpt.run(ce.text, bestMatch.id);
|
|
matchedClaims++;
|
|
} else {
|
|
log(`[parse-excerpts] Warning: no claim match for "${ce.claimLabel}" in ${ce.citationId}`);
|
|
unmatchedClaims++;
|
|
}
|
|
}
|
|
|
|
log(`[parse-excerpts] Claim excerpts: ${matchedClaims} matched, ${unmatchedClaims} unmatched`);
|
|
});
|
|
|
|
insertAll();
|
|
|
|
log(`[parse-excerpts] Inserted ${insertedExcerpts} excerpts (${skippedExcerpts} skipped)`);
|
|
|
|
db.close();
|
|
}
|
|
|
|
// Entry point: run the import as soon as the script is executed.
main();
|