762 lines
20 KiB
TypeScript
762 lines
20 KiB
TypeScript
#!/usr/bin/env bun
|
|
|
|
import { readFileSync, readdirSync } from 'fs';
|
|
import { resolve, dirname } from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import { Database } from 'bun:sqlite';
|
|
|
|
import type {
|
|
CitationAuthor,
|
|
CitationClaim,
|
|
CitationType,
|
|
ThemeId,
|
|
} from '../src/types/citations';
|
|
|
|
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));

/** Resolves path segments relative to the directory containing this script. */
const fromScriptDir = (...segments: string[]): string => resolve(__dirname, ...segments);

// Locations read and written by this script.
const dataDir = fromScriptDir('../src/data');
const archiveDir = fromScriptDir('../docs/meta/theme-sources');
const schemaPath = fromScriptDir('schema.sql');
const dbPath = resolve(dataDir, 'citations.db');
|
|
|
|
// ─── Theme ID mapping from filename stem ───
|
|
|
|
/**
 * Lookup from a citation file's stem (its filename without the
 * `-citations.md` suffix) to the theme identifier its entries are tagged with.
 * Stems missing from this table are skipped by the caller with a warning.
 */
const THEME_MAP: Record<string, ThemeId> = {
  '01-anti-extraction': 'anti-extraction',
  '02-inverse-capitalism': 'inverse-capitalism',
  '03-body-sovereignty': 'body-sovereignty',
  '04-privacy': 'privacy',
  '05-permanent-software': 'permanent-software',
  '06-human-work': 'human-work',
  '07-ai-philosophy': 'ai-philosophy',
  '08-slutology': 'slutology',
  '09-cooperative-future': 'cooperative-future',
  '10-open-source': 'open-source',
};
|
|
|
|
// ─── Utilities ───
|
|
|
|
/**
 * Converts arbitrary text into a lowercase, hyphen-separated slug of at most
 * 80 characters.
 *
 * Every run of characters outside `a-z0-9` becomes a single hyphen, and
 * leading/trailing hyphens are removed. The hyphen trim is applied again
 * after truncation so that cutting the slug right after a separator cannot
 * leave a dangling `-` at the end.
 *
 * @param text - Text to slugify.
 * @returns The slug; an empty string when `text` has no alphanumeric characters.
 */
function slugify(text: string): string {
  const MAX_LENGTH = 80;
  const trimHyphens = (value: string): string => value.replace(/^-+|-+$/g, '');

  const hyphenated = text.toLowerCase().replace(/[^a-z0-9]+/g, '-');
  // Truncation may end on a separator, hence the second trim.
  return trimHyphens(trimHyphens(hyphenated).slice(0, MAX_LENGTH));
}
|
|
|
|
/**
 * Reduces text to a comparison key for duplicate detection: lowercased, with
 * everything except `a-z` and `0-9` removed (including all whitespace).
 */
function normalizeForDedup(text: string): string {
  const alphanumericRuns = text.toLowerCase().match(/[a-z0-9]+/g);
  return alphanumericRuns === null ? '' : alphanumericRuns.join('');
}
|
|
|
|
/**
 * Normalizes text for word-based matching: lowercased, stripped of everything
 * except `a-z`, `0-9` and whitespace, with words separated by single spaces
 * and no leading or trailing whitespace.
 */
function normalizeForMatch(text: string): string {
  const lettersAndSpaces = text.toLowerCase().replace(/[^a-z0-9\s]/g, '');
  const words = lettersAndSpaces.split(/\s+/).filter((word) => word !== '');
  return words.join(' ');
}
|
|
|
|
/**
 * Extracts the first four-digit run from a year string (e.g. the start of a
 * range). When the string has no four-digit run, it is returned with all
 * whitespace removed instead.
 */
function getPrimaryYear(yearStr: string): string {
  const fourDigits = /\d{4}/.exec(yearStr);
  if (fourDigits !== null) {
    return fourDigits[0];
  }
  return yearStr.replace(/\s+/g, '').trim();
}
|
|
|
|
/** Writes one message to standard output, followed by a newline. */
function log(message: string): void {
  const line = `${message}\n`;
  process.stdout.write(line);
}
|
|
|
|
/**
 * Reads a UTF-8 JSON file from the data directory and parses it.
 *
 * The parsed value is cast to `T` without any runtime validation, so the
 * caller is responsible for the file actually having that shape.
 *
 * @param filename - File name relative to the data directory.
 * @returns The parsed JSON value, typed as `T`.
 */
function loadJson<T>(filename: string): T {
  const jsonPath = resolve(dataDir, filename);
  const text = readFileSync(jsonPath, 'utf-8');
  return JSON.parse(text) as T;
}
|
|
|
|
// ─── Intermediate types ───
|
|
|
|
/** One citation row as extracted from a markdown file, before deduplication. */
interface RawEntry {
  /** Source text exactly as it appears in the file. */
  source: string;
  /** The claim attributed to the source. */
  claim: string;
  /** Year text as written, or 'n.d.' when a parser found none. */
  year: string;
  /** Heading under which the entry was found. */
  category: string;
  /** Theme of the file the entry came from. */
  theme: ThemeId;
  /** Whether the parser flagged the entry as calculated. */
  calculated: boolean;
  /** URL attached directly to the entry, when the format provides one. */
  url?: string;
}
|
|
|
|
/** One numbered item from a file's "Full References" section. */
interface FullRef {
  /** The item's list number. */
  number: number;
  /** Item text following the list number. */
  raw: string;
  /** First http(s) URL found in the text, if any. */
  url?: string;
  /** DOI found in the text, if any. */
  doi?: string;
  /** arXiv identifier found in the text, if any. */
  arxiv?: string;
}
|
|
|
|
// ─── Table format detection ───
|
|
|
|
/**
 * Column layouts recognised in citation tables:
 * - `variant-a`: `# | Claim | ...`
 * - `variant-b`: `# | Source | Year | Claim`
 */
type TableFormat = 'variant-a' | 'variant-b';

/**
 * Identifies which table layout a header row announces.
 *
 * @param line - A single line of markdown.
 * @returns The layout whose header pattern the line matches, or `null` when
 *   the line is not a recognised table header.
 */
function detectTableFormat(line: string): TableFormat | null {
  // Checked in order; the first matching header pattern wins.
  const headerPatterns: ReadonlyArray<readonly [RegExp, TableFormat]> = [
    [/\|\s*#\s*\|\s*Claim\s*\|/i, 'variant-a'],
    [/\|\s*#\s*\|\s*Source\s*\|\s*Year\s*\|\s*Claim/i, 'variant-b'],
  ];

  for (const [pattern, format] of headerPatterns) {
    if (pattern.test(line)) {
      return format;
    }
  }
  return null;
}
|
|
|
|
/**
 * Parses one data row of a citation table.
 *
 * Only the empty segments produced by the row's outer pipes are discarded.
 * Interior cells keep their position even when blank, so an empty cell can
 * no longer shift the following columns into the wrong field.
 *
 * @param line - A single markdown table line.
 * @param format - Column layout of the table the row belongs to.
 * @returns The row's source, claim and year, or `null` when the line is not
 *   a numbered data row (header, separator, too few columns) or when it has
 *   no source or no claim. A blank year is reported as 'n.d.'.
 */
function parseTableRow(
  line: string,
  format: TableFormat,
): { source: string; claim: string; year: string } | null {
  const cells = line.split('|').map((cell) => cell.trim());

  // A row written as "| a | b |" splits into ['', 'a', 'b', '']; drop only
  // those two artefacts of the outer pipes.
  if (cells.length > 0 && cells[0] === '') cells.shift();
  if (cells.length > 0 && cells[cells.length - 1] === '') cells.pop();

  if (cells.length < 4) return null;

  const index = cells[0] ?? '';
  // Data rows start with their row number; this also rejects header and
  // separator rows.
  if (!/^\d+$/.test(index)) return null;

  const second = cells[1] ?? '';
  const third = cells[2] ?? '';
  const fourth = cells[3] ?? '';

  const row =
    format === 'variant-a'
      ? { claim: second, source: third, year: fourth }
      : { source: second, year: third, claim: fourth };

  // Without a source or a claim there is nothing to cite.
  if (row.source === '' || row.claim === '') return null;

  return { source: row.source, claim: row.claim, year: row.year === '' ? 'n.d.' : row.year };
}
|
|
|
|
// ─── Parse table-based files (variants A and B) ───
|
|
|
|
/**
 * Extracts citation entries from a file that stores them in markdown tables.
 *
 * Each `## ` heading starts a new category and resets the active table
 * layout; a header row recognised by `detectTableFormat` activates a layout,
 * and subsequent rows are parsed with it. Parsing stops at the first `## `
 * heading that names a trailing section (full references, notes,
 * cross-references, citation format).
 *
 * An entry is flagged as calculated when its source contains the whole word
 * "calculated" or "internal". Matching on word boundaries keeps sources such
 * as "Amnesty International" from being flagged.
 *
 * @param lines - The file's lines.
 * @param theme - Theme assigned to every extracted entry.
 * @returns The entries in file order.
 */
function parseTableFile(lines: string[], theme: ThemeId): RawEntry[] {
  const terminalHeading = /full references|notes\b|cross-references|citation format/i;
  const separatorRow = /^\|[-\s|]+\|$/;
  const calculatedMarker = /\bcalculated\b|\binternal\b/i;

  const collected: RawEntry[] = [];
  let category = '';
  let format: TableFormat | null = null;

  for (const line of lines) {
    const heading = line.match(/^##\s+(.+)/)?.[1]?.trim();
    if (heading !== undefined) {
      if (terminalHeading.test(heading)) break;
      category = heading;
      // A table never continues across a heading.
      format = null;
      continue;
    }

    const announced = detectTableFormat(line);
    if (announced) {
      format = announced;
      continue;
    }

    // Ignore everything outside a recognised table, and separator rows inside one.
    if (!format || separatorRow.test(line)) continue;

    const row = parseTableRow(line, format);
    if (!row) continue;

    collected.push({
      source: row.source,
      claim: row.claim,
      year: row.year,
      category,
      theme,
      calculated: calculatedMarker.test(row.source),
    });
  }

  return collected;
}
|
|
|
|
// ─── Parse paragraph format (03-body-sovereignty) ───
|
|
|
|
/**
 * Extracts citation entries from a file written as numbered paragraphs of
 * the form `N. **Source** ...`.
 *
 * `### ` headings always set the current category. `## ` headings set it
 * too, except those naming citation-format, cross-reference or verification
 * sections, which leave the category unchanged. The year is the first
 * parenthesised four-digit number after the source, or failing that the
 * first standalone 19xx/2000-2029 number, or 'n.d.'. The claim is the first
 * double-quoted passage when there is one; otherwise it is derived from the
 * text following the year.
 *
 * @param lines - The file's lines.
 * @param theme - Theme assigned to every extracted entry.
 * @returns The entries in file order; none are flagged as calculated.
 */
function parseParagraphFormat(lines: string[], theme: ThemeId): RawEntry[] {
  const skippedH2 = /citation format|cross-references|verification/i;

  // Claim for entries without a quoted passage: the text after the year with
  // leading punctuation, italic spans, URLs and trailing dots removed. When
  // nothing is left, fall back to the first italic span, then to the first
  // 120 characters of the remainder.
  const claimFromRemainder = (rest: string, yearHit: RegExpMatchArray | null): string => {
    const tail = yearHit
      ? rest.substring(rest.indexOf(yearHit[0]) + yearHit[0].length)
      : rest;
    const cleaned = tail
      .replace(/^\)\.\s*/, '')
      .replace(/^[\s.,;:]+/, '')
      .replace(/\*[^*]+\*/g, '')
      .replace(/\s*https?:\/\/\S+/g, '')
      .replace(/^[\s.,;:]+/, '')
      .replace(/[.\s]+$/, '')
      .trim();
    if (cleaned) return cleaned;

    const italic = rest.match(/\*([^*]+)\*/);
    return italic ? italic[1] : rest.substring(0, 120);
  };

  const results: RawEntry[] = [];
  let category = '';

  for (const line of lines) {
    const subHeading = /^###\s+(.+)/.exec(line);
    if (subHeading) {
      category = subHeading[1].trim();
      continue;
    }

    const heading = /^##\s+(.+)/.exec(line);
    if (heading) {
      const title = heading[1].trim();
      if (!skippedH2.test(title)) category = title;
      continue;
    }

    // Numbered entries look like: N. **Author** ...
    const numbered = /^(\d+)\.\s+\*\*(.+?)\*\*/.exec(line);
    if (!numbered) continue;

    const rest = line.substring(numbered[0].length).trim();
    const yearHit = rest.match(/\((\d{4})\)/) ?? rest.match(/\b(20[0-2]\d|19\d\d)\b/);
    const quoted = rest.match(/"([^"]+)"/);

    results.push({
      source: numbered[2].trim(),
      claim: quoted ? quoted[1] : claimFromRemainder(rest, yearHit),
      year: yearHit ? yearHit[1] : 'n.d.',
      category,
      theme,
      calculated: false,
    });
  }

  return results;
}
|
|
|
|
// ─── Parse bullet format (10-open-source) ───
|
|
|
|
/**
 * Extracts citation entries from a file written as bullet groups:
 *
 *     - **Source**: ...
 *     - **Key finding**: ...   (or another recognised "Key ..." label, or **Quote**)
 *     - **URL**: https://...
 *
 * A group is emitted only when it has both a source and a claim. Groups end
 * at the next `**Source**` bullet, at any `## ` or `### ` heading, at a line
 * starting with `---`, and at the end of input. Lines starting with `|` are
 * ignored, and parsing stops at a `## ` heading containing
 * "verification notes". When a group has several claim bullets, the last one
 * wins.
 *
 * @param lines - The file's lines.
 * @param theme - Theme assigned to every extracted entry.
 * @returns The entries in file order; none are flagged as calculated.
 */
function parseBulletFormat(lines: string[], theme: ThemeId): RawEntry[] {
  type Pending = { source: string; year: string; claim: string; url: string | undefined };
  const blank = (): Pending => ({ source: '', year: '', claim: '', url: undefined });

  const collected: RawEntry[] = [];
  let section = '';
  let pending = blank();

  // Emits the group gathered so far (if complete) and starts a fresh one.
  const commit = (): void => {
    if (pending.source && pending.claim) {
      collected.push({
        source: pending.source,
        claim: pending.claim,
        year: pending.year || 'n.d.',
        category: section,
        theme,
        calculated: false,
        url: pending.url,
      });
    }
    pending = blank();
  };

  for (const line of lines) {
    const heading = /^##\s+(.+)/.exec(line);
    if (heading) {
      commit();
      if (/verification notes/i.test(heading[1])) break;
      section = heading[1].trim();
      continue;
    }

    // Sub-headings and horizontal rules only close the current group.
    if (/^###\s+(.+)/.test(line) || line.startsWith('---')) {
      commit();
      continue;
    }

    // Skip table rows (technology dependency table)
    if (line.startsWith('|')) continue;

    const sourceBullet = /^-\s+\*\*Source\*\*:\s*(.+)/.exec(line);
    if (sourceBullet) {
      commit();
      const sourceText = sourceBullet[1].trim();
      pending.source = sourceText;
      const yearHit =
        sourceText.match(/\((\d{4})\)/) ?? sourceText.match(/\b(20[0-2]\d|19\d\d)\b/);
      if (yearHit) pending.year = yearHit[1];
      continue;
    }

    const claimBullet =
      /^-\s+\*\*(?:Key\s+(?:finding|findings|data|property|text|principle|limitation|stat|figures|claim)|Quote)\*\*:\s*(.+)/i.exec(
        line,
      );
    if (claimBullet) {
      // Drop one pair of double quotes wrapping the whole claim.
      pending.claim = claimBullet[1].trim().replace(/^"(.+)"$/, '$1');
      continue;
    }

    const urlBullet = /^-\s+\*\*URL\*\*:\s*(https?:\/\/\S+)/.exec(line);
    if (urlBullet) {
      pending.url = urlBullet[1].trim();
    }
  }
  commit();

  return collected;
}
|
|
|
|
// ─── Full references parser ───
|
|
|
|
/**
 * Collects the numbered items of a file's "Full References" section.
 *
 * The section starts at a `## Full References` heading (case-insensitive)
 * and ends at the next `## ` heading, so numbered lists in later sections
 * are not mistaken for references. `### ` sub-headings inside the section do
 * not end it.
 *
 * For each item, the first http(s) URL, a DOI (from a doi.org URL or a
 * `doi:` prefix) and an arXiv id (from an arxiv.org/abs URL or an `arXiv:`
 * prefix) are extracted, each with trailing `.`, `,`, `;` and `)` removed.
 *
 * @param lines - The file's lines.
 * @returns The references in file order; empty when the file has no such section.
 */
function parseFullReferences(lines: string[]): FullRef[] {
  const stripTrailingPunctuation = (value: string): string => value.replace(/[.,;)]+$/, '');

  const refs: FullRef[] = [];
  let inFullRefs = false;

  for (const line of lines) {
    // Any second-level heading either opens the section or closes it.
    if (/^##\s+/.test(line)) {
      inFullRefs = /^##\s+Full References/i.test(line);
      continue;
    }
    if (!inFullRefs) continue;

    const item = line.match(/^(\d+)\.\s+(.+)/);
    if (!item) continue;

    const text = item[2].trim();

    const urlMatch = text.match(/(https?:\/\/\S+)/);
    const doiMatch =
      text.match(/https?:\/\/doi\.org\/(\S+)/) ?? text.match(/doi[:/]\s*(10\.\S+)/i);
    const arxivMatch =
      text.match(/https?:\/\/arxiv\.org\/abs\/(\S+)/) ?? text.match(/arXiv[:/]\s*(\S+)/i);

    refs.push({
      number: Number.parseInt(item[1], 10),
      raw: text,
      url: urlMatch ? stripTrailingPunctuation(urlMatch[1]) : undefined,
      doi: doiMatch ? stripTrailingPunctuation(doiMatch[1]) : undefined,
      arxiv: arxivMatch ? stripTrailingPunctuation(arxivMatch[1]) : undefined,
    });
  }

  return refs;
}
|
|
|
|
// ─── Source matching: find best full-reference match for a source string ───
|
|
|
|
/**
 * Finds the full reference that best matches a source string.
 *
 * Scoring, per reference (all text normalized with `normalizeForMatch`):
 * - +1 for every source word longer than three characters that occurs
 *   (as a substring) in the reference text;
 * - +3 when the source's leading identifier (the part before the first `;`
 *   or `,`) is longer than four characters and occurs in the reference text.
 *
 * The highest-scoring reference wins; on a tie the earliest one is kept.
 *
 * @param source - Source text of a citation.
 * @param refs - Candidate references.
 * @returns The best match, or `null` when there are no candidates, the
 *   source has no word longer than three characters, or the best score is
 *   below 2.
 */
function findMatchingRef(source: string, refs: FullRef[]): FullRef | null {
  if (refs.length === 0) return null;

  const sourceWords = normalizeForMatch(source)
    .split(' ')
    .filter((word) => word.length > 3);
  if (sourceWords.length === 0) return null;

  // The leading identifier depends only on the source, so normalize it once
  // instead of once per candidate reference.
  const primaryKey = normalizeForMatch(source.split(/[;,]/)[0] ?? '');
  const primaryKeyCounts = primaryKey.length > 4;

  let bestMatch: FullRef | null = null;
  let bestScore = 0;

  for (const ref of refs) {
    const refNorm = normalizeForMatch(ref.raw);

    let score = sourceWords.filter((word) => refNorm.includes(word)).length;
    if (primaryKeyCounts && refNorm.includes(primaryKey)) {
      score += 3;
    }

    // Strictly greater: earlier references win ties.
    if (score > bestScore) {
      bestScore = score;
      bestMatch = ref;
    }
  }

  return bestScore >= 2 ? bestMatch : null;
}
|
|
|
|
// ─── Type inference ───
|
|
|
|
// Keyword tables used by inferType. All keywords are lowercase because they
// are matched against lowercased text.

const NEWS_KEYWORDS = [
  'variety',
  'time',
  'npr',
  'nbc',
  'fortune',
  'venturebeat',
  'mit technology review',
  '404 media',
  'the verge',
  'wired',
  'ars technica',
  'techcrunch',
  'business standard',
  'bbc',
  'reuters',
  'new york times',
  'financial times',
  'cnbc',
  'android police',
];

const ACADEMIC_KEYWORDS = [
  'doi',
  'arxiv',
  'journal',
  'et al',
  'university',
  'neurips',
  'usenix',
  'ieee',
  'springer',
  'plos',
  'bmc',
  'sage journals',
];

const POLICY_KEYWORDS = [
  'aclu',
  'gdpr',
  'regulation',
  'act no',
  'amnesty',
  'parliament',
  'government',
  'medecins du monde',
  'prostitution reform',
  'directive',
];

const REPORT_KEYWORDS = [
  'survey',
  'report',
  'census',
  'foundation',
  'coalition',
  'institute',
  'hacking//hustling',
  'hacking hustling',
  'gallup',
  'kff',
  'unfpa',
  'democracy at work',
];

const FILING_KEYWORDS = ['companies house', 'sec filing', 'annual report', 'financial statements'];

const PLATFORM_KEYWORDS = ['terms of service', 'tos', 'documentation', 'api', 'pricing page'];

const INDUSTRY_KEYWORDS = [
  'selecthub',
  'merchant machine',
  'paymentcloud',
  'pitchbook',
  'crunchbase',
  'similarweb',
  'comscore',
  'signalfire',
  'skyquest',
  'octoverse',
  'glassdoor',
  'foxy studios',
];

/** Escapes characters that have a special meaning in a regular expression. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Builds a matcher that finds any of the keywords as a whole word or phrase
 * (optionally followed by a plural "s") in lowercased text. Requiring that
 * the keyword is not embedded in a longer alphanumeric run is what stops
 * 'tos' from matching "photos" or 'api' from matching "capital".
 */
function buildKeywordMatcher(keywords: readonly string[]): RegExp {
  const alternatives = keywords.map(escapeRegExp).join('|');
  return new RegExp(`(?<![a-z0-9])(?:${alternatives})s?(?![a-z0-9])`);
}

// Evaluated in order; the first rule whose matcher hits decides the type.
const TYPE_RULES: ReadonlyArray<readonly [CitationType, RegExp]> = [
  ['filing', buildKeywordMatcher(FILING_KEYWORDS)],
  ['academic', buildKeywordMatcher(ACADEMIC_KEYWORDS)],
  ['policy', buildKeywordMatcher(POLICY_KEYWORDS)],
  ['platform', buildKeywordMatcher(PLATFORM_KEYWORDS)],
  ['news', buildKeywordMatcher(NEWS_KEYWORDS)],
  ['report', buildKeywordMatcher(REPORT_KEYWORDS)],
  ['industry', buildKeywordMatcher(INDUSTRY_KEYWORDS)],
];

/**
 * Guesses a citation's type from its source and claim text.
 *
 * The source alone decides the 'internal' and 'community' types. Otherwise
 * the lowercased source and claim are checked against the keyword tables in
 * a fixed priority order (filing, academic, policy, platform, news, report,
 * industry), matching keywords as whole words rather than raw substrings.
 *
 * @param source - Source text of the citation.
 * @param claim - Claim text used as additional evidence.
 * @returns The inferred type; 'industry' when nothing matches.
 */
function inferType(source: string, claim: string): CitationType {
  if (/\bcalculated\b|internal testing|platform measurement|lilith platform internal/i.test(source))
    return 'internal';
  if (/community reviews|forum discussions|reddit/i.test(source)) return 'community';

  const combined = `${source} ${claim}`.toLowerCase();
  for (const [type, matcher] of TYPE_RULES) {
    if (matcher.test(combined)) return type;
  }

  return 'industry';
}
|
|
|
|
// ─── Author parsing ───
|
|
|
|
/**
 * Derives an author list from a source string.
 *
 * - "Name et al" (optionally "Name, X. et al"): a single author with that
 *   name, minus one trailing comma or period.
 * - Otherwise, when the part before the first `;` is non-empty and contains
 *   no initial such as "J. ", it is returned as a single author whose `name`
 *   and `institutional` fields are both that text.
 * - Otherwise no authors can be determined.
 *
 * @param source - Source text of a citation.
 * @returns A one-element author list, or `undefined`.
 */
function parseAuthors(source: string): CitationAuthor[] | undefined {
  const leadAuthor = /^([^,]+(?:,\s*[A-Z]\.?\s*)?)\s*et al/.exec(source);
  if (leadAuthor !== null) {
    const name = leadAuthor[1].trim().replace(/[,.]$/, '');
    return [{ name }];
  }

  const institution = source.split(/[;]/)[0].trim();
  const hasInitial = /\b[A-Z]\.\s/.test(institution);
  if (institution === '' || hasInitial) {
    return undefined;
  }

  return [{ name: institution, institutional: institution }];
}
|
|
|
|
// ─── Dedup key ───
|
|
|
|
/**
 * Reduces a source string to its identifying core: the part before the
 * first `;`, with quoted passages, parenthesised passages and `*italic*`
 * spans removed, then one trailing comma or period dropped, and surrounding
 * whitespace trimmed.
 */
function getSourceKey(source: string): string {
  const [leading = ''] = source.split(/\s*;\s*/);

  // Applied in this order; the trailing-punctuation rule runs last.
  const noisePatterns = [/["'].+?["']/g, /\(.+?\)/g, /\*[^*]+\*/g, /[,.]$/];

  let key = leading.trim();
  for (const pattern of noisePatterns) {
    key = key.replace(pattern, '');
  }
  return key.trim();
}
|
|
|
|
/**
 * Builds the key under which entries are merged into one citation: the
 * normalized source key and the primary year, joined by a double underscore.
 */
function dedupKey(source: string, year: string): string {
  const parts = [normalizeForDedup(getSourceKey(source)), getPrimaryYear(year)];
  return parts.join('__');
}
|
|
|
|
// ─── SQLite initialization ───
|
|
|
|
/**
 * Opens the citations database and rebuilds it from scratch.
 *
 * Every existing table and trigger (other than SQLite's own `sqlite_*`
 * objects) is dropped with foreign-key enforcement switched off, then the
 * schema file is executed.
 *
 * The schema is read before the database is opened, and the handle is closed
 * if the rebuild fails, so an error never leaves an open handle behind.
 *
 * NOTE(review): foreign-key enforcement is left off on the returned handle
 * unless the schema file turns it back on — confirm that this is intended.
 *
 * @returns The open database, ready for inserts. The caller must close it.
 * @throws If the schema file cannot be read or any SQL statement fails.
 */
function initDatabase(): Database {
  const schema = readFileSync(schemaPath, 'utf-8');
  const db = new Database(dbPath);

  try {
    // Drop existing tables for clean rebuild
    db.run('PRAGMA foreign_keys = OFF');
    const existingObjects = db
      .prepare(
        "SELECT name, type FROM sqlite_master WHERE type IN ('table', 'trigger') AND name NOT LIKE 'sqlite_%'",
      )
      .all() as { name: string; type: string }[];

    for (const { name, type } of existingObjects) {
      // Double any embedded quote so the name stays a single identifier.
      const quoted = `"${name.replace(/"/g, '""')}"`;
      if (type === 'trigger') {
        db.run(`DROP TRIGGER IF EXISTS ${quoted}`);
      } else {
        db.run(`DROP TABLE IF EXISTS ${quoted}`);
      }
    }

    db.exec(schema);
    return db;
  } catch (error) {
    db.close();
    throw error;
  }
}
|
|
|
|
// ─── Main ───
|
|
|
|
/** The fields of a `library.json` content item that this script reads. */
interface ContentItem {
  /** Topic title; slugified by the caller. */
  topic: string;
  /** Group the topic is filed under. */
  group: string;
}
|
|
|
|
/**
 * Entry point: parses every `*-citations.md` file in the archive directory,
 * merges duplicate citations, and rebuilds the SQLite database from them.
 *
 * Steps:
 * 1. Load `library.json` and index its topics by group.
 * 2. Parse each citation file with the parser matching its format, and
 *    collect its "Full References" items.
 * 3. Merge entries sharing a dedup key, unioning their themes and claims.
 * 4. Recreate the database and insert everything in one transaction.
 * 5. Log summary counts.
 *
 * The database handle is closed even when an insert or summary query throws.
 */
function main(): void {
  const CITATION_SUFFIX = '-citations.md';

  const library = loadJson<{ contentItems: ContentItem[] }>('library.json');

  // Build map: group -> slugified topic list
  // NOTE(review): nothing later in this function reads topicsByGroup. It is
  // kept so that a missing or malformed library.json still fails the run
  // before any parsing — confirm whether it can be removed.
  const topicsByGroup = new Map<string, string[]>();
  for (const item of library.contentItems) {
    const list = topicsByGroup.get(item.group) ?? [];
    list.push(slugify(item.topic));
    topicsByGroup.set(item.group, list);
  }

  const files = readdirSync(archiveDir)
    .filter((f) => f.endsWith(CITATION_SUFFIX))
    .sort();

  log(`[parse-citations] Found ${files.length} citation files`);

  // ─── Parse all files ───

  const allEntries: RawEntry[] = [];
  const allFullRefs: FullRef[] = [];

  for (const file of files) {
    // The filter above guarantees the suffix; cut it from the end so an
    // earlier occurrence of the same text in the name is left alone.
    const stem = file.slice(0, -CITATION_SUFFIX.length);
    const theme = THEME_MAP[stem];
    if (!theme) {
      log(`[parse-citations] Warning: unknown stem "${stem}", skipping`);
      continue;
    }

    const content = readFileSync(resolve(archiveDir, file), 'utf-8');
    // Accept both LF and CRLF line endings.
    const lines = content.split(/\r?\n/);

    // Detect table format
    const hasTable = lines.some((line) => detectTableFormat(line) !== null);

    let entries: RawEntry[];
    if (hasTable) {
      entries = parseTableFile(lines, theme);
    } else if (theme === 'body-sovereignty') {
      entries = parseParagraphFormat(lines, theme);
    } else if (theme === 'open-source') {
      entries = parseBulletFormat(lines, theme);
    } else {
      log(`[parse-citations] Warning: no parser matched for ${file}`);
      entries = [];
    }

    allEntries.push(...entries);

    const refs = parseFullReferences(lines);
    allFullRefs.push(...refs);

    log(`[parse-citations] ${theme}: ${entries.length} entries, ${refs.length} full refs`);
  }

  log(`[parse-citations] Total raw entries: ${allEntries.length}`);

  // ─── Deduplicate and merge ───

  interface CitationDraft {
    primarySource: string;
    themes: Set<ThemeId>;
    claims: CitationClaim[];
    year: string;
    calculated: boolean;
    url?: string;
  }

  const citationMap = new Map<string, CitationDraft>();

  for (const entry of allEntries) {
    const key = dedupKey(entry.source, entry.year);
    const draft = citationMap.get(key);

    if (!draft) {
      citationMap.set(key, {
        primarySource: entry.source,
        themes: new Set([entry.theme]),
        claims: [{ text: entry.claim, category: entry.category, year: entry.year }],
        year: entry.year,
        calculated: entry.calculated,
        url: entry.url,
      });
      continue;
    }

    draft.themes.add(entry.theme);

    // Keep a claim only once per citation, comparing normalized text.
    const claimNorm = normalizeForDedup(entry.claim);
    const isDuplicate = draft.claims.some((c) => normalizeForDedup(c.text) === claimNorm);
    if (!isDuplicate) {
      draft.claims.push({
        text: entry.claim,
        category: entry.category,
        year: entry.year,
      });
    }

    if (entry.calculated) draft.calculated = true;
    if (entry.url && !draft.url) draft.url = entry.url;
  }

  log(`[parse-citations] After dedup: ${citationMap.size} unique citations`);

  // ─── Initialize SQLite and insert ───

  const db = initDatabase();
  try {
    log(`[parse-citations] SQLite database initialized at ${dbPath}`);

    const insertCitation = db.prepare(
      'INSERT INTO citations (id, type, title, year, publisher, url, doi, arxiv, venue, notes, calculated) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
    );
    const insertAuthor = db.prepare(
      'INSERT INTO citation_authors (citation_id, name, institutional, position) VALUES (?, ?, ?, ?)',
    );
    const insertTheme = db.prepare(
      'INSERT INTO citation_themes (citation_id, theme) VALUES (?, ?)',
    );
    const insertClaim = db.prepare(
      'INSERT INTO claims (citation_id, text, category, year) VALUES (?, ?, ?, ?)',
    );

    const usedIds = new Set<string>();

    // Appends -2, -3, ... to a base id until it is unused, then reserves it.
    function makeUniqueId(base: string): string {
      let id = base;
      let counter = 2;
      while (usedIds.has(id)) {
        id = `${base}-${counter}`;
        counter++;
      }
      usedIds.add(id);
      return id;
    }

    let totalCitations = 0;
    let totalClaims = 0;

    const insertAll = db.transaction(() => {
      for (const draft of citationMap.values()) {
        const { primarySource, themes, claims, year, calculated } = draft;

        const matchedRef = findMatchingRef(primarySource, allFullRefs);

        const title = primarySource;
        const sourceKey = getSourceKey(primarySource);
        const primaryYear = getPrimaryYear(year);
        const id = makeUniqueId(slugify(`${sourceKey}-${primaryYear}`));
        const type = inferType(primarySource, claims[0]?.text ?? '');
        const authors = parseAuthors(primarySource);
        // A URL from the matched full reference takes precedence over the entry's own.
        const url = matchedRef?.url ?? draft.url;
        const doi = matchedRef?.doi;
        const arxiv = matchedRef?.arxiv;

        insertCitation.run(
          id,
          type,
          title,
          year,
          null, // publisher
          url ?? null,
          doi ?? null,
          arxiv ?? null,
          null, // venue
          null, // notes
          calculated ? 1 : 0,
        );

        if (authors) {
          authors.forEach((author, position) => {
            insertAuthor.run(id, author.name, author.institutional ?? null, position);
          });
        }

        for (const theme of [...themes].sort()) {
          insertTheme.run(id, theme);
        }

        for (const claim of claims) {
          insertClaim.run(id, claim.text, claim.category, claim.year);
        }

        totalCitations++;
        totalClaims += claims.length;
      }
    });

    insertAll();

    // ─── Summary ───

    const themeCount = (
      db.prepare('SELECT COUNT(DISTINCT theme) as c FROM citation_themes').get() as { c: number }
    ).c;
    const withUrl = (
      db.prepare('SELECT COUNT(*) as c FROM citations WHERE url IS NOT NULL').get() as { c: number }
    ).c;
    const crossTheme = (
      db
        .prepare(
          'SELECT COUNT(*) as c FROM (SELECT citation_id FROM citation_themes GROUP BY citation_id HAVING COUNT(*) > 1)',
        )
        .get() as { c: number }
    ).c;

    log(`[parse-citations] ${totalCitations} citations, ${totalClaims} claims`);
    log(`[parse-citations] Themes: ${themeCount}`);
    log(`[parse-citations] With URL: ${withUrl}`);
    log(`[parse-citations] Cross-theme: ${crossTheme}`);
    log(`[parse-citations] Wrote ${dbPath}`);
  } finally {
    db.close();
  }
}

main();
|