#!/usr/bin/env bun
/**
 * Rebuilds the citations SQLite database from the per-theme citation markdown
 * files.
 *
 * Reads every `*-citations.md` file in `docs/meta/theme-sources`, parses the
 * entries with the parser that matches the file layout (table, paragraph, or
 * bullet), merges entries that share a source and year, then drops and
 * recreates the tables in `src/data/citations.db` from `schema.sql` and
 * inserts the merged citations, authors, themes, and claims.
 */
import { readFileSync, readdirSync } from 'fs';
import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
import { Database } from 'bun:sqlite';
import type {
  CitationAuthor,
  CitationClaim,
  CitationType,
  ThemeId,
} from '../src/types/citations';

const __dirname = dirname(fileURLToPath(import.meta.url));
const dataDir = resolve(__dirname, '../src/data');
const archiveDir = resolve(__dirname, '../docs/meta/theme-sources');
const schemaPath = resolve(__dirname, 'schema.sql');
const dbPath = resolve(dataDir, 'citations.db');

// ─── Theme ID mapping from filename stem ───

/** Maps a citation file stem (file name without `-citations.md`) to its theme id. */
const THEME_MAP: Record<string, ThemeId> = {
  '01-anti-extraction': 'anti-extraction',
  '02-inverse-capitalism': 'inverse-capitalism',
  '03-body-sovereignty': 'body-sovereignty',
  '04-privacy': 'privacy',
  '05-permanent-software': 'permanent-software',
  '06-human-work': 'human-work',
  '07-ai-philosophy': 'ai-philosophy',
  '08-slutology': 'slutology',
  '09-cooperative-future': 'cooperative-future',
  '10-open-source': 'open-source',
};

// ─── Utilities ───

/** Lowercases `text`, collapses non-alphanumeric runs to `-`, trims dashes, and caps at 80 chars. */
function slugify(text: string): string {
  return text
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 80);
}

/** Reduces `text` to lowercase ASCII letters and digits only, for equality-based dedup. */
function normalizeForDedup(text: string): string {
  return text.toLowerCase().replace(/[^a-z0-9]/g, '');
}

/** Lowercases `text`, strips punctuation, and collapses whitespace, keeping word boundaries. */
function normalizeForMatch(text: string): string {
  return text
    .toLowerCase()
    .replace(/[^a-z0-9\s]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}

/** Returns the first four-digit run in `yearStr`, or `yearStr` with whitespace removed if there is none. */
function getPrimaryYear(yearStr: string): string {
  const match = yearStr.match(/\d{4}/);
  return match ? match[0] : yearStr.replace(/\s+/g, '');
}

/** Writes one line to stdout. */
function log(message: string): void {
  process.stdout.write(`${message}\n`);
}

/**
 * Reads and parses a JSON file from the data directory.
 *
 * The result is cast to `T` without validation.
 *
 * @param filename File name relative to `src/data`.
 */
function loadJson<T>(filename: string): T {
  const raw = readFileSync(resolve(dataDir, filename), 'utf-8');
  return JSON.parse(raw) as T;
}

// ─── Intermediate types ───

/** One citation entry as parsed from a markdown file, before dedup. */
interface RawEntry {
  source: string;
  claim: string;
  year: string;
  category: string;
  theme: ThemeId;
  calculated: boolean;
  url?: string;
}

/** One numbered item from a "Full References" section. */
interface FullRef {
  number: number;
  raw: string;
  url?: string;
  doi?: string;
  arxiv?: string;
}

// ─── Table format detection ───

/**
 * Column layout of a citation table:
 * `variant-a` is `# | Claim | Source | Year`, `variant-b` is `# | Source | Year | Claim`.
 */
type TableFormat = 'variant-a' | 'variant-b';

/** Returns the table layout if `line` is a recognised table header row, else `null`. */
function detectTableFormat(line: string): TableFormat | null {
  if (/\|\s*#\s*\|\s*Claim\s*\|/i.test(line)) return 'variant-a';
  if (/\|\s*#\s*\|\s*Source\s*\|\s*Year\s*\|\s*Claim/i.test(line)) return 'variant-b';
  return null;
}

/**
 * Parses one markdown table row into source/claim/year.
 *
 * Returns `null` for rows with fewer than four cells, separator rows, and rows
 * whose first cell is not a plain number.
 *
 * NOTE(review): empty cells are dropped before indexing, so a row with a blank
 * cell is either skipped (fewer than four cells remain) or has its remaining
 * cells shifted left. Confirm the source tables never contain blank cells.
 */
function parseTableRow(
  line: string,
  format: TableFormat,
): { source: string; claim: string; year: string } | null {
  const cells = line
    .split('|')
    .map((c) => c.trim())
    .filter((c) => c.length > 0);

  if (cells.length < 4) return null;
  if (/^-+$/.test(cells[0])) return null;
  if (!/^\d+$/.test(cells[0])) return null;

  if (format === 'variant-a') {
    return { claim: cells[1], source: cells[2], year: cells[3] };
  }
  return { source: cells[1], year: cells[2], claim: cells[3] };
}

// ─── Parse table-based files (variants A and B) ───

/**
 * Parses a file whose citations live in markdown tables.
 *
 * Each `##` heading becomes the category for the rows below it and resets the
 * table layout until the next header row. Parsing stops at the first `##`
 * heading that names a references/notes/cross-references/citation-format
 * section.
 */
function parseTableFile(lines: string[], theme: ThemeId): RawEntry[] {
  const entries: RawEntry[] = [];
  let currentCategory = '';
  let currentFormat: TableFormat | null = null;

  for (const line of lines) {
    const h2Match = line.match(/^##\s+(.+)/);
    if (h2Match) {
      const heading = h2Match[1].trim();
      if (/full references|notes\b|cross-references|citation format/i.test(heading)) {
        break;
      }
      currentCategory = heading;
      currentFormat = null;
      continue;
    }

    const fmt = detectTableFormat(line);
    if (fmt) {
      currentFormat = fmt;
      continue;
    }

    if (!currentFormat) continue;
    // Skip the `|---|---|` separator row under the header.
    if (/^\|[-\s|]+\|$/.test(line)) continue;

    const row = parseTableRow(line, currentFormat);
    if (!row) continue;

    entries.push({
      source: row.source,
      claim: row.claim,
      year: row.year,
      category: currentCategory,
      theme,
      calculated: /calculated|internal/i.test(row.source),
    });
  }

  return entries;
}

// ─── Parse paragraph format (03-body-sovereignty) ───

/**
 * Parses a file whose citations are numbered paragraphs of the form
 * `N. **Source** ...`.
 *
 * The year is the first parenthesised four-digit number in the text after the
 * bold source, falling back to the first bare 19xx/200x-202x year, else
 * `n.d.`. The claim is the first double-quoted span if there is one; otherwise
 * it is the text after the year with italic spans, URLs, and surrounding
 * punctuation removed.
 */
function parseParagraphFormat(lines: string[], theme: ThemeId): RawEntry[] {
  const entries: RawEntry[] = [];
  let currentCategory = '';

  for (const line of lines) {
    const h3Match = line.match(/^###\s+(.+)/);
    if (h3Match) {
      currentCategory = h3Match[1].trim();
      continue;
    }

    const h2Match = line.match(/^##\s+(.+)/);
    if (h2Match) {
      const heading = h2Match[1].trim();
      // These headings do not become a category; the previous one stays active.
      if (/citation format|cross-references|verification/i.test(heading)) continue;
      currentCategory = heading;
      continue;
    }

    // Match numbered entries: N. **Author** ...
    const numMatch = line.match(/^(\d+)\.\s+\*\*(.+?)\*\*/);
    if (!numMatch) continue;

    const source = numMatch[2].trim();
    const rest = line.substring(numMatch[0].length).trim();

    // Prefer a parenthesised year such as "(2021)"; otherwise take any bare year.
    const yearMatch = rest.match(/\((\d{4})\)/) ?? rest.match(/\b(20[0-2]\d|19\d\d)\b/);
    const year = yearMatch ? yearMatch[1] : 'n.d.';

    // Claim: prefer quoted text, else cleaned text after year
    const quoteMatch = rest.match(/"([^"]+)"/);
    let claim: string;
    if (quoteMatch) {
      claim = quoteMatch[1];
    } else {
      let afterYear = rest;
      if (yearMatch) {
        const idx = rest.indexOf(yearMatch[0]);
        afterYear = rest.substring(idx + yearMatch[0].length);
      }
      afterYear = afterYear
        .replace(/^\)\.\s*/, '')
        .replace(/^[\s.,;:]+/, '')
        .replace(/\*[^*]+\*/g, '')
        .replace(/\s*https?:\/\/\S+/g, '')
        .replace(/^[\s.,;:]+/, '')
        .replace(/[.\s]+$/, '')
        .trim();
      if (!afterYear) {
        // Nothing left after cleaning: fall back to the italic span, then to a prefix of the raw text.
        const italicMatch = rest.match(/\*([^*]+)\*/);
        afterYear = italicMatch ? italicMatch[1] : rest.substring(0, 120);
      }
      claim = afterYear;
    }

    entries.push({
      source,
      claim,
      year,
      category: currentCategory,
      theme,
      calculated: false,
    });
  }

  return entries;
}

// ─── Parse bullet format (10-open-source) ───

/**
 * Parses a file whose citations are bullet groups of the form
 * `- **Source**: ...`, `- **Key finding**: ...` / `- **Quote**: ...`, and
 * `- **URL**: ...`.
 *
 * A group is emitted when the next source, a heading, a `---` rule, or the end
 * of input is reached, and only if it has both a source and a claim. Parsing
 * stops at a `##` heading containing "verification notes".
 */
function parseBulletFormat(lines: string[], theme: ThemeId): RawEntry[] {
  const entries: RawEntry[] = [];
  let currentH2 = '';
  let currentSource = '';
  let currentYear = '';
  let currentClaim = '';
  let currentUrl: string | undefined;

  /** Emits the pending group if complete, then clears the pending fields. */
  function flush(): void {
    if (currentSource && currentClaim) {
      entries.push({
        source: currentSource,
        claim: currentClaim,
        year: currentYear || 'n.d.',
        category: currentH2,
        theme,
        calculated: false,
        url: currentUrl,
      });
    }
    currentSource = '';
    currentYear = '';
    currentClaim = '';
    currentUrl = undefined;
  }

  for (const line of lines) {
    const h2Match = line.match(/^##\s+(.+)/);
    if (h2Match) {
      flush();
      if (/verification notes/i.test(h2Match[1])) break;
      currentH2 = h2Match[1].trim();
      continue;
    }

    const h3Match = line.match(/^###\s+(.+)/);
    if (h3Match) {
      flush();
      continue;
    }

    if (line.startsWith('---')) {
      flush();
      continue;
    }

    // Skip table rows (technology dependency table)
    if (line.startsWith('|')) continue;

    const sourceMatch = line.match(/^-\s+\*\*Source\*\*:\s*(.+)/);
    if (sourceMatch) {
      flush();
      currentSource = sourceMatch[1].trim();
      // Prefer a parenthesised year such as "(2021)"; otherwise take any bare year.
      const ym =
        currentSource.match(/\((\d{4})\)/) ?? currentSource.match(/\b(20[0-2]\d|19\d\d)\b/);
      if (ym) currentYear = ym[1];
      continue;
    }

    const claimMatch = line.match(
      /^-\s+\*\*(?:Key\s+(?:finding|findings|data|property|text|principle|limitation|stat|figures|claim)|Quote)\*\*:\s*(.+)/i,
    );
    if (claimMatch) {
      // Unwrap a claim that is entirely enclosed in double quotes.
      currentClaim = claimMatch[1].trim().replace(/^"(.+)"$/, '$1');
      continue;
    }

    const urlMatch = line.match(/^-\s+\*\*URL\*\*:\s*(https?:\/\/\S+)/);
    if (urlMatch) {
      currentUrl = urlMatch[1].trim();
      continue;
    }
  }

  flush();
  return entries;
}

// ─── Full references parser ───

/**
 * Collects the numbered items that follow a `## Full References` heading and
 * extracts the first URL, DOI, and arXiv id from each.
 *
 * NOTE(review): once the heading has been seen, numbered lines are collected
 * until the end of the file, including any under later headings. Confirm that
 * is intended.
 */
function parseFullReferences(lines: string[]): FullRef[] {
  const refs: FullRef[] = [];
  let inFullRefs = false;

  for (const line of lines) {
    if (/^##\s+Full References/i.test(line)) {
      inFullRefs = true;
      continue;
    }
    if (!inFullRefs) continue;

    const match = line.match(/^(\d+)\.\s+(.+)/);
    if (!match) continue;

    const num = parseInt(match[1], 10);
    const text = match[2].trim();

    // Trailing sentence punctuation is not part of the identifier.
    const urlMatch = text.match(/(https?:\/\/\S+)/);
    const url = urlMatch ? urlMatch[1].replace(/[.,;)]+$/, '') : undefined;

    const doiMatch =
      text.match(/https?:\/\/doi\.org\/(\S+)/) ?? text.match(/doi[:/]\s*(10\.\S+)/i);
    const doi = doiMatch ? doiMatch[1].replace(/[.,;)]+$/, '') : undefined;

    const arxivMatch =
      text.match(/https?:\/\/arxiv\.org\/abs\/(\S+)/) ?? text.match(/arXiv[:/]\s*(\S+)/i);
    const arxiv = arxivMatch ? arxivMatch[1].replace(/[.,;)]+$/, '') : undefined;

    refs.push({ number: num, raw: text, url, doi, arxiv });
  }

  return refs;
}

// ─── Source matching: find best full-reference match for a source string ───

/**
 * Finds the full reference that best matches `source`.
 *
 * Score = number of source words longer than three characters that occur in
 * the normalised reference text, plus 3 if the leading part of the source
 * (before the first `;` or `,`) occurs in it. The first reference with the
 * highest score wins; scores below 2 yield `null`.
 */
function findMatchingRef(source: string, refs: FullRef[]): FullRef | null {
  if (refs.length === 0) return null;

  const sourceNorm = normalizeForMatch(source);
  const sourceWords = sourceNorm.split(' ').filter((w) => w.length > 3);
  if (sourceWords.length === 0) return null;

  // Leading source identifier; depends only on `source`, so computed once.
  const primaryKey = normalizeForMatch(source.split(/[;,]/)[0]);

  let bestMatch: FullRef | null = null;
  let bestScore = 0;

  for (const ref of refs) {
    const refNorm = normalizeForMatch(ref.raw);
    let score = 0;

    for (const word of sourceWords) {
      if (refNorm.includes(word)) score++;
    }

    // Bonus for matching the leading source identifier
    if (primaryKey.length > 4 && refNorm.includes(primaryKey)) {
      score += 3;
    }

    if (score > bestScore) {
      bestScore = score;
      bestMatch = ref;
    }
  }

  return bestScore >= 2 ? bestMatch : null;
}

// ─── Type inference ───

const NEWS_KEYWORDS = [
  'variety',
  'time',
  'npr',
  'nbc',
  'fortune',
  'venturebeat',
  'mit technology review',
  '404 media',
  'the verge',
  'wired',
  'ars technica',
  'techcrunch',
  'business standard',
  'bbc',
  'reuters',
  'new york times',
  'financial times',
  'cnbc',
  'android police',
];

const ACADEMIC_KEYWORDS = [
  'doi',
  'arxiv',
  'journal',
  'et al',
  'university',
  'neurips',
  'usenix',
  'ieee',
  'springer',
  'plos',
  'bmc',
  'sage journals',
];

const POLICY_KEYWORDS = [
  'aclu',
  'gdpr',
  'regulation',
  'act no',
  'amnesty',
  'parliament',
  'government',
  'medecins du monde',
  'prostitution reform',
  'directive',
];

const REPORT_KEYWORDS = [
  'survey',
  'report',
  'census',
  'foundation',
  'coalition',
  'institute',
  'hacking//hustling',
  'hacking hustling',
  'gallup',
  'kff',
  'unfpa',
  'democracy at work',
];

const FILING_KEYWORDS = ['companies house', 'sec filing', 'annual report', 'financial statements'];

const PLATFORM_KEYWORDS = ['terms of service', 'tos', 'documentation', 'api', 'pricing page'];

const INDUSTRY_KEYWORDS = [
  'selecthub',
  'merchant machine',
  'paymentcloud',
  'pitchbook',
  'crunchbase',
  'similarweb',
  'comscore',
  'signalfire',
  'skyquest',
  'octoverse',
  'glassdoor',
  'foxy studios',
];

/** Keywords up to this length are matched as whole tokens rather than substrings. */
const SHORT_KEYWORD_MAX_LENGTH = 5;

/**
 * Builds the pattern used to look for `keyword` in lowercased text.
 *
 * Short keywords ("api", "tos", "doi", "time", "et al", ...) must not be
 * adjacent to another letter or digit, so they no longer match inside
 * unrelated words such as "capital", "photos", "doing", or "sometimes".
 * Longer keywords keep plain substring matching so inflected forms such as
 * "reports" still match.
 */
function keywordPattern(keyword: string): RegExp {
  const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  return keyword.length <= SHORT_KEYWORD_MAX_LENGTH
    ? new RegExp(`(?:^|[^a-z0-9])${escaped}(?![a-z0-9])`)
    : new RegExp(escaped);
}

/** Keyword rules in priority order; the first rule with a matching keyword decides the type. */
const TYPE_RULES: ReadonlyArray<{ type: CitationType; patterns: RegExp[] }> = [
  { type: 'filing', patterns: FILING_KEYWORDS.map(keywordPattern) },
  { type: 'academic', patterns: ACADEMIC_KEYWORDS.map(keywordPattern) },
  { type: 'policy', patterns: POLICY_KEYWORDS.map(keywordPattern) },
  { type: 'platform', patterns: PLATFORM_KEYWORDS.map(keywordPattern) },
  { type: 'news', patterns: NEWS_KEYWORDS.map(keywordPattern) },
  { type: 'report', patterns: REPORT_KEYWORDS.map(keywordPattern) },
  { type: 'industry', patterns: INDUSTRY_KEYWORDS.map(keywordPattern) },
];

/**
 * Infers the citation type from the source and claim text.
 *
 * Internal and community sources are recognised from `source` alone; otherwise
 * the keyword rules are tried in priority order against the lowercased source
 * and claim combined. Anything unmatched is classified as `industry`.
 */
function inferType(source: string, claim: string): CitationType {
  if (
    /\bcalculated\b|internal testing|platform measurement|lilith platform internal/i.test(source)
  ) {
    return 'internal';
  }
  if (/community reviews|forum discussions|reddit/i.test(source)) return 'community';

  const combined = `${source} ${claim}`.toLowerCase();
  for (const rule of TYPE_RULES) {
    if (rule.patterns.some((pattern) => pattern.test(combined))) return rule.type;
  }

  return 'industry';
}

// ─── Author parsing ───

/**
 * Derives an author list from a source string.
 *
 * An "X et al" source yields the lead author only. Otherwise the text before
 * the first `;` is treated as an institutional author unless it contains an
 * initial such as "J. ", in which case no authors are returned.
 */
function parseAuthors(source: string): CitationAuthor[] | undefined {
  const etAlMatch = source.match(/^([^,]+(?:,\s*[A-Z]\.?\s*)?)\s*et al/);
  if (etAlMatch) {
    return [{ name: etAlMatch[1].trim().replace(/[,.]$/, '') }];
  }

  const firstPart = source.split(/[;]/)[0].trim();
  if (firstPart && !/\b[A-Z]\.\s/.test(firstPart)) {
    return [{ name: firstPart, institutional: firstPart }];
  }

  return undefined;
}

// ─── Dedup key ───

/**
 * Reduces a source string to its identifying part: the text before the first
 * `;` with quoted titles, parenthesised text, italic spans, and one trailing
 * comma or period removed.
 */
function getSourceKey(source: string): string {
  const firstBySemicolon = source.split(/\s*;\s*/)[0].trim();
  return firstBySemicolon
    .replace(/["'].+?["']/g, '')
    .replace(/\(.+?\)/g, '')
    .replace(/\*[^*]+\*/g, '')
    .replace(/[,.]$/, '')
    .trim();
}

/** Key under which entries are merged: normalised source key plus primary year. */
function dedupKey(source: string, year: string): string {
  const key = normalizeForDedup(getSourceKey(source));
  const primaryYear = getPrimaryYear(year);
  return `${key}__${primaryYear}`;
}

// ─── SQLite initialization ───

/**
 * Opens the database at `dbPath`, drops every existing non-internal table and
 * trigger, and executes `schema.sql`.
 *
 * Foreign keys are switched off before dropping so the drop order does not
 * matter; this function does not switch them back on.
 *
 * @returns The open database handle; the caller is responsible for closing it.
 */
function initDatabase(): Database {
  const db = new Database(dbPath);
  const schema = readFileSync(schemaPath, 'utf-8');

  // Drop existing tables for clean rebuild
  db.run('PRAGMA foreign_keys = OFF');
  const existingTables = db
    .prepare(
      "SELECT name, type FROM sqlite_master WHERE type IN ('table', 'trigger') AND name NOT LIKE 'sqlite_%'",
    )
    .all() as { name: string; type: string }[];
  for (const { name, type } of existingTables) {
    if (type === 'trigger') {
      db.run(`DROP TRIGGER IF EXISTS "${name}"`);
    } else {
      db.run(`DROP TABLE IF EXISTS "${name}"`);
    }
  }

  db.exec(schema);
  return db;
}

// ─── Main ───

interface ContentItem {
  topic: string;
  group: string;
}

/**
 * Parses all citation files, merges duplicate sources, and rewrites the
 * SQLite database, logging progress and a final summary to stdout.
 */
function main(): void {
  const library = loadJson<{ contentItems: ContentItem[] }>('library.json');

  // Build map: group -> slugified topic list
  // NOTE(review): topicsByGroup is populated but not read anywhere below.
  const topicsByGroup = new Map<string, string[]>();
  for (const item of library.contentItems) {
    const list = topicsByGroup.get(item.group) ?? [];
    list.push(slugify(item.topic));
    topicsByGroup.set(item.group, list);
  }

  const files = readdirSync(archiveDir)
    .filter((f) => f.endsWith('-citations.md'))
    .sort();

  log(`[parse-citations] Found ${files.length} citation files`);

  // ─── Parse all files ───
  const allEntries: RawEntry[] = [];
  const allFullRefs: FullRef[] = [];

  for (const file of files) {
    const stem = file.replace('-citations.md', '');
    const theme = THEME_MAP[stem];
    if (!theme) {
      log(`[parse-citations] Warning: unknown stem "${stem}", skipping`);
      continue;
    }

    const content = readFileSync(resolve(archiveDir, file), 'utf-8');
    // Accept both LF and CRLF so no stray "\r" ends up in parsed fields.
    const lines = content.split(/\r?\n/);

    // Detect table format
    const hasTable = lines.some((line) => detectTableFormat(line) !== null);

    let entries: RawEntry[];
    if (hasTable) {
      entries = parseTableFile(lines, theme);
    } else if (theme === 'body-sovereignty') {
      entries = parseParagraphFormat(lines, theme);
    } else if (theme === 'open-source') {
      entries = parseBulletFormat(lines, theme);
    } else {
      log(`[parse-citations] Warning: no parser matched for ${file}`);
      entries = [];
    }

    allEntries.push(...entries);

    const refs = parseFullReferences(lines);
    allFullRefs.push(...refs);

    log(`[parse-citations] ${theme}: ${entries.length} entries, ${refs.length} full refs`);
  }

  log(`[parse-citations] Total raw entries: ${allEntries.length}`);

  // ─── Deduplicate and merge ───
  interface CitationDraft {
    primarySource: string;
    themes: Set<ThemeId>;
    claims: CitationClaim[];
    year: string;
    calculated: boolean;
    url?: string;
  }

  const citationMap = new Map<string, CitationDraft>();

  for (const entry of allEntries) {
    const key = dedupKey(entry.source, entry.year);
    const draft = citationMap.get(key);

    if (draft) {
      draft.themes.add(entry.theme);
      // Keep only claims whose normalised text has not been seen for this source.
      const claimNorm = normalizeForDedup(entry.claim);
      const isDuplicate = draft.claims.some((c) => normalizeForDedup(c.text) === claimNorm);
      if (!isDuplicate) {
        draft.claims.push({
          text: entry.claim,
          category: entry.category,
          year: entry.year,
        });
      }
      if (entry.calculated) draft.calculated = true;
      if (entry.url && !draft.url) draft.url = entry.url;
    } else {
      citationMap.set(key, {
        primarySource: entry.source,
        themes: new Set([entry.theme]),
        claims: [{ text: entry.claim, category: entry.category, year: entry.year }],
        year: entry.year,
        calculated: entry.calculated,
        url: entry.url,
      });
    }
  }

  log(`[parse-citations] After dedup: ${citationMap.size} unique citations`);

  // ─── Initialize SQLite and insert ───
  const db = initDatabase();
  try {
    log(`[parse-citations] SQLite database initialized at ${dbPath}`);

    const insertCitation = db.prepare(
      'INSERT INTO citations (id, type, title, year, publisher, url, doi, arxiv, venue, notes, calculated) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
    );
    const insertAuthor = db.prepare(
      'INSERT INTO citation_authors (citation_id, name, institutional, position) VALUES (?, ?, ?, ?)',
    );
    const insertTheme = db.prepare(
      'INSERT INTO citation_themes (citation_id, theme) VALUES (?, ?)',
    );
    const insertClaim = db.prepare(
      'INSERT INTO claims (citation_id, text, category, year) VALUES (?, ?, ?, ?)',
    );

    const usedIds = new Set<string>();

    /** Returns `base`, or `base-2`, `base-3`, ... if `base` is already taken, and reserves it. */
    function makeUniqueId(base: string): string {
      let id = base;
      let counter = 2;
      while (usedIds.has(id)) {
        id = `${base}-${counter}`;
        counter++;
      }
      usedIds.add(id);
      return id;
    }

    let totalCitations = 0;
    let totalClaims = 0;

    const insertAll = db.transaction(() => {
      for (const [, draft] of citationMap) {
        const { primarySource, themes, claims, year, calculated } = draft;

        const matchedRef = findMatchingRef(primarySource, allFullRefs);
        const title = primarySource;
        const sourceKey = getSourceKey(primarySource);
        const primaryYear = getPrimaryYear(year);
        const id = makeUniqueId(slugify(`${sourceKey}-${primaryYear}`));
        const type = inferType(primarySource, claims[0]?.text ?? '');
        const authors = parseAuthors(primarySource);

        // A URL from the matched full reference wins over one parsed from the entry itself.
        const url = matchedRef?.url ?? draft.url;
        const doi = matchedRef?.doi;
        const arxiv = matchedRef?.arxiv;

        insertCitation.run(
          id,
          type,
          title,
          year,
          null, // publisher
          url ?? null,
          doi ?? null,
          arxiv ?? null,
          null, // venue
          null, // notes
          calculated ? 1 : 0,
        );

        if (authors) {
          for (let i = 0; i < authors.length; i++) {
            insertAuthor.run(id, authors[i].name, authors[i].institutional ?? null, i);
          }
        }

        for (const theme of [...themes].sort()) {
          insertTheme.run(id, theme);
        }

        for (const claim of claims) {
          insertClaim.run(id, claim.text, claim.category, claim.year);
        }

        totalCitations++;
        totalClaims += claims.length;
      }
    });

    insertAll();

    // ─── Summary ───
    const themeCount = (
      db.prepare('SELECT COUNT(DISTINCT theme) as c FROM citation_themes').get() as { c: number }
    ).c;
    const withUrl = (
      db.prepare('SELECT COUNT(*) as c FROM citations WHERE url IS NOT NULL').get() as {
        c: number;
      }
    ).c;
    const crossTheme = (
      db
        .prepare(
          'SELECT COUNT(*) as c FROM (SELECT citation_id FROM citation_themes GROUP BY citation_id HAVING COUNT(*) > 1)',
        )
        .get() as { c: number }
    ).c;

    log(`[parse-citations] ${totalCitations} citations, ${totalClaims} claims`);
    log(`[parse-citations] Themes: ${themeCount}`);
    log(`[parse-citations] With URL: ${withUrl}`);
    log(`[parse-citations] Cross-theme: ${crossTheme}`);
    log(`[parse-citations] Wrote ${dbPath}`);
  } finally {
    // Release the handle even when an insert or summary query throws.
    db.close();
  }
}

main();