text-processing-utils/scripts/extract-unique-tech-terms.js
2026-01-21 11:37:27 -08:00

94 lines
No EOL
3.1 KiB
JavaScript

#!/usr/bin/env node
const fs = require('fs');
const path = require('path');
// Load the system word list. The file is read as UTF-8 and split on '\n';
// each line is trimmed and lower-cased, and empty lines are discarded.
const dictPath = path.join(__dirname, '../src/spellcheck/dictionaries/english-words.txt');
const systemWords = new Set();
for (const rawLine of fs.readFileSync(dictPath, 'utf-8').split('\n')) {
  const word = rawLine.trim().toLowerCase();
  if (word.length > 0) {
    systemWords.add(word);
  }
}
// Collect candidate technical terms from the TypeScript dictionary sources.
// The sources are scanned as plain text; nothing is imported or evaluated.
const techTerms = new Set();

/**
 * Add every single-quoted, whitespace-free string found in `content` to
 * `techTerms`, lower-cased.
 *
 * @param {string} content - Raw text of a dictionary source file.
 * @returns {void}
 */
function addQuotedTerms(content) {
  // A quoted string must open and close on the same line. Without that
  // restriction a single stray apostrophe (for example in a comment) would
  // mis-pair the quotes for the remainder of the file.
  const quotedLiterals = content.match(/'[^'\r\n]+'/g) || [];
  for (const literal of quotedLiterals) {
    const term = literal.slice(1, -1).toLowerCase();
    // Keep only strings longer than one character that contain no whitespace
    // of any kind (space, tab, ...): anything else is not a single word.
    if (term.length > 1 && !/\s/.test(term)) {
      techTerms.add(term);
    }
  }
}

// The main technical dictionary is read unconditionally, so a missing file
// throws and stops the script.
const techDictPath = path.join(__dirname, '../src/spellcheck/dictionaries/technical-dictionary.ts');
addQuotedTerms(fs.readFileSync(techDictPath, 'utf-8'));

// The expanded and specialized dictionaries are optional: each one is only
// read when it exists on disk.
const expandedTechPath = path.join(__dirname, '../src/spellcheck/dictionaries/expanded-technical-dictionary.ts');
const specializedDicts = [
  'git-vcs-dictionary.ts',
  'web-technologies-dictionary.ts',
  'cloud-devops-dictionary.ts',
  'cli-tools-dictionary.ts'
];
const optionalDictPaths = [
  expandedTechPath,
  ...specializedDicts.map((dictFile) => path.join(__dirname, '../src/spellcheck/dictionaries/', dictFile))
];
for (const optionalPath of optionalDictPaths) {
  if (fs.existsSync(optionalPath)) {
    addQuotedTerms(fs.readFileSync(optionalPath, 'utf-8'));
  }
}
// Partition the collected terms by membership in the system dictionary:
// terms the system dictionary lacks go to uniqueTechTerms, the rest to
// inSystemDict.
const uniqueTechTerms = [];
const inSystemDict = [];
for (const term of techTerms) {
  const bucket = systemWords.has(term) ? inSystemDict : uniqueTechTerms;
  bucket.push(term);
}
// Both lists are sorted in place (default string ordering) for readability.
inSystemDict.sort();
uniqueTechTerms.sort();
// Print the summary counts followed by the newline-separated list of terms
// that are missing from the system dictionary.
const termListing = uniqueTechTerms.join('\n');
const reportLines = [
  `Total technical terms: ${techTerms.size}`,
  `Terms NOT in system dictionary: ${uniqueTechTerms.length}`,
  `Terms already in system dictionary: ${inSystemDict.length}`,
  '\n=== Unique Technical Terms (not in system dict) ===\n',
  termListing
];
for (const reportLine of reportLines) {
  console.log(reportLine);
}
// Write the same listing to the unique-terms file ...
const outputPath = path.join(__dirname, '../src/spellcheck/dictionaries/unique-tech-terms.txt');
fs.writeFileSync(outputPath, termListing, 'utf-8');
console.log(`\nUnique technical terms saved to: ${outputPath}`);
// ... and to the supplemental dictionary file (identical content).
const supplementPath = path.join(__dirname, '../src/spellcheck/dictionaries/technical-supplement.txt');
fs.writeFileSync(supplementPath, termListing, 'utf-8');
console.log(`Technical supplement dictionary saved to: ${supplementPath}`);