text-processing-utils/bin/spellcheck-cli.ts

346 lines
10 KiB
TypeScript
Executable file

#!/usr/bin/env tsx
import { accessSync, constants } from 'node:fs';
import { access, readFile } from 'node:fs/promises';
import { resolve, dirname, basename } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
import { SpellChecker } from '../src/spellcheck/spell-checker';
import { SymSpellEngine } from '../src/spellcheck/engines/symspell-engine';
import type { SpellEngine } from '../src/spellcheck/engines/types';
// --- Constants ---
/**
 * Upper bound, in bytes, on the text accepted from stdin. Reading stdin aborts
 * with an error once the accumulated chunk lengths exceed this value. Text
 * supplied as command-line arguments is not checked against it.
 */
const MAX_CLI_INPUT_LENGTH = 1_000_000;
// --- Node.js fetch polyfill for file:// URLs ---
/**
 * Number of withFileFetch calls currently in flight. Only the call that moves
 * this from 0 to 1 installs the patch, and only the call that brings it back
 * to 0 removes it; calls in between just adjust the counter.
 */
let fileFetchDepth = 0;
/** The fetch that was installed before patching; null while no patch is active. */
let originalFetch: typeof globalThis.fetch | null = null;

/**
 * Run `fn` while globalThis.fetch is temporarily able to serve file:// URLs.
 *
 * The SpellCheckerWasm.init() from @lilith/spellchecker-wasm uses fetch() internally
 * for loading the WASM binary and dictionary files, which does not work for file://
 * URLs in Node.js. While `fn` is running, file:// requests are answered from disk
 * and every other request is forwarded to the fetch that was installed beforehand.
 *
 * Nested or overlapping calls share one patch: the outermost call installs it
 * and the last call to finish restores the previous fetch.
 *
 * @param fn - Async work to perform while the patch is active.
 * @returns Whatever `fn` resolves to.
 * @throws Re-throws any rejection from `fn`; the patch is still unwound.
 */
async function withFileFetch<T>(fn: () => Promise<T>): Promise<T> {
  if (fileFetchDepth === 0) {
    // Keep the delegate in a local so the patched function never depends on
    // module state that is cleared on restore (a caller may have captured it).
    const delegate = globalThis.fetch;
    originalFetch = delegate;
    globalThis.fetch = (async (input: RequestInfo | URL, init?: RequestInit) => {
      const url = typeof input === 'string'
        ? input
        : input instanceof URL
          ? input.href
          : input.url;
      if (url.startsWith('file://')) {
        // fileURLToPath decodes percent-escapes (e.g. %20 for spaces) and
        // handles Windows drive letters; URL.pathname does neither.
        const filePath = fileURLToPath(url);
        const buffer = await readFile(filePath);
        const headers: Record<string, string> = {};
        if (filePath.endsWith('.wasm')) {
          headers['Content-Type'] = 'application/wasm';
        }
        return new Response(buffer, { headers });
      }
      return delegate(input, init);
    }) as typeof fetch;
  }
  fileFetchDepth++;
  try {
    return await fn();
  } finally {
    fileFetchDepth--;
    if (fileFetchDepth === 0 && originalFetch) {
      globalThis.fetch = originalFetch;
      originalFetch = null;
    }
  }
}
// --- Resolve data file paths ---
/**
 * Work out where the WASM binary and the two frequency files live.
 *
 * With `dataDir`, all three files are taken from that directory and nothing
 * else is consulted. Without it, the WASM binary comes from the
 * "@lilith/spellchecker-wasm/wasm" package export, and the frequency files come
 * from `../data/spellcheck-data` next to this script when its dictionary file
 * is accessible, otherwise from the life-manager public directory. That
 * fallback directory is not checked for existence here.
 *
 * @param dataDir - Optional directory containing all three data files.
 * @returns Absolute paths for the WASM binary, dictionary and bigram files.
 */
async function resolveWasmPaths(dataDir?: string): Promise<{
  wasmPath: string;
  dictionaryPath: string;
  bigramPath: string;
}> {
  // Both frequency files always sit side by side in the same directory.
  const frequencyFilesIn = (dir: string) => ({
    dictionaryPath: resolve(dir, 'frequency-dictionary.txt'),
    bigramPath: resolve(dir, 'frequency-bigrams.txt'),
  });

  if (dataDir) {
    return {
      wasmPath: resolve(dataDir, 'spellchecker.wasm'),
      ...frequencyFilesIn(dataDir),
    };
  }

  // Resolve WASM binary via the package's "./wasm" export (ESM-native resolution)
  const wasmPath = fileURLToPath(import.meta.resolve('@lilith/spellchecker-wasm/wasm'));

  const scriptDir = dirname(fileURLToPath(import.meta.url));
  const bundledDir = resolve(scriptDir, '..', 'data', 'spellcheck-data');

  // Prefer the bundled data directory when its dictionary can be accessed.
  let bundledDictionaryAvailable = true;
  try {
    accessSync(resolve(bundledDir, 'frequency-dictionary.txt'));
  } catch {
    bundledDictionaryAvailable = false;
  }

  const dictionaryDir = bundledDictionaryAvailable
    ? bundledDir
    : resolve(
        scriptDir, '..', '..', '..', '..', '..', '@projects', '@life',
        'life-manager', 'frontend', 'public', 'spellcheck-data',
      );

  return { wasmPath, ...frequencyFilesIn(dictionaryDir) };
}
// --- Create WASM engine for Node.js ---
/**
 * Build a SymSpell engine from on-disk data files and initialise it.
 *
 * The WASM binary and the dictionary are checked for readability up front so
 * a missing file produces a message pointing at --data-dir. The bigram file is
 * passed to the engine without a pre-check.
 *
 * @param dataDir - Optional data directory, forwarded to resolveWasmPaths.
 * @param verbose - When true, the error message shows the full path of the
 *   missing file; otherwise only its base name.
 * @returns The engine once `init()` has completed.
 * @throws Error when the WASM binary or the dictionary cannot be read.
 */
async function createWasmEngine(dataDir?: string, verbose = false): Promise<SpellEngine> {
  const paths = await resolveWasmPaths(dataDir);

  const requiredFiles = [
    ['WASM binary', paths.wasmPath],
    ['Dictionary', paths.dictionaryPath],
  ] as const;
  for (const [label, filePath] of requiredFiles) {
    try {
      // Readability probe only: avoids loading the whole file into memory
      // just to throw the contents away before the engine reads it again.
      await access(filePath, constants.R_OK);
    } catch {
      const displayPath = verbose ? filePath : basename(filePath);
      throw new Error(
        `${label} not found at ${displayPath}. Use --data-dir to specify the spellcheck-data directory.`,
      );
    }
  }

  const engine = new SymSpellEngine({
    wasmUrl: pathToFileURL(paths.wasmPath),
    dictionaryUrl: pathToFileURL(paths.dictionaryPath),
    bigramUrl: pathToFileURL(paths.bigramPath),
    maxEditDistance: 2,
  });
  // init() is run under withFileFetch so the file:// URLs above can be fetched.
  await withFileFetch(() => engine.init());
  return engine;
}
// --- Main ---
/** Options gathered from the command line. */
interface CliOptions {
  autoCorrect: boolean;
  useWasm: boolean;
  verbose: boolean;
  dataDir: string | undefined;
  /** Text given as trailing arguments; empty when none was supplied. */
  text: string;
}

/**
 * Parse leading option flags; the first non-option argument and everything
 * after it are joined with spaces to form the text. Prints usage and exits
 * with status 0 when --help / -h is encountered before the text.
 *
 * @throws Error when --data-dir is not followed by a value.
 */
function parseCliArgs(args: readonly string[]): CliOptions {
  const options: CliOptions = {
    autoCorrect: false,
    useWasm: false,
    verbose: false,
    dataDir: undefined,
    text: '',
  };
  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--fix':
      case '-f':
        options.autoCorrect = true;
        break;
      case '--wasm':
      case '-w':
        options.useWasm = true;
        break;
      case '--verbose':
      case '-v':
        options.verbose = true;
        break;
      case '--data-dir': {
        const value = args[i + 1];
        // Without this check the bare flag would be spell-checked as text.
        if (!value) {
          throw new Error('--data-dir requires a PATH argument. Use --help for usage information.');
        }
        options.dataDir = value;
        i++;
        break;
      }
      case '--help':
      case '-h':
        console.log(`
Usage: spellcheck-cli [options] <text>
Options:
-f, --fix Auto-correct the text (output corrected text only)
-w, --wasm Use SymSpell WASM engine (higher accuracy, requires dictionary data)
-v, --verbose Show normalization steps and intermediate forms
--data-dir PATH Path to spellcheck-data/ directory (WASM + dictionaries)
-h, --help Show this help message
WASM mode uses the SymSpell algorithm with a frequency dictionary for
high-accuracy corrections, including aggressive normalization for garbled
mobile/bike-typed input.
Examples:
spellcheck-cli "Check this text"
spellcheck-cli --fix "Fix teh typos"
spellcheck-cli --wasm "eeeeeeeexpppperi8ments qareee mmmporttANT"
spellcheck-cli --wasm --fix "eeeeeeeexpppperi8ments qareee mmmporttANT"
echo "Fix this text" | spellcheck-cli --fix
`);
        process.exit(0);
      default:
        // First non-option argument: the rest of the command line is the text.
        options.text = args.slice(i).join(' ');
        return options;
    }
  }
  return options;
}

/**
 * Read all of stdin as text, refusing input larger than MAX_CLI_INPUT_LENGTH bytes.
 *
 * @throws Error as soon as the accumulated size exceeds the limit.
 */
async function readStdinText(): Promise<string> {
  const chunks: Buffer[] = [];
  let totalBytes = 0;
  for await (const chunk of process.stdin) {
    totalBytes += chunk.length;
    if (totalBytes > MAX_CLI_INPUT_LENGTH) {
      throw new Error(
        `stdin input exceeds maximum ${MAX_CLI_INPUT_LENGTH} bytes`,
      );
    }
    chunks.push(chunk);
  }
  return Buffer.concat(chunks).toString();
}

/**
 * CLI entry point: parse arguments, obtain the text (arguments first, stdin
 * otherwise), build the spell checker and either print the corrected text
 * (--fix) or list each misspelled word with its top suggestion.
 *
 * Exits with status 1 when no non-blank text is available.
 */
async function main(): Promise<void> {
  const { autoCorrect, useWasm, verbose, dataDir, text: argText } = parseCliArgs(
    process.argv.slice(2),
  );

  // If no text provided as args, read from stdin (with byte limit)
  const text = argText || (await readStdinText());
  if (!text.trim()) {
    console.error('No text provided. Use --help for usage information.');
    process.exit(1);
  }

  // Create engine (WASM or dictionary-based)
  let engine: SpellEngine | undefined;
  if (useWasm) {
    if (verbose) console.error('[init] Loading WASM engine...');
    const startTime = performance.now();
    engine = await createWasmEngine(dataDir, verbose);
    if (verbose) {
      console.error(`[init] WASM engine ready (${Math.round(performance.now() - startTime)}ms)`);
    }
  }

  // Create spellchecker with options
  const spellChecker = new SpellChecker({
    engine,
    autoCorrect,
    enableAggressiveNormalization: true,
    ignoreCamelCase: false,
    customWords: ['claude', 'md', 'cuwu', 'api', 'cli', 'npm', 'tsx', 'workspace', 'uwuapps'],
    caseSensitive: false,
    // Built-in dictionaries are only requested when no external engine is supplied.
    ...(!engine ? { dictionaries: ['english', 'technical'] } : {}),
    confidenceThresholds: {
      autoFix: 0.60,
      suggest: 0.40,
      possible: 0.20,
    },
  });
  await spellChecker.initialize();
  if (!engine) {
    spellChecker.addSplitWordPattern('ist he', 'is the', 0.95);
    spellChecker.addWord('legacy', 'english');
    spellChecker.addWord('banned', 'english');
  }

  if (autoCorrect) {
    // Fix mode: output corrected text only
    const fixed = await spellChecker.fix(text);
    console.log(fixed);
    return;
  }

  // Check mode: show word-by-word results with normalization info.
  // Trim first so leading/trailing whitespace (e.g. the newline that `echo`
  // appends) does not yield empty tokens that would be sent to check().
  const words = text.trim().split(/\s+/);
  const results = await Promise.all(words.map((word) => spellChecker.check(word)));
  const errors = results.filter((r) => !r.correct && r.suggestions.length > 0);
  if (errors.length === 0) {
    console.log('No spelling errors found.');
    return;
  }
  for (const error of errors) {
    const suggestion = error.suggestions[0];
    let detail = `distance: ${distanceLabel(error.word, suggestion)}`;
    if (verbose) {
      const normInfo = describeNormalization(error.word);
      if (normInfo) {
        detail = `normalized: ${normInfo}, ${detail}`;
      }
    }
    console.log(` "${error.word}" → ${suggestion} (${detail})`);
  }
}
/**
 * Produce a short, human-readable note about which clean-up heuristic applies
 * to a word, together with the lower-cased result of applying it.
 *
 * Exactly one rule is reported, chosen in this order:
 *   1. runs of 3+ identical characters AND digits -> "collapse+strip"
 *   2. runs of 3+ identical characters            -> "collapse"
 *   3. digits                                     -> "strip"
 *   4. any doubled character                      -> "collapse2+"
 *
 * @param word - The word as typed by the user.
 * @returns The description, or null when none of the rules match.
 */
function describeNormalization(word: string): string | null {
  const lowered = word.toLowerCase();
  const collapseLongRuns = (s: string): string => s.replace(/(.)\1{2,}/g, '$1');
  const stripDigits = (s: string): string => s.replace(/[0-9]/g, '');

  const hasLongRun = /(.)\1{2,}/.test(lowered);
  const hasDigit = /[0-9]/.test(lowered);

  if (hasLongRun) {
    return hasDigit
      ? `collapse+strip → "${stripDigits(collapseLongRuns(lowered))}"`
      : `collapse → "${collapseLongRuns(lowered)}"`;
  }
  if (hasDigit) {
    return `strip → "${stripDigits(lowered)}"`;
  }
  if (/(.)\1/.test(lowered)) {
    // Only plain doubles remain at this point; squeeze each to one character.
    const squeezed = lowered.replace(/(.)\1+/g, '$1');
    if (squeezed !== lowered) {
      return `collapse2+ → "${squeezed}"`;
    }
  }
  return null;
}
/**
 * Case-insensitive Levenshtein distance between the typed word and the
 * suggestion, rendered as a string for display.
 *
 * Insertions, deletions and substitutions each cost 1. Characters are compared
 * by string index after lower-casing both inputs.
 *
 * @param original - The word as typed.
 * @param suggestion - The proposed replacement.
 * @returns The edit distance in decimal, e.g. "2".
 */
function distanceLabel(original: string, suggestion: string): string {
  const source = original.toLowerCase();
  const target = suggestion.toLowerCase();
  if (source === target) return '0';

  // Only the previous row of the DP table is needed to compute the next one.
  let previousRow = Array.from({ length: target.length + 1 }, (_, col) => col);
  for (let row = 1; row <= source.length; row++) {
    const currentRow = [row];
    for (let col = 1; col <= target.length; col++) {
      const substitutionCost = source[row - 1] === target[col - 1] ? 0 : 1;
      const viaDeletion = previousRow[col] + 1;
      const viaInsertion = currentRow[col - 1] + 1;
      const viaSubstitution = previousRow[col - 1] + substitutionCost;
      currentRow.push(Math.min(viaDeletion, viaInsertion, viaSubstitution));
    }
    previousRow = currentRow;
  }
  return String(previousRow[target.length]);
}
// Entry point: report any failure on stderr and exit with a non-zero status.
main().catch((error: unknown) => {
  // A rejection is not guaranteed to be an Error, so do not read .message blindly
  // (that would print "Error: undefined" for thrown strings or plain objects).
  const message = error instanceof Error ? error.message : String(error);
  console.error('Error:', message);
  process.exit(1);
});