346 lines
10 KiB
TypeScript
Executable file
346 lines
10 KiB
TypeScript
Executable file
#!/usr/bin/env tsx
|
|
|
|
import { accessSync } from 'node:fs';
|
|
import { readFile } from 'node:fs/promises';
|
|
import { resolve, dirname, basename } from 'node:path';
|
|
import { fileURLToPath, pathToFileURL } from 'node:url';
|
|
|
|
import { SpellChecker } from '../src/spellcheck/spell-checker';
|
|
import { SymSpellEngine } from '../src/spellcheck/engines/symspell-engine';
|
|
import type { SpellEngine } from '../src/spellcheck/engines/types';
|
|
|
|
// --- Constants ---
|
|
|
|
// Upper bound on how much data main() will accept from stdin. Despite the
// "LENGTH" name, it is compared against the running total of chunk byte
// lengths, so the unit is bytes rather than characters.
const MAX_CLI_INPUT_LENGTH = 1_000_000;
|
|
|
|
// --- Node.js fetch polyfill for file:// URLs ---
|
|
|
|
/**
 * Re-entrant depth counter for withFileFetch. Only the outermost call
 * saves/restores globalThis.fetch — nested calls just bump the counter.
 */
let fileFetchDepth = 0;
let originalFetch: typeof globalThis.fetch | null = null;

/**
 * Temporarily patch globalThis.fetch to handle file:// URLs.
 * The SpellCheckerWasm.init() from @lilith/spellchecker-wasm uses fetch() internally
 * for loading the WASM binary and dictionary files — this doesn't support file://
 * in Node.js. We intercept file:// requests and serve them from disk.
 *
 * Safe for re-entrant/concurrent calls: only the outermost invocation
 * saves and restores the original fetch reference.
 *
 * @param fn Async callback to run while the patched fetch is installed.
 * @returns Whatever `fn` resolves to; rejections from `fn` propagate unchanged
 *          after the original fetch has been restored.
 */
async function withFileFetch<T>(fn: () => Promise<T>): Promise<T> {
  if (fileFetchDepth === 0) {
    // Capture the real fetch in the closure instead of reading the mutable
    // module variable later: `originalFetch` is reset to null on restore, so a
    // retained reference to the patched fetch would otherwise crash.
    const realFetch = globalThis.fetch;
    originalFetch = realFetch;

    globalThis.fetch = (async (input: RequestInfo | URL, init?: RequestInit) => {
      const url =
        typeof input === 'string'
          ? input
          : input instanceof URL
            ? input.href
            : input.url;

      if (url.startsWith('file://')) {
        // fileURLToPath percent-decodes the path (spaces, non-ASCII) and
        // handles Windows drive letters; URL#pathname does neither.
        const filePath = fileURLToPath(url);
        const buffer = await readFile(filePath);
        const headers: Record<string, string> = {};
        if (filePath.endsWith('.wasm')) {
          // Set so the Response is labelled as a WASM module for consumers
          // that check the MIME type (e.g. streaming instantiation).
          headers['Content-Type'] = 'application/wasm';
        }
        return new Response(buffer, { headers });
      }

      // Everything that is not file:// goes to the untouched fetch.
      return realFetch(input, init);
    }) as typeof fetch;
  }

  fileFetchDepth++;

  try {
    return await fn();
  } finally {
    fileFetchDepth--;
    // Only the outermost call puts the original fetch back.
    if (fileFetchDepth === 0 && originalFetch) {
      globalThis.fetch = originalFetch;
      originalFetch = null;
    }
  }
}
|
|
|
|
// --- Resolve data file paths ---
|
|
|
|
/**
 * Work out where the WASM binary and the frequency dictionaries live on disk.
 *
 * With an explicit `dataDir`, all three files are expected directly inside it.
 * Otherwise the WASM binary is resolved through the
 * `@lilith/spellchecker-wasm/wasm` package export, and the dictionaries come
 * from `../data/spellcheck-data` next to this script when that directory has
 * a `frequency-dictionary.txt`, falling back to the life-manager public
 * directory. The returned paths are not guaranteed to exist.
 *
 * @param dataDir Optional directory holding the WASM binary and dictionaries.
 * @returns Absolute paths for the WASM binary, dictionary and bigram files.
 */
async function resolveWasmPaths(dataDir?: string): Promise<{
  wasmPath: string;
  dictionaryPath: string;
  bigramPath: string;
}> {
  const dictionaryFile = 'frequency-dictionary.txt';
  const bigramFile = 'frequency-bigrams.txt';

  if (dataDir) {
    return {
      wasmPath: resolve(dataDir, 'spellchecker.wasm'),
      dictionaryPath: resolve(dataDir, dictionaryFile),
      bigramPath: resolve(dataDir, bigramFile),
    };
  }

  // Resolve WASM binary via the package's "./wasm" export (ESM-native resolution)
  const wasmPath = fileURLToPath(import.meta.resolve('@lilith/spellchecker-wasm/wasm'));

  // Candidate dictionary locations, relative to this script's directory.
  const scriptDir = dirname(fileURLToPath(import.meta.url));
  const bundledDir = resolve(scriptDir, '..', 'data', 'spellcheck-data');
  const lifeManagerDir = resolve(
    scriptDir, '..', '..', '..', '..', '..', '@projects', '@life',
    'life-manager', 'frontend', 'public', 'spellcheck-data',
  );

  const hasDictionary = (dir: string): boolean => {
    try {
      accessSync(resolve(dir, dictionaryFile));
      return true;
    } catch {
      return false;
    }
  };

  // Prefer the bundled copy; otherwise assume life-manager's public dir.
  const dictionaryDir = hasDictionary(bundledDir) ? bundledDir : lifeManagerDir;

  return {
    wasmPath,
    dictionaryPath: resolve(dictionaryDir, dictionaryFile),
    bigramPath: resolve(dictionaryDir, bigramFile),
  };
}
|
|
|
|
// --- Create WASM engine for Node.js ---
|
|
|
|
/**
 * Build and initialise a SymSpell WASM engine for use under Node.js.
 *
 * Resolves the data file locations, checks that the WASM binary and the
 * dictionary exist, then initialises the engine with fetch temporarily
 * patched so its file:// URLs can be served from disk. The bigram file is
 * passed to the engine but is not checked here.
 *
 * @param dataDir Optional directory containing the WASM binary and dictionaries.
 * @param verbose When true, error messages include the full file path instead
 *                of just the file name.
 * @returns The initialised engine.
 * @throws Error if the WASM binary or dictionary file does not exist.
 */
async function createWasmEngine(dataDir?: string, verbose = false): Promise<SpellEngine> {
  const paths = await resolveWasmPaths(dataDir);

  const requiredFiles = [
    ['WASM binary', paths.wasmPath],
    ['Dictionary', paths.dictionaryPath],
  ] as const;

  // Verify files exist. An existence check is enough here: reading the whole
  // WASM binary and dictionary just to discard them wastes time and memory,
  // since engine.init() loads them again.
  for (const [label, filePath] of requiredFiles) {
    try {
      accessSync(filePath);
    } catch {
      // Only reveal the full path when the user asked for verbose output.
      const displayPath = verbose ? filePath : basename(filePath);
      throw new Error(
        `${label} not found at ${displayPath}. Use --data-dir to specify the spellcheck-data directory.`,
      );
    }
  }

  const engine = new SymSpellEngine({
    wasmUrl: pathToFileURL(paths.wasmPath),
    dictionaryUrl: pathToFileURL(paths.dictionaryPath),
    bigramUrl: pathToFileURL(paths.bigramPath),
    maxEditDistance: 2,
  });

  // The engine loads its assets through fetch(); serve file:// from disk.
  await withFileFetch(() => engine.init());

  return engine;
}
|
|
|
|
// --- Main ---
|
|
|
|
/**
 * CLI entry point: parse flags, obtain the text (arguments or stdin), build
 * the spell checker, then either print the corrected text (--fix) or list
 * each misspelled word with its top suggestion.
 *
 * Flags are only recognised before the first non-flag argument; that argument
 * and everything after it are joined with spaces and used as the text.
 *
 * @throws Error if --data-dir is given without a value, or if stdin input
 *         exceeds MAX_CLI_INPUT_LENGTH bytes.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);

  let autoCorrect = false;
  let useWasm = false;
  let verbose = false;
  let dataDir: string | undefined;
  let text = '';

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    if (arg === '--fix' || arg === '-f') {
      autoCorrect = true;
    } else if (arg === '--wasm' || arg === '-w') {
      useWasm = true;
    } else if (arg === '--verbose' || arg === '-v') {
      verbose = true;
    } else if (arg === '--data-dir') {
      // A trailing --data-dir with no value used to fall through and be
      // spell-checked as text; report it as a usage error instead.
      const value = args[i + 1];
      if (!value) {
        throw new Error('--data-dir requires a path argument. Use --help for usage information.');
      }
      dataDir = value;
      i++;
    } else if (arg === '--help' || arg === '-h') {
      console.log(`
Usage: spellcheck-cli [options] <text>

Options:
-f, --fix Auto-correct the text (output corrected text only)
-w, --wasm Use SymSpell WASM engine (higher accuracy, requires dictionary data)
-v, --verbose Show normalization steps and intermediate forms
--data-dir PATH Path to spellcheck-data/ directory (WASM + dictionaries)
-h, --help Show this help message

WASM mode uses the SymSpell algorithm with a frequency dictionary for
high-accuracy corrections, including aggressive normalization for garbled
mobile/bike-typed input.

Examples:
spellcheck-cli "Check this text"
spellcheck-cli --fix "Fix teh typos"
spellcheck-cli --wasm "eeeeeeeexpppperi8ments qareee mmmporttANT"
spellcheck-cli --wasm --fix "eeeeeeeexpppperi8ments qareee mmmporttANT"
echo "Fix this text" | spellcheck-cli --fix
`);
      process.exit(0);
    } else {
      // First non-flag argument: the rest of the command line is the text.
      text = args.slice(i).join(' ');
      break;
    }
  }

  // If no text provided as args, read from stdin (with byte limit)
  if (!text) {
    const chunks: Buffer[] = [];
    let totalBytes = 0;
    for await (const chunk of process.stdin) {
      totalBytes += chunk.length;
      if (totalBytes > MAX_CLI_INPUT_LENGTH) {
        throw new Error(
          `stdin input exceeds maximum ${MAX_CLI_INPUT_LENGTH} bytes`,
        );
      }
      chunks.push(chunk);
    }
    text = Buffer.concat(chunks).toString();
  }

  if (!text.trim()) {
    console.error('No text provided. Use --help for usage information.');
    process.exit(1);
  }

  // Create engine (WASM or dictionary-based)
  let engine: SpellEngine | undefined;

  if (useWasm) {
    // Diagnostics go to stderr so stdout carries only results.
    if (verbose) console.error('[init] Loading WASM engine...');
    const startTime = performance.now();
    engine = await createWasmEngine(dataDir, verbose);
    if (verbose) {
      console.error(`[init] WASM engine ready (${Math.round(performance.now() - startTime)}ms)`);
    }
  }

  // Create spellchecker with options
  const spellChecker = new SpellChecker({
    engine,
    autoCorrect,
    enableAggressiveNormalization: true,
    ignoreCamelCase: false,
    customWords: ['claude', 'md', 'cuwu', 'api', 'cli', 'npm', 'tsx', 'workspace', 'uwuapps'],
    caseSensitive: false,
    // Built-in dictionaries are only requested when no engine is supplied.
    ...(!engine ? { dictionaries: ['english', 'technical'] } : {}),
    confidenceThresholds: {
      autoFix: 0.60,
      suggest: 0.40,
      possible: 0.20,
    },
  });

  await spellChecker.initialize();

  if (!engine) {
    // Extra entries applied only in dictionary mode.
    spellChecker.addSplitWordPattern('ist he', 'is the', 0.95);
    spellChecker.addWord('legacy', 'english');
    spellChecker.addWord('banned', 'english');
  }

  if (autoCorrect) {
    // Fix mode: output corrected text only
    const fixed = await spellChecker.fix(text);
    console.log(fixed);
  } else {
    // Check mode: show word-by-word results with normalization info.
    // Trim first: piped input usually ends with a newline, and splitting
    // untrimmed text yields an empty token that would be sent to check().
    const words = text.trim().split(/\s+/);
    const results = await Promise.all(words.map((word) => spellChecker.check(word)));

    // Only report words that are wrong AND have something to suggest.
    const errors = results.filter((r) => !r.correct && r.suggestions.length > 0);

    if (errors.length === 0) {
      console.log('No spelling errors found.');
    } else {
      for (const error of errors) {
        const suggestion = error.suggestions[0];
        let detail = `distance: ${distanceLabel(error.word, suggestion)}`;

        if (verbose) {
          const normInfo = describeNormalization(error.word);
          if (normInfo) {
            detail = `normalized: ${normInfo}, ${detail}`;
          }
        }

        console.log(` "${error.word}" → ${suggestion} (${detail})`);
      }
    }
  }
}
|
|
|
|
/**
 * Produce a short, human-readable note on how a garbled word would be
 * normalized, for verbose output. This is a display heuristic computed from
 * the word alone; it does not consult the spell checker.
 *
 * Exactly one description is produced, chosen in this order:
 *  - runs of 3+ identical characters AND digits: collapse runs, then strip digits
 *  - runs of 3+ identical characters only: collapse runs
 *  - digits only: strip digits
 *  - doubled characters only: collapse every run of 2+
 *
 * @param word The original (possibly garbled) word.
 * @returns The description, or null when none of the cases apply.
 */
function describeNormalization(word: string): string | null {
  const lowered = word.toLowerCase();

  const collapseLongRuns = (s: string): string => s.replace(/(.)\1{2,}/g, '$1');
  const stripDigits = (s: string): string => s.replace(/[0-9]/g, '');

  const containsLongRun = /(.)\1{2,}/.test(lowered);
  const containsDigit = /[0-9]/.test(lowered);

  if (containsLongRun) {
    return containsDigit
      ? `collapse+strip → "${stripDigits(collapseLongRuns(lowered))}"`
      : `collapse → "${collapseLongRuns(lowered)}"`;
  }

  if (containsDigit) {
    return `strip → "${stripDigits(lowered)}"`;
  }

  // No long runs and no digits: the only remaining case is doubled letters.
  // If nothing is doubled the replace is a no-op and there is nothing to say.
  const deduped = lowered.replace(/(.)\1+/g, '$1');
  return deduped !== lowered ? `collapse2+ → "${deduped}"` : null;
}
|
|
|
|
/**
 * Case-insensitive Levenshtein distance between a word and its suggestion,
 * returned as a string for display. Insertions, deletions and substitutions
 * each cost 1; transpositions are not treated specially.
 *
 * @param original   The word as typed.
 * @param suggestion The suggested replacement.
 * @returns The edit distance as a decimal string ('0' when equal ignoring case).
 */
function distanceLabel(original: string, suggestion: string): string {
  const source = original.toLowerCase();
  const target = suggestion.toLowerCase();

  if (source === target) return '0';

  // Rolling-row dynamic programme: `previous` holds distances from the first
  // (row - 1) characters of `source` to every prefix of `target`.
  let previous: number[] = Array.from({ length: target.length + 1 }, (_, col) => col);

  for (let row = 1; row <= source.length; row++) {
    const current: number[] = [row];
    for (let col = 1; col <= target.length; col++) {
      const substitution = previous[col - 1] + (source[row - 1] === target[col - 1] ? 0 : 1);
      const deletion = previous[col] + 1;
      const insertion = current[col - 1] + 1;
      current[col] = Math.min(deletion, insertion, substitution);
    }
    previous = current;
  }

  return String(previous[target.length]);
}
|
|
|
|
// Run the CLI. Any rejection from main() is reported on stderr and turned
// into a non-zero exit code.
main().catch((error: unknown) => {
  // A rejection is not guaranteed to be an Error; reading `.message`
  // unconditionally would print "undefined" for thrown strings or objects.
  const message = error instanceof Error ? error.message : String(error);
  console.error('Error:', message);
  process.exit(1);
});
|