text-processing-utils/bin/spellcheck-cli.ts

#!/usr/bin/env tsx

import { accessSync } from 'node:fs';
import { readFile } from 'node:fs/promises';
import { resolve, dirname, basename } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';

import { SpellChecker } from '../src/spellcheck/spell-checker';
import { SymSpellEngine } from '../src/spellcheck/engines/symspell-engine';
import type { SpellEngine } from '../src/spellcheck/engines/types';

// --- Constants ---

/**
 * Upper bound on piped stdin input. main() compares this against the running
 * total of chunk lengths read from process.stdin (i.e. bytes, not characters)
 * and throws once the total exceeds it.
 */
const MAX_CLI_INPUT_LENGTH = 1_000_000;

// --- Node.js fetch polyfill for file:// URLs ---

/**
 * Re-entrant depth counter for withFileFetch. Only the outermost call
 * saves/restores globalThis.fetch — nested calls just bump the counter.
 */
let fileFetchDepth = 0;
let originalFetch: typeof globalThis.fetch | null = null;

/**
 * Run `fn` while globalThis.fetch is temporarily patched to serve file:// URLs
 * from the local filesystem.
 *
 * SpellCheckerWasm.init() from @lilith/spellchecker-wasm loads the WASM binary
 * and dictionary files through fetch(), and Node's fetch does not handle
 * file:// URLs. While the patch is active, file:// requests are answered with
 * the file's bytes (tagged `application/wasm` for `.wasm` files); every other
 * request is forwarded to the fetch that was installed before patching.
 *
 * Nesting is tracked with a depth counter: the patch is installed when the
 * counter leaves 0 and the previous fetch is put back when it returns to 0,
 * regardless of whether `fn` resolved or rejected.
 *
 * @param fn Async work that needs file:// support in fetch.
 * @returns Whatever `fn` resolves to.
 * @throws Re-throws any rejection from `fn` (after restoring fetch if outermost).
 */
async function withFileFetch<T>(fn: () => Promise<T>): Promise<T> {
  if (fileFetchDepth === 0) {
    // Capture the delegate in the closure so the patched function never has to
    // dereference mutable module state (which is nulled on restore).
    const delegate = globalThis.fetch;
    originalFetch = delegate;

    globalThis.fetch = (async (input: RequestInfo | URL, init?: RequestInit) => {
      const url = typeof input === 'string'
        ? input
        : input instanceof URL
          ? input.href
          : input.url;

      if (url.startsWith('file://')) {
        // fileURLToPath percent-decodes the path and handles platform quirks
        // (e.g. Windows drive letters); URL#pathname would leave "%20" in place
        // and make readFile fail for paths containing spaces.
        const filePath = fileURLToPath(url);
        const buffer = await readFile(filePath);
        const headers: Record<string, string> = {};
        if (filePath.endsWith('.wasm')) {
          headers['Content-Type'] = 'application/wasm';
        }
        return new Response(buffer, { headers });
      }

      return delegate(input, init);
    }) as typeof fetch;
  }

  fileFetchDepth++;

  try {
    return await fn();
  } finally {
    fileFetchDepth--;
    if (fileFetchDepth === 0 && originalFetch) {
      globalThis.fetch = originalFetch;
      originalFetch = null;
    }
  }
}

// --- Resolve data file paths ---

/**
 * Work out where the WASM binary and the two frequency files live.
 *
 * With `dataDir`, all three files are resolved inside that directory.
 * Without it, the WASM binary comes from the `@lilith/spellchecker-wasm/wasm`
 * package export, and the dictionary directory is `../data/spellcheck-data`
 * relative to this script when `frequency-dictionary.txt` is accessible there,
 * otherwise the life-manager `public/spellcheck-data` directory.
 *
 * @param dataDir Optional directory holding `spellchecker.wasm` and the dictionaries.
 * @returns Absolute paths for the WASM binary, unigram dictionary and bigram dictionary.
 */
async function resolveWasmPaths(dataDir?: string): Promise<{
  wasmPath: string;
  dictionaryPath: string;
  bigramPath: string;
}> {
  const DICTIONARY_FILE = 'frequency-dictionary.txt';
  const BIGRAM_FILE = 'frequency-bigrams.txt';

  // Both frequency files always sit side by side in the same directory.
  const frequencyFilesIn = (dir: string) => ({
    dictionaryPath: resolve(dir, DICTIONARY_FILE),
    bigramPath: resolve(dir, BIGRAM_FILE),
  });

  if (dataDir) {
    return {
      wasmPath: resolve(dataDir, 'spellchecker.wasm'),
      ...frequencyFilesIn(dataDir),
    };
  }

  // ESM-native resolution of the package's "./wasm" export.
  const wasmPath = fileURLToPath(import.meta.resolve('@lilith/spellchecker-wasm/wasm'));

  const binDir = dirname(fileURLToPath(import.meta.url));
  const bundledDir = resolve(binDir, '..', 'data', 'spellcheck-data');

  // Probe the local data directory; any access failure selects the fallback.
  let bundledDictionaryAvailable = true;
  try {
    accessSync(resolve(bundledDir, DICTIONARY_FILE));
  } catch {
    bundledDictionaryAvailable = false;
  }

  const chosenDir = bundledDictionaryAvailable
    ? bundledDir
    : resolve(
        binDir, '..', '..', '..', '..', '..', '@projects', '@life',
        'life-manager', 'frontend', 'public', 'spellcheck-data',
      );

  return { wasmPath, ...frequencyFilesIn(chosenDir) };
}

// --- Create WASM engine for Node.js ---

/**
 * Build and initialise a SymSpell engine backed by files on disk.
 *
 * The WASM binary and the unigram dictionary must be readable; the bigram
 * file is passed to the engine but not checked here. Initialisation runs
 * inside withFileFetch so the engine's fetch() calls can load file:// URLs.
 *
 * @param dataDir Optional override directory forwarded to resolveWasmPaths.
 * @param verbose When true, error messages show the full path instead of the file name.
 * @returns The initialised engine.
 * @throws Error if the WASM binary or dictionary cannot be read.
 */
async function createWasmEngine(dataDir?: string, verbose = false): Promise<SpellEngine> {
  const { wasmPath, dictionaryPath, bigramPath } = await resolveWasmPaths(dataDir);

  const requiredFiles = [
    { label: 'WASM binary', location: wasmPath },
    { label: 'Dictionary', location: dictionaryPath },
  ];

  for (const { label, location } of requiredFiles) {
    try {
      // NOTE(review): this reads the whole file purely as a readability probe.
      await readFile(location, { flag: 'r' });
    } catch {
      // Hide the absolute path unless the user asked for verbose output.
      const shown = verbose ? location : basename(location);
      throw new Error(
        `${label} not found at ${shown}. Use --data-dir to specify the spellcheck-data directory.`,
      );
    }
  }

  const symSpell = new SymSpellEngine({
    wasmUrl: pathToFileURL(wasmPath),
    dictionaryUrl: pathToFileURL(dictionaryPath),
    bigramUrl: pathToFileURL(bigramPath),
    maxEditDistance: 2,
  });

  await withFileFetch(() => symSpell.init());

  return symSpell;
}

// --- Main ---

/**
 * CLI entry point: parse flags, obtain the input text (arguments or stdin),
 * build a SpellChecker, then either print the corrected text (--fix) or a
 * per-word report of suggested corrections.
 *
 * Flags are only recognised before the first non-flag argument; that argument
 * and everything after it are joined with spaces and used as the text.
 *
 * @throws Error when --data-dir is given without a value, or when stdin
 *   exceeds MAX_CLI_INPUT_LENGTH bytes.
 */
async function main() {
  const args = process.argv.slice(2);

  let autoCorrect = false;
  let useWasm = false;
  let verbose = false;
  let dataDir: string | undefined;
  let text = '';

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '--fix' || arg === '-f') {
      autoCorrect = true;
    } else if (arg === '--wasm' || arg === '-w') {
      useWasm = true;
    } else if (arg === '--verbose' || arg === '-v') {
      verbose = true;
    } else if (arg === '--data-dir') {
      // A trailing --data-dir used to fall through and be spell-checked as
      // text; report the missing value instead.
      const value = args[i + 1];
      if (!value) {
        throw new Error('--data-dir requires a PATH argument');
      }
      dataDir = value;
      i++;
    } else if (arg === '--help' || arg === '-h') {
      console.log(`
Usage: spellcheck-cli [options] <text>

Options:
  -f, --fix        Auto-correct the text (output corrected text only)
  -w, --wasm       Use SymSpell WASM engine (higher accuracy, requires dictionary data)
  -v, --verbose    Show normalization steps and intermediate forms
  --data-dir PATH  Path to spellcheck-data/ directory (WASM + dictionaries)
  -h, --help       Show this help message

WASM mode uses the SymSpell algorithm with a frequency dictionary for
high-accuracy corrections, including aggressive normalization for garbled
mobile/bike-typed input.

Examples:
  spellcheck-cli "Check this text"
  spellcheck-cli --fix "Fix teh typos"
  spellcheck-cli --wasm "eeeeeeeexpppperi8ments qareee mmmporttANT"
  spellcheck-cli --wasm --fix "eeeeeeeexpppperi8ments qareee mmmporttANT"
  echo "Fix this text" | spellcheck-cli --fix
      `);
      process.exit(0);
    } else {
      // First non-flag argument: the rest of the command line is the text.
      text = args.slice(i).join(' ');
      break;
    }
  }

  // If no text provided as args, read from stdin (with byte limit)
  if (!text) {
    const chunks: Buffer[] = [];
    let totalBytes = 0;
    for await (const chunk of process.stdin) {
      totalBytes += chunk.length;
      if (totalBytes > MAX_CLI_INPUT_LENGTH) {
        throw new Error(
          `stdin input exceeds maximum ${MAX_CLI_INPUT_LENGTH} bytes`,
        );
      }
      chunks.push(chunk);
    }
    text = Buffer.concat(chunks).toString();
  }

  if (!text.trim()) {
    console.error('No text provided. Use --help for usage information.');
    process.exit(1);
  }

  // Create engine (WASM or dictionary-based)
  let engine: SpellEngine | undefined;

  if (useWasm) {
    if (verbose) console.error('[init] Loading WASM engine...');
    const startTime = performance.now();
    engine = await createWasmEngine(dataDir, verbose);
    if (verbose) {
      console.error(`[init] WASM engine ready (${Math.round(performance.now() - startTime)}ms)`);
    }
  }

  // Create spellchecker with options
  const spellChecker = new SpellChecker({
    engine,
    autoCorrect,
    enableAggressiveNormalization: true,
    ignoreCamelCase: false,
    customWords: ['claude', 'md', 'cuwu', 'api', 'cli', 'npm', 'tsx', 'workspace', 'uwuapps'],
    caseSensitive: false,
    // Built-in dictionaries are only requested when no external engine is used.
    ...(!engine ? { dictionaries: ['english', 'technical'] } : {}),
    confidenceThresholds: {
      autoFix: 0.60,
      suggest: 0.40,
      possible: 0.20,
    },
  });

  await spellChecker.initialize();

  if (!engine) {
    spellChecker.addSplitWordPattern('ist he', 'is the', 0.95);
    spellChecker.addWord('legacy', 'english');
    spellChecker.addWord('banned', 'english');
  }

  if (autoCorrect) {
    // Fix mode: output corrected text only
    const fixed = await spellChecker.fix(text);
    console.log(fixed);
  } else {
    // Check mode: show word-by-word results with normalization info.
    // Trim first so leading/trailing whitespace (e.g. the newline from `echo`)
    // does not produce an empty "word" that gets sent to the checker.
    const words = text.trim().split(/\s+/);
    const results = await Promise.all(words.map((word) => spellChecker.check(word)));

    // Only misspellings that come with at least one suggestion are reported.
    const errors = results.filter((r) => !r.correct && r.suggestions.length > 0);

    if (errors.length === 0) {
      console.log('No spelling errors found.');
    } else {
      for (const error of errors) {
        const suggestion = error.suggestions[0];
        let detail = `distance: ${distanceLabel(error.word, suggestion)}`;

        if (verbose) {
          const normInfo = describeNormalization(error.word);
          if (normInfo) {
            detail = `normalized: ${normInfo}, ${detail}`;
          }
        }

        console.log(`  "${error.word}" → ${suggestion} (${detail})`);
      }
    }
  }
}

/**
 * Describe what normalization transforms were applied to get from
 * the garbled input closer to a dictionary word.
 */
/**
 * Produce a short, human-readable note on how a garbled word could be
 * simplified, for verbose output.
 *
 * The word is lower-cased, then exactly one of these is reported, in priority
 * order: runs of 3+ identical characters collapsed and digits removed; runs of
 * 3+ collapsed; digits removed; any doubled characters collapsed.
 *
 * @param word The original (misspelled) token.
 * @returns A description such as `collapse → "qare"`, or null when none of
 *   the patterns occur in the word.
 */
function describeNormalization(word: string): string | null {
  const folded = word.toLowerCase();

  const collapseLongRuns = (s: string): string => s.replace(/(.)\1{2,}/g, '$1');
  const dropDigits = (s: string): string => s.replace(/[0-9]/g, '');

  const hasLongRun = /(.)\1{2,}/.test(folded);
  const hasDigit = /[0-9]/.test(folded);

  if (hasLongRun) {
    return hasDigit
      ? `collapse+strip → "${dropDigits(collapseLongRuns(folded))}"`
      : `collapse → "${collapseLongRuns(folded)}"`;
  }

  if (hasDigit) {
    return `strip → "${dropDigits(folded)}"`;
  }

  // Only plain doubles remain possible at this point.
  if (/(.)\1/.test(folded)) {
    const squeezed = folded.replace(/(.)\1+/g, '$1');
    return squeezed !== folded ? `collapse2+ → "${squeezed}"` : null;
  }

  return null;
}

/**
 * Simple edit distance label for display.
 */
/**
 * Case-insensitive edit distance between a word and its suggestion, as a
 * string for display.
 *
 * Insertions, deletions and substitutions each cost 1; a swap of two adjacent
 * characters therefore counts as 2.
 *
 * @param original The token as typed.
 * @param suggestion The proposed replacement.
 * @returns The distance formatted as a decimal string ('0' when equal ignoring case).
 */
function distanceLabel(original: string, suggestion: string): string {
  const source = original.toLowerCase();
  const target = suggestion.toLowerCase();

  if (source === target) return '0';

  // Only the previous row is needed to compute the next one.
  let previousRow = Array.from({ length: target.length + 1 }, (_, col) => col);

  for (let row = 1; row <= source.length; row++) {
    const currentRow = [row];
    for (let col = 1; col <= target.length; col++) {
      const substitution = source[row - 1] === target[col - 1] ? 0 : 1;
      currentRow.push(
        Math.min(
          previousRow[col] + 1,
          currentRow[col - 1] + 1,
          previousRow[col - 1] + substitution,
        ),
      );
    }
    previousRow = currentRow;
  }

  return String(previousRow[target.length]);
}

// Top-level failure handler: print a one-line message and exit non-zero.
// The rejection reason is not guaranteed to be an Error, so narrow it first
// rather than printing "undefined" for thrown strings or other values.
main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error('Error:', message);
  process.exit(1);
});