perf(spellcheck): Optimize spell-checking performance by restructuring dictionary loading, integrating SymSpell engine, updating word frequency data, and refactoring core components (dictionary-manager, spell-checker, suggestion-engine)

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
Lilith 2026-02-26 15:51:46 -08:00
parent 5522dcb628
commit bd19c0c5cc
31 changed files with 4171 additions and 250 deletions

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,5 @@
import { describe, test, expect } from 'vitest';
import { UrlExtractor } from '../../src/extractors/url-extractor';
import { UrlExtractor } from '../../src/extractors/url-extractor.js';
describe('UrlExtractor', () => {
describe('basic extraction', () => {

View file

@ -4,7 +4,7 @@ import {
withTimeoutSync,
TimeoutWrapper,
TimeoutError
} from '../../src/performance/timeout-wrapper';
} from '../../src/performance/timeout-wrapper.js';
describe('TimeoutWrapper', () => {
describe('withTimeout (async)', () => {

View file

@ -1,5 +1,5 @@
import { describe, test, expect } from 'vitest';
import { AnsiStripper } from '../../src/sanitizers/ansi-stripper';
import { AnsiStripper } from '../../src/sanitizers/ansi-stripper.js';
describe('AnsiStripper', () => {
const stripper = new AnsiStripper();

View file

@ -3,12 +3,13 @@
* Provides nuanced confidence levels for better auto-fix decisions
*/
import { LevenshteinDistance, DamerauLevenshtein } from '@lilith/text-processing-algorithms/distance';
import { DamerauLevenshtein } from '@lilith/text-processing-algorithms/distance';
import { Soundex, Metaphone } from '@lilith/text-processing-algorithms/phonetic';
import { TypoManager } from '../typos/index.js';
import keyboardLayout from '~/data/spellcheck/keyboard-layout.json' with { type: 'json' };
import wordFrequencies from '~/data/spellcheck/word-frequencies.json' with { type: 'json' };
export enum CorrectionConfidence {
AUTO_FIX = 'auto-fix', // > 0.95 - Safe to auto-fix
@ -45,8 +46,6 @@ export interface ConfidenceScorerOptions {
}
export class ConfidenceScorer {
// @ts-expect-error Reserved for future use
private readonly _levenshtein: LevenshteinDistance;
private readonly damerauLevenshtein: DamerauLevenshtein;
private readonly soundex: Soundex;
private readonly metaphone: Metaphone;
@ -77,7 +76,6 @@ export class ConfidenceScorer {
}
constructor(options: ConfidenceScorerOptions = {}) {
this._levenshtein = new LevenshteinDistance();
this.damerauLevenshtein = new DamerauLevenshtein();
this.soundex = new Soundex();
this.KEYBOARD_ADJACENCY = this.initializeKeyboardAdjacency();
@ -99,9 +97,15 @@ export class ConfidenceScorer {
original: string,
suggestion: string,
additionalSuggestions: string[] = [],
engineFrequency?: number,
): number {
const factors = this.analyzeFactors(original, suggestion, additionalSuggestions);
// If engine provides corpus frequency, use it directly instead of static lookup
if (engineFrequency !== undefined) {
factors.wordFrequency = this.normalizeEngineFrequency(engineFrequency);
}
// Check for known typo first
if (factors.isKnownTypo) {
const known = this.typoManager.getCorrection(original);
@ -182,108 +186,112 @@ export class ConfidenceScorer {
* Calculate keyboard proximity score
*/
private calculateKeyboardProximity(original: string, suggestion: string): number {
if (original.length !== suggestion.length) {
return 0;
const lenDiff = original.length - suggestion.length;
// Same length: check each differing position for keyboard adjacency
if (lenDiff === 0) {
let proximityScore = 0;
let differences = 0;
for (let i = 0; i < original.length; i++) {
const origChar = original[i].toLowerCase();
const suggChar = suggestion[i].toLowerCase();
if (origChar !== suggChar) {
differences++;
const adjacent = this.KEYBOARD_ADJACENCY.get(origChar);
if (adjacent?.has(suggChar)) {
proximityScore++;
}
}
}
if (differences === 0) {
return 1;
}
return proximityScore / differences;
}
let proximityScore = 0;
let differences = 0;
// Length diff of 1: detect accidental adjacent-key insertion
if (Math.abs(lenDiff) === 1) {
const [longer, shorter] = lenDiff > 0 ? [original, suggestion] : [suggestion, original];
for (let i = 0; i < original.length; i++) {
const origChar = original[i].toLowerCase();
const suggChar = suggestion[i].toLowerCase();
let insertIdx = 0;
if (origChar !== suggChar) {
differences++;
const adjacent = this.KEYBOARD_ADJACENCY.get(origChar);
while (insertIdx < shorter.length && longer[insertIdx] === shorter[insertIdx]) {
insertIdx++;
}
if (adjacent?.has(suggChar)) {
proximityScore++;
let matchesAfter = true;
for (let i = insertIdx; i < shorter.length; i++) {
if (longer[i + 1] !== shorter[i]) {
matchesAfter = false;
break;
}
}
if (matchesAfter) {
const insertedChar = longer[insertIdx].toLowerCase();
const prevChar = insertIdx > 0 ? longer[insertIdx - 1].toLowerCase() : null;
const nextChar = insertIdx < longer.length - 1 ? longer[insertIdx + 1].toLowerCase() : null;
const prevAdjacent = prevChar ? this.KEYBOARD_ADJACENCY.get(prevChar) : null;
const nextAdjacent = nextChar ? this.KEYBOARD_ADJACENCY.get(nextChar) : null;
if (prevAdjacent?.has(insertedChar) || nextAdjacent?.has(insertedChar)) {
return 0.8; // High proximity — accidental adjacent-key insertion
}
}
}
if (differences === 0) {
return 1;
return 0;
}
private static frequencyMap: Map<string, number> | null = null;
private static getFrequencyMap(): Map<string, number> {
if (!ConfidenceScorer.frequencyMap) {
ConfidenceScorer.frequencyMap = new Map(
Object.entries(wordFrequencies as Record<string, number>),
);
}
return proximityScore / differences;
return ConfidenceScorer.frequencyMap;
}
/**
* Get word frequency (mock implementation)
* Normalize raw corpus frequency from SymSpell engine to the 0-1000 scale
* used by the confidence factors. SymSpell counts are raw corpus occurrences
* (e.g., "the" = 23 billion). We map to the same tiered scale as getWordFrequency.
*/
private normalizeEngineFrequency(count: number): number {
  // Descending [minimum raw count, tier score] pairs. The first floor that
  // `count` reaches decides the score.
  const tiers: ReadonlyArray<readonly [number, number]> = [
    [1_000_000_000, 1000], // Top-tier (the, of, and...)
    [100_000_000, 800],
    [10_000_000, 600],
    [1_000_000, 400],
    [100_000, 250],
    [10_000, 150],
  ];

  for (const [floor, score] of tiers) {
    if (count >= floor) {
      return score;
    }
  }

  // Below every floor. NaN also lands here because it fails every comparison.
  return 100;
}
/**
* Get word frequency score based on rank in common English words.
* Returns 0-1000 based on how common the word is.
*/
private getWordFrequency(word: string): number {
// Common words get high frequency
const commonWords = new Set([
'the',
'be',
'to',
'of',
'and',
'a',
'in',
'that',
'have',
'i',
'it',
'for',
'not',
'on',
'with',
'he',
'as',
'you',
'do',
'at',
'this',
'but',
'his',
'by',
'from',
'they',
'we',
'say',
'her',
'she',
'function',
'class',
'const',
'let',
'var',
'return',
'if',
'else',
]);
const rank = ConfidenceScorer.getFrequencyMap().get(word.toLowerCase());
if (commonWords.has(word.toLowerCase())) {
return 1000;
}
if (!rank) return 50; // Unknown words get a low default
if (rank <= 100) return 1000;
if (rank <= 500) return 800;
if (rank <= 1000) return 600;
if (rank <= 2000) return 400;
if (rank <= 3000) return 250;
if (rank <= 5000) return 150;
// Tech terms get medium frequency
const techTerms = new Set([
'javascript',
'typescript',
'python',
'java',
'react',
'angular',
'vue',
'node',
'npm',
'git',
'github',
'docker',
'kubernetes',
'api',
'rest',
]);
if (techTerms.has(word.toLowerCase())) {
return 500;
}
// Default low frequency
return 100;
}

View file

@ -0,0 +1,4 @@
/**
 * Asynchronous source of dictionary text. Dictionaries receive an
 * implementation of this interface instead of touching a storage backend
 * themselves.
 */
export interface DictionaryDataLoader {
/** Resolves with the text content found at `path`. */
loadText(path: string): Promise<string>;
/** Resolves to whether something is available at `path`. */
exists(path: string): Promise<boolean>;
}

View file

@ -3,6 +3,7 @@ import { TechnicalDictionary } from '../implementations/technical-dictionary.js'
import { DictionaryBase } from './dictionary-base.js';
import type { DictionaryDataLoader } from './dictionary-loader.js';
import type { Dictionary, DictionaryConfig } from '../../types/spellcheck.types.js';
export class CustomDictionary extends DictionaryBase {
@ -23,20 +24,27 @@ export class CustomDictionary extends DictionaryBase {
export class DictionaryManager {
private readonly dictionaries: Map<string, Dictionary> = new Map();
private readonly priorities: Map<string, number> = new Map();
private readonly loader: DictionaryDataLoader | undefined;
private initialized: boolean = false;
/**
 * @param loader Optional data loader, stored exactly as given. When it is
 * omitted, `initialize()` obtains one from `createDefaultLoader()` instead.
 */
constructor(loader?: DictionaryDataLoader) {
this.loader = loader;
}
async initialize(configs?: DictionaryConfig[]): Promise<void> {
if (this.initialized) {
return;
}
const loader = this.loader ?? (await this.createDefaultLoader());
// Load default dictionaries
const englishDict = new EnglishDictionary();
const englishDict = new EnglishDictionary(loader);
await englishDict.loadDictionary();
this.addDictionary(englishDict, 100);
const technicalDict = new TechnicalDictionary();
const technicalDict = new TechnicalDictionary(loader);
await technicalDict.loadDictionary();
this.addDictionary(technicalDict, 90);
@ -51,6 +59,14 @@ export class DictionaryManager {
this.initialized = true;
}
/**
 * Creates the loader used when none was supplied: a `NodeDictionaryLoader`
 * rooted at the directory returned by `getDataRoot()`.
 */
private async createDefaultLoader(): Promise<DictionaryDataLoader> {
  // Both modules are imported on demand so that fs is not pulled into
  // browser bundles.
  const loaderModule = await import('../loaders/node-loader.js');
  const pathsModule = await import('../../../utils/paths.js');

  return new loaderModule.NodeDictionaryLoader(pathsModule.getDataRoot());
}
private async loadCustomDictionary(config: DictionaryConfig): Promise<void> {
const dict = new CustomDictionary(config.name, config.words || []);

View file

@ -1,32 +1,28 @@
import * as fs from 'fs';
import { PATHS } from '../../../utils/paths.js';
import type { DictionaryDataLoader } from '../core/dictionary-loader.js';
import { DictionaryBase } from '../core/dictionary-base.js';
export class EnglishDictionary extends DictionaryBase {
private static readonly DICTIONARY_FILE = PATHS.dictionaries.english();
private static readonly SUPPLEMENT_FILE = PATHS.dictionaries.technical();
private readonly loader: DictionaryDataLoader;
// Note: Common misspellings are now handled by TypoManager
// This keeps dictionary focused on valid words only
constructor() {
constructor(loader: DictionaryDataLoader) {
super('english');
this.loader = loader;
}
async loadDictionary(): Promise<void> {
const words = new Set<string>();
// FAIL FAST - No fallbacks per CLAUDE.md
if (!fs.existsSync(EnglishDictionary.DICTIONARY_FILE)) {
const dictionaryExists = await this.loader.exists('dictionaries/english-words.txt');
if (!dictionaryExists) {
throw new Error(
`Dictionary file not found at: ${EnglishDictionary.DICTIONARY_FILE}\n` +
`This is a hard failure. Fix the root cause - ensure dictionary file exists.`,
'Dictionary file not found: dictionaries/english-words.txt\n' +
'This is a hard failure. Fix the root cause - ensure dictionary file exists.',
);
}
// Load main English dictionary
const content = fs.readFileSync(EnglishDictionary.DICTIONARY_FILE, 'utf-8');
const content = await this.loader.loadText('dictionaries/english-words.txt');
const dictWords = content
.split('\n')
.map((w) => w.trim().toLowerCase())
@ -35,8 +31,10 @@ export class EnglishDictionary extends DictionaryBase {
dictWords.forEach((w) => words.add(w));
// Load supplemental technical terms if available
if (fs.existsSync(EnglishDictionary.SUPPLEMENT_FILE)) {
const supplementContent = fs.readFileSync(EnglishDictionary.SUPPLEMENT_FILE, 'utf-8');
const supplementExists = await this.loader.exists('dictionaries/technical-terms.txt');
if (supplementExists) {
const supplementContent = await this.loader.loadText('dictionaries/technical-terms.txt');
const supplementWords = supplementContent
.split('\n')
.map((w) => w.trim().toLowerCase())

View file

@ -1,21 +1,22 @@
import * as fs from 'fs';
import { PATHS, verifyFileExists } from '../../../utils/paths.js';
import type { DictionaryDataLoader } from '../core/dictionary-loader.js';
import { DictionaryBase } from '../core/dictionary-base.js';
export class TechnicalDictionary extends DictionaryBase {
// Path to consolidated technical terms file
private static readonly TECH_TERMS_FILE = PATHS.dictionaries.technical();
private readonly loader: DictionaryDataLoader;
constructor() {
constructor(loader: DictionaryDataLoader) {
super('technical');
this.loader = loader;
}
async loadDictionary(): Promise<void> {
// Fail fast if file doesn't exist - no test workarounds
verifyFileExists(TechnicalDictionary.TECH_TERMS_FILE);
const exists = await this.loader.exists('dictionaries/technical-terms.txt');
const content = fs.readFileSync(TechnicalDictionary.TECH_TERMS_FILE, 'utf-8');
if (!exists) {
throw new Error('Required file not found: dictionaries/technical-terms.txt');
}
const content = await this.loader.loadText('dictionaries/technical-terms.txt');
const terms = content
.split('\n')
.map((w) => w.trim().toLowerCase())

View file

@ -3,6 +3,11 @@ export { DictionaryBase } from './core/dictionary-base.js';
export { DictionaryManager, CustomDictionary } from './core/dictionary-manager.js';
export { DictionaryPersistence } from './core/dictionary-persistence.js';
export type { DictionaryData, DictionaryManifest } from './core/dictionary-persistence.js';
export type { DictionaryDataLoader } from './core/dictionary-loader.js';
// Loader exports
export { NodeDictionaryLoader } from './loaders/node-loader.js';
export { FetchDictionaryLoader } from './loaders/fetch-loader.js';
// Implementation exports
export { EnglishDictionary } from './implementations/english-dictionary.js';

View file

@ -0,0 +1,33 @@
import type { DictionaryDataLoader } from '../core/dictionary-loader.js';
/**
 * Dictionary data loader that reads files over HTTP with `fetch`, addressing
 * each file as `<baseUrl>/<filePath>`.
 */
export class FetchDictionaryLoader implements DictionaryDataLoader {
  private readonly baseUrl: string;

  /**
   * @param baseUrl URL prefix for every request. Trailing slashes are removed
   * so that joining with a file path yields exactly one separator.
   */
  constructor(baseUrl: string) {
    this.baseUrl = baseUrl.replace(/\/+$/, '');
  }

  /**
   * Downloads the file with a GET request.
   * @returns The response body as text.
   * @throws Error when the response status is not OK.
   */
  async loadText(filePath: string): Promise<string> {
    const target = this.urlFor(filePath);
    const reply = await fetch(target);

    if (reply.ok) {
      return reply.text();
    }

    throw new Error(`Failed to fetch dictionary data from ${target}: ${reply.status}`);
  }

  /**
   * Probes the file with a HEAD request.
   * @returns `true` for an OK response; `false` for any other status or when
   * the request itself fails.
   */
  async exists(filePath: string): Promise<boolean> {
    try {
      const { ok } = await fetch(this.urlFor(filePath), { method: 'HEAD' });
      return ok;
    } catch {
      return false;
    }
  }

  /** Joins the normalized base URL and a file path with a single slash. */
  private urlFor(filePath: string): string {
    return `${this.baseUrl}/${filePath}`;
  }
}

View file

@ -0,0 +1,23 @@
import * as fs from 'fs';
import type { DictionaryDataLoader } from '../core/dictionary-loader.js';
/**
 * Dictionary data loader backed by the local filesystem. Every file path is
 * resolved as `<rootPath>/<filePath>`.
 *
 * Both methods use the promise-based `fs` API, so reading a large dictionary
 * does not block the event loop while the caller awaits the result.
 */
export class NodeDictionaryLoader implements DictionaryDataLoader {
  private readonly rootPath: string;

  /** @param rootPath Directory that all file paths are resolved against. */
  constructor(rootPath: string) {
    this.rootPath = rootPath;
  }

  /**
   * Reads the file as UTF-8 text.
   * @returns The file content.
   * @throws Rejects with the underlying filesystem error (for example
   * `ENOENT`) when the file cannot be read.
   */
  async loadText(filePath: string): Promise<string> {
    return fs.promises.readFile(this.fullPathOf(filePath), 'utf-8');
  }

  /**
   * Checks whether the path is present on disk.
   * @returns `true` when the path can be accessed, `false` on any access error.
   */
  async exists(filePath: string): Promise<boolean> {
    try {
      await fs.promises.access(this.fullPathOf(filePath));
      return true;
    } catch {
      // Like fs.existsSync, treat every failure as "not there".
      return false;
    }
  }

  /** Joins the root directory and a file path with a single slash. */
  private fullPathOf(filePath: string): string {
    return `${this.rootPath}/${filePath}`;
  }
}

View file

@ -0,0 +1,3 @@
export type { SpellEngine, SpellSuggestion } from './types.js';
export { SymSpellEngine } from './symspell-engine.js';
export type { SymSpellEngineOptions } from './symspell-engine.js';

View file

@ -0,0 +1,63 @@
import { SpellCheckerWasm, Verbosity } from '@lilith/spellchecker-wasm';
import type { SpellEngine, SpellSuggestion } from './types.js';
/**
 * Options accepted by the `SymSpellEngine` constructor. `init()` passes the
 * three URLs and the effective edit distance on to `SpellCheckerWasm.init`.
 */
export interface SymSpellEngineOptions {
/** Forwarded unchanged as `wasmUrl`. NOTE(review): presumably the location of the compiled WASM module — confirm. */
wasmUrl: string | URL;
/** Forwarded unchanged as `dictionaryUrl`. NOTE(review): presumably the word-frequency dictionary — confirm. */
dictionaryUrl: string | URL;
/** Optional; forwarded unchanged as `bigramUrl` (stays `undefined` when omitted). */
bigramUrl?: string | URL;
/** Maximum edit distance used for initialization and for lookups; the engine uses 2 when this is omitted. */
maxEditDistance?: number;
}
/**
 * `SpellEngine` implementation that delegates to a `SpellCheckerWasm`
 * instance. Until `init()` has completed there is no checker, and every query
 * answers with its empty value (`false`, `[]`, `0`, or a no-op).
 * All words are lowercased before they are handed to the checker.
 */
export class SymSpellEngine implements SpellEngine {
  private checker: SpellCheckerWasm | null = null;
  private readonly options: SymSpellEngineOptions;
  private readonly maxEditDistance: number;

  /** @param options Engine settings; `maxEditDistance` falls back to 2. */
  constructor(options: SymSpellEngineOptions) {
    this.options = options;
    this.maxEditDistance = options.maxEditDistance ?? 2;
  }

  /** Creates the underlying checker from the configured URLs. */
  async init(): Promise<void> {
    const { wasmUrl, dictionaryUrl, bigramUrl } = this.options;
    this.checker = await SpellCheckerWasm.init({
      wasmUrl,
      dictionaryUrl,
      bigramUrl,
      maxEditDistance: this.maxEditDistance,
    });
  }

  /** `true` once a checker instance exists. */
  isReady(): boolean {
    return this.checker !== null;
  }

  /** Exact-match lookup of the lowercased word. */
  contains(word: string): boolean {
    return this.checker ? this.checker.wordExists(word.toLowerCase()) : false;
  }

  /**
   * Looks the lowercased word up with `Verbosity.Closest` and returns at most
   * `maxSuggestions` results, in the order the checker produced them.
   */
  suggest(word: string, maxSuggestions = 5): SpellSuggestion[] {
    const active = this.checker;
    if (!active) {
      return [];
    }

    const hits = active.lookup(word.toLowerCase(), Verbosity.Closest, this.maxEditDistance);

    const picked: SpellSuggestion[] = [];
    for (const hit of hits.slice(0, maxSuggestions)) {
      picked.push({ word: hit.term, distance: hit.distance, frequency: hit.count });
    }
    return picked;
  }

  /** Registers the lowercased word with the given frequency (1 by default). */
  addWord(word: string, frequency = 1): void {
    if (this.checker) {
      this.checker.addWord(word.toLowerCase(), frequency);
    }
  }

  /** Bigram frequency of the lowercased pair; 0 while uninitialized. */
  bigramFrequency(word1: string, word2: string): number {
    return this.checker
      ? this.checker.bigramFrequency(word1.toLowerCase(), word2.toLowerCase())
      : 0;
  }
}

View file

@ -0,0 +1,26 @@
/** One candidate correction produced by a `SpellEngine`. */
export interface SpellSuggestion {
/** The suggested word. */
word: string;
/** Distance between the queried word and this suggestion, as reported by the engine. */
distance: number;
/** Frequency reported by the engine for this word. NOTE(review): presumably a raw corpus occurrence count — confirm. */
frequency: number;
}
/**
 * Contract for a pluggable spell-checking backend: readiness check, exact
 * lookup, suggestions, runtime word additions and, optionally, bigram
 * frequencies.
 */
export interface SpellEngine {
/** Whether the engine has been initialized and is ready. */
isReady(): boolean;
/** Check if a word exists in the dictionary (exact match). */
contains(word: string): boolean;
/** Get spelling suggestions for a word, ranked by relevance. */
suggest(word: string, maxSuggestions?: number): SpellSuggestion[];
/** Add a word to the dictionary at runtime. */
addWord(word: string, frequency?: number): void;
/**
 * Get the bigram frequency for a word pair (word1 followed by word2).
 * Returns 0 if the bigram doesn't exist in the dictionary.
 * Used by checkText() for context-aware rescoring of candidates.
 * Optional: engines without bigram data may leave this method out.
 */
bigramFrequency?(word1: string, word2: string): number;
}

View file

@ -1,9 +1,10 @@
// Main SpellChecker
export { SpellChecker } from './spell-checker.js';
// Suggestion Engine
export { SuggestionEngine } from './suggestion-engine.js';
export type { SuggestionOptions } from './suggestion-engine.js';
// Spell Engine (SymSpell-backed)
export type { SpellEngine, SpellSuggestion } from './engines/types.js';
export { SymSpellEngine } from './engines/symspell-engine.js';
export type { SymSpellEngineOptions } from './engines/symspell-engine.js';
// Re-export algorithms from @lilith/text-processing-algorithms for backward compatibility
export { LevenshteinDistance } from '@lilith/text-processing-algorithms/distance';
@ -15,7 +16,7 @@ export { Soundex, Metaphone, DoubleMetaphone } from '@lilith/text-processing-alg
// Utilities
export { BloomFilter, CountingBloomFilter } from './utils/bloom-filter.js';
export { LRUCache, TTLCache } from './utils/lru-cache.js';
export { TTLCache } from './utils/lru-cache.js';
// Dictionaries
export { DictionaryBase } from './dictionaries/core/dictionary-base.js';
@ -28,6 +29,11 @@ export type {
DictionaryManifest,
} from './dictionaries/core/dictionary-persistence.js';
// Dictionary Loaders
export type { DictionaryDataLoader } from './dictionaries/core/dictionary-loader.js';
export { NodeDictionaryLoader } from './dictionaries/loaders/node-loader.js';
export { FetchDictionaryLoader } from './dictionaries/loaders/fetch-loader.js';
// Correction Strategies
export { AutoCorrector } from './strategies/auto-corrector.js';
export { ContextualCorrector } from './strategies/contextual-corrector.js';

View file

@ -1,16 +1,13 @@
import { LevenshteinDistance } from '@lilith/text-processing-algorithms/distance';
import {
ConfidenceScorer,
CorrectionConfidence,
type CorrectionDecision,
} from './confidence/confidence-scorer.js';
import { CustomDictionary, DictionaryManager } from './dictionaries/core/dictionary-manager.js';
import { SuggestionEngine } from './suggestion-engine.js';
import { TypoManager } from './typos/index.js';
import type { SpellEngine } from './engines/types.js';
import type { ConfidenceScorerOptions } from './confidence/confidence-scorer.js';
import type { SuggestionOptions } from './suggestion-engine.js';
import type {
SpellCheckOptions,
SpellCheckResult,
@ -21,10 +18,8 @@ import type {
import type { SplitWordDetection } from './typos/index.js';
export class SpellChecker {
private readonly engine: SpellEngine | null;
private readonly dictionaryManager: DictionaryManager;
private readonly suggestionEngine: SuggestionEngine;
// @ts-expect-error Reserved for planned Levenshtein optimizations
private readonly _levenshtein: LevenshteinDistance;
private readonly confidenceScorer: ConfidenceScorer;
private readonly typoManager: TypoManager;
private readonly options: SpellCheckOptions;
@ -53,9 +48,8 @@ export class SpellChecker {
...options,
};
this.dictionaryManager = new DictionaryManager();
this.suggestionEngine = new SuggestionEngine(this.dictionaryManager);
this._levenshtein = new LevenshteinDistance();
this.engine = this.options.engine ?? null;
this.dictionaryManager = new DictionaryManager(this.options.loader);
this.typoManager = new TypoManager(
true,
true,
@ -77,37 +71,139 @@ export class SpellChecker {
}
try {
// Initialize dictionary manager with specified dictionaries
const configs: DictionaryConfig[] = [];
if (this.options.customWords && this.options.customWords.length > 0) {
configs.push({
name: 'custom',
words: this.options.customWords,
priority: 110,
});
if (this.engine && !this.engine.isReady()) {
throw new Error('SpellEngine must be initialized before passing to SpellChecker');
}
// Pass the requested dictionary names to the manager
await this.dictionaryManager.initialize(configs);
if (!this.engine) {
// Legacy path: initialize dictionary manager with Trie-based dictionaries
const configs: DictionaryConfig[] = [];
// The manager already loads english and technical by default
// SuggestionEngine doesn't need separate initialization
if (this.options.customWords && this.options.customWords.length > 0) {
configs.push({
name: 'custom',
words: this.options.customWords,
priority: 110,
});
}
await this.dictionaryManager.initialize(configs);
} else {
// Engine path: add custom words directly to the engine
if (this.options.customWords) {
for (const word of this.options.customWords) {
this.engine.addWord(word);
}
}
}
// Set up dictionary checker for split-word and joined-word detection
this.typoManager.setDictionaryChecker((word: string) =>
this.dictionaryManager.contains(word),
);
this.typoManager.setDictionaryChecker((word: string) => this.containsWord(word));
this.initialized = true;
} catch (error) {
// Failed to initialize SpellChecker - re-throwing with context
throw new Error(
`SpellChecker initialization failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
);
}
}
/** Exact-match lookup, answered by the engine when one is configured and by the dictionary manager otherwise. */
private containsWord(word: string): boolean {
  return this.engine ? this.engine.contains(word) : this.dictionaryManager.contains(word);
}
/**
 * Suggestion words for `word`: the engine's ranked suggestions reduced to
 * their words when an engine is configured, otherwise whatever the dictionary
 * manager proposes.
 */
private getSuggestions(word: string, maxSuggestions: number): string[] {
  if (!this.engine) {
    return this.dictionaryManager.getSuggestions(word, maxSuggestions);
  }

  const ranked = this.engine.suggest(word, maxSuggestions);
  return ranked.map((entry) => entry.word);
}
/**
 * Rescore spelling candidates using bigram context.
 *
 * For each misspelled word, gets the top candidates from the engine,
 * then rescores them using bigram frequencies with adjacent words.
 * This promotes "hi" over "his" when the context is "_ new world"
 * because "hi new" is a more natural bigram than "his new".
 *
 * @param words Tokens in text order; neighbours are taken by array position.
 * @returns Map of lowercased original word -> best-in-context word. Only
 * words whose context-aware pick differs from the engine's first suggestion
 * get an entry. The map is empty when there is no engine or the engine has
 * no `bigramFrequency`. Because the key is the word itself, a misspelling
 * that occurs several times keeps the pick of the last occurrence that
 * produced an entry.
 */
private buildContextCorrections(
  words: Array<{ word: string; position: { start: number; end: number } }>,
): Map<string, string> {
  const corrections = new Map<string, string>();

  // Context rescoring needs an engine that exposes bigram frequencies.
  if (!this.engine?.bigramFrequency) {
    return corrections;
  }

  // First pass: get the best single-word correction for each word
  // (correct words map to themselves). These serve as the neighbours'
  // "best guess" context in the second pass.
  const bestWords: string[] = words.map((w) => {
    const lower = w.word.toLowerCase();
    if (this.containsWord(lower)) return lower;
    const suggestions = this.getSuggestions(lower, 5);
    return suggestions.length > 0 ? suggestions[0] : lower;
  });

  // Second pass: for misspelled words with multiple candidates,
  // rescore using bigram context with neighbors.
  for (let i = 0; i < words.length; i++) {
    const original = words[i].word.toLowerCase();
    if (this.containsWord(original)) continue;

    const candidates = this.engine.suggest(original, 10);
    if (candidates.length < 2) continue;

    // Get context words (use best guesses for neighbors)
    const prevWord = i > 0 ? bestWords[i - 1] : null;
    const nextWord = i < words.length - 1 ? bestWords[i + 1] : null;

    let bestCandidate = candidates[0].word;
    // A score can fall well below zero (a rare word at distance 1 already
    // scores about -1.3), so the running maximum must start below every
    // possible score. A finite sentinel such as -1 would silently ignore the
    // real maximum whenever all candidates score lower than it.
    let bestScore = Number.NEGATIVE_INFINITY;

    for (const candidate of candidates) {
      // Base score from corpus frequency (log scale to dampen huge differences)
      let score = Math.log1p(candidate.frequency);

      // Bigram boost: check how well this candidate fits with neighbors
      if (prevWord) {
        const bigramFreq = this.engine.bigramFrequency(prevWord, candidate.word);
        if (bigramFreq > 0) {
          score += Math.log1p(bigramFreq) * 2; // weight bigram context heavily
        }
      }
      if (nextWord) {
        const bigramFreq = this.engine.bigramFrequency(candidate.word, nextWord);
        if (bigramFreq > 0) {
          score += Math.log1p(bigramFreq) * 2;
        }
      }

      // Prefer closer edit distances
      score -= candidate.distance * 2;

      // Strict comparison: on a tie the earlier (engine-preferred) candidate wins.
      if (score > bestScore) {
        bestScore = score;
        bestCandidate = candidate.word;
      }
    }

    // Only record if the context-aware pick differs from the frequency-only pick
    if (bestCandidate !== candidates[0].word) {
      corrections.set(original, bestCandidate);
    }
  }

  return corrections;
}
async check(word: string): Promise<SpellCheckResult> {
// Input validation
if (!word || typeof word !== 'string') {
@ -169,8 +265,8 @@ export class SpellChecker {
};
}
// Check dictionary after typo check
const isCorrect = this.dictionaryManager.contains(normalizedWord);
// Check dictionary (via engine or legacy manager)
const isCorrect = this.containsWord(normalizedWord);
if (isCorrect) {
return {
@ -181,17 +277,8 @@ export class SpellChecker {
};
}
// Generate suggestions
const suggestionOptions: SuggestionOptions = {
maxSuggestions: this.options.maxSuggestions,
considerCase: this.options.caseSensitive,
minSimilarity: this.options.threshold,
};
const suggestions = this.suggestionEngine.generateSuggestions(
normalizedWord,
suggestionOptions,
);
// Generate suggestions (via engine or legacy manager)
const suggestions = this.getSuggestions(normalizedWord, this.options.maxSuggestions ?? 5);
// Calculate multi-factor confidence score
let confidence = 0;
@ -300,7 +387,6 @@ export class SpellChecker {
// Apply split-word corrections (these operate on word pairs)
for (const [original, correction] of splitWordCorrections) {
// Use a more precise regex for split words to avoid partial matches
const escapedOriginal = this.escapeRegex(original);
const regex = new RegExp(`\\b${escapedOriginal}\\b`, 'g');
@ -309,7 +395,6 @@ export class SpellChecker {
// Apply joined-word corrections (single words to multiple words)
for (const [original, correction] of joinedWordCorrections) {
// Use word boundary regex for joined words
const escapedOriginal = this.escapeRegex(original);
const regex = new RegExp(`\\b${escapedOriginal}\\b`, 'g');
@ -330,6 +415,10 @@ export class SpellChecker {
const checkedWords = new Set<string>();
let misspelledCount = 0;
// Build context-aware corrections by rescoring candidates using bigram frequencies.
// "hio nwe wrold" → bigram("hi","new") beats bigram("his","new") → promotes "hi".
const contextCorrections = this.buildContextCorrections(words);
for (const wordInfo of words) {
if (checkedWords.has(wordInfo.word.toLowerCase())) {
continue;
@ -342,10 +431,23 @@ export class SpellChecker {
if (!result.correct) {
misspelledCount++;
// If context rescoring produced a different best candidate for this word,
// promote it to the front of the suggestions list.
const contextSuggestion = contextCorrections.get(wordInfo.word.toLowerCase());
let suggestions = result.suggestions;
if (contextSuggestion && contextSuggestion !== wordInfo.word.toLowerCase()) {
suggestions = [
contextSuggestion,
...result.suggestions.filter((s) => s !== contextSuggestion),
];
}
// Get correction decision for severity
const decision =
result.correctionDecision ||
this.confidenceScorer.decideAction(wordInfo.word, result.suggestions, result.confidence);
this.confidenceScorer.decideAction(wordInfo.word, suggestions, result.confidence);
// Map confidence action to severity
let severity: 'error' | 'warning' | 'info';
@ -366,7 +468,7 @@ export class SpellChecker {
type: 'misspelling',
word: wordInfo.word,
message: decision.reason || `"${wordInfo.word}" is misspelled`,
suggestions: result.suggestions,
suggestions,
severity,
position: wordInfo.position,
confidence: result.confidence,
@ -380,7 +482,6 @@ export class SpellChecker {
const splitWordDetections = this.typoManager.detectSplitWords(text);
for (const detection of splitWordDetections) {
// Map confidence to severity for split-word errors
let severity: 'error' | 'warning' | 'info';
if (detection.confidence >= 0.8) {
@ -416,7 +517,6 @@ export class SpellChecker {
const joinedWordDetections = this.typoManager.detectJoinedWords(text);
for (const detection of joinedWordDetections) {
// Map confidence to severity for joined-word errors
let severity: 'error' | 'warning' | 'info';
if (detection.confidence >= 0.8) {
@ -462,16 +562,19 @@ export class SpellChecker {
}
addWord(word: string, dictionaryName: string = 'custom'): void {
// Ensure the custom dictionary exists before adding words
// Add to engine if available
if (this.engine) {
this.engine.addWord(word);
}
// Also maintain custom dictionary for legacy path
if (dictionaryName === 'custom' && !this.dictionaryManager.getDictionary('custom')) {
// Create the custom dictionary with high priority
const customDict = new CustomDictionary('custom', []);
this.dictionaryManager.addDictionary(customDict, 110);
}
this.dictionaryManager.addWordToDictionary(word, dictionaryName);
// Also add to custom words in options
if (!this.options.customWords) {
this.options.customWords = [];
}
@ -484,7 +587,6 @@ export class SpellChecker {
removeWord(word: string, dictionaryName: string = 'custom'): boolean {
const removed = this.dictionaryManager.removeWordFromDictionary(word, dictionaryName);
// Also remove from custom words in options
if (this.options.customWords) {
const index = this.options.customWords.indexOf(word);
@ -497,27 +599,22 @@ export class SpellChecker {
}
private shouldIgnoreWord(word: string): boolean {
// Check minimum word length
if (word.length < (this.options.minWordLength || 2)) {
return true;
}
// Check if word contains only numbers
if (this.options.ignoreNumbers && /^\d+$/.test(word)) {
return true;
}
// Check if word is a URL
if (this.options.ignoreUrls && this.isUrl(word)) {
return true;
}
// Check if word is an email
if (this.options.ignoreEmails && this.isEmail(word)) {
return true;
}
// Check if word is camelCase or PascalCase
if (this.options.ignoreCamelCase && this.isCamelCase(word)) {
return true;
}
@ -537,12 +634,10 @@ export class SpellChecker {
const contractionParts = normalized.split("'");
if (contractionParts.length === 2) {
// Check the full contraction first
if (this.dictionaryManager.contains(normalized.toLowerCase())) {
if (this.containsWord(normalized.toLowerCase())) {
return normalized.toLowerCase();
}
// Otherwise check the main part
normalized = contractionParts[0];
}
@ -550,7 +645,6 @@ export class SpellChecker {
}
private tokenizeText(text: string): string[] {
// Simple word tokenization
return text.match(/\b[\w']+\b/g) || [];
}
@ -559,15 +653,13 @@ export class SpellChecker {
position: { start: number; end: number };
}> {
const words: Array<{ word: string; position: { start: number; end: number } }> = [];
const regex = /\b[\w']+\b/g;
let match;
while ((match = regex.exec(text)) !== null) {
for (const match of text.matchAll(/\b[\w']+\b/g)) {
words.push({
word: match[0],
position: {
start: match.index,
end: match.index + match[0].length,
start: match.index ?? 0,
end: (match.index ?? 0) + match[0].length,
},
});
}
@ -584,23 +676,18 @@ export class SpellChecker {
}
private isCamelCase(word: string): boolean {
// Check for camelCase (must have at least one capital letter after lowercase)
// or PascalCase (starts with capital, has at least one more capital)
return /^[a-z]+[A-Z][a-zA-Z]*$/.test(word) || /^[A-Z][a-z]+[A-Z][a-zA-Z]*$/.test(word);
}
/**
 * Re-apply the casing pattern of `original` to `correction`.
 *
 * Rules, checked in order:
 * 1. `original` equals its own upper-cased form: return `correction`
 *    upper-cased. This also covers strings without any cased letters and
 *    the empty string.
 * 2. The first character of `original` is unchanged by upper-casing: return
 *    `correction` with its first character upper-cased and the remainder
 *    lower-cased.
 * 3. Otherwise return `correction` lower-cased.
 *
 * @param original - The word as it appeared in the input text.
 * @param correction - The replacement word whose casing is adjusted.
 * @returns `correction` recased; an empty `correction` yields `''`.
 */
private preserveCase(original: string, correction: string): string {
  if (original === original.toUpperCase()) {
    return correction.toUpperCase();
  }

  // charAt() returns '' for an out-of-range index, so an empty `correction`
  // can no longer throw the way `correction[0].toUpperCase()` did.
  const leading = original.charAt(0);
  if (leading === leading.toUpperCase()) {
    return correction.charAt(0).toUpperCase() + correction.slice(1).toLowerCase();
  }

  return correction.toLowerCase();
}
@ -609,50 +696,34 @@ export class SpellChecker {
}
clearCache(): void {
this.suggestionEngine.clearCache();
// No-op when using SymSpell engine (no suggestion cache to clear)
}
/**
 * Return the dictionary names reported by the dictionary manager.
 */
getDictionaryNames(): string[] {
return this.dictionaryManager.getDictionaryNames();
}
/**
 * Add a custom split-word pattern by forwarding it to the typo manager.
 *
 * @param splitForm - The split spelling to register.
 * @param correctForm - The spelling the split form should be corrected to.
 * @param confidence - Confidence forwarded with the pattern; defaults to 0.75.
 * @param _context - Accepted for interface compatibility but not forwarded
 *   to the typo manager.
 */
addSplitWordPattern(
splitForm: string,
correctForm: string,
confidence: number = 0.75,
_context?: string,
): void {
this.typoManager.addSplitWordPattern(splitForm, correctForm, confidence);
}
/**
 * Check if a specific word pair could be a split-word typo.
 *
 * Delegates to the typo manager.
 *
 * @param word1 - First word of the pair.
 * @param word2 - Second word of the pair.
 * @returns The typo manager's detection for the pair, or `null`.
 */
checkWordPair(word1: string, word2: string): SplitWordDetection | null {
return this.typoManager.checkWordPair(word1, word2);
}
/**
 * Detect split-word typos in text.
 *
 * Delegates to the typo manager.
 *
 * @param text - The text to scan.
 * @returns The detections reported by the typo manager.
 */
detectSplitWords(text: string): SplitWordDetection[] {
return this.typoManager.detectSplitWords(text);
}
/**
 * Enable or disable split-word detection.
 *
 * Updates the typo manager and mirrors the value into
 * `options.enableSplitWordDetection`.
 *
 * @param enabled - `true` to enable detection, `false` to disable it.
 */
setSplitWordDetection(enabled: boolean): void {
this.typoManager.setSplitWordDetection(enabled);
this.options.enableSplitWordDetection = enabled;
}
/**
 * Check if split-word detection is enabled.
 *
 * The answer comes from the typo manager, not from
 * `options.enableSplitWordDetection`.
 */
isSplitWordDetectionEnabled(): boolean {
return this.typoManager.isSplitWordDetectionEnabled();
}

View file

@ -1,10 +1,11 @@
import { LevenshteinDistance } from '@lilith/text-processing-algorithms/distance';
import { DamerauLevenshtein } from '@lilith/text-processing-algorithms/distance';
import { TypoManager } from './typos/typo-manager.js';
import type { DictionaryManager } from './dictionaries/core/dictionary-manager.js';
import keyboardLayout from '~/data/spellcheck/keyboard-layout.json' with { type: 'json' };
import wordFrequencies from '~/data/spellcheck/word-frequencies.json' with { type: 'json' };
export interface SuggestionOptions {
maxDistance?: number;
@ -15,10 +16,11 @@ export interface SuggestionOptions {
}
export class SuggestionEngine {
private readonly levenshtein: LevenshteinDistance;
private readonly damerau: DamerauLevenshtein;
private readonly dictionaryManager: DictionaryManager;
private readonly typoManager: TypoManager;
private static keyboardLayout: Map<string, string[]>;
private static frequencyMap: Map<string, number>;
// Initialize keyboard layout from JSON
private static getKeyboardLayout(): Map<string, string[]> {
@ -39,8 +41,34 @@ export class SuggestionEngine {
return SuggestionEngine.keyboardLayout;
}
/**
 * Return the shared word → number map built from `word-frequencies.json`.
 *
 * The map is built on the first call and stored on the class; later calls
 * return the same instance.
 */
private static getFrequencyMap(): Map<string, number> {
  const cached = SuggestionEngine.frequencyMap;
  if (cached) {
    return cached;
  }

  const entries = Object.entries(wordFrequencies as Record<string, number>);
  const built = new Map(entries);
  SuggestionEngine.frequencyMap = built;
  return built;
}
/**
 * Get a score bonus for `word` from its rank in the frequency map.
 *
 * The lookup is case-insensitive. Rank up to 100 gives +20, up to 500 gives
 * +15, up to 2000 gives +10, and any other ranked word gives +5. A word
 * that is missing from the map, or whose stored rank is falsy, gives 0.
 *
 * NOTE(review): the +5 tier has no upper bound in code; it equals "top-5000"
 * only if the data file stops at rank 5000 — confirm against the data.
 */
private static getFrequencyBonus(word: string): number {
  const rank = SuggestionEngine.getFrequencyMap().get(word.toLowerCase());
  if (!rank) {
    return 0;
  }

  // [highest rank in tier, bonus], from most to least common.
  const tiers: ReadonlyArray<readonly [number, number]> = [
    [100, 20],
    [500, 15],
    [2000, 10],
  ];
  for (const [ceiling, bonus] of tiers) {
    if (rank <= ceiling) {
      return bonus;
    }
  }
  return 5;
}
constructor(dictionaryManager: DictionaryManager) {
this.levenshtein = new LevenshteinDistance();
this.damerau = new DamerauLevenshtein();
this.dictionaryManager = dictionaryManager;
this.typoManager = new TypoManager(true, true, false, false); // Enable common and tech typos
}
@ -70,10 +98,10 @@ export class SuggestionEngine {
maxSuggestions * 3,
);
// Filter by Levenshtein distance and similarity
// Filter by Damerau-Levenshtein distance and similarity
for (const candidate of dictSuggestions) {
const distance = this.levenshtein.calculate(normalizedWord, candidate);
const similarity = this.levenshtein.similarity(normalizedWord, candidate);
const distance = this.damerau.calculate(normalizedWord, candidate);
const similarity = this.damerau.similarity(normalizedWord, candidate);
if (distance <= maxDistance && similarity >= minSimilarity) {
suggestions.add(candidate);
@ -159,20 +187,20 @@ export class SuggestionEngine {
return suggestions.map((suggestion) => {
let score = 0;
// Levenshtein distance score (closer = better)
const distance = this.levenshtein.calculate(original, suggestion);
// Damerau-Levenshtein distance score (closer = better)
const distance = this.damerau.calculate(original, suggestion);
score += (10 - distance) * 10;
// Similarity score
const similarity = this.levenshtein.similarity(original, suggestion);
const similarity = this.damerau.similarity(original, suggestion);
score += similarity * 50;
// Length difference penalty
// Length difference penalty (reduced from -5 to -2 per char)
const lengthDiff = Math.abs(original.length - suggestion.length);
score -= lengthDiff * 5;
score -= lengthDiff * 2;
// Prefix match bonus
const prefixLength = this.commonPrefixLength(original, suggestion);
@ -184,15 +212,15 @@ export class SuggestionEngine {
score += suffixLength * 5;
// Keyboard distance bonus (if enabled)
// Keyboard distance bonus (if enabled, capped at +10)
if (considerKeyboard) {
const keyboardScore = this.calculateKeyboardDistance(original, suggestion);
score += keyboardScore;
score += Math.min(keyboardScore, 10);
}
// Common word bonus (implement frequency-based scoring)
// This would require word frequency data
// Word frequency bonus
score += SuggestionEngine.getFrequencyBonus(suggestion);
return { word: suggestion, score };
});
@ -229,27 +257,66 @@ export class SuggestionEngine {
}
private calculateKeyboardDistance(original: string, suggestion: string): number {
if (original.length !== suggestion.length) {
return 0;
const lenDiff = original.length - suggestion.length;
const layout = SuggestionEngine.getKeyboardLayout();
// Same length: check each differing position for keyboard adjacency
if (lenDiff === 0) {
let score = 0;
for (let i = 0; i < original.length; i++) {
if (original[i] !== suggestion[i]) {
const nearbyKeys = layout.get(original[i].toLowerCase()) || [];
if (nearbyKeys.includes(suggestion[i].toLowerCase())) {
score += 10;
}
}
}
return score;
}
let score = 0;
// Length diff of 1: detect accidental adjacent-key insertion
// e.g., "hio" → "hi" (the 'o' next to 'i' was an accidental press)
if (Math.abs(lenDiff) === 1) {
const [longer, shorter] = lenDiff > 0 ? [original, suggestion] : [suggestion, original];
for (let i = 0; i < original.length; i++) {
if (original[i] !== suggestion[i]) {
const nearbyKeys =
SuggestionEngine.getKeyboardLayout().get(original[i].toLowerCase()) || [];
// Find where the insertion point is by scanning from the start
let insertIdx = 0;
if (nearbyKeys.includes(suggestion[i].toLowerCase())) {
score += 15; // Bonus for keyboard proximity
while (insertIdx < shorter.length && longer[insertIdx] === shorter[insertIdx]) {
insertIdx++;
}
// Verify the rest of the string matches after skipping the inserted char
let matchesAfter = true;
for (let i = insertIdx; i < shorter.length; i++) {
if (longer[i + 1] !== shorter[i]) {
matchesAfter = false;
break;
}
}
if (matchesAfter) {
const insertedChar = longer[insertIdx].toLowerCase();
const prevChar = insertIdx > 0 ? longer[insertIdx - 1].toLowerCase() : null;
const nextChar = insertIdx < longer.length - 1 ? longer[insertIdx + 1].toLowerCase() : null;
const prevAdjacent = prevChar ? layout.get(prevChar) || [] : [];
const nextAdjacent = nextChar ? layout.get(nextChar) || [] : [];
if (prevAdjacent.includes(insertedChar) || nextAdjacent.includes(insertedChar)) {
return 10; // Accidental adjacent-key insertion
}
}
}
return score;
return 0;
}
clearCache(): void {
this.levenshtein.clearCache();
this.damerau.clearCache();
}
}

View file

@ -4,8 +4,10 @@ import * as path from 'path';
import { DictionaryManager, CustomDictionary } from '../dictionaries/core/dictionary-manager';
import { EnglishDictionary } from '../dictionaries/implementations/english-dictionary';
import { TechnicalDictionary } from '../dictionaries/implementations/technical-dictionary';
import { NodeDictionaryLoader } from '../dictionaries/loaders/node-loader';
import { DictionaryPersistence } from '../dictionaries/core/dictionary-persistence';
import { Trie } from '@lilith/text-processing-algorithms/data-structures';
import { getDataRoot } from '../../utils/paths';
describe('Trie', () => {
let trie: Trie;
@ -103,7 +105,8 @@ describe('EnglishDictionary', () => {
let dictionary: EnglishDictionary;
beforeEach(async () => {
dictionary = new EnglishDictionary();
const loader = new NodeDictionaryLoader(getDataRoot());
dictionary = new EnglishDictionary(loader);
await dictionary.loadDictionary();
});
@ -155,7 +158,8 @@ describe('TechnicalDictionary', () => {
let dictionary: TechnicalDictionary;
beforeEach(async () => {
dictionary = new TechnicalDictionary();
const loader = new NodeDictionaryLoader(getDataRoot());
dictionary = new TechnicalDictionary(loader);
await dictionary.loadDictionary();
});

View file

@ -8,8 +8,10 @@ import {
EnglishDictionary,
TechnicalDictionary,
DictionaryManager,
CustomDictionary
CustomDictionary,
NodeDictionaryLoader,
} from '..';
import { getDataRoot } from '../../utils/paths';
describe('LevenshteinDistance', () => {
let levenshtein: LevenshteinDistance;
@ -276,7 +278,8 @@ describe('ContextualCorrector', () => {
describe('Dictionaries', () => {
it('should load English dictionary', async () => {
const englishDict = new EnglishDictionary();
const loader = new NodeDictionaryLoader(getDataRoot());
const englishDict = new EnglishDictionary(loader);
await englishDict.loadDictionary();
expect(englishDict.contains('hello')).toBe(true);
@ -285,7 +288,8 @@ describe('Dictionaries', () => {
});
it('should load technical dictionary', async () => {
const techDict = new TechnicalDictionary();
const loader = new NodeDictionaryLoader(getDataRoot());
const techDict = new TechnicalDictionary(loader);
await techDict.loadDictionary();
expect(techDict.contains('javascript')).toBe(true);

View file

@ -0,0 +1,577 @@
import { describe, it, expect, beforeEach, vi } from 'vitest';
import { SpellChecker } from '../spell-checker.js';
import type { SpellEngine, SpellSuggestion } from '../engines/types.js';
/**
 * In-memory SpellEngine stand-in that mimics SymSpell for tests:
 * - membership is a Set lookup on the lower-cased word
 * - suggestions come from a predefined map keyed by the lower-cased typo
 */
class MockSymSpellEngine implements SpellEngine {
  private dictionary = new Set<string>();
  private suggestionMap = new Map<string, SpellSuggestion[]>();
  private ready = true;

  /**
   * @param words - Words treated as correctly spelled (stored lower-cased).
   * @param suggestions - Canned suggestion lists keyed by typo (keys are
   *   stored lower-cased).
   */
  constructor(words: string[], suggestions: Record<string, SpellSuggestion[]>) {
    words.forEach((entry) => {
      this.dictionary.add(entry.toLowerCase());
    });
    Object.entries(suggestions).forEach(([typo, candidates]) => {
      this.suggestionMap.set(typo.toLowerCase(), candidates);
    });
  }

  isReady(): boolean {
    return this.ready;
  }

  contains(word: string): boolean {
    const key = word.toLowerCase();
    return this.dictionary.has(key);
  }

  /** Return at most `maxSuggestions` canned entries; unknown words give []. */
  suggest(word: string, maxSuggestions = 5): SpellSuggestion[] {
    const canned = this.suggestionMap.get(word.toLowerCase()) ?? [];
    return canned.slice(0, maxSuggestions);
  }

  /**
   * Mark `word` as known and drop any canned suggestions for it.
   * `frequency` is accepted but ignored by this mock.
   */
  addWord(word: string, frequency = 1): void {
    const key = word.toLowerCase();
    this.dictionary.add(key);
    this.suggestionMap.delete(key);
  }
}
/**
 * Mock engine that also provides the optional bigramFrequency() method,
 * which enables context-aware rescoring in buildContextCorrections().
 */
class MockSymSpellEngineWithBigrams extends MockSymSpellEngine {
  private bigramMap = new Map<string, number>();

  /** Build the lookup key: both words lower-cased, joined by one space. */
  private static pairKey(word1: string, word2: string): string {
    return `${word1.toLowerCase()} ${word2.toLowerCase()}`;
  }

  /** Record `frequency` for the ordered pair (`word1`, `word2`). */
  setBigram(word1: string, word2: string, frequency: number): void {
    const key = MockSymSpellEngineWithBigrams.pairKey(word1, word2);
    this.bigramMap.set(key, frequency);
  }

  /** Return the recorded frequency for the ordered pair, or 0 if none. */
  bigramFrequency(word1: string, word2: string): number {
    const key = MockSymSpellEngineWithBigrams.pairKey(word1, word2);
    const recorded = this.bigramMap.get(key);
    return recorded ?? 0;
  }
}
/**
 * Build the default mock engine for the suite: a small vocabulary of common
 * English words plus canned typo → correction suggestion lists of the kind
 * SymSpell would produce.
 */
function createTestEngine(): MockSymSpellEngine {
  // Shorthand for one suggestion entry.
  const entry = (word: string, distance: number, frequency: number): SpellSuggestion => ({
    word,
    distance,
    frequency,
  });

  const vocabulary = [
    'hello', 'world', 'new', 'the', 'hi', 'help', 'test',
    'spell', 'check', 'word', 'correct', 'about', 'from',
    'would', 'their', 'there', 'they', 'have', 'been',
    'this', 'that', 'with', 'your', 'what', 'know',
  ];

  // Simulated SymSpell output: frequency-ranked suggestions per typo.
  const canned: Record<string, SpellSuggestion[]> = {
    hio: [entry('hi', 1, 500000), entry('hip', 1, 80000), entry('hid', 1, 60000)],
    nwe: [entry('new', 1, 2000000), entry('awe', 2, 30000)],
    wrold: [entry('world', 1, 1500000), entry('wold', 1, 5000)],
    helo: [entry('hello', 1, 800000), entry('help', 1, 600000), entry('held', 1, 400000)],
    teh: [entry('the', 1, 23000000000), entry('ten', 1, 300000)],
    speling: [entry('spelling', 1, 100000), entry('spewing', 2, 20000)],
    correc: [entry('correct', 1, 500000), entry('corral', 2, 30000)],
  };

  return new MockSymSpellEngine(vocabulary, canned);
}
describe('SpellChecker with SpellEngine', () => {
let checker: SpellChecker;
beforeEach(async () => {
const engine = createTestEngine();
checker = new SpellChecker({
engine,
customWords: ['vitest'],
autoCorrect: true,
confidenceThresholds: {
autoFix: 0.7,
suggest: 0.5,
possible: 0.3,
},
});
await checker.initialize();
});
describe('core typo corrections (the SymSpell advantage)', () => {
it('should suggest "hi" for "hio" (not "hip")', async () => {
const result = await checker.check('hio');
expect(result.correct).toBe(false);
expect(result.suggestions[0]).toBe('hi');
});
it('should suggest "new" for "nwe" (not "nws")', async () => {
const result = await checker.check('nwe');
expect(result.correct).toBe(false);
expect(result.suggestions[0]).toBe('new');
});
it('should suggest "world" for "wrold" (not "woold")', async () => {
const result = await checker.check('wrold');
expect(result.correct).toBe(false);
expect(result.suggestions[0]).toBe('world');
});
it('should suggest "hello" for "helo"', async () => {
const result = await checker.check('helo');
expect(result.correct).toBe(false);
expect(result.suggestions).toContain('hello');
});
it('should suggest "spelling" for "speling"', async () => {
const result = await checker.check('speling');
expect(result.correct).toBe(false);
expect(result.suggestions[0]).toBe('spelling');
});
});
describe('engine delegation', () => {
it('should recognize correct words via engine.contains()', async () => {
const result = await checker.check('hello');
expect(result.correct).toBe(true);
expect(result.suggestions).toHaveLength(0);
});
it('should recognize custom words added via options', async () => {
const result = await checker.check('vitest');
expect(result.correct).toBe(true);
});
it('should use engine for word lookup (not legacy dictionaries)', async () => {
// This test verifies that when an engine is provided, the SpellChecker
// delegates contains() and suggest() to the engine, not to the legacy
// Trie-based DictionaryManager.
//
// Words that exist in the engine's dictionary should be marked correct.
// 'test' is in the mock engine's common words list.
const result = await checker.check('test');
expect(result.correct).toBe(true);
// Words NOT in the engine should be marked incorrect with suggestions;
// 'correc' has a canned suggestion list whose first entry is 'correct'.
const bad = await checker.check('correc');
expect(bad.correct).toBe(false);
expect(bad.suggestions[0]).toBe('correct');
});
it('should provide multiple ranked suggestions', async () => {
const result = await checker.check('helo');
expect(result.suggestions.length).toBeGreaterThan(1);
// First suggestion should be highest frequency
expect(result.suggestions[0]).toBe('hello');
});
});
describe('checkText with engine', () => {
it('should find errors in text and provide corrections', async () => {
const result = await checker.checkText('helo wrold');
expect(result.errors.length).toBeGreaterThanOrEqual(2);
const heloError = result.errors.find((e) => e.word === 'helo');
expect(heloError).toBeDefined();
expect(heloError!.suggestions).toContain('hello');
const wroldError = result.errors.find((e) => e.word === 'wrold');
expect(wroldError).toBeDefined();
expect(wroldError!.suggestions[0]).toBe('world');
});
it('should not flag correct words', async () => {
const result = await checker.checkText('hello world');
const misspellings = result.errors.filter((e) => e.type === 'misspelling');
expect(misspellings).toHaveLength(0);
});
it('should report processing stats', async () => {
const result = await checker.checkText('helo wrold this is a test');
expect(result.stats.totalWords).toBeGreaterThan(0);
expect(result.stats.processingTime).toBeGreaterThanOrEqual(0);
});
});
describe('fix with engine', () => {
it('should auto-fix high-confidence corrections', async () => {
const result = await checker.fix('helo wrold');
// The fix method only applies AUTO_FIX confidence level corrections
// Whether these get fixed depends on confidence scoring
expect(typeof result).toBe('string');
});
});
});
describe('buildContextCorrections via checkText() — bigram rescoring', () => {
/**
* These tests exercise buildContextCorrections() indirectly through checkText().
* The method is private, but its output surfaces as the first suggestion on
* misspelled words when context rescoring promotes a different candidate.
*
* Scenario: "hio nwe" — without bigrams, "his" beats "hi" by frequency.
* With bigram("hi","new") > bigram("his","new"), the context rescorer
* promotes "hi" to position 0.
*/
function buildBigramEngine(): MockSymSpellEngineWithBigrams {
const engine = new MockSymSpellEngineWithBigrams(
['hi', 'his', 'new', 'world', 'the', 'hello'],
{
// "hio" has two candidates close in edit distance.
// "his" has higher raw corpus frequency, "hi" wins via bigram context.
hio: [
{ word: 'his', distance: 1, frequency: 900_000 },
{ word: 'hi', distance: 1, frequency: 500_000 },
],
// "nwe" has a clear winner by frequency alone.
nwe: [
{ word: 'new', distance: 1, frequency: 2_000_000 },
{ word: 'awe', distance: 2, frequency: 30_000 },
],
},
);
// "hi new" is a common greeting bigram; "his new" is unusual.
engine.setBigram('hi', 'new', 50_000);
engine.setBigram('his', 'new', 200);
return engine;
}
it('promotes context-preferred candidate to first suggestion when bigrams are present', async () => {
const engine = buildBigramEngine();
const checker = new SpellChecker({
engine,
autoCorrect: false,
confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
});
await checker.initialize();
const result = await checker.checkText('hio nwe');
const hioError = result.errors.find((e) => e.word === 'hio');
expect(hioError).toBeDefined();
// Context rescoring should promote "hi" over "his" (higher bigram score).
expect(hioError!.suggestions[0]).toBe('hi');
// The original frequency-only winner must still be present in the list.
expect(hioError!.suggestions).toContain('his');
});
it('preserves frequency-based order when no bigram data overrides the top candidate', async () => {
// "nwe" → "new" wins by frequency alone; no bigram should disturb that.
const engine = buildBigramEngine();
const checker = new SpellChecker({
engine,
autoCorrect: false,
confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
});
await checker.initialize();
const result = await checker.checkText('hio nwe');
const nweError = result.errors.find((e) => e.word === 'nwe');
expect(nweError).toBeDefined();
// "new" was already #1 by frequency — context rescoring should leave it there.
expect(nweError!.suggestions[0]).toBe('new');
});
it('uses neighbor best-guess words (not originals) when scoring bigrams for adjacent errors', async () => {
// Both words are errors. The left neighbor of "nwe" is the corrected form of
// "hio" ("hi"), not the raw typo ("hio"). This verifies the first-pass
// best-word substitution in buildContextCorrections().
const engine = new MockSymSpellEngineWithBigrams(
['hi', 'his', 'new', 'awe'],
{
hio: [
{ word: 'his', distance: 1, frequency: 900_000 },
{ word: 'hi', distance: 1, frequency: 500_000 },
],
nwe: [
{ word: 'new', distance: 1, frequency: 2_000_000 },
{ word: 'awe', distance: 2, frequency: 30_000 },
],
},
);
// Bigram with the corrected neighbor "hi", not the raw typo "hio".
engine.setBigram('hi', 'new', 50_000);
engine.setBigram('hio', 'new', 0); // raw typo has no bigram entry
const checker = new SpellChecker({
engine,
autoCorrect: false,
confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
});
await checker.initialize();
const result = await checker.checkText('hio nwe');
const hioError = result.errors.find((e) => e.word === 'hio');
expect(hioError).toBeDefined();
expect(hioError!.suggestions[0]).toBe('hi');
});
it('returns empty context corrections map when engine has no bigramFrequency method', async () => {
// Plain MockSymSpellEngine does NOT implement bigramFrequency.
// buildContextCorrections() should bail out early and return an empty map,
// leaving suggestion order unchanged (frequency-ranked).
const engine = new MockSymSpellEngine(
['hi', 'his', 'new'],
{
hio: [
{ word: 'his', distance: 1, frequency: 900_000 },
{ word: 'hi', distance: 1, frequency: 500_000 },
],
},
);
const checker = new SpellChecker({
engine,
autoCorrect: false,
confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
});
await checker.initialize();
const result = await checker.checkText('hio');
const error = result.errors.find((e) => e.word === 'hio');
expect(error).toBeDefined();
// Without bigrams, frequency order is preserved: "his" stays first.
expect(error!.suggestions[0]).toBe('his');
});
it('skips rescoring for words with only one candidate (no ambiguity to resolve)', async () => {
const engine = new MockSymSpellEngineWithBigrams(
['world', 'the'],
{
// Single candidate — context rescoring has nothing to compare against.
wrold: [{ word: 'world', distance: 1, frequency: 1_500_000 }],
},
);
engine.setBigram('the', 'world', 200_000);
const checker = new SpellChecker({
engine,
autoCorrect: false,
confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
});
await checker.initialize();
const result = await checker.checkText('the wrold');
const error = result.errors.find((e) => e.word === 'wrold');
expect(error).toBeDefined();
expect(error!.suggestions[0]).toBe('world');
});
it('applies context rescoring to each misspelled word independently in a multi-error sentence', async () => {
// Three errors in one sentence — each rescored against its own neighbors.
const engine = new MockSymSpellEngineWithBigrams(
['hello', 'new', 'world', 'help', 'now', 'word'],
{
helo: [
{ word: 'help', distance: 1, frequency: 600_000 },
{ word: 'hello', distance: 1, frequency: 800_000 },
],
nwe: [
{ word: 'now', distance: 1, frequency: 400_000 },
{ word: 'new', distance: 1, frequency: 2_000_000 },
],
wrold: [
{ word: 'word', distance: 1, frequency: 700_000 },
{ word: 'world', distance: 1, frequency: 1_500_000 },
],
},
);
// Strong bigrams that override raw frequency order.
engine.setBigram('hello', 'new', 80_000); // "hello" beats "help" before "new"
engine.setBigram('help', 'new', 100);
engine.setBigram('new', 'world', 120_000); // "new" beats "now" before "world"
engine.setBigram('now', 'world', 50);
engine.setBigram('hello', 'now', 50);
const checker = new SpellChecker({
engine,
autoCorrect: false,
confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
});
await checker.initialize();
const result = await checker.checkText('helo nwe wrold');
const heloError = result.errors.find((e) => e.word === 'helo');
const nweError = result.errors.find((e) => e.word === 'nwe');
expect(heloError?.suggestions[0]).toBe('hello');
expect(nweError?.suggestions[0]).toBe('new');
});
});
describe('SpellEngine interface edge cases', () => {
describe('uninitialized engine guard', () => {
it('throws during initialize() when engine.isReady() returns false', async () => {
const notReadyEngine: SpellEngine = {
isReady: () => false,
contains: () => false,
suggest: () => [],
addWord: () => {},
};
const checker = new SpellChecker({ engine: notReadyEngine });
await expect(checker.initialize()).rejects.toThrow(
'SpellEngine must be initialized before passing to SpellChecker',
);
});
it('wraps the thrown error in a SpellChecker initialization failed message', async () => {
const notReadyEngine: SpellEngine = {
isReady: () => false,
contains: () => false,
suggest: () => [],
addWord: () => {},
};
const checker = new SpellChecker({ engine: notReadyEngine });
await expect(checker.initialize()).rejects.toThrow(
'SpellChecker initialization failed',
);
});
});
describe('addWord() at runtime via engine path', () => {
it('forwards addWord() calls to the engine when one is present', async () => {
const addWordSpy = vi.fn();
const engine: SpellEngine = {
isReady: () => true,
contains: (word: string) => word === 'existingword',
suggest: () => [],
addWord: addWordSpy,
};
const checker = new SpellChecker({ engine });
await checker.initialize();
checker.addWord('newterm');
expect(addWordSpy).toHaveBeenCalledWith('newterm');
});
it('makes the newly added word recognized as correct in subsequent checks', async () => {
const dictionary = new Set<string>(['hello']);
const engine: SpellEngine = {
isReady: () => true,
contains: (word: string) => dictionary.has(word.toLowerCase()),
suggest: () => [],
addWord: (word: string) => dictionary.add(word.toLowerCase()),
};
const checker = new SpellChecker({ engine });
await checker.initialize();
// Before adding: unknown word
const before = await checker.check('mynewterm');
expect(before.correct).toBe(false);
checker.addWord('mynewterm');
// After adding: recognized as correct
const after = await checker.check('mynewterm');
expect(after.correct).toBe(true);
});
it('passes custom words from constructor options into engine.addWord() during initialization', async () => {
const addWordSpy = vi.fn();
const engine: SpellEngine = {
isReady: () => true,
contains: () => false,
suggest: () => [],
addWord: addWordSpy,
};
const checker = new SpellChecker({
engine,
customWords: ['customterm', 'anotherword'],
});
await checker.initialize();
expect(addWordSpy).toHaveBeenCalledWith('customterm');
expect(addWordSpy).toHaveBeenCalledWith('anotherword');
});
it('does not call addWord() on engine if no customWords are provided', async () => {
const addWordSpy = vi.fn();
const engine: SpellEngine = {
isReady: () => true,
contains: () => false,
suggest: () => [],
addWord: addWordSpy,
};
const checker = new SpellChecker({ engine });
await checker.initialize();
expect(addWordSpy).not.toHaveBeenCalled();
});
});
describe('engine with bigramFrequency defined but returning zero for all pairs', () => {
it('falls back to frequency-based ordering when all bigram scores are zero', async () => {
// bigramFrequency is present but always returns 0 — no context signal.
// The frequency-ranked order from suggest() should be preserved.
const engine: SpellEngine & { bigramFrequency(w1: string, w2: string): number } = {
isReady: () => true,
contains: (word: string) => ['hi', 'his', 'new'].includes(word),
suggest: (_word: string, max = 5) =>
([
{ word: 'his', distance: 1, frequency: 900_000 },
{ word: 'hi', distance: 1, frequency: 500_000 },
] as SpellSuggestion[]).slice(0, max),
addWord: () => {},
bigramFrequency: () => 0,
};
const checker = new SpellChecker({
engine,
confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
});
await checker.initialize();
const result = await checker.checkText('hio');
const error = result.errors.find((e) => e.word === 'hio');
expect(error).toBeDefined();
// All bigrams are 0, so the context-preferred candidate equals the
// frequency-preferred candidate — no reordering occurs.
expect(error!.suggestions[0]).toBe('his');
});
});
});

View file

@ -1,4 +1,6 @@
import type { CorrectionDecision } from '../confidence/confidence-scorer.js';
import type { DictionaryDataLoader } from '../dictionaries/core/dictionary-loader.js';
import type { SpellEngine } from '../engines/types.js';
export interface SpellCheckResult {
word: string;
@ -35,6 +37,8 @@ export interface SpellCheckOptions {
confidenceThresholds?: ConfidenceThresholds;
enableSplitWordDetection?: boolean;
enableJoinedWordDetection?: boolean;
loader?: DictionaryDataLoader;
engine?: SpellEngine;
}
export interface DictionaryConfig {

View file

@ -1,5 +1,5 @@
import { describe, test, expect } from 'vitest';
import { ChunkSplitter } from '../../src/splitters/chunk-splitter';
import { ChunkSplitter } from '../../src/splitters/chunk-splitter.js';
describe('ChunkSplitter', () => {
test('should split text into chunks based on max size', () => {

View file

@ -1,5 +1,5 @@
import { describe, test, expect } from 'vitest';
import { SentenceSplitter } from '../../src/splitters/sentence-splitter';
import { SentenceSplitter } from '../../src/splitters/sentence-splitter.js';
describe('SentenceSplitter', () => {
const splitter = new SentenceSplitter();

View file

@ -1,5 +1,5 @@
import { describe, test, expect } from 'vitest';
import { CaseTransformer } from '../../src/transformers/case-transformer';
import { CaseTransformer } from '../../src/transformers/case-transformer.js';
describe('CaseTransformer', () => {
const transformer = new CaseTransformer();

View file

@ -1,5 +1,5 @@
import { describe, test, expect } from 'vitest';
import { Redactor } from '../../src/transformers/redactor';
import { Redactor } from '../../src/transformers/redactor.js';
describe('Redactor', () => {
const redactor = new Redactor();

View file

@ -1,5 +1,5 @@
import { describe, test, expect } from 'vitest';
import { TemplateEngine } from '../../src/transformers/template-engine';
import { TemplateEngine } from '../../src/transformers/template-engine.js';
describe('TemplateEngine', () => {
const engine = new TemplateEngine();

View file

@ -7,7 +7,7 @@ import {
getSpellcheckDataPath as _getSpellcheckDataPath,
PATHS,
verifyFileExists
} from './paths';
} from './paths.js';
describe('Path utilities', () => {
describe('getProjectRoot', () => {

View file

@ -33,13 +33,19 @@ export function getProjectRoot(): string {
return process.cwd();
}
/**
 * Get the root path for dictionary/spellcheck data files.
 * Used by NodeDictionaryLoader as its root path.
 *
 * @returns `<project root>/src/data`, where the project root is whatever
 *   `getProjectRoot()` resolves to.
 */
export function getDataRoot(): string {
return path.join(getProjectRoot(), 'src', 'data');
}
/**
* Get the absolute path to a data file
*/
export function getDataPath(...segments: string[]): string {
const projectRoot = getProjectRoot();
return path.join(projectRoot, 'src', 'data', ...segments);
return path.join(getDataRoot(), ...segments);
}
/**

View file

@ -1,5 +1,5 @@
import { describe, test, expect } from 'vitest';
import { EmailValidator } from '../../src/validators/email-validator';
import { EmailValidator } from '../../src/validators/email-validator.js';
describe('EmailValidator', () => {
const validator = new EmailValidator();

View file

@ -1,5 +1,5 @@
import { describe, test, expect } from 'vitest';
import { JSONValidator } from '../../src/validators/json-validator';
import { JSONValidator } from '../../src/validators/json-validator.js';
describe('JSONValidator', () => {
const validator = new JSONValidator();