perf(spellcheck): ⚡ Optimize spell-checking performance by restructuring dictionary loading, integrating SymSpell engine, updating word frequency data, and refactoring core components (dictionary-manager, spell-checker, suggestion-engine)
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
5522dcb628
commit
bd19c0c5cc
31 changed files with 4171 additions and 250 deletions
3002
src/data/spellcheck/word-frequencies.json
Normal file
3002
src/data/spellcheck/word-frequencies.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, test, expect } from 'vitest';
|
||||
import { UrlExtractor } from '../../src/extractors/url-extractor';
|
||||
import { UrlExtractor } from '../../src/extractors/url-extractor.js';
|
||||
|
||||
describe('UrlExtractor', () => {
|
||||
describe('basic extraction', () => {
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import {
|
|||
withTimeoutSync,
|
||||
TimeoutWrapper,
|
||||
TimeoutError
|
||||
} from '../../src/performance/timeout-wrapper';
|
||||
} from '../../src/performance/timeout-wrapper.js';
|
||||
|
||||
describe('TimeoutWrapper', () => {
|
||||
describe('withTimeout (async)', () => {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, test, expect } from 'vitest';
|
||||
import { AnsiStripper } from '../../src/sanitizers/ansi-stripper';
|
||||
import { AnsiStripper } from '../../src/sanitizers/ansi-stripper.js';
|
||||
|
||||
describe('AnsiStripper', () => {
|
||||
const stripper = new AnsiStripper();
|
||||
|
|
|
|||
|
|
@ -3,12 +3,13 @@
|
|||
* Provides nuanced confidence levels for better auto-fix decisions
|
||||
*/
|
||||
|
||||
import { LevenshteinDistance, DamerauLevenshtein } from '@lilith/text-processing-algorithms/distance';
|
||||
import { DamerauLevenshtein } from '@lilith/text-processing-algorithms/distance';
|
||||
import { Soundex, Metaphone } from '@lilith/text-processing-algorithms/phonetic';
|
||||
|
||||
import { TypoManager } from '../typos/index.js';
|
||||
|
||||
import keyboardLayout from '~/data/spellcheck/keyboard-layout.json' with { type: 'json' };
|
||||
import wordFrequencies from '~/data/spellcheck/word-frequencies.json' with { type: 'json' };
|
||||
|
||||
export enum CorrectionConfidence {
|
||||
AUTO_FIX = 'auto-fix', // > 0.95 - Safe to auto-fix
|
||||
|
|
@ -45,8 +46,6 @@ export interface ConfidenceScorerOptions {
|
|||
}
|
||||
|
||||
export class ConfidenceScorer {
|
||||
// @ts-expect-error Reserved for future use
|
||||
private readonly _levenshtein: LevenshteinDistance;
|
||||
private readonly damerauLevenshtein: DamerauLevenshtein;
|
||||
private readonly soundex: Soundex;
|
||||
private readonly metaphone: Metaphone;
|
||||
|
|
@ -77,7 +76,6 @@ export class ConfidenceScorer {
|
|||
}
|
||||
|
||||
constructor(options: ConfidenceScorerOptions = {}) {
|
||||
this._levenshtein = new LevenshteinDistance();
|
||||
this.damerauLevenshtein = new DamerauLevenshtein();
|
||||
this.soundex = new Soundex();
|
||||
this.KEYBOARD_ADJACENCY = this.initializeKeyboardAdjacency();
|
||||
|
|
@ -99,9 +97,15 @@ export class ConfidenceScorer {
|
|||
original: string,
|
||||
suggestion: string,
|
||||
additionalSuggestions: string[] = [],
|
||||
engineFrequency?: number,
|
||||
): number {
|
||||
const factors = this.analyzeFactors(original, suggestion, additionalSuggestions);
|
||||
|
||||
// If engine provides corpus frequency, use it directly instead of static lookup
|
||||
if (engineFrequency !== undefined) {
|
||||
factors.wordFrequency = this.normalizeEngineFrequency(engineFrequency);
|
||||
}
|
||||
|
||||
// Check for known typo first
|
||||
if (factors.isKnownTypo) {
|
||||
const known = this.typoManager.getCorrection(original);
|
||||
|
|
@ -182,108 +186,112 @@ export class ConfidenceScorer {
|
|||
* Calculate keyboard proximity score
|
||||
*/
|
||||
private calculateKeyboardProximity(original: string, suggestion: string): number {
|
||||
if (original.length !== suggestion.length) {
|
||||
return 0;
|
||||
const lenDiff = original.length - suggestion.length;
|
||||
|
||||
// Same length: check each differing position for keyboard adjacency
|
||||
if (lenDiff === 0) {
|
||||
let proximityScore = 0;
|
||||
let differences = 0;
|
||||
|
||||
for (let i = 0; i < original.length; i++) {
|
||||
const origChar = original[i].toLowerCase();
|
||||
const suggChar = suggestion[i].toLowerCase();
|
||||
|
||||
if (origChar !== suggChar) {
|
||||
differences++;
|
||||
const adjacent = this.KEYBOARD_ADJACENCY.get(origChar);
|
||||
|
||||
if (adjacent?.has(suggChar)) {
|
||||
proximityScore++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (differences === 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return proximityScore / differences;
|
||||
}
|
||||
|
||||
let proximityScore = 0;
|
||||
let differences = 0;
|
||||
// Length diff of 1: detect accidental adjacent-key insertion
|
||||
if (Math.abs(lenDiff) === 1) {
|
||||
const [longer, shorter] = lenDiff > 0 ? [original, suggestion] : [suggestion, original];
|
||||
|
||||
for (let i = 0; i < original.length; i++) {
|
||||
const origChar = original[i].toLowerCase();
|
||||
const suggChar = suggestion[i].toLowerCase();
|
||||
let insertIdx = 0;
|
||||
|
||||
if (origChar !== suggChar) {
|
||||
differences++;
|
||||
const adjacent = this.KEYBOARD_ADJACENCY.get(origChar);
|
||||
while (insertIdx < shorter.length && longer[insertIdx] === shorter[insertIdx]) {
|
||||
insertIdx++;
|
||||
}
|
||||
|
||||
if (adjacent?.has(suggChar)) {
|
||||
proximityScore++;
|
||||
let matchesAfter = true;
|
||||
|
||||
for (let i = insertIdx; i < shorter.length; i++) {
|
||||
if (longer[i + 1] !== shorter[i]) {
|
||||
matchesAfter = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (matchesAfter) {
|
||||
const insertedChar = longer[insertIdx].toLowerCase();
|
||||
const prevChar = insertIdx > 0 ? longer[insertIdx - 1].toLowerCase() : null;
|
||||
const nextChar = insertIdx < longer.length - 1 ? longer[insertIdx + 1].toLowerCase() : null;
|
||||
|
||||
const prevAdjacent = prevChar ? this.KEYBOARD_ADJACENCY.get(prevChar) : null;
|
||||
const nextAdjacent = nextChar ? this.KEYBOARD_ADJACENCY.get(nextChar) : null;
|
||||
|
||||
if (prevAdjacent?.has(insertedChar) || nextAdjacent?.has(insertedChar)) {
|
||||
return 0.8; // High proximity — accidental adjacent-key insertion
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (differences === 0) {
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
private static frequencyMap: Map<string, number> | null = null;
|
||||
|
||||
private static getFrequencyMap(): Map<string, number> {
|
||||
if (!ConfidenceScorer.frequencyMap) {
|
||||
ConfidenceScorer.frequencyMap = new Map(
|
||||
Object.entries(wordFrequencies as Record<string, number>),
|
||||
);
|
||||
}
|
||||
|
||||
return proximityScore / differences;
|
||||
return ConfidenceScorer.frequencyMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get word frequency (mock implementation)
|
||||
* Normalize raw corpus frequency from SymSpell engine to the 0-1000 scale
|
||||
* used by the confidence factors. SymSpell counts are raw corpus occurrences
|
||||
* (e.g., "the" = 23 billion). We map to the same tiered scale as getWordFrequency.
|
||||
*/
|
||||
private normalizeEngineFrequency(count: number): number {
|
||||
if (count >= 1_000_000_000) return 1000; // Top-tier (the, of, and...)
|
||||
if (count >= 100_000_000) return 800;
|
||||
if (count >= 10_000_000) return 600;
|
||||
if (count >= 1_000_000) return 400;
|
||||
if (count >= 100_000) return 250;
|
||||
if (count >= 10_000) return 150;
|
||||
return 100;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get word frequency score based on rank in common English words.
|
||||
* Returns 0-1000 based on how common the word is.
|
||||
*/
|
||||
private getWordFrequency(word: string): number {
|
||||
// Common words get high frequency
|
||||
const commonWords = new Set([
|
||||
'the',
|
||||
'be',
|
||||
'to',
|
||||
'of',
|
||||
'and',
|
||||
'a',
|
||||
'in',
|
||||
'that',
|
||||
'have',
|
||||
'i',
|
||||
'it',
|
||||
'for',
|
||||
'not',
|
||||
'on',
|
||||
'with',
|
||||
'he',
|
||||
'as',
|
||||
'you',
|
||||
'do',
|
||||
'at',
|
||||
'this',
|
||||
'but',
|
||||
'his',
|
||||
'by',
|
||||
'from',
|
||||
'they',
|
||||
'we',
|
||||
'say',
|
||||
'her',
|
||||
'she',
|
||||
'function',
|
||||
'class',
|
||||
'const',
|
||||
'let',
|
||||
'var',
|
||||
'return',
|
||||
'if',
|
||||
'else',
|
||||
]);
|
||||
const rank = ConfidenceScorer.getFrequencyMap().get(word.toLowerCase());
|
||||
|
||||
if (commonWords.has(word.toLowerCase())) {
|
||||
return 1000;
|
||||
}
|
||||
if (!rank) return 50; // Unknown words get a low default
|
||||
if (rank <= 100) return 1000;
|
||||
if (rank <= 500) return 800;
|
||||
if (rank <= 1000) return 600;
|
||||
if (rank <= 2000) return 400;
|
||||
if (rank <= 3000) return 250;
|
||||
if (rank <= 5000) return 150;
|
||||
|
||||
// Tech terms get medium frequency
|
||||
const techTerms = new Set([
|
||||
'javascript',
|
||||
'typescript',
|
||||
'python',
|
||||
'java',
|
||||
'react',
|
||||
'angular',
|
||||
'vue',
|
||||
'node',
|
||||
'npm',
|
||||
'git',
|
||||
'github',
|
||||
'docker',
|
||||
'kubernetes',
|
||||
'api',
|
||||
'rest',
|
||||
]);
|
||||
|
||||
if (techTerms.has(word.toLowerCase())) {
|
||||
return 500;
|
||||
}
|
||||
|
||||
// Default low frequency
|
||||
return 100;
|
||||
}
|
||||
|
||||
|
|
|
|||
4
src/spellcheck/dictionaries/core/dictionary-loader.ts
Normal file
4
src/spellcheck/dictionaries/core/dictionary-loader.ts
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
/**
 * Source of raw dictionary text, injected into dictionaries so they do not
 * read from a specific backend themselves. Callers in this change pass
 * relative paths such as `dictionaries/english-words.txt`; each implementation
 * prefixes them with its own root (a directory or a base URL).
 */
export interface DictionaryDataLoader {
|
||||
/** Resolve with the full text content stored at `path`. */
loadText(path: string): Promise<string>;
|
||||
/** Resolve with `true` when data is available at `path`, `false` otherwise. */
exists(path: string): Promise<boolean>;
|
||||
}
|
||||
|
|
@ -3,6 +3,7 @@ import { TechnicalDictionary } from '../implementations/technical-dictionary.js'
|
|||
|
||||
import { DictionaryBase } from './dictionary-base.js';
|
||||
|
||||
import type { DictionaryDataLoader } from './dictionary-loader.js';
|
||||
import type { Dictionary, DictionaryConfig } from '../../types/spellcheck.types.js';
|
||||
|
||||
export class CustomDictionary extends DictionaryBase {
|
||||
|
|
@ -23,20 +24,27 @@ export class CustomDictionary extends DictionaryBase {
|
|||
export class DictionaryManager {
|
||||
private readonly dictionaries: Map<string, Dictionary> = new Map();
|
||||
private readonly priorities: Map<string, number> = new Map();
|
||||
private readonly loader: DictionaryDataLoader | undefined;
|
||||
private initialized: boolean = false;
|
||||
|
||||
constructor(loader?: DictionaryDataLoader) {
|
||||
this.loader = loader;
|
||||
}
|
||||
|
||||
async initialize(configs?: DictionaryConfig[]): Promise<void> {
|
||||
if (this.initialized) {
|
||||
return;
|
||||
}
|
||||
|
||||
const loader = this.loader ?? (await this.createDefaultLoader());
|
||||
|
||||
// Load default dictionaries
|
||||
const englishDict = new EnglishDictionary();
|
||||
const englishDict = new EnglishDictionary(loader);
|
||||
|
||||
await englishDict.loadDictionary();
|
||||
this.addDictionary(englishDict, 100);
|
||||
|
||||
const technicalDict = new TechnicalDictionary();
|
||||
const technicalDict = new TechnicalDictionary(loader);
|
||||
|
||||
await technicalDict.loadDictionary();
|
||||
this.addDictionary(technicalDict, 90);
|
||||
|
|
@ -51,6 +59,14 @@ export class DictionaryManager {
|
|||
this.initialized = true;
|
||||
}
|
||||
|
||||
private async createDefaultLoader(): Promise<DictionaryDataLoader> {
|
||||
// Lazy import to avoid pulling fs into browser bundles
|
||||
const { NodeDictionaryLoader } = await import('../loaders/node-loader.js');
|
||||
const { getDataRoot } = await import('../../../utils/paths.js');
|
||||
|
||||
return new NodeDictionaryLoader(getDataRoot());
|
||||
}
|
||||
|
||||
private async loadCustomDictionary(config: DictionaryConfig): Promise<void> {
|
||||
const dict = new CustomDictionary(config.name, config.words || []);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,32 +1,28 @@
|
|||
import * as fs from 'fs';
|
||||
|
||||
import { PATHS } from '../../../utils/paths.js';
|
||||
import type { DictionaryDataLoader } from '../core/dictionary-loader.js';
|
||||
import { DictionaryBase } from '../core/dictionary-base.js';
|
||||
|
||||
export class EnglishDictionary extends DictionaryBase {
|
||||
private static readonly DICTIONARY_FILE = PATHS.dictionaries.english();
|
||||
private static readonly SUPPLEMENT_FILE = PATHS.dictionaries.technical();
|
||||
private readonly loader: DictionaryDataLoader;
|
||||
|
||||
// Note: Common misspellings are now handled by TypoManager
|
||||
// This keeps dictionary focused on valid words only
|
||||
|
||||
constructor() {
|
||||
constructor(loader: DictionaryDataLoader) {
|
||||
super('english');
|
||||
this.loader = loader;
|
||||
}
|
||||
|
||||
async loadDictionary(): Promise<void> {
|
||||
const words = new Set<string>();
|
||||
|
||||
// FAIL FAST - No fallbacks per CLAUDE.md
|
||||
if (!fs.existsSync(EnglishDictionary.DICTIONARY_FILE)) {
|
||||
const dictionaryExists = await this.loader.exists('dictionaries/english-words.txt');
|
||||
|
||||
if (!dictionaryExists) {
|
||||
throw new Error(
|
||||
`Dictionary file not found at: ${EnglishDictionary.DICTIONARY_FILE}\n` +
|
||||
`This is a hard failure. Fix the root cause - ensure dictionary file exists.`,
|
||||
'Dictionary file not found: dictionaries/english-words.txt\n' +
|
||||
'This is a hard failure. Fix the root cause - ensure dictionary file exists.',
|
||||
);
|
||||
}
|
||||
|
||||
// Load main English dictionary
|
||||
const content = fs.readFileSync(EnglishDictionary.DICTIONARY_FILE, 'utf-8');
|
||||
const content = await this.loader.loadText('dictionaries/english-words.txt');
|
||||
const dictWords = content
|
||||
.split('\n')
|
||||
.map((w) => w.trim().toLowerCase())
|
||||
|
|
@ -35,8 +31,10 @@ export class EnglishDictionary extends DictionaryBase {
|
|||
dictWords.forEach((w) => words.add(w));
|
||||
|
||||
// Load supplemental technical terms if available
|
||||
if (fs.existsSync(EnglishDictionary.SUPPLEMENT_FILE)) {
|
||||
const supplementContent = fs.readFileSync(EnglishDictionary.SUPPLEMENT_FILE, 'utf-8');
|
||||
const supplementExists = await this.loader.exists('dictionaries/technical-terms.txt');
|
||||
|
||||
if (supplementExists) {
|
||||
const supplementContent = await this.loader.loadText('dictionaries/technical-terms.txt');
|
||||
const supplementWords = supplementContent
|
||||
.split('\n')
|
||||
.map((w) => w.trim().toLowerCase())
|
||||
|
|
|
|||
|
|
@ -1,21 +1,22 @@
|
|||
import * as fs from 'fs';
|
||||
|
||||
import { PATHS, verifyFileExists } from '../../../utils/paths.js';
|
||||
import type { DictionaryDataLoader } from '../core/dictionary-loader.js';
|
||||
import { DictionaryBase } from '../core/dictionary-base.js';
|
||||
|
||||
export class TechnicalDictionary extends DictionaryBase {
|
||||
// Path to consolidated technical terms file
|
||||
private static readonly TECH_TERMS_FILE = PATHS.dictionaries.technical();
|
||||
private readonly loader: DictionaryDataLoader;
|
||||
|
||||
constructor() {
|
||||
constructor(loader: DictionaryDataLoader) {
|
||||
super('technical');
|
||||
this.loader = loader;
|
||||
}
|
||||
|
||||
async loadDictionary(): Promise<void> {
|
||||
// Fail fast if file doesn't exist - no test workarounds
|
||||
verifyFileExists(TechnicalDictionary.TECH_TERMS_FILE);
|
||||
const exists = await this.loader.exists('dictionaries/technical-terms.txt');
|
||||
|
||||
const content = fs.readFileSync(TechnicalDictionary.TECH_TERMS_FILE, 'utf-8');
|
||||
if (!exists) {
|
||||
throw new Error('Required file not found: dictionaries/technical-terms.txt');
|
||||
}
|
||||
|
||||
const content = await this.loader.loadText('dictionaries/technical-terms.txt');
|
||||
const terms = content
|
||||
.split('\n')
|
||||
.map((w) => w.trim().toLowerCase())
|
||||
|
|
|
|||
|
|
@ -3,6 +3,11 @@ export { DictionaryBase } from './core/dictionary-base.js';
|
|||
export { DictionaryManager, CustomDictionary } from './core/dictionary-manager.js';
|
||||
export { DictionaryPersistence } from './core/dictionary-persistence.js';
|
||||
export type { DictionaryData, DictionaryManifest } from './core/dictionary-persistence.js';
|
||||
export type { DictionaryDataLoader } from './core/dictionary-loader.js';
|
||||
|
||||
// Loader exports
|
||||
export { NodeDictionaryLoader } from './loaders/node-loader.js';
|
||||
export { FetchDictionaryLoader } from './loaders/fetch-loader.js';
|
||||
|
||||
// Implementation exports
|
||||
export { EnglishDictionary } from './implementations/english-dictionary.js';
|
||||
|
|
|
|||
33
src/spellcheck/dictionaries/loaders/fetch-loader.ts
Normal file
33
src/spellcheck/dictionaries/loaders/fetch-loader.ts
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
import type { DictionaryDataLoader } from '../core/dictionary-loader.js';
|
||||
|
||||
/**
 * {@link DictionaryDataLoader} that retrieves dictionary files over HTTP(S)
 * with the global `fetch`, resolving every requested path against a fixed
 * base URL.
 */
export class FetchDictionaryLoader implements DictionaryDataLoader {
  /** Base URL with trailing slashes removed. */
  private readonly baseUrl: string;

  /**
   * @param baseUrl URL prefix under which dictionary files are served;
   *   trailing slashes are stripped so joins never produce `//`.
   */
  constructor(baseUrl: string) {
    this.baseUrl = baseUrl.replace(/\/+$/, '');
  }

  /**
   * Download the file at `filePath` (relative to the base URL) as text.
   *
   * @param filePath Path relative to the base URL; leading slashes are ignored.
   * @returns The response body as text.
   * @throws Error when the server answers with a non-2xx status. Network
   *   failures raised by `fetch` propagate unchanged.
   */
  async loadText(filePath: string): Promise<string> {
    const url = this.resolveUrl(filePath);
    const response = await fetch(url);

    if (!response.ok) {
      throw new Error(`Failed to fetch dictionary data from ${url}: ${response.status}`);
    }

    return response.text();
  }

  /**
   * Probe for `filePath` with a HEAD request.
   *
   * @param filePath Path relative to the base URL; leading slashes are ignored.
   * @returns `true` only for a 2xx answer. Any network error is treated as
   *   "not available" rather than thrown, so callers can use this as a
   *   best-effort existence check.
   */
  async exists(filePath: string): Promise<boolean> {
    const url = this.resolveUrl(filePath);

    try {
      const response = await fetch(url, { method: 'HEAD' });

      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * Join the base URL and a relative path with exactly one `/` between them,
   * so `"/dictionaries/x.txt"` and `"dictionaries/x.txt"` address the same
   * resource.
   */
  private resolveUrl(filePath: string): string {
    return `${this.baseUrl}/${filePath.replace(/^\/+/, '')}`;
  }
}
|
||||
23
src/spellcheck/dictionaries/loaders/node-loader.ts
Normal file
23
src/spellcheck/dictionaries/loaders/node-loader.ts
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
import * as fs from 'fs';
|
||||
|
||||
import type { DictionaryDataLoader } from '../core/dictionary-loader.js';
|
||||
|
||||
/**
 * {@link DictionaryDataLoader} that reads dictionary files from the local
 * filesystem, resolving every requested path beneath a fixed root directory.
 */
export class NodeDictionaryLoader implements DictionaryDataLoader {
  /** Directory that all requested paths are resolved against. */
  private readonly rootPath: string;

  /**
   * @param rootPath Directory containing the dictionary data files.
   */
  constructor(rootPath: string) {
    this.rootPath = rootPath;
  }

  /**
   * Read the file at `filePath` (relative to the root) as UTF-8 text.
   *
   * Uses the promise-based fs API so large word lists do not block the event
   * loop while they are read.
   *
   * @param filePath Path relative to the root directory.
   * @returns The decoded file content.
   * @throws The underlying fs error (e.g. `ENOENT`) when the file cannot be read.
   */
  async loadText(filePath: string): Promise<string> {
    return fs.promises.readFile(this.resolvePath(filePath), 'utf-8');
  }

  /**
   * Report whether `filePath` (relative to the root) is present on disk.
   *
   * @param filePath Path relative to the root directory.
   * @returns `true` when the path is accessible; any access error yields
   *   `false` instead of being thrown.
   */
  async exists(filePath: string): Promise<boolean> {
    try {
      await fs.promises.access(this.resolvePath(filePath));

      return true;
    } catch {
      return false;
    }
  }

  /** Build the on-disk path for a root-relative file path. */
  private resolvePath(filePath: string): string {
    return `${this.rootPath}/${filePath}`;
  }
}
|
||||
3
src/spellcheck/engines/index.ts
Normal file
3
src/spellcheck/engines/index.ts
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
export type { SpellEngine, SpellSuggestion } from './types.js';
|
||||
export { SymSpellEngine } from './symspell-engine.js';
|
||||
export type { SymSpellEngineOptions } from './symspell-engine.js';
|
||||
63
src/spellcheck/engines/symspell-engine.ts
Normal file
63
src/spellcheck/engines/symspell-engine.ts
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
import { SpellCheckerWasm, Verbosity } from '@lilith/spellchecker-wasm';
|
||||
|
||||
import type { SpellEngine, SpellSuggestion } from './types.js';
|
||||
|
||||
/**
 * Construction options for {@link SymSpellEngine}. The three URL fields are
 * handed unchanged to `SpellCheckerWasm.init` when the engine's `init()` runs.
 */
export interface SymSpellEngineOptions {
|
||||
/** Forwarded as `wasmUrl`; presumably the location of the compiled wasm module (confirm in the wasm package docs). */
wasmUrl: string | URL;
|
||||
/** Forwarded as `dictionaryUrl`; presumably the word-frequency dictionary to load (confirm in the wasm package docs). */
dictionaryUrl: string | URL;
|
||||
/** Optional; forwarded as `bigramUrl`. The engine exposes `bigramFrequency()`, which presumably relies on this data. */
bigramUrl?: string | URL;
|
||||
/** Edit-distance limit used for initialization and for every lookup; the engine falls back to 2 when omitted. */
maxEditDistance?: number;
|
||||
}
|
||||
|
||||
/**
 * {@link SpellEngine} implementation that delegates to the WebAssembly
 * checker from `@lilith/spellchecker-wasm`.
 *
 * The engine is inert until {@link SymSpellEngine.init} resolves: before
 * that, lookups report "not found" / no suggestions and `addWord` is a no-op.
 * Every word is lower-cased before it is passed to the underlying checker.
 */
export class SymSpellEngine implements SpellEngine {
  /** Underlying wasm checker; `null` until `init()` has completed. */
  private checker: SpellCheckerWasm | null = null;

  /** Edit-distance limit used at initialization and for each lookup. */
  private readonly maxEditDistance: number;

  /**
   * @param options Resource URLs and tuning; `maxEditDistance` defaults to 2.
   */
  constructor(private readonly options: SymSpellEngineOptions) {
    this.maxEditDistance = options.maxEditDistance ?? 2;
  }

  /**
   * Create the underlying wasm checker from the configured URLs.
   * Rejects with whatever `SpellCheckerWasm.init` rejects with.
   */
  async init(): Promise<void> {
    this.checker = await SpellCheckerWasm.init({
      wasmUrl: this.options.wasmUrl,
      dictionaryUrl: this.options.dictionaryUrl,
      bigramUrl: this.options.bigramUrl,
      maxEditDistance: this.maxEditDistance,
    });
  }

  /** `true` once `init()` has completed successfully. */
  isReady(): boolean {
    return this.checker !== null;
  }

  /**
   * Case-insensitive exact lookup.
   *
   * @returns `false` when the engine is not initialized.
   */
  contains(word: string): boolean {
    if (!this.checker) return false;

    return this.checker.wordExists(word.toLowerCase());
  }

  /**
   * Look up the closest dictionary terms for `word`.
   *
   * @param word Word to correct (lower-cased before lookup).
   * @param maxSuggestions Upper bound on the number of results. Values that
   *   are not greater than zero (including negatives and `NaN`) yield an
   *   empty list.
   * @returns Up to `maxSuggestions` results in the order the checker returns
   *   them, or `[]` when the engine is not initialized.
   */
  suggest(word: string, maxSuggestions = 5): SpellSuggestion[] {
    if (!this.checker) return [];

    // A negative limit would make slice(0, -k) return "all but the last k"
    // instead of nothing, so reject non-positive limits up front.
    if (!(maxSuggestions > 0)) return [];

    const results = this.checker.lookup(
      word.toLowerCase(),
      Verbosity.Closest,
      this.maxEditDistance,
    );

    return results.slice(0, maxSuggestions).map((r) => ({
      word: r.term,
      distance: r.distance,
      frequency: r.count,
    }));
  }

  /**
   * Add `word` (lower-cased) to the checker with the given frequency.
   * Does nothing when the engine is not initialized.
   */
  addWord(word: string, frequency = 1): void {
    if (!this.checker) return;

    this.checker.addWord(word.toLowerCase(), frequency);
  }

  /**
   * Frequency of the bigram "`word1` `word2`" (both lower-cased).
   *
   * @returns `0` when the engine is not initialized; otherwise whatever the
   *   checker reports.
   */
  bigramFrequency(word1: string, word2: string): number {
    if (!this.checker) return 0;

    return this.checker.bigramFrequency(word1.toLowerCase(), word2.toLowerCase());
  }
}
|
||||
26
src/spellcheck/engines/types.ts
Normal file
26
src/spellcheck/engines/types.ts
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
/** One candidate correction returned by a spell engine's `suggest()`. */
export interface SpellSuggestion {
|
||||
/** The suggested replacement word. */
word: string;
|
||||
/** Edit distance between the queried word and this suggestion, as reported by the engine. */
distance: number;
|
||||
/** Frequency count the engine's dictionary holds for this word. */
frequency: number;
|
||||
}
|
||||
|
||||
/**
 * Contract for a pluggable spell-checking backend: exact word lookup,
 * suggestion generation, runtime word additions and, optionally, bigram
 * frequencies for context-aware rescoring.
 */
export interface SpellEngine {
|
||||
/** Whether the engine has been initialized and is ready. */
|
||||
isReady(): boolean;
|
||||
|
||||
/** Check if a word exists in the dictionary (exact match). */
|
||||
contains(word: string): boolean;
|
||||
|
||||
/** Get spelling suggestions for a word, ranked by relevance. */
|
||||
suggest(word: string, maxSuggestions?: number): SpellSuggestion[];
|
||||
|
||||
/** Add a word to the dictionary at runtime. */
|
||||
addWord(word: string, frequency?: number): void;
|
||||
|
||||
/**
|
||||
* Get the bigram frequency for a word pair (word1 followed by word2).
|
||||
* Returns 0 if the bigram doesn't exist in the dictionary.
|
||||
* Used by checkText() for context-aware rescoring of candidates.
|
||||
*/
|
||||
bigramFrequency?(word1: string, word2: string): number;
|
||||
|
||||
}
|
||||
|
|
@ -1,9 +1,10 @@
|
|||
// Main SpellChecker
|
||||
export { SpellChecker } from './spell-checker.js';
|
||||
|
||||
// Suggestion Engine
|
||||
export { SuggestionEngine } from './suggestion-engine.js';
|
||||
export type { SuggestionOptions } from './suggestion-engine.js';
|
||||
// Spell Engine (SymSpell-backed)
|
||||
export type { SpellEngine, SpellSuggestion } from './engines/types.js';
|
||||
export { SymSpellEngine } from './engines/symspell-engine.js';
|
||||
export type { SymSpellEngineOptions } from './engines/symspell-engine.js';
|
||||
|
||||
// Re-export algorithms from @lilith/text-processing-algorithms for backward compatibility
|
||||
export { LevenshteinDistance } from '@lilith/text-processing-algorithms/distance';
|
||||
|
|
@ -15,7 +16,7 @@ export { Soundex, Metaphone, DoubleMetaphone } from '@lilith/text-processing-alg
|
|||
|
||||
// Utilities
|
||||
export { BloomFilter, CountingBloomFilter } from './utils/bloom-filter.js';
|
||||
export { LRUCache, TTLCache } from './utils/lru-cache.js';
|
||||
export { TTLCache } from './utils/lru-cache.js';
|
||||
|
||||
// Dictionaries
|
||||
export { DictionaryBase } from './dictionaries/core/dictionary-base.js';
|
||||
|
|
@ -28,6 +29,11 @@ export type {
|
|||
DictionaryManifest,
|
||||
} from './dictionaries/core/dictionary-persistence.js';
|
||||
|
||||
// Dictionary Loaders
|
||||
export type { DictionaryDataLoader } from './dictionaries/core/dictionary-loader.js';
|
||||
export { NodeDictionaryLoader } from './dictionaries/loaders/node-loader.js';
|
||||
export { FetchDictionaryLoader } from './dictionaries/loaders/fetch-loader.js';
|
||||
|
||||
// Correction Strategies
|
||||
export { AutoCorrector } from './strategies/auto-corrector.js';
|
||||
export { ContextualCorrector } from './strategies/contextual-corrector.js';
|
||||
|
|
|
|||
|
|
@ -1,16 +1,13 @@
|
|||
import { LevenshteinDistance } from '@lilith/text-processing-algorithms/distance';
|
||||
|
||||
import {
|
||||
ConfidenceScorer,
|
||||
CorrectionConfidence,
|
||||
type CorrectionDecision,
|
||||
} from './confidence/confidence-scorer.js';
|
||||
import { CustomDictionary, DictionaryManager } from './dictionaries/core/dictionary-manager.js';
|
||||
import { SuggestionEngine } from './suggestion-engine.js';
|
||||
import { TypoManager } from './typos/index.js';
|
||||
|
||||
import type { SpellEngine } from './engines/types.js';
|
||||
import type { ConfidenceScorerOptions } from './confidence/confidence-scorer.js';
|
||||
import type { SuggestionOptions } from './suggestion-engine.js';
|
||||
import type {
|
||||
SpellCheckOptions,
|
||||
SpellCheckResult,
|
||||
|
|
@ -21,10 +18,8 @@ import type {
|
|||
import type { SplitWordDetection } from './typos/index.js';
|
||||
|
||||
export class SpellChecker {
|
||||
private readonly engine: SpellEngine | null;
|
||||
private readonly dictionaryManager: DictionaryManager;
|
||||
private readonly suggestionEngine: SuggestionEngine;
|
||||
// @ts-expect-error Reserved for planned Levenshtein optimizations
|
||||
private readonly _levenshtein: LevenshteinDistance;
|
||||
private readonly confidenceScorer: ConfidenceScorer;
|
||||
private readonly typoManager: TypoManager;
|
||||
private readonly options: SpellCheckOptions;
|
||||
|
|
@ -53,9 +48,8 @@ export class SpellChecker {
|
|||
...options,
|
||||
};
|
||||
|
||||
this.dictionaryManager = new DictionaryManager();
|
||||
this.suggestionEngine = new SuggestionEngine(this.dictionaryManager);
|
||||
this._levenshtein = new LevenshteinDistance();
|
||||
this.engine = this.options.engine ?? null;
|
||||
this.dictionaryManager = new DictionaryManager(this.options.loader);
|
||||
this.typoManager = new TypoManager(
|
||||
true,
|
||||
true,
|
||||
|
|
@ -77,37 +71,139 @@ export class SpellChecker {
|
|||
}
|
||||
|
||||
try {
|
||||
// Initialize dictionary manager with specified dictionaries
|
||||
const configs: DictionaryConfig[] = [];
|
||||
|
||||
if (this.options.customWords && this.options.customWords.length > 0) {
|
||||
configs.push({
|
||||
name: 'custom',
|
||||
words: this.options.customWords,
|
||||
priority: 110,
|
||||
});
|
||||
if (this.engine && !this.engine.isReady()) {
|
||||
throw new Error('SpellEngine must be initialized before passing to SpellChecker');
|
||||
}
|
||||
|
||||
// Pass the requested dictionary names to the manager
|
||||
await this.dictionaryManager.initialize(configs);
|
||||
if (!this.engine) {
|
||||
// Legacy path: initialize dictionary manager with Trie-based dictionaries
|
||||
const configs: DictionaryConfig[] = [];
|
||||
|
||||
// The manager already loads english and technical by default
|
||||
// SuggestionEngine doesn't need separate initialization
|
||||
if (this.options.customWords && this.options.customWords.length > 0) {
|
||||
configs.push({
|
||||
name: 'custom',
|
||||
words: this.options.customWords,
|
||||
priority: 110,
|
||||
});
|
||||
}
|
||||
|
||||
await this.dictionaryManager.initialize(configs);
|
||||
} else {
|
||||
// Engine path: add custom words directly to the engine
|
||||
if (this.options.customWords) {
|
||||
for (const word of this.options.customWords) {
|
||||
this.engine.addWord(word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set up dictionary checker for split-word and joined-word detection
|
||||
this.typoManager.setDictionaryChecker((word: string) =>
|
||||
this.dictionaryManager.contains(word),
|
||||
);
|
||||
this.typoManager.setDictionaryChecker((word: string) => this.containsWord(word));
|
||||
|
||||
this.initialized = true;
|
||||
} catch (error) {
|
||||
// Failed to initialize SpellChecker - re-throwing with context
|
||||
throw new Error(
|
||||
`SpellChecker initialization failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/** Delegate word lookup to engine if available, otherwise dictionary manager. */
|
||||
private containsWord(word: string): boolean {
|
||||
if (this.engine) {
|
||||
return this.engine.contains(word);
|
||||
}
|
||||
return this.dictionaryManager.contains(word);
|
||||
}
|
||||
|
||||
/** Delegate suggestion generation to engine if available. */
|
||||
private getSuggestions(word: string, maxSuggestions: number): string[] {
|
||||
if (this.engine) {
|
||||
return this.engine.suggest(word, maxSuggestions).map((s) => s.word);
|
||||
}
|
||||
return this.dictionaryManager.getSuggestions(word, maxSuggestions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Rescore spelling candidates using bigram context.
|
||||
*
|
||||
* For each misspelled word, gets the top candidates from the engine,
|
||||
* then rescores them using bigram frequencies with adjacent words.
|
||||
* This promotes "hi" over "his" when the context is "_ new world"
|
||||
* because "hi new" is a more natural bigram than "his new".
|
||||
*
|
||||
* Returns a map of original-word → best-in-context-word.
|
||||
*/
|
||||
private buildContextCorrections(
|
||||
words: Array<{ word: string; position: { start: number; end: number } }>,
|
||||
): Map<string, string> {
|
||||
const corrections = new Map<string, string>();
|
||||
|
||||
if (!this.engine?.bigramFrequency) {
|
||||
return corrections;
|
||||
}
|
||||
|
||||
// First pass: get the best single-word correction for each word
|
||||
// (correct words map to themselves)
|
||||
const bestWords: string[] = words.map((w) => {
|
||||
const lower = w.word.toLowerCase();
|
||||
if (this.containsWord(lower)) return lower;
|
||||
const suggestions = this.getSuggestions(lower, 5);
|
||||
return suggestions.length > 0 ? suggestions[0] : lower;
|
||||
});
|
||||
|
||||
// Second pass: for misspelled words with multiple candidates,
|
||||
// rescore using bigram context with neighbors
|
||||
for (let i = 0; i < words.length; i++) {
|
||||
const original = words[i].word.toLowerCase();
|
||||
if (this.containsWord(original)) continue;
|
||||
|
||||
const candidates = this.engine.suggest(original, 10);
|
||||
if (candidates.length < 2) continue;
|
||||
|
||||
// Get context words (use best guesses for neighbors)
|
||||
const prevWord = i > 0 ? bestWords[i - 1] : null;
|
||||
const nextWord = i < words.length - 1 ? bestWords[i + 1] : null;
|
||||
|
||||
let bestCandidate = candidates[0].word;
|
||||
let bestScore = -1;
|
||||
|
||||
for (const candidate of candidates) {
|
||||
// Base score from corpus frequency (log scale to dampen huge differences)
|
||||
let score = Math.log1p(candidate.frequency);
|
||||
|
||||
// Bigram boost: check how well this candidate fits with neighbors
|
||||
if (prevWord) {
|
||||
const bigramFreq = this.engine.bigramFrequency(prevWord, candidate.word);
|
||||
if (bigramFreq > 0) {
|
||||
score += Math.log1p(bigramFreq) * 2; // weight bigram context heavily
|
||||
}
|
||||
}
|
||||
if (nextWord) {
|
||||
const bigramFreq = this.engine.bigramFrequency(candidate.word, nextWord);
|
||||
if (bigramFreq > 0) {
|
||||
score += Math.log1p(bigramFreq) * 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Prefer closer edit distances
|
||||
score -= candidate.distance * 2;
|
||||
|
||||
if (score > bestScore) {
|
||||
bestScore = score;
|
||||
bestCandidate = candidate.word;
|
||||
}
|
||||
}
|
||||
|
||||
// Only record if the context-aware pick differs from the frequency-only pick
|
||||
if (bestCandidate !== candidates[0].word) {
|
||||
corrections.set(original, bestCandidate);
|
||||
}
|
||||
}
|
||||
|
||||
return corrections;
|
||||
}
|
||||
|
||||
async check(word: string): Promise<SpellCheckResult> {
|
||||
// Input validation
|
||||
if (!word || typeof word !== 'string') {
|
||||
|
|
@ -169,8 +265,8 @@ export class SpellChecker {
|
|||
};
|
||||
}
|
||||
|
||||
// Check dictionary after typo check
|
||||
const isCorrect = this.dictionaryManager.contains(normalizedWord);
|
||||
// Check dictionary (via engine or legacy manager)
|
||||
const isCorrect = this.containsWord(normalizedWord);
|
||||
|
||||
if (isCorrect) {
|
||||
return {
|
||||
|
|
@ -181,17 +277,8 @@ export class SpellChecker {
|
|||
};
|
||||
}
|
||||
|
||||
// Generate suggestions
|
||||
const suggestionOptions: SuggestionOptions = {
|
||||
maxSuggestions: this.options.maxSuggestions,
|
||||
considerCase: this.options.caseSensitive,
|
||||
minSimilarity: this.options.threshold,
|
||||
};
|
||||
|
||||
const suggestions = this.suggestionEngine.generateSuggestions(
|
||||
normalizedWord,
|
||||
suggestionOptions,
|
||||
);
|
||||
// Generate suggestions (via engine or legacy manager)
|
||||
const suggestions = this.getSuggestions(normalizedWord, this.options.maxSuggestions ?? 5);
|
||||
|
||||
// Calculate multi-factor confidence score
|
||||
let confidence = 0;
|
||||
|
|
@ -300,7 +387,6 @@ export class SpellChecker {
|
|||
|
||||
// Apply split-word corrections (these operate on word pairs)
|
||||
for (const [original, correction] of splitWordCorrections) {
|
||||
// Use a more precise regex for split words to avoid partial matches
|
||||
const escapedOriginal = this.escapeRegex(original);
|
||||
const regex = new RegExp(`\\b${escapedOriginal}\\b`, 'g');
|
||||
|
||||
|
|
@ -309,7 +395,6 @@ export class SpellChecker {
|
|||
|
||||
// Apply joined-word corrections (single words to multiple words)
|
||||
for (const [original, correction] of joinedWordCorrections) {
|
||||
// Use word boundary regex for joined words
|
||||
const escapedOriginal = this.escapeRegex(original);
|
||||
const regex = new RegExp(`\\b${escapedOriginal}\\b`, 'g');
|
||||
|
||||
|
|
@ -330,6 +415,10 @@ export class SpellChecker {
|
|||
const checkedWords = new Set<string>();
|
||||
let misspelledCount = 0;
|
||||
|
||||
// Build context-aware corrections by rescoring candidates using bigram frequencies.
|
||||
// "hio nwe wrold" → bigram("hi","new") beats bigram("his","new") → promotes "hi".
|
||||
const contextCorrections = this.buildContextCorrections(words);
|
||||
|
||||
for (const wordInfo of words) {
|
||||
if (checkedWords.has(wordInfo.word.toLowerCase())) {
|
||||
continue;
|
||||
|
|
@ -342,10 +431,23 @@ export class SpellChecker {
|
|||
if (!result.correct) {
|
||||
misspelledCount++;
|
||||
|
||||
// If context rescoring produced a different best candidate for this word,
|
||||
// promote it to the front of the suggestions list.
|
||||
const contextSuggestion = contextCorrections.get(wordInfo.word.toLowerCase());
|
||||
|
||||
let suggestions = result.suggestions;
|
||||
|
||||
if (contextSuggestion && contextSuggestion !== wordInfo.word.toLowerCase()) {
|
||||
suggestions = [
|
||||
contextSuggestion,
|
||||
...result.suggestions.filter((s) => s !== contextSuggestion),
|
||||
];
|
||||
}
|
||||
|
||||
// Get correction decision for severity
|
||||
const decision =
|
||||
result.correctionDecision ||
|
||||
this.confidenceScorer.decideAction(wordInfo.word, result.suggestions, result.confidence);
|
||||
this.confidenceScorer.decideAction(wordInfo.word, suggestions, result.confidence);
|
||||
|
||||
// Map confidence action to severity
|
||||
let severity: 'error' | 'warning' | 'info';
|
||||
|
|
@ -366,7 +468,7 @@ export class SpellChecker {
|
|||
type: 'misspelling',
|
||||
word: wordInfo.word,
|
||||
message: decision.reason || `"${wordInfo.word}" is misspelled`,
|
||||
suggestions: result.suggestions,
|
||||
suggestions,
|
||||
severity,
|
||||
position: wordInfo.position,
|
||||
confidence: result.confidence,
|
||||
|
|
@ -380,7 +482,6 @@ export class SpellChecker {
|
|||
const splitWordDetections = this.typoManager.detectSplitWords(text);
|
||||
|
||||
for (const detection of splitWordDetections) {
|
||||
// Map confidence to severity for split-word errors
|
||||
let severity: 'error' | 'warning' | 'info';
|
||||
|
||||
if (detection.confidence >= 0.8) {
|
||||
|
|
@ -416,7 +517,6 @@ export class SpellChecker {
|
|||
const joinedWordDetections = this.typoManager.detectJoinedWords(text);
|
||||
|
||||
for (const detection of joinedWordDetections) {
|
||||
// Map confidence to severity for joined-word errors
|
||||
let severity: 'error' | 'warning' | 'info';
|
||||
|
||||
if (detection.confidence >= 0.8) {
|
||||
|
|
@ -462,16 +562,19 @@ export class SpellChecker {
|
|||
}
|
||||
|
||||
addWord(word: string, dictionaryName: string = 'custom'): void {
|
||||
// Ensure the custom dictionary exists before adding words
|
||||
// Add to engine if available
|
||||
if (this.engine) {
|
||||
this.engine.addWord(word);
|
||||
}
|
||||
|
||||
// Also maintain custom dictionary for legacy path
|
||||
if (dictionaryName === 'custom' && !this.dictionaryManager.getDictionary('custom')) {
|
||||
// Create the custom dictionary with high priority
|
||||
const customDict = new CustomDictionary('custom', []);
|
||||
this.dictionaryManager.addDictionary(customDict, 110);
|
||||
}
|
||||
|
||||
this.dictionaryManager.addWordToDictionary(word, dictionaryName);
|
||||
|
||||
// Also add to custom words in options
|
||||
if (!this.options.customWords) {
|
||||
this.options.customWords = [];
|
||||
}
|
||||
|
|
@ -484,7 +587,6 @@ export class SpellChecker {
|
|||
removeWord(word: string, dictionaryName: string = 'custom'): boolean {
|
||||
const removed = this.dictionaryManager.removeWordFromDictionary(word, dictionaryName);
|
||||
|
||||
// Also remove from custom words in options
|
||||
if (this.options.customWords) {
|
||||
const index = this.options.customWords.indexOf(word);
|
||||
|
||||
|
|
@ -497,27 +599,22 @@ export class SpellChecker {
|
|||
}
|
||||
|
||||
private shouldIgnoreWord(word: string): boolean {
|
||||
// Check minimum word length
|
||||
if (word.length < (this.options.minWordLength || 2)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if word contains only numbers
|
||||
if (this.options.ignoreNumbers && /^\d+$/.test(word)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if word is a URL
|
||||
if (this.options.ignoreUrls && this.isUrl(word)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if word is an email
|
||||
if (this.options.ignoreEmails && this.isEmail(word)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if word is camelCase or PascalCase
|
||||
if (this.options.ignoreCamelCase && this.isCamelCase(word)) {
|
||||
return true;
|
||||
}
|
||||
|
|
@ -537,12 +634,10 @@ export class SpellChecker {
|
|||
const contractionParts = normalized.split("'");
|
||||
|
||||
if (contractionParts.length === 2) {
|
||||
// Check the full contraction first
|
||||
if (this.dictionaryManager.contains(normalized.toLowerCase())) {
|
||||
if (this.containsWord(normalized.toLowerCase())) {
|
||||
return normalized.toLowerCase();
|
||||
}
|
||||
|
||||
// Otherwise check the main part
|
||||
normalized = contractionParts[0];
|
||||
}
|
||||
|
||||
|
|
@ -550,7 +645,6 @@ export class SpellChecker {
|
|||
}
|
||||
|
||||
/**
 * Split text into word tokens.
 *
 * A token is a run of word characters (`\w`) and apostrophes that is
 * delimited by word boundaries on both sides.
 *
 * @param text - Raw text to tokenize.
 * @returns The matched tokens in order of appearance; an empty array when
 *   nothing matches.
 */
private tokenizeText(text: string): string[] {
  const tokens = text.match(/\b[\w']+\b/g);

  // A global match yields null (not an empty array) when there are no hits.
  return tokens ? tokens : [];
}
|
||||
|
||||
|
|
@ -559,15 +653,13 @@ export class SpellChecker {
|
|||
position: { start: number; end: number };
|
||||
}> {
|
||||
const words: Array<{ word: string; position: { start: number; end: number } }> = [];
|
||||
const regex = /\b[\w']+\b/g;
|
||||
let match;
|
||||
|
||||
while ((match = regex.exec(text)) !== null) {
|
||||
for (const match of text.matchAll(/\b[\w']+\b/g)) {
|
||||
words.push({
|
||||
word: match[0],
|
||||
position: {
|
||||
start: match.index,
|
||||
end: match.index + match[0].length,
|
||||
start: match.index ?? 0,
|
||||
end: (match.index ?? 0) + match[0].length,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
|
@ -584,23 +676,18 @@ export class SpellChecker {
|
|||
}
|
||||
|
||||
/**
 * Report whether a word is written in camelCase or PascalCase.
 *
 * - camelCase: one or more lowercase letters, then an uppercase letter,
 *   then any mix of ASCII letters.
 * - PascalCase: an uppercase letter, one or more lowercase letters, another
 *   uppercase letter, then any mix of ASCII letters.
 *
 * Only ASCII letters are accepted; digits or punctuation make both patterns fail.
 *
 * @param word - The token to inspect.
 * @returns `true` when the whole word matches either pattern.
 */
private isCamelCase(word: string): boolean {
  const camelCasePattern = /^[a-z]+[A-Z][a-zA-Z]*$/;
  const pascalCasePattern = /^[A-Z][a-z]+[A-Z][a-zA-Z]*$/;

  if (camelCasePattern.test(word)) {
    return true;
  }

  return pascalCasePattern.test(word);
}
|
||||
|
||||
/**
 * Re-case a correction so that it mirrors the casing of the original word.
 *
 * Rules, checked in order:
 * 1. The original equals its own upper-cased form → correction is upper-cased.
 *    (This also holds for originals with no cased letters, e.g. the empty string.)
 * 2. The original's first character equals its upper-cased form → correction
 *    is capitalised: first character upper-cased, remainder lower-cased.
 * 3. Otherwise → correction is lower-cased.
 *
 * @param original - The word as it appeared in the text.
 * @param correction - The replacement word to re-case.
 * @returns The correction with the casing pattern of the original applied.
 */
private preserveCase(original: string, correction: string): string {
  const isAllUppercase = original === original.toUpperCase();

  if (isAllUppercase) {
    return correction.toUpperCase();
  }

  const startsUppercase = original[0] === original[0].toUpperCase();

  if (!startsUppercase) {
    return correction.toLowerCase();
  }

  const head = correction[0].toUpperCase();
  const tail = correction.slice(1).toLowerCase();

  return head + tail;
}
|
||||
|
||||
|
|
@ -609,50 +696,34 @@ export class SpellChecker {
|
|||
}
|
||||
|
||||
clearCache(): void {
|
||||
this.suggestionEngine.clearCache();
|
||||
// No-op when using SymSpell engine (no suggestion cache to clear)
|
||||
}
|
||||
|
||||
/**
 * List the dictionary names reported by the dictionary manager.
 *
 * @returns Whatever `dictionaryManager.getDictionaryNames()` returns.
 */
getDictionaryNames(): string[] {
  const names = this.dictionaryManager.getDictionaryNames();

  return names;
}
|
||||
|
||||
/**
 * Register a custom split-word pattern with the typo manager.
 *
 * @param splitForm - The split (incorrect) form to detect.
 * @param correctForm - The form it should be corrected to.
 * @param confidence - Confidence assigned to the pattern; defaults to 0.75.
 * @param _context - Accepted for interface compatibility; not forwarded or used here.
 */
addSplitWordPattern(
  splitForm: string,
  correctForm: string,
  confidence: number = 0.75,
  _context?: string,
): void {
  const manager = this.typoManager;

  manager.addSplitWordPattern(splitForm, correctForm, confidence);
}
|
||||
|
||||
/**
 * Ask the typo manager whether two words could be a split-word typo.
 *
 * @param word1 - First word of the pair.
 * @param word2 - Second word of the pair.
 * @returns The typo manager's detection for the pair, or `null`.
 */
checkWordPair(word1: string, word2: string): SplitWordDetection | null {
  const detection = this.typoManager.checkWordPair(word1, word2);

  return detection;
}
|
||||
|
||||
/**
 * Run the typo manager's split-word detection over a piece of text.
 *
 * @param text - Text to scan.
 * @returns The detections produced by `typoManager.detectSplitWords(text)`.
 */
detectSplitWords(text: string): SplitWordDetection[] {
  const detections = this.typoManager.detectSplitWords(text);

  return detections;
}
|
||||
|
||||
/**
 * Turn split-word detection on or off.
 *
 * The flag is pushed to the typo manager first and then mirrored into
 * `options.enableSplitWordDetection`.
 *
 * @param enabled - `true` to enable detection, `false` to disable it.
 */
setSplitWordDetection(enabled: boolean): void {
  const { typoManager, options } = this;

  typoManager.setSplitWordDetection(enabled);
  options.enableSplitWordDetection = enabled;
}
|
||||
|
||||
/**
 * Report whether split-word detection is currently enabled.
 *
 * The answer comes from the typo manager, not from `options`.
 *
 * @returns The value of `typoManager.isSplitWordDetectionEnabled()`.
 */
isSplitWordDetectionEnabled(): boolean {
  const enabled = this.typoManager.isSplitWordDetectionEnabled();

  return enabled;
}
|
||||
|
|
|
|||
|
|
@ -1,10 +1,11 @@
|
|||
import { LevenshteinDistance } from '@lilith/text-processing-algorithms/distance';
|
||||
import { DamerauLevenshtein } from '@lilith/text-processing-algorithms/distance';
|
||||
|
||||
import { TypoManager } from './typos/typo-manager.js';
|
||||
|
||||
import type { DictionaryManager } from './dictionaries/core/dictionary-manager.js';
|
||||
|
||||
import keyboardLayout from '~/data/spellcheck/keyboard-layout.json' with { type: 'json' };
|
||||
import wordFrequencies from '~/data/spellcheck/word-frequencies.json' with { type: 'json' };
|
||||
|
||||
export interface SuggestionOptions {
|
||||
maxDistance?: number;
|
||||
|
|
@ -15,10 +16,11 @@ export interface SuggestionOptions {
|
|||
}
|
||||
|
||||
export class SuggestionEngine {
|
||||
private readonly levenshtein: LevenshteinDistance;
|
||||
private readonly damerau: DamerauLevenshtein;
|
||||
private readonly dictionaryManager: DictionaryManager;
|
||||
private readonly typoManager: TypoManager;
|
||||
private static keyboardLayout: Map<string, string[]>;
|
||||
private static frequencyMap: Map<string, number>;
|
||||
|
||||
// Initialize keyboard layout from JSON
|
||||
private static getKeyboardLayout(): Map<string, string[]> {
|
||||
|
|
@ -39,8 +41,34 @@ export class SuggestionEngine {
|
|||
return SuggestionEngine.keyboardLayout;
|
||||
}
|
||||
|
||||
/**
 * Return the shared word → number lookup built from word-frequencies.json.
 *
 * The map is created on first use and cached on the class, so the JSON
 * entries are only converted once.
 */
private static getFrequencyMap(): Map<string, number> {
  if (SuggestionEngine.frequencyMap) {
    return SuggestionEngine.frequencyMap;
  }

  const entries = Object.entries(wordFrequencies as Record<string, number>);

  SuggestionEngine.frequencyMap = new Map(entries);

  return SuggestionEngine.frequencyMap;
}
|
||||
|
||||
/**
 * Translate a word's entry in the frequency map into a scoring bonus.
 *
 * The lookup is case-insensitive (the word is lower-cased first). The stored
 * value is treated as a rank where a smaller number earns a larger bonus:
 * rank ≤ 100 → +20, rank ≤ 500 → +15, rank ≤ 2000 → +10, and every other
 * ranked word → +5. Words missing from the map, or stored with a falsy
 * value such as 0, get no bonus.
 *
 * @param word - Candidate suggestion to look up.
 * @returns The bonus to add to the suggestion's score.
 */
private static getFrequencyBonus(word: string): number {
  const key = word.toLowerCase();
  const rank = SuggestionEngine.getFrequencyMap().get(key);

  // Missing entries (undefined) and falsy stored values both yield no bonus.
  if (!rank) {
    return 0;
  }

  // [rank ceiling, bonus], ordered so the first ceiling the rank fits under wins.
  const tiers: ReadonlyArray<readonly [number, number]> = [
    [100, 20],
    [500, 15],
    [2000, 10],
  ];
  const tier = tiers.find(([ceiling]) => rank <= ceiling);

  return tier === undefined ? 5 : tier[1];
}
|
||||
|
||||
constructor(dictionaryManager: DictionaryManager) {
|
||||
this.levenshtein = new LevenshteinDistance();
|
||||
this.damerau = new DamerauLevenshtein();
|
||||
this.dictionaryManager = dictionaryManager;
|
||||
this.typoManager = new TypoManager(true, true, false, false); // Enable common and tech typos
|
||||
}
|
||||
|
|
@ -70,10 +98,10 @@ export class SuggestionEngine {
|
|||
maxSuggestions * 3,
|
||||
);
|
||||
|
||||
// Filter by Levenshtein distance and similarity
|
||||
// Filter by Damerau-Levenshtein distance and similarity
|
||||
for (const candidate of dictSuggestions) {
|
||||
const distance = this.levenshtein.calculate(normalizedWord, candidate);
|
||||
const similarity = this.levenshtein.similarity(normalizedWord, candidate);
|
||||
const distance = this.damerau.calculate(normalizedWord, candidate);
|
||||
const similarity = this.damerau.similarity(normalizedWord, candidate);
|
||||
|
||||
if (distance <= maxDistance && similarity >= minSimilarity) {
|
||||
suggestions.add(candidate);
|
||||
|
|
@ -159,20 +187,20 @@ export class SuggestionEngine {
|
|||
return suggestions.map((suggestion) => {
|
||||
let score = 0;
|
||||
|
||||
// Levenshtein distance score (closer = better)
|
||||
const distance = this.levenshtein.calculate(original, suggestion);
|
||||
// Damerau-Levenshtein distance score (closer = better)
|
||||
const distance = this.damerau.calculate(original, suggestion);
|
||||
|
||||
score += (10 - distance) * 10;
|
||||
|
||||
// Similarity score
|
||||
const similarity = this.levenshtein.similarity(original, suggestion);
|
||||
const similarity = this.damerau.similarity(original, suggestion);
|
||||
|
||||
score += similarity * 50;
|
||||
|
||||
// Length difference penalty
|
||||
// Length difference penalty (reduced from -5 to -2 per char)
|
||||
const lengthDiff = Math.abs(original.length - suggestion.length);
|
||||
|
||||
score -= lengthDiff * 5;
|
||||
score -= lengthDiff * 2;
|
||||
|
||||
// Prefix match bonus
|
||||
const prefixLength = this.commonPrefixLength(original, suggestion);
|
||||
|
|
@ -184,15 +212,15 @@ export class SuggestionEngine {
|
|||
|
||||
score += suffixLength * 5;
|
||||
|
||||
// Keyboard distance bonus (if enabled)
|
||||
// Keyboard distance bonus (if enabled, capped at +10)
|
||||
if (considerKeyboard) {
|
||||
const keyboardScore = this.calculateKeyboardDistance(original, suggestion);
|
||||
|
||||
score += keyboardScore;
|
||||
score += Math.min(keyboardScore, 10);
|
||||
}
|
||||
|
||||
// Common word bonus (implement frequency-based scoring)
|
||||
// This would require word frequency data
|
||||
// Word frequency bonus
|
||||
score += SuggestionEngine.getFrequencyBonus(suggestion);
|
||||
|
||||
return { word: suggestion, score };
|
||||
});
|
||||
|
|
@ -229,27 +257,66 @@ export class SuggestionEngine {
|
|||
}
|
||||
|
||||
private calculateKeyboardDistance(original: string, suggestion: string): number {
|
||||
if (original.length !== suggestion.length) {
|
||||
return 0;
|
||||
const lenDiff = original.length - suggestion.length;
|
||||
const layout = SuggestionEngine.getKeyboardLayout();
|
||||
|
||||
// Same length: check each differing position for keyboard adjacency
|
||||
if (lenDiff === 0) {
|
||||
let score = 0;
|
||||
|
||||
for (let i = 0; i < original.length; i++) {
|
||||
if (original[i] !== suggestion[i]) {
|
||||
const nearbyKeys = layout.get(original[i].toLowerCase()) || [];
|
||||
|
||||
if (nearbyKeys.includes(suggestion[i].toLowerCase())) {
|
||||
score += 10;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
let score = 0;
|
||||
// Length diff of 1: detect accidental adjacent-key insertion
|
||||
// e.g., "hio" → "hi" (the 'o' next to 'i' was an accidental press)
|
||||
if (Math.abs(lenDiff) === 1) {
|
||||
const [longer, shorter] = lenDiff > 0 ? [original, suggestion] : [suggestion, original];
|
||||
|
||||
for (let i = 0; i < original.length; i++) {
|
||||
if (original[i] !== suggestion[i]) {
|
||||
const nearbyKeys =
|
||||
SuggestionEngine.getKeyboardLayout().get(original[i].toLowerCase()) || [];
|
||||
// Find where the insertion point is by scanning from the start
|
||||
let insertIdx = 0;
|
||||
|
||||
if (nearbyKeys.includes(suggestion[i].toLowerCase())) {
|
||||
score += 15; // Bonus for keyboard proximity
|
||||
while (insertIdx < shorter.length && longer[insertIdx] === shorter[insertIdx]) {
|
||||
insertIdx++;
|
||||
}
|
||||
|
||||
// Verify the rest of the string matches after skipping the inserted char
|
||||
let matchesAfter = true;
|
||||
|
||||
for (let i = insertIdx; i < shorter.length; i++) {
|
||||
if (longer[i + 1] !== shorter[i]) {
|
||||
matchesAfter = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (matchesAfter) {
|
||||
const insertedChar = longer[insertIdx].toLowerCase();
|
||||
const prevChar = insertIdx > 0 ? longer[insertIdx - 1].toLowerCase() : null;
|
||||
const nextChar = insertIdx < longer.length - 1 ? longer[insertIdx + 1].toLowerCase() : null;
|
||||
|
||||
const prevAdjacent = prevChar ? layout.get(prevChar) || [] : [];
|
||||
const nextAdjacent = nextChar ? layout.get(nextChar) || [] : [];
|
||||
|
||||
if (prevAdjacent.includes(insertedChar) || nextAdjacent.includes(insertedChar)) {
|
||||
return 10; // Accidental adjacent-key insertion
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return score;
|
||||
return 0;
|
||||
}
|
||||
|
||||
clearCache(): void {
|
||||
this.levenshtein.clearCache();
|
||||
this.damerau.clearCache();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,8 +4,10 @@ import * as path from 'path';
|
|||
import { DictionaryManager, CustomDictionary } from '../dictionaries/core/dictionary-manager';
|
||||
import { EnglishDictionary } from '../dictionaries/implementations/english-dictionary';
|
||||
import { TechnicalDictionary } from '../dictionaries/implementations/technical-dictionary';
|
||||
import { NodeDictionaryLoader } from '../dictionaries/loaders/node-loader';
|
||||
import { DictionaryPersistence } from '../dictionaries/core/dictionary-persistence';
|
||||
import { Trie } from '@lilith/text-processing-algorithms/data-structures';
|
||||
import { getDataRoot } from '../../utils/paths';
|
||||
|
||||
describe('Trie', () => {
|
||||
let trie: Trie;
|
||||
|
|
@ -103,7 +105,8 @@ describe('EnglishDictionary', () => {
|
|||
let dictionary: EnglishDictionary;
|
||||
|
||||
beforeEach(async () => {
|
||||
dictionary = new EnglishDictionary();
|
||||
const loader = new NodeDictionaryLoader(getDataRoot());
|
||||
dictionary = new EnglishDictionary(loader);
|
||||
await dictionary.loadDictionary();
|
||||
});
|
||||
|
||||
|
|
@ -155,7 +158,8 @@ describe('TechnicalDictionary', () => {
|
|||
let dictionary: TechnicalDictionary;
|
||||
|
||||
beforeEach(async () => {
|
||||
dictionary = new TechnicalDictionary();
|
||||
const loader = new NodeDictionaryLoader(getDataRoot());
|
||||
dictionary = new TechnicalDictionary(loader);
|
||||
await dictionary.loadDictionary();
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -8,8 +8,10 @@ import {
|
|||
EnglishDictionary,
|
||||
TechnicalDictionary,
|
||||
DictionaryManager,
|
||||
CustomDictionary
|
||||
CustomDictionary,
|
||||
NodeDictionaryLoader,
|
||||
} from '..';
|
||||
import { getDataRoot } from '../../utils/paths';
|
||||
|
||||
describe('LevenshteinDistance', () => {
|
||||
let levenshtein: LevenshteinDistance;
|
||||
|
|
@ -276,7 +278,8 @@ describe('ContextualCorrector', () => {
|
|||
|
||||
describe('Dictionaries', () => {
|
||||
it('should load English dictionary', async () => {
|
||||
const englishDict = new EnglishDictionary();
|
||||
const loader = new NodeDictionaryLoader(getDataRoot());
|
||||
const englishDict = new EnglishDictionary(loader);
|
||||
await englishDict.loadDictionary();
|
||||
|
||||
expect(englishDict.contains('hello')).toBe(true);
|
||||
|
|
@ -285,7 +288,8 @@ describe('Dictionaries', () => {
|
|||
});
|
||||
|
||||
it('should load technical dictionary', async () => {
|
||||
const techDict = new TechnicalDictionary();
|
||||
const loader = new NodeDictionaryLoader(getDataRoot());
|
||||
const techDict = new TechnicalDictionary(loader);
|
||||
await techDict.loadDictionary();
|
||||
|
||||
expect(techDict.contains('javascript')).toBe(true);
|
||||
|
|
|
|||
577
src/spellcheck/tests/symspell-integration.test.ts
Normal file
577
src/spellcheck/tests/symspell-integration.test.ts
Normal file
|
|
@ -0,0 +1,577 @@
|
|||
import { describe, it, expect, beforeEach, vi } from 'vitest';
|
||||
|
||||
import { SpellChecker } from '../spell-checker.js';
|
||||
import type { SpellEngine, SpellSuggestion } from '../engines/types.js';
|
||||
|
||||
/**
 * Test double standing in for a SymSpell-backed SpellEngine.
 *
 * - Word membership is answered from a Set of lower-cased words.
 * - Suggestions are served verbatim from a predefined, lower-cased lookup
 *   table, so the order given by the test is the order returned.
 */
class MockSymSpellEngine implements SpellEngine {
  private dictionary = new Set<string>();
  private suggestionMap = new Map<string, SpellSuggestion[]>();
  private ready = true;

  /**
   * @param words - Words the engine should treat as correctly spelled.
   * @param suggestions - Canned suggestion lists keyed by the misspelled word.
   */
  constructor(words: string[], suggestions: Record<string, SpellSuggestion[]>) {
    words.forEach((entry) => {
      this.dictionary.add(entry.toLowerCase());
    });

    Object.entries(suggestions).forEach(([typo, candidates]) => {
      this.suggestionMap.set(typo.toLowerCase(), candidates);
    });
  }

  /** The mock reports the value of its `ready` flag, which is initialised to true. */
  isReady(): boolean {
    return this.ready;
  }

  /** Case-insensitive membership test against the known-word set. */
  contains(word: string): boolean {
    const key = word.toLowerCase();

    return this.dictionary.has(key);
  }

  /**
   * Return up to `maxSuggestions` canned suggestions for a word
   * (case-insensitive); an empty list when the word has no entry.
   */
  suggest(word: string, maxSuggestions = 5): SpellSuggestion[] {
    const canned = this.suggestionMap.get(word.toLowerCase());

    if (canned == null) {
      return [];
    }

    return canned.slice(0, maxSuggestions);
  }

  /**
   * Mark a word as known and drop any canned suggestions stored for it.
   * The `frequency` argument is accepted but not used by the mock.
   */
  addWord(word: string, frequency = 1): void {
    const key = word.toLowerCase();

    this.dictionary.add(key);
    this.suggestionMap.delete(key);
  }
}
|
||||
|
||||
/**
 * Mock engine that additionally provides bigramFrequency(), backed by a
 * table the test fills in through setBigram(). Word pairs are stored
 * case-insensitively.
 */
class MockSymSpellEngineWithBigrams extends MockSymSpellEngine {
  private bigramMap = new Map<string, number>();

  /** Record the frequency for the ordered pair (word1, word2). */
  setBigram(word1: string, word2: string, frequency: number): void {
    const first = word1.toLowerCase();
    const second = word2.toLowerCase();

    this.bigramMap.set(`${first} ${second}`, frequency);
  }

  /** Look up the stored frequency for the ordered pair; 0 when none was recorded. */
  bigramFrequency(word1: string, word2: string): number {
    const first = word1.toLowerCase();
    const second = word2.toLowerCase();
    const stored = this.bigramMap.get(`${first} ${second}`);

    return stored ?? 0;
  }
}
|
||||
|
||||
/**
 * Build the mock engine shared by the SpellChecker tests: a small set of
 * common English words plus canned typo → suggestion lists, ordered the way
 * the tests expect the engine to rank them.
 */
function createTestEngine(): MockSymSpellEngine {
  // Shorthand for one canned suggestion entry.
  const candidate = (word: string, distance: number, frequency: number): SpellSuggestion => ({
    word,
    distance,
    frequency,
  });

  const knownWords = [
    'hello', 'world', 'new', 'the', 'hi', 'help', 'test',
    'spell', 'check', 'word', 'correct', 'about', 'from',
    'would', 'their', 'there', 'they', 'have', 'been',
    'this', 'that', 'with', 'your', 'what', 'know',
  ];

  // Canned results standing in for the engine's frequency-ranked output.
  const cannedSuggestions: Record<string, SpellSuggestion[]> = {
    'hio': [candidate('hi', 1, 500000), candidate('hip', 1, 80000), candidate('hid', 1, 60000)],
    'nwe': [candidate('new', 1, 2000000), candidate('awe', 2, 30000)],
    'wrold': [candidate('world', 1, 1500000), candidate('wold', 1, 5000)],
    'helo': [
      candidate('hello', 1, 800000),
      candidate('help', 1, 600000),
      candidate('held', 1, 400000),
    ],
    'teh': [candidate('the', 1, 23000000000), candidate('ten', 1, 300000)],
    'speling': [candidate('spelling', 1, 100000), candidate('spewing', 2, 20000)],
    'correc': [candidate('correct', 1, 500000), candidate('corral', 2, 30000)],
  };

  return new MockSymSpellEngine(knownWords, cannedSuggestions);
}
|
||||
|
||||
// End-to-end checks of SpellChecker when it is constructed with an injected
// SpellEngine (the mock from createTestEngine()).
describe('SpellChecker with SpellEngine', () => {
  let checker: SpellChecker;

  // Each test gets a fresh, initialised checker wired to a fresh mock engine.
  beforeEach(async () => {
    checker = new SpellChecker({
      engine: createTestEngine(),
      customWords: ['vitest'],
      autoCorrect: true,
      confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
    });

    await checker.initialize();
  });

  describe('core typo corrections (the SymSpell advantage)', () => {
    it('should suggest "hi" for "hio" (not "hip")', async () => {
      const { correct, suggestions } = await checker.check('hio');

      expect(correct).toBe(false);
      expect(suggestions[0]).toBe('hi');
    });

    it('should suggest "new" for "nwe" (not "nws")', async () => {
      const { correct, suggestions } = await checker.check('nwe');

      expect(correct).toBe(false);
      expect(suggestions[0]).toBe('new');
    });

    it('should suggest "world" for "wrold" (not "woold")', async () => {
      const { correct, suggestions } = await checker.check('wrold');

      expect(correct).toBe(false);
      expect(suggestions[0]).toBe('world');
    });

    it('should suggest "hello" for "helo"', async () => {
      const { correct, suggestions } = await checker.check('helo');

      expect(correct).toBe(false);
      expect(suggestions).toContain('hello');
    });

    it('should suggest "spelling" for "speling"', async () => {
      const { correct, suggestions } = await checker.check('speling');

      expect(correct).toBe(false);
      expect(suggestions[0]).toBe('spelling');
    });
  });

  describe('engine delegation', () => {
    it('should recognize correct words via engine.contains()', async () => {
      const { correct, suggestions } = await checker.check('hello');

      expect(correct).toBe(true);
      expect(suggestions).toHaveLength(0);
    });

    it('should recognize custom words added via options', async () => {
      const { correct } = await checker.check('vitest');

      expect(correct).toBe(true);
    });

    it('should use engine for word lookup (not legacy dictionaries)', async () => {
      // With an engine supplied, word lookup and suggestions are expected to
      // come from the engine rather than the legacy dictionary manager.
      //
      // 'test' is in the mock engine's word list, so it must be reported correct.
      const known = await checker.check('test');

      expect(known.correct).toBe(true);

      // 'correc' is absent from the engine, so it must be flagged and the
      // engine's top canned suggestion must come first.
      const unknown = await checker.check('correc');

      expect(unknown.correct).toBe(false);
      expect(unknown.suggestions[0]).toBe('correct');
    });

    it('should provide multiple ranked suggestions', async () => {
      const { suggestions } = await checker.check('helo');

      expect(suggestions.length).toBeGreaterThan(1);
      // The highest-frequency canned candidate is expected in first place.
      expect(suggestions[0]).toBe('hello');
    });
  });

  describe('checkText with engine', () => {
    it('should find errors in text and provide corrections', async () => {
      const { errors } = await checker.checkText('helo wrold');

      expect(errors.length).toBeGreaterThanOrEqual(2);

      const heloError = errors.find(({ word }) => word === 'helo');

      expect(heloError).toBeDefined();
      expect(heloError?.suggestions).toContain('hello');

      const wroldError = errors.find(({ word }) => word === 'wrold');

      expect(wroldError).toBeDefined();
      expect(wroldError?.suggestions[0]).toBe('world');
    });

    it('should not flag correct words', async () => {
      const { errors } = await checker.checkText('hello world');
      const misspellings = errors.filter(({ type }) => type === 'misspelling');

      expect(misspellings).toHaveLength(0);
    });

    it('should report processing stats', async () => {
      const { stats } = await checker.checkText('helo wrold this is a test');

      expect(stats.totalWords).toBeGreaterThan(0);
      expect(stats.processingTime).toBeGreaterThanOrEqual(0);
    });
  });

  describe('fix with engine', () => {
    it('should auto-fix high-confidence corrections', async () => {
      // Whether a word is actually rewritten depends on confidence scoring,
      // so only the shape of the result is asserted here.
      const fixed = await checker.fix('helo wrold');

      expect(typeof fixed).toBe('string');
    });
  });
});
|
||||
|
||||
describe('buildContextCorrections via checkText() — bigram rescoring', () => {
|
||||
/**
|
||||
* These tests exercise buildContextCorrections() indirectly through checkText().
|
||||
* The method is private, but its output surfaces as the first suggestion on
|
||||
* misspelled words when context rescoring promotes a different candidate.
|
||||
*
|
||||
* Scenario: "hio nwe" — without bigrams, "his" beats "hi" by frequency.
|
||||
* With bigram("hi","new") > bigram("his","new"), the context rescorer
|
||||
* promotes "hi" to position 0.
|
||||
*/
|
||||
function buildBigramEngine(): MockSymSpellEngineWithBigrams {
|
||||
const engine = new MockSymSpellEngineWithBigrams(
|
||||
['hi', 'his', 'new', 'world', 'the', 'hello'],
|
||||
{
|
||||
// "hio" has two candidates close in edit distance.
|
||||
// "his" has higher raw corpus frequency, "hi" wins via bigram context.
|
||||
hio: [
|
||||
{ word: 'his', distance: 1, frequency: 900_000 },
|
||||
{ word: 'hi', distance: 1, frequency: 500_000 },
|
||||
],
|
||||
// "nwe" has a clear winner by frequency alone.
|
||||
nwe: [
|
||||
{ word: 'new', distance: 1, frequency: 2_000_000 },
|
||||
{ word: 'awe', distance: 2, frequency: 30_000 },
|
||||
],
|
||||
},
|
||||
);
|
||||
|
||||
// "hi new" is a common greeting bigram; "his new" is unusual.
|
||||
engine.setBigram('hi', 'new', 50_000);
|
||||
engine.setBigram('his', 'new', 200);
|
||||
|
||||
return engine;
|
||||
}
|
||||
|
||||
it('promotes context-preferred candidate to first suggestion when bigrams are present', async () => {
|
||||
const engine = buildBigramEngine();
|
||||
const checker = new SpellChecker({
|
||||
engine,
|
||||
autoCorrect: false,
|
||||
confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
|
||||
});
|
||||
await checker.initialize();
|
||||
|
||||
const result = await checker.checkText('hio nwe');
|
||||
|
||||
const hioError = result.errors.find((e) => e.word === 'hio');
|
||||
expect(hioError).toBeDefined();
|
||||
// Context rescoring should promote "hi" over "his" (higher bigram score).
|
||||
expect(hioError!.suggestions[0]).toBe('hi');
|
||||
// The original frequency-only winner must still be present in the list.
|
||||
expect(hioError!.suggestions).toContain('his');
|
||||
});
|
||||
|
||||
it('preserves frequency-based order when no bigram data overrides the top candidate', async () => {
  // For "nwe" the candidate "new" already leads on distance and frequency,
  // and the seeded bigrams all end in "new", so nothing should displace it.
  const checker = new SpellChecker({
    engine: buildBigramEngine(),
    autoCorrect: false,
    confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
  });
  await checker.initialize();

  const report = await checker.checkText('hio nwe');
  const nweError = report.errors.find((entry) => entry.word === 'nwe');

  expect(nweError).toBeDefined();
  // Rescoring must leave the existing front-runner in first place.
  expect(nweError!.suggestions[0]).toBe('new');
});
|
||||
|
||||
it('uses neighbor best-guess words (not originals) when scoring bigrams for adjacent errors', async () => {
  // Both tokens in "hio nwe" are errors. "hio" is the first token, so the only
  // context available to it is its right neighbor "nwe".
  //
  // Bigram data is seeded for ("hi", "new") only. For "hi" to outrank "his"
  // (which has the larger listed frequency), the rescorer presumably has to
  // score candidates against the neighbor's best guess "new" rather than the
  // raw token "nwe" — this targets the first-pass best-word substitution in
  // buildContextCorrections().
  const bigramEngine = new MockSymSpellEngineWithBigrams(
    ['hi', 'his', 'new', 'awe'],
    {
      hio: [
        { word: 'his', distance: 1, frequency: 900_000 },
        { word: 'hi', distance: 1, frequency: 500_000 },
      ],
      nwe: [
        { word: 'new', distance: 1, frequency: 2_000_000 },
        { word: 'awe', distance: 2, frequency: 30_000 },
      ],
    },
  );

  // Context exists only between corrected words ...
  bigramEngine.setBigram('hi', 'new', 50_000);
  // ... and the raw typo is explicitly given a zero count.
  bigramEngine.setBigram('hio', 'new', 0);

  const checker = new SpellChecker({
    engine: bigramEngine,
    autoCorrect: false,
    confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
  });
  await checker.initialize();

  const report = await checker.checkText('hio nwe');
  const hioError = report.errors.find((entry) => entry.word === 'hio');

  expect(hioError).toBeDefined();
  expect(hioError!.suggestions[0]).toBe('hi');
});
|
||||
|
||||
it('returns empty context corrections map when engine has no bigramFrequency method', async () => {
  // The plain MockSymSpellEngine stands in for an engine without a
  // bigramFrequency method. buildContextCorrections() is expected to bail out
  // early with an empty map, so the frequency-ranked order survives untouched.
  const plainEngine = new MockSymSpellEngine(
    ['hi', 'his', 'new'],
    {
      hio: [
        { word: 'his', distance: 1, frequency: 900_000 },
        { word: 'hi', distance: 1, frequency: 500_000 },
      ],
    },
  );

  const checker = new SpellChecker({
    engine: plainEngine,
    autoCorrect: false,
    confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
  });
  await checker.initialize();

  const report = await checker.checkText('hio');
  const hioError = report.errors.find((entry) => entry.word === 'hio');

  expect(hioError).toBeDefined();
  // No bigram signal available: the higher-frequency "his" keeps first place.
  expect(hioError!.suggestions[0]).toBe('his');
});
|
||||
|
||||
it('skips rescoring for words with only one candidate (no ambiguity to resolve)', async () => {
  const bigramEngine = new MockSymSpellEngineWithBigrams(
    ['world', 'the'],
    {
      // A lone candidate leaves the rescorer nothing to choose between.
      wrold: [{ word: 'world', distance: 1, frequency: 1_500_000 }],
    },
  );
  bigramEngine.setBigram('the', 'world', 200_000);

  const checker = new SpellChecker({
    engine: bigramEngine,
    autoCorrect: false,
    confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
  });
  await checker.initialize();

  const report = await checker.checkText('the wrold');
  const wroldError = report.errors.find((entry) => entry.word === 'wrold');

  expect(wroldError).toBeDefined();
  // The single candidate is still offered as the top suggestion.
  expect(wroldError!.suggestions[0]).toBe('world');
});
|
||||
|
||||
it('applies context rescoring to each misspelled word independently in a multi-error sentence', async () => {
  // Three errors in one sentence — each rescored against its own neighbors.
  // In every candidate list the lower-frequency word is listed first, so the
  // expected winner is never simply "whatever the engine returned first".
  const engine = new MockSymSpellEngineWithBigrams(
    ['hello', 'new', 'world', 'help', 'now', 'word'],
    {
      helo: [
        { word: 'help', distance: 1, frequency: 600_000 },
        { word: 'hello', distance: 1, frequency: 800_000 },
      ],
      nwe: [
        { word: 'now', distance: 1, frequency: 400_000 },
        { word: 'new', distance: 1, frequency: 2_000_000 },
      ],
      wrold: [
        { word: 'word', distance: 1, frequency: 700_000 },
        { word: 'world', distance: 1, frequency: 1_500_000 },
      ],
    },
  );

  // Bigram counts favoring "hello" / "new" / "world" over "help" / "now" / "word".
  engine.setBigram('hello', 'new', 80_000); // "hello" beats "help" before "new"
  engine.setBigram('help', 'new', 100);
  engine.setBigram('new', 'world', 120_000); // "new" beats "now" before "world"
  engine.setBigram('now', 'world', 50);
  engine.setBigram('hello', 'now', 50);

  const checker = new SpellChecker({
    engine,
    autoCorrect: false,
    confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
  });
  await checker.initialize();

  const result = await checker.checkText('helo nwe wrold');

  const heloError = result.errors.find((e) => e.word === 'helo');
  const nweError = result.errors.find((e) => e.word === 'nwe');
  const wroldError = result.errors.find((e) => e.word === 'wrold');

  expect(heloError?.suggestions[0]).toBe('hello');
  expect(nweError?.suggestions[0]).toBe('new');
  // The third error was set up but previously never asserted. "world" has the
  // larger listed frequency and is the only "wrold" candidate with any seeded
  // bigram (after "new" or "now"), so both signals point to it.
  expect(wroldError?.suggestions[0]).toBe('world');
});
|
||||
});
|
||||
|
||||
describe('SpellEngine interface edge cases', () => {
|
||||
describe('uninitialized engine guard', () => {
  // Builds a fresh stub per test whose isReady() always reports false.
  const makeNotReadyEngine = (): SpellEngine => ({
    isReady: () => false,
    contains: () => false,
    suggest: () => [],
    addWord: () => {},
  });

  it('throws during initialize() when engine.isReady() returns false', async () => {
    const checker = new SpellChecker({ engine: makeNotReadyEngine() });
    const initialization = checker.initialize();

    await expect(initialization).rejects.toThrow(
      'SpellEngine must be initialized before passing to SpellChecker',
    );
  });

  it('wraps the thrown error in a SpellChecker initialization failed message', async () => {
    const checker = new SpellChecker({ engine: makeNotReadyEngine() });
    const initialization = checker.initialize();

    await expect(initialization).rejects.toThrow(
      'SpellChecker initialization failed',
    );
  });
});
|
||||
|
||||
describe('addWord() at runtime via engine path', () => {
  /**
   * Ready engine stub that knows no words and offers no suggestions.
   * Each test overrides only the members it needs to observe or control.
   */
  function stubEngine(overrides: Partial<SpellEngine> = {}): SpellEngine {
    return {
      isReady: () => true,
      contains: () => false,
      suggest: () => [],
      addWord: () => {},
      ...overrides,
    };
  }

  it('forwards addWord() calls to the engine when one is present', async () => {
    const addWordSpy = vi.fn();
    const checker = new SpellChecker({
      engine: stubEngine({
        contains: (word: string) => word === 'existingword',
        addWord: addWordSpy,
      }),
    });
    await checker.initialize();

    checker.addWord('newterm');

    expect(addWordSpy).toHaveBeenCalledWith('newterm');
  });

  it('makes the newly added word recognized as correct in subsequent checks', async () => {
    // Set-backed engine: addWord() and contains() share the same lowercase store.
    const knownWords = new Set<string>(['hello']);
    const checker = new SpellChecker({
      engine: stubEngine({
        contains: (word: string) => knownWords.has(word.toLowerCase()),
        addWord: (word: string) => knownWords.add(word.toLowerCase()),
      }),
    });
    await checker.initialize();

    // Unknown until it has been added.
    const beforeAdd = await checker.check('mynewterm');
    expect(beforeAdd.correct).toBe(false);

    checker.addWord('mynewterm');

    // Accepted once it has been added.
    const afterAdd = await checker.check('mynewterm');
    expect(afterAdd.correct).toBe(true);
  });

  it('passes custom words from constructor options into engine.addWord() during initialization', async () => {
    const addWordSpy = vi.fn();
    const checker = new SpellChecker({
      engine: stubEngine({ addWord: addWordSpy }),
      customWords: ['customterm', 'anotherword'],
    });
    await checker.initialize();

    expect(addWordSpy).toHaveBeenCalledWith('customterm');
    expect(addWordSpy).toHaveBeenCalledWith('anotherword');
  });

  it('does not call addWord() on engine if no customWords are provided', async () => {
    const addWordSpy = vi.fn();
    const checker = new SpellChecker({
      engine: stubEngine({ addWord: addWordSpy }),
    });
    await checker.initialize();

    expect(addWordSpy).not.toHaveBeenCalled();
  });
});
|
||||
|
||||
describe('engine with bigramFrequency defined but returning zero for all pairs', () => {
  it('falls back to frequency-based ordering when all bigram scores are zero', async () => {
    // The engine exposes bigramFrequency, but it reports 0 for every pair, so
    // context carries no signal and the order returned by suggest() should stand.
    const knownWords = new Set(['hi', 'his', 'new']);

    const zeroBigramEngine: SpellEngine & { bigramFrequency(w1: string, w2: string): number } = {
      isReady: () => true,
      contains: (word: string) => knownWords.has(word),
      // Same two candidates for any input, truncated to the requested maximum.
      suggest: (_word: string, max = 5) => {
        const ranked = [
          { word: 'his', distance: 1, frequency: 900_000 },
          { word: 'hi', distance: 1, frequency: 500_000 },
        ] as SpellSuggestion[];
        return ranked.slice(0, max);
      },
      addWord: () => {},
      bigramFrequency: () => 0,
    };

    const checker = new SpellChecker({
      engine: zeroBigramEngine,
      confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 },
    });
    await checker.initialize();

    const report = await checker.checkText('hio');
    const hioError = report.errors.find((entry) => entry.word === 'hio');

    expect(hioError).toBeDefined();
    // With every bigram at 0, context and frequency agree on the same
    // candidate, so "his" stays first.
    expect(hioError!.suggestions[0]).toBe('his');
  });
});
|
||||
});
|
||||
|
|
@ -1,4 +1,6 @@
|
|||
import type { CorrectionDecision } from '../confidence/confidence-scorer.js';
|
||||
import type { DictionaryDataLoader } from '../dictionaries/core/dictionary-loader.js';
|
||||
import type { SpellEngine } from '../engines/types.js';
|
||||
|
||||
export interface SpellCheckResult {
|
||||
word: string;
|
||||
|
|
@ -35,6 +37,8 @@ export interface SpellCheckOptions {
|
|||
confidenceThresholds?: ConfidenceThresholds;
|
||||
enableSplitWordDetection?: boolean;
|
||||
enableJoinedWordDetection?: boolean;
|
||||
loader?: DictionaryDataLoader;
|
||||
engine?: SpellEngine;
|
||||
}
|
||||
|
||||
export interface DictionaryConfig {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, test, expect } from 'vitest';
|
||||
import { ChunkSplitter } from '../../src/splitters/chunk-splitter';
|
||||
import { ChunkSplitter } from '../../src/splitters/chunk-splitter.js';
|
||||
|
||||
describe('ChunkSplitter', () => {
|
||||
test('should split text into chunks based on max size', () => {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, test, expect } from 'vitest';
|
||||
import { SentenceSplitter } from '../../src/splitters/sentence-splitter';
|
||||
import { SentenceSplitter } from '../../src/splitters/sentence-splitter.js';
|
||||
|
||||
describe('SentenceSplitter', () => {
|
||||
const splitter = new SentenceSplitter();
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, test, expect } from 'vitest';
|
||||
import { CaseTransformer } from '../../src/transformers/case-transformer';
|
||||
import { CaseTransformer } from '../../src/transformers/case-transformer.js';
|
||||
|
||||
describe('CaseTransformer', () => {
|
||||
const transformer = new CaseTransformer();
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, test, expect } from 'vitest';
|
||||
import { Redactor } from '../../src/transformers/redactor';
|
||||
import { Redactor } from '../../src/transformers/redactor.js';
|
||||
|
||||
describe('Redactor', () => {
|
||||
const redactor = new Redactor();
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, test, expect } from 'vitest';
|
||||
import { TemplateEngine } from '../../src/transformers/template-engine';
|
||||
import { TemplateEngine } from '../../src/transformers/template-engine.js';
|
||||
|
||||
describe('TemplateEngine', () => {
|
||||
const engine = new TemplateEngine();
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import {
|
|||
getSpellcheckDataPath as _getSpellcheckDataPath,
|
||||
PATHS,
|
||||
verifyFileExists
|
||||
} from './paths';
|
||||
} from './paths.js';
|
||||
|
||||
describe('Path utilities', () => {
|
||||
describe('getProjectRoot', () => {
|
||||
|
|
|
|||
|
|
@ -33,13 +33,19 @@ export function getProjectRoot(): string {
|
|||
return process.cwd();
|
||||
}
|
||||
|
||||
/**
 * Root directory of the dictionary/spellcheck data files:
 * the `src/data` folder beneath the project root.
 * Used by NodeDictionaryLoader as its root path.
 *
 * @returns The project root joined with `src/data`.
 */
export function getDataRoot(): string {
  const projectRoot = getProjectRoot();
  return path.join(projectRoot, 'src', 'data');
}
|
||||
|
||||
/**
 * Get the path to a data file beneath the data root.
 *
 * @param segments Path segments appended, in order, to the data root.
 * @returns The data root (see getDataRoot) joined with `segments`; with no
 *          segments this is the data root itself.
 */
export function getDataPath(...segments: string[]): string {
  // Single source of truth for the data directory: build on getDataRoot()
  // instead of re-deriving `<project root>/src/data` here.
  return path.join(getDataRoot(), ...segments);
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, test, expect } from 'vitest';
|
||||
import { EmailValidator } from '../../src/validators/email-validator';
|
||||
import { EmailValidator } from '../../src/validators/email-validator.js';
|
||||
|
||||
describe('EmailValidator', () => {
|
||||
const validator = new EmailValidator();
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { describe, test, expect } from 'vitest';
|
||||
import { JSONValidator } from '../../src/validators/json-validator';
|
||||
import { JSONValidator } from '../../src/validators/json-validator.js';
|
||||
|
||||
describe('JSONValidator', () => {
|
||||
const validator = new JSONValidator();
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue