text-processing-utils/benchmarks/spellcheck.bench.ts
2026-01-21 11:37:27 -08:00

272 lines
No EOL
8 KiB
TypeScript

/**
* Performance benchmarks for spellcheck optimizations
* Run with: npm run benchmark
*/
import { LevenshteinDistance } from '../src/spellcheck/algorithms/levenshtein';
import { OptimizedLevenshtein } from '../src/spellcheck/algorithms/levenshtein-optimized';
import { DamerauLevenshtein } from '../src/spellcheck/algorithms/damerau-levenshtein';
import { Soundex, Metaphone } from '../src/spellcheck/algorithms/phonetic';
import { BloomFilter } from '../src/spellcheck/utils/bloom-filter';
import { LRUCache } from '../src/spellcheck/utils/lru-cache';
import { Trie } from '../src/spellcheck/algorithms/trie';
// Test data
const TEST_WORDS = [
'hello', 'world', 'javascript', 'typescript', 'programming',
'algorithm', 'performance', 'optimization', 'benchmark', 'testing'
];
const MISSPELLED_WORDS = [
'helo', 'wrold', 'javascrpt', 'typscript', 'programing',
'algoritm', 'performence', 'optimizaton', 'benchmrk', 'testng'
];
const LARGE_DICTIONARY = Array.from({ length: 10000 }, (_, i) =>
`word${i}${String.fromCharCode(97 + (i % 26))}`
);
/**
* Benchmark runner
*/
class Benchmark {
private results: Map<string, number[]> = new Map();
async run(name: string, fn: () => void | Promise<void>, iterations: number = 1000): Promise<void> {
const times: number[] = [];
// Warm up
for (let i = 0; i < 10; i++) {
await fn();
}
// Actual benchmark
for (let i = 0; i < iterations; i++) {
const start = performance.now();
await fn();
const end = performance.now();
times.push(end - start);
}
this.results.set(name, times);
}
report(): void {
console.log('\n=== SpellCheck Performance Benchmarks ===\n');
for (const [name, times] of this.results) {
const avg = times.reduce((a, b) => a + b, 0) / times.length;
const min = Math.min(...times);
const max = Math.max(...times);
const median = this.median(times);
console.log(`${name}:`);
console.log(` Average: ${avg.toFixed(3)}ms`);
console.log(` Median: ${median.toFixed(3)}ms`);
console.log(` Min: ${min.toFixed(3)}ms`);
console.log(` Max: ${max.toFixed(3)}ms`);
console.log('');
}
}
private median(arr: number[]): number {
const sorted = [...arr].sort((a, b) => a - b);
const mid = Math.floor(sorted.length / 2);
return sorted.length % 2 === 0
? (sorted[mid - 1] + sorted[mid]) / 2
: sorted[mid];
}
}
/**
* Run all benchmarks
*/
async function runBenchmarks() {
const benchmark = new Benchmark();
console.log('Preparing benchmarks...');
// Initialize algorithms
const oldLevenshtein = new LevenshteinDistance();
const newLevenshtein = new OptimizedLevenshtein();
const damerauLevenshtein = new DamerauLevenshtein();
const soundex = new Soundex();
const metaphone = new Metaphone();
const bloomFilter = new BloomFilter(10000, 0.01);
const lruCache = new LRUCache<string, number>(1000);
const trie = new Trie();
// Populate data structures
for (const word of LARGE_DICTIONARY) {
bloomFilter.add(word);
trie.insert(word);
}
// === Levenshtein Distance Benchmarks ===
console.log('\nRunning Levenshtein benchmarks...');
await benchmark.run('Old Levenshtein (10 words)', () => {
for (let i = 0; i < TEST_WORDS.length; i++) {
oldLevenshtein.calculate(TEST_WORDS[i], MISSPELLED_WORDS[i]);
}
});
await benchmark.run('Optimized Levenshtein (10 words)', () => {
for (let i = 0; i < TEST_WORDS.length; i++) {
newLevenshtein.calculate(TEST_WORDS[i], MISSPELLED_WORDS[i]);
}
});
await benchmark.run('Optimized Levenshtein with early termination', () => {
for (let i = 0; i < TEST_WORDS.length; i++) {
newLevenshtein.calculate(TEST_WORDS[i], MISSPELLED_WORDS[i], 2);
}
});
await benchmark.run('Damerau-Levenshtein (10 words)', () => {
for (let i = 0; i < TEST_WORDS.length; i++) {
damerauLevenshtein.calculate(TEST_WORDS[i], MISSPELLED_WORDS[i]);
}
});
// === Phonetic Algorithm Benchmarks ===
console.log('\nRunning phonetic algorithm benchmarks...');
await benchmark.run('Soundex encoding (10 words)', () => {
for (const word of TEST_WORDS) {
soundex.encode(word);
}
});
await benchmark.run('Metaphone encoding (10 words)', () => {
for (const word of TEST_WORDS) {
metaphone.encode(word);
}
});
// === Bloom Filter Benchmarks ===
console.log('\nRunning Bloom filter benchmarks...');
await benchmark.run('Bloom filter lookup (1000 checks)', () => {
for (let i = 0; i < 1000; i++) {
bloomFilter.mightContain(`test${i}`);
}
});
await benchmark.run('Bloom filter negative lookups', () => {
for (let i = 0; i < 1000; i++) {
bloomFilter.definitelyNotContains(`notinset${i}`);
}
});
// === LRU Cache Benchmarks ===
console.log('\nRunning LRU cache benchmarks...');
await benchmark.run('LRU cache set/get (1000 ops)', () => {
for (let i = 0; i < 500; i++) {
lruCache.set(`key${i}`, i);
lruCache.get(`key${i % 100}`);
}
});
// === Trie Benchmarks ===
console.log('\nRunning Trie benchmarks...');
await benchmark.run('Trie search (1000 lookups)', () => {
for (let i = 0; i < 1000; i++) {
trie.search(`word${i % 10000}a`);
}
});
await benchmark.run('Trie prefix search', () => {
for (let i = 0; i < 100; i++) {
trie.getSuggestions(`word${i}`, 10);
}
});
// === Memory Usage Comparison ===
console.log('\nCalculating memory usage...');
const memoryBefore = process.memoryUsage().heapUsed;
// Create large data structures
const largeTrie = new Trie();
const largeBloom = new BloomFilter(100000, 0.01);
for (let i = 0; i < 50000; i++) {
const word = `testword${i}`;
largeTrie.insert(word);
largeBloom.add(word);
}
const memoryAfter = process.memoryUsage().heapUsed;
const memoryUsed = (memoryAfter - memoryBefore) / 1024 / 1024;
console.log(`\nMemory used for 50,000 words:`);
console.log(` Trie + Bloom Filter: ${memoryUsed.toFixed(2)} MB`);
// Report results
benchmark.report();
// === Accuracy Comparison ===
console.log('\n=== Accuracy Comparison ===\n');
const testPairs = [
['their', 'there'],
['definitely', 'definately'],
['occurrence', 'occurence'],
['separate', 'seperate'],
['receive', 'recieve']
];
for (const [correct, misspelled] of testPairs) {
const levDist = oldLevenshtein.calculate(correct, misspelled);
const optLevDist = newLevenshtein.calculate(correct, misspelled);
const damLevDist = damerauLevenshtein.calculate(correct, misspelled);
const soundexMatch = soundex.soundsLike(correct, misspelled);
const metaphoneMatch = metaphone.soundsLike(correct, misspelled);
console.log(`${misspelled} -> ${correct}:`);
console.log(` Levenshtein: ${levDist}`);
console.log(` Optimized Levenshtein: ${optLevDist}`);
console.log(` Damerau-Levenshtein: ${damLevDist}`);
console.log(` Soundex match: ${soundexMatch}`);
console.log(` Metaphone match: ${metaphoneMatch}`);
console.log('');
}
// === Cache Performance ===
console.log('=== Cache Performance ===\n');
const cacheTest = new LRUCache<string, number>(100);
let hits = 0;
let misses = 0;
// Simulate realistic cache usage
for (let i = 0; i < 1000; i++) {
const key = `key${Math.floor(Math.random() * 150)}`;
if (cacheTest.has(key)) {
cacheTest.get(key);
hits++;
} else {
cacheTest.set(key, i);
misses++;
}
}
const stats = cacheTest.getStats();
console.log(`Cache Statistics:`);
console.log(` Size: ${stats.size}/${stats.maxSize}`);
console.log(` Hits: ${hits}`);
console.log(` Misses: ${misses}`);
console.log(` Hit Rate: ${((hits / (hits + misses)) * 100).toFixed(1)}%`);
console.log('\n=== Benchmark Complete ===');
}
// Run if executed directly
if (require.main === module) {
runBenchmarks().catch(console.error);
}
export { runBenchmarks, Benchmark };