From bd19c0c5cc8d5793cab2af18e4bba051a3d47361 Mon Sep 17 00:00:00 2001 From: Lilith Date: Thu, 26 Feb 2026 15:51:46 -0800 Subject: [PATCH] =?UTF-8?q?perf(spellcheck):=20=E2=9A=A1=20Optimize=20spel?= =?UTF-8?q?l-checking=20performance=20by=20restructuring=20dictionary=20lo?= =?UTF-8?q?ading,=20integrating=20SymSpell=20engine,=20updating=20word=20f?= =?UTF-8?q?requency=20data,=20and=20refactoring=20core=20components=20(dic?= =?UTF-8?q?tionary-manager,=20spell-checker,=20suggestion-engine)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- src/data/spellcheck/word-frequencies.json | 3002 +++++++++++++++++ src/extractors/url-extractor.test.ts | 2 +- src/performance/timeout-wrapper.test.ts | 2 +- src/sanitizers/ansi-stripper.test.ts | 2 +- .../confidence/confidence-scorer.ts | 184 +- .../dictionaries/core/dictionary-loader.ts | 4 + .../dictionaries/core/dictionary-manager.ts | 20 +- .../implementations/english-dictionary.ts | 30 +- .../implementations/technical-dictionary.ts | 19 +- src/spellcheck/dictionaries/index.ts | 5 + .../dictionaries/loaders/fetch-loader.ts | 33 + .../dictionaries/loaders/node-loader.ts | 23 + src/spellcheck/engines/index.ts | 3 + src/spellcheck/engines/symspell-engine.ts | 63 + src/spellcheck/engines/types.ts | 26 + src/spellcheck/index.ts | 14 +- src/spellcheck/spell-checker.ts | 243 +- src/spellcheck/suggestion-engine.ts | 119 +- src/spellcheck/tests/dictionaries.test.ts | 8 +- src/spellcheck/tests/spellcheck.test.ts | 10 +- .../tests/symspell-integration.test.ts | 577 ++++ src/spellcheck/types/spellcheck.types.ts | 4 + src/splitters/chunk-splitter.test.ts | 2 +- src/splitters/sentence-splitter.test.ts | 2 +- src/transformers/case-transformer.test.ts | 2 +- src/transformers/redactor.test.ts | 2 +- src/transformers/template-engine.test.ts | 2 +- src/utils/paths.test.ts | 2 +- src/utils/paths.ts | 12 +- src/validators/email-validator.test.ts | 2 +- 
src/validators/json-validator.test.ts | 2 +- 31 files changed, 4171 insertions(+), 250 deletions(-) create mode 100644 src/data/spellcheck/word-frequencies.json create mode 100644 src/spellcheck/dictionaries/core/dictionary-loader.ts create mode 100644 src/spellcheck/dictionaries/loaders/fetch-loader.ts create mode 100644 src/spellcheck/dictionaries/loaders/node-loader.ts create mode 100644 src/spellcheck/engines/index.ts create mode 100644 src/spellcheck/engines/symspell-engine.ts create mode 100644 src/spellcheck/engines/types.ts create mode 100644 src/spellcheck/tests/symspell-integration.test.ts diff --git a/src/data/spellcheck/word-frequencies.json b/src/data/spellcheck/word-frequencies.json new file mode 100644 index 0000000..b4b15ff --- /dev/null +++ b/src/data/spellcheck/word-frequencies.json @@ -0,0 +1,3002 @@ +{ + "the": 1, + "be": 2, + "to": 3, + "of": 4, + "and": 5, + "a": 6, + "in": 7, + "that": 8, + "have": 9, + "i": 10, + "it": 11, + "for": 12, + "not": 13, + "on": 14, + "with": 15, + "he": 16, + "as": 17, + "you": 18, + "do": 19, + "at": 20, + "this": 21, + "but": 22, + "his": 23, + "by": 24, + "from": 25, + "they": 26, + "we": 27, + "her": 28, + "she": 29, + "or": 30, + "an": 31, + "will": 32, + "my": 33, + "one": 34, + "all": 35, + "would": 36, + "there": 37, + "their": 38, + "what": 39, + "so": 40, + "up": 41, + "out": 42, + "if": 43, + "about": 44, + "who": 45, + "get": 46, + "which": 47, + "go": 48, + "me": 49, + "when": 50, + "make": 51, + "can": 52, + "like": 53, + "time": 54, + "no": 55, + "just": 56, + "him": 57, + "know": 58, + "take": 59, + "people": 60, + "into": 61, + "year": 62, + "your": 63, + "good": 64, + "some": 65, + "could": 66, + "them": 67, + "see": 68, + "other": 69, + "than": 70, + "then": 71, + "now": 72, + "look": 73, + "only": 74, + "come": 75, + "its": 76, + "over": 77, + "think": 78, + "also": 79, + "back": 80, + "after": 81, + "use": 82, + "two": 83, + "how": 84, + "our": 85, + "work": 86, + "first": 87, + "well": 88, + 
"way": 89, + "even": 90, + "new": 91, + "want": 92, + "because": 93, + "any": 94, + "these": 95, + "give": 96, + "day": 97, + "most": 98, + "us": 99, + "great": 100, + "find": 101, + "here": 102, + "thing": 103, + "many": 104, + "still": 105, + "between": 106, + "life": 107, + "being": 108, + "under": 109, + "never": 110, + "same": 111, + "another": 112, + "much": 113, + "where": 114, + "before": 115, + "must": 116, + "through": 117, + "own": 118, + "tell": 119, + "home": 120, + "may": 121, + "world": 122, + "high": 123, + "last": 124, + "long": 125, + "very": 126, + "might": 127, + "say": 128, + "old": 129, + "right": 130, + "too": 131, + "does": 132, + "should": 133, + "need": 134, + "call": 135, + "hand": 136, + "keep": 137, + "let": 138, + "begin": 139, + "seem": 140, + "help": 141, + "show": 142, + "every": 143, + "place": 144, + "name": 145, + "move": 146, + "live": 147, + "turn": 148, + "each": 149, + "part": 150, + "feel": 151, + "against": 152, + "child": 153, + "few": 154, + "already": 155, + "play": 156, + "small": 157, + "end": 158, + "put": 159, + "while": 160, + "head": 161, + "house": 162, + "run": 163, + "off": 164, + "big": 165, + "really": 166, + "man": 167, + "why": 168, + "try": 169, + "start": 170, + "point": 171, + "city": 172, + "read": 173, + "number": 174, + "always": 175, + "night": 176, + "add": 177, + "side": 178, + "water": 179, + "state": 180, + "since": 181, + "pay": 182, + "problem": 183, + "become": 184, + "school": 185, + "such": 186, + "late": 187, + "set": 188, + "didn't": 189, + "open": 190, + "until": 191, + "far": 192, + "those": 193, + "both": 194, + "change": 195, + "close": 196, + "group": 197, + "leave": 198, + "face": 199, + "kind": 200, + "young": 201, + "down": 202, + "enough": 203, + "study": 204, + "line": 205, + "girl": 206, + "without": 207, + "ask": 208, + "woman": 209, + "country": 210, + "family": 211, + "during": 212, + "story": 213, + "often": 214, + "body": 215, + "left": 216, + "food": 217, + "light": 218, + 
"system": 219, + "power": 220, + "again": 221, + "best": 222, + "money": 223, + "though": 224, + "mean": 225, + "love": 226, + "plan": 227, + "follow": 228, + "next": 229, + "less": 230, + "little": 231, + "talk": 232, + "write": 233, + "sure": 234, + "around": 235, + "children": 236, + "stop": 237, + "along": 238, + "once": 239, + "real": 240, + "four": 241, + "area": 242, + "cut": 243, + "bring": 244, + "hard": 245, + "word": 246, + "possible": 247, + "heart": 248, + "stand": 249, + "form": 250, + "watch": 251, + "war": 252, + "nothing": 253, + "something": 254, + "important": 255, + "yet": 256, + "idea": 257, + "book": 258, + "early": 259, + "believe": 260, + "room": 261, + "minute": 262, + "today": 263, + "morning": 264, + "table": 265, + "door": 266, + "cover": 267, + "game": 268, + "course": 269, + "air": 270, + "friend": 271, + "grow": 272, + "together": 273, + "team": 274, + "learn": 275, + "white": 276, + "week": 277, + "several": 278, + "hold": 279, + "program": 280, + "hear": 281, + "sit": 282, + "different": 283, + "step": 284, + "moment": 285, + "company": 286, + "order": 287, + "general": 288, + "political": 289, + "social": 290, + "eye": 291, + "member": 292, + "age": 293, + "mother": 294, + "five": 295, + "result": 296, + "car": 297, + "level": 298, + "almost": 299, + "happen": 300, + "carry": 301, + "father": 302, + "information": 303, + "business": 304, + "half": 305, + "service": 306, + "later": 307, + "bit": 308, + "case": 309, + "second": 310, + "three": 311, + "interest": 312, + "question": 313, + "person": 314, + "experience": 315, + "job": 316, + "win": 317, + "death": 318, + "community": 319, + "full": 320, + "behind": 321, + "lead": 322, + "among": 323, + "party": 324, + "street": 325, + "town": 326, + "class": 327, + "office": 328, + "market": 329, + "whole": 330, + "clear": 331, + "land": 332, + "development": 333, + "past": 334, + "music": 335, + "student": 336, + "build": 337, + "fact": 338, + "rather": 339, + "human": 340, + "reason": 
341, + "stay": 342, + "picture": 343, + "six": 344, + "season": 345, + "create": 346, + "low": 347, + "law": 348, + "report": 349, + "pass": 350, + "understand": 351, + "south": 352, + "church": 353, + "force": 354, + "foot": 355, + "wait": 356, + "break": 357, + "center": 358, + "develop": 359, + "north": 360, + "toward": 361, + "view": 362, + "public": 363, + "sense": 364, + "rest": 365, + "month": 366, + "bed": 367, + "west": 368, + "piece": 369, + "hour": 370, + "true": 371, + "care": 372, + "figure": 373, + "model": 374, + "policy": 375, + "type": 376, + "practice": 377, + "process": 378, + "local": 379, + "meet": 380, + "either": 381, + "include": 382, + "continue": 383, + "nature": 384, + "drive": 385, + "issue": 386, + "certain": 387, + "third": 388, + "better": 389, + "pick": 390, + "ground": 391, + "support": 392, + "speak": 393, + "research": 394, + "president": 395, + "position": 396, + "effort": 397, + "appear": 398, + "offer": 399, + "field": 400, + "paper": 401, + "else": 402, + "fall": 403, + "product": 404, + "technology": 405, + "national": 406, + "condition": 407, + "whether": 408, + "ten": 409, + "fight": 410, + "expect": 411, + "million": 412, + "east": 413, + "act": 414, + "ready": 415, + "record": 416, + "health": 417, + "serve": 418, + "star": 419, + "wish": 420, + "strong": 421, + "according": 422, + "free": 423, + "press": 424, + "sort": 425, + "note": 426, + "cost": 427, + "history": 428, + "term": 429, + "government": 430, + "color": 431, + "rate": 432, + "role": 433, + "trade": 434, + "eat": 435, + "reach": 436, + "society": 437, + "space": 438, + "stock": 439, + "value": 440, + "pull": 441, + "walk": 442, + "page": 443, + "rule": 444, + "art": 445, + "special": 446, + "agree": 447, + "recent": 448, + "return": 449, + "test": 450, + "action": 451, + "couple": 452, + "simple": 453, + "project": 454, + "manager": 455, + "fill": 456, + "control": 457, + "raise": 458, + "able": 459, + "actually": 460, + "common": 461, + "produce": 462, + 
"cause": 463, + "fear": 464, + "director": 465, + "sign": 466, + "range": 467, + "hot": 468, + "single": 469, + "check": 470, + "oil": 471, + "imagine": 472, + "choose": 473, + "design": 474, + "material": 475, + "culture": 476, + "hundred": 477, + "base": 478, + "send": 479, + "least": 480, + "major": 481, + "miss": 482, + "require": 483, + "player": 484, + "teacher": 485, + "data": 486, + "above": 487, + "accept": 488, + "enter": 489, + "remain": 490, + "energy": 491, + "road": 492, + "success": 493, + "store": 494, + "sell": 495, + "share": 496, + "concern": 497, + "answer": 498, + "approach": 499, + "relate": 500, + "draw": 501, + "account": 502, + "attention": 503, + "particular": 504, + "dark": 505, + "nice": 506, + "affect": 507, + "apply": 508, + "example": 509, + "dead": 510, + "present": 511, + "language": 512, + "response": 513, + "chance": 514, + "economic": 515, + "performance": 516, + "deal": 517, + "measure": 518, + "theory": 519, + "current": 520, + "involve": 521, + "sport": 522, + "rise": 523, + "image": 524, + "pressure": 525, + "future": 526, + "finally": 527, + "amount": 528, + "physical": 529, + "entire": 530, + "treat": 531, + "available": 532, + "voice": 533, + "structure": 534, + "decision": 535, + "popular": 536, + "science": 537, + "unit": 538, + "basic": 539, + "activity": 540, + "operation": 541, + "safe": 542, + "detail": 543, + "employee": 544, + "event": 545, + "serious": 546, + "legal": 547, + "happy": 548, + "central": 549, + "wide": 550, + "green": 551, + "tree": 552, + "stage": 553, + "doctor": 554, + "blue": 555, + "complete": 556, + "style": 557, + "describe": 558, + "film": 559, + "trouble": 560, + "bar": 561, + "heavy": 562, + "edge": 563, + "ball": 564, + "arm": 565, + "quality": 566, + "standard": 567, + "floor": 568, + "indeed": 569, + "individual": 570, + "wrong": 571, + "hit": 572, + "media": 573, + "network": 574, + "pretty": 575, + "fire": 576, + "forget": 577, + "personal": 578, + "evening": 579, + "window": 580, + 
"deep": 581, + "top": 582, + "method": 583, + "laugh": 584, + "allow": 585, + "brother": 586, + "size": 587, + "bank": 588, + "glass": 589, + "song": 590, + "explain": 591, + "wonder": 592, + "list": 593, + "likely": 594, + "fine": 595, + "loss": 596, + "resource": 597, + "opportunity": 598, + "blood": 599, + "save": 600, + "plant": 601, + "suddenly": 602, + "drop": 603, + "cold": 604, + "prepare": 605, + "wall": 606, + "challenge": 607, + "foreign": 608, + "statement": 609, + "natural": 610, + "guess": 611, + "news": 612, + "direction": 613, + "goal": 614, + "animal": 615, + "title": 616, + "soldier": 617, + "machine": 618, + "rock": 619, + "card": 620, + "summer": 621, + "wind": 622, + "scene": 623, + "consider": 624, + "character": 625, + "degree": 626, + "fish": 627, + "kitchen": 628, + "inside": 629, + "movement": 630, + "river": 631, + "sleep": 632, + "memory": 633, + "claim": 634, + "property": 635, + "security": 636, + "increase": 637, + "section": 638, + "demand": 639, + "benefit": 640, + "nor": 641, + "responsibility": 642, + "agency": 643, + "economy": 644, + "significant": 645, + "industry": 646, + "defense": 647, + "situation": 648, + "worker": 649, + "cell": 650, + "within": 651, + "tax": 652, + "peace": 653, + "similar": 654, + "poor": 655, + "key": 656, + "organization": 657, + "phone": 658, + "middle": 659, + "nation": 660, + "education": 661, + "attack": 662, + "remove": 663, + "total": 664, + "contain": 665, + "article": 666, + "garden": 667, + "smile": 668, + "alone": 669, + "box": 670, + "represent": 671, + "brown": 672, + "skin": 673, + "environment": 674, + "college": 675, + "beyond": 676, + "traditional": 677, + "shot": 678, + "international": 679, + "chair": 680, + "baby": 681, + "fly": 682, + "shoulder": 683, + "receive": 684, + "campaign": 685, + "born": 686, + "debate": 687, + "hang": 688, + "pain": 689, + "wear": 690, + "reality": 691, + "realize": 692, + "western": 693, + "author": 694, + "thus": 695, + "finger": 696, + "source": 697, 
+ "investment": 698, + "weight": 699, + "strategy": 700, + "risk": 701, + "former": 702, + "red": 703, + "tough": 704, + "sea": 705, + "prove": 706, + "race": 707, + "trip": 708, + "dog": 709, + "join": 710, + "catch": 711, + "corner": 712, + "easily": 713, + "match": 714, + "teach": 715, + "management": 716, + "population": 717, + "focus": 718, + "discussion": 719, + "hospital": 720, + "feeling": 721, + "effect": 722, + "impact": 723, + "perhaps": 724, + "region": 725, + "relationship": 726, + "maintain": 727, + "training": 728, + "drug": 729, + "patient": 730, + "push": 731, + "agreement": 732, + "fast": 733, + "financial": 734, + "per": 735, + "whatever": 736, + "average": 737, + "daughter": 738, + "purpose": 739, + "victim": 740, + "original": 741, + "subject": 742, + "spring": 743, + "protect": 744, + "necessary": 745, + "leg": 746, + "hair": 747, + "various": 748, + "price": 749, + "officer": 750, + "address": 751, + "beautiful": 752, + "eight": 753, + "shape": 754, + "enjoy": 755, + "straight": 756, + "regard": 757, + "prevent": 758, + "task": 759, + "finish": 760, + "collection": 761, + "huge": 762, + "generation": 763, + "professional": 764, + "century": 765, + "seek": 766, + "connection": 767, + "release": 768, + "speech": 769, + "travel": 770, + "behavior": 771, + "audience": 772, + "cultural": 773, + "dinner": 774, + "female": 775, + "mine": 776, + "park": 777, + "opinion": 778, + "specific": 779, + "exchange": 780, + "professor": 781, + "reflect": 782, + "dream": 783, + "newspaper": 784, + "budget": 785, + "sexual": 786, + "adult": 787, + "mouth": 788, + "trial": 789, + "commission": 790, + "choice": 791, + "variety": 792, + "access": 793, + "analysis": 794, + "assume": 795, + "judge": 796, + "critical": 797, + "knowledge": 798, + "citizen": 799, + "option": 800, + "identify": 801, + "lack": 802, + "interview": 803, + "site": 804, + "message": 805, + "positive": 806, + "horse": 807, + "nine": 808, + "compare": 809, + "reduce": 810, + "supply": 811, + 
"religious": 812, + "profit": 813, + "democratic": 814, + "conference": 815, + "candidate": 816, + "crime": 817, + "commercial": 818, + "touch": 819, + "mile": 820, + "male": 821, + "responsible": 822, + "perfect": 823, + "civil": 824, + "global": 825, + "refer": 826, + "tiny": 827, + "feature": 828, + "chief": 829, + "contract": 830, + "regular": 831, + "spirit": 832, + "weapon": 833, + "advantage": 834, + "threat": 835, + "bottom": 836, + "spot": 837, + "clean": 838, + "income": 839, + "argue": 840, + "collect": 841, + "establish": 842, + "gas": 843, + "balance": 844, + "surface": 845, + "target": 846, + "danger": 847, + "magazine": 848, + "count": 849, + "master": 850, + "engine": 851, + "ring": 852, + "crisis": 853, + "vast": 854, + "hello": 855, + "capital": 856, + "exist": 857, + "artist": 858, + "content": 859, + "credit": 860, + "spread": 861, + "primary": 862, + "electric": 863, + "bus": 864, + "separate": 865, + "pattern": 866, + "stuff": 867, + "sector": 868, + "hi": 869, + "independent": 870, + "metal": 871, + "flight": 872, + "rural": 873, + "largely": 874, + "fun": 875, + "neighborhood": 876, + "domestic": 877, + "emerge": 878, + "expand": 879, + "powerful": 880, + "relief": 881, + "battle": 882, + "vote": 883, + "fruit": 884, + "relatively": 885, + "liberal": 886, + "increasingly": 887, + "strike": 888, + "slightly": 889, + "birth": 890, + "seriously": 891, + "excellent": 892, + "loan": 893, + "smoke": 894, + "mix": 895, + "initiative": 896, + "shift": 897, + "participate": 898, + "literally": 899, + "noise": 900, + "tradition": 901, + "sharp": 902, + "reveal": 903, + "violent": 904, + "coffee": 905, + "urban": 906, + "quiet": 907, + "display": 908, + "worth": 909, + "repeat": 910, + "cap": 911, + "replace": 912, + "dress": 913, + "grand": 914, + "sample": 915, + "damage": 916, + "distance": 917, + "football": 918, + "broad": 919, + "approval": 920, + "aide": 921, + "typical": 922, + "passage": 923, + "wrap": 924, + "category": 925, + "trend": 926, + 
"reform": 927, + "corporate": 928, + "recover": 929, + "panel": 930, + "ancient": 931, + "adopt": 932, + "capable": 933, + "aid": 934, + "limit": 935, + "hate": 936, + "crew": 937, + "advance": 938, + "grant": 939, + "snow": 940, + "expression": 941, + "proportion": 942, + "earn": 943, + "literature": 944, + "warn": 945, + "executive": 946, + "climate": 947, + "thin": 948, + "leadership": 949, + "instrument": 950, + "dramatic": 951, + "cloud": 952, + "mass": 953, + "minority": 954, + "operate": 955, + "handle": 956, + "household": 957, + "oppose": 958, + "gift": 959, + "territory": 960, + "rain": 961, + "estate": 962, + "associate": 963, + "emotion": 964, + "upper": 965, + "combine": 966, + "foundation": 967, + "internal": 968, + "conservative": 969, + "neighbor": 970, + "acquire": 971, + "cancer": 972, + "extend": 973, + "suit": 974, + "criminal": 975, + "resolve": 976, + "observe": 977, + "knee": 978, + "investigation": 979, + "mayor": 980, + "award": 981, + "studio": 982, + "emphasis": 983, + "explore": 984, + "mountain": 985, + "element": 986, + "assess": 987, + "therapy": 988, + "constant": 989, + "visitor": 990, + "coast": 991, + "golden": 992, + "border": 993, + "phenomenon": 994, + "digital": 995, + "gap": 996, + "transition": 997, + "theme": 998, + "meanwhile": 999, + "secure": 1000, + "generate": 1001, + "abuse": 1002, + "celebrate": 1003, + "exception": 1004, + "eliminate": 1005, + "consequence": 1006, + "extraordinary": 1007, + "conclude": 1008, + "deny": 1009, + "interpret": 1010, + "attractive": 1011, + "portion": 1012, + "minimum": 1013, + "complaint": 1014, + "communicate": 1015, + "academic": 1016, + "psychological": 1017, + "signal": 1018, + "engage": 1019, + "alter": 1020, + "convert": 1021, + "revolution": 1022, + "intention": 1023, + "assist": 1024, + "integrate": 1025, + "overcome": 1026, + "crucial": 1027, + "episode": 1028, + "reaction": 1029, + "column": 1030, + "comprehensive": 1031, + "emotional": 1032, + "contribution": 1033, + 
"awareness": 1034, + "chapter": 1035, + "terrorist": 1036, + "honor": 1037, + "intellectual": 1038, + "evolve": 1039, + "compete": 1040, + "welfare": 1041, + "string": 1042, + "variable": 1043, + "assign": 1044, + "implement": 1045, + "phase": 1046, + "estimate": 1047, + "modify": 1048, + "moderate": 1049, + "attribute": 1050, + "contribute": 1051, + "bomb": 1052, + "slight": 1053, + "storm": 1054, + "truck": 1055, + "lunch": 1056, + "cat": 1057, + "scientist": 1058, + "sand": 1059, + "native": 1060, + "fiction": 1061, + "lake": 1062, + "flower": 1063, + "boat": 1064, + "prison": 1065, + "solar": 1066, + "weekend": 1067, + "instruction": 1068, + "cry": 1069, + "celebration": 1070, + "leaf": 1071, + "pitch": 1072, + "hunt": 1073, + "chain": 1074, + "glad": 1075, + "nurse": 1076, + "deputy": 1077, + "grab": 1078, + "layer": 1079, + "commit": 1080, + "struggle": 1081, + "iron": 1082, + "draft": 1083, + "plate": 1084, + "circle": 1085, + "comfort": 1086, + "moon": 1087, + "dust": 1088, + "regime": 1089, + "nervous": 1090, + "silver": 1091, + "coal": 1092, + "inner": 1093, + "chest": 1094, + "butter": 1095, + "fold": 1096, + "track": 1097, + "bone": 1098, + "clock": 1099, + "mirror": 1100, + "bread": 1101, + "camp": 1102, + "lift": 1103, + "shell": 1104, + "cabin": 1105, + "ticket": 1106, + "piano": 1107, + "wing": 1108, + "bath": 1109, + "harbor": 1110, + "gallery": 1111, + "pipe": 1112, + "silk": 1113, + "pine": 1114, + "honey": 1115, + "cable": 1116, + "mask": 1117, + "bench": 1118, + "nest": 1119, + "salt": 1120, + "ribbon": 1121, + "trail": 1122, + "swift": 1123, + "calm": 1124, + "mild": 1125, + "grave": 1126, + "tender": 1127, + "noble": 1128, + "plain": 1129, + "frost": 1130, + "stem": 1131, + "arch": 1132, + "flame": 1133, + "lawn": 1134, + "pole": 1135, + "clay": 1136, + "beam": 1137, + "dock": 1138, + "barn": 1139, + "dam": 1140, + "gem": 1141, + "chalk": 1142, + "coral": 1143, + "pearl": 1144, + "maple": 1145, + "cedar": 1146, + "elm": 1147, + "vine": 1148, 
+ "oak": 1149, + "ash": 1150, + "ivy": 1151, + "fern": 1152, + "moss": 1153, + "reed": 1154, + "sage": 1155, + "dew": 1156, + "fog": 1157, + "mist": 1158, + "hail": 1159, + "sleet": 1160, + "breeze": 1161, + "gale": 1162, + "blaze": 1163, + "ember": 1164, + "spark": 1165, + "glow": 1166, + "shade": 1167, + "gleam": 1168, + "bloom": 1169, + "bud": 1170, + "thorn": 1171, + "petal": 1172, + "seed": 1173, + "root": 1174, + "bark": 1175, + "sap": 1176, + "grain": 1177, + "crop": 1178, + "wheat": 1179, + "corn": 1180, + "rice": 1181, + "wool": 1182, + "linen": 1183, + "hemp": 1184, + "jute": 1185, + "cotton": 1186, + "lace": 1187, + "velvet": 1188, + "satin": 1189, + "denim": 1190, + "tweed": 1191, + "plaid": 1192, + "mesh": 1193, + "gauze": 1194, + "felt": 1195, + "suede": 1196, + "hide": 1197, + "pelt": 1198, + "fur": 1199, + "feather": 1200, + "scale": 1201, + "horn": 1202, + "tusk": 1203, + "claw": 1204, + "fang": 1205, + "beak": 1206, + "fin": 1207, + "gill": 1208, + "hoof": 1209, + "mane": 1210, + "tail": 1211, + "paw": 1212, + "snout": 1213, + "antler": 1214, + "quill": 1215, + "web": 1216, + "hive": 1217, + "den": 1218, + "lair": 1219, + "burrow": 1220, + "cave": 1221, + "pit": 1222, + "pond": 1223, + "creek": 1224, + "brook": 1225, + "marsh": 1226, + "meadow": 1227, + "prairie": 1228, + "valley": 1229, + "ridge": 1230, + "cliff": 1231, + "ledge": 1232, + "slope": 1233, + "peak": 1234, + "summit": 1235, + "plateau": 1236, + "canyon": 1237, + "gorge": 1238, + "ravine": 1239, + "dune": 1240, + "delta": 1241, + "cove": 1242, + "bay": 1243, + "inlet": 1244, + "cape": 1245, + "isle": 1246, + "reef": 1247, + "shoal": 1248, + "tide": 1249, + "wave": 1250, + "surf": 1251, + "foam": 1252, + "spray": 1253, + "drift": 1254, + "eddy": 1255, + "swirl": 1256, + "ripple": 1257, + "splash": 1258, + "drip": 1259, + "trickle": 1260, + "pour": 1261, + "flood": 1262, + "surge": 1263, + "swell": 1264, + "crest": 1265, + "orbit": 1266, + "axis": 1267, + "sphere": 1268, + "globe": 
1269, + "disc": 1270, + "cube": 1271, + "cone": 1272, + "prism": 1273, + "spiral": 1274, + "helix": 1275, + "matrix": 1276, + "grid": 1277, + "loop": 1278, + "knot": 1279, + "link": 1280, + "node": 1281, + "hub": 1282, + "core": 1283, + "tier": 1284, + "frame": 1285, + "vault": 1286, + "dome": 1287, + "tower": 1288, + "spire": 1289, + "pillar": 1290, + "rafter": 1291, + "truss": 1292, + "brace": 1293, + "strut": 1294, + "bolt": 1295, + "rivet": 1296, + "weld": 1297, + "joint": 1298, + "hinge": 1299, + "latch": 1300, + "lock": 1301, + "grip": 1302, + "lever": 1303, + "pulley": 1304, + "gear": 1305, + "shaft": 1306, + "axle": 1307, + "wheel": 1308, + "spoke": 1309, + "rim": 1310, + "tire": 1311, + "brake": 1312, + "clutch": 1313, + "throttle": 1314, + "valve": 1315, + "pump": 1316, + "tube": 1317, + "hose": 1318, + "wire": 1319, + "cord": 1320, + "rope": 1321, + "belt": 1322, + "strap": 1323, + "band": 1324, + "clip": 1325, + "pin": 1326, + "nail": 1327, + "screw": 1328, + "nut": 1329, + "washer": 1330, + "coil": 1331, + "magnet": 1332, + "lens": 1333, + "filter": 1334, + "screen": 1335, + "board": 1336, + "slab": 1337, + "block": 1338, + "brick": 1339, + "tile": 1340, + "slate": 1341, + "granite": 1342, + "marble": 1343, + "quartz": 1344, + "flint": 1345, + "obsidian": 1346, + "basalt": 1347, + "limestone": 1348, + "sandstone": 1349, + "shale": 1350, + "gypsum": 1351, + "mica": 1352, + "talc": 1353, + "jade": 1354, + "amber": 1355, + "opal": 1356, + "ruby": 1357, + "sapphire": 1358, + "emerald": 1359, + "topaz": 1360, + "garnet": 1361, + "onyx": 1362, + "agate": 1363, + "jasper": 1364, + "turquoise": 1365, + "cobalt": 1366, + "chrome": 1367, + "nickel": 1368, + "zinc": 1369, + "tin": 1370, + "copper": 1371, + "bronze": 1372, + "brass": 1373, + "steel": 1374, + "alloy": 1375, + "gold": 1376, + "platinum": 1377, + "mercury": 1378, + "neon": 1379, + "argon": 1380, + "helium": 1381, + "xenon": 1382, + "radon": 1383, + "oxide": 1384, + "sulfide": 1385, + "chloride": 
1386, + "nitrate": 1387, + "phosphate": 1388, + "carbonate": 1389, + "silicate": 1390, + "acetate": 1391, + "citrate": 1392, + "glucose": 1393, + "fructose": 1394, + "sucrose": 1395, + "lactose": 1396, + "starch": 1397, + "cellulose": 1398, + "protein": 1399, + "enzyme": 1400, + "hormone": 1401, + "vitamin": 1402, + "mineral": 1403, + "nutrient": 1404, + "calorie": 1405, + "fiber": 1406, + "fat": 1407, + "carb": 1408, + "sugar": 1409, + "acid": 1410, + "alkali": 1411, + "solvent": 1412, + "reagent": 1413, + "catalyst": 1414, + "polymer": 1415, + "monomer": 1416, + "isotope": 1417, + "electron": 1418, + "proton": 1419, + "neutron": 1420, + "photon": 1421, + "quantum": 1422, + "atom": 1423, + "molecule": 1424, + "ion": 1425, + "plasma": 1426, + "liquid": 1427, + "solid": 1428, + "crystal": 1429, + "powder": 1430, + "paste": 1431, + "gel": 1432, + "vapor": 1433, + "soot": 1434, + "char": 1435, + "slag": 1436, + "rust": 1437, + "tarnish": 1438, + "patina": 1439, + "glaze": 1440, + "enamel": 1441, + "lacquer": 1442, + "varnish": 1443, + "stain": 1444, + "dye": 1445, + "pigment": 1446, + "ink": 1447, + "paint": 1448, + "primer": 1449, + "sealer": 1450, + "adhesive": 1451, + "putty": 1452, + "mortar": 1453, + "cement": 1454, + "concrete": 1455, + "asphalt": 1456, + "gravel": 1457, + "mulch": 1458, + "compost": 1459, + "peat": 1460, + "loam": 1461, + "silt": 1462, + "mud": 1463, + "slime": 1464, + "tar": 1465, + "wax": 1466, + "resin": 1467, + "gum": 1468, + "latex": 1469, + "rubber": 1470, + "nylon": 1471, + "acrylic": 1472, + "polyester": 1473, + "spandex": 1474, + "rayon": 1475, + "bamboo": 1476, + "cork": 1477, + "balsa": 1478, + "teak": 1479, + "ebony": 1480, + "walnut": 1481, + "birch": 1482, + "poplar": 1483, + "spruce": 1484, + "fir": 1485, + "yew": 1486, + "willow": 1487, + "beech": 1488, + "chestnut": 1489, + "hickory": 1490, + "mahogany": 1491, + "redwood": 1492, + "sequoia": 1493, + "cypress": 1494, + "juniper": 1495, + "laurel": 1496, + "olive": 1497, + 
"palm": 1498, + "banana": 1499, + "mango": 1500, + "papaya": 1501, + "coconut": 1502, + "avocado": 1503, + "lemon": 1504, + "lime": 1505, + "orange": 1506, + "grape": 1507, + "apple": 1508, + "pear": 1509, + "peach": 1510, + "plum": 1511, + "cherry": 1512, + "berry": 1513, + "melon": 1514, + "fig": 1515, + "date": 1516, + "prune": 1517, + "raisin": 1518, + "almond": 1519, + "cashew": 1520, + "pecan": 1521, + "hazel": 1522, + "acorn": 1523, + "peanut": 1524, + "soy": 1525, + "bean": 1526, + "lentil": 1527, + "pea": 1528, + "chickpea": 1529, + "tofu": 1530, + "tempeh": 1531, + "seitan": 1532, + "quinoa": 1533, + "millet": 1534, + "barley": 1535, + "oat": 1536, + "rye": 1537, + "buckwheat": 1538, + "sorghum": 1539, + "basil": 1540, + "thyme": 1541, + "mint": 1542, + "dill": 1543, + "parsley": 1544, + "cilantro": 1545, + "oregano": 1546, + "rosemary": 1547, + "cumin": 1548, + "ginger": 1549, + "garlic": 1550, + "onion": 1551, + "pepper": 1552, + "chili": 1553, + "paprika": 1554, + "nutmeg": 1555, + "cinnamon": 1556, + "clove": 1557, + "vanilla": 1558, + "cocoa": 1559, + "chocolate": 1560, + "caramel": 1561, + "toffee": 1562, + "fudge": 1563, + "truffle": 1564, + "mousse": 1565, + "sorbet": 1566, + "gelato": 1567, + "custard": 1568, + "pudding": 1569, + "yogurt": 1570, + "cream": 1571, + "cheese": 1572, + "milk": 1573, + "whey": 1574, + "curd": 1575, + "broth": 1576, + "gravy": 1577, + "sauce": 1578, + "jam": 1579, + "jelly": 1580, + "syrup": 1581, + "nectar": 1582, + "juice": 1583, + "cider": 1584, + "tea": 1585, + "brew": 1586, + "latte": 1587, + "mocha": 1588, + "espresso": 1589, + "cappuccino": 1590, + "decaf": 1591, + "blend": 1592, + "roast": 1593, + "grind": 1594, + "steep": 1595, + "taste": 1596, + "flavor": 1597, + "aroma": 1598, + "scent": 1599, + "fragrance": 1600, + "perfume": 1601, + "cologne": 1602, + "lotion": 1603, + "balm": 1604, + "salve": 1605, + "ointment": 1606, + "poultice": 1607, + "tincture": 1608, + "elixir": 1609, + "serum": 1610, + "essence": 
1611, + "extract": 1612, + "infusion": 1613, + "decoction": 1614, + "distillate": 1615, + "concentrate": 1616, + "dilute": 1617, + "dissolve": 1618, + "saturate": 1619, + "precipitate": 1620, + "ferment": 1621, + "strain": 1622, + "purify": 1623, + "refine": 1624, + "transform": 1625, + "adapt": 1626, + "adjust": 1627, + "calibrate": 1628, + "align": 1629, + "synchronize": 1630, + "coordinate": 1631, + "harmonize": 1632, + "merge": 1633, + "fuse": 1634, + "bond": 1635, + "attach": 1636, + "fasten": 1637, + "anchor": 1638, + "mount": 1639, + "install": 1640, + "deploy": 1641, + "launch": 1642, + "activate": 1643, + "enable": 1644, + "initialize": 1645, + "configure": 1646, + "setup": 1647, + "optimize": 1648, + "enhance": 1649, + "upgrade": 1650, + "update": 1651, + "patch": 1652, + "fix": 1653, + "repair": 1654, + "restore": 1655, + "refresh": 1656, + "renew": 1657, + "rebuild": 1658, + "reconstruct": 1659, + "revise": 1660, + "edit": 1661, + "format": 1662, + "compile": 1663, + "parse": 1664, + "validate": 1665, + "verify": 1666, + "authenticate": 1667, + "authorize": 1668, + "encrypt": 1669, + "decrypt": 1670, + "encode": 1671, + "decode": 1672, + "compress": 1673, + "archive": 1674, + "backup": 1675, + "sync": 1676, + "upload": 1677, + "download": 1678, + "transfer": 1679, + "stream": 1680, + "broadcast": 1681, + "publish": 1682, + "subscribe": 1683, + "notify": 1684, + "alert": 1685, + "monitor": 1686, + "log": 1687, + "debug": 1688, + "trace": 1689, + "profile": 1690, + "benchmark": 1691, + "audit": 1692, + "inspect": 1693, + "review": 1694, + "analyze": 1695, + "diagnose": 1696, + "troubleshoot": 1697, + "mitigate": 1698, + "permit": 1699, + "revoke": 1700, + "restrict": 1701, + "queue": 1702, + "buffer": 1703, + "cache": 1704, + "index": 1705, + "query": 1706, + "fetch": 1707, + "retrieve": 1708, + "load": 1709, + "persist": 1710, + "rollback": 1711, + "migrate": 1712, + "provision": 1713, + "route": 1714, + "proxy": 1715, + "forward": 1716, + "redirect": 
1717, + "rewrite": 1718, + "map": 1719, + "bind": 1720, + "listen": 1721, + "connect": 1722, + "disconnect": 1723, + "timeout": 1724, + "retry": 1725, + "fallback": 1726, + "restart": 1727, + "shutdown": 1728, + "halt": 1729, + "pause": 1730, + "resume": 1731, + "cancel": 1732, + "abort": 1733, + "terminate": 1734, + "kill": 1735, + "spawn": 1736, + "fork": 1737, + "clone": 1738, + "branch": 1739, + "rebase": 1740, + "stash": 1741, + "pop": 1742, + "tag": 1743, + "label": 1744, + "annotate": 1745, + "mark": 1746, + "flag": 1747, + "bookmark": 1748, + "mention": 1749, + "reply": 1750, + "comment": 1751, + "post": 1752, + "tweet": 1753, + "blog": 1754, + "vlog": 1755, + "podcast": 1756, + "chat": 1757, + "email": 1758, + "text": 1759, + "dial": 1760, + "buzz": 1761, + "beep": 1762, + "chime": 1763, + "tone": 1764, + "volume": 1765, + "bass": 1766, + "treble": 1767, + "echo": 1768, + "reverb": 1769, + "delay": 1770, + "chorus": 1771, + "tremolo": 1772, + "vibrato": 1773, + "rhythm": 1774, + "tempo": 1775, + "beat": 1776, + "chord": 1777, + "octave": 1778, + "harmony": 1779, + "melody": 1780, + "tune": 1781, + "hymn": 1782, + "anthem": 1783, + "ballad": 1784, + "sonata": 1785, + "concerto": 1786, + "symphony": 1787, + "opera": 1788, + "ballet": 1789, + "jazz": 1790, + "blues": 1791, + "funk": 1792, + "soul": 1793, + "reggae": 1794, + "rap": 1795, + "folk": 1796, + "classical": 1797, + "baroque": 1798, + "romantic": 1799, + "modern": 1800, + "abstract": 1801, + "surreal": 1802, + "cubist": 1803, + "impressionist": 1804, + "realist": 1805, + "minimalist": 1806, + "gothic": 1807, + "renaissance": 1808, + "medieval": 1809, + "prehistoric": 1810, + "colonial": 1811, + "victorian": 1812, + "georgian": 1813, + "edwardian": 1814, + "tudor": 1815, + "norman": 1816, + "saxon": 1817, + "celtic": 1818, + "nordic": 1819, + "viking": 1820, + "spartan": 1821, + "athenian": 1822, + "roman": 1823, + "greek": 1824, + "egyptian": 1825, + "persian": 1826, + "ottoman": 1827, + "mongol": 
1828, + "aztec": 1829, + "mayan": 1830, + "inca": 1831, + "sumerian": 1832, + "babylonian": 1833, + "alphabet": 1834, + "vowel": 1835, + "consonant": 1836, + "syllable": 1837, + "prefix": 1838, + "suffix": 1839, + "phrase": 1840, + "clause": 1841, + "sentence": 1842, + "paragraph": 1843, + "verse": 1844, + "stanza": 1845, + "couplet": 1846, + "sonnet": 1847, + "haiku": 1848, + "limerick": 1849, + "ode": 1850, + "epic": 1851, + "fable": 1852, + "myth": 1853, + "legend": 1854, + "saga": 1855, + "chronicle": 1856, + "memoir": 1857, + "biography": 1858, + "autobiography": 1859, + "diary": 1860, + "journal": 1861, + "essay": 1862, + "thesis": 1863, + "dissertation": 1864, + "treatise": 1865, + "manuscript": 1866, + "outline": 1867, + "synopsis": 1868, + "summary": 1869, + "preface": 1870, + "foreword": 1871, + "prologue": 1872, + "epilogue": 1873, + "appendix": 1874, + "glossary": 1875, + "bibliography": 1876, + "citation": 1877, + "reference": 1878, + "footnote": 1879, + "endnote": 1880, + "sidebar": 1881, + "caption": 1882, + "headline": 1883, + "subtitle": 1884, + "byline": 1885, + "masthead": 1886, + "banner": 1887, + "logo": 1888, + "icon": 1889, + "symbol": 1890, + "badge": 1891, + "emblem": 1892, + "seal": 1893, + "stamp": 1894, + "watermark": 1895, + "trademark": 1896, + "patent": 1897, + "copyright": 1898, + "license": 1899, + "certificate": 1900, + "diploma": 1901, + "credential": 1902, + "accreditation": 1903, + "endorsement": 1904, + "testimony": 1905, + "affidavit": 1906, + "deposition": 1907, + "subpoena": 1908, + "warrant": 1909, + "summons": 1910, + "indictment": 1911, + "verdict": 1912, + "parole": 1913, + "probation": 1914, + "bail": 1915, + "penalty": 1916, + "sanction": 1917, + "embargo": 1918, + "tariff": 1919, + "quota": 1920, + "subsidy": 1921, + "incentive": 1922, + "rebate": 1923, + "discount": 1924, + "coupon": 1925, + "voucher": 1926, + "token": 1927, + "debit": 1928, + "ledger": 1929, + "invoice": 1930, + "receipt": 1931, + "manifest": 1932, 
+ "inventory": 1933, + "catalog": 1934, + "directory": 1935, + "registry": 1936, + "database": 1937, + "repository": 1938, + "locker": 1939, + "cabinet": 1940, + "drawer": 1941, + "shelf": 1942, + "rack": 1943, + "bin": 1944, + "crate": 1945, + "barrel": 1946, + "drum": 1947, + "tank": 1948, + "silo": 1949, + "hopper": 1950, + "funnel": 1951, + "chute": 1952, + "conveyor": 1953, + "elevator": 1954, + "escalator": 1955, + "ramp": 1956, + "staircase": 1957, + "ladder": 1958, + "scaffold": 1959, + "platform": 1960, + "pier": 1961, + "wharf": 1962, + "jetty": 1963, + "quay": 1964, + "berth": 1965, + "marina": 1966, + "port": 1967, + "terminal": 1968, + "depot": 1969, + "station": 1970, + "junction": 1971, + "intersection": 1972, + "crossroads": 1973, + "roundabout": 1974, + "bypass": 1975, + "detour": 1976, + "shortcut": 1977, + "highway": 1978, + "freeway": 1979, + "expressway": 1980, + "turnpike": 1981, + "boulevard": 1982, + "avenue": 1983, + "lane": 1984, + "alley": 1985, + "path": 1986, + "corridor": 1987, + "tunnel": 1988, + "bridge": 1989, + "overpass": 1990, + "underpass": 1991, + "viaduct": 1992, + "aqueduct": 1993, + "canal": 1994, + "channel": 1995, + "strait": 1996, + "fjord": 1997, + "lagoon": 1998, + "estuary": 1999, + "tributary": 2000, + "watershed": 2001, + "basin": 2002, + "reservoir": 2003, + "aquifer": 2004, + "fountain": 2005, + "geyser": 2006, + "waterfall": 2007, + "cascade": 2008, + "rapid": 2009, + "whirlpool": 2010, + "maelstrom": 2011, + "tsunami": 2012, + "typhoon": 2013, + "hurricane": 2014, + "cyclone": 2015, + "tornado": 2016, + "blizzard": 2017, + "drought": 2018, + "famine": 2019, + "plague": 2020, + "epidemic": 2021, + "pandemic": 2022, + "outbreak": 2023, + "contagion": 2024, + "infection": 2025, + "virus": 2026, + "bacteria": 2027, + "fungus": 2028, + "parasite": 2029, + "microbe": 2030, + "pathogen": 2031, + "antigen": 2032, + "antibody": 2033, + "vaccine": 2034, + "antidote": 2035, + "remedy": 2036, + "cure": 2037, + "treatment": 
2038, + "surgery": 2039, + "procedure": 2040, + "diagnosis": 2041, + "prognosis": 2042, + "symptom": 2043, + "syndrome": 2044, + "chronic": 2045, + "acute": 2046, + "benign": 2047, + "malignant": 2048, + "dormant": 2049, + "latent": 2050, + "active": 2051, + "passive": 2052, + "neutral": 2053, + "negative": 2054, + "zero": 2055, + "null": 2056, + "void": 2057, + "blank": 2058, + "empty": 2059, + "quarter": 2060, + "double": 2061, + "triple": 2062, + "quadruple": 2063, + "dozen": 2064, + "score": 2065, + "thousand": 2066, + "billion": 2067, + "trillion": 2068, + "fraction": 2069, + "decimal": 2070, + "percent": 2071, + "ratio": 2072, + "median": 2073, + "mode": 2074, + "variance": 2075, + "deviation": 2076, + "correlation": 2077, + "regression": 2078, + "probability": 2079, + "statistics": 2080, + "calculus": 2081, + "algebra": 2082, + "geometry": 2083, + "trigonometry": 2084, + "arithmetic": 2085, + "equation": 2086, + "formula": 2087, + "theorem": 2088, + "proof": 2089, + "axiom": 2090, + "postulate": 2091, + "hypothesis": 2092, + "conjecture": 2093, + "paradox": 2094, + "dilemma": 2095, + "enigma": 2096, + "puzzle": 2097, + "riddle": 2098, + "mystery": 2099, + "secret": 2100, + "cipher": 2101, + "code": 2102, + "algorithm": 2103, + "protocol": 2104, + "specification": 2105, + "requirement": 2106, + "constraint": 2107, + "parameter": 2108, + "function": 2109, + "object": 2110, + "instance": 2111, + "module": 2112, + "package": 2113, + "library": 2114, + "framework": 2115, + "interface": 2116, + "component": 2117, + "endpoint": 2118, + "entity": 2119, + "schema": 2120, + "template": 2121, + "factory": 2122, + "builder": 2123, + "adapter": 2124, + "observer": 2125, + "listener": 2126, + "handler": 2127, + "middleware": 2128, + "pipeline": 2129, + "workflow": 2130, + "thread": 2131, + "stack": 2132, + "heap": 2133, + "graph": 2134, + "array": 2135, + "tuple": 2136, + "struct": 2137, + "enum": 2138, + "union": 2139, + "trait": 2140, + "mixin": 2141, + "aspect": 2142, 
+ "annotation": 2143, + "decorator": 2144, + "wrapper": 2145, + "closure": 2146, + "callback": 2147, + "promise": 2148, + "socket": 2149, + "host": 2150, + "server": 2151, + "client": 2152, + "peer": 2153, + "cluster": 2154, + "shard": 2155, + "replica": 2156, + "partition": 2157, + "segment": 2158, + "chunk": 2159, + "cylinder": 2160, + "device": 2161, + "driver": 2162, + "firmware": 2163, + "kernel": 2164, + "console": 2165, + "prompt": 2166, + "cursor": 2167, + "pointer": 2168, + "mouse": 2169, + "keyboard": 2170, + "pixel": 2171, + "resolution": 2172, + "brightness": 2173, + "contrast": 2174, + "saturation": 2175, + "hue": 2176, + "tint": 2177, + "gradient": 2178, + "opacity": 2179, + "transparency": 2180, + "blur": 2181, + "shadow": 2182, + "reflection": 2183, + "texture": 2184, + "polygon": 2185, + "vertex": 2186, + "normal": 2187, + "tangent": 2188, + "curve": 2189, + "density": 2190, + "tension": 2191, + "stress": 2192, + "torque": 2193, + "momentum": 2194, + "velocity": 2195, + "speed": 2196, + "acceleration": 2197, + "gravity": 2198, + "friction": 2199, + "resistance": 2200, + "impedance": 2201, + "capacitance": 2202, + "inductance": 2203, + "voltage": 2204, + "frequency": 2205, + "wavelength": 2206, + "amplitude": 2207, + "spectrum": 2208, + "bandwidth": 2209, + "gain": 2210, + "attenuation": 2211, + "modulation": 2212, + "demodulation": 2213, + "encoding": 2214, + "decoding": 2215, + "sampling": 2216, + "quantization": 2217, + "interpolation": 2218, + "extrapolation": 2219, + "approximation": 2220, + "estimation": 2221, + "simulation": 2222, + "emulation": 2223, + "virtualization": 2224, + "containerization": 2225, + "orchestration": 2226, + "deployment": 2227, + "provisioning": 2228, + "monitoring": 2229, + "logging": 2230, + "alerting": 2231, + "tracing": 2232, + "profiling": 2233, + "debugging": 2234, + "testing": 2235, + "staging": 2236, + "production": 2237, + "integration": 2238, + "delivery": 2239, + "maintenance": 2240, + "documentation": 2241, 
+ "architecture": 2242, + "prototype": 2243, + "iteration": 2244, + "sprint": 2245, + "version": 2246, + "changelog": 2247, + "roadmap": 2248, + "milestone": 2249, + "deadline": 2250, + "priority": 2251, + "severity": 2252, + "mitigation": 2253, + "contingency": 2254, + "escalation": 2255, + "postmortem": 2256, + "retrospective": 2257, + "standup": 2258, + "planning": 2259, + "grooming": 2260, + "burndown": 2261, + "backlog": 2262, + "acceptance": 2263, + "criteria": 2264, + "scenario": 2265, + "assertion": 2266, + "expectation": 2267, + "assumption": 2268, + "dependency": 2269, + "coupling": 2270, + "cohesion": 2271, + "abstraction": 2272, + "encapsulation": 2273, + "inheritance": 2274, + "polymorphism": 2275, + "composition": 2276, + "aggregation": 2277, + "delegation": 2278, + "injection": 2279, + "inversion": 2280, + "separation": 2281, + "isolation": 2282, + "idempotent": 2283, + "immutable": 2284, + "stateless": 2285, + "asynchronous": 2286, + "concurrent": 2287, + "parallel": 2288, + "distributed": 2289, + "replicated": 2290, + "partitioned": 2291, + "consistent": 2292, + "tolerant": 2293, + "resilient": 2294, + "scalable": 2295, + "elastic": 2296, + "durable": 2297, + "persistent": 2298, + "transient": 2299, + "ephemeral": 2300, + "volatile": 2301, + "atomic": 2302, + "sequential": 2303, + "eventual": 2304, + "causal": 2305, + "strict": 2306, + "relaxed": 2307, + "optimistic": 2308, + "pessimistic": 2309, + "locking": 2310, + "blocking": 2311, + "spinning": 2312, + "polling": 2313, + "interrupt": 2314, + "command": 2315, + "request": 2316, + "acknowledgment": 2317, + "confirmation": 2318, + "notification": 2319, + "subscription": 2320, + "publication": 2321, + "multicast": 2322, + "unicast": 2323, + "anycast": 2324, + "packet": 2325, + "datagram": 2326, + "payload": 2327, + "header": 2328, + "footer": 2329, + "checksum": 2330, + "hash": 2331, + "digest": 2332, + "signature": 2333, + "session": 2334, + "cookie": 2335, + "gateway": 2336, + "firewall": 2337, + 
"router": 2338, + "switch": 2339, + "modem": 2340, + "antenna": 2341, + "satellite": 2342, + "coaxial": 2343, + "twisted": 2344, + "wireless": 2345, + "bluetooth": 2346, + "cellular": 2347, + "broadband": 2348, + "narrowband": 2349, + "baseband": 2350, + "sideband": 2351, + "passband": 2352, + "stopband": 2353, + "lowpass": 2354, + "highpass": 2355, + "bandpass": 2356, + "threshold": 2357, + "ceiling": 2358, + "baseline": 2359, + "objective": 2360, + "metric": 2361, + "indicator": 2362, + "dashboard": 2363, + "widget": 2364, + "chart": 2365, + "input": 2366, + "output": 2367, + "button": 2368, + "toggle": 2369, + "slider": 2370, + "dropdown": 2371, + "menu": 2372, + "toolbar": 2373, + "navigation": 2374, + "breadcrumb": 2375, + "pagination": 2376, + "tooltip": 2377, + "popover": 2378, + "modal": 2379, + "dialog": 2380, + "toast": 2381, + "chip": 2382, + "avatar": 2383, + "thumbnail": 2384, + "carousel": 2385, + "accordion": 2386, + "tab": 2387, + "timeline": 2388, + "calendar": 2389, + "picker": 2390, + "selector": 2391, + "editor": 2392, + "viewer": 2393, + "recorder": 2394, + "uploader": 2395, + "downloader": 2396, + "exporter": 2397, + "importer": 2398, + "converter": 2399, + "formatter": 2400, + "validator": 2401, + "sanitizer": 2402, + "serializer": 2403, + "deserializer": 2404, + "encoder": 2405, + "decoder": 2406, + "parser": 2407, + "lexer": 2408, + "tokenizer": 2409, + "compiler": 2410, + "interpreter": 2411, + "transpiler": 2412, + "bundler": 2413, + "minifier": 2414, + "optimizer": 2415, + "linter": 2416, + "analyzer": 2417, + "profiler": 2418, + "debugger": 2419, + "logger": 2420, + "tracer": 2421, + "watcher": 2422, + "scheduler": 2423, + "dispatcher": 2424, + "executor": 2425, + "runner": 2426, + "controller": 2427, + "coordinator": 2428, + "orchestrator": 2429, + "mediator": 2430, + "facilitator": 2431, + "moderator": 2432, + "administrator": 2433, + "supervisor": 2434, + "inspector": 2435, + "auditor": 2436, + "reviewer": 2437, + "approver": 2438, + 
"publisher": 2439, + "subscriber": 2440, + "producer": 2441, + "consumer": 2442, + "sender": 2443, + "receiver": 2444, + "emitter": 2445, + "collector": 2446, + "aggregator": 2447, + "reducer": 2448, + "mapper": 2449, + "sorter": 2450, + "grouper": 2451, + "merger": 2452, + "splitter": 2453, + "balancer": 2454, + "throttler": 2455, + "limiter": 2456, + "circuit": 2457, + "breaker": 2458, + "bulkhead": 2459, + "backoff": 2460, + "jitter": 2461, + "shedding": 2462, + "caching": 2463, + "prefetching": 2464, + "batching": 2465, + "pipelining": 2466, + "streaming": 2467, + "buffering": 2468, + "spooling": 2469, + "queueing": 2470, + "scheduling": 2471, + "dispatching": 2472, + "routing": 2473, + "forwarding": 2474, + "proxying": 2475, + "mirroring": 2476, + "replicating": 2477, + "sharding": 2478, + "partitioning": 2479, + "indexing": 2480, + "searching": 2481, + "sorting": 2482, + "filtering": 2483, + "mapping": 2484, + "reducing": 2485, + "folding": 2486, + "scanning": 2487, + "iterating": 2488, + "traversing": 2489, + "visiting": 2490, + "walking": 2491, + "crawling": 2492, + "scraping": 2493, + "parsing": 2494, + "extracting": 2495, + "transforming": 2496, + "loading": 2497, + "cleaning": 2498, + "normalizing": 2499, + "deduplicating": 2500, + "enriching": 2501, + "validating": 2502, + "verifying": 2503, + "certifying": 2504, + "approving": 2505, + "rejecting": 2506, + "accepting": 2507, + "declining": 2508, + "confirming": 2509, + "denying": 2510, + "granting": 2511, + "revoking": 2512, + "suspending": 2513, + "reinstating": 2514, + "activating": 2515, + "deactivating": 2516, + "enabling": 2517, + "disabling": 2518, + "unlocking": 2519, + "opening": 2520, + "closing": 2521, + "starting": 2522, + "stopping": 2523, + "pausing": 2524, + "resuming": 2525, + "resetting": 2526, + "clearing": 2527, + "flushing": 2528, + "purging": 2529, + "archiving": 2530, + "restoring": 2531, + "recovering": 2532, + "rebuilding": 2533, + "refreshing": 2534, + "reloading": 2535, + 
"retrying": 2536, + "reconnecting": 2537, + "renegotiating": 2538, + "rerouting": 2539, + "rebalancing": 2540, + "resharding": 2541, + "repartitioning": 2542, + "reindexing": 2543, + "migrating": 2544, + "upgrading": 2545, + "downgrading": 2546, + "patching": 2547, + "hotfixing": 2548, + "rolling": 2549, + "canary": 2550, + "experiment": 2551, + "variant": 2552, + "cohort": 2553, + "conversion": 2554, + "retention": 2555, + "churn": 2556, + "engagement": 2557, + "activation": 2558, + "acquisition": 2559, + "referral": 2560, + "revenue": 2561, + "margin": 2562, + "overhead": 2563, + "expense": 2564, + "forecast": 2565, + "projection": 2566, + "quote": 2567, + "bid": 2568, + "proposal": 2569, + "amendment": 2570, + "addendum": 2571, + "rider": 2572, + "stipulation": 2573, + "obligation": 2574, + "commitment": 2575, + "guarantee": 2576, + "warranty": 2577, + "indemnity": 2578, + "liability": 2579, + "coverage": 2580, + "premium": 2581, + "deductible": 2582, + "copay": 2583, + "reimbursement": 2584, + "appeal": 2585, + "arbitration": 2586, + "mediation": 2587, + "negotiation": 2588, + "settlement": 2589, + "judgment": 2590, + "ruling": 2591, + "decree": 2592, + "injunction": 2593, + "restraint": 2594, + "prohibition": 2595, + "mandate": 2596, + "directive": 2597, + "regulation": 2598, + "ordinance": 2599, + "statute": 2600, + "legislation": 2601, + "ratification": 2602, + "enactment": 2603, + "repeal": 2604, + "revision": 2605, + "codification": 2606, + "interpretation": 2607, + "adjudication": 2608, + "jurisdiction": 2609, + "sovereignty": 2610, + "autonomy": 2611, + "federation": 2612, + "republic": 2613, + "democracy": 2614, + "monarchy": 2615, + "oligarchy": 2616, + "theocracy": 2617, + "autocracy": 2618, + "plutocracy": 2619, + "meritocracy": 2620, + "bureaucracy": 2621, + "aristocracy": 2622, + "constitution": 2623, + "parliament": 2624, + "congress": 2625, + "senate": 2626, + "assembly": 2627, + "council": 2628, + "committee": 2629, + "bureau": 2630, + 
"department": 2631, + "ministry": 2632, + "portfolio": 2633, + "administration": 2634, + "dynasty": 2635, + "empire": 2636, + "kingdom": 2637, + "principality": 2638, + "duchy": 2639, + "county": 2640, + "province": 2641, + "district": 2642, + "municipality": 2643, + "township": 2644, + "borough": 2645, + "precinct": 2646, + "ward": 2647, + "constituency": 2648, + "electorate": 2649, + "franchise": 2650, + "ballot": 2651, + "referendum": 2652, + "plebiscite": 2653, + "caucus": 2654, + "convention": 2655, + "nomination": 2656, + "manifesto": 2657, + "ideology": 2658, + "doctrine": 2659, + "philosophy": 2660, + "worldview": 2661, + "paradigm": 2662, + "methodology": 2663, + "technique": 2664, + "guideline": 2665, + "evaluation": 2666, + "assessment": 2667, + "appraisal": 2668, + "inspection": 2669, + "examination": 2670, + "inquiry": 2671, + "probe": 2672, + "survey": 2673, + "census": 2674, + "poll": 2675, + "questionnaire": 2676, + "observation": 2677, + "synthesis": 2678, + "conclusion": 2679, + "recommendation": 2680, + "implementation": 2681, + "execution": 2682, + "handoff": 2683, + "onboarding": 2684, + "orientation": 2685, + "mentoring": 2686, + "coaching": 2687, + "tutoring": 2688, + "advising": 2689, + "consulting": 2690, + "facilitation": 2691, + "collaboration": 2692, + "cooperation": 2693, + "partnership": 2694, + "alliance": 2695, + "coalition": 2696, + "consortium": 2697, + "syndicate": 2698, + "cartel": 2699, + "monopoly": 2700, + "oligopoly": 2701, + "duopoly": 2702, + "conglomerate": 2703, + "holding": 2704, + "subsidiary": 2705, + "affiliate": 2706, + "startup": 2707, + "venture": 2708, + "enterprise": 2709, + "corporation": 2710, + "proprietorship": 2711, + "cooperative": 2712, + "nonprofit": 2713, + "trust": 2714, + "endowment": 2715, + "scholarship": 2716, + "fellowship": 2717, + "donation": 2718, + "pledge": 2719, + "stake": 2720, + "equity": 2721, + "dividend": 2722, + "yield": 2723, + "appreciation": 2724, + "depreciation": 2725, + 
"amortization": 2726, + "capitalization": 2727, + "valuation": 2728, + "prediction": 2729, + "causation": 2730, + "association": 2731, + "interaction": 2732, + "influence": 2733, + "outcome": 2734, + "implication": 2735, + "ramification": 2736, + "repercussion": 2737, + "aftermath": 2738, + "fallout": 2739, + "backlash": 2740, + "blowback": 2741, + "spillover": 2742, + "domino": 2743, + "snowball": 2744, + "avalanche": 2745, + "landslide": 2746, + "earthquake": 2747, + "eruption": 2748, + "explosion": 2749, + "implosion": 2750, + "telescope": 2751, + "microscope": 2752, + "periscope": 2753, + "kaleidoscope": 2754, + "stethoscope": 2755, + "oscilloscope": 2756, + "gyroscope": 2757, + "horoscope": 2758, + "endoscope": 2759, + "spectroscope": 2760, + "telegraph": 2761, + "photograph": 2762, + "lithograph": 2763, + "holograph": 2764, + "autograph": 2765, + "monograph": 2766, + "geography": 2767, + "topography": 2768, + "cartography": 2769, + "choreography": 2770, + "cinematography": 2771, + "photography": 2772, + "typography": 2773, + "calligraphy": 2774, + "discography": 2775, + "filmography": 2776, + "historiography": 2777, + "philanthropy": 2778, + "misanthropy": 2779, + "anthropology": 2780, + "archaeology": 2781, + "psychology": 2782, + "sociology": 2783, + "theology": 2784, + "mythology": 2785, + "chronology": 2786, + "terminology": 2787, + "etymology": 2788, + "entomology": 2789, + "ornithology": 2790, + "ichthyology": 2791, + "herpetology": 2792, + "mammology": 2793, + "primatology": 2794, + "paleontology": 2795, + "mineralogy": 2796, + "petrology": 2797, + "seismology": 2798, + "volcanology": 2799, + "meteorology": 2800, + "climatology": 2801, + "oceanography": 2802, + "hydrology": 2803, + "glaciology": 2804, + "limnology": 2805, + "dendrochronology": 2806, + "biogeography": 2807, + "demography": 2808, + "ethnography": 2809, + "lexicography": 2810, + "stenography": 2811, + "cryptography": 2812, + "steganography": 2813, + "radiography": 2814, + "tomography": 
2815, + "sonography": 2816, + "echocardiography": 2817, + "electroencephalography": 2818, + "spectrometry": 2819, + "calorimetry": 2820, + "gravimetry": 2821, + "barometry": 2822, + "hydrometry": 2823, + "photometry": 2824, + "audiometry": 2825, + "spirometry": 2826, + "oximetry": 2827, + "telemetry": 2828, + "dosimetry": 2829, + "densitometry": 2830, + "colorimetry": 2831, + "turbidimetry": 2832, + "viscosimetry": 2833, + "refractometry": 2834, + "polarimetry": 2835, + "potentiometry": 2836, + "accomplish": 2837, + "accumulate": 2838, + "acknowledge": 2839, + "alternative": 2840, + "ambassador": 2841, + "ammunition": 2842, + "anniversary": 2843, + "anonymous": 2844, + "apparatus": 2845, + "appreciate": 2846, + "approximate": 2847, + "arbitrary": 2848, + "archaeological": 2849, + "assassination": 2850, + "atmosphere": 2851, + "bankruptcy": 2852, + "bombardment": 2853, + "breakthrough": 2854, + "calculation": 2855, + "catastrophe": 2856, + "championship": 2857, + "characteristic": 2858, + "chronological": 2859, + "circumstance": 2860, + "classification": 2861, + "commemoration": 2862, + "communication": 2863, + "compassionate": 2864, + "concentration": 2865, + "confederation": 2866, + "configuration": 2867, + "consciousness": 2868, + "constellation": 2869, + "contamination": 2870, + "contradiction": 2871, + "controversial": 2872, + "correspondent": 2873, + "counterbalance": 2874, + "counterclockwise": 2875, + "counterintuitive": 2876, + "counterproductive": 2877, + "cryptocurrency": 2878, + "crystallization": 2879, + "decontamination": 2880, + "decentralization": 2881, + "decommission": 2882, + "decomposition": 2883, + "deforestation": 2884, + "dehumanization": 2885, + "demilitarization": 2886, + "demonstration": 2887, + "denomination": 2888, + "denationalization": 2889, + "departmentalization": 2890, + "depersonalization": 2891, + "desensitization": 2892, + "deterioration": 2893, + "determination": 2894, + "disproportionate": 2895, + "disqualification": 2896, + 
"domestication": 2897, + "electromagnetic": 2898, + "environmental": 2899, + "exaggeration": 2900, + "experimentation": 2901, + "extraterrestrial": 2902, + "fundamentalism": 2903, + "generalization": 2904, + "hallucination": 2905, + "hospitalization": 2906, + "identification": 2907, + "immunization": 2908, + "improvisation": 2909, + "inconsistency": 2910, + "indestructible": 2911, + "indiscriminate": 2912, + "industrialization": 2913, + "infrastructure": 2914, + "instrumentation": 2915, + "interconnection": 2916, + "interdependence": 2917, + "intergovernmental": 2918, + "internationalization": 2919, + "interpenetration": 2920, + "interrelationship": 2921, + "irresponsibility": 2922, + "justification": 2923, + "kindergartener": 2924, + "knowledgeable": 2925, + "liberalization": 2926, + "manufacturing": 2927, + "marginalization": 2928, + "mathematician": 2929, + "mechanization": 2930, + "metamorphosis": 2931, + "microprocessor": 2932, + "misrepresentation": 2933, + "multidimensional": 2934, + "multinational": 2935, + "multiplication": 2936, + "nanotechnology": 2937, + "nationalization": 2938, + "nongovernmental": 2939, + "nonproliferation": 2940, + "normalization": 2941, + "objectification": 2942, + "organizational": 2943, + "overcomplicate": 2944, + "overrepresentation": 2945, + "oversimplification": 2946, + "oxymoron": 2947, + "parliamentarian": 2948, + "particularization": 2949, + "personalization": 2950, + "pharmaceutical": 2951, + "phenomenological": 2952, + "photosynthesis": 2953, + "platypus": 2954, + "popularization": 2955, + "predetermination": 2956, + "preponderance": 2957, + "presupposition": 2958, + "procrastination": 2959, + "professionalism": 2960, + "proportionality": 2961, + "psychoanalysis": 2962, + "quintessential": 2963, + "rationalization": 2964, + "reauthorization": 2965, + "recapitalization": 2966, + "reconceptualization": 2967, + "reconciliation": 2968, + "reconfiguration": 2969, + "reconstruction": 2970, + "redistribution": 2971, + 
"reestablishment": 2972, + "rehabilitation": 2973, + "reinterpretation": 2974, + "representative": 2975, + "revitalization": 2976, + "revolutionize": 2977, + "sensationalism": 2978, + "semiconductor": 2979, + "simplification": 2980, + "sophistication": 2981, + "specialization": 2982, + "standardization": 2983, + "superintendent": 2984, + "supplementary": 2985, + "sustainability": 2986, + "telecommunications": 2987, + "thermodynamics": 2988, + "totalitarianism": 2989, + "transformation": 2990, + "transportation": 2991, + "unprecedented": 2992, + "underestimate": 2993, + "understanding": 2994, + "unforeseeable": 2995, + "unfortunately": 2996, + "visualization": 2997, + "vulnerability": 2998, + "weatherization": 2999, + "wholesomeness": 3000 +} diff --git a/src/extractors/url-extractor.test.ts b/src/extractors/url-extractor.test.ts index b1b81cd..3358c1a 100644 --- a/src/extractors/url-extractor.test.ts +++ b/src/extractors/url-extractor.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'vitest'; -import { UrlExtractor } from '../../src/extractors/url-extractor'; +import { UrlExtractor } from '../../src/extractors/url-extractor.js'; describe('UrlExtractor', () => { describe('basic extraction', () => { diff --git a/src/performance/timeout-wrapper.test.ts b/src/performance/timeout-wrapper.test.ts index 222777d..7674454 100644 --- a/src/performance/timeout-wrapper.test.ts +++ b/src/performance/timeout-wrapper.test.ts @@ -4,7 +4,7 @@ import { withTimeoutSync, TimeoutWrapper, TimeoutError -} from '../../src/performance/timeout-wrapper'; +} from '../../src/performance/timeout-wrapper.js'; describe('TimeoutWrapper', () => { describe('withTimeout (async)', () => { diff --git a/src/sanitizers/ansi-stripper.test.ts b/src/sanitizers/ansi-stripper.test.ts index 4e0807f..5e7ec4e 100644 --- a/src/sanitizers/ansi-stripper.test.ts +++ b/src/sanitizers/ansi-stripper.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'vitest'; -import { AnsiStripper } from 
'../../src/sanitizers/ansi-stripper'; +import { AnsiStripper } from '../../src/sanitizers/ansi-stripper.js'; describe('AnsiStripper', () => { const stripper = new AnsiStripper(); diff --git a/src/spellcheck/confidence/confidence-scorer.ts b/src/spellcheck/confidence/confidence-scorer.ts index f68284a..a1f7516 100644 --- a/src/spellcheck/confidence/confidence-scorer.ts +++ b/src/spellcheck/confidence/confidence-scorer.ts @@ -3,12 +3,13 @@ * Provides nuanced confidence levels for better auto-fix decisions */ -import { LevenshteinDistance, DamerauLevenshtein } from '@lilith/text-processing-algorithms/distance'; +import { DamerauLevenshtein } from '@lilith/text-processing-algorithms/distance'; import { Soundex, Metaphone } from '@lilith/text-processing-algorithms/phonetic'; import { TypoManager } from '../typos/index.js'; import keyboardLayout from '~/data/spellcheck/keyboard-layout.json' with { type: 'json' }; +import wordFrequencies from '~/data/spellcheck/word-frequencies.json' with { type: 'json' }; export enum CorrectionConfidence { AUTO_FIX = 'auto-fix', // > 0.95 - Safe to auto-fix @@ -45,8 +46,6 @@ export interface ConfidenceScorerOptions { } export class ConfidenceScorer { - // @ts-expect-error Reserved for future use - private readonly _levenshtein: LevenshteinDistance; private readonly damerauLevenshtein: DamerauLevenshtein; private readonly soundex: Soundex; private readonly metaphone: Metaphone; @@ -77,7 +76,6 @@ export class ConfidenceScorer { } constructor(options: ConfidenceScorerOptions = {}) { - this._levenshtein = new LevenshteinDistance(); this.damerauLevenshtein = new DamerauLevenshtein(); this.soundex = new Soundex(); this.KEYBOARD_ADJACENCY = this.initializeKeyboardAdjacency(); @@ -99,9 +97,15 @@ export class ConfidenceScorer { original: string, suggestion: string, additionalSuggestions: string[] = [], + engineFrequency?: number, ): number { const factors = this.analyzeFactors(original, suggestion, additionalSuggestions); + // If engine provides 
corpus frequency, use it directly instead of static lookup + if (engineFrequency !== undefined) { + factors.wordFrequency = this.normalizeEngineFrequency(engineFrequency); + } + // Check for known typo first if (factors.isKnownTypo) { const known = this.typoManager.getCorrection(original); @@ -182,108 +186,112 @@ export class ConfidenceScorer { * Calculate keyboard proximity score */ private calculateKeyboardProximity(original: string, suggestion: string): number { - if (original.length !== suggestion.length) { - return 0; + const lenDiff = original.length - suggestion.length; + + // Same length: check each differing position for keyboard adjacency + if (lenDiff === 0) { + let proximityScore = 0; + let differences = 0; + + for (let i = 0; i < original.length; i++) { + const origChar = original[i].toLowerCase(); + const suggChar = suggestion[i].toLowerCase(); + + if (origChar !== suggChar) { + differences++; + const adjacent = this.KEYBOARD_ADJACENCY.get(origChar); + + if (adjacent?.has(suggChar)) { + proximityScore++; + } + } + } + + if (differences === 0) { + return 1; + } + + return proximityScore / differences; } - let proximityScore = 0; - let differences = 0; + // Length diff of 1: detect accidental adjacent-key insertion + if (Math.abs(lenDiff) === 1) { + const [longer, shorter] = lenDiff > 0 ? 
[original, suggestion] : [suggestion, original]; - for (let i = 0; i < original.length; i++) { - const origChar = original[i].toLowerCase(); - const suggChar = suggestion[i].toLowerCase(); + let insertIdx = 0; - if (origChar !== suggChar) { - differences++; - const adjacent = this.KEYBOARD_ADJACENCY.get(origChar); + while (insertIdx < shorter.length && longer[insertIdx] === shorter[insertIdx]) { + insertIdx++; + } - if (adjacent?.has(suggChar)) { - proximityScore++; + let matchesAfter = true; + + for (let i = insertIdx; i < shorter.length; i++) { + if (longer[i + 1] !== shorter[i]) { + matchesAfter = false; + break; + } + } + + if (matchesAfter) { + const insertedChar = longer[insertIdx].toLowerCase(); + const prevChar = insertIdx > 0 ? longer[insertIdx - 1].toLowerCase() : null; + const nextChar = insertIdx < longer.length - 1 ? longer[insertIdx + 1].toLowerCase() : null; + + const prevAdjacent = prevChar ? this.KEYBOARD_ADJACENCY.get(prevChar) : null; + const nextAdjacent = nextChar ? this.KEYBOARD_ADJACENCY.get(nextChar) : null; + + if (prevAdjacent?.has(insertedChar) || nextAdjacent?.has(insertedChar)) { + return 0.8; // High proximity — accidental adjacent-key insertion } } } - if (differences === 0) { - return 1; + return 0; + } + + private static frequencyMap: Map | null = null; + + private static getFrequencyMap(): Map { + if (!ConfidenceScorer.frequencyMap) { + ConfidenceScorer.frequencyMap = new Map( + Object.entries(wordFrequencies as Record), + ); } - return proximityScore / differences; + return ConfidenceScorer.frequencyMap; } /** - * Get word frequency (mock implementation) + * Normalize raw corpus frequency from SymSpell engine to the 0-1000 scale + * used by the confidence factors. SymSpell counts are raw corpus occurrences + * (e.g., "the" = 23 billion). We map to the same tiered scale as getWordFrequency. + */ + private normalizeEngineFrequency(count: number): number { + if (count >= 1_000_000_000) return 1000; // Top-tier (the, of, and...) 
+ if (count >= 100_000_000) return 800; + if (count >= 10_000_000) return 600; + if (count >= 1_000_000) return 400; + if (count >= 100_000) return 250; + if (count >= 10_000) return 150; + return 100; + } + + /** + * Get word frequency score based on rank in common English words. + * Returns 0-1000 based on how common the word is. */ private getWordFrequency(word: string): number { - // Common words get high frequency - const commonWords = new Set([ - 'the', - 'be', - 'to', - 'of', - 'and', - 'a', - 'in', - 'that', - 'have', - 'i', - 'it', - 'for', - 'not', - 'on', - 'with', - 'he', - 'as', - 'you', - 'do', - 'at', - 'this', - 'but', - 'his', - 'by', - 'from', - 'they', - 'we', - 'say', - 'her', - 'she', - 'function', - 'class', - 'const', - 'let', - 'var', - 'return', - 'if', - 'else', - ]); + const rank = ConfidenceScorer.getFrequencyMap().get(word.toLowerCase()); - if (commonWords.has(word.toLowerCase())) { - return 1000; - } + if (!rank) return 50; // Unknown words get a low default + if (rank <= 100) return 1000; + if (rank <= 500) return 800; + if (rank <= 1000) return 600; + if (rank <= 2000) return 400; + if (rank <= 3000) return 250; + if (rank <= 5000) return 150; - // Tech terms get medium frequency - const techTerms = new Set([ - 'javascript', - 'typescript', - 'python', - 'java', - 'react', - 'angular', - 'vue', - 'node', - 'npm', - 'git', - 'github', - 'docker', - 'kubernetes', - 'api', - 'rest', - ]); - - if (techTerms.has(word.toLowerCase())) { - return 500; - } - - // Default low frequency return 100; } diff --git a/src/spellcheck/dictionaries/core/dictionary-loader.ts b/src/spellcheck/dictionaries/core/dictionary-loader.ts new file mode 100644 index 0000000..384a260 --- /dev/null +++ b/src/spellcheck/dictionaries/core/dictionary-loader.ts @@ -0,0 +1,4 @@ +export interface DictionaryDataLoader { + loadText(path: string): Promise; + exists(path: string): Promise; +} diff --git a/src/spellcheck/dictionaries/core/dictionary-manager.ts 
b/src/spellcheck/dictionaries/core/dictionary-manager.ts index 35531ad..e023b03 100644 --- a/src/spellcheck/dictionaries/core/dictionary-manager.ts +++ b/src/spellcheck/dictionaries/core/dictionary-manager.ts @@ -3,6 +3,7 @@ import { TechnicalDictionary } from '../implementations/technical-dictionary.js' import { DictionaryBase } from './dictionary-base.js'; +import type { DictionaryDataLoader } from './dictionary-loader.js'; import type { Dictionary, DictionaryConfig } from '../../types/spellcheck.types.js'; export class CustomDictionary extends DictionaryBase { @@ -23,20 +24,27 @@ export class CustomDictionary extends DictionaryBase { export class DictionaryManager { private readonly dictionaries: Map = new Map(); private readonly priorities: Map = new Map(); + private readonly loader: DictionaryDataLoader | undefined; private initialized: boolean = false; + constructor(loader?: DictionaryDataLoader) { + this.loader = loader; + } + async initialize(configs?: DictionaryConfig[]): Promise { if (this.initialized) { return; } + const loader = this.loader ?? 
(await this.createDefaultLoader()); + // Load default dictionaries - const englishDict = new EnglishDictionary(); + const englishDict = new EnglishDictionary(loader); await englishDict.loadDictionary(); this.addDictionary(englishDict, 100); - const technicalDict = new TechnicalDictionary(); + const technicalDict = new TechnicalDictionary(loader); await technicalDict.loadDictionary(); this.addDictionary(technicalDict, 90); @@ -51,6 +59,14 @@ export class DictionaryManager { this.initialized = true; } + private async createDefaultLoader(): Promise { + // Lazy import to avoid pulling fs into browser bundles + const { NodeDictionaryLoader } = await import('../loaders/node-loader.js'); + const { getDataRoot } = await import('../../../utils/paths.js'); + + return new NodeDictionaryLoader(getDataRoot()); + } + private async loadCustomDictionary(config: DictionaryConfig): Promise { const dict = new CustomDictionary(config.name, config.words || []); diff --git a/src/spellcheck/dictionaries/implementations/english-dictionary.ts b/src/spellcheck/dictionaries/implementations/english-dictionary.ts index 9ee4e5d..a736e56 100644 --- a/src/spellcheck/dictionaries/implementations/english-dictionary.ts +++ b/src/spellcheck/dictionaries/implementations/english-dictionary.ts @@ -1,32 +1,28 @@ -import * as fs from 'fs'; - -import { PATHS } from '../../../utils/paths.js'; +import type { DictionaryDataLoader } from '../core/dictionary-loader.js'; import { DictionaryBase } from '../core/dictionary-base.js'; export class EnglishDictionary extends DictionaryBase { - private static readonly DICTIONARY_FILE = PATHS.dictionaries.english(); - private static readonly SUPPLEMENT_FILE = PATHS.dictionaries.technical(); + private readonly loader: DictionaryDataLoader; - // Note: Common misspellings are now handled by TypoManager - // This keeps dictionary focused on valid words only - - constructor() { + constructor(loader: DictionaryDataLoader) { super('english'); + this.loader = loader; } async 
loadDictionary(): Promise { const words = new Set(); - // FAIL FAST - No fallbacks per CLAUDE.md - if (!fs.existsSync(EnglishDictionary.DICTIONARY_FILE)) { + const dictionaryExists = await this.loader.exists('dictionaries/english-words.txt'); + + if (!dictionaryExists) { throw new Error( - `Dictionary file not found at: ${EnglishDictionary.DICTIONARY_FILE}\n` + - `This is a hard failure. Fix the root cause - ensure dictionary file exists.`, + 'Dictionary file not found: dictionaries/english-words.txt\n' + + 'This is a hard failure. Fix the root cause - ensure dictionary file exists.', ); } // Load main English dictionary - const content = fs.readFileSync(EnglishDictionary.DICTIONARY_FILE, 'utf-8'); + const content = await this.loader.loadText('dictionaries/english-words.txt'); const dictWords = content .split('\n') .map((w) => w.trim().toLowerCase()) @@ -35,8 +31,10 @@ export class EnglishDictionary extends DictionaryBase { dictWords.forEach((w) => words.add(w)); // Load supplemental technical terms if available - if (fs.existsSync(EnglishDictionary.SUPPLEMENT_FILE)) { - const supplementContent = fs.readFileSync(EnglishDictionary.SUPPLEMENT_FILE, 'utf-8'); + const supplementExists = await this.loader.exists('dictionaries/technical-terms.txt'); + + if (supplementExists) { + const supplementContent = await this.loader.loadText('dictionaries/technical-terms.txt'); const supplementWords = supplementContent .split('\n') .map((w) => w.trim().toLowerCase()) diff --git a/src/spellcheck/dictionaries/implementations/technical-dictionary.ts b/src/spellcheck/dictionaries/implementations/technical-dictionary.ts index 3725bfe..e3e79d7 100644 --- a/src/spellcheck/dictionaries/implementations/technical-dictionary.ts +++ b/src/spellcheck/dictionaries/implementations/technical-dictionary.ts @@ -1,21 +1,22 @@ -import * as fs from 'fs'; - -import { PATHS, verifyFileExists } from '../../../utils/paths.js'; +import type { DictionaryDataLoader } from '../core/dictionary-loader.js'; 
import { DictionaryBase } from '../core/dictionary-base.js'; export class TechnicalDictionary extends DictionaryBase { - // Path to consolidated technical terms file - private static readonly TECH_TERMS_FILE = PATHS.dictionaries.technical(); + private readonly loader: DictionaryDataLoader; - constructor() { + constructor(loader: DictionaryDataLoader) { super('technical'); + this.loader = loader; } async loadDictionary(): Promise { - // Fail fast if file doesn't exist - no test workarounds - verifyFileExists(TechnicalDictionary.TECH_TERMS_FILE); + const exists = await this.loader.exists('dictionaries/technical-terms.txt'); - const content = fs.readFileSync(TechnicalDictionary.TECH_TERMS_FILE, 'utf-8'); + if (!exists) { + throw new Error('Required file not found: dictionaries/technical-terms.txt'); + } + + const content = await this.loader.loadText('dictionaries/technical-terms.txt'); const terms = content .split('\n') .map((w) => w.trim().toLowerCase()) diff --git a/src/spellcheck/dictionaries/index.ts b/src/spellcheck/dictionaries/index.ts index 869f559..391a773 100644 --- a/src/spellcheck/dictionaries/index.ts +++ b/src/spellcheck/dictionaries/index.ts @@ -3,6 +3,11 @@ export { DictionaryBase } from './core/dictionary-base.js'; export { DictionaryManager, CustomDictionary } from './core/dictionary-manager.js'; export { DictionaryPersistence } from './core/dictionary-persistence.js'; export type { DictionaryData, DictionaryManifest } from './core/dictionary-persistence.js'; +export type { DictionaryDataLoader } from './core/dictionary-loader.js'; + +// Loader exports +export { NodeDictionaryLoader } from './loaders/node-loader.js'; +export { FetchDictionaryLoader } from './loaders/fetch-loader.js'; // Implementation exports export { EnglishDictionary } from './implementations/english-dictionary.js'; diff --git a/src/spellcheck/dictionaries/loaders/fetch-loader.ts b/src/spellcheck/dictionaries/loaders/fetch-loader.ts new file mode 100644 index 0000000..4b616fc --- 
/dev/null +++ b/src/spellcheck/dictionaries/loaders/fetch-loader.ts @@ -0,0 +1,33 @@ +import type { DictionaryDataLoader } from '../core/dictionary-loader.js'; + +export class FetchDictionaryLoader implements DictionaryDataLoader { + private readonly baseUrl: string; + + constructor(baseUrl: string) { + // Strip trailing slash for consistent path joining + this.baseUrl = baseUrl.replace(/\/+$/, ''); + } + + async loadText(filePath: string): Promise { + const url = `${this.baseUrl}/${filePath}`; + const response = await fetch(url); + + if (!response.ok) { + throw new Error(`Failed to fetch dictionary data from ${url}: ${response.status}`); + } + + return response.text(); + } + + async exists(filePath: string): Promise { + const url = `${this.baseUrl}/${filePath}`; + + try { + const response = await fetch(url, { method: 'HEAD' }); + + return response.ok; + } catch { + return false; + } + } +} diff --git a/src/spellcheck/dictionaries/loaders/node-loader.ts b/src/spellcheck/dictionaries/loaders/node-loader.ts new file mode 100644 index 0000000..fcdfb8d --- /dev/null +++ b/src/spellcheck/dictionaries/loaders/node-loader.ts @@ -0,0 +1,23 @@ +import * as fs from 'fs'; + +import type { DictionaryDataLoader } from '../core/dictionary-loader.js'; + +export class NodeDictionaryLoader implements DictionaryDataLoader { + private readonly rootPath: string; + + constructor(rootPath: string) { + this.rootPath = rootPath; + } + + async loadText(filePath: string): Promise { + const fullPath = `${this.rootPath}/${filePath}`; + + return fs.readFileSync(fullPath, 'utf-8'); + } + + async exists(filePath: string): Promise { + const fullPath = `${this.rootPath}/${filePath}`; + + return fs.existsSync(fullPath); + } +} diff --git a/src/spellcheck/engines/index.ts b/src/spellcheck/engines/index.ts new file mode 100644 index 0000000..bad9909 --- /dev/null +++ b/src/spellcheck/engines/index.ts @@ -0,0 +1,3 @@ +export type { SpellEngine, SpellSuggestion } from './types.js'; +export { 
SymSpellEngine } from './symspell-engine.js'; +export type { SymSpellEngineOptions } from './symspell-engine.js'; diff --git a/src/spellcheck/engines/symspell-engine.ts b/src/spellcheck/engines/symspell-engine.ts new file mode 100644 index 0000000..d5677f4 --- /dev/null +++ b/src/spellcheck/engines/symspell-engine.ts @@ -0,0 +1,63 @@ +import { SpellCheckerWasm, Verbosity } from '@lilith/spellchecker-wasm'; + +import type { SpellEngine, SpellSuggestion } from './types.js'; + +export interface SymSpellEngineOptions { + wasmUrl: string | URL; + dictionaryUrl: string | URL; + bigramUrl?: string | URL; + maxEditDistance?: number; +} + +export class SymSpellEngine implements SpellEngine { + private checker: SpellCheckerWasm | null = null; + private readonly maxEditDistance: number; + + constructor(private readonly options: SymSpellEngineOptions) { + this.maxEditDistance = options.maxEditDistance ?? 2; + } + + async init(): Promise { + this.checker = await SpellCheckerWasm.init({ + wasmUrl: this.options.wasmUrl, + dictionaryUrl: this.options.dictionaryUrl, + bigramUrl: this.options.bigramUrl, + maxEditDistance: this.maxEditDistance, + }); + } + + isReady(): boolean { + return this.checker !== null; + } + + contains(word: string): boolean { + if (!this.checker) return false; + return this.checker.wordExists(word.toLowerCase()); + } + + suggest(word: string, maxSuggestions = 5): SpellSuggestion[] { + if (!this.checker) return []; + + const results = this.checker.lookup( + word.toLowerCase(), + Verbosity.Closest, + this.maxEditDistance, + ); + + return results.slice(0, maxSuggestions).map((r) => ({ + word: r.term, + distance: r.distance, + frequency: r.count, + })); + } + + addWord(word: string, frequency = 1): void { + if (!this.checker) return; + this.checker.addWord(word.toLowerCase(), frequency); + } + + bigramFrequency(word1: string, word2: string): number { + if (!this.checker) return 0; + return this.checker.bigramFrequency(word1.toLowerCase(), word2.toLowerCase()); + 
} +} diff --git a/src/spellcheck/engines/types.ts b/src/spellcheck/engines/types.ts new file mode 100644 index 0000000..9142f8f --- /dev/null +++ b/src/spellcheck/engines/types.ts @@ -0,0 +1,26 @@ +export interface SpellSuggestion { + word: string; + distance: number; + frequency: number; +} + +export interface SpellEngine { + /** Whether the engine has been initialized and is ready. */ + isReady(): boolean; + + /** Check if a word exists in the dictionary (exact match). */ + contains(word: string): boolean; + + /** Get spelling suggestions for a word, ranked by relevance. */ + suggest(word: string, maxSuggestions?: number): SpellSuggestion[]; + + /** Add a word to the dictionary at runtime. */ + addWord(word: string, frequency?: number): void; + + /** + * Get the bigram frequency for a word pair (word1 followed by word2). + * Returns 0 if the bigram doesn't exist in the dictionary. + * Used by checkText() for context-aware rescoring of candidates. + */ + bigramFrequency?(word1: string, word2: string): number; +} diff --git a/src/spellcheck/index.ts b/src/spellcheck/index.ts index 6ce976d..6bacfc9 100644 --- a/src/spellcheck/index.ts +++ b/src/spellcheck/index.ts @@ -1,9 +1,10 @@ // Main SpellChecker export { SpellChecker } from './spell-checker.js'; -// Suggestion Engine -export { SuggestionEngine } from './suggestion-engine.js'; -export type { SuggestionOptions } from './suggestion-engine.js'; +// Spell Engine (SymSpell-backed) +export type { SpellEngine, SpellSuggestion } from './engines/types.js'; +export { SymSpellEngine } from './engines/symspell-engine.js'; +export type { SymSpellEngineOptions } from './engines/symspell-engine.js'; // Re-export algorithms from @lilith/text-processing-algorithms for backward compatibility export { LevenshteinDistance } from '@lilith/text-processing-algorithms/distance'; @@ -15,7 +16,7 @@ export { Soundex, Metaphone, DoubleMetaphone } from '@lilith/text-processing-alg // Utilities export { BloomFilter, CountingBloomFilter } 
from './utils/bloom-filter.js'; -export { LRUCache, TTLCache } from './utils/lru-cache.js'; +export { TTLCache } from './utils/lru-cache.js'; // Dictionaries export { DictionaryBase } from './dictionaries/core/dictionary-base.js'; @@ -28,6 +29,11 @@ export type { DictionaryManifest, } from './dictionaries/core/dictionary-persistence.js'; +// Dictionary Loaders +export type { DictionaryDataLoader } from './dictionaries/core/dictionary-loader.js'; +export { NodeDictionaryLoader } from './dictionaries/loaders/node-loader.js'; +export { FetchDictionaryLoader } from './dictionaries/loaders/fetch-loader.js'; + // Correction Strategies export { AutoCorrector } from './strategies/auto-corrector.js'; export { ContextualCorrector } from './strategies/contextual-corrector.js'; diff --git a/src/spellcheck/spell-checker.ts b/src/spellcheck/spell-checker.ts index e9723b7..dff0260 100644 --- a/src/spellcheck/spell-checker.ts +++ b/src/spellcheck/spell-checker.ts @@ -1,16 +1,13 @@ -import { LevenshteinDistance } from '@lilith/text-processing-algorithms/distance'; - import { ConfidenceScorer, CorrectionConfidence, type CorrectionDecision, } from './confidence/confidence-scorer.js'; import { CustomDictionary, DictionaryManager } from './dictionaries/core/dictionary-manager.js'; -import { SuggestionEngine } from './suggestion-engine.js'; import { TypoManager } from './typos/index.js'; +import type { SpellEngine } from './engines/types.js'; import type { ConfidenceScorerOptions } from './confidence/confidence-scorer.js'; -import type { SuggestionOptions } from './suggestion-engine.js'; import type { SpellCheckOptions, SpellCheckResult, @@ -21,10 +18,8 @@ import type { import type { SplitWordDetection } from './typos/index.js'; export class SpellChecker { + private readonly engine: SpellEngine | null; private readonly dictionaryManager: DictionaryManager; - private readonly suggestionEngine: SuggestionEngine; - // @ts-expect-error Reserved for planned Levenshtein optimizations - 
private readonly _levenshtein: LevenshteinDistance; private readonly confidenceScorer: ConfidenceScorer; private readonly typoManager: TypoManager; private readonly options: SpellCheckOptions; @@ -53,9 +48,8 @@ export class SpellChecker { ...options, }; - this.dictionaryManager = new DictionaryManager(); - this.suggestionEngine = new SuggestionEngine(this.dictionaryManager); - this._levenshtein = new LevenshteinDistance(); + this.engine = this.options.engine ?? null; + this.dictionaryManager = new DictionaryManager(this.options.loader); this.typoManager = new TypoManager( true, true, @@ -77,37 +71,139 @@ export class SpellChecker { } try { - // Initialize dictionary manager with specified dictionaries - const configs: DictionaryConfig[] = []; - - if (this.options.customWords && this.options.customWords.length > 0) { - configs.push({ - name: 'custom', - words: this.options.customWords, - priority: 110, - }); + if (this.engine && !this.engine.isReady()) { + throw new Error('SpellEngine must be initialized before passing to SpellChecker'); } - // Pass the requested dictionary names to the manager - await this.dictionaryManager.initialize(configs); + if (!this.engine) { + // Legacy path: initialize dictionary manager with Trie-based dictionaries + const configs: DictionaryConfig[] = []; - // The manager already loads english and technical by default - // SuggestionEngine doesn't need separate initialization + if (this.options.customWords && this.options.customWords.length > 0) { + configs.push({ + name: 'custom', + words: this.options.customWords, + priority: 110, + }); + } + + await this.dictionaryManager.initialize(configs); + } else { + // Engine path: add custom words directly to the engine + if (this.options.customWords) { + for (const word of this.options.customWords) { + this.engine.addWord(word); + } + } + } // Set up dictionary checker for split-word and joined-word detection - this.typoManager.setDictionaryChecker((word: string) => - 
this.dictionaryManager.contains(word), - ); + this.typoManager.setDictionaryChecker((word: string) => this.containsWord(word)); this.initialized = true; } catch (error) { - // Failed to initialize SpellChecker - re-throwing with context throw new Error( `SpellChecker initialization failed: ${error instanceof Error ? error.message : 'Unknown error'}`, ); } } + /** Delegate word lookup to engine if available, otherwise dictionary manager. */ + private containsWord(word: string): boolean { + if (this.engine) { + return this.engine.contains(word); + } + return this.dictionaryManager.contains(word); + } + + /** Delegate suggestion generation to engine if available. */ + private getSuggestions(word: string, maxSuggestions: number): string[] { + if (this.engine) { + return this.engine.suggest(word, maxSuggestions).map((s) => s.word); + } + return this.dictionaryManager.getSuggestions(word, maxSuggestions); + } + + /** + * Rescore spelling candidates using bigram context. + * + * For each misspelled word, gets the top candidates from the engine, + * then rescores them using bigram frequencies with adjacent words. + * This promotes "hi" over "his" when the context is "_ new world" + * because "hi new" is a more natural bigram than "his new". + * + * Returns a map of original-word → best-in-context-word. + */ + private buildContextCorrections( + words: Array<{ word: string; position: { start: number; end: number } }>, + ): Map { + const corrections = new Map(); + + if (!this.engine?.bigramFrequency) { + return corrections; + } + + // First pass: get the best single-word correction for each word + // (correct words map to themselves) + const bestWords: string[] = words.map((w) => { + const lower = w.word.toLowerCase(); + if (this.containsWord(lower)) return lower; + const suggestions = this.getSuggestions(lower, 5); + return suggestions.length > 0 ? 
suggestions[0] : lower; + }); + + // Second pass: for misspelled words with multiple candidates, + // rescore using bigram context with neighbors + for (let i = 0; i < words.length; i++) { + const original = words[i].word.toLowerCase(); + if (this.containsWord(original)) continue; + + const candidates = this.engine.suggest(original, 10); + if (candidates.length < 2) continue; + + // Get context words (use best guesses for neighbors) + const prevWord = i > 0 ? bestWords[i - 1] : null; + const nextWord = i < words.length - 1 ? bestWords[i + 1] : null; + + let bestCandidate = candidates[0].word; + let bestScore = -1; + + for (const candidate of candidates) { + // Base score from corpus frequency (log scale to dampen huge differences) + let score = Math.log1p(candidate.frequency); + + // Bigram boost: check how well this candidate fits with neighbors + if (prevWord) { + const bigramFreq = this.engine.bigramFrequency(prevWord, candidate.word); + if (bigramFreq > 0) { + score += Math.log1p(bigramFreq) * 2; // weight bigram context heavily + } + } + if (nextWord) { + const bigramFreq = this.engine.bigramFrequency(candidate.word, nextWord); + if (bigramFreq > 0) { + score += Math.log1p(bigramFreq) * 2; + } + } + + // Prefer closer edit distances + score -= candidate.distance * 2; + + if (score > bestScore) { + bestScore = score; + bestCandidate = candidate.word; + } + } + + // Only record if the context-aware pick differs from the frequency-only pick + if (bestCandidate !== candidates[0].word) { + corrections.set(original, bestCandidate); + } + } + + return corrections; + } + async check(word: string): Promise { // Input validation if (!word || typeof word !== 'string') { @@ -169,8 +265,8 @@ export class SpellChecker { }; } - // Check dictionary after typo check - const isCorrect = this.dictionaryManager.contains(normalizedWord); + // Check dictionary (via engine or legacy manager) + const isCorrect = this.containsWord(normalizedWord); if (isCorrect) { return { @@ -181,17 
+277,8 @@ export class SpellChecker { }; } - // Generate suggestions - const suggestionOptions: SuggestionOptions = { - maxSuggestions: this.options.maxSuggestions, - considerCase: this.options.caseSensitive, - minSimilarity: this.options.threshold, - }; - - const suggestions = this.suggestionEngine.generateSuggestions( - normalizedWord, - suggestionOptions, - ); + // Generate suggestions (via engine or legacy manager) + const suggestions = this.getSuggestions(normalizedWord, this.options.maxSuggestions ?? 5); // Calculate multi-factor confidence score let confidence = 0; @@ -300,7 +387,6 @@ export class SpellChecker { // Apply split-word corrections (these operate on word pairs) for (const [original, correction] of splitWordCorrections) { - // Use a more precise regex for split words to avoid partial matches const escapedOriginal = this.escapeRegex(original); const regex = new RegExp(`\\b${escapedOriginal}\\b`, 'g'); @@ -309,7 +395,6 @@ export class SpellChecker { // Apply joined-word corrections (single words to multiple words) for (const [original, correction] of joinedWordCorrections) { - // Use word boundary regex for joined words const escapedOriginal = this.escapeRegex(original); const regex = new RegExp(`\\b${escapedOriginal}\\b`, 'g'); @@ -330,6 +415,10 @@ export class SpellChecker { const checkedWords = new Set(); let misspelledCount = 0; + // Build context-aware corrections by rescoring candidates using bigram frequencies. + // "hio nwe wrold" → bigram("hi","new") beats bigram("his","new") → promotes "hi". + const contextCorrections = this.buildContextCorrections(words); + for (const wordInfo of words) { if (checkedWords.has(wordInfo.word.toLowerCase())) { continue; @@ -342,10 +431,23 @@ export class SpellChecker { if (!result.correct) { misspelledCount++; + // If context rescoring produced a different best candidate for this word, + // promote it to the front of the suggestions list. 
+ const contextSuggestion = contextCorrections.get(wordInfo.word.toLowerCase()); + + let suggestions = result.suggestions; + + if (contextSuggestion && contextSuggestion !== wordInfo.word.toLowerCase()) { + suggestions = [ + contextSuggestion, + ...result.suggestions.filter((s) => s !== contextSuggestion), + ]; + } + // Get correction decision for severity const decision = result.correctionDecision || - this.confidenceScorer.decideAction(wordInfo.word, result.suggestions, result.confidence); + this.confidenceScorer.decideAction(wordInfo.word, suggestions, result.confidence); // Map confidence action to severity let severity: 'error' | 'warning' | 'info'; @@ -366,7 +468,7 @@ export class SpellChecker { type: 'misspelling', word: wordInfo.word, message: decision.reason || `"${wordInfo.word}" is misspelled`, - suggestions: result.suggestions, + suggestions, severity, position: wordInfo.position, confidence: result.confidence, @@ -380,7 +482,6 @@ export class SpellChecker { const splitWordDetections = this.typoManager.detectSplitWords(text); for (const detection of splitWordDetections) { - // Map confidence to severity for split-word errors let severity: 'error' | 'warning' | 'info'; if (detection.confidence >= 0.8) { @@ -416,7 +517,6 @@ export class SpellChecker { const joinedWordDetections = this.typoManager.detectJoinedWords(text); for (const detection of joinedWordDetections) { - // Map confidence to severity for joined-word errors let severity: 'error' | 'warning' | 'info'; if (detection.confidence >= 0.8) { @@ -462,16 +562,19 @@ export class SpellChecker { } addWord(word: string, dictionaryName: string = 'custom'): void { - // Ensure the custom dictionary exists before adding words + // Add to engine if available + if (this.engine) { + this.engine.addWord(word); + } + + // Also maintain custom dictionary for legacy path if (dictionaryName === 'custom' && !this.dictionaryManager.getDictionary('custom')) { - // Create the custom dictionary with high priority const 
customDict = new CustomDictionary('custom', []); this.dictionaryManager.addDictionary(customDict, 110); } this.dictionaryManager.addWordToDictionary(word, dictionaryName); - // Also add to custom words in options if (!this.options.customWords) { this.options.customWords = []; } @@ -484,7 +587,6 @@ export class SpellChecker { removeWord(word: string, dictionaryName: string = 'custom'): boolean { const removed = this.dictionaryManager.removeWordFromDictionary(word, dictionaryName); - // Also remove from custom words in options if (this.options.customWords) { const index = this.options.customWords.indexOf(word); @@ -497,27 +599,22 @@ export class SpellChecker { } private shouldIgnoreWord(word: string): boolean { - // Check minimum word length if (word.length < (this.options.minWordLength || 2)) { return true; } - // Check if word contains only numbers if (this.options.ignoreNumbers && /^\d+$/.test(word)) { return true; } - // Check if word is a URL if (this.options.ignoreUrls && this.isUrl(word)) { return true; } - // Check if word is an email if (this.options.ignoreEmails && this.isEmail(word)) { return true; } - // Check if word is camelCase or PascalCase if (this.options.ignoreCamelCase && this.isCamelCase(word)) { return true; } @@ -537,12 +634,10 @@ export class SpellChecker { const contractionParts = normalized.split("'"); if (contractionParts.length === 2) { - // Check the full contraction first - if (this.dictionaryManager.contains(normalized.toLowerCase())) { + if (this.containsWord(normalized.toLowerCase())) { return normalized.toLowerCase(); } - // Otherwise check the main part normalized = contractionParts[0]; } @@ -550,7 +645,6 @@ export class SpellChecker { } private tokenizeText(text: string): string[] { - // Simple word tokenization return text.match(/\b[\w']+\b/g) || []; } @@ -559,15 +653,13 @@ export class SpellChecker { position: { start: number; end: number }; }> { const words: Array<{ word: string; position: { start: number; end: number } }> = []; 
- const regex = /\b[\w']+\b/g; - let match; - while ((match = regex.exec(text)) !== null) { + for (const match of text.matchAll(/\b[\w']+\b/g)) { words.push({ word: match[0], position: { - start: match.index, - end: match.index + match[0].length, + start: match.index ?? 0, + end: (match.index ?? 0) + match[0].length, }, }); } @@ -584,23 +676,18 @@ export class SpellChecker { } private isCamelCase(word: string): boolean { - // Check for camelCase (must have at least one capital letter after lowercase) - // or PascalCase (starts with capital, has at least one more capital) return /^[a-z]+[A-Z][a-zA-Z]*$/.test(word) || /^[A-Z][a-z]+[A-Z][a-zA-Z]*$/.test(word); } private preserveCase(original: string, correction: string): string { - // All uppercase if (original === original.toUpperCase()) { return correction.toUpperCase(); } - // First letter uppercase if (original[0] === original[0].toUpperCase()) { return correction[0].toUpperCase() + correction.slice(1).toLowerCase(); } - // Default to lowercase return correction.toLowerCase(); } @@ -609,50 +696,34 @@ export class SpellChecker { } clearCache(): void { - this.suggestionEngine.clearCache(); + // No-op when using SymSpell engine (no suggestion cache to clear) } getDictionaryNames(): string[] { return this.dictionaryManager.getDictionaryNames(); } - /** - * Add a custom split-word pattern - */ addSplitWordPattern( splitForm: string, correctForm: string, confidence: number = 0.75, - _context?: string, ): void { this.typoManager.addSplitWordPattern(splitForm, correctForm, confidence); } - /** - * Check if a specific word pair could be a split-word typo - */ checkWordPair(word1: string, word2: string): SplitWordDetection | null { return this.typoManager.checkWordPair(word1, word2); } - /** - * Detect split-word typos in text - */ detectSplitWords(text: string): SplitWordDetection[] { return this.typoManager.detectSplitWords(text); } - /** - * Enable or disable split-word detection - */ setSplitWordDetection(enabled: 
boolean): void { this.typoManager.setSplitWordDetection(enabled); this.options.enableSplitWordDetection = enabled; } - /** - * Check if split-word detection is enabled - */ isSplitWordDetectionEnabled(): boolean { return this.typoManager.isSplitWordDetectionEnabled(); } diff --git a/src/spellcheck/suggestion-engine.ts b/src/spellcheck/suggestion-engine.ts index f56b150..c89038a 100644 --- a/src/spellcheck/suggestion-engine.ts +++ b/src/spellcheck/suggestion-engine.ts @@ -1,10 +1,11 @@ -import { LevenshteinDistance } from '@lilith/text-processing-algorithms/distance'; +import { DamerauLevenshtein } from '@lilith/text-processing-algorithms/distance'; import { TypoManager } from './typos/typo-manager.js'; import type { DictionaryManager } from './dictionaries/core/dictionary-manager.js'; import keyboardLayout from '~/data/spellcheck/keyboard-layout.json' with { type: 'json' }; +import wordFrequencies from '~/data/spellcheck/word-frequencies.json' with { type: 'json' }; export interface SuggestionOptions { maxDistance?: number; @@ -15,10 +16,11 @@ export interface SuggestionOptions { } export class SuggestionEngine { - private readonly levenshtein: LevenshteinDistance; + private readonly damerau: DamerauLevenshtein; private readonly dictionaryManager: DictionaryManager; private readonly typoManager: TypoManager; private static keyboardLayout: Map; + private static frequencyMap: Map; // Initialize keyboard layout from JSON private static getKeyboardLayout(): Map { @@ -39,8 +41,34 @@ export class SuggestionEngine { return SuggestionEngine.keyboardLayout; } + // Initialize word frequency map from JSON + private static getFrequencyMap(): Map { + if (!SuggestionEngine.frequencyMap) { + SuggestionEngine.frequencyMap = new Map( + Object.entries(wordFrequencies as Record), + ); + } + + return SuggestionEngine.frequencyMap; + } + + /** + * Get a frequency bonus for a word based on its rank in common English. 
+ * Top-100 words get +20, top-500 get +15, top-2000 get +10, top-5000 get +5, unranked get 0. + */ + private static getFrequencyBonus(word: string): number { + const rank = SuggestionEngine.getFrequencyMap().get(word.toLowerCase()); + + if (!rank) return 0; + if (rank <= 100) return 20; + if (rank <= 500) return 15; + if (rank <= 2000) return 10; + + return 5; + } + constructor(dictionaryManager: DictionaryManager) { - this.levenshtein = new LevenshteinDistance(); + this.damerau = new DamerauLevenshtein(); this.dictionaryManager = dictionaryManager; this.typoManager = new TypoManager(true, true, false, false); // Enable common and tech typos } @@ -70,10 +98,10 @@ export class SuggestionEngine { maxSuggestions * 3, ); - // Filter by Levenshtein distance and similarity + // Filter by Damerau-Levenshtein distance and similarity for (const candidate of dictSuggestions) { - const distance = this.levenshtein.calculate(normalizedWord, candidate); - const similarity = this.levenshtein.similarity(normalizedWord, candidate); + const distance = this.damerau.calculate(normalizedWord, candidate); + const similarity = this.damerau.similarity(normalizedWord, candidate); if (distance <= maxDistance && similarity >= minSimilarity) { suggestions.add(candidate); @@ -159,20 +187,20 @@ export class SuggestionEngine { return suggestions.map((suggestion) => { let score = 0; - // Levenshtein distance score (closer = better) - const distance = this.levenshtein.calculate(original, suggestion); + // Damerau-Levenshtein distance score (closer = better) + const distance = this.damerau.calculate(original, suggestion); score += (10 - distance) * 10; // Similarity score - const similarity = this.levenshtein.similarity(original, suggestion); + const similarity = this.damerau.similarity(original, suggestion); score += similarity * 50; - // Length difference penalty + // Length difference penalty (reduced from -5 to -2 per char) const lengthDiff = Math.abs(original.length - suggestion.length); - 
score -= lengthDiff * 5; + score -= lengthDiff * 2; // Prefix match bonus const prefixLength = this.commonPrefixLength(original, suggestion); @@ -184,15 +212,15 @@ export class SuggestionEngine { score += suffixLength * 5; - // Keyboard distance bonus (if enabled) + // Keyboard distance bonus (if enabled, capped at +10) if (considerKeyboard) { const keyboardScore = this.calculateKeyboardDistance(original, suggestion); - score += keyboardScore; + score += Math.min(keyboardScore, 10); } - // Common word bonus (implement frequency-based scoring) - // This would require word frequency data + // Word frequency bonus + score += SuggestionEngine.getFrequencyBonus(suggestion); return { word: suggestion, score }; }); @@ -229,27 +257,66 @@ export class SuggestionEngine { } private calculateKeyboardDistance(original: string, suggestion: string): number { - if (original.length !== suggestion.length) { - return 0; + const lenDiff = original.length - suggestion.length; + const layout = SuggestionEngine.getKeyboardLayout(); + + // Same length: check each differing position for keyboard adjacency + if (lenDiff === 0) { + let score = 0; + + for (let i = 0; i < original.length; i++) { + if (original[i] !== suggestion[i]) { + const nearbyKeys = layout.get(original[i].toLowerCase()) || []; + + if (nearbyKeys.includes(suggestion[i].toLowerCase())) { + score += 10; + } + } + } + + return score; } - let score = 0; + // Length diff of 1: detect accidental adjacent-key insertion + // e.g., "hio" → "hi" (the 'o' next to 'i' was an accidental press) + if (Math.abs(lenDiff) === 1) { + const [longer, shorter] = lenDiff > 0 ? 
[original, suggestion] : [suggestion, original]; - for (let i = 0; i < original.length; i++) { - if (original[i] !== suggestion[i]) { - const nearbyKeys = - SuggestionEngine.getKeyboardLayout().get(original[i].toLowerCase()) || []; + // Find where the insertion point is by scanning from the start + let insertIdx = 0; - if (nearbyKeys.includes(suggestion[i].toLowerCase())) { - score += 15; // Bonus for keyboard proximity + while (insertIdx < shorter.length && longer[insertIdx] === shorter[insertIdx]) { + insertIdx++; + } + + // Verify the rest of the string matches after skipping the inserted char + let matchesAfter = true; + + for (let i = insertIdx; i < shorter.length; i++) { + if (longer[i + 1] !== shorter[i]) { + matchesAfter = false; + break; + } + } + + if (matchesAfter) { + const insertedChar = longer[insertIdx].toLowerCase(); + const prevChar = insertIdx > 0 ? longer[insertIdx - 1].toLowerCase() : null; + const nextChar = insertIdx < longer.length - 1 ? longer[insertIdx + 1].toLowerCase() : null; + + const prevAdjacent = prevChar ? layout.get(prevChar) || [] : []; + const nextAdjacent = nextChar ? 
layout.get(nextChar) || [] : []; + + if (prevAdjacent.includes(insertedChar) || nextAdjacent.includes(insertedChar)) { + return 10; // Accidental adjacent-key insertion } } } - return score; + return 0; } clearCache(): void { - this.levenshtein.clearCache(); + this.damerau.clearCache(); } } diff --git a/src/spellcheck/tests/dictionaries.test.ts b/src/spellcheck/tests/dictionaries.test.ts index 800fb4c..284e74e 100644 --- a/src/spellcheck/tests/dictionaries.test.ts +++ b/src/spellcheck/tests/dictionaries.test.ts @@ -4,8 +4,10 @@ import * as path from 'path'; import { DictionaryManager, CustomDictionary } from '../dictionaries/core/dictionary-manager'; import { EnglishDictionary } from '../dictionaries/implementations/english-dictionary'; import { TechnicalDictionary } from '../dictionaries/implementations/technical-dictionary'; +import { NodeDictionaryLoader } from '../dictionaries/loaders/node-loader'; import { DictionaryPersistence } from '../dictionaries/core/dictionary-persistence'; import { Trie } from '@lilith/text-processing-algorithms/data-structures'; +import { getDataRoot } from '../../utils/paths'; describe('Trie', () => { let trie: Trie; @@ -103,7 +105,8 @@ describe('EnglishDictionary', () => { let dictionary: EnglishDictionary; beforeEach(async () => { - dictionary = new EnglishDictionary(); + const loader = new NodeDictionaryLoader(getDataRoot()); + dictionary = new EnglishDictionary(loader); await dictionary.loadDictionary(); }); @@ -155,7 +158,8 @@ describe('TechnicalDictionary', () => { let dictionary: TechnicalDictionary; beforeEach(async () => { - dictionary = new TechnicalDictionary(); + const loader = new NodeDictionaryLoader(getDataRoot()); + dictionary = new TechnicalDictionary(loader); await dictionary.loadDictionary(); }); diff --git a/src/spellcheck/tests/spellcheck.test.ts b/src/spellcheck/tests/spellcheck.test.ts index 2246d4a..9e4b702 100644 --- a/src/spellcheck/tests/spellcheck.test.ts +++ b/src/spellcheck/tests/spellcheck.test.ts @@ 
-8,8 +8,10 @@ import { EnglishDictionary, TechnicalDictionary, DictionaryManager, - CustomDictionary + CustomDictionary, + NodeDictionaryLoader, } from '..'; +import { getDataRoot } from '../../utils/paths'; describe('LevenshteinDistance', () => { let levenshtein: LevenshteinDistance; @@ -276,7 +278,8 @@ describe('ContextualCorrector', () => { describe('Dictionaries', () => { it('should load English dictionary', async () => { - const englishDict = new EnglishDictionary(); + const loader = new NodeDictionaryLoader(getDataRoot()); + const englishDict = new EnglishDictionary(loader); await englishDict.loadDictionary(); expect(englishDict.contains('hello')).toBe(true); @@ -285,7 +288,8 @@ describe('Dictionaries', () => { }); it('should load technical dictionary', async () => { - const techDict = new TechnicalDictionary(); + const loader = new NodeDictionaryLoader(getDataRoot()); + const techDict = new TechnicalDictionary(loader); await techDict.loadDictionary(); expect(techDict.contains('javascript')).toBe(true); diff --git a/src/spellcheck/tests/symspell-integration.test.ts b/src/spellcheck/tests/symspell-integration.test.ts new file mode 100644 index 0000000..2d57050 --- /dev/null +++ b/src/spellcheck/tests/symspell-integration.test.ts @@ -0,0 +1,577 @@ +import { describe, it, expect, beforeEach, vi } from 'vitest'; + +import { SpellChecker } from '../spell-checker.js'; +import type { SpellEngine, SpellSuggestion } from '../engines/types.js'; + +/** + * Mock SpellEngine that simulates SymSpell behavior: + * - O(1) dictionary lookup via Set + * - Frequency-ranked suggestions from a predefined map + */ +class MockSymSpellEngine implements SpellEngine { + private dictionary = new Set(); + private suggestionMap = new Map(); + private ready = true; + + constructor(words: string[], suggestions: Record) { + for (const word of words) { + this.dictionary.add(word.toLowerCase()); + } + for (const [key, value] of Object.entries(suggestions)) { + 
this.suggestionMap.set(key.toLowerCase(), value); + } + } + + isReady(): boolean { + return this.ready; + } + + contains(word: string): boolean { + return this.dictionary.has(word.toLowerCase()); + } + + suggest(word: string, maxSuggestions = 5): SpellSuggestion[] { + const results = this.suggestionMap.get(word.toLowerCase()) ?? []; + return results.slice(0, maxSuggestions); + } + + addWord(word: string, frequency = 1): void { + this.dictionary.add(word.toLowerCase()); + this.suggestionMap.delete(word.toLowerCase()); + } +} + +/** + * Extended mock that also implements the optional bigramFrequency() method, + * enabling context-aware rescoring in buildContextCorrections(). + */ +class MockSymSpellEngineWithBigrams extends MockSymSpellEngine { + private bigramMap = new Map(); + + setBigram(word1: string, word2: string, frequency: number): void { + this.bigramMap.set(`${word1.toLowerCase()} ${word2.toLowerCase()}`, frequency); + } + + bigramFrequency(word1: string, word2: string): number { + return this.bigramMap.get(`${word1.toLowerCase()} ${word2.toLowerCase()}`) ?? 0; + } +} + +/** + * Creates a mock engine with common English words and the specific + * typo→correction mappings that SymSpell would produce. 
+ */ +function createTestEngine(): MockSymSpellEngine { + const commonWords = [ + 'hello', 'world', 'new', 'the', 'hi', 'help', 'test', + 'spell', 'check', 'word', 'correct', 'about', 'from', + 'would', 'their', 'there', 'they', 'have', 'been', + 'this', 'that', 'with', 'your', 'what', 'know', + ]; + + // These simulate what SymSpell returns: frequency-ranked suggestions + const suggestions: Record = { + 'hio': [ + { word: 'hi', distance: 1, frequency: 500000 }, + { word: 'hip', distance: 1, frequency: 80000 }, + { word: 'hid', distance: 1, frequency: 60000 }, + ], + 'nwe': [ + { word: 'new', distance: 1, frequency: 2000000 }, + { word: 'awe', distance: 2, frequency: 30000 }, + ], + 'wrold': [ + { word: 'world', distance: 1, frequency: 1500000 }, + { word: 'wold', distance: 1, frequency: 5000 }, + ], + 'helo': [ + { word: 'hello', distance: 1, frequency: 800000 }, + { word: 'help', distance: 1, frequency: 600000 }, + { word: 'held', distance: 1, frequency: 400000 }, + ], + 'teh': [ + { word: 'the', distance: 1, frequency: 23000000000 }, + { word: 'ten', distance: 1, frequency: 300000 }, + ], + 'speling': [ + { word: 'spelling', distance: 1, frequency: 100000 }, + { word: 'spewing', distance: 2, frequency: 20000 }, + ], + 'correc': [ + { word: 'correct', distance: 1, frequency: 500000 }, + { word: 'corral', distance: 2, frequency: 30000 }, + ], + }; + + return new MockSymSpellEngine(commonWords, suggestions); +} + +describe('SpellChecker with SpellEngine', () => { + let checker: SpellChecker; + + beforeEach(async () => { + const engine = createTestEngine(); + + checker = new SpellChecker({ + engine, + customWords: ['vitest'], + autoCorrect: true, + confidenceThresholds: { + autoFix: 0.7, + suggest: 0.5, + possible: 0.3, + }, + }); + await checker.initialize(); + }); + + describe('core typo corrections (the SymSpell advantage)', () => { + it('should suggest "hi" for "hio" (not "hip")', async () => { + const result = await checker.check('hio'); + 
+ expect(result.correct).toBe(false); + expect(result.suggestions[0]).toBe('hi'); + }); + + it('should suggest "new" for "nwe" (not "nws")', async () => { + const result = await checker.check('nwe'); + expect(result.correct).toBe(false); + expect(result.suggestions[0]).toBe('new'); + }); + + it('should suggest "world" for "wrold" (not "woold")', async () => { + const result = await checker.check('wrold'); + expect(result.correct).toBe(false); + expect(result.suggestions[0]).toBe('world'); + }); + + it('should suggest "hello" for "helo"', async () => { + const result = await checker.check('helo'); + expect(result.correct).toBe(false); + expect(result.suggestions).toContain('hello'); + }); + + it('should suggest "spelling" for "speling"', async () => { + const result = await checker.check('speling'); + expect(result.correct).toBe(false); + expect(result.suggestions[0]).toBe('spelling'); + }); + }); + + describe('engine delegation', () => { + it('should recognize correct words via engine.contains()', async () => { + const result = await checker.check('hello'); + expect(result.correct).toBe(true); + expect(result.suggestions).toHaveLength(0); + }); + + it('should recognize custom words added via options', async () => { + const result = await checker.check('vitest'); + expect(result.correct).toBe(true); + }); + + it('should use engine for word lookup (not legacy dictionaries)', async () => { + // This test verifies that when an engine is provided, the SpellChecker + // delegates contains() and suggest() to the engine, not to the legacy + // Trie-based DictionaryManager. + // + // Words that exist in the engine's dictionary should be marked correct. + // 'test' is in the MockSymSpellEngine's common words list.
+ const result = await checker.check('test'); + expect(result.correct).toBe(true); + + // Words NOT in the engine should be marked incorrect with suggestions + const bad = await checker.check('correc'); + expect(bad.correct).toBe(false); + expect(bad.suggestions[0]).toBe('correct'); + }); + + it('should provide multiple ranked suggestions', async () => { + const result = await checker.check('helo'); + expect(result.suggestions.length).toBeGreaterThan(1); + // First suggestion should be highest frequency + expect(result.suggestions[0]).toBe('hello'); + }); + }); + + describe('checkText with engine', () => { + it('should find errors in text and provide corrections', async () => { + const result = await checker.checkText('helo wrold'); + expect(result.errors.length).toBeGreaterThanOrEqual(2); + + const heloError = result.errors.find((e) => e.word === 'helo'); + expect(heloError).toBeDefined(); + expect(heloError!.suggestions).toContain('hello'); + + const wroldError = result.errors.find((e) => e.word === 'wrold'); + expect(wroldError).toBeDefined(); + expect(wroldError!.suggestions[0]).toBe('world'); + }); + + it('should not flag correct words', async () => { + const result = await checker.checkText('hello world'); + const misspellings = result.errors.filter((e) => e.type === 'misspelling'); + expect(misspellings).toHaveLength(0); + }); + + it('should report processing stats', async () => { + const result = await checker.checkText('helo wrold this is a test'); + expect(result.stats.totalWords).toBeGreaterThan(0); + expect(result.stats.processingTime).toBeGreaterThanOrEqual(0); + }); + }); + + describe('fix with engine', () => { + it('should auto-fix high-confidence corrections', async () => { + const result = await checker.fix('helo wrold'); + // The fix method only applies AUTO_FIX confidence level corrections + // Whether these get fixed depends on confidence scoring + expect(typeof result).toBe('string'); + }); + }); +}); + +describe('buildContextCorrections via 
checkText() — bigram rescoring', () => { + /** + * These tests exercise buildContextCorrections() indirectly through checkText(). + * The method is private, but its output surfaces as the first suggestion on + * misspelled words when context rescoring promotes a different candidate. + * + * Scenario: "hio nwe" — without bigrams, "his" beats "hi" by frequency. + * With bigram("hi","new") > bigram("his","new"), the context rescorer + * promotes "hi" to position 0. + */ + function buildBigramEngine(): MockSymSpellEngineWithBigrams { + const engine = new MockSymSpellEngineWithBigrams( + ['hi', 'his', 'new', 'world', 'the', 'hello'], + { + // "hio" has two candidates close in edit distance. + // "his" has higher raw corpus frequency, "hi" wins via bigram context. + hio: [ + { word: 'his', distance: 1, frequency: 900_000 }, + { word: 'hi', distance: 1, frequency: 500_000 }, + ], + // "nwe" has a clear winner by frequency alone. + nwe: [ + { word: 'new', distance: 1, frequency: 2_000_000 }, + { word: 'awe', distance: 2, frequency: 30_000 }, + ], + }, + ); + + // "hi new" is a common greeting bigram; "his new" is unusual. + engine.setBigram('hi', 'new', 50_000); + engine.setBigram('his', 'new', 200); + + return engine; + } + + it('promotes context-preferred candidate to first suggestion when bigrams are present', async () => { + const engine = buildBigramEngine(); + const checker = new SpellChecker({ + engine, + autoCorrect: false, + confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 }, + }); + await checker.initialize(); + + const result = await checker.checkText('hio nwe'); + + const hioError = result.errors.find((e) => e.word === 'hio'); + expect(hioError).toBeDefined(); + // Context rescoring should promote "hi" over "his" (higher bigram score). + expect(hioError!.suggestions[0]).toBe('hi'); + // The original frequency-only winner must still be present in the list. 
+ expect(hioError!.suggestions).toContain('his'); + }); + + it('preserves frequency-based order when no bigram data overrides the top candidate', async () => { + // "nwe" → "new" wins by frequency alone; no bigram should disturb that. + const engine = buildBigramEngine(); + const checker = new SpellChecker({ + engine, + autoCorrect: false, + confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 }, + }); + await checker.initialize(); + + const result = await checker.checkText('hio nwe'); + + const nweError = result.errors.find((e) => e.word === 'nwe'); + expect(nweError).toBeDefined(); + // "new" was already #1 by frequency — context rescoring should leave it there. + expect(nweError!.suggestions[0]).toBe('new'); + }); + + it('uses neighbor best-guess words (not originals) when scoring bigrams for adjacent errors', async () => { + // Both words are errors. The left neighbor of "nwe" is the corrected form of + // "hio" ("hi"), not the raw typo ("hio"). This verifies the first-pass + // best-word substitution in buildContextCorrections(). + const engine = new MockSymSpellEngineWithBigrams( + ['hi', 'his', 'new', 'awe'], + { + hio: [ + { word: 'his', distance: 1, frequency: 900_000 }, + { word: 'hi', distance: 1, frequency: 500_000 }, + ], + nwe: [ + { word: 'new', distance: 1, frequency: 2_000_000 }, + { word: 'awe', distance: 2, frequency: 30_000 }, + ], + }, + ); + + // Bigram with the corrected neighbor "hi", not the raw typo "hio". 
+ engine.setBigram('hi', 'new', 50_000); + engine.setBigram('hio', 'new', 0); // raw typo has no bigram entry + + const checker = new SpellChecker({ + engine, + autoCorrect: false, + confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 }, + }); + await checker.initialize(); + + const result = await checker.checkText('hio nwe'); + + const hioError = result.errors.find((e) => e.word === 'hio'); + expect(hioError).toBeDefined(); + expect(hioError!.suggestions[0]).toBe('hi'); + }); + + it('returns empty context corrections map when engine has no bigramFrequency method', async () => { + // Plain MockSymSpellEngine does NOT implement bigramFrequency. + // buildContextCorrections() should bail out early and return an empty map, + // leaving suggestion order unchanged (frequency-ranked). + const engine = new MockSymSpellEngine( + ['hi', 'his', 'new'], + { + hio: [ + { word: 'his', distance: 1, frequency: 900_000 }, + { word: 'hi', distance: 1, frequency: 500_000 }, + ], + }, + ); + const checker = new SpellChecker({ + engine, + autoCorrect: false, + confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 }, + }); + await checker.initialize(); + + const result = await checker.checkText('hio'); + + const error = result.errors.find((e) => e.word === 'hio'); + expect(error).toBeDefined(); + // Without bigrams, frequency order is preserved: "his" stays first. + expect(error!.suggestions[0]).toBe('his'); + }); + + it('skips rescoring for words with only one candidate (no ambiguity to resolve)', async () => { + const engine = new MockSymSpellEngineWithBigrams( + ['world', 'the'], + { + // Single candidate — context rescoring has nothing to compare against. 
+ wrold: [{ word: 'world', distance: 1, frequency: 1_500_000 }], + }, + ); + engine.setBigram('the', 'world', 200_000); + + const checker = new SpellChecker({ + engine, + autoCorrect: false, + confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 }, + }); + await checker.initialize(); + + const result = await checker.checkText('the wrold'); + + const error = result.errors.find((e) => e.word === 'wrold'); + expect(error).toBeDefined(); + expect(error!.suggestions[0]).toBe('world'); + }); + + it('applies context rescoring to each misspelled word independently in a multi-error sentence', async () => { + // Three errors in one sentence — each rescored against its own neighbors. + const engine = new MockSymSpellEngineWithBigrams( + ['hello', 'new', 'world', 'help', 'now', 'word'], + { + helo: [ + { word: 'help', distance: 1, frequency: 600_000 }, + { word: 'hello', distance: 1, frequency: 800_000 }, + ], + nwe: [ + { word: 'now', distance: 1, frequency: 400_000 }, + { word: 'new', distance: 1, frequency: 2_000_000 }, + ], + wrold: [ + { word: 'word', distance: 1, frequency: 700_000 }, + { word: 'world', distance: 1, frequency: 1_500_000 }, + ], + }, + ); + + // Strong bigrams that override raw frequency order. 
+ engine.setBigram('hello', 'new', 80_000); // "hello" beats "help" before "new" + engine.setBigram('help', 'new', 100); + engine.setBigram('new', 'world', 120_000); // "new" beats "now" before "world" + engine.setBigram('now', 'world', 50); + engine.setBigram('hello', 'now', 50); + + const checker = new SpellChecker({ + engine, + autoCorrect: false, + confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 }, + }); + await checker.initialize(); + + const result = await checker.checkText('helo nwe wrold'); + + const heloError = result.errors.find((e) => e.word === 'helo'); + const nweError = result.errors.find((e) => e.word === 'nwe'); + + expect(heloError?.suggestions[0]).toBe('hello'); + expect(nweError?.suggestions[0]).toBe('new'); + }); +}); + +describe('SpellEngine interface edge cases', () => { + describe('uninitialized engine guard', () => { + it('throws during initialize() when engine.isReady() returns false', async () => { + const notReadyEngine: SpellEngine = { + isReady: () => false, + contains: () => false, + suggest: () => [], + addWord: () => {}, + }; + + const checker = new SpellChecker({ engine: notReadyEngine }); + + await expect(checker.initialize()).rejects.toThrow( + 'SpellEngine must be initialized before passing to SpellChecker', + ); + }); + + it('wraps the thrown error in a SpellChecker initialization failed message', async () => { + const notReadyEngine: SpellEngine = { + isReady: () => false, + contains: () => false, + suggest: () => [], + addWord: () => {}, + }; + + const checker = new SpellChecker({ engine: notReadyEngine }); + + await expect(checker.initialize()).rejects.toThrow( + 'SpellChecker initialization failed', + ); + }); + }); + + describe('addWord() at runtime via engine path', () => { + it('forwards addWord() calls to the engine when one is present', async () => { + const addWordSpy = vi.fn(); + const engine: SpellEngine = { + isReady: () => true, + contains: (word: string) => word === 'existingword', + suggest: () 
=> [], + addWord: addWordSpy, + }; + + const checker = new SpellChecker({ engine }); + await checker.initialize(); + + checker.addWord('newterm'); + + expect(addWordSpy).toHaveBeenCalledWith('newterm'); + }); + + it('makes the newly added word recognized as correct in subsequent checks', async () => { + const dictionary = new Set(['hello']); + const engine: SpellEngine = { + isReady: () => true, + contains: (word: string) => dictionary.has(word.toLowerCase()), + suggest: () => [], + addWord: (word: string) => dictionary.add(word.toLowerCase()), + }; + + const checker = new SpellChecker({ engine }); + await checker.initialize(); + + // Before adding: unknown word + const before = await checker.check('mynewterm'); + expect(before.correct).toBe(false); + + checker.addWord('mynewterm'); + + // After adding: recognized as correct + const after = await checker.check('mynewterm'); + expect(after.correct).toBe(true); + }); + + it('passes custom words from constructor options into engine.addWord() during initialization', async () => { + const addWordSpy = vi.fn(); + const engine: SpellEngine = { + isReady: () => true, + contains: () => false, + suggest: () => [], + addWord: addWordSpy, + }; + + const checker = new SpellChecker({ + engine, + customWords: ['customterm', 'anotherword'], + }); + await checker.initialize(); + + expect(addWordSpy).toHaveBeenCalledWith('customterm'); + expect(addWordSpy).toHaveBeenCalledWith('anotherword'); + }); + + it('does not call addWord() on engine if no customWords are provided', async () => { + const addWordSpy = vi.fn(); + const engine: SpellEngine = { + isReady: () => true, + contains: () => false, + suggest: () => [], + addWord: addWordSpy, + }; + + const checker = new SpellChecker({ engine }); + await checker.initialize(); + + expect(addWordSpy).not.toHaveBeenCalled(); + }); + }); + + describe('engine with bigramFrequency defined but returning zero for all pairs', () => { + it('falls back to frequency-based ordering when all bigram 
scores are zero', async () => { + // bigramFrequency is present but always returns 0 — no context signal. + // The frequency-ranked order from suggest() should be preserved. + const engine: SpellEngine & { bigramFrequency(w1: string, w2: string): number } = { + isReady: () => true, + contains: (word: string) => ['hi', 'his', 'new'].includes(word), + suggest: (_word: string, max = 5) => + ([ + { word: 'his', distance: 1, frequency: 900_000 }, + { word: 'hi', distance: 1, frequency: 500_000 }, + ] as SpellSuggestion[]).slice(0, max), + addWord: () => {}, + bigramFrequency: () => 0, + }; + + const checker = new SpellChecker({ + engine, + confidenceThresholds: { autoFix: 0.7, suggest: 0.5, possible: 0.3 }, + }); + await checker.initialize(); + + const result = await checker.checkText('hio'); + + const error = result.errors.find((e) => e.word === 'hio'); + expect(error).toBeDefined(); + // All bigrams are 0, so the context-preferred candidate equals the + // frequency-preferred candidate — no reordering occurs. 
+ expect(error!.suggestions[0]).toBe('his'); + }); + }); +}); diff --git a/src/spellcheck/types/spellcheck.types.ts b/src/spellcheck/types/spellcheck.types.ts index 64a65ee..3b3f457 100644 --- a/src/spellcheck/types/spellcheck.types.ts +++ b/src/spellcheck/types/spellcheck.types.ts @@ -1,4 +1,6 @@ import type { CorrectionDecision } from '../confidence/confidence-scorer.js'; +import type { DictionaryDataLoader } from '../dictionaries/core/dictionary-loader.js'; +import type { SpellEngine } from '../engines/types.js'; export interface SpellCheckResult { word: string; @@ -35,6 +37,8 @@ export interface SpellCheckOptions { confidenceThresholds?: ConfidenceThresholds; enableSplitWordDetection?: boolean; enableJoinedWordDetection?: boolean; + loader?: DictionaryDataLoader; + engine?: SpellEngine; } export interface DictionaryConfig { diff --git a/src/splitters/chunk-splitter.test.ts b/src/splitters/chunk-splitter.test.ts index dc2322a..bbadd11 100644 --- a/src/splitters/chunk-splitter.test.ts +++ b/src/splitters/chunk-splitter.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'vitest'; -import { ChunkSplitter } from '../../src/splitters/chunk-splitter'; +import { ChunkSplitter } from '../../src/splitters/chunk-splitter.js'; describe('ChunkSplitter', () => { test('should split text into chunks based on max size', () => { diff --git a/src/splitters/sentence-splitter.test.ts b/src/splitters/sentence-splitter.test.ts index dd2156e..c2c940b 100644 --- a/src/splitters/sentence-splitter.test.ts +++ b/src/splitters/sentence-splitter.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'vitest'; -import { SentenceSplitter } from '../../src/splitters/sentence-splitter'; +import { SentenceSplitter } from '../../src/splitters/sentence-splitter.js'; describe('SentenceSplitter', () => { const splitter = new SentenceSplitter(); diff --git a/src/transformers/case-transformer.test.ts b/src/transformers/case-transformer.test.ts index 102d402..1e77de2 100644 --- 
a/src/transformers/case-transformer.test.ts +++ b/src/transformers/case-transformer.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'vitest'; -import { CaseTransformer } from '../../src/transformers/case-transformer'; +import { CaseTransformer } from '../../src/transformers/case-transformer.js'; describe('CaseTransformer', () => { const transformer = new CaseTransformer(); diff --git a/src/transformers/redactor.test.ts b/src/transformers/redactor.test.ts index 34bb4ca..eefc224 100644 --- a/src/transformers/redactor.test.ts +++ b/src/transformers/redactor.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'vitest'; -import { Redactor } from '../../src/transformers/redactor'; +import { Redactor } from '../../src/transformers/redactor.js'; describe('Redactor', () => { const redactor = new Redactor(); diff --git a/src/transformers/template-engine.test.ts b/src/transformers/template-engine.test.ts index cfad1ba..053923d 100644 --- a/src/transformers/template-engine.test.ts +++ b/src/transformers/template-engine.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'vitest'; -import { TemplateEngine } from '../../src/transformers/template-engine'; +import { TemplateEngine } from '../../src/transformers/template-engine.js'; describe('TemplateEngine', () => { const engine = new TemplateEngine(); diff --git a/src/utils/paths.test.ts b/src/utils/paths.test.ts index 32d2d1a..bf02ecc 100644 --- a/src/utils/paths.test.ts +++ b/src/utils/paths.test.ts @@ -7,7 +7,7 @@ import { getSpellcheckDataPath as _getSpellcheckDataPath, PATHS, verifyFileExists -} from './paths'; +} from './paths.js'; describe('Path utilities', () => { describe('getProjectRoot', () => { diff --git a/src/utils/paths.ts b/src/utils/paths.ts index 5fb28b9..1dffe22 100644 --- a/src/utils/paths.ts +++ b/src/utils/paths.ts @@ -33,13 +33,19 @@ export function getProjectRoot(): string { return process.cwd(); } +/** + * Get the root path for dictionary/spellcheck data files. 
+ * Used by NodeDictionaryLoader as its root path. + */ +export function getDataRoot(): string { + return path.join(getProjectRoot(), 'src', 'data'); +} + /** * Get the absolute path to a data file */ export function getDataPath(...segments: string[]): string { - const projectRoot = getProjectRoot(); - - return path.join(projectRoot, 'src', 'data', ...segments); + return path.join(getDataRoot(), ...segments); } /** diff --git a/src/validators/email-validator.test.ts b/src/validators/email-validator.test.ts index ef2b046..488c03e 100644 --- a/src/validators/email-validator.test.ts +++ b/src/validators/email-validator.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'vitest'; -import { EmailValidator } from '../../src/validators/email-validator'; +import { EmailValidator } from '../../src/validators/email-validator.js'; describe('EmailValidator', () => { const validator = new EmailValidator(); diff --git a/src/validators/json-validator.test.ts b/src/validators/json-validator.test.ts index 62275f9..c462f66 100644 --- a/src/validators/json-validator.test.ts +++ b/src/validators/json-validator.test.ts @@ -1,5 +1,5 @@ import { describe, test, expect } from 'vitest'; -import { JSONValidator } from '../../src/validators/json-validator'; +import { JSONValidator } from '../../src/validators/json-validator.js'; describe('JSONValidator', () => { const validator = new JSONValidator();