/// <summary>
/// Finds suggested spellings for a word, using a fixed verbosity of
/// <c>Verbosity.Closest</c> and a fixed maximum edit distance of 2.
/// </summary>
/// <param name="inpt">The word being spell checked. It is lower-cased and every
/// occurrence of <c>Uyghur.Sozghuch</c> is removed before the lookup.</param>
/// <returns>
/// The terms of the suggestions found, in the order produced by <c>List&lt;SuggestItem&gt;.Sort()</c>
/// (per the original notes: ascending edit distance, then descending frequency).
/// When the active script is <c>Uyghur.YEZIQ.ULY</c> and the o→ö / u→ü / é→e variant of the
/// input passes <c>IsListed</c>, that variant is placed first. The list is empty when the
/// input is too long to match any dictionary word.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="inpt"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">The built-in maximum edit distance exceeds
/// <c>MaxDictionaryEditDistance</c>.</exception>
public override List<string> Lookup(string inpt)
{
    if (inpt == null)
    {
        throw new ArgumentNullException(nameof(inpt));
    }

    // verbosity=Top:     the suggestion with the highest term frequency among those of smallest edit distance
    // verbosity=Closest: all suggestions of smallest edit distance found, ordered by term frequency
    // verbosity=All:     all suggestions <= maxEditDistance, ordered by edit distance, then by term frequency
    //                    (slower, no early termination)
    Verbosity verbosity = SymSpell.Verbosity.Closest;
    int maxEditDistance = 2;

    // The lookup distance cannot exceed the distance the delete dictionary was built with.
    if (maxEditDistance > MaxDictionaryEditDistance)
    {
        throw new ArgumentOutOfRangeException(
            nameof(maxEditDistance),
            maxEditDistance,
            "The lookup edit distance must not exceed MaxDictionaryEditDistance.");
    }

    List<SuggestItem> suggestions = new List<SuggestItem>();
    List<string> resultTerms = new List<string>();

    // NOTE(review): ToLower() is culture-sensitive; left unchanged because the dictionary
    // keys may have been produced the same way - confirm before switching to ToLowerInvariant().
    string normalized = inpt.ToLower().Replace(Uyghur.Sozghuch, "");
    int inputLen = normalized.Length;

    // Early exit: the word is too long to possibly match any dictionary word.
    if (inputLen - maxEditDistance > maxLength)
    {
        return resultTerms;
    }

    // Deletes that were already queued as candidates.
    HashSet<string> consideredDeletes = new HashSet<string>();
    // Suggestions that were already evaluated.
    HashSet<string> consideredSuggestions = new HashSet<string>();

    // Quick look for an exact match.
    // NOTE(review): maxEditDistance2 is not lowered to 0 here, so a later distance-1
    // suggestion clears this exact match again (see the Closest case below) - confirm intended.
    long suggestionCount = 0;
    if (words.TryGetValue(normalized, out suggestionCount))
    {
        suggestions.Add(new SuggestItem(normalized, 0, suggestionCount));
    }
    consideredSuggestions.Add(normalized); // the input itself has been handled by the TryGetValue above

    int maxEditDistance2 = maxEditDistance;
    int candidatePointer = 0;
    List<string> candidates = new List<string>();

    // Seed the candidate list with the input, truncated to the indexed prefix length.
    int inputPrefixLen = inputLen;
    if (inputPrefixLen > prefixLength)
    {
        inputPrefixLen = prefixLength;
        candidates.Add(normalized.Substring(0, inputPrefixLen));
    }
    else
    {
        candidates.Add(normalized);
    }

    var distanceComparer = new EditDistance(normalized, this.distanceAlgorithm);

    while (candidatePointer < candidates.Count)
    {
        string candidate = candidates[candidatePointer++];
        int candidateLen = candidate.Length;
        int lengthDiff = inputPrefixLen - candidateLen;

        // Early termination: if the candidate's delete distance already exceeds the best
        // suggestion distance, no better suggestions can follow.
        if (lengthDiff > maxEditDistance2)
        {
            // Skip to the next candidate for Verbosity.All; stop for Top/Closest
            // (candidates are ordered by delete distance, so none are closer than the current one).
            if (verbosity == Verbosity.All)
            {
                continue;
            }
            break;
        }

        // Read the candidate's entry from the delete dictionary.
        string[] dictSuggestions;
        if (deletes.TryGetValue(GetStringHash(candidate), out dictSuggestions))
        {
            // Walk the dictionary words reachable from this delete and collect the valid ones.
            for (int i = 0; i < dictSuggestions.Length; i++)
            {
                var suggestion = dictSuggestions[i];
                int suggestionLen = suggestion.Length;

                if (suggestion == normalized)
                {
                    continue;
                }

                if ((Math.Abs(suggestionLen - inputLen) > maxEditDistance2)                // length difference exceeds allowed/current best distance
                    || (suggestionLen < candidateLen)                                        // belongs to a different delete; same bin only through a hash collision
                    || (suggestionLen == candidateLen && suggestion != candidate))           // equal length must mean equal string, otherwise a hash collision
                {
                    continue;
                }

                var suggPrefixLen = Math.Min(suggestionLen, prefixLength);
                if (suggPrefixLen > inputPrefixLen && (suggPrefixLen - candidateLen) > maxEditDistance2)
                {
                    continue;
                }

                // True Damerau-Levenshtein edit distance: adjust the distance if both sides were edited.
                // Simultaneous deletes of up to maxEditDistance are allowed on both the dictionary term
                // and the input term. For replaces and adjacent transposes the resulting distance stays
                // <= maxEditDistance; for inserts and deletes it may exceed it, so the real distance has
                // to be computed whenever both sides were edited.
                // Example (maxEditDistance=1): bank==bnak and bank==bink, but bank!=kanb, bank!=xban, bank!=baxn.
                int distance = 0;
                int min = 0;
                if (candidateLen == 0)
                {
                    // Suggestions sharing no characters with the input
                    // (inputLen <= maxEditDistance && suggestionLen <= maxEditDistance).
                    distance = Math.Max(inputLen, suggestionLen);
                    if (distance > maxEditDistance2 || !consideredSuggestions.Add(suggestion))
                    {
                        continue;
                    }
                }
                else if (suggestionLen == 1)
                {
                    if (normalized.IndexOf(suggestion[0]) < 0)
                    {
                        distance = inputLen;
                    }
                    else
                    {
                        distance = inputLen - 1;
                    }
                    if (distance > maxEditDistance2 || !consideredSuggestions.Add(suggestion))
                    {
                        continue;
                    }
                }
                else
                // If the number of edits in the prefix equals maxEditDistance AND the suffixes are not
                // identical, the edit distance exceeds maxEditDistance and no full distance calculation
                // is needed. (inputLen >= prefixLength) && (suggestionLen >= prefixLength)
                if ((prefixLength - maxEditDistance == candidateLen) && (((min = Math.Min(inputLen, suggestionLen) - prefixLength) > 1) && (normalized.Substring(inputLen + 1 - min) != suggestion.Substring(suggestionLen + 1 - min))) || ((min > 0) && (normalized[inputLen - min] != suggestion[suggestionLen - min]) && ((normalized[inputLen - min - 1] != suggestion[suggestionLen - min]) || (normalized[inputLen - min] != suggestion[suggestionLen - min - 1]))))
                {
                    continue;
                }
                else
                {
                    // DeleteInSuggestionPrefix is somewhat expensive and only pays off for Top or Closest.
                    if ((verbosity != Verbosity.All && !DeleteInSuggestionPrefix(candidate, candidateLen, suggestion, suggestionLen))
                        || !consideredSuggestions.Add(suggestion))
                    {
                        continue;
                    }
                    distance = distanceComparer.Compare(suggestion, maxEditDistance2);
                    if (distance < 0)
                    {
                        continue;
                    }
                }

                // Do not process distances higher than those already found when verbosity < All
                // (maxEditDistance2 always equals maxEditDistance for Verbosity.All).
                if (distance <= maxEditDistance2)
                {
                    suggestionCount = words[suggestion];
                    SuggestItem si = new SuggestItem(suggestion, distance, suggestionCount);
                    if (suggestions.Count > 0)
                    {
                        switch (verbosity)
                        {
                            case Verbosity.Closest:
                            {
                                // Distances are only computed up to the smallest distance found so far.
                                if (distance < maxEditDistance2)
                                {
                                    suggestions.Clear();
                                }
                                break;
                            }
                            case Verbosity.Top:
                            {
                                if (distance < maxEditDistance2 || suggestionCount > suggestions[0].count)
                                {
                                    maxEditDistance2 = distance;
                                    suggestions[0] = si;
                                }
                                continue; // next dictionary suggestion
                            }
                        }
                    }
                    if (verbosity != Verbosity.All)
                    {
                        maxEditDistance2 = distance;
                    }
                    suggestions.Add(si);
                }
            }
        }

        // Derive further deletes from the candidate and queue them; this repeats until the
        // maximum edit distance has been reached.
        if ((lengthDiff < maxEditDistance) && (candidateLen <= prefixLength))
        {
            // Do not create deletes whose distance is not smaller than that of suggestions already found.
            if (verbosity != Verbosity.All && lengthDiff >= maxEditDistance2)
            {
                continue;
            }

            for (int i = 0; i < candidateLen; i++)
            {
                string delete = candidate.Remove(i, 1);
                if (consideredDeletes.Add(delete))
                {
                    candidates.Add(delete);
                }
            }
        }
    }

    // Order the suggestions using SuggestItem's own comparison.
    if (suggestions.Count > 1)
    {
        suggestions.Sort();
    }

    foreach (SuggestItem item in suggestions)
    {
        resultTerms.Add(item.term);
    }

    if (gYeziq == Uyghur.YEZIQ.ULY)
    {
        string ulyVariant = normalized.Replace('o', 'ö').Replace('u', 'ü').Replace('é', 'e');
        if (IsListed(ulyVariant))
        {
            // Promote the variant to the front; remove any existing occurrence first so the
            // same term is never returned twice.
            resultTerms.Remove(ulyVariant);
            resultTerms.Insert(0, ulyVariant);
        }
    }

    return resultTerms;
}
/// <summary>
/// Finds suggested spellings for <paramref name="input"/> in the language-keyed dictionary.
/// </summary>
/// <param name="input">The word being spell checked.</param>
/// <param name="language">Prefix prepended to every term when it is looked up in <c>dictionary</c>.</param>
/// <param name="editDistanceMax">Maximum edit distance between the input and a suggestion.
/// It is not validated against the distance the dictionary was built with.</param>
/// <param name="verbose">
/// 0: return only the first suggestion after sorting by descending count;
/// 1: return all suggestions of the smallest edit distance found;
/// 2: return all suggestions within <paramref name="editDistanceMax"/>.
/// </param>
/// <returns>
/// The suggestions found. For <paramref name="verbose"/> &lt; 2 they are sorted by descending count,
/// otherwise by ascending edit distance and then descending count. Empty when the input is too
/// long to match any dictionary word.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
public List<SuggestItem> Lookup(string input, string language, int editDistanceMax, int verbose)
{
    if (input == null)
    {
        throw new ArgumentNullException(nameof(input));
    }

    // Early exit: the word is too long to possibly match any dictionary word.
    if (input.Length - editDistanceMax > maxLength)
    {
        return new List<SuggestItem>();
    }

    List<string> candidates = new List<string>();
    HashSet<string> seenDeletes = new HashSet<string>();
    List<SuggestItem> suggestions = new List<SuggestItem>();
    HashSet<string> seenSuggestions = new HashSet<string>();

    int editDistanceMax2 = editDistanceMax;
    int candidatePointer = 0;

    // Seed with the original term and, when it is longer than the prefix length, its prefix.
    candidates.Add(input);
    if (input.Length > lp)
    {
        candidates.Add(input.Substring(0, lp));
    }

    var distanceComparer = new EditDistance(EditDistance.DistanceAlgorithm.DamerauOSA);

    while (candidatePointer < candidates.Count)
    {
        string candidate = candidates[candidatePointer++];
        int lengthDiff = Math.Min(input.Length, lp) - candidate.Length;

        // Early termination: if the candidate's delete distance already exceeds the distance of
        // the suggestions found, no better suggestions can follow.
        if ((verbose < 2) && (suggestions.Count > 0) && (lengthDiff > suggestions[0].distance))
        {
            break;
        }

        // Read the candidate's entry from the dictionary.
        if (dictionary.TryGetValue(language + candidate, out int rawEntry))
        {
            // A non-negative value is a single suggestion index; a negative value encodes
            // an index into itemlist as -(index + 1).
            DictionaryItem entry;
            if (rawEntry >= 0)
            {
                entry = new DictionaryItem();
                entry.suggestions.Add(rawEntry);
            }
            else
            {
                entry = itemlist[-rawEntry - 1];
            }

            // count > 0 means the candidate is itself a dictionary term, not only a delete.
            if (entry.count > 0)
            {
                int distance = input.Length - candidate.Length;

                // Do not process distances higher than those already found when verbose < 2.
                if ((distance <= editDistanceMax)
                    && ((verbose == 2) || (suggestions.Count == 0) || (distance <= suggestions[0].distance))
                    && seenSuggestions.Add(candidate))
                {
                    // Drop existing suggestions of a higher distance when verbose < 2; otherwise a
                    // closer suggestion added here would end up behind farther ones and the result
                    // could be incomplete (e.g. "elove" not returning "love").
                    if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance))
                    {
                        suggestions.Clear();
                    }

                    SuggestItem si = new SuggestItem() { term = candidate, count = entry.count, distance = distance };
                    suggestions.Add(si);

                    // Early termination: an exact match cannot be improved on.
                    if ((verbose < 2) && (distance == 0))
                    {
                        break;
                    }
                }
            }

            // Walk the dictionary words reachable from this delete and collect the valid ones.
            foreach (int suggestionIndex in entry.suggestions)
            {
                // index -> word
                string suggestion = wordlist[suggestionIndex];

                // True Damerau-Levenshtein edit distance: adjust the distance if both sides were edited.
                // Simultaneous deletes of up to editDistanceMax are allowed on both the dictionary term
                // and the input term. For replaces and adjacent transposes the resulting distance stays
                // <= editDistanceMax; for inserts and deletes it may exceed it, so the real distance has
                // to be computed whenever both sides were edited.
                // Example (editDistanceMax=1): bank==bnak and bank==bink, but bank!=kanb, bank!=xban, bank!=baxn.
                int distance = 0;
                if (suggestion != input)
                {
                    int min = 0;
                    if (Math.Abs(suggestion.Length - input.Length) > editDistanceMax2)
                    {
                        continue;
                    }
                    else if (candidate.Length == 0)
                    {
                        // Suggestions sharing no characters with the input
                        // (input.Length <= editDistanceMax && suggestion.Length <= editDistanceMax).
                        if (!seenSuggestions.Add(suggestion))
                        {
                            continue;
                        }
                        distance = Math.Max(input.Length, suggestion.Length);
                    }
                    else
                    // If the number of edits in the prefix equals the maximum edit distance AND the
                    // suffixes are not identical, the edit distance exceeds editDistanceMax and no full
                    // distance calculation is needed. (input.Length >= lp) && (suggestion.Length >= lp)
                    if ((lp - editDistanceMax == candidate.Length) && (((min = Math.Min(input.Length, suggestion.Length) - lp) > 1) && (input.Substring(input.Length + 1 - min) != suggestion.Substring(suggestion.Length + 1 - min))) || ((min > 0) && (input[input.Length - min] != suggestion[suggestion.Length - min]) && ((input[input.Length - min - 1] != suggestion[suggestion.Length - min]) || (input[input.Length - min] != suggestion[suggestion.Length - min - 1]))))
                    {
                        continue;
                    }
                    else
                    // Edit distance of the remaining string (after the prefix).
                    if ((suggestion.Length == candidate.Length) && (input.Length <= lp))
                    {
                        if (!seenSuggestions.Add(suggestion))
                        {
                            continue;
                        }
                        distance = input.Length - candidate.Length;
                    }
                    else if ((input.Length == candidate.Length) && (suggestion.Length <= lp))
                    {
                        if (!seenSuggestions.Add(suggestion))
                        {
                            continue;
                        }
                        distance = suggestion.Length - candidate.Length;
                    }
                    else if (seenSuggestions.Add(suggestion))
                    {
                        distance = distanceComparer.Compare(input, suggestion, editDistanceMax2);
                        if (distance < 0)
                        {
                            // A negative result is treated as "beyond the allowed distance".
                            distance = editDistanceMax + 1;
                        }
                    }
                    else
                    {
                        continue;
                    }
                }
                else if (!seenSuggestions.Add(suggestion))
                {
                    continue;
                }

                // Do not process distances higher than those already found when verbose < 2.
                if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].distance))
                {
                    continue;
                }

                if (distance <= editDistanceMax)
                {
                    if (dictionary.TryGetValue(language + suggestion, out int suggestionEntry))
                    {
                        SuggestItem si = new SuggestItem()
                        {
                            term = suggestion,
                            count = itemlist[-suggestionEntry - 1].count,
                            distance = distance
                        };

                        // From now on, distances are only computed up to the smallest distance found.
                        if (verbose < 2)
                        {
                            editDistanceMax2 = distance;
                        }

                        // Drop existing suggestions of a higher distance when verbose < 2.
                        if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance))
                        {
                            suggestions.Clear();
                        }

                        suggestions.Add(si);
                    }
                }
            }
        }

        // Derive further deletes from the candidate and queue them; this repeats until the
        // maximum edit distance has been reached.
        if ((lengthDiff < editDistanceMax) && (candidate.Length <= lp))
        {
            // Do not create deletes whose distance is not smaller than that of suggestions already found.
            if ((verbose < 2) && (suggestions.Count > 0) && (lengthDiff >= suggestions[0].distance))
            {
                continue;
            }

            for (int i = 0; i < candidate.Length; i++)
            {
                string delete = candidate.Remove(i, 1);
                if (seenDeletes.Add(delete))
                {
                    candidates.Add(delete);
                }
            }
        }
    }

    // Sort by descending count (verbose < 2) or by ascending edit distance, then descending count.
    // The comparisons are chained explicitly because CompareTo only guarantees the sign of its
    // result, not a magnitude of exactly 1.
    if (verbose < 2)
    {
        suggestions.Sort((x, y) => y.count.CompareTo(x.count));
    }
    else
    {
        suggestions.Sort((x, y) =>
        {
            int byDistance = x.distance.CompareTo(y.distance);
            return byDistance != 0 ? byDistance : y.count.CompareTo(x.count);
        });
    }

    if ((verbose == 0) && (suggestions.Count > 1))
    {
        return suggestions.GetRange(0, 1);
    }

    return suggestions;
}
/// <summary>
/// Corrects a multi-word input string term by term. Each term is replaced by its best
/// suggestion, two adjacent terms may be merged into one word, and a single term may be
/// split into two words.
/// </summary>
/// <param name="input">The text to correct; it is split into terms with <c>ParseWords</c>.</param>
/// <param name="editDistanceMax">Maximum edit distance used for every per-term lookup.</param>
/// <returns>
/// A list holding exactly one <c>SuggestItem</c>: the corrected terms joined by single spaces,
/// the smallest count among the chosen parts, and the edit distance between the corrected
/// string and <paramref name="input"/>. When no terms are parsed the count stays at
/// <c>Int64.MaxValue</c>.
/// </returns>
public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
{
    // Parse the input string into single terms.
    string[] terms = ParseWords(input);

    List<SuggestItem> suggestions = new List<SuggestItem>();      // suggestions for the current term
    List<SuggestItem> suggestionParts = new List<SuggestItem>();  // chosen correction for each part of the line

    // Translate every term to its best suggestion; otherwise it remains unchanged.
    bool lastCombi = false;
    for (int i = 0; i < terms.Length; i++)
    {
        string term = terms[i];
        suggestions = Lookup(term, Verbosity.Top, editDistanceMax);

        // Combination check, always before the split check.
        if ((i > 0) && !lastCombi)
        {
            List<SuggestItem> suggestionsCombi = Lookup(terms[i - 1] + term, Verbosity.Top, editDistanceMax);
            if (suggestionsCombi.Count > 0)
            {
                SuggestItem best1 = suggestionParts[suggestionParts.Count - 1];
                SuggestItem best2;
                if (suggestions.Count > 0)
                {
                    best2 = suggestions[0];
                }
                else
                {
                    best2 = new SuggestItem() { term = term, distance = editDistanceMax + 1, count = 0 };
                }

                // Compare the merged word against the two separately corrected words.
                var distanceComparer1 = new EditDistance(terms[i - 1] + " " + term, this.distanceAlgorithm);
                int distance1 = distanceComparer1.Compare(best1.term + " " + best2.term, editDistanceMax);
                if ((distance1 >= 0) && (suggestionsCombi[0].distance + 1 < distance1))
                {
                    // The merged word wins: it replaces the previous part; +1 accounts for the removed space.
                    suggestionsCombi[0].distance++;
                    suggestionParts[suggestionParts.Count - 1] = suggestionsCombi[0];
                    lastCombi = true;
                    continue;
                }
            }
        }
        lastCombi = false;

        // Always split terms without a suggestion / never split terms with an exact suggestion /
        // never split single-character terms.
        if ((suggestions.Count > 0) && ((suggestions[0].distance == 0) || (term.Length == 1)))
        {
            // Choose the best suggestion.
            suggestionParts.Add(suggestions[0]);
            continue;
        }

        // No perfect suggestion: try splitting the word into pairs.
        List<SuggestItem> suggestionsSplit = new List<SuggestItem>();

        // The unsplit suggestion competes with the split candidates.
        if (suggestions.Count > 0)
        {
            suggestionsSplit.Add(suggestions[0]);
        }

        bool canSplit = term.Length > 1;
        if (canSplit)
        {
            for (int j = 1; j < term.Length; j++)
            {
                string part1 = term.Substring(0, j);
                string part2 = term.Substring(j);

                List<SuggestItem> suggestions1 = Lookup(part1, Verbosity.Top, editDistanceMax);
                if (suggestions1.Count == 0)
                {
                    continue;
                }
                if ((suggestions.Count > 0) && (suggestions[0].term == suggestions1[0].term))
                {
                    break; // the first split part corrects to the same word as the unsplit term
                }

                List<SuggestItem> suggestions2 = Lookup(part2, Verbosity.Top, editDistanceMax);
                if (suggestions2.Count == 0)
                {
                    continue;
                }
                if ((suggestions.Count > 0) && (suggestions[0].term == suggestions2[0].term))
                {
                    break; // the second split part corrects to the same word as the unsplit term
                }

                // Build the suggestion for this split pair.
                string splitTerm = suggestions1[0].term + " " + suggestions2[0].term;
                var distanceComparer2 = new EditDistance(term, this.distanceAlgorithm);
                int distance2 = distanceComparer2.Compare(splitTerm, editDistanceMax);
                if (distance2 < 0)
                {
                    // A negative result is treated as "beyond the allowed distance".
                    distance2 = editDistanceMax + 1;
                }

                SuggestItem suggestionSplit = new SuggestItem()
                {
                    term = splitTerm,
                    distance = distance2,
                    count = Math.Min(suggestions1[0].count, suggestions2[0].count)
                };
                suggestionsSplit.Add(suggestionSplit);

                // Early termination of the split search.
                if (suggestionSplit.distance == 1)
                {
                    break;
                }
            }
        }

        if (canSplit && suggestionsSplit.Count > 0)
        {
            // Select the best candidate: ascending edit distance, then descending count.
            // The comparisons are chained explicitly because CompareTo only guarantees the sign
            // of its result, not a magnitude of exactly 1.
            suggestionsSplit.Sort((x, y) =>
            {
                int byDistance = x.distance.CompareTo(y.distance);
                return byDistance != 0 ? byDistance : y.count.CompareTo(x.count);
            });
            suggestionParts.Add(suggestionsSplit[0]);
        }
        else
        {
            // Nothing usable found: keep the term unchanged.
            suggestionParts.Add(new SuggestItem() { term = term, count = 0, distance = editDistanceMax + 1 });
        }
    }

    // Assemble the corrected line from the chosen parts.
    SuggestItem suggestion = new SuggestItem();
    suggestion.count = Int64.MaxValue;
    List<string> partTerms = new List<string>(suggestionParts.Count);
    foreach (SuggestItem part in suggestionParts)
    {
        partTerms.Add(part.term);
        suggestion.count = Math.Min(suggestion.count, part.count);
    }
    suggestion.term = string.Join(" ", partTerms).TrimEnd();

    var distanceComparer3 = new EditDistance(suggestion.term, this.distanceAlgorithm);
    suggestion.distance = distanceComparer3.Compare(input, int.MaxValue);

    List<SuggestItem> suggestionsLine = new List<SuggestItem>();
    suggestionsLine.Add(suggestion);
    return suggestionsLine;
}