/// <summary>
/// Looks up spelling suggestions for a single term by probing <c>dictionary</c> with the
/// term itself and with every string obtainable from it by deleting up to
/// <paramref name="editDistanceMax"/> characters.
/// </summary>
/// <param name="input">Term to correct. It is lower-cased and trimmed before the lookup.</param>
/// <param name="editDistanceMax">Maximum edit distance a returned suggestion may have.</param>
/// <returns>
/// The suggestions found, passed through <c>ReturnSorted</c>; an empty list when the input is
/// more than <paramref name="editDistanceMax"/> characters longer than <c>maxlength</c>.
/// When <c>Verbose &lt; 2</c> only suggestions of the smallest distance found are kept.
/// </returns>
public List<SuggestItem> Lookup(string input, int editDistanceMax = 2) {
    input = input.ToLower().Trim();

    // Save some time: the input is too long to match anything within editDistanceMax
    // (maxlength is presumably the longest dictionary term length - confirm at its declaration).
    if (input.Length - editDistanceMax > maxlength) {
        return new List<SuggestItem>();
    }

    // FIFO work list of the input and its delete-variants. A Queue gives O(1) dequeue; the
    // previous List + RemoveAt(0) shifted the whole list for every processed candidate.
    var candidates = new Queue<string>();
    var seenDeletes = new HashSet<string>();      // delete-variants already queued
    var suggestions = new List<SuggestItem>();
    var seenSuggestions = new HashSet<string>();  // terms already evaluated as suggestions

    // Add the original term.
    candidates.Enqueue(input);

    while (candidates.Count > 0) {
        string candidate = candidates.Dequeue();

        // Number of characters deleted from the input to obtain this candidate.
        int lengthDiff = input.Length - candidate.Length;

        // Early termination: candidates are produced in order of increasing delete count, so
        // once the delete count exceeds the best distance found, nothing better can follow.
        if (Verbose < 2 && suggestions.Count > 0 && lengthDiff > suggestions[0].Distance) {
            return ReturnSorted(suggestions);
        }

        // Read the candidate entry from the dictionary.
        if (dictionary.TryGetValue(candidate, out int valueo)) {
            // A non-negative value is stored directly as the single suggestion index of a
            // fresh item; a negative value encodes an index into itemlist.
            DictionaryItem value = new DictionaryItem();
            if (valueo >= 0) {
                value.Suggestions.Add(valueo);
            } else {
                value = itemlist[-valueo - 1];
            }

            // Count > 0 means the candidate itself is a correct dictionary term,
            // not only a delete item.
            if (value.Count > 0 && seenSuggestions.Add(candidate)) {
                // Do not process higher distances than those already found, if Verbose < 2.
                if (Verbose == 2 || suggestions.Count == 0 || lengthDiff <= suggestions[0].Distance) {
                    // Remove all existing suggestions of higher distance, if Verbose < 2.
                    // Without this, worse suggestions stayed at the front of the list and
                    // results were incomplete (e.g. "elove" did not return "love").
                    if (Verbose < 2 && suggestions.Count > 0 && suggestions[0].Distance > lengthDiff) {
                        suggestions.Clear();
                    }

                    suggestions.Add(new SuggestItem(candidate, value.Count, lengthDiff));

                    // Early termination: an exact match cannot be beaten.
                    if (Verbose < 2 && lengthDiff == 0) {
                        return ReturnSorted(suggestions);
                    }
                }
            }

            // Iterate through the dictionary terms this delete item points to.
            foreach (int suggestionint in value.Suggestions) {
                string suggestion = wordlist[suggestionint]; // index -> word

                // Different deletes of the input can lead to the same suggestion; skip repeats.
                if (!seenSuggestions.Add(suggestion)) {
                    continue;
                }

                // True Damerau-Levenshtein distance: deletes are allowed on both the dictionary
                // term and the input at the same time. For replaces and adjacent transposes the
                // resulting distance stays <= editDistanceMax, but for inserts and deletes it
                // may exceed it, so the real distance has to be calculated.
                // Example for editDistanceMax = 1: bank==bnak and bank==bink, but bank!=kanb,
                // bank!=xban and bank!=baxn.
                int distance = 0;
                if (suggestion != input) {
                    if (suggestion.Length == candidate.Length) {
                        // Only the input had characters deleted.
                        distance = lengthDiff;
                    } else if (input.Length == candidate.Length) {
                        // Only the suggestion had characters deleted.
                        distance = suggestion.Length - candidate.Length;
                    } else {
                        // Common prefixes and suffixes are ignored: this speeds up the
                        // distance calculation without changing its result.
                        int ii = 0;
                        int jj = 0;
                        while (ii < suggestion.Length && ii < input.Length && suggestion[ii] == input[ii]) {
                            ii++;
                        }
                        while (jj < suggestion.Length - ii && jj < input.Length - ii
                               && suggestion[suggestion.Length - jj - 1] == input[input.Length - jj - 1]) {
                            jj++;
                        }

                        if (ii > 0 || jj > 0) {
                            distance = suggestion.Substring(ii, suggestion.Length - ii - jj)
                                .DamerauLevenshteinDistance2(input.Substring(ii, input.Length - ii - jj));
                        } else {
                            distance = suggestion.DamerauLevenshteinDistance2(input);
                        }
                    }
                }

                // Do not process higher distances than those already found, if Verbose < 2.
                if (Verbose < 2 && suggestions.Count > 0 && distance > suggestions[0].Distance) {
                    continue;
                }

                if (distance <= editDistanceMax) {
                    if (dictionary.TryGetValue(suggestion, out int value2)) {
                        SuggestItem si = new SuggestItem(suggestion, itemlist[-value2 - 1].Count, distance);

                        // Remove all existing suggestions of higher distance, if Verbose < 2.
                        if (Verbose < 2 && suggestions.Count > 0 && suggestions[0].Distance > distance) {
                            suggestions.Clear();
                        }
                        suggestions.Add(si);
                    }
                }
            }
        }

        // Derive further deletes from the candidate and queue them, until the maximum edit
        // distance has been reached.
        if (lengthDiff < editDistanceMax) {
            // Do not create deletes whose distance could not beat the suggestions already found.
            if (Verbose < 2 && suggestions.Count > 0 && lengthDiff >= suggestions[0].Distance) {
                continue;
            }

            for (int i = 0; i < candidate.Length; i++) {
                string delete = candidate.Remove(i, 1);
                if (seenDeletes.Add(delete)) {
                    candidates.Enqueue(delete);
                }
            }
        }
    }

    // Per the original comment: sorted by ascending edit distance, then by descending word frequency.
    return ReturnSorted(suggestions);
}
/// <summary>
/// Corrects a multi-word input string term by term. For each term it considers the best
/// single-term suggestion, merging the term with the previous one, and splitting the term
/// into two parts, and joins the chosen parts into one corrected line.
/// </summary>
/// <param name="input">Text to correct. It is lower-cased and trimmed before parsing.</param>
/// <param name="editDistanceMax">Maximum edit distance passed to every <c>Lookup</c> call.</param>
/// <returns>
/// A list holding exactly one <c>SuggestItem</c>: the corrected line, the minimum count over
/// its parts, and the distance between the corrected line and the normalized input.
/// </returns>
public List<SuggestItem> LookupCompound(string input, int editDistanceMax = 2) {
    input = input.ToLower().Trim();

    // Parse the input string into single terms.
    string[] termList1 = ParseWords(input).ToArray();

    var suggestions = new List<SuggestItem>();     // suggestions for the current term
    var suggestionParts = new List<SuggestItem>(); // chosen parts of the corrected line

    // True when the previous term was merged into its predecessor; prevents merging the
    // current term into that already-merged part.
    bool lastCombi = false;

    // Translate every term to its best suggestion, otherwise it remains unchanged.
    for (int i = 0; i < termList1.Length; i++) {
        suggestions = Lookup(termList1[i], editDistanceMax);

        // Combination check, always before the split check.
        if (i > 0 && !lastCombi) {
            List<SuggestItem> suggestionsCombi = Lookup(termList1[i - 1] + termList1[i], editDistanceMax);
            if (suggestionsCombi.Count > 0) {
                SuggestItem best1 = suggestionParts[suggestionParts.Count - 1];

                // Placeholder for a term without suggestion: arguments are
                // (term, count, distance), matching the other SuggestItem constructions here.
                SuggestItem best2 = suggestions.Count > 0
                    ? suggestions[0]
                    : new SuggestItem(termList1[i], 0, editDistanceMax + 1);

                // Distance of the two separately corrected terms from the two original terms.
                int separateDistance = (termList1[i - 1] + " " + termList1[i])
                    .DamerauLevenshteinDistance2(best1.Term + " " + best2.Term);

                // The merged suggestion costs one extra edit (the removed space).
                if (suggestionsCombi[0].Distance + 1 < separateDistance) {
                    suggestionsCombi[0].IncreaseDistance();
                    suggestionParts[suggestionParts.Count - 1] = suggestionsCombi[0];
                    lastCombi = true;

                    // Only this term is consumed by the merge; move on to the next term.
                    // (Leaving the loop here would drop all remaining terms.)
                    continue;
                }
            }
        }
        lastCombi = false;

        // Always split terms without suggestion / never split terms with a suggestion of
        // distance 0 / never split single-character terms.
        if (suggestions.Count > 0 && (suggestions[0].Distance == 0 || termList1[i].Length == 1)) {
            // Choose the best suggestion.
            suggestionParts.Add(suggestions[0]);
            continue;
        }

        // No perfect suggestion: try splitting the term into two parts.
        var suggestionsSplit = new List<SuggestItem>();

        // The unsplit best suggestion competes with the splits.
        if (suggestions.Count > 0) {
            suggestionsSplit.Add(suggestions[0]);
        }

        if (termList1[i].Length > 1) {
            for (int j = 1; j < termList1[i].Length; j++) {
                string part1 = termList1[i].Substring(0, j);
                string part2 = termList1[i].Substring(j);

                List<SuggestItem> suggestions1 = Lookup(part1, editDistanceMax);
                if (suggestions1.Count == 0) {
                    continue;
                }

                // Stop if the first part corrects to the same term as the whole word.
                if (suggestions.Count > 0 && suggestions[0].Term == suggestions1[0].Term) {
                    break;
                }

                List<SuggestItem> suggestions2 = Lookup(part2, editDistanceMax);
                if (suggestions2.Count == 0) {
                    continue;
                }

                // Stop if the second part corrects to the same term as the whole word.
                if (suggestions.Count > 0 && suggestions[0].Term == suggestions2[0].Term) {
                    break;
                }

                // Build the suggestion for this split pair.
                var suggestionSplitTerm = suggestions1[0].Term + " " + suggestions2[0].Term;
                var suggestionSplitDistance = termList1[i].DamerauLevenshteinDistance2(suggestionSplitTerm);
                var suggestionSplitCount = Math.Min(suggestions1[0].Count, suggestions2[0].Count);
                SuggestItem suggestionSplit = new SuggestItem(suggestionSplitTerm, suggestionSplitCount, suggestionSplitDistance);
                suggestionsSplit.Add(suggestionSplit);

                // Early termination: a split at distance 1 only added the space.
                if (suggestionSplit.Distance == 1) {
                    break;
                }
            }

            if (suggestionsSplit.Count > 0) {
                // Select the best suggestion: ascending distance, then descending count.
                suggestionsSplit.Sort((x, y) => {
                    int byDistance = x.Distance.CompareTo(y.Distance);
                    return byDistance != 0 ? byDistance : y.Count.CompareTo(x.Count);
                });
                suggestionParts.Add(suggestionsSplit[0]);
            } else {
                // Nothing found: keep the term unchanged, marked as beyond editDistanceMax.
                suggestionParts.Add(new SuggestItem(termList1[i], 0, editDistanceMax + 1));
            }
        } else {
            // Too short to split and no suggestion: keep the term unchanged.
            suggestionParts.Add(new SuggestItem(termList1[i], 0, editDistanceMax + 1));
        }
    }

    // Join the parts into one line; the line's count is the smallest count of any part.
    var suggestionCount = long.MaxValue;
    var terms = new List<string>(suggestionParts.Count);
    foreach (SuggestItem part in suggestionParts) {
        terms.Add(part.Term);
        suggestionCount = Math.Min(suggestionCount, part.Count);
    }

    var suggestionTerm = string.Join(" ", terms).TrimEnd();
    var suggestionDistance = suggestionTerm.DamerauLevenshteinDistance2(input);
    var suggestion = new SuggestItem(suggestionTerm, suggestionCount, suggestionDistance);

    return new List<SuggestItem> { suggestion };
}
/// <summary>
/// Looks up spelling suggestions for a single term. The term and the strings obtained by
/// deleting characters from its first <c>lp</c> characters are probed in <c>dictionary</c>;
/// terms reached that way are verified with a bounded Damerau-Levenshtein distance.
/// </summary>
/// <param name="word">Term to correct. It is trimmed and lower-cased before the lookup.</param>
/// <param name="editDistance">Maximum edit distance a returned suggestion may have.</param>
/// <returns>
/// The suggestions found, passed through <c>SortItems</c>; an empty list when the term is more
/// than <paramref name="editDistance"/> characters longer than <c>maxlength</c>.
/// When <c>Verbose &lt; 2</c> only suggestions of the smallest distance found are kept.
/// </returns>
public List <SuggestItem> Lookup(string word, int editDistance = 2) {
    word = word.Trim().ToLower();
    var editDistanceMax = editDistance;
    // Save some time: the term is too long to match anything within editDistanceMax.
    if (word.Length - editDistanceMax > maxlength) {
        return(new List <SuggestItem>());
    }
    // candidates is an append-only work list read through candidatePointer (nothing is removed).
    List <string> candidates = new List <string>();
    // hashset1: delete-variants already added to candidates.
    HashSet <string> hashset1 = new HashSet <string>();
    List <SuggestItem> suggestions = new List <SuggestItem>();
    // hashset2: terms already evaluated as suggestions.
    HashSet <string> hashset2 = new HashSet <string>();
    // Bound handed to the distance calculation; lowered to the best distance found when Verbose < 2.
    int editDistanceMax2 = editDistanceMax;
    int candidatePointer = 0;
    // Add the original term.
    candidates.Add(word);
    while (candidatePointer < candidates.Count) {
        string candidate = candidates[candidatePointer++];
        // Deletes applied so far, counted against at most the first lp characters of the term.
        // NOTE(review): lp is declared outside this block - presumably the indexed prefix length; confirm.
        int lengthDiff = Math.Min(word.Length, lp) - candidate.Length;
        // Early termination: if the candidate's delete count is already higher than the best
        // suggestion distance, no better suggestions are to be expected.
        if (Verbose < 2 && suggestions.Count > 0 && lengthDiff > suggestions[0]
            .Distance) {
            return(SortItems(suggestions));
        }
        // Read the candidate entry from the dictionary.
        if (dictionary.TryGetValue(candidate, out int valueo)) {
            // A non-negative value is stored directly as the single suggestion index of a fresh
            // item; a negative value encodes an index into itemlist.
            DictionaryItem value = new DictionaryItem();
            if (valueo >= 0) {
                value.Suggestions.Add(valueo);
            } else {
                value = itemlist[-valueo - 1];
            }
            // If Count > 0 the candidate entry is a correct dictionary term, not only a delete item.
            if (value.Count > 0) {
                int distance = word.Length - candidate.Length;
                // Do not process higher distances than those already found, if Verbose < 2.
                // hashset2.Add is evaluated last, so the candidate is only marked as seen
                // when the preceding checks pass.
                if (distance <= editDistanceMax && (Verbose == 2 || suggestions.Count == 0 || distance <= suggestions[0].Distance) && hashset2.Add(candidate)) {
                    // Fix: previously not all suggestions within the edit distance (Verbose=1) or
                    // the best suggestion (Verbose=0) were always returned, e.g. "elove" did not
                    // return "love". suggestions.Clear() was not executed in this branch when a
                    // suggestion with a lower edit distance was added here (for Verbose < 2), so
                    // suggestions with a higher edit distance could remain on top while the
                    // better ones were appended at the end. All of them were deleted later, once
                    // the other branch added a suggestion with a lower distance than the first
                    // list item. Returned suggestions were therefore not always complete.
                    // Remove all existing suggestions of higher distance, if Verbose < 2.
                    if (Verbose < 2 && suggestions.Count > 0 && suggestions[0]
                        .Distance > distance) {
                        suggestions.Clear();
                    }
                    // Add the correct dictionary term to the suggestion list.
                    SuggestItem si = new SuggestItem(
                        candidate, value.Count, distance);
                    suggestions.Add(si);
                    // Early termination: an exact match cannot be beaten.
                    if (Verbose < 2 && distance == 0) {
                        return(SortItems(suggestions));
                    }
                }
            }
            // Iterate through the suggestions (other correct dictionary terms) of the delete item
            // and add them to the suggestion list.
            foreach (int suggestionint in value.Suggestions) {
                // Index -> word. Different deletes of the input term can lead to the same
                // suggestion; repeats are skipped through hashset2 in the branches below.
                string suggestion = wordlist[suggestionint];
                // True Damerau-Levenshtein edit distance: adjust the distance if both distances > 0.
                // Simultaneous edits (deletes) of editDistanceMax are allowed on both the dictionary
                // term and the input term. For replaces and adjacent transposes the resulting edit
                // distance stays <= editDistanceMax; for inserts and deletes it might exceed it.
                // To prevent suggestions of a higher edit distance, the resulting distance has to
                // be calculated when there are simultaneous edits on both sides.
                // Example: bank==bnak and bank==bink, but bank!=kanb, bank!=xban and bank!=baxn
                // for editDistanceMax=1. Two deletes on each side make each pair equal, but the
                // first two pairs have edit distance 1, the others edit distance 2.
                int distance = 0;
                if (suggestion != word) {
                    // min stays 0 unless the prefix test in the big condition below assigns it.
                    int min = 0;
                    // The length difference alone already exceeds the current bound.
                    if (Math.Abs(suggestion.Length - word.Length) > editDistanceMax2) {
                        continue;
                    }
                    if (candidate.Length == 0) {
                        // Suggestions which have no common chars with the input
                        // (input.Length <= editDistanceMax && suggestion.Length <= editDistanceMax).
                        if (!hashset2.Add(suggestion)) {
                            continue;
                        }
                        distance = Math.Max(word.Length, suggestion.Length);
                    } else
                    // If the number of edits in the prefix == editDistanceMax AND there is no
                    // identical suffix, then the edit distance > editDistanceMax and no distance
                    // calculation is needed (input.Length >= lp && suggestion.Length >= lp).
                    // && binds tighter than ||, so the condition parses as
                    //   (A && B && C) || (D && E && (F || G))
                    // where A = prefix edits exhausted, B assigns min (length of the shorter
                    // string beyond lp) and requires min > 1, C = the last min-1 chars differ,
                    // D = min > 0, E/F/G = the char at offset min from the end differs and is not
                    // explained by a swap with its left neighbour.
                    // min is assigned only when A is true; otherwise it stays 0 and D is false.
                    if (lp - editDistanceMax == candidate.Length && (min = Math.Min(word.Length, suggestion.Length) - lp) > 1
                        && word.Substring(word.Length + 1 - min) != suggestion.Substring(suggestion.Length + 1 - min)
                        || min > 0 && word[word.Length - min] != suggestion[suggestion.Length - min]
                        && (word[word.Length - min - 1] != suggestion[suggestion.Length - min]
                        || word[word.Length - min] != suggestion[suggestion.Length - min - 1])) {
                        continue;
                    } else {
                        // Edit distance of the remaining string (after the prefix).
                        if (suggestion.Length == candidate.Length && word.Length <= lp) {
                            // Only the input had characters deleted.
                            if (!hashset2.Add(suggestion)) {
                                continue;
                            }
                            distance = word.Length - candidate.Length;
                        } else if (word.Length == candidate.Length && suggestion.Length <= lp) {
                            // Only the suggestion had characters deleted.
                            if (!hashset2.Add(suggestion)) {
                                continue;
                            }
                            distance = suggestion.Length - candidate.Length;
                        } else if (hashset2.Add(suggestion)) {
                            // Bounded distance calculation; a negative result is mapped to
                            // editDistanceMax + 1 so the suggestion is rejected below.
                            distance = word.DamerauLevenshteinDistance(suggestion, editDistanceMax2);
                            if (distance < 0) {
                                distance = editDistanceMax + 1;
                            }
                        } else {
                            continue;
                        }
                    }
                } else if (!hashset2.Add(suggestion)) {
                    // The suggestion equals the input (distance 0) but was already evaluated.
                    continue;
                }
                // Do not process higher distances than those already found, if Verbose < 2.
                if (Verbose < 2 && suggestions.Count > 0 && distance > suggestions[0]
                    .Distance) {
                    continue;
                }
                if (distance <= editDistanceMax) {
                    if (dictionary.TryGetValue(suggestion, out int value2)) {
                        SuggestItem si = new SuggestItem(
                            suggestion, itemlist[-value2 - 1]
                            .Count, distance);
                        // From now on, calculate distances only up to the smallest distance found so far.
                        if (Verbose < 2) {
                            editDistanceMax2 = distance;
                        }
                        // Remove all existing suggestions of higher distance, if Verbose < 2.
                        if (Verbose < 2 && suggestions.Count > 0 && suggestions[0]
                            .Distance > distance) {
                            suggestions.Clear();
                        }
                        suggestions.Add(si);
                    }
                }
            } // end foreach
        } // end if
        // Add edits: derive deletes from the candidate and add them to the candidates list.
        // This repeats until the maximum edit distance has been reached.
        if (lengthDiff < editDistanceMax) {
            // Do not create deletes whose distance could not beat the suggestions already found.
            if (Verbose < 2 && suggestions.Count > 0 && lengthDiff >= suggestions[0]
                .Distance) {
                continue;
            }
            // Only the original input entry can be longer than lp; deletes are derived from its prefix.
            if (candidate.Length > lp) {
                candidate = candidate.Substring(0, lp);
            }
            for (int i = 0; i < candidate.Length; i++) {
                string delete = candidate.Remove(i, 1);
                if (hashset1.Add(delete)) {
                    candidates.Add(delete);
                }
            }
        }
    } // end while
    return(SortItems(suggestions));
}