/// <summary>
/// Appends <paramref name="suggestionint"/> to the suggestion list of <paramref name="item"/>,
/// keeping only the closest suggestions when <c>Verbose</c> is below 2 (saves time and space).
/// </summary>
/// <param name="item">Dictionary item whose suggestion list is updated.</param>
/// <param name="suggestion">The term the delete was derived from.</param>
/// <param name="suggestionint">Index of <paramref name="suggestion"/> in <c>wordlist</c>.</param>
/// <param name="delete">The delete string that <paramref name="item"/> is stored under.</param>
private void AddLowestDistance(DictionaryItem item, string suggestion, int suggestionint, string delete)
{
    var known = item.Suggestions;

    // The distance between a term and the delete is the number of removed characters.
    int RemovedChars(string term) => term.Length - delete.Length;

    // Below verbosity 2: a closer suggestion evicts everything stored so far.
    if (Verbose < 2 && known.Count > 0 && RemovedChars(wordlist[known[0]]) > RemovedChars(suggestion))
    {
        known.Clear();
    }

    // A suggestion that is farther away than the stored ones is dropped,
    // unless verbosity is exactly 2 or the list is empty.
    bool rejected = Verbose != 2
        && known.Count != 0
        && RemovedChars(wordlist[known[0]]) < RemovedChars(suggestion);
    if (rejected)
    {
        return;
    }

    known.Add(suggestionint);
    known.TrimExcess();
}
/// <summary>
/// Registers <paramref name="delete"/> in <c>dictionary</c> as pointing to the word with index
/// <paramref name="keyint"/>.
/// </summary>
/// <remarks>
/// Dictionary values are encoded: a value &gt;= 0 is a single word index, a negative value
/// <c>v</c> refers to the <c>DictionaryItem</c> stored at <c>itemlist[-v - 1]</c>.
/// </remarks>
/// <param name="delete">The delete string used as dictionary key.</param>
/// <param name="keyint">Index of the originating word.</param>
private void CreateDelete(string delete, int keyint)
{
    // First time this delete is seen: store the bare word index.
    if (!dictionary.TryGetValue(delete, out int stored))
    {
        dictionary.Add(delete, keyint);
        return;
    }

    DictionaryItem entry;
    if (stored < 0)
    {
        // Already backed by a DictionaryItem.
        entry = itemlist[-stored - 1];
    }
    else
    {
        // A single index was stored before: promote it to a DictionaryItem
        // so that more than one suggestion can be kept.
        entry = new DictionaryItem();
        entry.Suggestions.Add(stored);
        itemlist.Add(entry);
        dictionary[delete] = -itemlist.Count;
    }

    if (!entry.Suggestions.Contains(keyint))
    {
        entry.Suggestions.Add(keyint);
    }
}
/// <summary>
/// Adds <paramref name="key"/> with frequency <paramref name="count"/> to the dictionary and,
/// the first time the word reaches the count threshold, registers all of its deletes.
/// </summary>
/// <remarks>
/// Every delete entry carries a suggestion list pointing back to the term(s) it was created from.
/// The dictionary may be updated at any time (word frequency and new words) by calling this method.
/// Dictionary values are encoded: a value &gt;= 0 is a single index into <c>wordlist</c>, a negative
/// value <c>v</c> refers to the <c>DictionaryItem</c> stored at <c>itemlist[-v - 1]</c>.
/// </remarks>
/// <param name="key">The word to add.</param>
/// <param name="count">Frequency to add to the word's running count.</param>
/// <returns><c>true</c> if the word was newly added to <c>wordlist</c>; otherwise <c>false</c>.</returns>
public bool CreateDictionaryEntry(string key, long count)
{
    // A threshold might be specified for when a term occurs often enough in the corpus
    // to be considered a valid word for spelling correction.
    int countTreshold = 1;
    long countPrevious = 0;
    bool result = false;
    DictionaryItem value;

    if (dictionary.TryGetValue(key, out int valueo))
    {
        if (valueo >= 0)
        {
            // New word, but an identical single delete existed before: promote it to an item.
            value = new DictionaryItem();
            value.Suggestions.Add(valueo);
            value.Suggestions.TrimExcess();
            itemlist.Add(value);
            dictionary[key] = -itemlist.Count;
        }
        else
        {
            // Existing item (the word appears several times, or it was a multi-suggestion delete).
            value = itemlist[-valueo - 1];
        }

        countPrevious = value.Count;

        // Saturating add. The former Math.Min(long.MaxValue, a + b) could not prevent overflow,
        // because the sum had already wrapped around before Math.Min looked at it.
        value.Count = (count > 0 && countPrevious > long.MaxValue - count)
            ? long.MaxValue
            : countPrevious + count;
    }
    else
    {
        // Completely new word.
        value = new DictionaryItem();
        value.Count = count;
        itemlist.Add(value);
        dictionary[key] = -itemlist.Count;
        if (key.Length > maxlength)
        {
            maxlength = key.Length;
        }
    }

    // Edits/suggestions are created only once, no matter how often the word occurs, and only
    // as soon as the word itself occurs in the corpus, even if the same term already existed
    // in the dictionary as an edit of another word.
    if (value.Count >= countTreshold && countPrevious < countTreshold)
    {
        // word -> index
        wordlist.Add(key);
        int keyint = wordlist.Count - 1;
        result = true;

        foreach (string delete in Edits(key, 0, new HashSet<string>()))
        {
            if (!dictionary.TryGetValue(delete, out int value2))
            {
                dictionary.Add(delete, keyint);
                continue;
            }

            // The delete already exists, either as a word itself or as a delete of another word.
            DictionaryItem di;
            if (value2 >= 0)
            {
                // A single index was stored before: promote it to a DictionaryItem.
                di = new DictionaryItem();
                di.Suggestions.Add(value2);
                di.Suggestions.TrimExcess();
                itemlist.Add(di);
                dictionary[delete] = -itemlist.Count;
            }
            else
            {
                di = itemlist[-value2 - 1];
            }

            if (!di.Suggestions.Contains(keyint))
            {
                AddLowestDistance(di, key, keyint, delete);
            }
        }
    }

    return result;
}
/// <summary>
/// Looks up spelling suggestions for <paramref name="input"/> by generating deletes of the
/// input and matching them against the pre-computed delete dictionary.
/// </summary>
/// <remarks>
/// When <c>Verbose</c> is below 2 only suggestions of the smallest distance found so far are
/// kept and the search terminates early; at verbosity 2 every suggestion within
/// <paramref name="editDistanceMax"/> is collected.
/// </remarks>
/// <param name="input">Term to correct; it is lower-cased and trimmed first.</param>
/// <param name="editDistanceMax">Maximum edit distance of returned suggestions.</param>
/// <returns>The suggestions as ordered by <c>ReturnSorted</c>; empty if nothing matched.</returns>
public List<SuggestItem> Lookup(string input, int editDistanceMax = 2)
{
    input = input.ToLower().Trim();

    // Save some time: no dictionary word can be within reach of an input this long.
    if (input.Length - editDistanceMax > maxlength)
    {
        return new List<SuggestItem>();
    }

    List<string> candidates = new List<string>();
    HashSet<string> hashset1 = new HashSet<string>();   // deletes already queued
    List<SuggestItem> suggestions = new List<SuggestItem>();
    HashSet<string> hashset2 = new HashSet<string>();   // terms already considered as suggestion

    // The candidate list is consumed through an index instead of RemoveAt(0),
    // which would shift the whole list on every iteration.
    int candidatePointer = 0;

    // Add the original term.
    candidates.Add(input);

    while (candidatePointer < candidates.Count)
    {
        string candidate = candidates[candidatePointer++];

        // Early termination: a suggestion's distance is at least the candidate's distance, so
        // once the candidate distance exceeds the best suggestion distance nothing better follows.
        if (Verbose < 2 && suggestions.Count > 0 && input.Length - candidate.Length > suggestions[0].Distance)
        {
            return ReturnSorted(suggestions);
        }

        // Read the candidate entry from the dictionary.
        if (dictionary.TryGetValue(candidate, out int valueo))
        {
            DictionaryItem value;
            if (valueo >= 0)
            {
                // Single word index: wrap it in a temporary item.
                value = new DictionaryItem();
                value.Suggestions.Add(valueo);
            }
            else
            {
                value = itemlist[-valueo - 1];
            }

            // Count > 0 means the candidate itself is a dictionary term, not only a delete.
            if (value.Count > 0 && hashset2.Add(candidate))
            {
                int distance = input.Length - candidate.Length;

                // Do not process higher distances than those already found, if Verbose < 2.
                if (Verbose == 2 || suggestions.Count == 0 || distance <= suggestions[0].Distance)
                {
                    // Remove all existing suggestions of higher distance, if Verbose < 2.
                    // Without this, farther suggestions could stay at the head of the list
                    // and the result could be incomplete (e.g. "elove" not returning "love").
                    if (Verbose < 2 && suggestions.Count > 0 && suggestions[0].Distance > distance)
                    {
                        suggestions.Clear();
                    }

                    // Add the correct dictionary term to the suggestion list.
                    SuggestItem si = new SuggestItem(candidate, value.Count, distance);
                    suggestions.Add(si);

                    // Early termination on an exact match.
                    if (Verbose < 2 && input.Length - candidate.Length == 0)
                    {
                        return ReturnSorted(suggestions);
                    }
                }
            }

            // Iterate through the suggestions (other dictionary terms) of the delete item.
            foreach (int suggestionint in value.Suggestions)
            {
                // index -> word
                string suggestion = wordlist[suggestionint];

                // Skip duplicates early: different deletes of the input can lead to the same suggestion.
                if (hashset2.Add(suggestion))
                {
                    // True Damerau-Levenshtein edit distance: deletes were applied to both the
                    // dictionary term and the input term. For replaces and adjacent transposes
                    // the resulting distance stays within editDistanceMax, but for inserts and
                    // deletes it may exceed it, so the real distance has to be computed.
                    // Example for editDistanceMax = 1: bank==bnak and bank==bink,
                    // but bank!=kanb, bank!=xban and bank!=baxn.
                    int distance = 0;
                    if (suggestion != input)
                    {
                        if (suggestion.Length == candidate.Length)
                        {
                            distance = input.Length - candidate.Length;
                        }
                        else if (input.Length == candidate.Length)
                        {
                            distance = suggestion.Length - candidate.Length;
                        }
                        else
                        {
                            // Common prefixes and suffixes are ignored; this speeds up the
                            // distance calculation without changing its result.
                            int ii = 0;
                            int jj = 0;
                            while (ii < suggestion.Length && ii < input.Length && suggestion[ii] == input[ii])
                            {
                                ii++;
                            }

                            while (jj < suggestion.Length - ii && jj < input.Length - ii
                                   && suggestion[suggestion.Length - jj - 1] == input[input.Length - jj - 1])
                            {
                                jj++;
                            }

                            if (ii > 0 || jj > 0)
                            {
                                distance = suggestion.Substring(ii, suggestion.Length - ii - jj)
                                    .DamerauLevenshteinDistance2(input.Substring(ii, input.Length - ii - jj));
                            }
                            else
                            {
                                distance = suggestion.DamerauLevenshteinDistance2(input);
                            }
                        }
                    }

                    // Do not process higher distances than those already found, if Verbose < 2.
                    if (Verbose < 2 && suggestions.Count > 0 && distance > suggestions[0].Distance)
                    {
                        continue;
                    }

                    if (distance <= editDistanceMax)
                    {
                        if (dictionary.TryGetValue(suggestion, out int value2))
                        {
                            SuggestItem si = new SuggestItem(suggestion, itemlist[-value2 - 1].Count, distance);

                            // Remove all existing suggestions of higher distance, if Verbose < 2.
                            if (Verbose < 2 && suggestions.Count > 0 && suggestions[0].Distance > distance)
                            {
                                suggestions.Clear();
                            }

                            suggestions.Add(si);
                        }
                    }
                }
            }
        }

        // Derive further deletes from the candidate and queue them, until the maximum
        // edit distance has been reached.
        if (input.Length - candidate.Length < editDistanceMax)
        {
            // Do not create deletes that cannot beat the suggestions already found.
            if (Verbose < 2 && suggestions.Count > 0 && input.Length - candidate.Length >= suggestions[0].Distance)
            {
                continue;
            }

            for (int i = 0; i < candidate.Length; i++)
            {
                string delete = candidate.Remove(i, 1);
                if (hashset1.Add(delete))
                {
                    candidates.Add(delete);
                }
            }
        }
    }

    // Per the original comment: sorted by ascending edit distance, then by descending word frequency.
    return ReturnSorted(suggestions);
}
/// <summary>
/// Adds the lower-cased <paramref name="word"/> with frequency <paramref name="count"/> to the
/// dictionary and, the first time the word reaches the count threshold, registers its deletes
/// (as produced by <c>EditsPrefix</c>) through <c>CreateDelete</c>.
/// </summary>
/// <remarks>
/// Every delete entry carries a suggestion list pointing back to the term(s) it was created from.
/// The dictionary may be updated at any time (word frequency and new words) by calling this method.
/// Dictionary values are encoded: a value &gt;= 0 is a single index into <c>wordlist</c>, a negative
/// value <c>v</c> refers to the <c>DictionaryItem</c> stored at <c>itemlist[-v - 1]</c>.
/// </remarks>
/// <param name="word">The word to add.</param>
/// <param name="count">Frequency to add to the word's running count.</param>
/// <returns><c>true</c> if the word was newly added to <c>wordlist</c>; otherwise <c>false</c>.</returns>
public bool AddRecord(string word, long count = 1)
{
    word = word.ToLower();

    // A threshold might be specified for when a term occurs often enough in the corpus
    // to be considered a valid word for spelling correction.
    int countTreshold = 1;
    long countPrevious = 0;
    bool result = false;
    DictionaryItem value;

    if (dictionary.TryGetValue(word, out int valueo))
    {
        if (valueo >= 0)
        {
            // New word, but an identical single delete existed before: promote it to an item.
            value = new DictionaryItem();
            value.Suggestions.Add(valueo);
            itemlist.Add(value);
            dictionary[word] = -itemlist.Count;
        }
        else
        {
            // Existing item (the word appears several times, or it was a multi-suggestion delete).
            value = itemlist[-valueo - 1];
        }

        countPrevious = value.Count;

        // Saturating add. The former Math.Min(long.MaxValue, a + b) could not prevent overflow,
        // because the sum had already wrapped around before Math.Min looked at it.
        value.Count = (count > 0 && countPrevious > long.MaxValue - count)
            ? long.MaxValue
            : countPrevious + count;
    }
    else
    {
        // Completely new word.
        value = new DictionaryItem { Count = count };
        itemlist.Add(value);
        dictionary[word] = -itemlist.Count;
        if (word.Length > maxlength)
        {
            maxlength = word.Length;
        }
    }

    // Edits/suggestions are created only once, no matter how often the word occurs, and only
    // as soon as the word itself occurs in the corpus, even if the same term already existed
    // in the dictionary as an edit of another word.
    if (value.Count >= countTreshold && countPrevious < countTreshold)
    {
        // word -> index
        wordlist.Add(word);
        int keyint = wordlist.Count - 1;
        result = true;

        foreach (string delete in EditsPrefix(word))
        {
            CreateDelete(delete, keyint);
        }
    }

    return result;
}
/// <summary>
/// Looks up spelling suggestions for <paramref name="word"/> by generating deletes of the
/// (prefix of the) input and matching them against the pre-computed delete dictionary.
/// </summary>
/// <remarks>
/// When <c>Verbose</c> is below 2 only suggestions of the smallest distance found so far are kept
/// and the search terminates early. <c>lp</c> is a field declared outside this block; from its
/// use here it appears to be the prefix length up to which deletes are generated (to be confirmed
/// against its declaration).
/// </remarks>
/// <param name="word">Term to correct; it is trimmed and lower-cased first.</param>
/// <param name="editDistance">Maximum edit distance of returned suggestions.</param>
/// <returns>The suggestions as ordered by <c>SortItems</c>; empty if nothing matched.</returns>
public List<SuggestItem> Lookup(string word, int editDistance = 2)
{
    word = word.Trim().ToLower();
    var editDistanceMax = editDistance;

    // Save some time: no dictionary word can be within reach of an input this long.
    if (word.Length - editDistanceMax > maxlength)
    {
        return(new List<SuggestItem>());
    }

    List<string> candidates = new List<string>();
    // Deletes that were already queued as candidates.
    HashSet<string> hashset1 = new HashSet<string>();
    List<SuggestItem> suggestions = new List<SuggestItem>();
    // Terms that were already considered as suggestion.
    HashSet<string> hashset2 = new HashSet<string>();
    // Upper bound handed to the distance calculation; shrinks to the best distance found when Verbose < 2.
    int editDistanceMax2 = editDistanceMax;
    // Index of the next candidate to process; the list is never shrunk.
    int candidatePointer = 0;

    // Add the original term.
    candidates.Add(word);

    while (candidatePointer < candidates.Count)
    {
        string candidate = candidates[candidatePointer++];
        // Number of deletes applied so far, measured against the input capped at lp characters.
        int lengthDiff = Math.Min(word.Length, lp) - candidate.Length;

        // Early termination: a suggestion's distance is at least the candidate's distance, so
        // once the candidate distance exceeds the best suggestion distance nothing better follows.
        if (Verbose < 2 && suggestions.Count > 0 && lengthDiff > suggestions[0].Distance)
        {
            return(SortItems(suggestions));
        }

        // Read the candidate entry from the dictionary.
        if (dictionary.TryGetValue(candidate, out int valueo))
        {
            // A value >= 0 is a single word index and is wrapped in a temporary item;
            // a negative value refers to the item at itemlist[-valueo - 1].
            DictionaryItem value = new DictionaryItem();
            if (valueo >= 0)
            {
                value.Suggestions.Add(valueo);
            }
            else
            {
                value = itemlist[-valueo - 1];
            }

            // Count > 0 means the candidate itself is a dictionary term, not only a delete.
            if (value.Count > 0)
            {
                int distance = word.Length - candidate.Length;

                // Do not process higher distances than those already found, if Verbose < 2.
                if (distance <= editDistanceMax && (Verbose == 2 || suggestions.Count == 0 || distance <= suggestions[0].Distance) && hashset2.Add(candidate))
                {
                    // Remove all existing suggestions of higher distance, if Verbose < 2.
                    // Without this, farther suggestions could stay at the head of the list and
                    // the result could be incomplete (e.g. "elove" not returning "love").
                    if (Verbose < 2 && suggestions.Count > 0 && suggestions[0].Distance > distance)
                    {
                        suggestions.Clear();
                    }

                    // Add the correct dictionary term to the suggestion list.
                    SuggestItem si = new SuggestItem(candidate, value.Count, distance);
                    suggestions.Add(si);

                    // Early termination on an exact match.
                    if (Verbose < 2 && distance == 0)
                    {
                        return(SortItems(suggestions));
                    }
                }
            }

            // Iterate through the suggestions (other dictionary terms) of the delete item.
            foreach (int suggestionint in value.Suggestions)
            {
                // index -> word
                string suggestion = wordlist[suggestionint];

                // True Damerau-Levenshtein edit distance: deletes were applied to both the
                // dictionary term and the input term. For replaces and adjacent transposes the
                // resulting distance stays within editDistanceMax, but for inserts and deletes it
                // may exceed it, so the real distance has to be computed.
                // Example for editDistanceMax = 1: bank==bnak and bank==bink,
                // but bank!=kanb, bank!=xban and bank!=baxn.
                int distance = 0;
                if (suggestion != word)
                {
                    int min = 0;

                    // The length difference alone already exceeds the current bound.
                    if (Math.Abs(suggestion.Length - word.Length) > editDistanceMax2)
                    {
                        continue;
                    }

                    if (candidate.Length == 0)
                    {
                        // Suggestions that have no common characters with the input
                        // (both no longer than editDistanceMax).
                        if (!hashset2.Add(suggestion))
                        {
                            continue;
                        }

                        distance = Math.Max(word.Length, suggestion.Length);
                    }
                    // If the number of edits in the prefix equals the maximum edit distance and the
                    // suffixes are not identical, the distance exceeds the maximum and no distance
                    // calculation is needed. Note: min starts at 0 and is only assigned inside the
                    // left-hand alternative, so the right-hand alternative (min > 0 ...) can only
                    // fire after the prefix-length test on the left has succeeded.
                    else if (lp - editDistanceMax == candidate.Length && (min = Math.Min(word.Length, suggestion.Length) - lp) > 1 && word.Substring(word.Length + 1 - min) != suggestion.Substring(suggestion.Length + 1 - min) || min > 0 && word[word.Length - min] != suggestion[suggestion.Length - min] && (word[word.Length - min - 1] != suggestion[suggestion.Length - min] || word[word.Length - min] != suggestion[suggestion.Length - min - 1]))
                    {
                        continue;
                    }
                    else
                    {
                        // Edit distance of the remaining string (after the prefix).
                        if (suggestion.Length == candidate.Length && word.Length <= lp)
                        {
                            // The suggestion equals the candidate: only deletes on the input side.
                            if (!hashset2.Add(suggestion))
                            {
                                continue;
                            }

                            distance = word.Length - candidate.Length;
                        }
                        else if (word.Length == candidate.Length && suggestion.Length <= lp)
                        {
                            // The input equals the candidate: only deletes on the suggestion side.
                            if (!hashset2.Add(suggestion))
                            {
                                continue;
                            }

                            distance = suggestion.Length - candidate.Length;
                        }
                        else if (hashset2.Add(suggestion))
                        {
                            distance = word.DamerauLevenshteinDistance(suggestion, editDistanceMax2);
                            // A negative result is mapped to "beyond the maximum" so the suggestion is
                            // rejected below (presumably the helper signals "over the bound" this way).
                            if (distance < 0)
                            {
                                distance = editDistanceMax + 1;
                            }
                        }
                        else
                        {
                            continue;
                        }
                    }
                }
                else if (!hashset2.Add(suggestion))
                {
                    continue;
                }

                // Do not process higher distances than those already found, if Verbose < 2.
                if (Verbose < 2 && suggestions.Count > 0 && distance > suggestions[0].Distance)
                {
                    continue;
                }

                if (distance <= editDistanceMax)
                {
                    if (dictionary.TryGetValue(suggestion, out int value2))
                    {
                        SuggestItem si = new SuggestItem(suggestion, itemlist[-value2 - 1].Count, distance);

                        // From now on distances are only calculated up to the smallest distance found so far.
                        if (Verbose < 2)
                        {
                            editDistanceMax2 = distance;
                        }

                        // Remove all existing suggestions of higher distance, if Verbose < 2.
                        if (Verbose < 2 && suggestions.Count > 0 && suggestions[0].Distance > distance)
                        {
                            suggestions.Clear();
                        }

                        suggestions.Add(si);
                    }
                }
            }
        }

        // Derive further deletes from the candidate and queue them, until the maximum
        // edit distance has been reached.
        if (lengthDiff < editDistanceMax)
        {
            // Do not create deletes that cannot beat the suggestions already found.
            if (Verbose < 2 && suggestions.Count > 0 && lengthDiff >= suggestions[0].Distance)
            {
                continue;
            }

            // Only the input entry itself can be longer than lp; deletes are taken from its prefix.
            if (candidate.Length > lp)
            {
                candidate = candidate.Substring(0, lp);
            }

            for (int i = 0; i < candidate.Length; i++)
            {
                string delete = candidate.Remove(i, 1);
                if (hashset1.Add(delete))
                {
                    candidates.Add(delete);
                }
            }
        }
    }

    return(SortItems(suggestions));
}