/// <summary>
/// Saves time and space: links the word with index <paramref name="suggestionint"/> to a
/// delete entry while, for verbose levels below 2, keeping only the links with the lowest
/// distance. Distance is measured here as (word length - delete length).
/// </summary>
/// <param name="item">Delete entry whose suggestion list is updated in place.</param>
/// <param name="suggestion">The word that <paramref name="delete"/> was derived from.</param>
/// <param name="suggestionint">Index of <paramref name="suggestion"/> in Wordlist.</param>
/// <param name="delete">The delete string that owns <paramref name="item"/>.</param>
private void AddLowestDistance(DictionaryItem item, string suggestion, int suggestionint, string delete)
{
    var linked = item.Suggestions;

    // With verbose == 2 every link is kept; an empty list has nothing to compare against.
    if (verbose == 2 || linked.Count == 0)
    {
        linked.Add(suggestionint);
        return;
    }

    // index2word: the first stored link stands in for the distance of all stored links.
    int incumbentDistance = Wordlist[linked[0]].Length - delete.Length;
    int newcomerDistance = suggestion.Length - delete.Length;

    // The new word is strictly closer: drop the existing links of higher distance (verbose < 2 only).
    if (verbose < 2 && incumbentDistance > newcomerDistance)
    {
        linked.Clear();
        linked.Add(suggestionint);
        return;
    }

    // Do not add a link of higher distance than the ones already stored.
    if (incumbentDistance >= newcomerDistance)
    {
        linked.Add(suggestionint);
    }
}
/// <summary>
/// Looks up spelling suggestions for <paramref name="input"/>.
/// Deletes of the input (up to <paramref name="editDistanceMax"/> characters removed) are
/// generated breadth-first and probed in Dictionary; every hit contributes itself (when it
/// is a real word, i.e. Count is positive) and the words it links to.
/// </summary>
/// <param name="input">Term to correct.</param>
/// <param name="editDistanceMax">Maximum edit distance a suggestion may have.</param>
/// <returns>
/// The suggestions, shaped by the verbose field as implemented below:
/// 0 = at most one suggestion (highest count among those of the smallest distance found),
/// 1 = all suggestions of the smallest distance found, ordered by descending count,
/// 2 (or above) = all suggestions within the limit, ordered by ascending distance and
/// then by descending count.
/// </returns>
private List<SuggestItem> Lookup(string input, int editDistanceMax)
{
    // Save some time: no dictionary word is long enough to be within reach of the input.
    if (input.Length - editDistanceMax > Maxlength)
    {
        return new List<SuggestItem>();
    }

    // FIFO queue: candidates are processed in order of increasing number of deletes.
    // (A Queue gives O(1) dequeue; removing index 0 of a List is O(n).)
    var candidates = new Queue<string>();
    var seenDeletes = new HashSet<string>();
    var suggestions = new List<SuggestItem>();
    var seenSuggestions = new HashSet<string>();

    // Add the original term.
    candidates.Enqueue(input);

    while (candidates.Count > 0)
    {
        string candidate = candidates.Dequeue();
        int candidateDistance = input.Length - candidate.Length;

        // Early termination: a suggestion reached through this candidate has a distance of at
        // least candidateDistance, so once that exceeds the distance already found there are
        // no better suggestions to be expected.
        if (verbose < 2 && suggestions.Count > 0 && candidateDistance > suggestions[0].Distance)
        {
            break;
        }

        // Read the candidate entry from the dictionary. An entry is either a boxed int
        // (index of a single linked word) or a DictionaryItem.
        object rawEntry;
        if (Dictionary.TryGetValue(candidate, out rawEntry))
        {
            DictionaryItem entry;
            if (rawEntry is int)
            {
                entry = new DictionaryItem();
                entry.Suggestions.Add((int)rawEntry);
            }
            else
            {
                entry = (DictionaryItem)rawEntry;
            }

            // Count > 0 means the candidate is a correct dictionary term, not only a delete item.
            if (entry.Count > 0 && seenSuggestions.Add(candidate))
            {
                // Keep the "only the smallest distance" invariant for verbose < 2: a suggestion
                // collected earlier through a linked word may have a higher distance than this
                // direct hit, and the final sort for verbose < 2 looks at the count only.
                if (verbose < 2 && suggestions.Count > 0 && suggestions[0].Distance > candidateDistance)
                {
                    suggestions.Clear();
                }

                suggestions.Add(new SuggestItem(candidate, entry.Count, candidateDistance));

                // Early termination: exact match.
                if (verbose < 2 && candidateDistance == 0)
                {
                    break;
                }
            }

            // Iterate through the words linked from this entry and add them to the suggestion list.
            foreach (int suggestionint in entry.Suggestions)
            {
                // index2word
                string suggestion = Wordlist[suggestionint];

                // Skip duplicates early: different deletes of the input can lead to the same word.
                if (!seenSuggestions.Add(suggestion))
                {
                    continue;
                }

                int distance = DistanceViaSharedDelete(suggestion, input, candidate);

                if (distance > editDistanceMax)
                {
                    continue;
                }

                // Do not process higher distances than those already found, if verbose < 2.
                if (verbose < 2 && suggestions.Count > 0 && distance > suggestions[0].Distance)
                {
                    continue;
                }

                // Resolve the word's own entry before touching the result list, so a failed
                // lookup can never wipe suggestions that were already collected.
                object rawSuggestion;
                if (!Dictionary.TryGetValue(suggestion, out rawSuggestion))
                {
                    continue;
                }

                var suggestionEntry = rawSuggestion as DictionaryItem;
                if (suggestionEntry == null)
                {
                    continue;
                }

                // Remove all existing suggestions of higher distance, if verbose < 2.
                if (verbose < 2 && suggestions.Count > 0 && suggestions[0].Distance > distance)
                {
                    suggestions.Clear();
                }

                suggestions.Add(new SuggestItem(suggestion, suggestionEntry.Count, distance));
            }
        }

        // Derive further deletes from the candidate until the maximum edit distance is reached.
        if (candidateDistance < editDistanceMax)
        {
            // Save some time: deletes of this candidate can only yield suggestions of a distance
            // greater than candidateDistance, which cannot beat what was already found.
            if (verbose < 2 && suggestions.Count > 0 && candidateDistance >= suggestions[0].Distance)
            {
                continue;
            }

            for (int i = 0; i < candidate.Length; i++)
            {
                string delete = candidate.Remove(i, 1);
                if (seenDeletes.Add(delete))
                {
                    candidates.Enqueue(delete);
                }
            }
        }
    }

    if (verbose < 2)
    {
        // All kept suggestions share the same distance: order by descending word frequency.
        suggestions.Sort((x, y) => y.Count.CompareTo(x.Count));
    }
    else
    {
        // Ascending edit distance, then descending word frequency. Compared key by key so the
        // result does not depend on CompareTo returning exactly -1, 0 or 1.
        suggestions.Sort((x, y) =>
        {
            int byDistance = x.Distance.CompareTo(y.Distance);
            return byDistance != 0 ? byDistance : y.Count.CompareTo(x.Count);
        });
    }

    if (verbose == 0 && suggestions.Count > 1)
    {
        return suggestions.GetRange(0, 1);
    }

    return suggestions;
}

// Computes the edit distance between a linked word and the input, given the delete
// (candidate) they have in common.
// Deletes are allowed on both the dictionary word and the input at the same time. For
// replaces and adjacent transposes the resulting distance stays within the limit, but for
// inserts and deletes it might exceed it, so the real distance has to be calculated whenever
// both sides were edited.
// Example for a limit of 1: bank==bnak and bank==bink, but bank!=kanb, bank!=xban, bank!=baxn.
private int DistanceViaSharedDelete(string suggestion, string input, string candidate)
{
    if (suggestion == input)
    {
        return 0;
    }

    // Only one side was shortened: the distance is the number of deleted characters.
    if (suggestion.Length == candidate.Length)
    {
        return input.Length - candidate.Length;
    }

    if (input.Length == candidate.Length)
    {
        return suggestion.Length - candidate.Length;
    }

    // Common prefixes and suffixes are ignored; this speeds up the Damerau-Levenshtein
    // calculation without changing its result.
    int prefix = 0;
    while (prefix < suggestion.Length && prefix < input.Length && suggestion[prefix] == input[prefix])
    {
        prefix++;
    }

    int suffix = 0;
    while (suffix < suggestion.Length - prefix
           && suffix < input.Length - prefix
           && suggestion[suggestion.Length - suffix - 1] == input[input.Length - suffix - 1])
    {
        suffix++;
    }

    if (prefix > 0 || suffix > 0)
    {
        return DamerauLevenshteinDistance(
            suggestion.Substring(prefix, suggestion.Length - prefix - suffix),
            input.Substring(prefix, input.Length - prefix - suffix));
    }

    return DamerauLevenshteinDistance(suggestion, input);
}
/// <summary>
/// Registers one occurrence of <paramref name="key"/> in the dictionary.
/// For every word, all deletes within the configured edit distance are created and added to
/// the dictionary; every delete entry carries a suggestion list that points back to the
/// word(s) it was created from. The dictionary may be updated dynamically (word frequency
/// and new words) at any time by calling this method.
/// </summary>
/// <param name="key">The word taken from the corpus.</param>
/// <returns>
/// True when this call made <paramref name="key"/> a word for the first time (its count
/// became 1) and its deletes were generated; otherwise false.
/// </returns>
private bool CreateDictionaryEntry(string key)
{
    DictionaryItem value = null;
    object existing;

    if (Dictionary.TryGetValue(key, out existing))
    {
        if (existing is int)
        {
            // The key existed before only as a single delete of another word (stored as a
            // bare word index): promote it to a full item that keeps that link.
            value = new DictionaryItem();
            value.Suggestions.Add((int)existing);
            Dictionary[key] = value;
        }
        else
        {
            // Already a full item: either the word appeared before,
            // or the word equals a delete of another word.
            value = existing as DictionaryItem;
        }

        // Prevent overflow of the occurrence counter.
        if (value != null && value.Count < int.MaxValue)
        {
            value.Count++;
        }
    }
    else if (Wordlist.Count < int.MaxValue)
    {
        // Brand-new key; Wordlist indices are ints, so only add while an index is still free.
        value = new DictionaryItem();
        value.Count++;
        Dictionary.Add(key, value);

        if (key.Length > Maxlength)
        {
            Maxlength = key.Length;
        }
    }

    // Edits/suggestions are created only once, no matter how often the word occurs, and only
    // as soon as the word itself occurs in the corpus - even if the same term existed before
    // in the dictionary as an edit of another word.
    // (A threshold could be specified here for when a term occurs frequently enough in the
    // corpus to be considered a valid word for spelling correction.)
    if (value == null || value.Count != 1)
    {
        return false;
    }

    // word2index
    Wordlist.Add(key);
    int keyint = Wordlist.Count - 1;

    // Create the deletes and link each of them back to this word.
    foreach (var delete in Edits(key, 0, new HashSet<string>()))
    {
        object deleteEntry;
        if (!Dictionary.TryGetValue(delete, out deleteEntry))
        {
            // First time this delete is seen: store the bare word index to save space.
            Dictionary.Add(delete, keyint);
            continue;
        }

        // The delete already exists: it is either a word itself or a delete shared with
        // another word.
        DictionaryItem target;
        if (deleteEntry is int)
        {
            // A single delete existed before: transform the int into a DictionaryItem.
            target = new DictionaryItem();
            target.Suggestions.Add((int)deleteEntry);
            Dictionary[delete] = target;
        }
        else
        {
            target = deleteEntry as DictionaryItem;
        }

        if (target != null && !target.Suggestions.Contains(keyint))
        {
            AddLowestDistance(target, key, keyint, delete);
        }
    }

    return true;
}