// Counts one occurrence of `key` for `language` and, when the entry has no term assigned yet,
// generates the word's deletes and stores them in the dictionary.
// Every delete entry has a suggestions list that points back to the term(s) it was created from.
// The dictionary may be updated at any time (word frequency and new words) by calling this method.
// Returns true only when the deletes were generated by this call.
private static bool CreateDictionaryEntry(string key, string language)
{
    string entryKey = language + key;
    dictionaryItem entry;
    if (!dictionary.TryGetValue(entryKey, out entry))
    {
        entry = new dictionaryItem();
        entry.count++;
        dictionary.Add(entryKey, entry);
    }
    else
    {
        // key already present: the word repeats, or it was stored before as a delete of another word
        entry.count++;
    }

    // term is assigned below when the deletes are generated, so a non-empty term
    // means that work has already been done for this word
    if (!string.IsNullOrEmpty(entry.term))
    {
        return false;
    }

    entry.term = key;

    foreach (editItem delete in Edits(key, 0, true))
    {
        // back-reference from the delete to the word it came from
        editItem backReference = new editItem();
        backReference.term = key;
        backReference.distance = delete.distance;

        string deleteKey = language + delete.term;
        dictionaryItem deleteEntry;
        if (!dictionary.TryGetValue(deleteKey, out deleteEntry))
        {
            deleteEntry = new dictionaryItem();
            deleteEntry.suggestions.Add(backReference);
            dictionary.Add(deleteKey, deleteEntry);
            continue;
        }

        // delete already known: it equals another word, or is shared with another word's deletes
        if (!deleteEntry.suggestions.Contains(backReference))
        {
            AddLowestDistance(deleteEntry.suggestions, backReference);
        }
    }

    return true;
}
// Counts one occurrence of `key` for `language`; when that brings the entry's count to exactly 1,
// generates the word's deletes and stores them in the dictionary.
// Every delete entry has a suggestions list that points back to the term(s) it was created from.
// The dictionary may be updated at any time (word frequency and new words) by calling this method.
// Returns true only when the deletes were generated by this call.
private static bool CreateDictionaryEntry(string key, string language)
{
    string entryKey = language + key;
    dictionaryItem entry;
    if (!dictionary.TryGetValue(entryKey, out entry))
    {
        entry = new dictionaryItem();
        entry.count++;
        dictionary.Add(entryKey, entry);
    }
    else
    {
        // key already present: the word repeats, or it was stored before as a delete of another word
        entry.count++;
    }

    // deletes are created only once, on the call that moves the count to 1
    if (entry.count != 1)
    {
        return false;
    }

    foreach (string delete in Edits(key, 0, new HashSet<string>()))
    {
        string deleteKey = language + delete;
        dictionaryItem deleteEntry;
        if (!dictionary.TryGetValue(deleteKey, out deleteEntry))
        {
            deleteEntry = new dictionaryItem();
            deleteEntry.suggestions.Add(key);
            dictionary.Add(deleteKey, deleteEntry);
            continue;
        }

        // delete already known: it equals another word, or is shared with another word's deletes
        if (!deleteEntry.suggestions.Contains(key))
        {
            AddLowestDistance(deleteEntry, key, delete);
        }
    }

    return true;
}
// Adds `suggestion` to the suggestions of `item` (the entry stored for `delete`), saving time and space.
// The length difference between a suggestion and `delete` serves as its distance.
// When verbose < 2, stored suggestions of higher distance are dropped first.
// A suggestion of higher distance than the stored one is not added unless verbose == 2.
private static void AddLowestDistance(dictionaryItem item, string suggestion, string delete)
{
    var stored = item.suggestions;

    // drop everything stored so far when the new suggestion is strictly closer
    if ((verbose < 2) && (stored.Count > 0))
    {
        if (stored[0].Length - delete.Length > suggestion.Length - delete.Length)
        {
            stored.Clear();
        }
    }

    bool accept = (verbose == 2) || (stored.Count == 0);
    if (!accept)
    {
        // only accept a suggestion that is at least as close as the first stored one
        accept = stored[0].Length - delete.Length >= suggestion.Length - delete.Length;
    }

    if (accept)
    {
        stored.Add(suggestion);
    }
}
// Adds the word index `suggestionint` (whose text is `suggestion`) to the suggestions of `item`,
// the entry stored for `delete`, saving time and space.
// Stored suggestions are indexes into wordlist; the length difference between a word and `delete`
// serves as its distance.
// When verbose < 2, stored suggestions of higher distance are dropped first.
// A suggestion of higher distance than the stored one is not added unless verbose == 2.
private static void AddLowestDistance(dictionaryItem item, string suggestion, Int32 suggestionint, string delete)
{
    var stored = item.suggestions;

    // drop everything stored so far when the new suggestion is strictly closer
    if ((verbose < 2) && (stored.Count > 0))
    {
        if (wordlist[stored[0]].Length - delete.Length > suggestion.Length - delete.Length)
        {
            stored.Clear();
        }
    }

    bool accept = (verbose == 2) || (stored.Count == 0);
    if (!accept)
    {
        // only accept a suggestion that is at least as close as the first stored one
        accept = wordlist[stored[0]].Length - delete.Length >= suggestion.Length - delete.Length;
    }

    if (!accept)
    {
        return;
    }

    stored.Add(suggestionint);
    stored.TrimExcess();
}
public static int maxlength = 0;//maximum dictionary term length

// Counts one occurrence of `key` for `language`; when that brings the entry's count to exactly 1,
// appends the word to wordlist and stores all of its deletes in the dictionary.
// Dictionary values are either a boxed Int32 (a delete with a single suggestion, stored as an
// index into wordlist) or a dictionaryItem (a word and/or a delete with a suggestions list).
// Every delete entry points back to the term(s) it was created from.
// The dictionary may be updated at any time (word frequency and new words) by calling this method.
// Returns true only when the deletes were generated by this call; returns false without
// changing anything when the key is new but wordlist cannot take another entry.
private static bool CreateDictionaryEntry(string key, string language)
{
    string entryKey = language + key;
    dictionaryItem value;
    object valueo;
    if (dictionary.TryGetValue(entryKey, out valueo))
    {
        if (valueo is Int32)
        {
            //the key existed before as a single delete: promote the index to a full dictionaryItem
            value = new dictionaryItem();
            value.suggestions.Add((Int32)valueo);
            dictionary[entryKey] = value;
        }
        else
        {
            //already exists:
            //1. word appears several times
            //2. word1==deletes(word2)
            value = (dictionaryItem)valueo;
        }

        //prevent overflow
        if (value.count < Int32.MaxValue)
        {
            value.count++;
        }
    }
    else if (wordlist.Count < Int32.MaxValue)
    {
        value = new dictionaryItem();
        value.count++;
        dictionary.Add(entryKey, value);
        if (key.Length > maxlength)
        {
            maxlength = key.Length;
        }
    }
    else
    {
        //no entry was created; previously this path fell through and dereferenced a null item
        return false;
    }

    //edits/suggestions are created only once, no matter how often the word occurs:
    //on the call that moves the count to 1, even if the same key existed before as a delete of another word
    if (value.count != 1)
    {
        return false;
    }

    //word2index
    wordlist.Add(key);
    Int32 keyint = wordlist.Count - 1;

    //create deletes
    foreach (string delete in Edits(key, 0, new HashSet<string>()))
    {
        string deleteKey = language + delete;
        object value2;
        if (!dictionary.TryGetValue(deleteKey, out value2))
        {
            //first occurrence of this delete: store just the word index
            dictionary.Add(deleteKey, keyint);
            continue;
        }

        //already exists:
        //1. word1==deletes(word2)
        //2. deletes(word1)==deletes(word2)
        dictionaryItem di;
        if (value2 is Int32)
        {
            //a single delete existed before: transform the int into a dictionaryItem
            di = new dictionaryItem();
            di.suggestions.Add((Int32)value2);
            dictionary[deleteKey] = di;
        }
        else
        {
            di = (dictionaryItem)value2;
        }

        if (!di.suggestions.Contains(keyint))
        {
            AddLowestDistance(di, key, keyint, delete);
        }
    }

    return true;
}
// Returns spelling suggestions for `input` in `language` whose edit distance is <= editDistanceMax.
// Deletes of the input are generated breadth-first (fewest deletes first) and looked up in the dictionary.
// A dictionary value is either a boxed Int32 (single suggestion, index into wordlist) or a dictionaryItem.
// verbose < 2 : only suggestions of the lowest distance found are kept, sorted by descending count;
//               verbose == 0 additionally returns at most one suggestion.
// otherwise   : all suggestions found are returned, sorted by ascending distance, then descending count.
private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
{
    //save some time
    if (input.Length - editDistanceMax > maxlength)
    {
        return new List<suggestItem>();
    }

    //FIFO queue: the original removed index 0 of a List, which shifts all remaining items on every step
    Queue<string> candidates = new Queue<string>();
    HashSet<string> seenDeletes = new HashSet<string>();
    HashSet<string> seenTerms = new HashSet<string>();
    List<suggestItem> suggestions = new List<suggestItem>();

    //add original term
    candidates.Enqueue(input);

    while (candidates.Count > 0)
    {
        string candidate = candidates.Dequeue();
        //number of characters deleted from the input to obtain this candidate
        int candidateDistance = input.Length - candidate.Length;

        //early termination:
        //if the candidate distance is already higher than the suggestion distance, no better suggestions are to be expected
        if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance > suggestions[0].distance))
        {
            break;
        }

        //read candidate entry from dictionary
        object valueo;
        if (dictionary.TryGetValue(language + candidate, out valueo))
        {
            dictionaryItem value;
            if (valueo is Int32)
            {
                //single delete: wrap the word index so both representations are handled alike below
                value = new dictionaryItem();
                value.suggestions.Add((Int32)valueo);
            }
            else
            {
                value = (dictionaryItem)valueo;
            }

            //if count>0 then the candidate entry is a correct dictionary term, not only a delete item
            if ((value.count > 0) && seenTerms.Add(candidate))
            {
                //remove all existing suggestions of higher distance, if verbose<2;
                //without this a closer word was appended behind farther ones and the result mixed distances
                if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > candidateDistance))
                {
                    suggestions.Clear();
                }

                suggestItem si = new suggestItem();
                si.term = candidate;
                si.count = value.count;
                si.distance = candidateDistance;
                suggestions.Add(si);

                //early termination: exact match
                if ((verbose < 2) && (candidateDistance == 0))
                {
                    break;
                }
            }

            //iterate through suggestions (to other correct dictionary items) of the delete item
            foreach (int suggestionint in value.suggestions)
            {
                //index2word
                string suggestion = wordlist[suggestionint];

                //skip double items early: different deletes of the input term can lead to the same suggestion
                if (!seenTerms.Add(suggestion))
                {
                    continue;
                }

                //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                //We allow simultaneous edits (deletes) of editDistanceMax on both the dictionary and the input term.
                //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
                //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
                //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMax=1)
                int distance = 0;
                if (suggestion != input)
                {
                    if (suggestion.Length == candidate.Length)
                    {
                        distance = candidateDistance;
                    }
                    else if (input.Length == candidate.Length)
                    {
                        distance = suggestion.Length - candidate.Length;
                    }
                    else
                    {
                        //common prefixes and suffixes are ignored: this speeds up the distance calculation without changing it
                        int ii = 0;
                        int jj = 0;
                        while ((ii < suggestion.Length) && (ii < input.Length) && (suggestion[ii] == input[ii]))
                        {
                            ii++;
                        }
                        while ((jj < suggestion.Length - ii) && (jj < input.Length - ii) && (suggestion[suggestion.Length - jj - 1] == input[input.Length - jj - 1]))
                        {
                            jj++;
                        }

                        if ((ii > 0) || (jj > 0))
                        {
                            distance = DamerauLevenshteinDistance(suggestion.Substring(ii, suggestion.Length - ii - jj), input.Substring(ii, input.Length - ii - jj));
                        }
                        else
                        {
                            distance = DamerauLevenshteinDistance(suggestion, input);
                        }
                    }
                }

                //suggestions beyond the allowed distance are never reported
                if (distance > editDistanceMax)
                {
                    continue;
                }

                //remove all existing suggestions of higher distance, if verbose<2
                if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance))
                {
                    suggestions.Clear();
                }
                //do not process higher distances than those already found, if verbose<2
                if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].distance))
                {
                    continue;
                }

                object value2;
                if (dictionary.TryGetValue(language + suggestion, out value2))
                {
                    //only a dictionaryItem carries a count; anything else is skipped instead of dereferencing null
                    dictionaryItem suggestionItem = value2 as dictionaryItem;
                    if (suggestionItem != null)
                    {
                        suggestItem si = new suggestItem();
                        si.term = suggestion;
                        si.count = suggestionItem.count;
                        si.distance = distance;
                        suggestions.Add(si);
                    }
                }
            }//end foreach
        }//end if

        //add edits
        //derive edits (deletes) from candidate (input) and add them to the candidates queue
        //this repeats until the maximum edit distance has been reached
        if (candidateDistance < editDistanceMax)
        {
            //save some time
            //do not create further deletes once suggestions at this distance or closer were found, if verbose<2
            if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance >= suggestions[0].distance))
            {
                continue;
            }

            for (int i = 0; i < candidate.Length; i++)
            {
                string delete = candidate.Remove(i, 1);
                if (seenDeletes.Add(delete))
                {
                    candidates.Enqueue(delete);
                }
            }
        }
    }//end while

    //sort by ascending edit distance, then by descending word frequency
    //(comparers return a CompareTo result directly instead of doing arithmetic on it)
    if (verbose < 2)
    {
        suggestions.Sort((x, y) => y.count.CompareTo(x.count));
    }
    else
    {
        suggestions.Sort((x, y) =>
        {
            int byDistance = x.distance.CompareTo(y.distance);
            return (byDistance != 0) ? byDistance : y.count.CompareTo(x.count);
        });
    }

    if ((verbose == 0) && (suggestions.Count > 1))
    {
        return suggestions.GetRange(0, 1);
    }
    return suggestions;
}
// Adds the word index `suggestionint` (whose text is `suggestion`) to the suggestions of `item`,
// the entry stored for `delete`, saving time and space.
// Stored suggestions are indexes into wordlist; the length difference between a word and `delete`
// serves as its distance.
// When verbose < 2, stored suggestions of higher distance are dropped first.
// A suggestion of higher distance than the stored one is not added unless verbose == 2.
private static void AddLowestDistance(dictionaryItem item, string suggestion, Int32 suggestionint, string delete)
{
    var stored = item.suggestions;

    // drop everything stored so far when the new suggestion is strictly closer
    if ((verbose < 2) && (stored.Count > 0))
    {
        if (wordlist[stored[0]].Length - delete.Length > suggestion.Length - delete.Length)
        {
            stored.Clear();
        }
    }

    bool accept = (verbose == 2) || (stored.Count == 0);
    if (!accept)
    {
        // only accept a suggestion that is at least as close as the first stored one
        accept = wordlist[stored[0]].Length - delete.Length >= suggestion.Length - delete.Length;
    }

    if (accept)
    {
        stored.Add(suggestionint);
    }
}
// Counts one occurrence of `key` for `language` and, when the entry has no term assigned yet,
// generates the word's deletes and stores them in the dictionary.
// Every delete entry has a suggestions list that points back to the term(s) it was created from.
// The dictionary may be updated at any time (word frequency and new words) by calling this method.
// Returns true only when the deletes were generated by this call.
private static bool CreateDictionaryEntry(string key, string language)
{
    string wordKey = language + key;
    dictionaryItem wordEntry;
    bool known = dictionary.TryGetValue(wordKey, out wordEntry);
    if (known)
    {
        // key already present: the word repeats, or it was stored before as a delete of another word
        wordEntry.count++;
    }
    else
    {
        wordEntry = new dictionaryItem();
        wordEntry.count++;
        dictionary.Add(wordKey, wordEntry);
    }

    // term is assigned below when the deletes are generated, so a non-empty term
    // means that work has already been done for this word
    bool deletesPending = string.IsNullOrEmpty(wordEntry.term);
    if (!deletesPending)
    {
        return false;
    }

    wordEntry.term = key;

    foreach (editItem delete in Edits(key, 0, true))
    {
        // back-reference from the delete to the word it came from
        editItem origin = new editItem();
        origin.term = key;
        origin.distance = delete.distance;

        string deleteKey = language + delete.term;
        dictionaryItem deleteEntry;
        if (dictionary.TryGetValue(deleteKey, out deleteEntry))
        {
            // delete already known: it equals another word, or is shared with another word's deletes
            if (!deleteEntry.suggestions.Contains(origin))
            {
                AddLowestDistance(deleteEntry.suggestions, origin);
            }
        }
        else
        {
            deleteEntry = new dictionaryItem();
            deleteEntry.suggestions.Add(origin);
            dictionary.Add(deleteKey, deleteEntry);
        }
    }

    return true;
}
public static int maxlength = 0;//maximum dictionary term length

// Counts one occurrence of `key` for `language`; when that brings the entry's count to exactly 1,
// appends the word to wordlist and stores all of its deletes in the dictionary.
// Dictionary values are either a boxed Int32 (a delete with a single suggestion, stored as an
// index into wordlist) or a dictionaryItem (a word and/or a delete with a suggestions list).
// Every delete entry points back to the term(s) it was created from.
// The dictionary may be updated at any time (word frequency and new words) by calling this method.
// Returns true only when the deletes were generated by this call; returns false without
// changing anything when the key is new but wordlist cannot take another entry.
private static bool CreateDictionaryEntry(string key, string language)
{
    string entryKey = language + key;
    dictionaryItem value;
    object valueo;
    if (dictionary.TryGetValue(entryKey, out valueo))
    {
        if (valueo is Int32)
        {
            //the key existed before as a single delete: promote the index to a full dictionaryItem
            value = new dictionaryItem();
            value.suggestions.Add((Int32)valueo);
            dictionary[entryKey] = value;
        }
        else
        {
            //already exists:
            //1. word appears several times
            //2. word1==deletes(word2)
            value = (dictionaryItem)valueo;
        }

        //prevent overflow
        if (value.count < Int32.MaxValue)
        {
            value.count++;
        }
    }
    else if (wordlist.Count < Int32.MaxValue)
    {
        value = new dictionaryItem();
        value.count++;
        dictionary.Add(entryKey, value);
        if (key.Length > maxlength)
        {
            maxlength = key.Length;
        }
    }
    else
    {
        //no entry was created; previously this path fell through and dereferenced a null item
        return false;
    }

    //edits/suggestions are created only once, no matter how often the word occurs:
    //on the call that moves the count to 1, even if the same key existed before as a delete of another word
    if (value.count != 1)
    {
        return false;
    }

    //word2index
    wordlist.Add(key);
    Int32 keyint = wordlist.Count - 1;

    //create deletes
    foreach (string delete in Edits(key, 0, new HashSet<string>()))
    {
        string deleteKey = language + delete;
        object value2;
        if (!dictionary.TryGetValue(deleteKey, out value2))
        {
            //first occurrence of this delete: store just the word index
            dictionary.Add(deleteKey, keyint);
            continue;
        }

        //already exists:
        //1. word1==deletes(word2)
        //2. deletes(word1)==deletes(word2)
        dictionaryItem di;
        if (value2 is Int32)
        {
            //a single delete existed before: transform the int into a dictionaryItem
            di = new dictionaryItem();
            di.suggestions.Add((Int32)value2);
            dictionary[deleteKey] = di;
        }
        else
        {
            di = (dictionaryItem)value2;
        }

        if (!di.suggestions.Contains(keyint))
        {
            AddLowestDistance(di, key, keyint, delete);
        }
    }

    return true;
}
// Returns spelling suggestions for `input` in `language` whose edit distance is <= editDistanceMax.
// Deletes of the input are generated breadth-first (fewest deletes first) and looked up in the dictionary.
// A dictionary value is either a boxed Int32 (single suggestion, index into wordlist) or a dictionaryItem.
// verbose < 2 : only suggestions of the lowest distance found are kept, sorted by descending count;
//               verbose == 0 additionally returns at most one suggestion.
// otherwise   : all suggestions found are returned, sorted by ascending distance, then descending count.
private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
{
    //save some time
    if (input.Length - editDistanceMax > maxlength)
    {
        return new List<suggestItem>();
    }

    //FIFO queue: the original removed index 0 of a List, which shifts all remaining items on every step
    Queue<string> candidates = new Queue<string>();
    HashSet<string> seenDeletes = new HashSet<string>();
    HashSet<string> seenTerms = new HashSet<string>();
    List<suggestItem> suggestions = new List<suggestItem>();

    //add original term
    candidates.Enqueue(input);

    while (candidates.Count > 0)
    {
        string candidate = candidates.Dequeue();
        //number of characters deleted from the input to obtain this candidate
        int candidateDistance = input.Length - candidate.Length;

        //early termination:
        //if the candidate distance is already higher than the suggestion distance, no better suggestions are to be expected
        if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance > suggestions[0].distance))
        {
            break;
        }

        //read candidate entry from dictionary
        object valueo;
        if (dictionary.TryGetValue(language + candidate, out valueo))
        {
            dictionaryItem value;
            if (valueo is Int32)
            {
                //single delete: wrap the word index so both representations are handled alike below
                value = new dictionaryItem();
                value.suggestions.Add((Int32)valueo);
            }
            else
            {
                value = (dictionaryItem)valueo;
            }

            //if count>0 then the candidate entry is a correct dictionary term, not only a delete item
            if ((value.count > 0) && seenTerms.Add(candidate))
            {
                //remove all existing suggestions of higher distance, if verbose<2;
                //without this a closer word was appended behind farther ones and the result mixed distances
                if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > candidateDistance))
                {
                    suggestions.Clear();
                }

                suggestItem si = new suggestItem();
                si.term = candidate;
                si.count = value.count;
                si.distance = candidateDistance;
                suggestions.Add(si);

                //early termination: exact match
                if ((verbose < 2) && (candidateDistance == 0))
                {
                    break;
                }
            }

            //iterate through suggestions (to other correct dictionary items) of the delete item
            foreach (int suggestionint in value.suggestions)
            {
                //index2word
                string suggestion = wordlist[suggestionint];

                //skip double items early: different deletes of the input term can lead to the same suggestion
                if (!seenTerms.Add(suggestion))
                {
                    continue;
                }

                //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                //We allow simultaneous edits (deletes) of editDistanceMax on both the dictionary and the input term.
                //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
                //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
                //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMax=1)
                int distance = 0;
                if (suggestion != input)
                {
                    if (suggestion.Length == candidate.Length)
                    {
                        distance = candidateDistance;
                    }
                    else if (input.Length == candidate.Length)
                    {
                        distance = suggestion.Length - candidate.Length;
                    }
                    else
                    {
                        //common prefixes and suffixes are ignored: this speeds up the distance calculation without changing it
                        int ii = 0;
                        int jj = 0;
                        while ((ii < suggestion.Length) && (ii < input.Length) && (suggestion[ii] == input[ii]))
                        {
                            ii++;
                        }
                        while ((jj < suggestion.Length - ii) && (jj < input.Length - ii) && (suggestion[suggestion.Length - jj - 1] == input[input.Length - jj - 1]))
                        {
                            jj++;
                        }

                        if ((ii > 0) || (jj > 0))
                        {
                            distance = DamerauLevenshteinDistance(suggestion.Substring(ii, suggestion.Length - ii - jj), input.Substring(ii, input.Length - ii - jj));
                        }
                        else
                        {
                            distance = DamerauLevenshteinDistance(suggestion, input);
                        }
                    }
                }

                //suggestions beyond the allowed distance are never reported
                if (distance > editDistanceMax)
                {
                    continue;
                }

                //remove all existing suggestions of higher distance, if verbose<2
                if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance))
                {
                    suggestions.Clear();
                }
                //do not process higher distances than those already found, if verbose<2
                if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].distance))
                {
                    continue;
                }

                object value2;
                if (dictionary.TryGetValue(language + suggestion, out value2))
                {
                    //only a dictionaryItem carries a count; anything else is skipped instead of dereferencing null
                    dictionaryItem suggestionItem = value2 as dictionaryItem;
                    if (suggestionItem != null)
                    {
                        suggestItem si = new suggestItem();
                        si.term = suggestion;
                        si.count = suggestionItem.count;
                        si.distance = distance;
                        suggestions.Add(si);
                    }
                }
            }//end foreach
        }//end if

        //add edits
        //derive edits (deletes) from candidate (input) and add them to the candidates queue
        //this repeats until the maximum edit distance has been reached
        if (candidateDistance < editDistanceMax)
        {
            //save some time
            //do not create further deletes once suggestions at this distance or closer were found, if verbose<2
            if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance >= suggestions[0].distance))
            {
                continue;
            }

            for (int i = 0; i < candidate.Length; i++)
            {
                string delete = candidate.Remove(i, 1);
                if (seenDeletes.Add(delete))
                {
                    candidates.Enqueue(delete);
                }
            }
        }
    }//end while

    //sort by ascending edit distance, then by descending word frequency
    //(comparers return a CompareTo result directly instead of doing arithmetic on it)
    if (verbose < 2)
    {
        suggestions.Sort((x, y) => y.count.CompareTo(x.count));
    }
    else
    {
        suggestions.Sort((x, y) =>
        {
            int byDistance = x.distance.CompareTo(y.distance);
            return (byDistance != 0) ? byDistance : y.count.CompareTo(x.count);
        });
    }

    if ((verbose == 0) && (suggestions.Count > 1))
    {
        return suggestions.GetRange(0, 1);
    }
    return suggestions;
}
public static int maxlength = 0;//maximum dictionary term length

// Adds `count` occurrences of `key` for `language`; when the accumulated count crosses the
// threshold for the first time, appends the word to wordlist and stores all of its deletes.
// Dictionary values are Int32 and encode two cases:
//   >= 0 : a single delete, the value is an index into wordlist
//   <  0 : anything else (word / word + deletes / deletes), the value is -(index into itemlist) - 1
// Every delete entry points back to the term(s) it was created from.
// The dictionary may be updated at any time (word frequency and new words) by calling this method.
// Returns true only when the deletes were generated by this call.
private static bool CreateDictionaryEntry(string key, string language, Int64 count)
{
    //a threshold might be specified: a term that occurs this often in the corpus is considered a valid word for spelling correction
    int countThreshold = 1;
    Int64 countPrevious = 0;
    string entryKey = language + key;

    dictionaryItem value;
    Int32 valueo;
    if (dictionary.TryGetValue(entryKey, out valueo))
    {
        if (valueo >= 0)
        {
            //new word, but an identical single delete existed before: promote it to a dictionaryItem
            value = new dictionaryItem();
            value.suggestions.Add(valueo);
            value.suggestions.TrimExcess();
            itemlist.Add(value);
            dictionary[entryKey] = -itemlist.Count;
        }
        else
        {
            //existing item (e.g. the word appears several times)
            value = itemlist[-valueo - 1];
        }

        countPrevious = value.count;

        //summarize multiple frequency entries of a word with a saturating add;
        //Math.Min(Int64.MaxValue, value.count + count) did not help because the sum wraps before Min sees it
        value.count = (Int64.MaxValue - value.count > count) ? value.count + count : Int64.MaxValue;
    }
    else
    {
        //new word
        value = new dictionaryItem();
        value.count = count;
        itemlist.Add(value);
        dictionary[entryKey] = -itemlist.Count;
        if (key.Length > maxlength)
        {
            maxlength = key.Length;
        }
    }

    //edits/suggestions are created only once, no matter how often the word occurs:
    //on the call where the count reaches the threshold, even if the same key existed before as a delete of another word
    if ((value.count < countThreshold) || (countPrevious >= countThreshold))
    {
        return false;
    }

    //word2index
    wordlist.Add(key);
    Int32 keyint = wordlist.Count - 1;

    //create deletes
    foreach (string delete in Edits(key, 0, new HashSet<string>()))
    {
        string deleteKey = language + delete;
        Int32 value2;
        if (!dictionary.TryGetValue(deleteKey, out value2))
        {
            //first occurrence of this delete: store just the word index
            dictionary.Add(deleteKey, keyint);
            continue;
        }

        //already exists:
        //1. word1==deletes(word2)
        //2. deletes(word1)==deletes(word2)
        dictionaryItem di;
        if (value2 >= 0)
        {
            //a single delete existed before: transform the int into a dictionaryItem
            di = new dictionaryItem();
            di.suggestions.Add(value2);
            di.suggestions.TrimExcess();
            itemlist.Add(di);
            dictionary[deleteKey] = -itemlist.Count;
        }
        else
        {
            di = itemlist[-value2 - 1];
        }

        if (!di.suggestions.Contains(keyint))
        {
            AddLowestDistance(di, key, keyint, delete);
        }
    }

    return true;
}