/// <summary>
/// Finds spelling suggestions for <paramref name="input"/> by looking up the input and the
/// strings derived from it by deleting characters in <c>dictionary</c>.
/// </summary>
/// <param name="input">Term to look up.</param>
/// <param name="language">Prefix prepended to every dictionary key.</param>
/// <param name="editDistanceMax">Maximum edit distance a returned suggestion may have.</param>
/// <returns>
/// Suggestions sorted by ascending edit distance, then by descending count. When
/// <c>verbose</c> is below 2 only suggestions of the smallest distance found are kept; when
/// <c>verbose</c> is 0 at most one suggestion is returned.
/// </returns>
private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
{
    List<suggestItem> suggestions = new List<suggestItem>();

    // Save some time: the input is too long to be within reach of any dictionary term.
    if (input.Length - editDistanceMax > maxlength)
    {
        return suggestions;
    }

    // FIFO work list of the input and its deletes; a queue makes taking the head O(1).
    Queue<string> candidates = new Queue<string>();
    // Deletes already scheduled, so the same delete is never queued twice.
    HashSet<string> queuedDeletes = new HashSet<string>();
    // Terms already considered as suggestions: different deletes can lead to the same term.
    HashSet<string> seenTerms = new HashSet<string>();

    // Add original term.
    candidates.Enqueue(input);

    while (candidates.Count > 0)
    {
        string candidate = candidates.Dequeue();
        // Number of characters deleted from the input to obtain this candidate.
        int candidateDistance = input.Length - candidate.Length;

        // Early termination: the candidate is already farther away than the best
        // suggestion found, so no better suggestion can be expected.
        if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance > suggestions[0].distance))
        {
            break;
        }

        // Read candidate entry from dictionary.
        object valueo;
        if (dictionary.TryGetValue(language + candidate, out valueo))
        {
            // An entry is either a boxed Int32 (a single suggestion index) or a dictionaryItem.
            dictionaryItem value;
            if (valueo is Int32)
            {
                value = new dictionaryItem();
                value.suggestions.Add((Int32)valueo);
            }
            else
            {
                value = (dictionaryItem)valueo;
            }

            // If count > 0 the candidate is a correct dictionary term, not only a delete item.
            if ((value.count > 0) && seenTerms.Add(candidate))
            {
                // Drop previously collected suggestions of a higher distance so that
                // suggestions[0].distance always reflects the best distance found.
                if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > candidateDistance))
                {
                    suggestions.Clear();
                }

                suggestItem exact = new suggestItem();
                exact.term = candidate;
                exact.count = value.count;
                exact.distance = candidateDistance;
                suggestions.Add(exact);

                // Early termination: the input itself is a dictionary term.
                if ((verbose < 2) && (candidateDistance == 0))
                {
                    break;
                }
            }

            // Iterate through the suggestions (other correct dictionary terms) of the
            // delete item and add them to the suggestion list.
            foreach (int suggestionint in value.suggestions)
            {
                // index2word
                string suggestion = wordlist[suggestionint];

                // Skip duplicates early.
                if (!seenTerms.Add(suggestion))
                {
                    continue;
                }

                // True Damerau-Levenshtein edit distance. Deletes are allowed on both the
                // dictionary term and the input term at the same time; for inserts and
                // deletes the resulting distance may exceed editDistanceMax, so it has to
                // be calculated when there are edits on both sides.
                // Example for editDistanceMax=1: bank==bnak and bank==bink,
                // but bank!=kanb, bank!=xban and bank!=baxn.
                int distance = 0;
                if (suggestion != input)
                {
                    if (suggestion.Length == candidate.Length)
                    {
                        distance = candidateDistance;
                    }
                    else if (input.Length == candidate.Length)
                    {
                        distance = suggestion.Length - candidate.Length;
                    }
                    else
                    {
                        // Common prefixes and suffixes are ignored: this speeds up the
                        // distance calculation without changing its result.
                        int ii = 0;
                        int jj = 0;
                        while ((ii < suggestion.Length) && (ii < input.Length) && (suggestion[ii] == input[ii]))
                        {
                            ii++;
                        }
                        while ((jj < suggestion.Length - ii) && (jj < input.Length - ii)
                               && (suggestion[suggestion.Length - jj - 1] == input[input.Length - jj - 1]))
                        {
                            jj++;
                        }

                        if ((ii > 0) || (jj > 0))
                        {
                            distance = DamerauLevenshteinDistance(
                                suggestion.Substring(ii, suggestion.Length - ii - jj),
                                input.Substring(ii, input.Length - ii - jj));
                        }
                        else
                        {
                            distance = DamerauLevenshteinDistance(suggestion, input);
                        }
                    }
                }

                if ((verbose < 2) && (suggestions.Count > 0))
                {
                    if (suggestions[0].distance > distance)
                    {
                        // Remove all existing suggestions of higher distance.
                        suggestions.Clear();
                    }
                    else if (distance > suggestions[0].distance)
                    {
                        // Do not process higher distances than those already found.
                        continue;
                    }
                }

                if (distance > editDistanceMax)
                {
                    continue;
                }

                object value2;
                if (dictionary.TryGetValue(language + suggestion, out value2))
                {
                    // Entries stored as a boxed Int32 carry no count; skip them instead of
                    // dereferencing a failed cast.
                    dictionaryItem entry = value2 as dictionaryItem;
                    if (entry != null)
                    {
                        suggestItem si = new suggestItem();
                        si.term = suggestion;
                        si.count = entry.count;
                        si.distance = distance;
                        suggestions.Add(si);
                    }
                }
            }
        }

        // Derive further deletes from the candidate and queue them, until the maximum
        // edit distance has been reached.
        if (candidateDistance < editDistanceMax)
        {
            // Save some time: deletes of this candidate could only yield suggestions that
            // are no better than those already found.
            if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance >= suggestions[0].distance))
            {
                continue;
            }

            for (int i = 0; i < candidate.Length; i++)
            {
                string delete = candidate.Remove(i, 1);
                if (queuedDeletes.Add(delete))
                {
                    candidates.Enqueue(delete);
                }
            }
        }
    }

    // Sort by ascending edit distance, then by descending word frequency. Only the sign
    // of CompareTo is used.
    suggestions.Sort((x, y) =>
    {
        int byDistance = x.distance.CompareTo(y.distance);
        return (byDistance != 0) ? byDistance : y.count.CompareTo(x.count);
    });

    if ((verbose == 0) && (suggestions.Count > 1))
    {
        return suggestions.GetRange(0, 1);
    }
    return suggestions;
}
/// <summary>
/// Finds spelling suggestions for <paramref name="input"/> by looking up the input and the
/// edits produced by <c>Edits</c> in <c>dictionary</c>.
/// </summary>
/// <param name="input">Term to look up.</param>
/// <param name="language">Prefix prepended to every dictionary key.</param>
/// <param name="editDistanceMax">Maximum edit distance a returned suggestion may have.</param>
/// <returns>
/// Suggestions ordered by ascending edit distance, then by descending count. When
/// <c>verbose</c> is below 2 only suggestions of the smallest distance found are kept; when
/// <c>verbose</c> is 0 at most one suggestion is returned.
/// </returns>
private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
{
    // FIFO work list of the input and its edits; a queue makes taking the head O(1).
    Queue<editItem> candidates = new Queue<editItem>();

    // Add original term.
    editItem original = new editItem();
    original.term = input;
    original.distance = 0;
    candidates.Enqueue(original);

    List<suggestItem> suggestions = new List<suggestItem>();

    while (candidates.Count > 0)
    {
        editItem candidate = candidates.Dequeue();

        // Early termination: the candidate is already farther away than the best
        // suggestion found, so no better suggestion can be expected.
        if ((verbose < 2) && (suggestions.Count > 0) && (candidate.distance > suggestions[0].distance))
        {
            break;
        }
        if (candidate.distance > editDistanceMax)
        {
            break;
        }

        dictionaryItem value;
        if (dictionary.TryGetValue(language + candidate.term, out value))
        {
            // A non-empty term marks a correct dictionary term.
            if (!string.IsNullOrEmpty(value.term))
            {
                // Drop previously collected suggestions of a higher distance so that
                // suggestions[0].distance always reflects the best distance found.
                if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > candidate.distance))
                {
                    suggestions.Clear();
                }

                suggestItem exact = new suggestItem();
                exact.term = value.term;
                exact.count = value.count;
                exact.distance = candidate.distance;

                if (!suggestions.Contains(exact))
                {
                    suggestions.Add(exact);

                    // Early termination: the input itself is a dictionary term.
                    if ((verbose < 2) && (candidate.distance == 0))
                    {
                        break;
                    }
                }
            }

            // Edit term: follow its suggestions to correct terms.
            foreach (editItem suggestion in value.suggestions)
            {
                // Skip duplicates early.
                if (suggestions.Exists(x => x.term == suggestion.term))
                {
                    continue;
                }

                int distance = TrueDistance(suggestion, candidate, input);

                if ((verbose < 2) && (suggestions.Count > 0))
                {
                    if (suggestions[0].distance > distance)
                    {
                        // Remove all existing suggestions of higher distance.
                        suggestions.Clear();
                    }
                    else if (distance > suggestions[0].distance)
                    {
                        // Do not process higher distances than those already found.
                        continue;
                    }
                }

                if (distance > editDistanceMax)
                {
                    continue;
                }

                dictionaryItem value2;
                if (dictionary.TryGetValue(language + suggestion.term, out value2))
                {
                    suggestItem si = new suggestItem();
                    si.term = value2.term;
                    si.count = value2.count;
                    si.distance = distance;
                    suggestions.Add(si);
                }
            }
        }

        // Add edits of the candidate until the maximum edit distance has been reached.
        if (candidate.distance < editDistanceMax)
        {
            foreach (editItem delete in Edits(candidate.term, candidate.distance, false))
            {
                // Only edits that are not currently waiting in the queue are added.
                if (!candidates.Contains(delete))
                {
                    candidates.Enqueue(delete);
                }
            }
        }
    }

    // Sort by ascending edit distance, then by descending word frequency.
    suggestions = suggestions.OrderBy(c => c.distance).ThenByDescending(c => c.count).ToList();

    if ((verbose == 0) && (suggestions.Count > 1))
    {
        return suggestions.GetRange(0, 1);
    }
    return suggestions;
}
/// <summary>
/// Finds spelling suggestions for <paramref name="input"/> by looking up the input and the
/// strings derived from it by deleting characters in <c>dictionary</c>.
/// </summary>
/// <param name="input">Term to look up.</param>
/// <param name="language">Prefix prepended to every dictionary key.</param>
/// <param name="editDistanceMax">Maximum edit distance a returned suggestion may have.</param>
/// <returns>
/// Suggestions sorted by ascending edit distance, then by descending count. When
/// <c>verbose</c> is below 2 only suggestions of the smallest distance found are kept; when
/// <c>verbose</c> is 0 at most one suggestion is returned.
/// </returns>
private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
{
    List<suggestItem> suggestions = new List<suggestItem>();

    // Save some time: the input is too long to be within reach of any dictionary term.
    if (input.Length - editDistanceMax > maxlength)
    {
        return suggestions;
    }

    // FIFO work list of the input and its deletes; a queue makes taking the head O(1).
    Queue<string> candidates = new Queue<string>();
    // Deletes already scheduled, so the same delete is never queued twice.
    HashSet<string> queuedDeletes = new HashSet<string>();
    // Terms already considered as suggestions: different deletes can lead to the same term.
    HashSet<string> seenTerms = new HashSet<string>();

    // Add original term.
    candidates.Enqueue(input);

    while (candidates.Count > 0)
    {
        string candidate = candidates.Dequeue();
        // Number of characters deleted from the input to obtain this candidate.
        int candidateDistance = input.Length - candidate.Length;

        // Early termination: the candidate is already farther away than the best
        // suggestion found, so no better suggestion can be expected.
        if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance > suggestions[0].distance))
        {
            break;
        }

        // Read candidate entry from dictionary.
        object valueo;
        if (dictionary.TryGetValue(language + candidate, out valueo))
        {
            // An entry is either a boxed Int32 (a single suggestion index) or a dictionaryItem.
            dictionaryItem value;
            if (valueo is Int32)
            {
                value = new dictionaryItem();
                value.suggestions.Add((Int32)valueo);
            }
            else
            {
                value = (dictionaryItem)valueo;
            }

            // If count > 0 the candidate is a correct dictionary term, not only a delete item.
            if ((value.count > 0) && seenTerms.Add(candidate))
            {
                // Drop previously collected suggestions of a higher distance so that
                // suggestions[0].distance always reflects the best distance found.
                if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > candidateDistance))
                {
                    suggestions.Clear();
                }

                suggestItem exact = new suggestItem();
                exact.term = candidate;
                exact.count = value.count;
                exact.distance = candidateDistance;
                suggestions.Add(exact);

                // Early termination: the input itself is a dictionary term.
                if ((verbose < 2) && (candidateDistance == 0))
                {
                    break;
                }
            }

            // Iterate through the suggestions (other correct dictionary terms) of the
            // delete item and add them to the suggestion list.
            foreach (int suggestionint in value.suggestions)
            {
                // index2word
                string suggestion = wordlist[suggestionint];

                // Skip duplicates early.
                if (!seenTerms.Add(suggestion))
                {
                    continue;
                }

                // True Damerau-Levenshtein edit distance. Deletes are allowed on both the
                // dictionary term and the input term at the same time; for inserts and
                // deletes the resulting distance may exceed editDistanceMax, so it has to
                // be calculated when there are edits on both sides.
                // Example for editDistanceMax=1: bank==bnak and bank==bink,
                // but bank!=kanb, bank!=xban and bank!=baxn.
                int distance = 0;
                if (suggestion != input)
                {
                    if (suggestion.Length == candidate.Length)
                    {
                        distance = candidateDistance;
                    }
                    else if (input.Length == candidate.Length)
                    {
                        distance = suggestion.Length - candidate.Length;
                    }
                    else
                    {
                        // Common prefixes and suffixes are ignored: this speeds up the
                        // distance calculation without changing its result.
                        int ii = 0;
                        int jj = 0;
                        while ((ii < suggestion.Length) && (ii < input.Length) && (suggestion[ii] == input[ii]))
                        {
                            ii++;
                        }
                        while ((jj < suggestion.Length - ii) && (jj < input.Length - ii)
                               && (suggestion[suggestion.Length - jj - 1] == input[input.Length - jj - 1]))
                        {
                            jj++;
                        }

                        if ((ii > 0) || (jj > 0))
                        {
                            distance = DamerauLevenshteinDistance(
                                suggestion.Substring(ii, suggestion.Length - ii - jj),
                                input.Substring(ii, input.Length - ii - jj));
                        }
                        else
                        {
                            distance = DamerauLevenshteinDistance(suggestion, input);
                        }
                    }
                }

                if ((verbose < 2) && (suggestions.Count > 0))
                {
                    if (suggestions[0].distance > distance)
                    {
                        // Remove all existing suggestions of higher distance.
                        suggestions.Clear();
                    }
                    else if (distance > suggestions[0].distance)
                    {
                        // Do not process higher distances than those already found.
                        continue;
                    }
                }

                if (distance > editDistanceMax)
                {
                    continue;
                }

                object value2;
                if (dictionary.TryGetValue(language + suggestion, out value2))
                {
                    // Entries stored as a boxed Int32 carry no count; skip them instead of
                    // dereferencing a failed cast.
                    dictionaryItem entry = value2 as dictionaryItem;
                    if (entry != null)
                    {
                        suggestItem si = new suggestItem();
                        si.term = suggestion;
                        si.count = entry.count;
                        si.distance = distance;
                        suggestions.Add(si);
                    }
                }
            }
        }

        // Derive further deletes from the candidate and queue them, until the maximum
        // edit distance has been reached.
        if (candidateDistance < editDistanceMax)
        {
            // Save some time: deletes of this candidate could only yield suggestions that
            // are no better than those already found.
            if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance >= suggestions[0].distance))
            {
                continue;
            }

            for (int i = 0; i < candidate.Length; i++)
            {
                string delete = candidate.Remove(i, 1);
                if (queuedDeletes.Add(delete))
                {
                    candidates.Enqueue(delete);
                }
            }
        }
    }

    // Sort by ascending edit distance, then by descending word frequency. Only the sign
    // of CompareTo is used.
    suggestions.Sort((x, y) =>
    {
        int byDistance = x.distance.CompareTo(y.distance);
        return (byDistance != 0) ? byDistance : y.count.CompareTo(x.count);
    });

    if ((verbose == 0) && (suggestions.Count > 1))
    {
        return suggestions.GetRange(0, 1);
    }
    return suggestions;
}
/// <summary>
/// Finds spelling suggestions for <paramref name="input"/> by looking up the input and the
/// edits produced by <c>Edits</c> in <c>dictionary</c>.
/// </summary>
/// <param name="input">Term to look up.</param>
/// <param name="language">Prefix prepended to every dictionary key.</param>
/// <param name="editDistanceMax">Maximum edit distance a returned suggestion may have.</param>
/// <returns>
/// Suggestions ordered by ascending edit distance, then by descending count. When
/// <c>verbose</c> is below 2 only suggestions of the smallest distance found are kept; when
/// <c>verbose</c> is 0 at most one suggestion is returned.
/// </returns>
private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
{
    // FIFO work list of the input and its edits; a queue makes taking the head O(1).
    Queue<editItem> candidates = new Queue<editItem>();

    // Add original term.
    editItem original = new editItem();
    original.term = input;
    original.distance = 0;
    candidates.Enqueue(original);

    List<suggestItem> suggestions = new List<suggestItem>();

    while (candidates.Count > 0)
    {
        editItem candidate = candidates.Dequeue();

        // Early termination: the candidate is already farther away than the best
        // suggestion found, so no better suggestion can be expected.
        if ((verbose < 2) && (suggestions.Count > 0) && (candidate.distance > suggestions[0].distance))
        {
            break;
        }
        if (candidate.distance > editDistanceMax)
        {
            break;
        }

        dictionaryItem value;
        if (dictionary.TryGetValue(language + candidate.term, out value))
        {
            // A non-empty term marks a correct dictionary term.
            if (!string.IsNullOrEmpty(value.term))
            {
                // Drop previously collected suggestions of a higher distance so that
                // suggestions[0].distance always reflects the best distance found.
                if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > candidate.distance))
                {
                    suggestions.Clear();
                }

                suggestItem exact = new suggestItem();
                exact.term = value.term;
                exact.count = value.count;
                exact.distance = candidate.distance;

                if (!suggestions.Contains(exact))
                {
                    suggestions.Add(exact);

                    // Early termination: the input itself is a dictionary term.
                    if ((verbose < 2) && (candidate.distance == 0))
                    {
                        break;
                    }
                }
            }

            // Edit term: follow its suggestions to correct terms.
            foreach (editItem suggestion in value.suggestions)
            {
                // Skip duplicates early.
                if (suggestions.Exists(x => x.term == suggestion.term))
                {
                    continue;
                }

                int distance = TrueDistance(suggestion, candidate, input);

                if ((verbose < 2) && (suggestions.Count > 0))
                {
                    if (suggestions[0].distance > distance)
                    {
                        // Remove all existing suggestions of higher distance.
                        suggestions.Clear();
                    }
                    else if (distance > suggestions[0].distance)
                    {
                        // Do not process higher distances than those already found.
                        continue;
                    }
                }

                if (distance > editDistanceMax)
                {
                    continue;
                }

                dictionaryItem value2;
                if (dictionary.TryGetValue(language + suggestion.term, out value2))
                {
                    suggestItem si = new suggestItem();
                    si.term = value2.term;
                    si.count = value2.count;
                    si.distance = distance;
                    suggestions.Add(si);
                }
            }
        }

        // Add edits of the candidate until the maximum edit distance has been reached.
        if (candidate.distance < editDistanceMax)
        {
            foreach (editItem delete in Edits(candidate.term, candidate.distance, false))
            {
                // Only edits that are not currently waiting in the queue are added.
                if (!candidates.Contains(delete))
                {
                    candidates.Enqueue(delete);
                }
            }
        }
    }

    // Sort by ascending edit distance, then by descending word frequency.
    suggestions = suggestions.OrderBy(c => c.distance).ThenByDescending(c => c.count).ToList();

    if ((verbose == 0) && (suggestions.Count > 1))
    {
        return suggestions.GetRange(0, 1);
    }
    return suggestions;
}
/// <summary>
/// Corrects a multi-word input string term by term. Each term is replaced by its best
/// <c>Lookup</c> suggestion, merged with the preceding term, split into two suggested
/// words, or kept unchanged.
/// </summary>
/// <param name="input">Input string; it is split into terms with <c>parseWords</c>.</param>
/// <param name="language">Language value passed through to <c>Lookup</c>.</param>
/// <param name="editDistanceMax">Maximum edit distance passed through to <c>Lookup</c>.</param>
/// <returns>
/// A list holding exactly one suggestion: its term is the chosen parts joined by single
/// spaces, its count is the smallest count among the parts (<c>Int64.MaxValue</c> when
/// there are no parts), and its distance is the Damerau-Levenshtein distance between
/// that term and <paramref name="input"/>.
/// </returns>
public static List<suggestItem> LookupCompound(string input, string language, int editDistanceMax)
{
    // Parse input string into single terms.
    string[] termList1 = parseWords(input).ToArray();

    // One entry per output word (or merged/split word group) of the corrected line.
    List<suggestItem> suggestionParts = new List<suggestItem>();

    // Translate every term to its best suggestion, otherwise it remains unchanged.
    bool lastCombi = false;
    for (int i = 0; i < termList1.Length; i++)
    {
        string term = termList1[i];

        // Suggestions for the single term.
        List<suggestItem> suggestions = Lookup(term, language, editDistanceMax);

        // Combi check, always before split: try the previous and the current term
        // written as one word. Skipped directly after a merge.
        if ((i > 0) && !lastCombi)
        {
            List<suggestItem> suggestionsCombi = Lookup(termList1[i - 1] + term, language, editDistanceMax);
            if (suggestionsCombi.Count > 0)
            {
                suggestItem best1 = suggestionParts[suggestionParts.Count - 1];
                suggestItem best2;
                if (suggestions.Count > 0)
                {
                    best2 = suggestions[0];
                }
                else
                {
                    // No suggestion: the unchanged term stands in for the comparison.
                    best2 = new suggestItem();
                    best2.term = term;
                    best2.distance = editDistanceMax + 1;
                    best2.count = 0;
                }

                int separateDistance = DamerauLevenshteinDistance(
                    termList1[i - 1] + " " + term,
                    best1.term + " " + best2.term);

                // The merged word wins when it is cheaper even after adding 1 to its
                // distance (the removed space).
                if (suggestionsCombi[0].distance + 1 < separateDistance)
                {
                    suggestionsCombi[0].distance++;
                    suggestionParts[suggestionParts.Count - 1] = suggestionsCombi[0];
                    lastCombi = true;
                    continue;
                }
            }
        }
        lastCombi = false;

        // Always split terms without suggestion / never split terms with a suggestion
        // of edit distance 0 / never split single char terms.
        if ((suggestions.Count > 0) && ((suggestions[0].distance == 0) || (term.Length == 1)))
        {
            // Choose best suggestion.
            suggestionParts.Add(suggestions[0]);
            continue;
        }

        if (term.Length > 1)
        {
            // No perfect suggestion: split the word into pairs.
            List<suggestItem> suggestionsSplit = new List<suggestItem>();

            // The best single-word suggestion competes with the split candidates.
            if (suggestions.Count > 0)
            {
                suggestionsSplit.Add(suggestions[0]);
            }

            for (int j = 1; j < term.Length; j++)
            {
                string part1 = term.Substring(0, j);
                string part2 = term.Substring(j);

                List<suggestItem> suggestions1 = Lookup(part1, language, editDistanceMax);
                if (suggestions1.Count == 0)
                {
                    continue;
                }
                // Stop splitting if the first part's correction equals the single-word correction.
                if ((suggestions.Count > 0) && (suggestions[0].term == suggestions1[0].term))
                {
                    break;
                }

                List<suggestItem> suggestions2 = Lookup(part2, language, editDistanceMax);
                if (suggestions2.Count == 0)
                {
                    continue;
                }
                // Stop splitting if the second part's correction equals the single-word correction.
                if ((suggestions.Count > 0) && (suggestions[0].term == suggestions2[0].term))
                {
                    break;
                }

                // Build the suggestion for this split pair.
                suggestItem suggestionSplit = new suggestItem();
                suggestionSplit.term = suggestions1[0].term + " " + suggestions2[0].term;
                suggestionSplit.distance = DamerauLevenshteinDistance(term, suggestionSplit.term);
                suggestionSplit.count = Math.Min(suggestions1[0].count, suggestions2[0].count);
                suggestionsSplit.Add(suggestionSplit);

                // Early termination of split.
                if (suggestionSplit.distance == 1)
                {
                    break;
                }
            }

            if (suggestionsSplit.Count > 0)
            {
                // Select the best candidate: ascending distance, then descending count.
                // Only the sign of CompareTo is used.
                suggestionsSplit.Sort((x, y) =>
                {
                    int byDistance = x.distance.CompareTo(y.distance);
                    return (byDistance != 0) ? byDistance : y.count.CompareTo(x.count);
                });
                suggestionParts.Add(suggestionsSplit[0]);
                continue;
            }
        }

        // Nothing found: keep the term unchanged, marked with a distance beyond the maximum.
        suggestItem unchanged = new suggestItem();
        unchanged.term = term;
        unchanged.count = 0;
        unchanged.distance = editDistanceMax + 1;
        suggestionParts.Add(unchanged);
    }

    // Combine all parts into a single suggestion for the whole line.
    suggestItem suggestion = new suggestItem();
    suggestion.count = Int64.MaxValue;
    List<string> terms = new List<string>(suggestionParts.Count);
    foreach (suggestItem si in suggestionParts)
    {
        terms.Add(si.term);
        suggestion.count = Math.Min(suggestion.count, si.count);
    }
    suggestion.term = string.Join(" ", terms.ToArray()).TrimEnd();
    suggestion.distance = DamerauLevenshteinDistance(suggestion.term, input);

    List<suggestItem> suggestionsLine = new List<suggestItem>();
    suggestionsLine.Add(suggestion);
    return suggestionsLine;
}