// Computes the real edit distance between a dictionary term and the input term
// once they have been matched through a shared delete.
//
// Deletes are allowed on both the dictionary term and the input term at the same
// time, each up to editDistanceMax. For replaces and adjacent transposes the
// resulting distance stays <= editDistanceMax, but for inserts and deletes it may
// exceed it, so the distance has to be recomputed when both sides were edited.
// Example for editDistanceMax=1: bank==bnak and bank==bink, but bank!=kanb,
// bank!=xban and bank!=baxn. One delete on each side makes every pair equal, yet
// the first two pairs have edit distance 1 and the others edit distance 2.
//
// dictionaryOriginal: dictionary term plus the number of deletes applied to it.
// inputDelete:        delete variant of the input plus the number of deletes applied.
// inputOriginal:      the unmodified input term.
private static int TrueDistance(editItem dictionaryOriginal, editItem inputDelete, string inputOriginal)
{
    // Identical terms: nothing to correct.
    if (dictionaryOriginal.term == inputOriginal)
    {
        return 0;
    }

    // Only the input side was edited.
    if (dictionaryOriginal.distance == 0)
    {
        return inputDelete.distance;
    }

    // Only the dictionary side was edited.
    if (inputDelete.distance == 0)
    {
        return dictionaryOriginal.distance;
    }

    // Both sides were edited: the two delete counts cannot simply be combined,
    // so measure the distance between the two original terms directly.
    return DamerauLevenshteinDistance(dictionaryOriginal.term, inputOriginal);
}
// Generates the delete variants of a word.
// Inexpensive and language independent: only deletes are produced, no transposes,
// replaces or inserts (those are expensive and language dependent; Chinese has
// 70,000 Unicode Han characters).
//
// word:         term to delete single characters from.
// editDistance: number of deletes already applied to `word`; every item produced
//               directly from `word` carries editDistance + 1.
// recursion:    when true, keep deleting from each new variant while its distance
//               is still below the editDistanceMax field.
// Returns the variants without duplicates (as decided by List.Contains); the list
// is empty for words of length 0 or 1.
private static List<editItem> Edits(string word, int editDistance, bool recursion)
{
    List<editItem> deletes = new List<editItem>();
    if (word.Length <= 1)
    {
        return deletes;
    }

    int nextDistance = editDistance + 1;
    // Recursion only continues while the maximum edit distance is not yet reached.
    bool descend = recursion && (nextDistance < editDistanceMax);

    for (int position = 0; position < word.Length; position++)
    {
        editItem delete = new editItem();
        delete.term = word.Remove(position, 1);
        delete.distance = nextDistance;

        if (deletes.Contains(delete))
        {
            continue;
        }
        deletes.Add(delete);

        if (!descend)
        {
            continue;
        }

        foreach (editItem deeper in Edits(delete.term, nextDistance, recursion))
        {
            if (!deletes.Contains(deeper))
            {
                deletes.Add(deeper);
            }
        }
    }

    return deletes;
}
// Registers one occurrence of a word in the dictionary.
// For every word, all deletes with an edit distance of 1..editDistanceMax are
// created and added to the dictionary. Every delete entry has a suggestions list
// that points to the original term(s) it was created from.
// The dictionary may be updated dynamically (word frequency and new words) at any
// time by calling CreateDictionaryEntry.
//
// key:      the word as it occurred in the corpus.
// language: prefix prepended to every dictionary key.
// Returns true when the entry had no term yet, i.e. the deletes for the word were
// generated by this call; false otherwise.
private static bool CreateDictionaryEntry(string key, string language)
{
    string entryKey = language + key;

    // An entry may already exist because
    //   1. the word appeared before, or
    //   2. the word equals a delete of another word.
    dictionaryItem entry;
    bool known = dictionary.TryGetValue(entryKey, out entry);
    if (!known)
    {
        entry = new dictionaryItem();
    }
    entry.count++;
    if (!known)
    {
        dictionary.Add(entryKey, entry);
    }

    // Edits/suggestions are created only once, no matter how often the word occurs,
    // and only as soon as the word itself occurs in the corpus - even if the same
    // term already existed in the dictionary as an edit of another word.
    if (!string.IsNullOrEmpty(entry.term))
    {
        return false;
    }
    entry.term = key;

    // Create the deletes and link each of them back to this word.
    foreach (editItem delete in Edits(key, 0, true))
    {
        editItem suggestion = new editItem();
        suggestion.term = key;
        suggestion.distance = delete.distance;

        string deleteKey = language + delete.term;
        dictionaryItem deleteEntry;
        if (!dictionary.TryGetValue(deleteKey, out deleteEntry))
        {
            deleteEntry = new dictionaryItem();
            deleteEntry.suggestions.Add(suggestion);
            dictionary.Add(deleteKey, deleteEntry);
            continue;
        }

        // The delete already exists because
        //   1. it is itself a word, or
        //   2. it is also a delete of another word.
        if (!deleteEntry.suggestions.Contains(suggestion))
        {
            AddLowestDistance(deleteEntry.suggestions, suggestion);
        }
    }

    return true;
}
// Adds a suggestion to a suggestions list, saving time and space by keeping only
// the lowest-distance suggestions when verbose < 2.
//
// suggestions: list to update; its first element is used as the reference distance.
// suggestion:  item to add.
private static void AddLowestDistance(List<editItem> suggestions, editItem suggestion)
{
    // If verbose < 2, drop all existing suggestions when they have a higher distance.
    bool existingAreWorse = (verbose < 2)
        && (suggestions.Count > 0)
        && (suggestions[0].distance > suggestion.distance);
    if (existingAreWorse)
    {
        suggestions.Clear();
    }

    // Do not add a suggestion of higher distance than the existing ones,
    // unless verbose == 2 (or the list is empty).
    bool accept = (verbose == 2)
        || (suggestions.Count == 0)
        || (suggestions[0].distance >= suggestion.distance);
    if (accept)
    {
        suggestions.Add(suggestion);
    }
}
// Click handler: opens an edit dialog for every selected item of `contents`,
// pre-filled with the item's name, file type, UPX and null-string sub-item texts,
// and writes the dialog's values back into the item when it is confirmed with OK.
//
// sender, ea: standard event arguments; not used.
private void mEditOnClick(object sender, EventArgs ea)
{
    int nameColumn = (int)ListViewOrder.Name;
    int fileTypeColumn = (int)ListViewOrder.FileType;
    int upxColumn = (int)ListViewOrder.Upx;
    int nullStringColumn = (int)ListViewOrder.NullString;

    // Edit the selected items one after the other.
    foreach (ListViewFileItem lvi in contents.SelectedItems)
    {
        // A dialog shown with ShowDialog() is not disposed when it closes, so it
        // has to be disposed explicitly; otherwise one dialog leaks per edited item.
        // NOTE(review): assumes editItem is a Form/CommonDialog (it exposes
        // ShowDialog returning DialogResult) and therefore IDisposable - confirm.
        using (editItem ei = new editItem(
            lvi.SubItems[nameColumn].Text,
            lvi.SubItems[fileTypeColumn].Text,
            lvi.SubItems[upxColumn].Text,
            lvi.SubItems[nullStringColumn].Text))
        {
            if (ei.ShowDialog() == DialogResult.OK)
            {
                lvi.SubItems[nameColumn].Text = ei.FileName;
                lvi.SubItems[fileTypeColumn].Text = ei.Filetype;
                lvi.SubItems[upxColumn].Text = ei.Upx;
                lvi.SubItems[nullStringColumn].Text = ei.StringNull;
            }
        }
    }
}
// Computes the real edit distance between a dictionary term and the input term
// once they have been matched through a shared delete.
//
// Deletes are allowed on both the dictionary term and the input term at the same
// time, each up to editDistanceMax. For replaces and adjacent transposes the
// resulting distance stays <= editDistanceMax, but for inserts and deletes it may
// exceed it. To prevent suggestions of a higher edit distance, the distance is
// recomputed whenever both sides were edited.
// Example for editDistanceMax=1: bank==bnak and bank==bink, but bank!=kanb,
// bank!=xban and bank!=baxn. One delete on each side makes every pair equal, yet
// the first two pairs have edit distance 1 and the others edit distance 2.
private static int TrueDistance(editItem dictionaryOriginal, editItem inputDelete, string inputOriginal)
{
    return dictionaryOriginal.term == inputOriginal
        ? 0                                     // same term
        : dictionaryOriginal.distance == 0
            ? inputDelete.distance              // only the input was edited
            : inputDelete.distance == 0
                ? dictionaryOriginal.distance   // only the dictionary term was edited
                // both distances > 0: measure the two original terms directly
                : DamerauLevenshteinDistance(dictionaryOriginal.term, inputOriginal);
}
// Looks up spelling suggestions for an input term.
//
// input:           term to find suggestions for.
// language:        prefix prepended to every dictionary key.
// editDistanceMax: maximum edit distance a suggestion may have (shadows the field
//                  of the same name inside this method).
// Returns the suggestions ordered by ascending distance, then descending count.
// When verbose == 0 and more than one suggestion was found, only the first one
// is returned.
private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
{
    // Work list of terms still to check, seeded with the original term (distance 0).
    List<editItem> pending = new List<editItem>();
    pending.Add(new editItem { term = input, distance = 0 });

    List<suggestItem> suggestions = new List<suggestItem>();

    while (pending.Count > 0)
    {
        editItem candidate = pending[0];
        pending.RemoveAt(0);

        // Early termination to save time: a suggestion's distance lies between
        // candidate.distance and candidate.distance + editDistanceMax, so once the
        // candidate distance exceeds the distance of the suggestions already found,
        // no better suggestions are to be expected.
        bool noBetterExpected = (verbose < 2)
            && (suggestions.Count > 0)
            && (candidate.distance > suggestions[0].distance);
        if (noBetterExpected || (candidate.distance > editDistanceMax))
        {
            break;
        }

        dictionaryItem entry;
        if (dictionary.TryGetValue(language + candidate.term, out entry))
        {
            // The candidate itself is a correct term.
            if (!string.IsNullOrEmpty(entry.term))
            {
                suggestItem exact = new suggestItem
                {
                    term = entry.term,
                    count = entry.count,
                    distance = candidate.distance
                };

                if (!suggestions.Contains(exact))
                {
                    suggestions.Add(exact);

                    // Early termination: the unmodified input is a known term.
                    if ((verbose < 2) && (candidate.distance == 0))
                    {
                        break;
                    }
                }
            }

            // The candidate is an edit term that points to correct terms.
            foreach (editItem suggestion in entry.suggestions)
            {
                // Save some time by skipping terms that were already suggested.
                if (suggestions.Exists(known => known.term == suggestion.term))
                {
                    continue;
                }

                int distance = TrueDistance(suggestion, candidate, input);

                if ((verbose < 2) && (suggestions.Count > 0))
                {
                    if (suggestions[0].distance > distance)
                    {
                        // Everything found so far has a higher distance: discard it.
                        suggestions.Clear();
                    }
                    else if (distance > suggestions[0].distance)
                    {
                        // Do not process distances higher than those already found.
                        continue;
                    }
                }

                if (distance > editDistanceMax)
                {
                    continue;
                }

                dictionaryItem original;
                if (dictionary.TryGetValue(language + suggestion.term, out original))
                {
                    suggestions.Add(new suggestItem
                    {
                        term = original.term,
                        count = original.count,
                        distance = distance
                    });
                }
            }
        }

        // Queue the candidate's own deletes while the maximum distance allows it.
        if (candidate.distance < editDistanceMax)
        {
            foreach (editItem delete in Edits(candidate.term, candidate.distance, false))
            {
                if (!pending.Contains(delete))
                {
                    pending.Add(delete);
                }
            }
        }
    }

    suggestions = suggestions
        .OrderBy(c => c.distance)
        .ThenByDescending(c => c.count)
        .ToList();

    return ((verbose == 0) && (suggestions.Count > 1))
        ? suggestions.GetRange(0, 1)
        : suggestions;
}
// Builds the list of delete variants of a word.
// Inexpensive and language independent: only deletes, no transposes, replaces or
// inserts. Replaces and inserts are expensive and language dependent (Chinese has
// 70,000 Unicode Han characters).
//
// word:         term from which single characters are removed.
// editDistance: deletes already applied to `word`; direct variants get editDistance + 1.
// recursion:    when true, variants are expanded further while their distance is
//               below the editDistanceMax field.
// Returns an empty list for words shorter than two characters.
private static List<editItem> Edits(string word, int editDistance, bool recursion)
{
    List<editItem> deletes = new List<editItem>();
    if (word.Length <= 1)
    {
        return deletes;
    }

    int variantDistance = editDistance + 1;
    // Recurse only if the maximum edit distance has not been reached yet.
    bool expandFurther = recursion && (variantDistance < editDistanceMax);

    for (int index = 0; index < word.Length; index++)
    {
        editItem variant = new editItem();
        variant.term = word.Remove(index, 1);
        variant.distance = variantDistance;

        if (deletes.Contains(variant))
        {
            continue;
        }
        deletes.Add(variant);

        if (!expandFurther)
        {
            continue;
        }

        foreach (editItem nested in Edits(variant.term, variantDistance, recursion))
        {
            if (!deletes.Contains(nested))
            {
                deletes.Add(nested);
            }
        }
    }

    return deletes;
}
// Adds one occurrence of a word to the dictionary.
// For every word, all deletes with an edit distance of 1..editDistanceMax are
// created and added to the dictionary; every delete entry has a suggestions list
// pointing to the original term(s) it was created from.
// The dictionary may be updated dynamically (word frequency and new words) at any
// time by calling CreateDictionaryEntry.
//
// key:      the word as found in the corpus.
// language: prefix prepended to every dictionary key.
// Returns true if this call assigned the entry's term and generated its deletes,
// false if the entry already had a term.
private static bool CreateDictionaryEntry(string key, string language)
{
    string wordKey = language + key;

    // The entry can already exist when
    //   1. the word appears several times, or
    //   2. the word equals a delete of another word.
    dictionaryItem wordEntry;
    bool alreadyPresent = dictionary.TryGetValue(wordKey, out wordEntry);
    if (!alreadyPresent)
    {
        wordEntry = new dictionaryItem();
    }
    wordEntry.count++;
    if (!alreadyPresent)
    {
        dictionary.Add(wordKey, wordEntry);
    }

    // Edits/suggestions are created only once, no matter how often the word occurs,
    // and only once the word itself occurs in the corpus - even if the same term
    // existed before in the dictionary as an edit of another word.
    if (!string.IsNullOrEmpty(wordEntry.term))
    {
        return false;
    }
    wordEntry.term = key;

    // Create the deletes and register this word as their suggestion.
    foreach (editItem delete in Edits(key, 0, true))
    {
        editItem suggestion = new editItem();
        suggestion.term = key;
        suggestion.distance = delete.distance;

        string deleteKey = language + delete.term;
        dictionaryItem deleteEntry;
        if (!dictionary.TryGetValue(deleteKey, out deleteEntry))
        {
            deleteEntry = new dictionaryItem();
            deleteEntry.suggestions.Add(suggestion);
            dictionary.Add(deleteKey, deleteEntry);
            continue;
        }

        // Already exists: the delete is itself a word, or it is shared with the
        // deletes of another word.
        if (!deleteEntry.suggestions.Contains(suggestion))
        {
            AddLowestDistance(deleteEntry.suggestions, suggestion);
        }
    }

    return true;
}
// Saves some time and space: adds a suggestion while keeping only suggestions of
// the lowest distance when verbose < 2.
//
// suggestions: list to update; element 0 supplies the distance to compare against.
// suggestion:  item to add.
private static void AddLowestDistance(List<editItem> suggestions, editItem suggestion)
{
    // Remove all existing suggestions of higher distance, if verbose < 2.
    bool dropExisting = (verbose < 2)
        && (suggestions.Count > 0)
        && (suggestions[0].distance > suggestion.distance);
    if (dropExisting)
    {
        suggestions.Clear();
    }

    // Do not add a suggestion of higher distance than the existing ones; the
    // distance check is skipped when verbose == 2 or the list is empty.
    bool addNew = (verbose == 2)
        || (suggestions.Count == 0)
        || (suggestions[0].distance >= suggestion.distance);
    if (addNew)
    {
        suggestions.Add(suggestion);
    }
}
// Finds spelling suggestions for an input term.
//
// input:           term to find suggestions for.
// language:        prefix prepended to every dictionary key.
// editDistanceMax: maximum edit distance a suggestion may have (shadows the field
//                  of the same name inside this method).
// Returns the suggestions sorted by ascending distance and then descending count;
// with verbose == 0 and more than one suggestion, only the first one is returned.
private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
{
    // Terms still to be checked, starting with the original term at distance 0.
    List<editItem> queue = new List<editItem>();
    queue.Add(new editItem { term = input, distance = 0 });

    List<suggestItem> suggestions = new List<suggestItem>();

    while (queue.Count > 0)
    {
        editItem candidate = queue[0];
        queue.RemoveAt(0);

        // Early termination to save time: suggestion distances range from
        // candidate.distance to candidate.distance + editDistanceMax, so if the
        // candidate distance is already higher than the distance of the suggestions
        // found, no better suggestions are to be expected.
        bool cannotImprove = (verbose < 2)
            && (suggestions.Count > 0)
            && (candidate.distance > suggestions[0].distance);
        if (cannotImprove || (candidate.distance > editDistanceMax))
        {
            break;
        }

        dictionaryItem hit;
        if (dictionary.TryGetValue(language + candidate.term, out hit))
        {
            // Correct term: the candidate itself is a dictionary word.
            if (!string.IsNullOrEmpty(hit.term))
            {
                suggestItem direct = new suggestItem
                {
                    term = hit.term,
                    count = hit.count,
                    distance = candidate.distance
                };

                if (!suggestions.Contains(direct))
                {
                    suggestions.Add(direct);

                    // Early termination: the input itself was found.
                    if ((verbose < 2) && (candidate.distance == 0))
                    {
                        break;
                    }
                }
            }

            // Edit term: follow its suggestions to the correct terms.
            foreach (editItem suggestion in hit.suggestions)
            {
                // Save some time by skipping double items early.
                if (suggestions.Exists(found => found.term == suggestion.term))
                {
                    continue;
                }

                int distance = TrueDistance(suggestion, candidate, input);

                if ((verbose < 2) && (suggestions.Count > 0))
                {
                    if (suggestions[0].distance > distance)
                    {
                        // Remove all existing suggestions of higher distance.
                        suggestions.Clear();
                    }
                    else if (distance > suggestions[0].distance)
                    {
                        // Do not process higher distances than those already found.
                        continue;
                    }
                }

                if (distance > editDistanceMax)
                {
                    continue;
                }

                dictionaryItem target;
                if (dictionary.TryGetValue(language + suggestion.term, out target))
                {
                    suggestions.Add(new suggestItem
                    {
                        term = target.term,
                        count = target.count,
                        distance = distance
                    });
                }
            }
        }

        // Add the candidate's deletes while the maximum distance is not reached.
        if (candidate.distance < editDistanceMax)
        {
            foreach (editItem delete in Edits(candidate.term, candidate.distance, false))
            {
                if (!queue.Contains(delete))
                {
                    queue.Add(delete);
                }
            }
        }
    }

    suggestions = suggestions
        .OrderBy(c => c.distance)
        .ThenByDescending(c => c.count)
        .ToList();

    return ((verbose == 0) && (suggestions.Count > 1))
        ? suggestions.GetRange(0, 1)
        : suggestions;
}