Пример #1
0
 /// <summary>
 /// Adds <paramref name="suggestionint"/> (the Wordlist index of <paramref name="suggestion"/>) to the
 /// suggestion list of <paramref name="item"/>, the entry stored for <paramref name="delete"/>.
 /// Unless verbose is 2, only suggestions of the lowest distance are kept, which saves time and space.
 /// The distance of a word to the delete is measured as the difference of their lengths.
 /// </summary>
 /// <param name="item">Dictionary entry of the delete whose suggestion list is updated.</param>
 /// <param name="suggestion">The word the delete was derived from.</param>
 /// <param name="suggestionint">Index of <paramref name="suggestion"/> in Wordlist.</param>
 /// <param name="delete">The delete string the entry belongs to.</param>
 private void AddLowestDistance(DictionaryItem item, string suggestion, int suggestionint, string delete)
 {
     var known = item.Suggestions;

     //the new suggestion is closer than what is stored: drop the stored ones (only if verbose<2)
     //index2word: the first stored index is resolved through Wordlist
     if (verbose < 2 && known.Count > 0)
     {
         if (Wordlist[known[0]].Length - delete.Length > suggestion.Length - delete.Length)
         {
             known.Clear();
         }
     }

     //decide whether the new suggestion is stored at all
     bool keep;
     if (verbose == 2)
     {
         //verbose==2 keeps every suggestion
         keep = true;
     }
     else if (known.Count == 0)
     {
         //nothing stored yet (or everything was just cleared)
         keep = true;
     }
     else
     {
         //do not add a suggestion of higher distance than the existing ones
         keep = Wordlist[known[0]].Length - delete.Length >= suggestion.Length - delete.Length;
     }

     if (keep)
     {
         known.Add(suggestionint);
     }
 }
Пример #2
0
        /// <summary>
        /// Looks up spelling suggestions for <paramref name="input"/> within an edit distance of
        /// <paramref name="editDistanceMax"/>.
        /// </summary>
        /// <remarks>
        /// The input and every string derived from it by deleting up to <paramref name="editDistanceMax"/>
        /// characters are processed first-in-first-out, so candidates with fewer deletes come first.
        /// Each candidate is looked up in Dictionary: a hit with Count above zero is a dictionary word itself,
        /// and the Wordlist indices in its suggestion list point to further words.
        /// The verbose field selects the result set: below 2 only suggestions of the lowest distance found are
        /// kept (sorted by descending count), and with verbose 0 at most one suggestion is returned; otherwise
        /// all suggestions within the maximum distance are returned, sorted by ascending distance and then by
        /// descending count.
        /// </remarks>
        /// <param name="input">The term to correct.</param>
        /// <param name="editDistanceMax">Maximum edit distance between the input and a suggestion.</param>
        /// <returns>The list of suggestions; empty if the input is too long to match any dictionary word.</returns>
        private List<SuggestItem> Lookup(string input, int editDistanceMax)
        {
            //save some time: even after editDistanceMax deletes the input would be longer than the longest word
            if (input.Length - editDistanceMax > Maxlength) return new List<SuggestItem>();

            //FIFO queue of candidates; hashset1 holds the deletes that were already queued
            var candidates = new Queue<string>();
            var hashset1 = new HashSet<string>();

            //result list; hashset2 holds the terms that were already considered as suggestion
            var suggestions = new List<SuggestItem>();
            var hashset2 = new HashSet<string>();

            //add original term
            candidates.Enqueue(input);

            while (candidates.Count > 0)
            {
                var candidate = candidates.Dequeue();

                //number of characters deleted from the input to obtain this candidate
                int candidateDistance = input.Length - candidate.Length;

                //save some time
                //early termination
                //suggestion distance=candidate.distance... candidate.distance+editDistanceMax
                //if candidate distance is already higher than suggestion distance, then there are no better suggestions to be expected
                if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance > suggestions[0].Distance)) break;

                //read candidate entry from dictionary
                object valueo;
                if (Dictionary.TryGetValue(candidate, out valueo))
                {
                    //a bare int is a single suggestion index; wrap it only in that case
                    DictionaryItem value;
                    if (valueo is int)
                    {
                        value = new DictionaryItem();
                        value.Suggestions.Add((int)valueo);
                    }
                    else
                    {
                        value = (DictionaryItem)valueo;
                    }

                    //if count>0 then candidate entry is correct dictionary term, not only delete item
                    if ((value.Count > 0) && hashset2.Add(candidate))
                    {
                        //remove all existing suggestions of higher distance, if verbose<2
                        //without this the list could mix distances, leaving a worse suggestion in front
                        if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].Distance > candidateDistance)) suggestions.Clear();

                        //add correct dictionary term to suggestion list
                        suggestions.Add(new SuggestItem(candidate, value.Count, candidateDistance));

                        //early termination: an exact match cannot be beaten
                        if ((verbose < 2) && (candidateDistance == 0)) break;
                    }

                    //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
                    foreach (int suggestionint in value.Suggestions)
                    {
                        //save some time
                        //skipping double items early: different deletes of the input term can lead to the same suggestion
                        //index2word
                        string suggestion = Wordlist[suggestionint];
                        if (!hashset2.Add(suggestion)) continue;

                        //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                        //We allow simultaneous edits (deletes) of editDistanceMax on both the dictionary and the input term.
                        //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
                        //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
                        //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                        //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMax=1)
                        //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                        int distance = 0;
                        if (suggestion != input)
                        {
                            if (suggestion.Length == candidate.Length) distance = candidateDistance;
                            else if (input.Length == candidate.Length) distance = suggestion.Length - candidate.Length;
                            else
                            {
                                //common prefixes and suffixes are ignored, because this speeds up the Damerau-Levenshtein distance calculation without changing it.
                                int ii = 0;
                                int jj = 0;
                                while ((ii < suggestion.Length) && (ii < input.Length) && (suggestion[ii] == input[ii])) ii++;
                                while ((jj < suggestion.Length - ii) && (jj < input.Length - ii) && (suggestion[suggestion.Length - jj - 1] == input[input.Length - jj - 1])) jj++;
                                if ((ii > 0) || (jj > 0))
                                {
                                    distance = DamerauLevenshteinDistance(suggestion.Substring(ii, suggestion.Length - ii - jj), input.Substring(ii, input.Length - ii - jj));
                                }
                                else
                                {
                                    distance = DamerauLevenshteinDistance(suggestion, input);
                                }
                            }
                        }

                        //save some time.
                        //remove all existing suggestions of higher distance, if verbose<2
                        if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].Distance > distance)) suggestions.Clear();
                        //do not process higher distances than those already found, if verbose<2
                        if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].Distance)) continue;

                        if (distance > editDistanceMax) continue;

                        //the count of the suggestion is read from its own dictionary entry
                        object value2;
                        if (!Dictionary.TryGetValue(suggestion, out value2)) continue;
                        if (value2 == null) continue;
                        var item = (DictionaryItem)value2;
                        suggestions.Add(new SuggestItem(suggestion, item.Count, distance));
                    }//end foreach
                }//end if

                //add edits
                //derive edits (deletes) from candidate (input) and add them to candidates list
                //this is a recursive process until the maximum edit distance has been reached
                if (candidateDistance < editDistanceMax)
                {
                    //save some time
                    //do not create edits whose distance cannot beat the suggestions already found
                    if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance >= suggestions[0].Distance)) continue;

                    //queue every single-character delete that has not been queued before
                    for (int i = 0; i < candidate.Length; i++)
                    {
                        string delete = candidate.Remove(i, 1);
                        if (hashset1.Add(delete)) candidates.Enqueue(delete);
                    }
                }
            }//end while

            //sort by ascending edit distance, then by descending word frequency
            //(for verbose<2 all suggestions share one distance, so only the frequency matters)
            if (verbose < 2)
            {
                suggestions.Sort((x, y) => y.Count.CompareTo(x.Count));
            }
            else
            {
                suggestions.Sort((x, y) =>
                {
                    int byDistance = x.Distance.CompareTo(y.Distance);
                    return byDistance != 0 ? byDistance : y.Count.CompareTo(x.Count);
                });
            }

            if ((verbose == 0) && (suggestions.Count > 1)) return suggestions.GetRange(0, 1);
            return suggestions;
        }
Пример #3
0
        /// <summary>
        /// Registers one occurrence of the word <paramref name="key"/> in the dictionary.
        /// </summary>
        /// <remarks>
        /// For every word all deletes produced by Edits are added to the dictionary.
        /// Every delete entry has a suggestions list, which points (by Wordlist index) to the original term(s)
        /// it was created from. A delete with a single suggestion is stored as a bare int and is upgraded to a
        /// DictionaryItem as soon as more information has to be stored for it.
        /// The dictionary may be dynamically updated (word frequency and new words) at any time by calling
        /// this method.
        /// </remarks>
        /// <param name="key">The word to add or whose count is to be incremented.</param>
        /// <returns>True if the word count reached 1 with this call, i.e. the word was appended to Wordlist and its deletes were registered; otherwise false.</returns>
        private bool CreateDictionaryEntry(string key)
        {
            bool result = false;
            DictionaryItem value = null;
            object valueo;
            if (Dictionary.TryGetValue(key, out valueo))
            {
                //int or dictionaryItem? delete existed before word!
                if (valueo is int)
                {
                    //upgrade the bare suggestion index to a full entry so that a count can be stored
                    value = new DictionaryItem();
                    value.Suggestions.Add((int)valueo);
                    Dictionary[key] = value;
                }
                else
                {
                    //already exists:
                    //1. word appears several times
                    //2. word1==deletes(word2)
                    value = valueo as DictionaryItem;
                }

                //prevent overflow
                if (value != null && value.Count < int.MaxValue)
                {
                    value.Count++;
                }
            }
            else if (Wordlist.Count < int.MaxValue)
            {
                //completely new term
                value = new DictionaryItem();
                value.Count++;
                Dictionary.Add(key, value);

                if (key.Length > Maxlength) Maxlength = key.Length;
            }

            //edits/suggestions are created only once, no matter how often word occurs
            //edits/suggestions are created only as soon as the word occurs in the corpus,
            //even if the same term existed before in the dictionary as an edit from another word
            //a threshold might be specified, when a term occurs so frequently in the corpus that it is considered a valid word for spelling correction
            if (value != null && value.Count == 1)
            {
                //word2index
                Wordlist.Add(key);
                var keyint = Wordlist.Count - 1;

                result = true;

                //create deletes
                foreach (var delete in Edits(key, 0, new HashSet<string>()))
                {
                    object value2;
                    if (!Dictionary.TryGetValue(delete, out value2))
                    {
                        //first time this delete is seen: store the single suggestion as a bare index
                        Dictionary.Add(delete, keyint);
                        continue;
                    }

                    //already exists:
                    //1. word1==deletes(word2)
                    //2. deletes(word1)==deletes(word2)
                    //int or dictionaryItem? single delete existed before!
                    DictionaryItem di;
                    if (value2 is int)
                    {
                        //transforms int to dictionaryItem
                        di = new DictionaryItem();
                        di.Suggestions.Add((int)value2);
                        Dictionary[delete] = di;
                    }
                    else
                    {
                        di = value2 as DictionaryItem;
                    }

                    if (di != null && !di.Suggestions.Contains(keyint))
                    {
                        AddLowestDistance(di, key, keyint, delete);
                    }
                }
            }
            return result;
        }