Example #1
0
        /// <summary>
        /// Finds spelling suggestions for a single term by repeatedly deleting characters from
        /// <paramref name="input"/> and looking the resulting candidates up in <c>dictionary</c>.
        /// </summary>
        /// <param name="input">Term to correct; it is lower-cased and trimmed before the lookup.</param>
        /// <param name="editDistanceMax">Largest edit distance a returned suggestion may have.</param>
        /// <returns>
        /// The suggestions found, passed through <c>ReturnSorted</c>. The list is empty when
        /// <c>input.Length - editDistanceMax</c> exceeds <c>maxlength</c> or when nothing matched.
        /// When <c>Verbose</c> is below 2 only suggestions of the smallest distance found are kept.
        /// </returns>
        public List <SuggestItem> Lookup(string input, int editDistanceMax = 2)
        {
            input = input.ToLower().Trim();

            //save some time: give up when the term exceeds maxlength by more than editDistanceMax
            if (input.Length - editDistanceMax > maxlength)
            {
                return(new List <SuggestItem>());
            }

            //candidates is worked through first-in-first-out. candidatePointer marks the next unprocessed entry,
            //so taking a candidate is O(1) instead of shifting the whole list with RemoveAt(0).
            List <string>    candidates       = new List <string>();
            HashSet <string> hashset1         = new HashSet <string>(); //deletes that were already queued
            int              candidatePointer = 0;

            List <SuggestItem> suggestions = new List <SuggestItem>();
            HashSet <string>   hashset2    = new HashSet <string>(); //terms that were already considered as suggestion

            //add original term
            candidates.Add(input);

            while (candidatePointer < candidates.Count)
            {
                string candidate = candidates[candidatePointer++];

                //number of characters deleted from input to obtain this candidate
                int lengthDiff = input.Length - candidate.Length;

                //save some time
                //early termination
                //suggestion distance=candidate.distance... candidate.distance+editDistanceMax
                //if the candidate distance is already higher than the suggestion distance, then there are no better suggestions to be expected
                if (Verbose < 2 &&
                    suggestions.Count > 0 &&
                    lengthDiff > suggestions[0].Distance)
                {
                    return(ReturnSorted(suggestions));
                }

                //read candidate entry from dictionary
                if (dictionary.TryGetValue(candidate, out int valueo))
                {
                    //a non-negative value is a single wordlist index, a negative value encodes an index into itemlist
                    DictionaryItem value = new DictionaryItem();
                    if (valueo >= 0)
                    {
                        value.Suggestions.Add(valueo);
                    }
                    else
                    {
                        value = itemlist[-valueo - 1];
                    }

                    //if count>0 then candidate entry is correct dictionary term, not only delete item
                    if (value.Count > 0 &&
                        hashset2.Add(candidate))
                    {
                        //the candidate was derived from input by deletes only, so its distance is lengthDiff

                        //save some time
                        //do not process higher distances than those already found, if verbose<2
                        if (Verbose == 2 ||
                            suggestions.Count == 0 ||
                            lengthDiff <= suggestions[0].Distance)
                        {
                            //remove all existing suggestions of higher distance, if verbose<2.
                            //Without this, suggestions of a higher distance could stay at the top of the list
                            //and the returned suggestions were not always complete (e.g. elove did not return love).
                            if (Verbose < 2 &&
                                suggestions.Count > 0 &&
                                suggestions[0].Distance > lengthDiff)
                            {
                                suggestions.Clear();
                            }

                            //add correct dictionary term to suggestion list
                            suggestions.Add(new SuggestItem(candidate, value.Count, lengthDiff));

                            //early termination: an exact match cannot be beaten
                            if (Verbose < 2 &&
                                lengthDiff == 0)
                            {
                                return(ReturnSorted(suggestions));
                            }
                        }
                    }

                    //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
                    foreach (int suggestionint in value.Suggestions)
                    {
                        //save some time
                        //skipping double items early: different deletes of the input term can lead to the same suggestion
                        //index2word
                        string suggestion = wordlist[suggestionint];
                        if (!hashset2.Add(suggestion))
                        {
                            continue;
                        }

                        //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                        //We allow simultaneous edits (deletes) of editDistanceMax on both the dictionary and the input term.
                        //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
                        //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
                        //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                        //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMax=1)
                        //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                        int distance = 0;
                        if (suggestion != input)
                        {
                            if (suggestion.Length == candidate.Length)
                            {
                                distance = lengthDiff;
                            }
                            else if (input.Length == candidate.Length)
                            {
                                distance = suggestion.Length - candidate.Length;
                            }
                            else
                            {
                                //common prefixes and suffixes are ignored, because this speeds up the Damerau-Levenshtein distance calculation without changing it.
                                int ii = 0;
                                int jj = 0;
                                while (ii < suggestion.Length &&
                                       ii < input.Length &&
                                       suggestion[ii] == input[ii])
                                {
                                    ii++;
                                }

                                while (jj < suggestion.Length - ii &&
                                       jj < input.Length - ii &&
                                       suggestion[suggestion.Length - jj - 1] == input[input.Length - jj - 1])
                                {
                                    jj++;
                                }

                                if (ii > 0 ||
                                    jj > 0)
                                {
                                    distance = suggestion.Substring(ii, suggestion.Length - ii - jj).DamerauLevenshteinDistance2(input.Substring(ii, input.Length - ii - jj));
                                }
                                else
                                {
                                    distance = suggestion.DamerauLevenshteinDistance2(input);
                                }
                            }
                        }

                        //save some time
                        //do not process higher distances than those already found, if verbose<2
                        if (Verbose < 2 &&
                            suggestions.Count > 0 &&
                            distance > suggestions[0].Distance)
                        {
                            continue;
                        }

                        if (distance <= editDistanceMax &&
                            dictionary.TryGetValue(suggestion, out int value2))
                        {
                            SuggestItem si = new SuggestItem(suggestion, itemlist[-value2 - 1].Count, distance);

                            //remove all existing suggestions of higher distance, if verbose<2
                            if (Verbose < 2 &&
                                suggestions.Count > 0 &&
                                suggestions[0].Distance > distance)
                            {
                                suggestions.Clear();
                            }

                            suggestions.Add(si);
                        }
                    } //end foreach
                }     //end if

                //add edits
                //derive edits (deletes) from candidate (input) and add them to candidates list
                //this continues level by level until the maximum edit distance has been reached
                if (lengthDiff < editDistanceMax)
                {
                    //save some time
                    //do not create edits whose distance would exceed that of the suggestions already found
                    if (Verbose < 2 &&
                        suggestions.Count > 0 &&
                        lengthDiff >= suggestions[0].Distance)
                    {
                        continue;
                    }

                    for (int i = 0; i < candidate.Length; i++)
                    {
                        string delete = candidate.Remove(i, 1);
                        if (hashset1.Add(delete))
                        {
                            candidates.Add(delete);
                        }
                    }
                }
            } //end while

            //sort by ascending edit distance, then by descending word frequency
            return(ReturnSorted(suggestions));
        }
Example #2
0
        /// <summary>
        /// Corrects a multi-word input: every term is looked up on its own, adjacent terms are
        /// tentatively merged into one word, and terms without an exact match are tentatively
        /// split into two words.
        /// </summary>
        /// <param name="input">Text to correct; it is lower-cased and trimmed before it is parsed into terms.</param>
        /// <param name="editDistanceMax">Maximum edit distance handed to every <c>Lookup</c> call.</param>
        /// <returns>
        /// A list holding exactly one item: the corrected line, the smallest count among its parts
        /// and the edit distance between the corrected line and the normalized input.
        /// </returns>
        public List <SuggestItem> LookupCompound(string input, int editDistanceMax = 2)
        {
            input = input.ToLower().Trim();

            //parse input string into single terms
            string[] termList1 = ParseWords(input).ToArray();

            var suggestionParts = new List <SuggestItem>(); //1 line with separate parts

            //true while the part stored last is a merge of two input terms; such a part is not merged again,
            //otherwise the next merge would overwrite it and lose the first of the merged terms
            bool lastCombi = false;

            //translate every term to its best suggestion, otherwise it remains unchanged
            for (int i = 0; i < termList1.Length; i++)
            {
                string term = termList1[i];

                //suggestions for a single term
                List <SuggestItem> suggestions = Lookup(term, editDistanceMax);

                //combi check, always before split
                if (i > 0 && !lastCombi)
                {
                    List <SuggestItem> suggestionsCombi = Lookup(termList1[i - 1] + term, editDistanceMax);

                    if (suggestionsCombi.Count > 0)
                    {
                        SuggestItem best1 = suggestionParts[suggestionParts.Count - 1];
                        SuggestItem best2;
                        if (suggestions.Count > 0)
                        {
                            best2 = suggestions[0];
                        }
                        else
                        {
                            //unknown word: term unchanged, count 0, distance beyond editDistanceMax
                            //(argument order term, count, distance as in every other SuggestItem construction here)
                            best2 = new SuggestItem(term, 0, editDistanceMax + 1);
                        }

                        //the merged word wins if it is closer to the input (plus 1 for the removed space)
                        //than the two separately corrected terms are
                        if (suggestionsCombi[0].Distance + 1 < (termList1[i - 1] + " " + term).DamerauLevenshteinDistance2(best1.Term + " " + best2.Term))
                        {
                            suggestionsCombi[0].IncreaseDistance();
                            suggestionParts[suggestionParts.Count - 1] = suggestionsCombi[0];
                            lastCombi = true;

                            //this term is consumed by the merge: go on with the next term.
                            //(leaving the loop here would drop all remaining terms from the result)
                            continue;
                        }
                    }
                }

                lastCombi = false;

                //always split terms without suggestion / never split terms with suggestion ed=0 / never split single char terms
                if (suggestions.Count > 0 &&
                    (suggestions[0].Distance == 0 || term.Length == 1))
                {
                    //choose best suggestion
                    suggestionParts.Add(suggestions[0]);
                    continue;
                }

                //if no perfect suggestion, split word into pairs
                List <SuggestItem> suggestionsSplit = new List <SuggestItem>();

                //add original term
                if (suggestions.Count > 0)
                {
                    suggestionsSplit.Add(suggestions[0]);
                }

                if (term.Length > 1)
                {
                    for (int j = 1; j < term.Length; j++)
                    {
                        string part1 = term.Substring(0, j);
                        string part2 = term.Substring(j);

                        List <SuggestItem> suggestions1 = Lookup(part1, editDistanceMax);
                        if (suggestions1.Count == 0)
                        {
                            continue;
                        }

                        //stop splitting if the correction of the first part equals the single-word correction
                        if (suggestions.Count > 0 &&
                            suggestions[0].Term == suggestions1[0].Term)
                        {
                            break;
                        }

                        List <SuggestItem> suggestions2 = Lookup(part2, editDistanceMax);
                        if (suggestions2.Count == 0)
                        {
                            continue;
                        }

                        //stop splitting if the correction of the second part equals the single-word correction
                        if (suggestions.Count > 0 &&
                            suggestions[0].Term == suggestions2[0].Term)
                        {
                            break;
                        }

                        //build the suggestion for this split pair
                        var         suggestionSplitTerm     = suggestions1[0].Term + " " + suggestions2[0].Term;
                        var         suggestionSplitDistance = term.DamerauLevenshteinDistance2(suggestionSplitTerm);
                        var         suggestionSplitCount    = Math.Min(suggestions1[0].Count, suggestions2[0].Count);
                        SuggestItem suggestionSplit         = new SuggestItem(suggestionSplitTerm, suggestionSplitCount, suggestionSplitDistance);
                        suggestionsSplit.Add(suggestionSplit);

                        //early termination of split
                        if (suggestionSplit.Distance == 1)
                        {
                            break;
                        }
                    }
                }

                if (term.Length > 1 &&
                    suggestionsSplit.Count > 0)
                {
                    //select best suggestion for split pair: distance outweighs count in the comparison
                    suggestionsSplit.Sort((x, y) => 2 * x.Distance.CompareTo(y.Distance) - x.Count.CompareTo(y.Count));
                    suggestionParts.Add(suggestionsSplit[0]);
                }
                else
                {
                    //nothing found: the term remains unchanged
                    SuggestItem si = new SuggestItem(term, 0, editDistanceMax + 1);
                    suggestionParts.Add(si);
                }
            }

            //join the parts into one line; its count is the smallest count of all parts
            var suggestionCount = long.MaxValue;
            var terms           = new List <string>(suggestionParts.Count);

            foreach (SuggestItem si in suggestionParts)
            {
                terms.Add(si.Term);
                suggestionCount = Math.Min(suggestionCount, si.Count);
            }

            var suggestionTerm                 = string.Join(" ", terms).TrimEnd();
            var suggestionDistance             = suggestionTerm.DamerauLevenshteinDistance2(input);
            var suggestion                     = new SuggestItem(suggestionTerm, suggestionCount, suggestionDistance);
            List <SuggestItem> suggestionsLine = new List <SuggestItem>();

            suggestionsLine.Add(suggestion);
            return(suggestionsLine);
        }
Example #3
0
        /// <summary>
        /// Finds spelling suggestions for a single term by generating deletes of <paramref name="word"/>
        /// (limited to its first <c>lp</c> characters) and looking them up in <c>dictionary</c>.
        /// </summary>
        /// <param name="word">Term to correct; it is trimmed and lower-cased before the lookup.</param>
        /// <param name="editDistance">Largest edit distance a returned suggestion may have.</param>
        /// <returns>
        /// The suggestions found, passed through <c>SortItems</c>. The list is empty when
        /// <c>word.Length - editDistance</c> exceeds <c>maxlength</c> or when nothing matched.
        /// When <c>Verbose</c> is below 2 only suggestions of the smallest distance found are kept.
        /// </returns>
        public List <SuggestItem> Lookup(string word, int editDistance = 2)
        {
            word = word.Trim().ToLower();
            var editDistanceMax = editDistance;

            //save some time: give up when the term exceeds maxlength by more than editDistanceMax
            if (word.Length - editDistanceMax > maxlength)
            {
                return(new List <SuggestItem>());
            }

            //candidates is worked through first-in-first-out via candidatePointer; hashset1 holds the deletes already queued
            List <string>    candidates = new List <string>();
            HashSet <string> hashset1   = new HashSet <string>();

            //hashset2 holds the terms that were already considered as suggestion
            List <SuggestItem> suggestions = new List <SuggestItem>();
            HashSet <string>   hashset2    = new HashSet <string>();

            //limit handed to the distance calculation; lowered to the best distance found while Verbose < 2
            int editDistanceMax2 = editDistanceMax;

            //index of the next unprocessed entry in candidates
            int candidatePointer = 0;

            //add original term
            candidates.Add(word);

            while (candidatePointer < candidates.Count)
            {
                string candidate  = candidates[candidatePointer++];
                //NOTE(review): lp is presumably the prefix length the dictionary was built with - confirm.
                //lengthDiff is negative for the original term when it is longer than lp.
                int    lengthDiff = Math.Min(word.Length, lp) - candidate.Length;

                //save some time
                //early termination
                //suggestion distance=candidate.distance... candidate.distance+editDistanceMax
                //if the candidate distance is already higher than the suggestion distance, then there are no better suggestions to be expected
                if (Verbose < 2 &&
                    suggestions.Count > 0 &&
                    lengthDiff >
                    suggestions[0]
                    .Distance)
                {
                    return(SortItems(suggestions));
                }

                //read candidate entry from dictionary
                if (dictionary.TryGetValue(candidate, out int valueo))
                {
                    //a non-negative value is a single wordlist index, a negative value encodes an index into itemlist
                    DictionaryItem value = new DictionaryItem();
                    if (valueo >= 0)
                    {
                        value.Suggestions.Add(valueo);
                    }
                    else
                    {
                        value = itemlist[-valueo - 1];
                    }

                    //if count>0 then candidate entry is correct dictionary term, not only delete item
                    if (value.Count > 0)
                    {
                        int distance = word.Length - candidate.Length;

                        //save some time
                        //do not process higher distances than those already found, if verbose<2
                        //(hashset2.Add comes last so the candidate is only marked as seen when it is actually taken)
                        if (distance <= editDistanceMax &&
                            (Verbose == 2 || suggestions.Count == 0 || distance <= suggestions[0].Distance) &&
                            hashset2.Add(candidate))
                        {
                            //remove all existing suggestions of higher distance, if verbose<2.
                            //Without this, suggestions of a higher distance could stay at the top of the list
                            //and the returned suggestions were not always complete (e.g. elove did not return love).
                            if (Verbose < 2 &&
                                suggestions.Count > 0 &&
                                suggestions[0]
                                .Distance >
                                distance)
                            {
                                suggestions.Clear(); //!!!
                            }
                            //add correct dictionary term to suggestion list
                            SuggestItem si = new SuggestItem(
                                candidate,
                                value.Count,
                                distance);
                            suggestions.Add(si);

                            //early termination: an exact match cannot be beaten
                            if (Verbose < 2 &&
                                distance == 0)
                            {
                                return(SortItems(suggestions));
                            }
                        }
                    }

                    //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
                    foreach (int suggestionint in value.Suggestions)
                    {
                        //save some time
                        //skipping double items early: different deletes of the input term can lead to the same suggestion
                        //index2word
                        string suggestion = wordlist[suggestionint];

                        //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                        //We allow simultaneous edits (deletes) of editDistanceMax on both the dictionary and the input term.
                        //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
                        //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
                        //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                        //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMax=1)
                        //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                        int distance = 0; //stays 0 when suggestion equals word
                        if (suggestion != word)
                        {
                            //min is assigned inside the prefix/suffix condition below
                            int min = 0;
                            //a length difference above the current limit already rules the suggestion out
                            if (Math.Abs(suggestion.Length - word.Length) > editDistanceMax2)
                            {
                                continue;
                            }
                            if (candidate.Length == 0)
                            {
                                //suggestions which have no common chars with input (input.length<=editDistanceMax && suggestion.length<=editDistanceMax)
                                if (!hashset2.Add(suggestion))
                                {
                                    continue;
                                }

                                distance = Math.Max(word.Length, suggestion.Length);
                            }
                            else

                            //number of edits in prefix == max edit distance AND no identical suffix: then edit distance > editDistanceMax and no need for Levenshtein calculation
                            //NOTE: && binds tighter than ||, so this reads (A && B && C) || (D && E && (F || G)); min keeps its initial 0 when A is false
                            if (lp - editDistanceMax == candidate.Length &&
                                (min = Math.Min(word.Length, suggestion.Length) - lp) > 1 &&
                                word.Substring(word.Length + 1 - min) != suggestion.Substring(suggestion.Length + 1 - min) ||
                                min > 0 &&
                                word[word.Length - min] != suggestion[suggestion.Length - min] &&
                                (word[word.Length - min - 1] != suggestion[suggestion.Length - min] || word[word.Length - min] != suggestion[suggestion.Length - min - 1]))
                            {
                                continue;
                            }
                            else //edit distance of remaining string (after prefix)
                            {
                                //suggestion has the candidate's length and word fits into lp: distance = chars deleted from word
                                if (suggestion.Length == candidate.Length &&
                                    word.Length <= lp)
                                {
                                    if (!hashset2.Add(suggestion))
                                    {
                                        continue;
                                    }

                                    distance = word.Length - candidate.Length;
                                }
                                //word has the candidate's length and suggestion fits into lp: distance = chars deleted from suggestion
                                else if (word.Length == candidate.Length &&
                                         suggestion.Length <= lp)
                                {
                                    if (!hashset2.Add(suggestion))
                                    {
                                        continue;
                                    }

                                    distance = suggestion.Length - candidate.Length;
                                }
                                //otherwise calculate the distance once per suggestion
                                else if (hashset2.Add(suggestion))
                                {
                                    distance = word.DamerauLevenshteinDistance(suggestion, editDistanceMax2);
                                    //a negative result is mapped to a value beyond editDistanceMax, so the suggestion is rejected below
                                    if (distance < 0)
                                    {
                                        distance = editDistanceMax + 1;
                                    }
                                }
                                else
                                {
                                    continue;
                                }
                            }
                        }
                        else if (!hashset2.Add(suggestion))
                        {
                            continue;
                        }

                        //save some time
                        //do not process higher distances than those already found, if verbose<2
                        if (Verbose < 2 &&
                            suggestions.Count > 0 &&
                            distance >
                            suggestions[0]
                            .Distance)
                        {
                            continue;
                        }
                        if (distance <= editDistanceMax)
                        {
                            if (dictionary.TryGetValue(suggestion, out int value2))
                            {
                                //the count is read from itemlist; this relies on value2 being negative for a word taken from wordlist
                                SuggestItem si = new SuggestItem(
                                    suggestion,
                                    itemlist[-value2 - 1]
                                    .Count,
                                    distance);

                                //we will calculate the Damerau-Levenshtein distance only up to the smallest distance found so far
                                if (Verbose < 2)
                                {
                                    editDistanceMax2 = distance;
                                }

                                //remove all existing suggestions of higher distance, if verbose<2
                                if (Verbose < 2 &&
                                    suggestions.Count > 0 &&
                                    suggestions[0]
                                    .Distance >
                                    distance)
                                {
                                    suggestions.Clear();
                                }
                                suggestions.Add(si);
                            }
                        }
                    } //end foreach
                }     //end if

                //add edits
                //derive edits (deletes) from candidate (input) and add them to candidates list
                //this continues level by level until the maximum edit distance has been reached
                if (lengthDiff < editDistanceMax)
                {
                    //save some time
                    //do not create edits whose distance would exceed that of the suggestions already found
                    if (Verbose < 2 &&
                        suggestions.Count > 0 &&
                        lengthDiff >=
                        suggestions[0]
                        .Distance)
                    {
                        continue; //!?!
                    }

                    if (candidate.Length > lp)
                    {
                        candidate = candidate.Substring(0, lp); //just the input entry might be > lp
                    }
                    for (int i = 0; i < candidate.Length; i++)
                    {
                        string delete = candidate.Remove(i, 1);

                        if (hashset1.Add(delete))
                        {
                            candidates.Add(delete);
                        }
                    }
                }
            } //end while

            return(SortItems(suggestions));
        }