Example #1
0
        /// <summary>
        /// Finds suggested spellings for a given input word. The search always runs with
        /// <c>Verbosity.Closest</c> and a maximum edit distance of 2; neither is configurable by the caller.
        /// </summary>
        /// <param name="inpt">The word being spell checked. It is lower-cased and every occurrence of
        /// <c>Uyghur.Sozghuch</c> is removed before the lookup.</param>
        /// <returns>The suggested terms in the order produced by <c>suggestions.Sort()</c>
        /// (intended: ascending edit distance, then descending frequency). When <c>gYeziq</c> is
        /// <c>Uyghur.YEZIQ.ULY</c> and the o→ö / u→ü / é→e rewrite of the normalized input passes
        /// <c>IsListed</c>, that rewritten form is inserted at the front of the list.
        /// An empty list is returned when the input is too long to match any dictionary word.</returns>
        /// <exception cref="ArgumentOutOfRangeException">The fixed maximum edit distance (2) is larger than
        /// <c>MaxDictionaryEditDistance</c>.</exception>
        public override List <string> Lookup(string inpt)
        {
            Verbosity verbosity       = SymSpell.Verbosity.Closest;
            int       maxEditDistance = 2;

            //verbosity=Top: the suggestion with the highest term frequency of the suggestions of smallest edit distance found
            //verbosity=Closest: all suggestions of smallest edit distance found, the suggestions are ordered by term frequency
            //verbosity=All: all suggestions <= maxEditDistance, the suggestions are ordered by edit distance, then by term frequency (slower, no early termination)

            // maxEditDistance used in Lookup can't be bigger than the maxDictionaryEditDistance
            // used to construct the underlying dictionary structure.
            if (maxEditDistance > MaxDictionaryEditDistance)
            {
                // The first constructor argument is the parameter NAME, not its value; report both explicitly.
                throw new ArgumentOutOfRangeException("maxEditDistance", maxEditDistance,
                                                      "maxEditDistance must not exceed MaxDictionaryEditDistance.");
            }

            List <SuggestItem> suggestions = new List <SuggestItem>();
            List <string>      Namzatlar   = new List <string>();

            // NOTE(review): ToLower() uses the current culture; confirm the dictionary keys were normalized the same way.
            string newinput = inpt.ToLower().Replace(Uyghur.Sozghuch, "");
            int    inputLen = newinput.Length;

            // early exit - word is too big to possibly match any words
            if (inputLen - maxEditDistance > maxLength)
            {
                return(Namzatlar);
            }

            // deletes we've considered already
            HashSet <string> hashset1 = new HashSet <string>();
            // suggestions we've considered already
            HashSet <string> hashset2 = new HashSet <string>();

            // quick look for exact match
            long suggestionCount = 0;

            if (words.TryGetValue(newinput, out suggestionCount))
            {
                suggestions.Add(new SuggestItem(newinput, 0, suggestionCount));
            }
            hashset2.Add(newinput);             // we considered the input already in the word.TryGetValue above

            // maxEditDistance2 shrinks to the best distance found so far (for verbosity < All)
            int maxEditDistance2 = maxEditDistance;
            int candidatePointer = 0;
            List <string> candidates = new List <string>();

            //add original prefix
            int inputPrefixLen = inputLen;

            if (inputPrefixLen > prefixLength)
            {
                inputPrefixLen = prefixLength;
                candidates.Add(newinput.Substring(0, inputPrefixLen));
            }
            else
            {
                candidates.Add(newinput);
            }
            var distanceComparer = new EditDistance(newinput, this.distanceAlgorithm);

            // candidates is used as a FIFO queue: deletes derived below are appended while it is being walked
            while (candidatePointer < candidates.Count)
            {
                string candidate    = candidates[candidatePointer++];
                int    candidateLen = candidate.Length;
                int    lengthDiff   = inputPrefixLen - candidateLen;

                //save some time - early termination
                //if candidate distance is already higher than suggestion distance, then there are no better suggestions to be expected
                if (lengthDiff > maxEditDistance2)
                {
                    // skip to next candidate if Verbosity.All, look no further if Verbosity.Top or Closest
                    // (candidates are ordered by delete distance, so none are closer than current)
                    if (verbosity == Verbosity.All)
                    {
                        continue;
                    }
                    break;
                }

                //read candidate entry from dictionary
                string[] dictSuggestions;
                if (deletes.TryGetValue(GetStringHash(candidate), out dictSuggestions))
                {
                    //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
                    for (int i = 0; i < dictSuggestions.Length; i++)
                    {
                        var suggestion    = dictSuggestions[i];
                        int suggestionLen = suggestion.Length;
                        if (suggestion == newinput)
                        {
                            continue;
                        }
                        if ((Math.Abs(suggestionLen - inputLen) > maxEditDistance2) ||                      // input and sugg lengths diff > allowed/current best distance
                            (suggestionLen < candidateLen) ||                         // sugg must be for a different delete string, in same bin only because of hash collision
                            (suggestionLen == candidateLen && suggestion != candidate))                            // if sugg len = delete len, then it either equals delete or is in same bin only because of hash collision
                        {
                            continue;
                        }
                        var suggPrefixLen = Math.Min(suggestionLen, prefixLength);
                        if (suggPrefixLen > inputPrefixLen && (suggPrefixLen - candidateLen) > maxEditDistance2)
                        {
                            continue;
                        }

                        //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                        //We allow simultaneous edits (deletes) of maxEditDistance on both the dictionary and the input term.
                        //For replaces and adjacent transposes the resulting edit distance stays <= maxEditDistance.
                        //For inserts and deletes the resulting edit distance might exceed maxEditDistance.
                        //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                        //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for maxEditDistance=1)
                        //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                        int distance = 0;
                        int min      = 0;
                        if (candidateLen == 0)
                        {
                            //suggestions which have no common chars with input (inputLen<=maxEditDistance && suggestionLen<=maxEditDistance)
                            distance = Math.Max(inputLen, suggestionLen);
                            if (distance > maxEditDistance2 || !hashset2.Add(suggestion))
                            {
                                continue;
                            }
                        }
                        else if (suggestionLen == 1)
                        {
                            // single-char suggestion: distance is the input length, minus one if the char occurs in the input
                            if (newinput.IndexOf(suggestion[0]) < 0)
                            {
                                distance = inputLen;
                            }
                            else
                            {
                                distance = inputLen - 1;
                            }
                            if (distance > maxEditDistance2 || !hashset2.Add(suggestion))
                            {
                                continue;
                            }
                        }
                        else
                        //number of edits in prefix ==maxEditDistance  AND no identical suffix
                        //, then editdistance>maxEditDistance and no need for Levenshtein calculation
                        //      (inputLen >= prefixLength) && (suggestionLen >= prefixLength)
                        //Note: the condition parses as (A && B) || C. min is only assigned inside B, so when A is
                        //false min is still 0 and C (which requires min > 0) is false as well.
                        if ((prefixLength - maxEditDistance == candidateLen) &&
                            (((min = Math.Min(inputLen, suggestionLen) - prefixLength) > 1) &&
                             (newinput.Substring(inputLen + 1 - min) != suggestion.Substring(suggestionLen + 1 - min))) ||
                            ((min > 0) && (newinput[inputLen - min] != suggestion[suggestionLen - min]) &&
                             ((newinput[inputLen - min - 1] != suggestion[suggestionLen - min]) ||
                              (newinput[inputLen - min] != suggestion[suggestionLen - min - 1]))))
                        {
                            continue;
                        }
                        else
                        {
                            // DeleteInSuggestionPrefix is somewhat expensive, and only pays off when verbosity is Top or Closest.
                            if ((verbosity != Verbosity.All && !DeleteInSuggestionPrefix(candidate, candidateLen, suggestion, suggestionLen)) ||
                                !hashset2.Add(suggestion))
                            {
                                continue;
                            }
                            // a negative result means the distance exceeds maxEditDistance2
                            distance = distanceComparer.Compare(suggestion, maxEditDistance2);
                            if (distance < 0)
                            {
                                continue;
                            }
                        }

                        //save some time
                        //do not process higher distances than those already found, if verbosity<All (note: maxEditDistance2 will always equal maxEditDistance when Verbosity.All)
                        if (distance <= maxEditDistance2)
                        {
                            suggestionCount = words[suggestion];
                            SuggestItem si = new SuggestItem(suggestion, distance, suggestionCount);
                            if (suggestions.Count > 0)
                            {
                                switch (verbosity)
                                {
                                case Verbosity.Closest:
                                {
                                    //we will calculate DamLev distance only to the smallest found distance so far
                                    if (distance < maxEditDistance2)
                                    {
                                        suggestions.Clear();
                                    }
                                    break;
                                }

                                case Verbosity.Top:
                                {
                                    // keep a single best item: closer wins, equal distance is decided by the higher count
                                    if (distance < maxEditDistance2 || suggestionCount > suggestions[0].count)
                                    {
                                        maxEditDistance2 = distance;
                                        suggestions[0]   = si;
                                    }
                                    continue;
                                }
                                }
                            }
                            if (verbosity != Verbosity.All)
                            {
                                maxEditDistance2 = distance;
                            }
                            suggestions.Add(si);
                        }
                    }            //end for
                }                //end if

                //add edits
                //derive edits (deletes) from candidate (input) and add them to candidates list
                //this is a recursive process until the maximum edit distance has been reached
                if ((lengthDiff < maxEditDistance) && (candidateLen <= prefixLength))
                {
                    //save some time
                    //do not create further deletes once their delete distance has reached the best distance already found
                    if (verbosity != Verbosity.All && lengthDiff >= maxEditDistance2)
                    {
                        continue;
                    }

                    for (int i = 0; i < candidateLen; i++)
                    {
                        string delete = candidate.Remove(i, 1);

                        if (hashset1.Add(delete))
                        {
                            candidates.Add(delete);
                        }
                    }
                }
            }            //end while

            //sort by ascending edit distance, then by descending word frequency
            if (suggestions.Count > 1)
            {
                suggestions.Sort();
            }
            foreach (SuggestItem item in suggestions)
            {
                Namzatlar.Add(item.term);
            }

            // ULY script only: also try the input with o/u/é rewritten to ö/ü/e and, if IsListed accepts it, put it first
            if (gYeziq == Uyghur.YEZIQ.ULY)
            {
                newinput = newinput.Replace('o', 'ö').Replace('u', 'ü').Replace('é', 'e');
                if (IsListed(newinput))
                {
                    Namzatlar.Insert(0, newinput);
                }
            }
            return(Namzatlar);
        }        //end Lookup
Example #2
0
        /// <summary>
        /// Finds suggested spellings for <paramref name="input"/> by probing the delete dictionary with the
        /// input (and its prefix) and with progressively shorter deletes of it.
        /// </summary>
        /// <param name="input">The word being spell checked.</param>
        /// <param name="language">Prefix that is prepended to every key looked up in <c>dictionary</c>.</param>
        /// <param name="editDistanceMax">The maximum edit distance between input and suggested words.</param>
        /// <param name="verbose">0: return only the first suggestion after sorting by count;
        /// 1: return all suggestions of the smallest edit distance found, ordered by count;
        /// 2: return all suggestions within <paramref name="editDistanceMax"/>, ordered by edit distance, then by count.</param>
        /// <returns>The matching suggestions; an empty list when the input is too long to match any word.</returns>
        public List <SuggestItem> Lookup(string input, string language, int editDistanceMax, int verbose)
        {
            // editDistanceMax used in Lookup can't be bigger than the editDistanceMax use to construct
            // the underlying dictionary structure.
            //if (editDistanceMax > this.editDistanceMax) throw new ArgumentOutOfRangeException();

            //save some time
            if (input.Length - editDistanceMax > maxLength)
            {
                return(new List <SuggestItem>());
            }

            // candidates is a FIFO queue of delete strings; hashset1 de-duplicates the deletes added to it
            List <string>    candidates = new List <string>();
            HashSet <string> hashset1   = new HashSet <string>();

            // hashset2 de-duplicates the terms already evaluated as suggestions
            List <SuggestItem> suggestions = new List <SuggestItem>();
            HashSet <string>   hashset2    = new HashSet <string>();

            // shrinks to the best distance found so far when verbose < 2
            int editDistanceMax2 = editDistanceMax;

            int candidatePointer = 0;

            //add original term
            candidates.Add(input);

            //add original prefix
            if (input.Length > lp)
            {
                candidates.Add(input.Substring(0, lp));
            }

            var distanceComparer = new EditDistance(EditDistance.DistanceAlgorithm.DamerauOSA);

            while (candidatePointer < candidates.Count)
            {
                string candidate  = candidates[candidatePointer++];
                int    lengthDiff = Math.Min(input.Length, lp) - candidate.Length;

                //save some time
                //early termination
                //suggestion distance=candidate.distance... candidate.distance+editDistanceMax
                //if candidate distance is already higher than suggestion distance, then there are no better suggestions to be expected
                if ((verbose < 2) && (suggestions.Count > 0) && (lengthDiff > suggestions[0].distance))
                {
                    goto sort;
                }

                //read candidate entry from dictionary
                if (dictionary.TryGetValue(language + candidate, out int valueo))
                {
                    // a non-negative value is a single suggestion index stored inline;
                    // a negative value v refers to the full entry itemlist[-v - 1]
                    DictionaryItem value = new DictionaryItem();
                    if (valueo >= 0)
                    {
                        value.suggestions.Add((Int32)valueo);
                    }
                    else
                    {
                        value = itemlist[-valueo - 1];
                    }

                    //if count>0 then candidate entry is correct dictionary term, not only delete item
                    if (value.count > 0)
                    {
                        int distance = input.Length - candidate.Length;

                        //save some time
                        //do not process higher distances than those already found, if verbose<2
                        if ((distance <= editDistanceMax) &&
                            ((verbose == 2) || (suggestions.Count == 0) || (distance <= suggestions[0].distance)) &&
                            (hashset2.Add(candidate)))
                        {
                            //Fix: previously not always all suggestions within edit distance (verbose=1) or the best suggestion (verbose=0) were returned : e.g. elove did not return love
                            //suggestions.Clear() was not executed in this branch, if a suggestion with lower edit distance was added here (for verbose<2).
                            //Then possibly suggestions with higher edit distance remained on top, the suggestion with lower edit distance were added to the end.
                            //All of them were deleted later once a suggestion with a lower distance than the first item in the list was later added in the other branch.
                            //Therefore returned suggestions were not always complete for verbose<2.
                            //remove all existing suggestions of higher distance, if verbose<2
                            if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance))
                            {
                                suggestions.Clear();
                            }

                            //add correct dictionary term to suggestion list
                            SuggestItem si = new SuggestItem()
                            {
                                term     = candidate,
                                count    = value.count,
                                distance = distance
                            };
                            suggestions.Add(si);
                            //early termination
                            if ((verbose < 2) && (distance == 0))
                            {
                                goto sort;
                            }
                        }
                    }

                    //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
                    foreach (int suggestionint in value.suggestions)
                    {
                        //save some time
                        //skipping double items early: different deletes of the input term can lead to the same suggestion
                        //index2word
                        string suggestion = wordlist[suggestionint];

                        //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                        //We allow simultaneous edits (deletes) of editDistanceMax on both the dictionary and the input term.
                        //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
                        //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
                        //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                        //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMax=1)
                        //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                        int distance = 0;// editDistanceMax+1;
                        if (suggestion != input)
                        {
                            int min = 0;
                            if (Math.Abs(suggestion.Length - input.Length) > editDistanceMax2)
                            {
                                continue;
                            }
                            else if (candidate.Length == 0)
                            {
                                //suggestions which have no common chars with input (input.length<=editDistanceMax && suggestion.length<=editDistanceMax)
                                if (!hashset2.Add(suggestion))
                                {
                                    continue;
                                }
                                distance = Math.Max(input.Length, suggestion.Length);
                            }
                            else
                            //number of edits in prefix ==editDistanceMax  AND no identical suffix, then editdistance>editDistanceMax and no need for Levenshtein calculation
                            //                                                 (input.Length >= lp) && (suggestion.Length >= lp)
                            //Note: the condition parses as (A && B) || C. min is only assigned inside B, so when A is
                            //false min is still 0 and C (which requires min > 0) is false as well.
                            if ((lp - editDistanceMax == candidate.Length) &&
                                (((min = Math.Min(input.Length, suggestion.Length) - lp) > 1) &&
                                 (input.Substring(input.Length + 1 - min) != suggestion.Substring(suggestion.Length + 1 - min))) ||
                                ((min > 0) && (input[input.Length - min] != suggestion[suggestion.Length - min]) &&
                                 ((input[input.Length - min - 1] != suggestion[suggestion.Length - min]) ||
                                  (input[input.Length - min] != suggestion[suggestion.Length - min - 1]))))
                            {
                                continue;
                            }
                            else
                            //edit distance of remaining string (after prefix)
                            if ((suggestion.Length == candidate.Length) && (input.Length <= lp))
                            {
                                if (!hashset2.Add(suggestion))
                                {
                                    continue;
                                }
                                distance = input.Length - candidate.Length;
                            }
                            else if ((input.Length == candidate.Length) && (suggestion.Length <= lp))
                            {
                                if (!hashset2.Add(suggestion))
                                {
                                    continue;
                                }
                                distance = suggestion.Length - candidate.Length;
                            }
                            else if (hashset2.Add(suggestion))
                            {
                                // a negative result means the distance exceeds editDistanceMax2; map it to "too far"
                                distance = distanceComparer.Compare(input, suggestion, editDistanceMax2);
                                if (distance < 0)
                                {
                                    distance = editDistanceMax + 1;
                                }
                            }
                            else
                            {
                                continue;
                            }
                        }
                        else if (!hashset2.Add(suggestion))
                        {
                            continue;
                        }

                        //save some time
                        //do not process higher distances than those already found, if verbose<2
                        if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].distance))
                        {
                            continue;
                        }
                        if (distance <= editDistanceMax)
                        {
                            if (dictionary.TryGetValue(language + suggestion, out int value2))
                            {
                                // NOTE(review): assumes every suggestion word has a full entry (negative value) — confirm against the dictionary builder
                                SuggestItem si = new SuggestItem()
                                {
                                    term     = suggestion,
                                    count    = itemlist[-value2 - 1].count,
                                    distance = distance
                                };

                                //we will calculate DamLev distance only to the smallest found distance so far
                                if (verbose < 2)
                                {
                                    editDistanceMax2 = distance;
                                }

                                //remove all existing suggestions of higher distance, if verbose<2
                                if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance))
                                {
                                    suggestions.Clear();
                                }
                                suggestions.Add(si);
                            }
                        }
                    } //end foreach
                }     //end if

                //add edits
                //derive edits (deletes) from candidate (input) and add them to candidates list
                //this is a recursive process until the maximum edit distance has been reached
                if ((lengthDiff < editDistanceMax) && (candidate.Length <= lp))
                {
                    //save some time
                    //do not create further deletes once their delete distance has reached the best distance already found
                    if ((verbose < 2) && (suggestions.Count > 0) && (lengthDiff >= suggestions[0].distance))
                    {
                        continue;
                    }

                    for (int i = 0; i < candidate.Length; i++)
                    {
                        string delete = candidate.Remove(i, 1);

                        if (hashset1.Add(delete))
                        {
                            candidates.Add(delete);
                        }
                    }
                }
            }//end while

            //sort by ascending edit distance, then by descending word frequency.
            //The keys are compared explicitly instead of doing arithmetic on CompareTo results:
            //CompareTo only promises the SIGN of its result, not the values -1/0/1.
        sort:
            if (verbose < 2)
            {
                //all remaining suggestions share the same distance, so only the count matters
                suggestions.Sort((x, y) => y.count.CompareTo(x.count));
            }
            else
            {
                suggestions.Sort((x, y) =>
                {
                    int byDistance = x.distance.CompareTo(y.distance);
                    return(byDistance != 0 ? byDistance : y.count.CompareTo(x.count));
                });
            }
            if ((verbose == 0) && (suggestions.Count > 1))
            {
                return(suggestions.GetRange(0, 1));
            }
            else
            {
                return(suggestions);
            }
        }
    /// <summary>
    /// Corrects a multi-word input string. Each term returned by <c>ParseWords</c> is replaced by its best
    /// <c>Lookup</c> suggestion; in addition, two adjacent terms may be merged into one word and a single
    /// term may be split into two words when that gives a better correction.
    /// </summary>
    /// <param name="input">The string to correct.</param>
    /// <param name="editDistanceMax">The maximum edit distance passed to every per-term <c>Lookup</c> and used
    /// as the cut-off for the intermediate distance comparisons.</param>
    /// <returns>A list containing exactly one item: the corrected line, its edit distance to
    /// <paramref name="input"/>, and the smallest count among its parts.</returns>
    public List <SuggestItem> LookupCompound(string input, int editDistanceMax)
    {
        //parse input string into single terms
        string[] termList1 = ParseWords(input);

        //one entry per part of the corrected output line
        List <SuggestItem> suggestionParts = new List <SuggestItem>();

        //true when the previous term was merged with its predecessor; a merged part is not merged again
        bool lastCombi = false;

        //translate every term to its best suggestion, otherwise it remains unchanged
        for (int i = 0; i < termList1.Length; i++)
        {
            string term = termList1[i];

            //best single-word correction of the current term (empty list if there is none)
            List <SuggestItem> suggestions = Lookup(term, Verbosity.Top, editDistanceMax);

            //combi check, always before split
            if ((i > 0) && !lastCombi)
            {
                List <SuggestItem> suggestionsCombi = Lookup(termList1[i - 1] + term, Verbosity.Top, editDistanceMax);

                if (suggestionsCombi.Count > 0)
                {
                    SuggestItem best1 = suggestionParts[suggestionParts.Count - 1];
                    //without a suggestion the current term stays unchanged and is rated worse than any real match
                    SuggestItem best2 = (suggestions.Count > 0)
                                        ? suggestions[0]
                                        : new SuggestItem { term = term, count = 0, distance = editDistanceMax + 1 };

                    //distance between the two original terms and their two separate corrections;
                    //a negative result means it exceeds editDistanceMax, in which case no merge is done
                    var distanceComparer1 = new EditDistance(termList1[i - 1] + " " + term, this.distanceAlgorithm);
                    int distance1         = distanceComparer1.Compare(best1.term + " " + best2.term, editDistanceMax);
                    if ((distance1 >= 0) && (suggestionsCombi[0].distance + 1 < distance1))
                    {
                        //the merged word wins: +1 accounts for the removed space; it replaces the previous part
                        suggestionsCombi[0].distance++;
                        suggestionParts[suggestionParts.Count - 1] = suggestionsCombi[0];
                        lastCombi = true;
                        continue;
                    }
                }
            }
            lastCombi = false;

            //always split terms without suggestion / never split terms with suggestion ed=0 / never split single char terms
            if ((suggestions.Count > 0) && ((suggestions[0].distance == 0) || (term.Length == 1)))
            {
                //choose best suggestion
                suggestionParts.Add(suggestions[0]);
                continue;
            }

            //no perfect suggestion: candidates are the single-word correction (if any) plus two-word splits
            List <SuggestItem> suggestionsSplit = new List <SuggestItem>();

            if (suggestions.Count > 0)
            {
                suggestionsSplit.Add(suggestions[0]);
            }

            //try every split position; the loop body never runs for terms shorter than two characters
            for (int j = 1; j < term.Length; j++)
            {
                List <SuggestItem> suggestions1 = Lookup(term.Substring(0, j), Verbosity.Top, editDistanceMax);
                if (suggestions1.Count == 0)
                {
                    continue;
                }
                if ((suggestions.Count > 0) && (suggestions[0].term == suggestions1[0].term))
                {
                    break;              //correction of the first part equals the single-word correction
                }

                List <SuggestItem> suggestions2 = Lookup(term.Substring(j), Verbosity.Top, editDistanceMax);
                if (suggestions2.Count == 0)
                {
                    continue;
                }
                if ((suggestions.Count > 0) && (suggestions[0].term == suggestions2[0].term))
                {
                    break;              //correction of the second part equals the single-word correction
                }

                //rate the split pair against the original term
                string splitTerm         = suggestions1[0].term + " " + suggestions2[0].term;
                var    distanceComparer2 = new EditDistance(term, this.distanceAlgorithm);
                int    distance2         = distanceComparer2.Compare(splitTerm, editDistanceMax);
                if (distance2 < 0)
                {
                    distance2 = editDistanceMax + 1;
                }
                suggestionsSplit.Add(new SuggestItem
                {
                    term     = splitTerm,
                    distance = distance2,
                    count    = Math.Min(suggestions1[0].count, suggestions2[0].count)
                });

                //early termination of split
                if (distance2 == 1)
                {
                    break;
                }
            }

            if ((term.Length > 1) && (suggestionsSplit.Count > 0))
            {
                //select best candidate: ascending edit distance, then descending count.
                //The keys are compared explicitly because CompareTo only promises the sign of its result.
                suggestionsSplit.Sort((x, y) =>
                {
                    int byDistance = x.distance.CompareTo(y.distance);
                    return(byDistance != 0 ? byDistance : y.count.CompareTo(x.count));
                });
                suggestionParts.Add(suggestionsSplit[0]);
            }
            else
            {
                //nothing found: keep the term unchanged
                suggestionParts.Add(new SuggestItem { term = term, count = 0, distance = editDistanceMax + 1 });
            }
        }

        //assemble the corrected line from its parts
        SuggestItem suggestion = new SuggestItem();

        //NOTE(review): for an input without any terms the count stays Int64.MaxValue — confirm callers expect that
        suggestion.count = Int64.MaxValue;
        List <string> terms = new List <string>(suggestionParts.Count);
        foreach (SuggestItem si in suggestionParts)
        {
            terms.Add(si.term);
            suggestion.count = Math.Min(suggestion.count, si.count);
        }

        suggestion.term = string.Join(" ", terms.ToArray()).TrimEnd();

        //distance of the whole corrected line to the original input, without an upper bound
        var distanceComparer3 = new EditDistance(suggestion.term, this.distanceAlgorithm);

        suggestion.distance = distanceComparer3.Compare(input, int.MaxValue);

        List <SuggestItem> suggestionsLine = new List <SuggestItem>();

        suggestionsLine.Add(suggestion);
        return(suggestionsLine);
    }