Beispiel #1
0
    /// <summary>
    /// Looks up spelling suggestions for <paramref name="input"/>. Deletes of the input are generated
    /// breadth-first (fewest deletes first) and each one is looked up in <c>dictionary</c> under the key
    /// <c>language + candidate</c>.
    /// </summary>
    /// <param name="input">Term to find suggestions for.</param>
    /// <param name="language">Prefix that is prepended to every dictionary key.</param>
    /// <param name="editDistanceMax">Maximum number of deletes applied to the input and maximum distance of an accepted suggestion.</param>
    /// <returns>
    /// verbose == 0: at most one suggestion (highest count among the smallest distance found);
    /// other verbose values below 2: all suggestions of the smallest distance found, by descending count;
    /// verbose >= 2: all suggestions found, by ascending distance, then by descending count.
    /// </returns>
    private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
    {
        //save some time
        //(presumably maxlength is the length of the longest dictionary term - it is maintained outside this method)
        if (input.Length - editDistanceMax > maxlength) return new List<suggestItem>();

        //FIFO work list of delete candidates; seenCandidates prevents queuing the same delete twice
        Queue<string> candidates = new Queue<string>();
        HashSet<string> seenCandidates = new HashSet<string>();

        List<suggestItem> suggestions = new List<suggestItem>();
        //terms that have already been considered as a suggestion
        HashSet<string> seenSuggestions = new HashSet<string>();

        //add original term
        candidates.Enqueue(input);

        while (candidates.Count > 0)
        {
            string candidate = candidates.Dequeue();

            //candidates are only ever produced by deleting characters from input,
            //so the length difference is the number of deletes applied so far
            int candidateDistance = input.Length - candidate.Length;

            //save some time
            //early termination
            //if the candidate distance is already higher than the suggestion distance, no better suggestions can be expected
            if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance > suggestions[0].distance)) break;

            //read candidate entry from dictionary
            object entry;
            if (dictionary.TryGetValue(language + candidate, out entry))
            {
                //an entry is either a bare Int32 (a single suggestion index) or a full dictionaryItem
                dictionaryItem value;
                if (entry is Int32)
                {
                    value = new dictionaryItem();
                    value.suggestions.Add((Int32)entry);
                }
                else
                {
                    value = (dictionaryItem)entry;
                }

                //if count>0 then candidate entry is a correct dictionary term, not only a delete item
                if ((value.count > 0) && seenSuggestions.Add(candidate))
                {
                    //a correct term at a smaller distance supersedes everything collected so far;
                    //without this the list mixes distances, and the count-only sort below could rank
                    //a farther but more frequent term first
                    if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > candidateDistance)) suggestions.Clear();

                    //add correct dictionary term to suggestion list
                    suggestItem si = new suggestItem();
                    si.term = candidate;
                    si.count = value.count;
                    si.distance = candidateDistance;
                    suggestions.Add(si);

                    //early termination: exact match
                    if ((verbose < 2) && (candidateDistance == 0)) break;
                }

                //iterate through suggestions (to other correct dictionary items) of the delete item and add them to the suggestion list
                foreach (int suggestionint in value.suggestions)
                {
                    //index2word
                    string suggestion = wordlist[suggestionint];

                    //save some time
                    //skipping double items early: different deletes of the input term can lead to the same suggestion
                    if (!seenSuggestions.Add(suggestion)) continue;

                    //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                    //We allow simultaneous edits (deletes) of editDistanceMax on both the dictionary and the input term.
                    //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
                    //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
                    //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                    //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMax=1)
                    //Two deletes on each side of a pair make them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                    int distance = 0;
                    if (suggestion != input)
                    {
                        if (suggestion.Length == candidate.Length) distance = candidateDistance;
                        else if (input.Length == candidate.Length) distance = suggestion.Length - candidate.Length;
                        else
                        {
                            //common prefixes and suffixes are ignored, because this speeds up the Damerau-Levenshtein distance calculation without changing it
                            int ii = 0;
                            int jj = 0;
                            while ((ii < suggestion.Length) && (ii < input.Length) && (suggestion[ii] == input[ii])) ii++;
                            while ((jj < suggestion.Length - ii) && (jj < input.Length - ii) && (suggestion[suggestion.Length - jj - 1] == input[input.Length - jj - 1])) jj++;
                            if ((ii > 0) || (jj > 0))
                            {
                                distance = DamerauLevenshteinDistance(suggestion.Substring(ii, suggestion.Length - ii - jj), input.Substring(ii, input.Length - ii - jj));
                            }
                            else
                            {
                                distance = DamerauLevenshteinDistance(suggestion, input);
                            }
                        }
                    }

                    //save some time
                    //remove all existing suggestions of higher distance, if verbose<2
                    if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance)) suggestions.Clear();
                    //do not process higher distances than those already found, if verbose<2
                    if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].distance)) continue;

                    if (distance <= editDistanceMax)
                    {
                        object suggestionEntry;
                        if (dictionary.TryGetValue(language + suggestion, out suggestionEntry))
                        {
                            suggestItem si = new suggestItem();
                            si.term = suggestion;
                            //direct cast instead of 'as': an unexpected entry type fails with a descriptive cast error instead of a null dereference
                            si.count = ((dictionaryItem)suggestionEntry).count;
                            si.distance = distance;
                            suggestions.Add(si);
                        }
                    }
                }//end foreach
            }//end if

            //add edits
            //derive edits (deletes) from candidate (input) and add them to the candidates queue
            //this repeats until the maximum edit distance has been reached
            if (candidateDistance < editDistanceMax)
            {
                //save some time
                //do not create deeper edits once suggestions at this distance or closer have been found
                if ((verbose < 2) && (suggestions.Count > 0) && (candidateDistance >= suggestions[0].distance)) continue;

                for (int i = 0; i < candidate.Length; i++)
                {
                    string delete = candidate.Remove(i, 1);
                    if (seenCandidates.Add(delete)) candidates.Enqueue(delete);
                }
            }
        }//end while

        if (verbose < 2)
        {
            //all remaining suggestions share the smallest distance: order by descending word frequency
            suggestions.Sort((x, y) => y.count.CompareTo(x.count));
        }
        else
        {
            //sort by ascending edit distance, then by descending word frequency
            suggestions.Sort((x, y) =>
            {
                int byDistance = x.distance.CompareTo(y.distance);
                return (byDistance != 0) ? byDistance : y.count.CompareTo(x.count);
            });
        }

        if ((verbose == 0) && (suggestions.Count > 1)) return suggestions.GetRange(0, 1);
        return suggestions;
    }
Beispiel #2
0
    /// <summary>
    /// Looks up spelling suggestions for <paramref name="input"/>. Edits of the input (obtained from
    /// <c>Edits</c>, defined elsewhere) are processed first-in-first-out, and each one is looked up in
    /// <c>dictionary</c> under the key <c>language + term</c>.
    /// </summary>
    /// <param name="input">Term to find suggestions for.</param>
    /// <param name="language">Prefix that is prepended to every dictionary key.</param>
    /// <param name="editDistanceMax">Maximum distance of a processed candidate and of an accepted suggestion.</param>
    /// <returns>
    /// Suggestions ordered by ascending distance, then by descending count. For verbose == 0 at most one
    /// suggestion is returned; for other verbose values below 2 only suggestions of the smallest distance
    /// found are kept.
    /// </returns>
    private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
    {
        //FIFO work list, seeded with the unmodified input at distance 0
        Queue<editItem> candidates = new Queue<editItem>();

        //add original term
        editItem item = new editItem();
        item.term = input;
        item.distance = 0;
        candidates.Enqueue(item);

        List<suggestItem> suggestions = new List<suggestItem>();
        dictionaryItem value;

        while (candidates.Count > 0)
        {
            editItem candidate = candidates.Dequeue();

            //save some time
            //early termination
            //if the candidate distance is already higher than the suggestion distance, no better suggestions can be expected
            if ((verbose < 2) && (suggestions.Count > 0) && (candidate.distance > suggestions[0].distance)) break;
            if (candidate.distance > editDistanceMax) break;

            if (dictionary.TryGetValue(language + candidate.term, out value))
            {
                //a non-empty term marks the entry as a correct dictionary term
                if (!string.IsNullOrEmpty(value.term))
                {
                    //correct term
                    suggestItem si = new suggestItem();
                    si.term = value.term;
                    si.count = value.count;
                    si.distance = candidate.distance;

                    if (!suggestions.Contains(si))
                    {
                        //a correct term at a smaller distance supersedes everything collected so far;
                        //without this, verbose<2 would return mixed distances and keep pruning against a stale suggestions[0]
                        if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > candidate.distance)) suggestions.Clear();

                        suggestions.Add(si);
                        //early termination: exact match
                        if ((verbose < 2) && (candidate.distance == 0)) break;
                    }
                }

                //edit term (with suggestions to correct term)
                dictionaryItem value2;
                foreach (editItem suggestion in value.suggestions)
                {
                    //save some time
                    //skipping double items early
                    if (suggestions.Find(x => x.term == suggestion.term) == null)
                    {
                        //TrueDistance is defined elsewhere; its result is used as the suggestion's distance to the input
                        int distance = TrueDistance(suggestion, candidate, input);

                        //save some time
                        //remove all existing suggestions of higher distance, if verbose<2
                        if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance)) suggestions.Clear();
                        //do not process higher distances than those already found, if verbose<2
                        if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].distance)) continue;

                        if (distance <= editDistanceMax)
                        {
                            if (dictionary.TryGetValue(language + suggestion.term, out value2))
                            {
                                suggestItem si = new suggestItem();
                                si.term = value2.term;
                                si.count = value2.count;
                                si.distance = distance;

                                suggestions.Add(si);
                            }
                        }
                    }
                }//end foreach
            }//end if

            //add edits
            if (candidate.distance < editDistanceMax)
            {
                foreach (editItem delete in Edits(candidate.term, candidate.distance, false))
                {
                    //only edits that are not currently waiting in the queue are added
                    if (!candidates.Contains(delete)) candidates.Enqueue(delete);
                }
            }
        }//end while

        //sort by ascending edit distance, then by descending word frequency
        suggestions = suggestions.OrderBy(c => c.distance).ThenByDescending(c => c.count).ToList();
        if ((verbose == 0) && (suggestions.Count > 1)) return suggestions.GetRange(0, 1);
        return suggestions;
    }
Beispiel #3
0
    /// <summary>
    /// Finds spelling suggestions for <paramref name="input"/>. Deletes of the input are generated
    /// breadth-first (fewest deletes first); every delete is looked up in <c>dictionary</c> under the key
    /// <c>language + candidate</c>.
    /// </summary>
    /// <param name="input">Term to find suggestions for.</param>
    /// <param name="language">Prefix that is prepended to every dictionary key.</param>
    /// <param name="editDistanceMax">Maximum number of deletes applied to the input and maximum distance of an accepted suggestion.</param>
    /// <returns>
    /// verbose == 0: at most one suggestion (highest count among the smallest distance found);
    /// other verbose values below 2: all suggestions of the smallest distance found, by descending count;
    /// verbose >= 2: all suggestions found, by ascending distance, then by descending count.
    /// </returns>
    private static List <suggestItem> Lookup(string input, string language, int editDistanceMax)
    {
        //save some time
        //(presumably maxlength is the length of the longest dictionary term - it is maintained outside this method)
        if (input.Length - editDistanceMax > maxlength)
        {
            return(new List <suggestItem>());
        }

        //FIFO work list of delete candidates; queuedDeletes prevents queuing the same delete twice
        Queue <string>   candidates    = new Queue <string>();
        HashSet <string> queuedDeletes = new HashSet <string>();

        List <suggestItem> suggestions = new List <suggestItem>();
        //terms that have already been considered as a suggestion
        HashSet <string> consideredTerms = new HashSet <string>();

        //add original term
        candidates.Enqueue(input);

        while (candidates.Count > 0)
        {
            string candidate = candidates.Dequeue();

            //candidates are only ever produced by deleting characters from input,
            //so the length difference is the number of deletes applied so far
            int deletes = input.Length - candidate.Length;

            //save some time
            //early termination
            //if the candidate distance is already higher than the suggestion distance, no better suggestions can be expected
            if ((verbose < 2) && (suggestions.Count > 0) && (deletes > suggestions[0].distance))
            {
                break;
            }

            //read candidate entry from dictionary
            object rawEntry;
            if (dictionary.TryGetValue(language + candidate, out rawEntry))
            {
                //an entry is either a bare Int32 (a single suggestion index) or a full dictionaryItem
                dictionaryItem value;
                if (rawEntry is Int32)
                {
                    value = new dictionaryItem();
                    value.suggestions.Add((Int32)rawEntry);
                }
                else
                {
                    value = (dictionaryItem)rawEntry;
                }

                //if count>0 then candidate entry is a correct dictionary term, not only a delete item
                if ((value.count > 0) && consideredTerms.Add(candidate))
                {
                    //a correct term at a smaller distance supersedes everything collected so far;
                    //without this the list mixes distances, and the count-only sort below could rank
                    //a farther but more frequent term first
                    if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > deletes))
                    {
                        suggestions.Clear();
                    }

                    //add correct dictionary term to suggestion list
                    suggestItem si = new suggestItem();
                    si.term     = candidate;
                    si.count    = value.count;
                    si.distance = deletes;
                    suggestions.Add(si);

                    //early termination: exact match
                    if ((verbose < 2) && (deletes == 0))
                    {
                        break;
                    }
                }

                //iterate through suggestions (to other correct dictionary items) of the delete item and add them to the suggestion list
                foreach (int suggestionint in value.suggestions)
                {
                    //index2word
                    string suggestion = wordlist[suggestionint];

                    //save some time
                    //skipping double items early: different deletes of the input term can lead to the same suggestion
                    if (!consideredTerms.Add(suggestion))
                    {
                        continue;
                    }

                    //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                    //We allow simultaneous edits (deletes) of editDistanceMax on both the dictionary and the input term.
                    //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
                    //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
                    //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                    //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMax=1)
                    //Two deletes on each side of a pair make them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                    int distance = 0;
                    if (suggestion != input)
                    {
                        if (suggestion.Length == candidate.Length)
                        {
                            distance = deletes;
                        }
                        else if (input.Length == candidate.Length)
                        {
                            distance = suggestion.Length - candidate.Length;
                        }
                        else
                        {
                            //common prefixes and suffixes are ignored, because this speeds up the Damerau-Levenshtein distance calculation without changing it
                            int ii = 0;
                            int jj = 0;
                            while ((ii < suggestion.Length) && (ii < input.Length) && (suggestion[ii] == input[ii]))
                            {
                                ii++;
                            }
                            while ((jj < suggestion.Length - ii) && (jj < input.Length - ii) && (suggestion[suggestion.Length - jj - 1] == input[input.Length - jj - 1]))
                            {
                                jj++;
                            }
                            if ((ii > 0) || (jj > 0))
                            {
                                distance = DamerauLevenshteinDistance(suggestion.Substring(ii, suggestion.Length - ii - jj), input.Substring(ii, input.Length - ii - jj));
                            }
                            else
                            {
                                distance = DamerauLevenshteinDistance(suggestion, input);
                            }
                        }
                    }

                    //save some time
                    //remove all existing suggestions of higher distance, if verbose<2
                    if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance))
                    {
                        suggestions.Clear();
                    }
                    //do not process higher distances than those already found, if verbose<2
                    if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].distance))
                    {
                        continue;
                    }

                    if (distance <= editDistanceMax)
                    {
                        object suggestionEntry;
                        if (dictionary.TryGetValue(language + suggestion, out suggestionEntry))
                        {
                            suggestItem si = new suggestItem();
                            si.term = suggestion;
                            //direct cast instead of 'as': an unexpected entry type fails with a descriptive cast error instead of a null dereference
                            si.count    = ((dictionaryItem)suggestionEntry).count;
                            si.distance = distance;
                            suggestions.Add(si);
                        }
                    }
                } //end foreach
            }     //end if

            //add edits
            //derive edits (deletes) from candidate (input) and add them to the candidates queue
            //this repeats until the maximum edit distance has been reached
            if (deletes < editDistanceMax)
            {
                //save some time
                //do not create deeper edits once suggestions at this distance or closer have been found
                if ((verbose < 2) && (suggestions.Count > 0) && (deletes >= suggestions[0].distance))
                {
                    continue;
                }

                for (int i = 0; i < candidate.Length; i++)
                {
                    string delete = candidate.Remove(i, 1);
                    if (queuedDeletes.Add(delete))
                    {
                        candidates.Enqueue(delete);
                    }
                }
            }
        }//end while

        if (verbose < 2)
        {
            //all remaining suggestions share the smallest distance: order by descending word frequency
            suggestions.Sort((x, y) => y.count.CompareTo(x.count));
        }
        else
        {
            //sort by ascending edit distance, then by descending word frequency
            suggestions.Sort((x, y) =>
            {
                int byDistance = x.distance.CompareTo(y.distance);
                return((byDistance != 0) ? byDistance : y.count.CompareTo(x.count));
            });
        }

        if ((verbose == 0) && (suggestions.Count > 1))
        {
            return(suggestions.GetRange(0, 1));
        }
        return(suggestions);
    }
Beispiel #4
0
    /// <summary>
    /// Finds spelling suggestions for <paramref name="input"/>. Edits of the input (obtained from
    /// <c>Edits</c>, defined elsewhere) are processed first-in-first-out; every edit is looked up in
    /// <c>dictionary</c> under the key <c>language + term</c>.
    /// </summary>
    /// <param name="input">Term to find suggestions for.</param>
    /// <param name="language">Prefix that is prepended to every dictionary key.</param>
    /// <param name="editDistanceMax">Maximum distance of a processed candidate and of an accepted suggestion.</param>
    /// <returns>
    /// Suggestions ordered by ascending distance, then by descending count. For verbose == 0 at most one
    /// suggestion is returned; for other verbose values below 2 only suggestions of the smallest distance
    /// found are kept.
    /// </returns>
    private static List <suggestItem> Lookup(string input, string language, int editDistanceMax)
    {
        //FIFO work list, seeded with the unmodified input at distance 0
        Queue <editItem> candidates = new Queue <editItem>();

        //add original term
        editItem item = new editItem();

        item.term     = input;
        item.distance = 0;
        candidates.Enqueue(item);

        List <suggestItem> suggestions = new List <suggestItem>();
        dictionaryItem     value;

        while (candidates.Count > 0)
        {
            editItem candidate = candidates.Dequeue();

            //save some time
            //early termination
            //if the candidate distance is already higher than the suggestion distance, no better suggestions can be expected
            if ((verbose < 2) && (suggestions.Count > 0) && (candidate.distance > suggestions[0].distance))
            {
                break;
            }
            if (candidate.distance > editDistanceMax)
            {
                break;
            }

            if (dictionary.TryGetValue(language + candidate.term, out value))
            {
                //a non-empty term marks the entry as a correct dictionary term
                if (!string.IsNullOrEmpty(value.term))
                {
                    //correct term
                    suggestItem si = new suggestItem();
                    si.term     = value.term;
                    si.count    = value.count;
                    si.distance = candidate.distance;

                    if (!suggestions.Contains(si))
                    {
                        //a correct term at a smaller distance supersedes everything collected so far;
                        //without this, verbose<2 would return mixed distances and keep pruning against a stale suggestions[0]
                        if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > candidate.distance))
                        {
                            suggestions.Clear();
                        }

                        suggestions.Add(si);
                        //early termination: exact match
                        if ((verbose < 2) && (candidate.distance == 0))
                        {
                            break;
                        }
                    }
                }

                //edit term (with suggestions to correct term)
                dictionaryItem value2;
                foreach (editItem suggestion in value.suggestions)
                {
                    //save some time
                    //skipping double items early
                    if (suggestions.Find(x => x.term == suggestion.term) == null)
                    {
                        //TrueDistance is defined elsewhere; its result is used as the suggestion's distance to the input
                        int distance = TrueDistance(suggestion, candidate, input);

                        //save some time
                        //remove all existing suggestions of higher distance, if verbose<2
                        if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance))
                        {
                            suggestions.Clear();
                        }
                        //do not process higher distances than those already found, if verbose<2
                        if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].distance))
                        {
                            continue;
                        }

                        if (distance <= editDistanceMax)
                        {
                            if (dictionary.TryGetValue(language + suggestion.term, out value2))
                            {
                                suggestItem si = new suggestItem();
                                si.term     = value2.term;
                                si.count    = value2.count;
                                si.distance = distance;

                                suggestions.Add(si);
                            }
                        }
                    }
                } //end foreach
            }     //end if

            //add edits
            if (candidate.distance < editDistanceMax)
            {
                foreach (editItem delete in Edits(candidate.term, candidate.distance, false))
                {
                    //only edits that are not currently waiting in the queue are added
                    if (!candidates.Contains(delete))
                    {
                        candidates.Enqueue(delete);
                    }
                }
            }
        }//end while

        //sort by ascending edit distance, then by descending word frequency
        suggestions = suggestions.OrderBy(c => c.distance).ThenByDescending(c => c.count).ToList();
        if ((verbose == 0) && (suggestions.Count > 1))
        {
            return(suggestions.GetRange(0, 1));
        }
        return(suggestions);
    }
Beispiel #5
0
    /// <summary>
    /// Corrects a multi-term input. The input is split into terms with <c>parseWords</c>; for every term
    /// the method picks one of: the best single-term suggestion from <c>Lookup</c>, a merge of the term
    /// with its predecessor, or a split of the term into two suggested words. The chosen parts are joined
    /// with single spaces.
    /// </summary>
    /// <param name="input">Text to correct.</param>
    /// <param name="language">Passed through to every <c>Lookup</c> call.</param>
    /// <param name="editDistanceMax">Passed through to every <c>Lookup</c> call; terms without any suggestion get distance editDistanceMax + 1.</param>
    /// <returns>
    /// A list with exactly one item: term is the joined correction, count is the smallest count of its
    /// parts, distance is the Damerau-Levenshtein distance between the correction and <paramref name="input"/>.
    /// </returns>
    public static List <suggestItem> LookupCompound(string input, string language, int editDistanceMax)
    {
        //parse input string into single terms
        string[] termList1 = parseWords(input).ToArray();

        List <suggestItem> suggestionParts = new List <suggestItem>(); //1 line with separate parts

        //translate every term to its best suggestion, otherwise it remains unchanged
        bool lastCombi = false;

        for (int i = 0; i < termList1.Length; i++)
        {
            string term = termList1[i];

            //suggestions for the single term
            List <suggestItem> suggestions = Lookup(term, language, editDistanceMax);

            //combi check, always before split
            //(not attempted directly after a merge, so a merged part is never merged again)
            if ((i > 0) && !lastCombi)
            {
                List <suggestItem> suggestionsCombi = Lookup(termList1[i - 1] + term, language, editDistanceMax);

                if (suggestionsCombi.Count > 0)
                {
                    //best1: part chosen for the previous term; best2: best suggestion for this term (or the term itself)
                    suggestItem best1 = suggestionParts[suggestionParts.Count - 1];
                    suggestItem best2 = new suggestItem();
                    if (suggestions.Count > 0)
                    {
                        best2 = suggestions[0];
                    }
                    else
                    {
                        best2.term     = term;
                        best2.distance = editDistanceMax + 1;
                        best2.count    = 0;
                    }

                    //the merge costs one extra edit (the removed space); accept it only if it is still cheaper than correcting both terms separately
                    if (suggestionsCombi[0].distance + 1 < DamerauLevenshteinDistance(termList1[i - 1] + " " + term, best1.term + " " + best2.term))
                    {
                        suggestionsCombi[0].distance++;
                        //the merged word replaces the part chosen for the previous term
                        suggestionParts[suggestionParts.Count - 1] = suggestionsCombi[0];
                        lastCombi = true;
                        continue;
                    }
                }
            }
            lastCombi = false;

            //always split terms without suggestion / never split terms with suggestion ed=0 / never split single char terms
            if ((suggestions.Count > 0) && ((suggestions[0].distance == 0) || (term.Length == 1)))
            {
                //choose best suggestion
                suggestionParts.Add(suggestions[0]);
                continue;
            }

            //if no perfect suggestion, split word into pairs
            suggestItem bestPart = null;
            if (term.Length > 1)
            {
                List <suggestItem> suggestionsSplit = new List <suggestItem>();

                //the unsplit best suggestion competes with the splits
                if (suggestions.Count > 0)
                {
                    suggestionsSplit.Add(suggestions[0]);
                }

                for (int j = 1; j < term.Length; j++)
                {
                    string part1 = term.Substring(0, j);
                    string part2 = term.Substring(j);

                    List <suggestItem> suggestions1 = Lookup(part1, language, editDistanceMax);
                    if (suggestions1.Count == 0)
                    {
                        continue;
                    }
                    //stop splitting if the correction of the first part equals the single-word correction
                    if ((suggestions.Count > 0) && (suggestions[0].term == suggestions1[0].term))
                    {
                        break;
                    }

                    List <suggestItem> suggestions2 = Lookup(part2, language, editDistanceMax);
                    if (suggestions2.Count == 0)
                    {
                        continue;
                    }
                    //stop splitting if the correction of the second part equals the single-word correction
                    if ((suggestions.Count > 0) && (suggestions[0].term == suggestions2[0].term))
                    {
                        break;
                    }

                    //select best suggestion for split pair
                    suggestItem suggestionSplit = new suggestItem();
                    suggestionSplit.term     = suggestions1[0].term + " " + suggestions2[0].term;
                    suggestionSplit.distance = DamerauLevenshteinDistance(term, suggestionSplit.term);
                    suggestionSplit.count    = Math.Min(suggestions1[0].count, suggestions2[0].count);
                    suggestionsSplit.Add(suggestionSplit);

                    //early termination of split
                    if (suggestionSplit.distance == 1)
                    {
                        break;
                    }
                }

                if (suggestionsSplit.Count > 0)
                {
                    //best = smallest distance, ties broken by highest count
                    suggestionsSplit.Sort((x, y) =>
                    {
                        int byDistance = x.distance.CompareTo(y.distance);
                        return((byDistance != 0) ? byDistance : y.count.CompareTo(x.count));
                    });
                    bestPart = suggestionsSplit[0];
                }
            }

            if (bestPart == null)
            {
                //nothing found: keep the term unchanged, marked with a distance beyond the maximum
                bestPart          = new suggestItem();
                bestPart.term     = term;
                bestPart.count    = 0;
                bestPart.distance = editDistanceMax + 1;
            }
            suggestionParts.Add(bestPart);
        }

        suggestItem suggestion = new suggestItem();

        //count of the line = smallest count of its parts
        //(stays Int64.MaxValue when the input contains no terms at all)
        suggestion.count = Int64.MaxValue;
        string s = "";
        foreach (suggestItem part in suggestionParts)
        {
            s += part.term + " ";
            suggestion.count = Math.Min(suggestion.count, part.count);
        }

        suggestion.term     = s.TrimEnd();
        suggestion.distance = DamerauLevenshteinDistance(suggestion.term, input);

        List <suggestItem> suggestionsLine = new List <suggestItem>();

        suggestionsLine.Add(suggestion);
        return(suggestionsLine);
    }