示例#1
0
        /// <summary>
        /// Checks the scores returned by an <c>NGramDistance(1)</c> instance for a set of
        /// word pairs, plus a few relative comparisons between pairs.
        /// </summary>
        public void TestGetDistance1()
        {
            StringDistance nsd = new NGramDistance(1);
            float          d   = nsd.GetDistance("al", "al");

            // Assert.AreEqual takes (expected, actual, delta); passing the expected
            // constant first makes a failure message label the two numbers correctly.
            Assert.AreEqual(1.0f, d, 0.001);
            d = nsd.GetDistance("a", "a");
            Assert.AreEqual(1.0f, d, 0.001);
            d = nsd.GetDistance("b", "a");
            Assert.AreEqual(0.0f, d, 0.001);
            d = nsd.GetDistance("martha", "marhta");
            Assert.AreEqual(0.6666, d, 0.001);
            d = nsd.GetDistance("jones", "johnson");
            Assert.AreEqual(0.4285, d, 0.001);
            d = nsd.GetDistance("natural", "contrary");
            Assert.AreEqual(0.25, d, 0.001);
            d = nsd.GetDistance("abcvwxyz", "cabvwxyz");
            Assert.AreEqual(0.75, d, 0.001);
            d = nsd.GetDistance("dwayne", "duane");
            Assert.AreEqual(0.666, d, 0.001);
            d = nsd.GetDistance("dixon", "dicksonx");
            Assert.AreEqual(0.5, d, 0.001);
            d = nsd.GetDistance("six", "ten");
            Assert.AreEqual(0, d, 0.001);

            // Relative checks: these compare two computed scores with each other.
            float d1 = nsd.GetDistance("zac ephron", "zac efron");
            float d2 = nsd.GetDistance("zac ephron", "kai ephron");

            Assert.AreEqual(d1, d2, 0.001);
            d1 = nsd.GetDistance("brittney spears", "britney spears");
            d2 = nsd.GetDistance("brittney spears", "brittney startzman");
            Assert.IsTrue(d1 > d2);
            d1 = nsd.GetDistance("12345678", "12890678");
            d2 = nsd.GetDistance("12345678", "72385698");
            // Same 0.001 tolerance as every other assertion here. A tolerance of 1
            // would accept any two scores that are less than 1 apart.
            Assert.AreEqual(d1, d2, 0.001);
        }
示例#2
0
        /// <summary>
        /// Checks the scores returned by an <c>NGramDistance(3)</c> instance for a set of
        /// word pairs, plus a few relative comparisons between pairs.
        /// </summary>
        public void TestGetDistance3()
        {
            StringDistance sd = new NGramDistance(3);
            float          d  = sd.GetDistance("al", "al");

            // Assert.AreEqual takes (expected, actual, delta); passing the expected
            // constant first makes a failure message label the two numbers correctly.
            Assert.AreEqual(1.0f, d, 0.001);
            d = sd.GetDistance("a", "a");
            Assert.AreEqual(1.0f, d, 0.001);
            d = sd.GetDistance("b", "a");
            Assert.AreEqual(0.0f, d, 0.001);
            d = sd.GetDistance("martha", "marhta");
            Assert.AreEqual(0.7222, d, 0.001);
            d = sd.GetDistance("jones", "johnson");
            Assert.AreEqual(0.4762, d, 0.001);
            d = sd.GetDistance("natural", "contrary");
            Assert.AreEqual(0.2083, d, 0.001);
            d = sd.GetDistance("abcvwxyz", "cabvwxyz");
            Assert.AreEqual(0.5625, d, 0.001);
            d = sd.GetDistance("dwayne", "duane");
            Assert.AreEqual(0.5277, d, 0.001);
            d = sd.GetDistance("dixon", "dicksonx");
            Assert.AreEqual(0.4583, d, 0.001);
            d = sd.GetDistance("six", "ten");
            Assert.AreEqual(0, d, 0.001);

            // Relative checks: only the ordering of the two scores is asserted.
            float d1 = sd.GetDistance("zac ephron", "zac efron");
            float d2 = sd.GetDistance("zac ephron", "kai ephron");

            Assert.IsTrue(d1 > d2);
            d1 = sd.GetDistance("brittney spears", "britney spears");
            d2 = sd.GetDistance("brittney spears", "brittney startzman");
            Assert.IsTrue(d1 > d2);
            d1 = sd.GetDistance("0012345678", "0012890678");
            d2 = sd.GetDistance("0012345678", "0072385698");
            Assert.IsTrue(d1 < d2);
        }
示例#3
0
        /// <summary>
        /// Checks that comparing an empty string with a non-empty one yields a score of 0.
        /// </summary>
        public void TestEmpty()
        {
            StringDistance nsd = new NGramDistance(1);
            float          d   = nsd.GetDistance("", "al");

            // (expected, actual, delta) order so a failure reports 0 as the expected value.
            Assert.AreEqual(0.0f, d, 0.001);
        }
        /// <summary>
        /// Returns <paramref name="OriginalWord"/> unchanged when the index reports a non-zero
        /// document frequency for it in the "word" field; otherwise returns the highest-ranked
        /// suggestion from the spell checker.
        /// </summary>
        /// <param name="OriginalWord">The word to look up.</param>
        /// <returns>
        /// The original word, the best-scoring suggestion, or the default value
        /// (<c>null</c>) when the spell checker produced no suggestions.
        /// </returns>
        public string GetBestMatchWord(string OriginalWord)
        {
            EnsureIndexed();

            if (indexReader.DocFreq(new Term("word", OriginalWord)) > 0)
            {
                return OriginalWord;
            }

            var candidates  = _luceneChecker.SuggestSimilar(OriginalWord, 10, null, "word", true);
            var jaroWinkler = new JaroWinklerDistance();
            var levenshtein = new LevenshteinDistance();
            var nGram       = new NGramDistance();

            // Rank by the mean of four terms: document frequency scaled by 1/100
            // and the three string-distance scores. Highest mean first.
            var ranked =
                from candidate in candidates
                let freq  = indexReader.DocFreq(new Term("word", candidate))
                let jaro  = jaroWinkler.GetDistance(OriginalWord, candidate)
                let leven = levenshtein.GetDistance(OriginalWord, candidate)
                let ngram = nGram.GetDistance(OriginalWord, candidate)
                orderby ((freq / 100f) + jaro + leven + ngram) / 4f descending
                select candidate;

            return ranked.FirstOrDefault();
        }
        /// <summary>
        /// Builds an <see cref="AlternateWordList"/> for <paramref name="OriginalWord"/>: its own
        /// document frequency plus the spell checker's suggestions, each annotated with its
        /// frequency, three string-distance scores, a combined score, and a 1-based rank.
        /// </summary>
        /// <param name="OriginalWord">The word to find alternates for.</param>
        /// <param name="NumberToReturn">Number of suggestions requested from the spell checker.</param>
        /// <returns>The populated word list, ordered by combined score, best first.</returns>
        public AlternateWordList GetAlternateWordList(string OriginalWord, int NumberToReturn)
        {
            var result = new AlternateWordList();
            result.OriginalWord = OriginalWord;

            EnsureIndexed();
            result.OriginalWordFrequency = indexReader.DocFreq(new Term("word", OriginalWord));

            var candidates  = _luceneChecker.SuggestSimilar(OriginalWord, NumberToReturn, null, "word", true);
            var jaroWinkler = new JaroWinklerDistance();
            var levenshtein = new LevenshteinDistance();
            var nGram       = new NGramDistance();

            // Combined score = mean of (frequency / 100) and the three distance scores.
            var scored =
                (from candidate in candidates
                 select new
                 {
                     word  = candidate,
                     freq  = indexReader.DocFreq(new Term("word", candidate)),
                     jaro  = jaroWinkler.GetDistance(OriginalWord, candidate),
                     leven = levenshtein.GetDistance(OriginalWord, candidate),
                     ngram = nGram.GetDistance(OriginalWord, candidate)
                 }
                 into entry
                 orderby ((entry.freq / 100f) + entry.jaro + entry.leven + entry.ngram) / 4f descending
                 select entry)
                .ToList();

            var alternates = new List <AlternateWord>();

            for (int position = 0; position < scored.Count; position++)
            {
                var entry = scored[position];

                alternates.Add(new AlternateWord
                {
                    Word               = entry.word,
                    Frequency          = entry.freq,
                    JaroWinkler        = entry.jaro,
                    Levenshtein        = entry.leven,
                    NGram              = entry.ngram,
                    BestMatchScore     = ((entry.freq / 100f) + entry.jaro + entry.leven + entry.ngram) / 4f,
                    BestMatchSortOrder = position + 1 // ranks start at 1
                });
            }

            result.Words = alternates;
            return result;
        }
示例#6
0
        /// <summary>
        /// Scratch routine: computes two <c>NGramDistance</c> scores for "acht" against
        /// "yacht" and "acha". Nothing is asserted or returned; the values are only
        /// available for inspection in a debugger.
        /// </summary>
        public static void TestQGram()
        {
            const string reference   = "acht";
            const string withPrefix  = "yacht";
            const string changedTail = "acha";

            var distance = new NGramDistance();

            double prefixScore = distance.GetDistance(reference, withPrefix);
            double tailScore   = distance.GetDistance(reference, changedTail);

            // Kept only as a convenient value to inspect.
            double inspected = prefixScore;
        }
        /// <summary>
        /// Scans one bucket of <c>_IndexDictionary</c> for the entries that score highest against
        /// <paramref name="word"/> (lower-cased) using <c>JaccardDistance.GetDistance</c>, where a
        /// larger score is treated as a better match.
        /// </summary>
        /// <param name="word">Word to match; it is lower-cased before comparison.</param>
        /// <param name="maxSimilarity">Best score found so far; only entries scoring at least this are collected.</param>
        /// <param name="index">
        /// Offset that selects the bucket: the bucket index is
        /// <c>word.Length + index - _minWordLength</c>.
        /// </param>
        /// <param name="equalMinDistanceDictWordList">
        /// Receives the best-scoring entries. It is cleared whenever a strictly better score is
        /// found, and appended to when an entry ties the current best.
        /// </param>
        /// <returns>
        /// The updated best score, or <paramref name="maxSimilarity"/> unchanged when the bucket
        /// index is out of range or the bucket is null.
        /// </returns>
        public static double findSimilarDictionaryWord(string word, double maxSimilarity, int index, List <string> equalMinDistanceDictWordList)
        {
            index = index - _minWordLength;
            word  = word.ToLower();

            int slot = word.Length + index;

            if (slot < 0 || slot >= _IndexDictionary.Length)
            {
                return(maxSimilarity);
            }

            // Look the bucket up once instead of re-indexing on every access.
            var bucket = _IndexDictionary[slot];

            if (bucket == null)
            {
                return(maxSimilarity);
            }

            // One scorer for the whole scan. The Jaro-Winkler and n-gram objects that
            // used to be allocated per candidate were never used.
            JaccardDistance jd = new JaccardDistance();

            for (int j = 0; j < bucket.Count; j++)
            {
                var    candidate  = bucket[j];
                double similarity = jd.GetDistance(word, candidate);

                if (similarity > maxSimilarity)
                {
                    // Strictly better: everything collected so far is obsolete.
                    equalMinDistanceDictWordList.Clear();
                    equalMinDistanceDictWordList.Add(candidate);
                    maxSimilarity = similarity;
                }
                else if (similarity == maxSimilarity)
                {
                    // Exact tie with the current best: keep both.
                    equalMinDistanceDictWordList.Add(candidate);
                }
            }
            return(maxSimilarity);
        }
        /// <summary>
        /// Returns spelling suggestions for <paramref name="value"/>. When the index reports a
        /// non-zero document frequency for the value itself, it is placed first, followed by the
        /// spell checker's suggestions in ranked order.
        /// </summary>
        /// <param name="value">The term to find suggestions for.</param>
        /// <param name="numberOfItems">Number of suggestions requested from the spell checker.</param>
        /// <returns>The value itself (if indexed) followed by the ranked suggestions.</returns>
        public List <string> GetTopSuggestions(string value, int numberOfItems)
        {
            EnsureIndexed();

            var results = new List <string>();

            // If the value is already in the index, it goes first.
            if (_indexReader.DocFreq(new Term(SpellCheckerConstants.SpellCheckerKey, value)) > 0)
            {
                results.Add(value);
            }

            var candidates  = _checker.SuggestSimilar(value, numberOfItems, null, SpellCheckerConstants.SpellCheckerKey, true);
            var jaroWinkler = new JaroWinklerDistance();
            var levenshtein = new LevenshteinDistance();
            var nGram       = new NGramDistance();

            // Sort keys, all descending: Jaro-Winkler score, then n-gram score, then
            // the mean of (frequency / 100) and the Levenshtein score.
            var ranked =
                from candidate in candidates
                let freq  = _indexReader.DocFreq(new Term(SpellCheckerConstants.SpellCheckerKey, candidate))
                let jaro  = jaroWinkler.GetDistance(value, candidate)
                let leven = levenshtein.GetDistance(value, candidate)
                let ngram = nGram.GetDistance(value, candidate)
                orderby jaro descending,
                        ngram descending,
                        (freq / 100f + leven) / 2f descending
                select candidate;

            results.AddRange(ranked);

            return results;
        }
示例#9
0
        /// <summary>
        /// Writes, for each fuzzy-comparison algorithm in turn, the score of
        /// <paramref name="input"/> against every entry in <paramref name="testCases"/> to the
        /// debug output. Nothing is asserted; the output is for manual inspection.
        /// </summary>
        /// <param name="input">The string every test case is compared against.</param>
        /// <param name="testCases">The strings to compare with <paramref name="input"/>.</param>
        private void StringCompareTest(string input, string[] testCases)
        {
            // Section headers are formatted with string.Format on purpose:
            // Debug.WriteLine(string, string) is the (message, category) overload, so passing
            // a format string plus one string argument would print "<input>: ... {0}:"
            // with the placeholder left unformatted.
            Debug.WriteLine(string.Format("Dice Coefficient for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer dice = new DiceCoefficent();
                double diceValue         = dice.Compare(input, name);
                Debug.WriteLine("\t{0} against {1}", diceValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Jaccard Coefficient for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer jaccard = new Jaccard();
                double jaccardValue         = jaccard.Compare(input, name);
                Debug.WriteLine("\t{0} against {1}", jaccardValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("ExtendedJaccard Coefficient for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer exjaccard = new ExtendedJaccard();
                double exjaccardValue         = exjaccard.Compare(input, name);
                Debug.WriteLine("\t{0} against {1}", exjaccardValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("DamerauLevenshteinDistance for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer lev = new DamerauLevenshteinDistance();
                var levenStein          = lev.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", levenStein, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("JaroWinkler for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer jw = new JaroWinkler();
                var jwValue            = jw.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", jwValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Monge-Elkan for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer me = new MongeElkan();
                var meValue            = me.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", meValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("NGramDistance(2) for {0}:", input));
            foreach (var name in testCases)
            {
                // Configure the n-gram length while the concrete type is still known,
                // which avoids an unchecked "as" cast on the base-typed variable.
                StringFuzzyComparer ngram2 = new NGramDistance { NGramLength = 2 };
                var ngramValue2            = ngram2.Compare(input, name);

                Debug.WriteLine("\t{0}, against {1}", ngramValue2, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("SmithWaterman for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer sw = new SmithWaterman();
                var swValue            = sw.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", swValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Extended Editex for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer edx = new ExtendedEditex();
                var edxValue            = edx.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", edxValue, name);
            }

            Debug.WriteLine("");
            Debug.WriteLine(string.Format("Longest Common Subsequence for {0}:", input));
            foreach (var name in testCases)
            {
                StringFuzzyComparer lcs = new LongestCommonSubsequence();
                var lcsValue            = lcs.Compare(input, name);
                Debug.WriteLine("\t{0}, against {1}", lcsValue.ToString("###,###.00000"), name);
            }

            Debug.WriteLine("");
        }
        /// <summary>
        /// Scans one bucket of <c>_IndexDictionary</c> and collects the entries whose
        /// <c>JaccardDistance.GetDistance</c> score against <paramref name="word"/> is the best
        /// seen so far, or within a fixed band (0.3) of it. Larger scores are treated as better.
        /// </summary>
        /// <param name="word">Word to match (compared as given; no case folding is applied here).</param>
        /// <param name="maxSimilarity">Best score found so far.</param>
        /// <param name="index">
        /// Offset that selects the bucket: the bucket index is
        /// <c>word.Length - 1 + index - _minWordLength</c>.
        /// </param>
        /// <param name="equalMinDistanceDictWordList">
        /// Receives candidate words mapped to their scores. When a new best score is found,
        /// entries scoring at or below (new best - 0.3) are removed.
        /// </param>
        /// <returns>
        /// The updated best score; <paramref name="maxSimilarity"/> unchanged when the bucket
        /// index is out of range; 0 when the selected bucket is null.
        /// </returns>
        public static double findSimilarDictionaryWord(string word, double maxSimilarity, int index, Dictionary <string, double> equalMinDistanceDictWordList)
        {
            // No try/catch here: the previous "catch (Exception e) { throw e; }" only
            // reset the stack trace of whatever was thrown.
            const double distancethreshold = 0.3;

            index = index - _minWordLength;
            int WordLenght = word.Length;
            int slot       = WordLenght - 1 + index;

            // Guard the index that is actually used. Checking only (WordLenght + index) < 0
            // let slot == -1 through and crashed with an out-of-range access.
            if (slot < 0)
            {
                return(maxSimilarity);
            }

            // NOTE(review): this upper bound is one stricter than the slot needs, so the last
            // bucket is never scanned. Kept as-is to preserve existing results; confirm intent.
            if ((WordLenght + index) >= _IndexDictionary.Length)
            {
                return(maxSimilarity);
            }

            var bucket = _IndexDictionary[slot];

            if (bucket == null)
            {
                // NOTE(review): returns 0 rather than maxSimilarity, unlike the out-of-range
                // exits above. Kept for compatibility; confirm with callers.
                return(0);
            }

            // One scorer for the whole scan. The Jaro-Winkler and n-gram objects that
            // used to be allocated per candidate were never used.
            JaccardDistance jd = new JaccardDistance();

            for (int j = 0; j < bucket.Count; j++)
            {
                string temp        = bucket[j];
                double NewDistance = jd.GetDistance(word, temp);

                // Floor of -1 retained from the original code, where it was the
                // placeholder value of a second metric.
                if (NewDistance < -1)
                {
                    NewDistance = -1;
                }

                if (NewDistance > maxSimilarity)
                {
                    // New best: evict entries that fell out of the band below it.
                    // ToList() snapshots the dictionary so entries can be removed while iterating.
                    foreach (var item in equalMinDistanceDictWordList.ToList())
                    {
                        if (item.Value <= NewDistance - distancethreshold)
                        {
                            equalMinDistanceDictWordList.Remove(item.Key);
                        }
                    }

                    tempReplacement = temp;
                    if (!equalMinDistanceDictWordList.ContainsKey(temp))
                    {
                        equalMinDistanceDictWordList.Add(temp, NewDistance);
                    }
                    maxSimilarity = NewDistance;
                }
                else if (NewDistance <= maxSimilarity + distancethreshold && NewDistance >= maxSimilarity - distancethreshold && NewDistance > 0)
                {
                    // Not a new best, but positive and within the band: keep as an alternative.
                    if (!equalMinDistanceDictWordList.ContainsKey(temp))
                    {
                        equalMinDistanceDictWordList.Add(temp, NewDistance);
                    }
                }
            }
            return(maxSimilarity);
        }
        /// <summary>
        /// Scans one bucket of <c>_IndexDictionary</c> for entries matching
        /// <paramref name="word"/> (lower-cased and trimmed). Each entry is tested in this order:
        /// equal to the word, contains the word, then a <c>JaccardDistance.GetDistance</c>
        /// comparison on the entry or on substrings of it.
        /// </summary>
        /// <param name="word">Word to match. Its length is measured before trimming.</param>
        /// <param name="maxSimilarity">Score to return when nothing better is found.</param>
        /// <param name="index">
        /// Offset that selects the bucket: the bucket index is
        /// <c>word.Length + index - _minWordLength</c>.
        /// </param>
        /// <param name="equalMinDistanceDictWordList">Receives matching entries.</param>
        /// <param name="exact">
        /// When true, the fuzzy branch compares the whole entry once and returns immediately,
        /// recording a match only when the score is exactly 1. When false, substrings of the
        /// entry are compared and a score above 0.33 is accepted.
        /// </param>
        /// <returns>
        /// 10 when an entry equals the word (and the word had no surrounding whitespace);
        /// otherwise the last accepted score, or <paramref name="maxSimilarity"/> unchanged.
        /// </returns>
        public static double findSimilarDictionaryWord(string word, double maxSimilarity, int index, List <string> equalMinDistanceDictWordList, bool exact)
        {
            index = index - _minWordLength;
            int WordLength = word.Length; // measured before ToLower()/Trim() below
            int index2     = index;

            // index2 bounds the substring offsets tried in the fuzzy branch. It is reset to 0
            // for negative offsets and for words with an upper-case first letter followed by
            // a non-upper-case one.
            if (index < 0 || (WordLength >= 2 && char.IsUpper(word[0]) && !char.IsUpper(word[1])))
            {
                index2 = 0;
            }
            word = word.ToLower();

            // noSpace records whether the word had no leading/trailing whitespace.
            // Only such words may take the equality shortcut below.
            string trimmed = word.Trim();
            bool   noSpace = word.CompareTo(trimmed) == 0;

            if (!noSpace)
            {
                word = trimmed;
            }

            int slot = WordLength + index;

            if (slot < 0 || slot >= _IndexDictionary.Length)
            {
                return(maxSimilarity);
            }

            // Look the bucket up once instead of re-indexing on every access.
            var bucket = _IndexDictionary[slot];

            if (bucket == null)
            {
                return(maxSimilarity);
            }

            // One scorer for the whole scan. The Jaro-Winkler and n-gram objects that
            // used to be allocated per candidate were never used.
            JaccardDistance jd = new JaccardDistance();

            for (int j = 0; j < bucket.Count; j++)
            {
                string temp = bucket[j];

                if (noSpace && temp.CompareTo(word) == 0)
                {
                    // Entry equals the word: report it alone with the sentinel score 10.
                    equalMinDistanceDictWordList.Clear();
                    equalMinDistanceDictWordList.Add(temp);
                    return(10);
                }
                else if (temp.Contains(word))
                {
                    equalMinDistanceDictWordList.Add(temp);
                    maxSimilarity = 1;
                }
                else if (index <= 2)
                {
                    for (int i = 0; i <= index2; i++)
                    {
                        string s  = temp.Substring(i);                           // entry without its first i characters
                        string s2 = temp.Substring(0, temp.Length - index2);     // entry without its last index2 characters
                        double NewSimilarity;

                        if (!exact)
                        {
                            NewSimilarity = Math.Max(jd.GetDistance(word, s), jd.GetDistance(word, s2));
                        }
                        else
                        {
                            // Exact mode evaluates the whole entry once and always returns here,
                            // so no entry after this one is examined.
                            NewSimilarity = jd.GetDistance(word, temp);
                            if (NewSimilarity == 1)
                            {
                                equalMinDistanceDictWordList.Clear();
                                equalMinDistanceDictWordList.Add(s);
                                maxSimilarity = NewSimilarity;
                            }
                            return(maxSimilarity);
                        }

                        if (NewSimilarity > .33)
                        {
                            // Accepted. Note that maxSimilarity is overwritten even when the
                            // new score is lower than the previous one.
                            equalMinDistanceDictWordList.Add(temp);
                            maxSimilarity = NewSimilarity;
                            break;
                        }
                    }
                }
            }
            return(maxSimilarity);
        }