예제 #1
0
        /// <summary>
        /// Runs a default-constructed <see cref="NGramDistance"/> over two word pairs
        /// ("acht"/"yacht" and "acht"/"acha"). Nothing is asserted; the scores are only
        /// held in locals.
        /// </summary>
        public static void TestQGram()
        {
            var baseWord        = "acht";
            var prefixedWord    = "yacht";
            var lastLetterSwap  = "acha";

            var distance = new NGramDistance();

            double prefixedScore = distance.GetDistance(baseWord, prefixedWord);
            double swappedScore  = distance.GetDistance(baseWord, lastLetterSwap);

            // Unused copy of the first score; presumably a convenient spot for a
            // debugger breakpoint -- TODO confirm whether it can be dropped.
            double inspected = prefixedScore;
        }
예제 #2
0
        /// <summary>
        /// Checks that an <see cref="NGramDistance"/> built with n = 1 scores the
        /// empty string against "al" as 0 (within a 0.001 tolerance).
        /// </summary>
        public void TestEmpty()
        {
            StringDistance nsd = new NGramDistance(1);
            float          d   = nsd.GetDistance("", "al");

            // Expected value goes first, actual second, so that a failing run
            // reports the two values under the correct labels.
            Assert.AreEqual(0.0f, d, 0.001);
        }
        /// <summary>
        /// Picks a single replacement for <paramref name="OriginalWord"/>.
        /// </summary>
        /// <param name="OriginalWord">The word to look up in the "word" field.</param>
        /// <returns>
        /// <paramref name="OriginalWord"/> itself when its document frequency is greater
        /// than zero; otherwise the highest-ranked of up to 10 spell-checker suggestions,
        /// or <c>null</c> when the checker returns none.
        /// </returns>
        public string GetBestMatchWord(string OriginalWord)
        {
            EnsureIndexed();

            // A word that already occurs in the index is returned unchanged.
            if (indexReader.DocFreq(new Term("word", OriginalWord)) > 0)
            {
                return OriginalWord;
            }

            var candidates  = _luceneChecker.SuggestSimilar(OriginalWord, 10, null, "word", true);
            var jaroWinkler = new JaroWinklerDistance();
            var levenshtein = new LevenshteinDistance();
            var nGram       = new NGramDistance();

            // Rank by the mean of (frequency / 100) and the three distance scores,
            // highest first.
            var ranked =
                from candidate in candidates
                let frequency  = indexReader.DocFreq(new Term("word", candidate))
                let jaroScore  = jaroWinkler.GetDistance(OriginalWord, candidate)
                let levenScore = levenshtein.GetDistance(OriginalWord, candidate)
                let nGramScore = nGram.GetDistance(OriginalWord, candidate)
                orderby ((frequency / 100f) + jaroScore + levenScore + nGramScore) / 4f descending
                select candidate;

            return ranked.FirstOrDefault();
        }
        /// <summary>
        /// Builds the ranked list of alternatives for <paramref name="OriginalWord"/>,
        /// together with the individual metrics behind each ranking.
        /// </summary>
        /// <param name="OriginalWord">The word to find alternatives for.</param>
        /// <param name="NumberToReturn">Number of suggestions requested from the spell checker.</param>
        /// <returns>
        /// An <see cref="AlternateWordList"/> carrying the original word, its document
        /// frequency in the "word" field, and the suggestions ordered by descending
        /// best-match score with a 1-based sort order.
        /// </returns>
        public AlternateWordList GetAlternateWordList(string OriginalWord, int NumberToReturn)
        {
            var result = new AlternateWordList { OriginalWord = OriginalWord };

            EnsureIndexed();
            result.OriginalWordFrequency = indexReader.DocFreq(new Term("word", OriginalWord));

            var candidates  = _luceneChecker.SuggestSimilar(OriginalWord, NumberToReturn, null, "word", true);
            var jaroWinkler = new JaroWinklerDistance();
            var levenshtein = new LevenshteinDistance();
            var nGram       = new NGramDistance();

            // Score = mean of (frequency / 100) and the three distance scores;
            // the best-scoring candidate comes first.
            var ranked = candidates
                         .Select(candidate => new
                         {
                             Word        = candidate,
                             Frequency   = indexReader.DocFreq(new Term("word", candidate)),
                             JaroWinkler = jaroWinkler.GetDistance(OriginalWord, candidate),
                             Levenshtein = levenshtein.GetDistance(OriginalWord, candidate),
                             NGram       = nGram.GetDistance(OriginalWord, candidate)
                         })
                         .OrderByDescending(c => ((c.Frequency / 100f) + c.JaroWinkler + c.Levenshtein + c.NGram) / 4f)
                         .ToList();

            var alternates = new List<AlternateWord>();

            for (var position = 0; position < ranked.Count; position++)
            {
                var entry = ranked[position];

                alternates.Add(new AlternateWord
                {
                    Word               = entry.Word,
                    Frequency          = entry.Frequency,
                    JaroWinkler        = entry.JaroWinkler,
                    Levenshtein        = entry.Levenshtein,
                    NGram              = entry.NGram,
                    BestMatchScore     = ((entry.Frequency / 100f) + entry.JaroWinkler + entry.Levenshtein + entry.NGram) / 4f,
                    BestMatchSortOrder = position + 1 // sort order is 1-based
                });
            }

            result.Words = alternates;
            return result;
        }
        /// <summary>
        /// Returns spelling suggestions for <paramref name="value"/>.
        /// </summary>
        /// <param name="value">The word to find suggestions for.</param>
        /// <param name="numberOfItems">Number of suggestions requested from the spell checker.</param>
        /// <returns>
        /// A list that starts with <paramref name="value"/> itself when its document
        /// frequency is greater than zero, followed by the checker's suggestions ordered
        /// by Jaro-Winkler score, then n-gram score, then the mean of (frequency / 100)
        /// and the Levenshtein score -- all descending.
        /// </returns>
        public List <string> GetTopSuggestions(string value, int numberOfItems)
        {
            EnsureIndexed();

            var results = new List<string>();

            // When the word itself is in the index it goes first in the result.
            if (_indexReader.DocFreq(new Term(SpellCheckerConstants.SpellCheckerKey, value)) > 0)
            {
                results.Add(value);
            }

            var candidates  = _checker.SuggestSimilar(value, numberOfItems, null, SpellCheckerConstants.SpellCheckerKey, true);
            var jaroWinkler = new JaroWinklerDistance();
            var levenshtein = new LevenshteinDistance();
            var nGram       = new NGramDistance();

            var ranked =
                from candidate in candidates
                let frequency  = _indexReader.DocFreq(new Term(SpellCheckerConstants.SpellCheckerKey, candidate))
                let jaroScore  = jaroWinkler.GetDistance(value, candidate)
                let levenScore = levenshtein.GetDistance(value, candidate)
                let nGramScore = nGram.GetDistance(value, candidate)
                orderby jaroScore descending,
                        nGramScore descending,
                        (frequency / 100f + levenScore) / 2f descending
                select candidate;

            results.AddRange(ranked.ToList());

            return results;
        }
예제 #6
0
        /// <summary>
        /// Exercises an <see cref="NGramDistance"/> built with n = 2 against a table of
        /// word pairs with known scores, then checks a few relative orderings.
        /// </summary>
        public void TestGetDistance2()
        {
            StringDistance sd = new NGramDistance(2);

            // All assertions use (expected, actual, delta) order so that a failing
            // run reports the two values under the correct labels.
            float d = sd.GetDistance("al", "al");
            Assert.AreEqual(1.0f, d, 0.001);
            d = sd.GetDistance("a", "a");
            Assert.AreEqual(1.0f, d, 0.001);
            d = sd.GetDistance("b", "a");
            Assert.AreEqual(0.0f, d, 0.001);
            d = sd.GetDistance("a", "aa");
            Assert.AreEqual(0.5f, d, 0.001);
            d = sd.GetDistance("martha", "marhta");
            Assert.AreEqual(0.6666, d, 0.001);
            d = sd.GetDistance("jones", "johnson");
            Assert.AreEqual(0.4285, d, 0.001);
            d = sd.GetDistance("natural", "contrary");
            Assert.AreEqual(0.25, d, 0.001);
            d = sd.GetDistance("abcvwxyz", "cabvwxyz");
            Assert.AreEqual(0.625, d, 0.001);
            d = sd.GetDistance("dwayne", "duane");
            Assert.AreEqual(0.5833, d, 0.001);
            d = sd.GetDistance("dixon", "dicksonx");
            Assert.AreEqual(0.5, d, 0.001);
            d = sd.GetDistance("six", "ten");
            Assert.AreEqual(0, d, 0.001);

            // Relative checks: the first pair must score strictly higher than the second.
            float d1 = sd.GetDistance("zac ephron", "zac efron");
            float d2 = sd.GetDistance("zac ephron", "kai ephron");

            Assert.IsTrue(d1 > d2);
            d1 = sd.GetDistance("brittney spears", "britney spears");
            d2 = sd.GetDistance("brittney spears", "brittney startzman");
            Assert.IsTrue(d1 > d2);

            // Both values are computed, so neither is the "expected" one; the two
            // pairs are only required to score the same within tolerance.
            d1 = sd.GetDistance("0012345678", "0012890678");
            d2 = sd.GetDistance("0012345678", "0072385698");
            Assert.AreEqual(d1, d2, 0.001);
        }