Exemple #1
0
        /// <summary>
        /// Computes a probability score for <paramref name="sample"/> by summing the
        /// natural logarithms of the smoothed probabilities of each of its n-grams.
        /// </summary>
        /// <param name="sample">The token sequence to score.</param>
        /// <returns>
        /// <c>0</c> when <c>Count</c> is not positive or the log-sum is NaN;
        /// <c>Math.Exp(logSum)</c> when the absolute log-sum exceeds 0.000001;
        /// otherwise the log-sum itself.
        /// </returns>
        public double CalculateProbability(StringList sample)
        {
            if (Count <= 0)
            {
                return 0d;
            }

            var logSum = 0d;

            foreach (var gram in NGramUtils.GetNGrams(sample, n))
            {
                var history = NGramUtils.GetNMinusOneTokenFirst(gram);

                // Above one million entries stupid backoff is used; otherwise Laplace smoothing.
                var gramProbability = Count > 1000000
                    ? GetStupidBackoffProbability(gram, history)
                    : GetLaplaceSmoothingProbability(gram, history);

                logSum += Math.Log(gramProbability);
            }

            if (double.IsNaN(logSum))
            {
                return 0d;
            }

            // Only leave log space when the accumulated value is not (nearly) zero.
            return Math.Abs(logSum) > 0.000001
                ? Math.Exp(logSum)
                : logSum;
        }
        /// <summary>
        /// Checks that a four-token list yields three bigrams and two trigrams.
        /// </summary>
        public void TestGetNGrams()
        {
            var bigrams = NGramUtils.GetNGrams(new StringList("I", "saw", "brown", "fox"), 2);
            Assert.That(bigrams.Count, Is.EqualTo(3));

            var trigrams = NGramUtils.GetNGrams(new StringList("I", "saw", "brown", "fox"), 3);
            Assert.That(trigrams.Count, Is.EqualTo(2));
        }
        /// <summary>
        /// Checks the trigram linear interpolation probability of ("N", "V", "STOP")
        /// over two identical sentences with equal weights of 1/3.
        /// </summary>
        /// <remarks>
        /// Expected value: trigram estimate 2/2, bigram estimate 2/2 and unigram
        /// estimate 2/8, each weighted by 1/3, giving (1 + 1 + 0.25) / 3 = 0.75.
        /// The assertion pins that value with a tight tolerance rather than
        /// accepting a wide range around 1.
        /// </remarks>
        public void TestLinearInterpolation2()
        {
            var set = new List <StringList> {
                new StringList("D", "N", "V", "STOP"),
                new StringList("D", "N", "V", "STOP")
            };
            var lambda = 1d / 3d;
            var d      = NGramUtils.CalculateTrigramLinearInterpolationProbability("N", "V", "STOP", set, lambda, lambda,
                                                                                   lambda);

            Assert.That(d, Is.EqualTo(0.75d).Within(0.000000000001));
        }
        /// <summary>
        /// Checks the trigram linear interpolation probability of
        /// ("the", "green", "book") over a four-sentence corpus with equal
        /// weights of 1/3.
        /// </summary>
        public void TestLinearInterpolation()
        {
            var corpus = new List<StringList>();
            corpus.Add(new StringList("the", "green", "book", "STOP"));
            corpus.Add(new StringList("my", "blue", "book", "STOP"));
            corpus.Add(new StringList("his", "green", "house", "STOP"));
            corpus.Add(new StringList("book", "STOP"));

            var weight = 1d / 3d;

            var interpolated = NGramUtils.CalculateTrigramLinearInterpolationProbability(
                "the", "green", "book", corpus, weight, weight, weight);

            Assert.That(interpolated, Is.EqualTo(0.5714285714285714d).Within(0.000000000001));
        }
        /// <summary>
        /// Checks the maximum-likelihood n-gram probability of two trigrams
        /// against a small corpus with sentence boundary markers.
        /// </summary>
        public void TestNgramMLProbability()
        {
            var corpus = new List<StringList>();
            corpus.Add(new StringList("<s>", "I", "am", "Sam", "</s>"));
            corpus.Add(new StringList("<s>", "Sam", "I", "am", "</s>"));
            corpus.Add(new StringList("<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"));
            corpus.Add(new StringList(""));

            var iAmSam = NGramUtils.CalculateNgramMLProbability(new StringList("I", "am", "Sam"), corpus);
            Assert.That(iAmSam, Is.EqualTo(0.5d).Within(0.00001));

            var samIAm = NGramUtils.CalculateNgramMLProbability(new StringList("Sam", "I", "am"), corpus);
            Assert.That(samIAm, Is.EqualTo(1d).Within(0.00001));
        }
Exemple #6
0
        /// <summary>
        /// Recursively estimates the probability of <paramref name="ngram"/> using
        /// a backoff scheme with a fixed factor of 0.4.
        /// </summary>
        /// <param name="ngram">The n-gram to score.</param>
        /// <param name="nMinusOneToken">
        /// The shorter n-gram used as the denominator; when it is null or empty the
        /// count of <paramref name="ngram"/> is divided by <c>Count</c> instead.
        /// </param>
        /// <returns>
        /// The ratio of counts when <paramref name="ngram"/> has a positive count,
        /// otherwise 0.4 times the estimate for the n-gram returned by
        /// <c>NGramUtils.GetNMinusOneTokenLast</c>.
        /// </returns>
        private double GetStupidBackoffProbability(StringList ngram, StringList nMinusOneToken)
        {
            const double backoffFactor = 0.4d;

            var ngramCount = GetCount(ngram);
            var hasHistory = nMinusOneToken != null && nMinusOneToken.Count != 0;

            if (!hasHistory)
            {
                // No shorter context left: fall back to the count relative to Count.
                return (double)ngramCount / Count;
            }

            if (ngramCount > 0)
            {
                // Maximum likelihood probability.
                var historyCount = GetCount(nMinusOneToken);
                return ngramCount / (double)historyCount;
            }

            // Unseen n-gram: back off to the shorter n-gram and discount the result.
            var shorterNgram = NGramUtils.GetNMinusOneTokenLast(ngram);
            var shorterHistory = NGramUtils.GetNMinusOneTokenFirst(shorterNgram);

            return backoffFactor * GetStupidBackoffProbability(shorterNgram, shorterHistory);
        }
Exemple #7
0
        /// <summary>
        /// Computes a perplexity value for <paramref name="lm"/> over
        /// <paramref name="testSet"/>.
        /// </summary>
        /// <param name="lm">The language model used to score each n-gram.</param>
        /// <param name="testSet">The sentences to evaluate.</param>
        /// <param name="ngramSize">The size of the n-grams extracted from each sentence.</param>
        /// <returns>
        /// <c>Math.E</c> raised to the natural log of the product of inverse n-gram
        /// probabilities divided by the number of sentences, or
        /// <see cref="double.PositiveInfinity"/> when that logarithm is infinite or NaN.
        /// </returns>
        public static double GetPerplexity(ILanguageModel lm, IList <StringList> testSet, int ngramSize)
        {
            var product = new BigDecimal(1d);

            foreach (var sentence in testSet)
            {
                var grams = NGramUtils.GetNGrams(sentence, ngramSize);

                foreach (var gram in grams)
                {
                    var gramProbability = lm.CalculateProbability(gram);

                    // Accumulate 1 / p for every n-gram at DECIMAL128 precision.
                    var inverse = new BigDecimal(1d).divide(new BigDecimal(gramProbability), MathContext.DECIMAL128);
                    product = product.multiply(inverse);
                }
            }

            var logProduct = Math.Log(product.doubleValue());

            // Over/underflow -> too high perplexity.
            if (double.IsInfinity(logProduct) || double.IsNaN(logProduct))
            {
                return double.PositiveInfinity;
            }

            var sentenceCount = new BigDecimal(testSet.Count);
            var exponent = new BigDecimal(logProduct).divide(sentenceCount, MathContext.DECIMAL128).doubleValue();

            return Math.Pow(Math.E, exponent);
        }