Exemple #1
0
        /// <summary>
        /// Multiplies together the unigram MLE of every n-gram that the
        /// <c>extractor</c> field produces for <paramref name="tokens"/>.
        /// </summary>
        /// <param name="tokens">Tokens handed to the extractor.</param>
        /// <returns>
        /// The product of the per-n-gram estimates; 1 when the extractor yields no n-grams.
        /// </returns>
        public double GetSentenceProbabilityForUnigrams(IEnumerable<string> tokens)
        {
            double probability = 1;
            IList<NGram> unigrams = extractor.ExtractAsList(tokens);

            // Fold each estimate into the running product, in extraction order.
            for (int index = 0; index < unigrams.Count; index++)
            {
                probability *= GetUnigramMLE(unigrams[index]);
            }

            return probability;
        }
Exemple #2
0
        /// <summary>
        /// Scores a sentence by summing the base-10 logarithms of the MLE computed for
        /// each consecutive pair of n-grams extracted with sizes
        /// <c>maxNGramSize - 1</c> and <c>maxNGramSize</c>.
        /// </summary>
        /// <param name="tokens">
        /// Sentence tokens. They are copied with <c>ToList()</c> and the start/stop
        /// symbols are added to the copy.
        /// </param>
        /// <returns>
        /// The summed log10 estimates when <c>maxNGramSize</c> is not 1. When it is 1, the
        /// value of <c>GetSentenceProbabilityForUnigrams</c> is returned unchanged.
        /// NOTE(review): that unigram helper appears to return a plain product rather than
        /// a log10 sum, so the two branches may be on different scales - confirm with callers.
        /// </returns>
        public double GetSentenceProbability(IList<string> tokens)
        {
            if (maxNGramSize == 1)
            {
                return GetSentenceProbabilityForUnigrams(tokens);
            }

            // Operate on a copy so AddStartStopSymbols does not touch the list passed in.
            List<string> tokenList = tokens.ToList();
            AddStartStopSymbols(tokenList);

            var ext = new NGramExtractor(maxNGramSize - 1, maxNGramSize);
            IList<NGram> nGrams = ext.ExtractAsList(tokenList);

            // Drop the final extracted n-gram; the loop below consumes the list two at a time.
            // NOTE(review): presumably the trailing entry is a lower-order n-gram with no
            // higher-order partner - confirm against NGramExtractor.
            nGrams.RemoveAt(nGrams.Count - 1);

            double logP = 0;

            for (int i = 0; i < nGrams.Count; i += 2)
            {
                // Entries i and i + 1 form one pair passed to GetMLE.
                logP += Math.Log10(GetMLE(nGrams[i], nGrams[i + 1]));
            }

            return logP;
        }
        /// <summary>
        /// Runs <c>ExtractAsList</c> over <c>Tokens</c> for single-size and two-argument
        /// extractor configurations and compares each result, in order, with a
        /// hand-written expected sequence.
        /// </summary>
        public void TestExtractAsList()
        {
            // Unigram only.
            var unigramExtractor = new NGramExtractor(Unigram);
            IList<NGram> unigrams = unigramExtractor.ExtractAsList(Tokens);
            var expectedUnigrams = new[]
            {
                one, two, three, four, five,
                one, two, three, four
            };
            CollectionAssert.AreEqual(expectedUnigrams, unigrams);

            // Bigram only.
            var bigramExtractor = new NGramExtractor(Bigram);
            IList<NGram> bigrams = bigramExtractor.ExtractAsList(Tokens);
            var expectedBigrams = new[]
            {
                one_two, two_three, three_four, four_five, five_one,
                one_two, two_three, three_four
            };
            CollectionAssert.AreEqual(expectedBigrams, bigrams);

            // Trigram only.
            var trigramExtractor = new NGramExtractor(Trigram);
            IList<NGram> trigrams = trigramExtractor.ExtractAsList(Tokens);
            var expectedTrigrams = new[]
            {
                one_two_three, two_three_four, three_four_five, four_five_one, five_one_two,
                one_two_three, two_three_four
            };
            CollectionAssert.AreEqual(expectedTrigrams, trigrams);

            // Unigram through bigram: results are interleaved per starting token.
            var uniToBiExtractor = new NGramExtractor(Unigram, Bigram);
            IList<NGram> uniToBi = uniToBiExtractor.ExtractAsList(Tokens);
            var expectedUniToBi = new[]
            {
                one, one_two,
                two, two_three,
                three, three_four,
                four, four_five,
                five, five_one,
                one, one_two,
                two, two_three,
                three, three_four,
                four
            };
            CollectionAssert.AreEqual(expectedUniToBi, uniToBi);

            // Bigram through trigram.
            var biToTriExtractor = new NGramExtractor(Bigram, Trigram);
            IList<NGram> biToTri = biToTriExtractor.ExtractAsList(Tokens);
            var expectedBiToTri = new[]
            {
                one_two, one_two_three,
                two_three, two_three_four,
                three_four, three_four_five,
                four_five, four_five_one,
                five_one, five_one_two,
                one_two, one_two_three,
                two_three, two_three_four,
                three_four
            };
            CollectionAssert.AreEqual(expectedBiToTri, biToTri);

            // Unigram through trigram.
            var uniToTriExtractor = new NGramExtractor(Unigram, Trigram);
            IList<NGram> uniToTri = uniToTriExtractor.ExtractAsList(Tokens);
            var expectedUniToTri = new[]
            {
                one, one_two, one_two_three,
                two, two_three, two_three_four,
                three, three_four, three_four_five,
                four, four_five, four_five_one,
                five, five_one, five_one_two,
                one, one_two, one_two_three,
                two, two_three, two_three_four,
                three, three_four,
                four
            };
            CollectionAssert.AreEqual(expectedUniToTri, uniToTri);
        }
Exemple #4
0
        /// <summary>
        /// Computes a sentence score as the sum of <c>Math.Log10</c> over the MLE of each
        /// adjacent n-gram pair, where n-grams are extracted with sizes
        /// <c>maxNGramSize - 1</c> through <c>maxNGramSize</c>.
        /// </summary>
        /// <param name="tokens">
        /// Tokens of the sentence. A copy is made with <c>ToList()</c>, and start/stop
        /// symbols are added to that copy only.
        /// </param>
        /// <returns>
        /// The accumulated log10 value when <c>maxNGramSize</c> differs from 1; otherwise
        /// whatever <c>GetSentenceProbabilityForUnigrams</c> returns.
        /// NOTE(review): verify that the unigram branch and the log10 branch are meant to
        /// return values on the same scale.
        /// </returns>
        public double GetSentenceProbability(IList<string> tokens)
        {
            if (maxNGramSize == 1)
            {
                return GetSentenceProbabilityForUnigrams(tokens);
            }

            // Copy first: AddStartStopSymbols receives the copy, not the caller's list.
            List<string> tokenList = tokens.ToList();
            AddStartStopSymbols(tokenList);

            var ext = new NGramExtractor(maxNGramSize - 1, maxNGramSize);
            IList<NGram> nGrams = ext.ExtractAsList(tokenList);

            // Remove the last extracted entry before walking the list in steps of two.
            // NOTE(review): assumes the remaining count is even so that nGrams[i + 1]
            // always exists - confirm against NGramExtractor's output order.
            nGrams.RemoveAt(nGrams.Count - 1);

            double logP = 0;
            for (int i = 0; i < nGrams.Count; i += 2)
            {
                double p = GetMLE(nGrams[i], nGrams[i + 1]);
                logP += Math.Log10(p);
            }

            return logP;
        }
 /// <summary>
 /// Feeds the individual characters of a word, as one-character strings, to a
 /// bigram-to-trigram extractor.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the extracted list is never asserted on, so this only checks that
 /// the call completes - consider adding an expectation.
 /// </remarks>
 public void TestExtractLetterNGramsAsList()
 {
     const string word = "beşiktaş";

     // Each character becomes its own token.
     IEnumerable<string> letters = word.Select(letter => letter.ToString());

     var letterExtractor = new NGramExtractor(Bigram, Trigram);
     IList<NGram> letterNGrams = letterExtractor.ExtractAsList(letters);
 }