/// <summary>
/// Verifies that an unsmoothed bigram model reproduces the hand-computed
/// maximum-likelihood estimates count(w1 w2) / count(w1) for the classic
/// three-sentence "Sam / green eggs and ham" corpus.
/// </summary>
public void TestBigramProbabilityNoSmoothing()
        {
            // Smoothing constant of 0 makes probabilities raw relative frequencies.
            var bigramModel = new NGramLanguageModel(2, 0);

            bigramModel.Add(new StringList("<s>", "I", "am", "Sam", "</s>"), 1, 2);
            bigramModel.Add(new StringList("<s>", "Sam", "I", "am", "</s>"), 1, 2);
            bigramModel.Add(new StringList("<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"), 1, 2);

            // Each expected value is the ML estimate derived from the corpus above.
            var expectations = new[]
            {
                new { Ngram = new StringList("<s>", "I"), Probability = 0.666d },
                new { Ngram = new StringList("Sam", "</s>"), Probability = 0.5d },
                new { Ngram = new StringList("<s>", "Sam"), Probability = 0.333d },
                new { Ngram = new StringList("am", "Sam"), Probability = 0.5d },
                new { Ngram = new StringList("I", "am"), Probability = 0.666d },
                new { Ngram = new StringList("I", "do"), Probability = 0.333d },
                new { Ngram = new StringList("I", "am", "Sam"), Probability = 0.333d }
            };

            foreach (var expectation in expectations)
            {
                Assert.That(bigramModel.CalculateProbability(expectation.Ngram),
                            Is.EqualTo(expectation.Probability).Within(0.001));
            }
        }
        public void TestTrigramLanguageModelCreationFromText()
        {
            var ngramSize     = 3;
            var languageModel = new NGramLanguageModel(ngramSize);

            var stream = Tests.OpenFile("/opennlp/tools/languagemodel/sentences.txt", Encoding.UTF8);

            string line;

            while ((line = stream.ReadLine()) != null)
            {
                var list             = new List <string>(line.Split(new[] { ' ' }, StringSplitOptions.None));
                var generatedStrings = NGramGenerator.Generate(list, ngramSize, " ");
                foreach (var generatedString in generatedStrings)
                {
                    var tokens = generatedString.Split(new[] { ' ' }, StringSplitOptions.None);
                    if (tokens.Length > 0)
                    {
                        languageModel.Add(new StringList(tokens), 1, ngramSize);
                    }
                }
            }


            var predited = languageModel.PredictNextTokens(new StringList("neural", "network", "language"));

            Assert.That(predited, Is.EqualTo(new StringList("models")));

            var p1 = languageModel.CalculateProbability(new StringList("neural", "network", "language", "models"));
            var p2 = languageModel.CalculateProbability(new StringList("neural", "network", "language", "model"));

            Assert.That(p1, Is.GreaterThan(p2));
        }
        public void TestEmptyVocabularyProbability()
        {
            var model = new NGramLanguageModel();

            Assert.That(model.CalculateProbability(new StringList(string.Empty)), Is.EqualTo(0d),
                        "Probability with an empty vocabulary is always 0");

            Assert.That(model.CalculateProbability(new StringList("1", "2", "3")), Is.EqualTo(0d),
                        "Probability with an empty vocabulary is always 0");
        }
        public void TestSerializedNGramLanguageModel()
        {
            var languageModel = new NGramLanguageModel(Tests.OpenFile("/opennlp/tools/ngram/ngram-model.xml"), 3);

            var probability = languageModel.CalculateProbability(new StringList("The", "brown", "fox", "jumped"));

            Assert.That(probability, Is.InRange(0d, 1d), "a probability measure should be between 0 and 1 [was {0} ]",
                        probability);

            var tokens = languageModel.PredictNextTokens(new StringList("fox"));

            Assert.That(tokens, Is.EqualTo(new StringList("jumped")));
        }
        public void TestRandomVocabularyAndSentence()
        {
            var model = new NGramLanguageModel();

            foreach (var sentence in LanguageModelTestUtils.GenerateRandomVocabulary(10))
            {
                model.Add(sentence, 2, 3);
            }
            var probability = model.CalculateProbability(LanguageModelTestUtils.GenerateRandomSentence());

            Assert.That(probability, Is.InRange(0d, 1d), "a probability measure should be between 0 and 1 [was {0} ]",
                        probability);
        }
        public void TestNgramModel()
        {
            var model = new NGramLanguageModel(4)
            {
                { new StringList("I", "saw", "the", "fox"), 1, 4 },
                { new StringList("the", "red", "house"), 1, 4 },
                { new StringList("I", "saw", "something", "nice"), 1, 2 }
            };
            var probability = model.CalculateProbability(new StringList("I", "saw", "the", "red", "house"));

            Assert.That(probability, Is.InRange(0d, 1d), "a probability measure should be between 0 and 1 [was {0} ]",
                        probability);

            var tokens = model.PredictNextTokens(new StringList("I", "saw"));

            Assert.That(tokens, Is.EqualTo(new StringList("the", "fox")));
        }
        public void TestPerplexityComparison()
        {
            var trainingVocabulary = LanguageModelTestUtils.GenerateRandomVocabulary(11000);

            //var trainingVocabulary = LanguageModelTestUtils.GenerateRandomVocabulary(1100000);
            var testVocabulary = LanguageModelTestUtils.GenerateRandomVocabulary(100);

            var unigramLM = new NGramLanguageModel(1);

            foreach (var sentence in trainingVocabulary)
            {
                unigramLM.Add(sentence, 1, 1);
            }
            var unigramPerplexity = LanguageModelTestUtils.GetPerplexity(unigramLM, testVocabulary, 1);

            var bigramLM = new NGramLanguageModel(2);

            foreach (var sentence in trainingVocabulary)
            {
                bigramLM.Add(sentence, 1, 2);
            }
            var bigramPerplexity = LanguageModelTestUtils.GetPerplexity(bigramLM, testVocabulary, 2);

            Assert.That(unigramPerplexity, Is.GreaterThanOrEqualTo(bigramPerplexity));

            var trigramLM = new NGramLanguageModel(3);

            foreach (var sentence in trainingVocabulary)
            {
                trigramLM.Add(sentence, 2, 3);
            }
            var trigramPerplexity = LanguageModelTestUtils.GetPerplexity(trigramLM, testVocabulary, 3);


            Assert.That(bigramPerplexity, Is.GreaterThanOrEqualTo(trigramPerplexity));
        }