/// <summary>
/// Trains a bigram model on three short sentences delimited by
/// <c>&lt;s&gt;</c> / <c>&lt;/s&gt;</c> markers and checks the probability
/// reported for a fixed list of n-grams against expected values
/// (tolerance 0.001).
/// </summary>
public void TestBigramProbabilityNoSmoothing() {
    // The second constructor argument is 0 - presumably the smoothing
    // constant, given the test name (TODO confirm against NGramLanguageModel).
    var languageModel = new NGramLanguageModel(2, 0);

    // Training corpus, added in this order with n-gram sizes 1..2.
    var trainingSentences = new[] {
        new[] { "<s>", "I", "am", "Sam", "</s>" },
        new[] { "<s>", "Sam", "I", "am", "</s>" },
        new[] { "<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>" }
    };

    foreach (var sentenceTokens in trainingSentences) {
        languageModel.Add(new StringList(sentenceTokens), 1, 2);
    }

    // Each entry pairs the queried tokens with the probability the model
    // is expected to report for them.
    var expectations = new[] {
        new { Tokens = new[] { "<s>", "I" }, Expected = 0.666d },
        new { Tokens = new[] { "Sam", "</s>" }, Expected = 0.5d },
        new { Tokens = new[] { "<s>", "Sam" }, Expected = 0.333d },
        new { Tokens = new[] { "am", "Sam" }, Expected = 0.5d },
        new { Tokens = new[] { "I", "am" }, Expected = 0.666d },
        new { Tokens = new[] { "I", "do" }, Expected = 0.333d },
        new { Tokens = new[] { "I", "am", "Sam" }, Expected = 0.333d }
    };

    foreach (var expectation in expectations) {
        var actual = languageModel.CalculateProbability(new StringList(expectation.Tokens));

        Assert.That(actual, Is.EqualTo(expectation.Expected).Within(0.001));
    }
}
/// <summary>
/// Builds a trigram language model from the line-oriented text resource
/// <c>sentences.txt</c>, then checks that the token predicted after
/// "neural network language" is "models" and that the phrase ending in
/// "models" is scored higher than the one ending in "model".
/// </summary>
public void TestTrigramLanguageModelCreationFromText() {
    var ngramSize = 3;
    var languageModel = new NGramLanguageModel(ngramSize);

    // Dispose the reader deterministically so the file handle is released
    // even when an assertion or the model throws; it was previously left open.
    using (var stream = Tests.OpenFile("/opennlp/tools/languagemodel/sentences.txt", Encoding.UTF8)) {
        string line;
        while ((line = stream.ReadLine()) != null) {
            // Tokenize on single spaces; empty entries are kept on purpose
            // (StringSplitOptions.None).
            var list = new List<string>(line.Split(new[] { ' ' }, StringSplitOptions.None));
            var generatedStrings = NGramGenerator.Generate(list, ngramSize, " ");

            foreach (var generatedString in generatedStrings) {
                var tokens = generatedString.Split(new[] { ' ' }, StringSplitOptions.None);

                // Split with StringSplitOptions.None always yields at least
                // one element, so this guard is purely defensive.
                if (tokens.Length > 0) {
                    languageModel.Add(new StringList(tokens), 1, ngramSize);
                }
            }
        }
    }

    var predicted = languageModel.PredictNextTokens(new StringList("neural", "network", "language"));
    Assert.That(predicted, Is.EqualTo(new StringList("models")));

    var p1 = languageModel.CalculateProbability(new StringList("neural", "network", "language", "models"));
    var p2 = languageModel.CalculateProbability(new StringList("neural", "network", "language", "model"));
    Assert.That(p1, Is.GreaterThan(p2));
}
/// <summary>
/// Trains a bigram model on three short sentences, then checks that the
/// probability of an unseen combination lies within [0, 1] and that the
/// token predicted after "I saw" is "something".
/// </summary>
public void TestBigram() {
    var bigramModel = new NGramLanguageModel(2);

    // Training sentences, added in this order with n-gram sizes 1..2.
    var trainingSentences = new[] {
        new[] { "I", "see", "the", "fox" },
        new[] { "the", "red", "house" },
        new[] { "I", "saw", "something", "nice" }
    };

    foreach (var sentenceTokens in trainingSentences) {
        bigramModel.Add(new StringList(sentenceTokens), 1, 2);
    }

    var querySentence = new StringList("I", "saw", "the", "red", "house");
    var sentenceProbability = bigramModel.CalculateProbability(querySentence);

    Assert.That(
        sentenceProbability,
        Is.InRange(0d, 1d),
        "a probability measure should be between 0 and 1 [was {0} ]",
        sentenceProbability);

    var context = new StringList("I", "saw");
    var nextTokens = bigramModel.PredictNextTokens(context);

    Assert.That(nextTokens, Is.EqualTo(new StringList("something")));
}
/// <summary>
/// Feeds a default-constructed model with a randomly generated vocabulary
/// (size argument 10, n-gram sizes 2..3) and checks that the probability
/// computed for a randomly generated sentence lies within [0, 1].
/// </summary>
public void TestRandomVocabularyAndSentence() {
    var languageModel = new NGramLanguageModel();

    var randomVocabulary = LanguageModelTestUtils.GenerateRandomVocabulary(10);
    foreach (var entry in randomVocabulary) {
        languageModel.Add(entry, 2, 3);
    }

    var randomSentence = LanguageModelTestUtils.GenerateRandomSentence();
    var sentenceProbability = languageModel.CalculateProbability(randomSentence);

    Assert.That(
        sentenceProbability,
        Is.InRange(0d, 1d),
        "a probability measure should be between 0 and 1 [was {0} ]",
        sentenceProbability);
}
/// <summary>
/// Trains unigram, bigram and trigram models on the same randomly generated
/// vocabulary and asserts that the perplexity measured on a separate random
/// test vocabulary does not increase when moving from unigram to bigram and
/// from bigram to trigram.
/// </summary>
public void TestPerplexityComparison() {
    // A larger alternative was left commented out in the original code:
    // LanguageModelTestUtils.GenerateRandomVocabulary(1100000)
    var trainingSet = LanguageModelTestUtils.GenerateRandomVocabulary(11000);
    var testSet = LanguageModelTestUtils.GenerateRandomVocabulary(100);

    // Unigram model: n-gram sizes 1..1.
    var unigramModel = new NGramLanguageModel(1);
    foreach (var trainingSentence in trainingSet) {
        unigramModel.Add(trainingSentence, 1, 1);
    }

    var perplexityOfUnigram = LanguageModelTestUtils.GetPerplexity(unigramModel, testSet, 1);

    // Bigram model: n-gram sizes 1..2.
    var bigramModel = new NGramLanguageModel(2);
    foreach (var trainingSentence in trainingSet) {
        bigramModel.Add(trainingSentence, 1, 2);
    }

    var perplexityOfBigram = LanguageModelTestUtils.GetPerplexity(bigramModel, testSet, 2);

    Assert.That(perplexityOfUnigram, Is.GreaterThanOrEqualTo(perplexityOfBigram));

    // Trigram model: n-gram sizes 2..3.
    var trigramModel = new NGramLanguageModel(3);
    foreach (var trainingSentence in trainingSet) {
        trigramModel.Add(trainingSentence, 2, 3);
    }

    var perplexityOfTrigram = LanguageModelTestUtils.GetPerplexity(trigramModel, testSet, 3);

    Assert.That(perplexityOfBigram, Is.GreaterThanOrEqualTo(perplexityOfTrigram));
}