// Exercises a dictionary built from an extractor constructed with Trigram only:
// lookups with zero, one or two tokens must throw ArgumentException with the
// expected message, and three-token lookups taken from the sample texts must
// each report a frequency of 1.
public void TestTrigrams()
{
    var trigrams = new NGramDictionary(new NGramExtractor(Trigram));
    foreach (var text in new[] { Text1, Text2, Text3 })
    {
        // Split(null) tokenizes on whitespace.
        trigrams.AddSequence(text.Split(null).ToList());
    }

    // Queries shorter than three tokens are rejected.
    var noTokens = Assert.Throws<ArgumentException>(() => trigrams.GetFrequency());
    Assert.That(
        noTokens.Message,
        Is.EqualTo("Length of nGramTokens (0) must not be greater than maxNGramSize or less than minNGramSize"));

    var oneToken = Assert.Throws<ArgumentException>(() => trigrams.GetFrequency("I"));
    Assert.That(
        oneToken.Message,
        Is.EqualTo("Length of nGramTokens (1) must not be greater than maxNGramSize or less than minNGramSize"));

    var twoTokens = Assert.Throws<ArgumentException>(() => trigrams.GetFrequency("I", "am"));
    Assert.That(
        twoTokens.Message,
        Is.EqualTo("Length of nGramTokens (2) must not be greater than maxNGramSize or less than minNGramSize"));

    // Three-token queries are counted.
    Assert.AreEqual(1, trigrams.GetFrequency("I", "am", "Sam"));
    Assert.AreEqual(1, trigrams.GetFrequency("Sam", "I", "am"));
    Assert.AreEqual(1, trigrams.GetFrequency("not", "like", "green"));
}
/// <summary>
/// Creates a model whose extractor is constructed with the arguments 1 and
/// <paramref name="nGramSize"/>, and reads the file at <paramref name="modelFilepath"/>.
/// </summary>
/// <param name="nGramSize">Stored as <c>maxNGramSize</c> and passed as the second extractor argument.</param>
/// <param name="modelFilepath">Path of the file read with <see cref="File.ReadAllLines(string)"/>.</param>
/// <remarks>
/// The extractor is created before the dictionary that receives it. Previously the
/// dictionary was constructed first and was handed the still-unassigned extractor field.
/// Any exception raised by <see cref="File.ReadAllLines(string)"/> (for example when the
/// file does not exist) propagates to the caller.
/// </remarks>
public NGramModel(int nGramSize, string modelFilepath)
{
    maxNGramSize = nGramSize;
    extractor = new NGramExtractor(1, maxNGramSize);
    nGramDictionary = new NGramDictionary(extractor);

    // TODO: the lines are read but not yet loaded into the dictionary; the
    // AddAll(lines) call that would consume them is disabled.
    string[] lines = File.ReadAllLines(modelFilepath);
}
// Round-trips a dictionary through ToString / DeserializeFrom, then adds the same
// extra sequence to both the original and the restored copy and checks that their
// string forms are still equal.
public void SerializationTest()
{
    var original = new NGramDictionary(new NGramExtractor(Unigram, Trigram));
    original.AddSequence(Text1.Split(null).ToList());
    original.AddSequence(Text2.Split(null).ToList());

    // Snapshot taken before the third text is added.
    string serialized = original.ToString();
    NGramDictionary restored = NGramDictionary.DeserializeFrom(serialized);

    // Both instances receive the same additional input after the round trip.
    original.AddSequence(Text3.Split(null).ToList());
    restored.AddSequence(Text3.Split(null).ToList());

    string expected = original.ToString();
    string actual = restored.ToString();
    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Creates an empty model: stores <paramref name="nGramSize"/> as <c>maxNGramSize</c>,
/// builds the extractor from the arguments 1 and that size, and builds the dictionary
/// from the extractor.
/// </summary>
/// <param name="nGramSize">
/// Second argument given to the extractor (presumably the largest n-gram size to
/// extract; confirm against <c>NGramExtractor</c>).
/// </param>
public NGramModel(int nGramSize)
{
    const int smallestSize = 1;

    maxNGramSize = nGramSize;

    // The dictionary takes the extractor in its constructor, so the extractor comes first.
    extractor = new NGramExtractor(smallestSize, maxNGramSize);
    nGramDictionary = new NGramDictionary(extractor);
}
// Checks unigram, bigram and trigram frequencies when every sentence is padded with
// two "<s>" start markers and one "</s>" stop marker. An empty lookup must throw
// ArgumentException with the expected message.
public void TestUnigramsBigramsTrigramsWithStartStopSymbols()
{
    const string text1 = "<s> <s> I am Sam </s>";
    const string text2 = "<s> <s> Sam I am </s>";
    const string text3 = "<s> <s> I do not like green eggs and ham </s>";

    var corpus = new NGramDictionary(new NGramExtractor(Unigram, Trigram));
    foreach (var sentence in new[] { text1, text2, text3 })
    {
        // Split(null) tokenizes on whitespace.
        corpus.AddSequence(sentence.Split(null).ToList());
    }

    var emptyQuery = Assert.Throws<ArgumentException>(() => corpus.GetFrequency());
    Assert.That(
        emptyQuery.Message,
        Is.EqualTo("Length of nGramTokens (0) must not be greater than maxNGramSize or less than minNGramSize"));

    // Boundary markers: two starts and one stop per sentence, three sentences.
    Assert.AreEqual(6, corpus.GetFrequency("<s>"));
    Assert.AreEqual(3, corpus.GetFrequency("</s>"));
    Assert.AreEqual(3, corpus.GetFrequency("<s>", "<s>"));

    // Plain unigrams, including one that never occurs.
    Assert.AreEqual(2, corpus.GetFrequency("am"));
    Assert.AreEqual(1, corpus.GetFrequency("not"));
    Assert.AreEqual(0, corpus.GetFrequency("the"));

    // N-grams that touch the start markers.
    Assert.AreEqual(1, corpus.GetFrequency("<s>", "I", "am"));
    Assert.AreEqual(2, corpus.GetFrequency("<s>", "<s>", "I"));
    Assert.AreEqual(1, corpus.GetFrequency("<s>", "<s>", "Sam"));
    Assert.AreEqual(1, corpus.GetFrequency("<s>", "Sam", "I"));

    // N-grams are not counted across sentence boundaries.
    Assert.AreEqual(0, corpus.GetFrequency("am", "I"));

    // N-grams that touch the stop marker.
    Assert.AreEqual(1, corpus.GetFrequency("I", "am", "</s>"));
    Assert.AreEqual(1, corpus.GetFrequency("am", "</s>"));

    // Interior trigrams.
    Assert.AreEqual(1, corpus.GetFrequency("I", "am", "Sam"));
    Assert.AreEqual(1, corpus.GetFrequency("Sam", "I", "am"));
    Assert.AreEqual(1, corpus.GetFrequency("not", "like", "green"));
}
// Checks unigram and bigram frequencies over the three sample texts for a dictionary
// built from an extractor constructed with Unigram and Bigram. An empty lookup must
// throw ArgumentException with the expected message.
public void TestUnigramsBigrams()
{
    var corpus = new NGramDictionary(new NGramExtractor(Unigram, Bigram));
    foreach (var text in new[] { Text1, Text2, Text3 })
    {
        // Split(null) tokenizes on whitespace.
        corpus.AddSequence(text.Split(null).ToList());
    }

    var emptyQuery = Assert.Throws<ArgumentException>(() => corpus.GetFrequency());
    Assert.That(
        emptyQuery.Message,
        Is.EqualTo("Length of nGramTokens (0) must not be greater than maxNGramSize or less than minNGramSize"));

    // Unigrams, including one that never occurs.
    Assert.AreEqual(3, corpus.GetFrequency("I"));
    Assert.AreEqual(2, corpus.GetFrequency("am"));
    Assert.AreEqual(1, corpus.GetFrequency("not"));
    Assert.AreEqual(0, corpus.GetFrequency("the"));

    // Bigrams, including one whose tokens never appear in that order.
    Assert.AreEqual(2, corpus.GetFrequency("I", "am"));
    Assert.AreEqual(1, corpus.GetFrequency("Sam", "I"));
    Assert.AreEqual(0, corpus.GetFrequency("am", "I"));
}