/// <summary>
/// Checks the padding that <c>AddStartStopSymbols</c> applies, in place, to a token list
/// for models created with <c>Unigram</c>, <c>Bigram</c>, <c>Trigram</c> and an order of 4.
/// The expectations are: no change for the unigram model, and otherwise a trailing
/// <c>&lt;/s&gt;</c> plus one leading start symbol per order above one
/// (<c>&lt;s0&gt;</c> adjacent to the first word, higher indices further out).
/// </summary>
public void AddStartStopSymbolsTest()
{
    string[] words = {"this", "is", "a", "test"};

    // Unigram: the list is expected to come back untouched.
    {
        var padded = new List<string>(words);
        var unchanged = new List<string>(words);
        new NGramModel(Unigram).AddStartStopSymbols(padded);
        CollectionAssert.AreEqual(unchanged, padded);
    }

    // Bigram: one start symbol and one stop symbol.
    {
        var padded = new List<string>(words);
        var withSymbols = new List<string> {"<s0>", "this", "is", "a", "test", "</s>"};
        new NGramModel(Bigram).AddStartStopSymbols(padded);
        CollectionAssert.AreEqual(withSymbols, padded);
    }

    // Trigram: two start symbols and one stop symbol.
    {
        var padded = new List<string>(words);
        var withSymbols = new List<string> {"<s1>", "<s0>", "this", "is", "a", "test", "</s>"};
        new NGramModel(Trigram).AddStartStopSymbols(padded);
        CollectionAssert.AreEqual(withSymbols, padded);
    }

    // Order 4: three start symbols and one stop symbol.
    {
        var padded = new List<string>(words);
        var withSymbols = new List<string>
        {
            "<s2>", "<s1>", "<s0>", "this", "is", "a", "test", "</s>"
        };
        new NGramModel(4).AddStartStopSymbols(padded);
        CollectionAssert.AreEqual(withSymbols, padded);
    }
}
/// <summary>
/// Trains a model created with <c>Bigram</c> on <c>Text1</c>, <c>Text2</c> and <c>Text3</c>
/// and checks <c>GetSentenceProbability</c> for several word sequences.
/// Results are compared after rounding to three decimals, except for the sequence
/// ending in the unseen word "the", which must be exactly zero.
/// </summary>
public void TestGetSequenceProbabilityForBigramModel()
{
    var model = new NGramModel(Bigram);
    foreach (var text in new[] {Text1, Text2, Text3})
    {
        model.AddSentence(text.Split(null).ToList());
    }

    double iAmSam = model.GetSentenceProbability("I", "am", "Sam");
    Assert.AreEqual(.111, Math.Round(iAmSam, 3));

    double samIAm = model.GetSentenceProbability("Sam", "I", "am");
    Assert.AreEqual(.056, Math.Round(samIAm, 3));

    double greenEggs = model.GetSentenceProbability(
        "I", "do", "not", "like", "green", "eggs", "and", "ham");
    Assert.AreEqual(.222, Math.Round(greenEggs, 3));

    double iAmSamIAm = model.GetSentenceProbability("I", "am", "Sam", "I", "am");
    Assert.AreEqual(.037, Math.Round(iAmSamIAm, 3));

    double iAm = model.GetSentenceProbability("I", "am");
    Assert.AreEqual(.222, Math.Round(iAm, 3));

    // No rounding here: the result is compared against zero as-is.
    double iAmThe = model.GetSentenceProbability("I", "am", "the");
    Assert.AreEqual(.0, iAmThe);
}
/// <summary>
/// Creates a stemmer that keeps the supplied n-gram model and word analyzer.
/// Neither argument is validated; both are stored as given.
/// </summary>
/// <param name="model">Stored in the <c>model</c> member.</param>
/// <param name="analyzer">Stored in the <c>analyzer</c> member.</param>
public StatisticalStemmer(NGramModel model, WordAnalyzer analyzer)
{
    this.analyzer = analyzer;
    this.model = model;
}
/// <summary>
/// Builds an <c>NGramModel</c> (constructed with 2) from a whitespace-separated text file
/// whose first column is passed to <c>Analyzer.Analyze</c> and whose second column is an
/// integer. For every analysis result the morpheme ids are added to the model
/// <c>Math.Round((value + 99) / 100.0)</c> times. Progress is printed to the console every
/// 100 input lines. Finally <c>Deserialize</c> is called on the model with a JSON file path.
/// </summary>
/// <remarks>
/// NOTE(review): both file locations are absolute paths on a specific machine.
/// NOTE(review): whether <c>Deserialize</c> replaces or merges with the data added in the
/// loop is not visible from here - confirm.
/// </remarks>
/// <returns>The model after the final <c>Deserialize</c> call.</returns>
private static NGramModel CreateModel()
{
    const string unigramPath =
        @"C:\Users\hrzafer\Desktop\workspace\Damla\code\suggestion\unigrams.txt";
    const string serializedModelPath =
        @"C:\Users\hrzafer\Desktop\workspace\Prizma\code\prizma\src\main\resources\stemDict\model_uni_bi.json";

    string[] rawLines = File.ReadAllLines(unigramPath);
    var result = new NGramModel(2);

    for (var lineNumber = 1; lineNumber <= rawLines.Length; lineNumber++)
    {
        // fields[0] is handed to the analyzer, fields[1] is parsed as an integer.
        string[] fields = rawLines[lineNumber - 1].Split(null);

        foreach (var solution in Analyzer.Analyze(fields[0]))
        {
            // NOTE(review): the same morphemeIds instance is passed on every repetition;
            // if AddSentence mutates its argument, later repetitions see the change - confirm.
            var morphemeIds = solution.GetMorphemeIds();
            double repetitions = Math.Round((int.Parse(fields[1]) + 99) / (double) 100);
            for (var added = 0; added < repetitions; added++)
            {
                result.AddSentence(morphemeIds);
            }
        }

        // Progress indicator: print the running line count every 100 lines.
        if (lineNumber % 100 == 0)
        {
            Console.WriteLine(lineNumber);
        }
    }

    result.Deserialize(serializedModelPath);
    return result;
}
/// <summary>
/// Trains a model created with <c>Unigram</c> on <c>Text1</c>, <c>Text2</c> and <c>Text3</c>
/// and checks <c>GetSentenceProbability</c> for single words and for short word sequences.
/// Every result is rounded (to 2, 3 or 4 decimals, depending on the case) before comparison.
/// </summary>
public void TestGetSequenceProbabilityForUnigramModel()
{
    var model = new NGramModel(Unigram);
    foreach (var text in new[] {Text1, Text2, Text3})
    {
        model.AddSentence(text.Split(null).ToList());
    }

    // Single-word probabilities, compared at two decimals.
    double pI = model.GetSentenceProbability("I");
    Assert.AreEqual(0.21, Math.Round(pI, 2));

    double pSam = model.GetSentenceProbability("Sam");
    Assert.AreEqual(0.14, Math.Round(pSam, 2));

    double pAm = model.GetSentenceProbability("am");
    Assert.AreEqual(0.14, Math.Round(pAm, 2));

    double pThe = model.GetSentenceProbability("the");
    Assert.AreEqual(0.00, Math.Round(pThe, 2));

    //p(I) * p(am)
    double pIAm = model.GetSentenceProbability("I", "am");
    Assert.AreEqual(0.0306, Math.Round(pIAm, 4));

    //p(I) * p(do)
    double pIDo = model.GetSentenceProbability("I", "do");
    Assert.AreEqual(0.015, Math.Round(pIDo, 3));

    //p(Sam) * p(I)
    double pSamI = model.GetSentenceProbability("Sam", "I");
    Assert.AreEqual(0.0306, Math.Round(pSamI, 4));

    //p(I) * p(am) * p(Sam)
    double pIAmSam = model.GetSentenceProbability("I", "am", "Sam");
    Assert.AreEqual(0.0044, Math.Round(pIAmSam, 4));

    //p(Sam) * p(I) * p(am)
    double pSamIAm = model.GetSentenceProbability("Sam", "I", "am");
    Assert.AreEqual(0.0044, Math.Round(pSamIAm, 4));
}