This class is in an experimental state. Use at your own risk.
Ejemplo n.º 1
0
        /// <summary>
        /// Exercises <c>AddStartStopSymbols</c> on models built with <c>Unigram</c>,
        /// <c>Bigram</c>, <c>Trigram</c> and an explicit order of 4, comparing the
        /// in-place modified token list with the expected padded sequence.
        /// </summary>
        public void AddStartStopSymbolsTest()
        {
            string[] words = {"this", "is", "a", "test"};

            // Unigram model: the list is expected to come back unchanged.
            var unigramInput = new List<string>(words);
            new NGramModel(Unigram).AddStartStopSymbols(unigramInput);
            CollectionAssert.AreEqual(new List<string>(words), unigramInput);

            // Bigram model: one start symbol in front, one stop symbol at the end.
            var bigramInput = new List<string>(words);
            new NGramModel(Bigram).AddStartStopSymbols(bigramInput);
            CollectionAssert.AreEqual(
                new List<string> {"<s0>", "this", "is", "a", "test", "</s>"},
                bigramInput);

            // Trigram model: two start symbols, numbered downwards towards the sentence.
            var trigramInput = new List<string>(words);
            new NGramModel(Trigram).AddStartStopSymbols(trigramInput);
            CollectionAssert.AreEqual(
                new List<string> {"<s1>", "<s0>", "this", "is", "a", "test", "</s>"},
                trigramInput);

            // Order-4 model: three start symbols.
            var fourGramInput = new List<string>(words);
            new NGramModel(4).AddStartStopSymbols(fourGramInput);
            CollectionAssert.AreEqual(
                new List<string> {"<s2>", "<s1>", "<s0>", "this", "is", "a", "test", "</s>"},
                fourGramInput);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Feeds <c>Text1</c>, <c>Text2</c> and <c>Text3</c> into a model built with
        /// <c>Bigram</c> and compares the sentence probabilities it reports, rounded to
        /// three decimals, with fixed expected values. The final probability is compared
        /// with zero without rounding.
        /// </summary>
        public void TestGetSequenceProbabilityForBigramModel()
        {
            var bigrams = new NGramModel(Bigram);
            bigrams.AddSentence(Text1.Split(null).ToList());
            bigrams.AddSentence(Text2.Split(null).ToList());
            bigrams.AddSentence(Text3.Split(null).ToList());

            double probability = bigrams.GetSentenceProbability("I", "am", "Sam");
            Assert.AreEqual(.111, Math.Round(probability, 3));

            probability = bigrams.GetSentenceProbability("Sam", "I", "am");
            Assert.AreEqual(.056, Math.Round(probability, 3));

            probability = bigrams.GetSentenceProbability("I", "do", "not", "like", "green", "eggs", "and", "ham");
            Assert.AreEqual(.222, Math.Round(probability, 3));

            probability = bigrams.GetSentenceProbability("I", "am", "Sam", "I", "am");
            Assert.AreEqual(.037, Math.Round(probability, 3));

            probability = bigrams.GetSentenceProbability("I", "am");
            Assert.AreEqual(.222, Math.Round(probability, 3));

            // This result is checked against zero exactly, with no rounding step.
            probability = bigrams.GetSentenceProbability("I", "am", "the");
            Assert.AreEqual(.0, probability);
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Creates a stemmer that keeps the supplied n-gram model and word analyzer.
 /// </summary>
 /// <param name="model">Assigned to this instance's <c>model</c> member.</param>
 /// <param name="analyzer">Assigned to this instance's <c>analyzer</c> member.</param>
 public StatisticalStemmer(NGramModel model, WordAnalyzer analyzer)
 {
     this.analyzer = analyzer;
     this.model = model;
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds an <c>NGramModel</c> (constructed with 2) from a whitespace-separated
        /// text file: for every analysis that <c>Analyzer.Analyze</c> returns for the first
        /// field of a line, its morpheme ids are added as a sentence a number of times
        /// derived from the integer in the second field. Afterwards <c>Deserialize</c> is
        /// called on the model with a JSON file path.
        /// </summary>
        /// <returns>The model instance after the <c>Deserialize</c> call.</returns>
        /// <remarks>
        /// NOTE(review): both file paths are absolute and machine-specific. It is also not
        /// visible from here whether <c>Deserialize</c> replaces the counts accumulated in
        /// the loop; confirm that this call order is intended.
        /// </remarks>
        private static NGramModel CreateModel()
        {
            var rawLines = File.ReadAllLines(
                @"C:\Users\hrzafer\Desktop\workspace\Damla\code\suggestion\unigrams.txt");
            var model = new NGramModel(2);
            var processed = 0;

            foreach (var rawLine in rawLines)
            {
                // Presumably fields[0] is a word and fields[1] its frequency; verify against the file.
                var fields = rawLine.Split(null);
                processed++;

                foreach (var analysis in Analyzer.Analyze(fields[0]))
                {
                    var morphemeIds = analysis.GetMorphemeIds();

                    // Scale the count down by 100 (with a +99 offset) and round to get the repeat count.
                    var count = int.Parse(fields[1]);
                    var repetitions = Math.Round((count + 99)/(double) 100);
                    for (var n = 0; n < repetitions; n++)
                    {
                        model.AddSentence(morphemeIds);
                    }
                }

                // Progress output every 100 lines.
                if (processed%100 == 0)
                {
                    Console.WriteLine(processed);
                }
            }

            model.Deserialize(
                @"C:\Users\hrzafer\Desktop\workspace\Prizma\code\prizma\src\main\resources\stemDict\model_uni_bi.json");

            return model;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Feeds <c>Text1</c>, <c>Text2</c> and <c>Text3</c> into a model built with
        /// <c>Unigram</c> and compares rounded sentence probabilities for single words
        /// and short word sequences with fixed expected values.
        /// </summary>
        public void TestGetSequenceProbabilityForUnigramModel()
        {
            var unigrams = new NGramModel(Unigram);
            unigrams.AddSentence(Text1.Split(null).ToList());
            unigrams.AddSentence(Text2.Split(null).ToList());
            unigrams.AddSentence(Text3.Split(null).ToList());

            // Single-word probabilities, compared at two decimals.
            double probability = unigrams.GetSentenceProbability("I");
            Assert.AreEqual(0.21, Math.Round(probability, 2));

            probability = unigrams.GetSentenceProbability("Sam");
            Assert.AreEqual(0.14, Math.Round(probability, 2));

            probability = unigrams.GetSentenceProbability("am");
            Assert.AreEqual(0.14, Math.Round(probability, 2));

            probability = unigrams.GetSentenceProbability("the");
            Assert.AreEqual(0.00, Math.Round(probability, 2));

            // p(I) * p(am)
            probability = unigrams.GetSentenceProbability("I", "am");
            Assert.AreEqual(0.0306, Math.Round(probability, 4));

            // p(I) * p(do)
            probability = unigrams.GetSentenceProbability("I", "do");
            Assert.AreEqual(0.015, Math.Round(probability, 3));

            // p(Sam) * p(I)
            probability = unigrams.GetSentenceProbability("Sam", "I");
            Assert.AreEqual(0.0306, Math.Round(probability, 4));

            // p(I) * p(am) * p(Sam)
            probability = unigrams.GetSentenceProbability("I", "am", "Sam");
            Assert.AreEqual(0.0044, Math.Round(probability, 4));

            // p(Sam) * p(I) * p(am)
            probability = unigrams.GetSentenceProbability("Sam", "I", "am");
            Assert.AreEqual(0.0044, Math.Round(probability, 4));
        }