Ejemplo n.º 1
0
        /// <summary>
        ///     Creates and returns a new NGramDictionary from a string.<br />
        ///     This string must be produced by the ToString() method of a NGramDictionary object.
        /// </summary>
        /// <remarks>
        ///     The first line holds the minimum and maximum n-gram sizes separated by a tab.
        ///     Every following non-blank line holds the whitespace-separated tokens of one n-gram,
        ///     a tab, and the frequency of that n-gram. Blank lines (for example the one left behind
        ///     by a trailing newline) are skipped. When the same n-gram appears more than once,
        ///     the first frequency is kept.
        /// </remarks>
        /// <param name="str">A string produced by the ToString() method of a NGramDictionary object.</param>
        /// <returns>A new NGramDictionary object</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="str" /> is null.</exception>
        /// <exception cref="System.FormatException">The header or a data line is malformed.</exception>
        public static NGramDictionary DeserializeFrom(string str)
        {
            if (str == null)
            {
                throw new ArgumentNullException("str");
            }

            string[] lines  = str.Split('\n');
            string[] header = lines[0].Split('\t');

            if (header.Length < 2)
            {
                throw new FormatException(
                    "The first line must contain the minimum and maximum n-gram sizes separated by a tab.");
            }

            // Serialized data is machine-readable, so parsing must not depend on the current culture.
            var culture  = System.Globalization.CultureInfo.InvariantCulture;
            int minNGram = Int32.Parse(header[0], culture);
            int maxNGram = Int32.Parse(header[1], culture);

            IDictionary<NGram, int> nGrams = new Dictionary<NGram, int>();
            var extractor = new NGramExtractor(minNGram, maxNGram);

            foreach (string line in lines.Skip(1))
            {
                // A trailing newline (or "\r\n" line endings) leaves an empty or "\r"-only entry
                // after the split; indexing row[1] on such an entry would throw.
                if (line.Trim().Length == 0)
                {
                    continue;
                }

                string[] row = line.Split('\t');

                if (row.Length < 2)
                {
                    throw new FormatException(
                        "Expected an n-gram and its frequency separated by a tab, but found: " + line);
                }

                // Split(null) splits the n-gram text on whitespace into its tokens.
                var nGram = new NGram(row[0].Split(null));

                int freq = Int32.Parse(row[1], culture);

                // First occurrence wins; later duplicates are ignored.
                if (!nGrams.ContainsKey(nGram))
                {
                    nGrams.Add(nGram, freq);
                }
            }

            return new NGramDictionary(extractor, nGrams);
        }
Ejemplo n.º 2
0
 /// <summary>
 ///     Creates a model whose extractor covers n-gram sizes from 1 up to <paramref name="nGramSize" />.
 /// </summary>
 /// <param name="nGramSize">Largest n-gram size; stored in maxNGramSize and used as the extractor's upper bound.</param>
 /// <param name="modelFilepath">Path of a file that is read during construction.</param>
 public NGramModel(int nGramSize, string modelFilepath)
 {
     maxNGramSize = nGramSize;

     // The extractor must be created before the dictionary that is built from it;
     // otherwise the dictionary receives the still-unassigned extractor field.
     extractor       = new NGramExtractor(1, maxNGramSize);
     nGramDictionary = new NGramDictionary(extractor);

     // NOTE(review): the file is read (so an invalid path still throws here), but its lines are
     // currently discarded because the AddAll call below is disabled.
     string[] lines = File.ReadAllLines(modelFilepath);
     //AddAll(lines);
 }
Ejemplo n.º 3
0
        /// <summary>
        ///     Computes a base-10 log score for a token sequence by summing
        ///     <c>Math.Log10(GetMLE(...))</c> over consecutive pairs of extracted n-grams.
        /// </summary>
        /// <remarks>
        ///     When maxNGramSize is 1 the work is delegated to GetSentenceProbabilityForUnigrams.
        ///     Otherwise a copy of the tokens is passed to AddStartStopSymbols, n-grams of sizes
        ///     maxNGramSize - 1 and maxNGramSize are extracted as one list, and the last element of
        ///     that list is dropped before the list is consumed two elements at a time.
        /// </remarks>
        /// <param name="tokens">The tokens of the sentence; the caller's list is not modified.</param>
        /// <returns>The sum of the base-10 logarithms of the per-pair GetMLE values.</returns>
        public double GetSentenceProbability(IList<string> tokens)
        {
            if (maxNGramSize == 1)
            {
                return GetSentenceProbabilityForUnigrams(tokens);
            }

            // Work on a copy so the start/stop symbols are not added to the caller's list.
            List<string> tokenList = tokens.ToList();

            AddStartStopSymbols(tokenList);

            var          ext    = new NGramExtractor(maxNGramSize - 1, maxNGramSize);
            IList<NGram> nGrams = ext.ExtractAsList(tokenList);

            // Drop the final element so the remaining entries can be read as pairs.
            // NOTE(review): presumably the list alternates (n-1)-gram, n-gram and ends with an
            // unpaired (n-1)-gram - confirm against NGramExtractor.ExtractAsList.
            nGrams.RemoveAt(nGrams.Count - 1);

            double logP = 0;

            for (int i = 0; i < nGrams.Count; i += 2)
            {
                double p = GetMLE(nGrams[i], nGrams[i + 1]);

                // Summing logarithms instead of multiplying the raw values.
                logP += Math.Log10(p);
            }

            return logP;
        }
Ejemplo n.º 4
0
        /// <summary>
        ///     Checks the n-gram counts returned by ExtractAsDictionary for unigram, bigram and
        ///     trigram extractors over the shared Tokens fixture.
        /// </summary>
        public void TestExtractAsDictionary()
        {
            // Unigram counts.
            var unigramCounts = new Dictionary<NGram, int>
            {
                {one, 2},
                {two, 2},
                {three, 2},
                {four, 2},
                {five, 1}
            };
            IDictionary<NGram, int> unigramResult = new NGramExtractor(Unigram).ExtractAsDictionary(Tokens);
            CollectionAssert.AreEquivalent(unigramCounts, unigramResult);

            // Bigram counts.
            var bigramCounts = new Dictionary<NGram, int>
            {
                {one_two, 2},
                {two_three, 2},
                {three_four, 2},
                {four_five, 1},
                {five_one, 1}
            };
            IDictionary<NGram, int> bigramResult = new NGramExtractor(Bigram).ExtractAsDictionary(Tokens);
            CollectionAssert.AreEquivalent(bigramCounts, bigramResult);

            // Trigram counts.
            var trigramCounts = new Dictionary<NGram, int>
            {
                {one_two_three, 2},
                {two_three_four, 2},
                {four_five_one, 1},
                {three_four_five, 1},
                {five_one_two, 1}
            };
            IDictionary<NGram, int> trigramResult = new NGramExtractor(Trigram).ExtractAsDictionary(Tokens);
            CollectionAssert.AreEquivalent(trigramCounts, trigramResult);
        }
Ejemplo n.º 5
0
 /// <summary>
 ///     Builds a dictionary around an already populated n-gram table.
 /// </summary>
 /// <param name="extractor">Extractor stored on the new instance.</param>
 /// <param name="nGrams">N-gram table stored on the new instance as-is (it is not copied).</param>
 private NGramDictionary(NGramExtractor extractor, IDictionary<NGram, int> nGrams)
 {
     this.nGrams = nGrams;
     this.extractor = extractor;
 }
Ejemplo n.º 6
0
 /// <summary>
 ///     Creates a dictionary with an empty n-gram table.
 /// </summary>
 /// <param name="extractor">Extractor stored on the new instance.</param>
 public NGramDictionary(NGramExtractor extractor)
 {
     this.nGrams = new Dictionary<NGram, int>();
     this.extractor = extractor;
 }
Ejemplo n.º 7
0
 /// <summary>
 ///     Initializes an instance from an extractor and an existing n-gram table.
 /// </summary>
 /// <param name="extractor">Extractor kept by the instance.</param>
 /// <param name="nGrams">Existing n-gram table; the same object is kept, no copy is made.</param>
 private NGramDictionary(NGramExtractor extractor, IDictionary<NGram, int> nGrams)
 {
     this.nGrams    = nGrams;
     this.extractor = extractor;
 }
Ejemplo n.º 8
0
 /// <summary>
 ///     Initializes an instance that starts with no n-grams.
 /// </summary>
 /// <param name="extractor">Extractor kept by the instance.</param>
 public NGramDictionary(NGramExtractor extractor)
 {
     this.nGrams    = new Dictionary<NGram, int>();
     this.extractor = extractor;
 }
Ejemplo n.º 9
0
 /// <summary>
 ///     Creates a model with an extractor for n-gram sizes 1 through <paramref name="nGramSize" />
 ///     and a new dictionary built on that extractor.
 /// </summary>
 /// <param name="nGramSize">Largest n-gram size; stored in maxNGramSize.</param>
 public NGramModel(int nGramSize)
 {
     maxNGramSize = nGramSize;

     // Build the extractor once and share the same instance with the dictionary.
     var sharedExtractor = new NGramExtractor(1, maxNGramSize);

     extractor       = sharedExtractor;
     nGramDictionary = new NGramDictionary(sharedExtractor);
 }
Ejemplo n.º 10
0
        /// <summary>
        ///     Computes a base-10 log score for a token sequence by summing
        ///     <c>Math.Log10(GetMLE(...))</c> over consecutive pairs of extracted n-grams.
        /// </summary>
        /// <remarks>
        ///     When maxNGramSize is 1 the work is delegated to GetSentenceProbabilityForUnigrams.
        ///     Otherwise a copy of the tokens is passed to AddStartStopSymbols, n-grams of sizes
        ///     maxNGramSize - 1 and maxNGramSize are extracted as one list, and the last element of
        ///     that list is dropped before the list is consumed two elements at a time.
        /// </remarks>
        /// <param name="tokens">The tokens of the sentence; the caller's list is not modified.</param>
        /// <returns>The sum of the base-10 logarithms of the per-pair GetMLE values.</returns>
        public double GetSentenceProbability(IList<string> tokens)
        {
            if (maxNGramSize == 1)
            {
                return GetSentenceProbabilityForUnigrams(tokens);
            }

            // Work on a copy so the start/stop symbols are not added to the caller's list.
            List<string> tokenList = tokens.ToList();
            AddStartStopSymbols(tokenList);

            var ext = new NGramExtractor(maxNGramSize - 1, maxNGramSize);
            IList<NGram> nGrams = ext.ExtractAsList(tokenList);

            // Drop the final element so the remaining entries can be read as pairs.
            // NOTE(review): presumably the list alternates (n-1)-gram, n-gram and ends with an
            // unpaired (n-1)-gram - confirm against NGramExtractor.ExtractAsList.
            nGrams.RemoveAt(nGrams.Count - 1);

            double logP = 0;
            for (int i = 0; i < nGrams.Count; i += 2)
            {
                double p = GetMLE(nGrams[i], nGrams[i + 1]);

                // Summing logarithms instead of multiplying the raw values.
                logP += Math.Log10(p);
            }

            return logP;
        }
Ejemplo n.º 11
0
 /// <summary>
 ///     Initializes the model: records the maximum n-gram size, creates an extractor for sizes
 ///     1 through that maximum, and creates a dictionary that uses the extractor.
 /// </summary>
 /// <param name="nGramSize">Maximum n-gram size for this model.</param>
 public NGramModel(int nGramSize)
 {
     maxNGramSize = nGramSize;

     // One extractor instance serves both the model and its dictionary.
     var modelExtractor = new NGramExtractor(1, maxNGramSize);

     extractor = modelExtractor;
     nGramDictionary = new NGramDictionary(modelExtractor);
 }
Ejemplo n.º 12
0
        /// <summary>
        ///     Checks the ordered output of ExtractAsList over the shared Tokens fixture for
        ///     single-size extractors and for extractors covering a range of sizes.
        /// </summary>
        public void TestExtractAsList()
        {
            // Single size: unigrams.
            var unigrams = new[] {one, two, three, four, five, one, two, three, four};
            IList<NGram> unigramResult = new NGramExtractor(Unigram).ExtractAsList(Tokens);
            CollectionAssert.AreEqual(unigrams, unigramResult);

            // Single size: bigrams.
            var bigrams = new[]
            {
                one_two, two_three, three_four, four_five, five_one, one_two, two_three, three_four
            };
            IList<NGram> bigramResult = new NGramExtractor(Bigram).ExtractAsList(Tokens);
            CollectionAssert.AreEqual(bigrams, bigramResult);

            // Single size: trigrams.
            var trigrams = new[]
            {
                one_two_three, two_three_four, three_four_five, four_five_one, five_one_two, one_two_three,
                two_three_four
            };
            IList<NGram> trigramResult = new NGramExtractor(Trigram).ExtractAsList(Tokens);
            CollectionAssert.AreEqual(trigrams, trigramResult);

            // Range: unigrams and bigrams, interleaved per starting position.
            var uniToBi = new[]
            {
                one, one_two,
                two, two_three,
                three, three_four,
                four, four_five,
                five, five_one,
                one, one_two,
                two, two_three,
                three, three_four,
                four
            };
            IList<NGram> uniToBiResult = new NGramExtractor(Unigram, Bigram).ExtractAsList(Tokens);
            CollectionAssert.AreEqual(uniToBi, uniToBiResult);

            // Range: bigrams and trigrams.
            var biToTri = new[]
            {
                one_two, one_two_three,
                two_three, two_three_four,
                three_four, three_four_five,
                four_five, four_five_one,
                five_one, five_one_two,
                one_two, one_two_three,
                two_three, two_three_four,
                three_four
            };
            IList<NGram> biToTriResult = new NGramExtractor(Bigram, Trigram).ExtractAsList(Tokens);
            CollectionAssert.AreEqual(biToTri, biToTriResult);

            // Range: unigrams through trigrams.
            var uniToTri = new[]
            {
                one, one_two, one_two_three,
                two, two_three, two_three_four,
                three, three_four, three_four_five,
                four, four_five, four_five_one,
                five, five_one, five_one_two,
                one, one_two, one_two_three,
                two, two_three, two_three_four,
                three, three_four,
                four
            };
            IList<NGram> uniToTriResult = new NGramExtractor(Unigram, Trigram).ExtractAsList(Tokens);
            CollectionAssert.AreEqual(uniToTri, uniToTriResult);
        }
Ejemplo n.º 13
0
 /// <summary>
 ///     Extracts letter bigrams and trigrams from a word containing non-ASCII characters and
 ///     checks how many n-grams come back.
 /// </summary>
 public void TestExtractLetterNGramsAsList()
 {
     var extractor = new NGramExtractor(Bigram, Trigram);
     string word = "beşiktaş";
     IEnumerable<string> tokens = word.ToCharArray().Select(x => x.ToString());
     IList<NGram> actual = extractor.ExtractAsList(tokens);

     // A sequence of n tokens yields n - 1 bigrams and n - 2 trigrams (the same relation the
     // Bigram/Trigram case in TestExtractAsList pins: 9 tokens -> 15 n-grams).
     // Previously this test asserted nothing and could only fail on an exception.
     int expectedCount = (word.Length - 1) + (word.Length - 2);
     Assert.AreEqual(expectedCount, actual.Count);
 }
Ejemplo n.º 14
0
        /// <summary>
        ///     Checks that ExtractAsSet returns each distinct unigram, bigram and trigram of the
        ///     shared Tokens fixture.
        /// </summary>
        public void TestExtractAsSet()
        {
            // Distinct unigrams.
            var distinctUnigrams = new[] {one, two, three, four, five};
            ISet<NGram> unigramResult = new NGramExtractor(Unigram).ExtractAsSet(Tokens);
            CollectionAssert.AreEquivalent(distinctUnigrams, unigramResult);

            // Distinct bigrams.
            var distinctBigrams = new[] {one_two, two_three, three_four, four_five, five_one};
            ISet<NGram> bigramResult = new NGramExtractor(Bigram).ExtractAsSet(Tokens);
            CollectionAssert.AreEquivalent(distinctBigrams, bigramResult);

            // Distinct trigrams.
            var distinctTrigrams = new[]
            {
                one_two_three, two_three_four, three_four_five, four_five_one, five_one_two
            };
            ISet<NGram> trigramResult = new NGramExtractor(Trigram).ExtractAsSet(Tokens);
            CollectionAssert.AreEquivalent(distinctTrigrams, trigramResult);
        }
Ejemplo n.º 15
-1
        /// <summary>
        ///     Creates and returns a new NGramDictionary from a string.<br />
        ///     This string must be produced by the ToString() method of a NGramDictionary object.
        /// </summary>
        /// <remarks>
        ///     The first line holds the minimum and maximum n-gram sizes separated by a tab.
        ///     Every following non-blank line holds the whitespace-separated tokens of one n-gram,
        ///     a tab, and the frequency of that n-gram. Blank lines (for example the one left behind
        ///     by a trailing newline) are skipped. When the same n-gram appears more than once,
        ///     the first frequency is kept.
        /// </remarks>
        /// <param name="str">A string produced by the ToString() method of a NGramDictionary object.</param>
        /// <returns>A new NGramDictionary object</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="str" /> is null.</exception>
        /// <exception cref="System.FormatException">The header or a data line is malformed.</exception>
        public static NGramDictionary DeserializeFrom(string str)
        {
            if (str == null)
            {
                throw new ArgumentNullException("str");
            }

            string[] lines = str.Split('\n');
            string[] header = lines[0].Split('\t');

            if (header.Length < 2)
            {
                throw new FormatException(
                    "The first line must contain the minimum and maximum n-gram sizes separated by a tab.");
            }

            // Serialized data is machine-readable, so parsing must not depend on the current culture.
            var culture = System.Globalization.CultureInfo.InvariantCulture;
            int minNGram = Int32.Parse(header[0], culture);
            int maxNGram = Int32.Parse(header[1], culture);

            IDictionary<NGram, int> nGrams = new Dictionary<NGram, int>();
            var extractor = new NGramExtractor(minNGram, maxNGram);

            foreach (string line in lines.Skip(1))
            {
                // A trailing newline (or "\r\n" line endings) leaves an empty or "\r"-only entry
                // after the split; indexing row[1] on such an entry would throw.
                if (line.Trim().Length == 0)
                {
                    continue;
                }

                string[] row = line.Split('\t');

                if (row.Length < 2)
                {
                    throw new FormatException(
                        "Expected an n-gram and its frequency separated by a tab, but found: " + line);
                }

                // Split(null) splits the n-gram text on whitespace into its tokens.
                var nGram = new NGram(row[0].Split(null));

                int freq = Int32.Parse(row[1], culture);

                // First occurrence wins; later duplicates are ignored.
                if (!nGrams.ContainsKey(nGram))
                {
                    nGrams.Add(nGram, freq);
                }
            }

            return new NGramDictionary(extractor, nGrams);
        }
Ejemplo n.º 16
-1
 /// <summary>
 ///     Creates a model whose extractor covers n-gram sizes from 1 up to <paramref name="nGramSize" />.
 /// </summary>
 /// <param name="nGramSize">Largest n-gram size; stored in maxNGramSize and used as the extractor's upper bound.</param>
 /// <param name="modelFilepath">Path of a file that is read during construction.</param>
 public NGramModel(int nGramSize, string modelFilepath)
 {
     maxNGramSize = nGramSize;

     // The extractor must be created before the dictionary that is built from it;
     // otherwise the dictionary receives the still-unassigned extractor field.
     extractor = new NGramExtractor(1, maxNGramSize);
     nGramDictionary = new NGramDictionary(extractor);

     // NOTE(review): the file is read (so an invalid path still throws here), but its lines are
     // currently discarded because the AddAll call below is disabled.
     string[] lines = File.ReadAllLines(modelFilepath);
     //AddAll(lines);
 }