/// <summary>
/// Creates and returns a new NGramDictionary from a string.<br />
/// This string must be produced by the ToString() method of a NGramDictionary object.
/// </summary>
/// <remarks>
/// The first line holds the minimum and maximum n-gram sizes separated by a tab.
/// Every following non-blank line holds an n-gram (tokens separated by white-space),
/// a tab, and the frequency of that n-gram. Blank lines (for example the one produced
/// by a trailing line break) are ignored. When an n-gram occurs more than once,
/// only its first occurrence is kept.
/// </remarks>
/// <param name="str">A string produced by the ToString() method of a NGramDictionary object.</param>
/// <returns>A new NGramDictionary object</returns>
/// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
/// <exception cref="FormatException">The header or a row is malformed, or a number cannot be parsed.</exception>
public static NGramDictionary DeserializeFrom(string str)
{
    if (str == null)
    {
        throw new ArgumentNullException("str");
    }

    string[] lines = str.Split('\n');

    // Split the header only once; TrimEnd removes the '\r' of a Windows line ending.
    string[] header = lines[0].TrimEnd('\r').Split('\t');
    if (header.Length < 2)
    {
        throw new FormatException("The first line must contain the minimum and maximum n-gram sizes separated by a tab.");
    }

    int minNGram = Int32.Parse(header[0]);
    int maxNGram = Int32.Parse(header[1]);

    var extractor = new NGramExtractor(minNGram, maxNGram);
    IDictionary<NGram, int> nGrams = new Dictionary<NGram, int>();

    foreach (string rawLine in lines.Skip(1))
    {
        // A trailing line break yields an empty last element; indexing row[1]
        // on such a line used to throw IndexOutOfRangeException.
        if (rawLine.Trim().Length == 0)
        {
            continue;
        }

        string[] row = rawLine.TrimEnd('\r').Split('\t');
        if (row.Length < 2)
        {
            throw new FormatException("Each row must contain an n-gram and its frequency separated by a tab: '" + rawLine + "'");
        }

        // A null separator array makes Split break the text on white-space characters.
        var nGram = new NGram(row[0].Split((char[]) null));
        int freq = Int32.Parse(row[1]);

        if (!nGrams.ContainsKey(nGram))
        {
            nGrams.Add(nGram, freq);
        }
    }

    return new NGramDictionary(extractor, nGrams);
}
/// <summary>
/// Initializes a model for n-grams of sizes 1 to <paramref name="nGramSize"/> and reads
/// the file at <paramref name="modelFilepath"/>.
/// </summary>
/// <param name="nGramSize">The largest n-gram size handled by the model.</param>
/// <param name="modelFilepath">Path of the file that is read during construction.</param>
public NGramModel(int nGramSize, string modelFilepath)
{
    maxNGramSize = nGramSize;
    extractor = new NGramExtractor(1, maxNGramSize);

    // The dictionary has to be created after the extractor has been assigned;
    // creating it first handed it an extractor that was not initialized by this constructor.
    nGramDictionary = new NGramDictionary(extractor);

    // TODO: the lines are read but not yet added to the model (the AddAll(lines) call is disabled).
    // The read is kept so that an unreadable path still fails during construction.
    string[] lines = File.ReadAllLines(modelFilepath);
    //AddAll(lines);
}
/// <summary>
/// Computes the base-10 logarithm of the probability of a token sequence.
/// </summary>
/// <remarks>
/// When the model size is 1 the work is delegated to GetSentenceProbabilityForUnigrams.
/// Otherwise the extracted n-grams are consumed in consecutive pairs and the
/// Math.Log10 of each GetMLE value is summed.
/// </remarks>
/// <param name="tokens">The tokens of the sentence; the list itself is not modified here.</param>
/// <returns>The sum of the base-10 logarithms of the pairwise estimates.</returns>
public double GetSentenceProbability(IList<string> tokens)
{
    if (maxNGramSize == 1)
    {
        return GetSentenceProbabilityForUnigrams(tokens);
    }

    // ToList() creates a copy, so AddStartStopSymbols works on the copy only.
    List<string> padded = tokens.ToList();
    AddStartStopSymbols(padded);

    var pairExtractor = new NGramExtractor(maxNGramSize - 1, maxNGramSize);
    IList<NGram> sequence = pairExtractor.ExtractAsList(padded);

    // The last extracted element is discarded before pairing.
    // NOTE(review): presumably it is a lower-order n-gram without a partner - confirm.
    sequence.RemoveAt(sequence.Count - 1);

    double total = 0;
    int index = 0;
    while (index < sequence.Count)
    {
        NGram first = sequence[index];
        NGram second = sequence[index + 1];
        total += Math.Log10(GetMLE(first, second));
        index += 2;
    }

    return total;
}
/// <summary>
/// Checks the n-gram frequencies returned by ExtractAsDictionary for
/// unigram, bigram and trigram extractors over the shared Tokens fixture.
/// </summary>
public void TestExtractAsDictionary()
{
    // Unigrams
    IDictionary<NGram, int> unigramCounts = new NGramExtractor(Unigram).ExtractAsDictionary(Tokens);
    var expectedUnigramCounts = new Dictionary<NGram, int>
    {
        {one, 2},
        {two, 2},
        {three, 2},
        {four, 2},
        {five, 1}
    };
    CollectionAssert.AreEquivalent(expectedUnigramCounts, unigramCounts);

    // Bigrams
    IDictionary<NGram, int> bigramCounts = new NGramExtractor(Bigram).ExtractAsDictionary(Tokens);
    var expectedBigramCounts = new Dictionary<NGram, int>
    {
        {one_two, 2},
        {two_three, 2},
        {three_four, 2},
        {four_five, 1},
        {five_one, 1}
    };
    CollectionAssert.AreEquivalent(expectedBigramCounts, bigramCounts);

    // Trigrams
    IDictionary<NGram, int> trigramCounts = new NGramExtractor(Trigram).ExtractAsDictionary(Tokens);
    var expectedTrigramCounts = new Dictionary<NGram, int>
    {
        {one_two_three, 2},
        {two_three_four, 2},
        {four_five_one, 1},
        {three_four_five, 1},
        {five_one_two, 1}
    };
    CollectionAssert.AreEquivalent(expectedTrigramCounts, trigramCounts);
}
/// <summary>
/// Initializes a dictionary around an existing n-gram frequency table.
/// Both arguments are stored by reference; nothing is copied.
/// </summary>
/// <param name="extractor">The extractor assigned to this dictionary.</param>
/// <param name="nGrams">The n-gram to frequency table used as backing store.</param>
private NGramDictionary(NGramExtractor extractor, IDictionary<NGram, int> nGrams)
{
    this.nGrams = nGrams;
    this.extractor = extractor;
}
/// <summary>
/// Initializes an empty dictionary that uses the given extractor.
/// </summary>
/// <param name="extractor">The extractor assigned to this dictionary.</param>
public NGramDictionary(NGramExtractor extractor)
{
    // Start with an empty n-gram to frequency table.
    this.nGrams = new Dictionary<NGram, int>();
    this.extractor = extractor;
}
/// <summary>
/// Creates a dictionary backed by the supplied n-gram frequency table.
/// The table and the extractor are kept as given, without copying.
/// </summary>
/// <param name="extractor">The extractor stored in this dictionary.</param>
/// <param name="nGrams">The table mapping each n-gram to its frequency.</param>
private NGramDictionary(NGramExtractor extractor, IDictionary<NGram, int> nGrams)
{
    this.nGrams = nGrams;
    this.extractor = extractor;
}
/// <summary>
/// Creates a dictionary without any n-grams for the given extractor.
/// </summary>
/// <param name="extractor">The extractor stored in this dictionary.</param>
public NGramDictionary(NGramExtractor extractor)
{
    // A fresh, empty frequency table is allocated for every instance.
    this.nGrams = new Dictionary<NGram, int>();
    this.extractor = extractor;
}
/// <summary>
/// Initializes an empty model for n-grams of sizes 1 to <paramref name="nGramSize"/>.
/// </summary>
/// <param name="nGramSize">The largest n-gram size handled by the model.</param>
public NGramModel(int nGramSize)
{
    maxNGramSize = nGramSize;

    // The extractor is created first because the dictionary is constructed from it.
    var sizeRangeExtractor = new NGramExtractor(1, maxNGramSize);
    extractor = sizeRangeExtractor;
    nGramDictionary = new NGramDictionary(extractor);
}
/// <summary>
/// Returns the base-10 log-probability of the given token sequence.
/// </summary>
/// <remarks>
/// A model of size 1 delegates to GetSentenceProbabilityForUnigrams. For larger sizes,
/// n-grams of sizes (maxNGramSize - 1) and maxNGramSize are extracted from a copy of the
/// tokens, the last extracted element is removed, and the remaining elements are
/// evaluated two at a time with GetMLE.
/// </remarks>
/// <param name="tokens">The tokens of the sentence.</param>
/// <returns>The accumulated Math.Log10 of the GetMLE value of every pair.</returns>
public double GetSentenceProbability(IList<string> tokens)
{
    if (maxNGramSize == 1)
    {
        return GetSentenceProbabilityForUnigrams(tokens);
    }

    // AddStartStopSymbols receives a copy, so the caller's list is left alone.
    List<string> workingTokens = tokens.ToList();
    AddStartStopSymbols(workingTokens);

    int contextSize = maxNGramSize - 1;
    IList<NGram> extracted = new NGramExtractor(contextSize, maxNGramSize).ExtractAsList(workingTokens);
    extracted.RemoveAt(extracted.Count - 1);

    double logProbability = 0;
    for (int pairStart = 0; pairStart < extracted.Count; pairStart += 2)
    {
        double estimate = GetMLE(extracted[pairStart], extracted[pairStart + 1]);
        logProbability += Math.Log10(estimate);
    }

    return logProbability;
}
/// <summary>
/// Checks the ordered output of ExtractAsList over the shared Tokens fixture,
/// both for single n-gram sizes and for size ranges.
/// </summary>
public void TestExtractAsList()
{
    // Unigrams only
    IList<NGram> unigrams = new NGramExtractor(Unigram).ExtractAsList(Tokens);
    var expectedUnigrams = new[]
    {
        one, two, three, four, five, one, two, three, four
    };
    CollectionAssert.AreEqual(expectedUnigrams, unigrams);

    // Bigrams only
    IList<NGram> bigrams = new NGramExtractor(Bigram).ExtractAsList(Tokens);
    var expectedBigrams = new[]
    {
        one_two, two_three, three_four, four_five, five_one, one_two, two_three, three_four
    };
    CollectionAssert.AreEqual(expectedBigrams, bigrams);

    // Trigrams only
    IList<NGram> trigrams = new NGramExtractor(Trigram).ExtractAsList(Tokens);
    var expectedTrigrams = new[]
    {
        one_two_three, two_three_four, three_four_five, four_five_one, five_one_two,
        one_two_three, two_three_four
    };
    CollectionAssert.AreEqual(expectedTrigrams, trigrams);

    // Unigrams to bigrams
    IList<NGram> uniToBi = new NGramExtractor(Unigram, Bigram).ExtractAsList(Tokens);
    var expectedUniToBi = new[]
    {
        one, one_two,
        two, two_three,
        three, three_four,
        four, four_five,
        five, five_one,
        one, one_two,
        two, two_three,
        three, three_four,
        four
    };
    CollectionAssert.AreEqual(expectedUniToBi, uniToBi);

    // Bigrams to trigrams
    IList<NGram> biToTri = new NGramExtractor(Bigram, Trigram).ExtractAsList(Tokens);
    var expectedBiToTri = new[]
    {
        one_two, one_two_three,
        two_three, two_three_four,
        three_four, three_four_five,
        four_five, four_five_one,
        five_one, five_one_two,
        one_two, one_two_three,
        two_three, two_three_four,
        three_four
    };
    CollectionAssert.AreEqual(expectedBiToTri, biToTri);

    // Unigrams to trigrams
    IList<NGram> uniToTri = new NGramExtractor(Unigram, Trigram).ExtractAsList(Tokens);
    var expectedUniToTri = new[]
    {
        one, one_two, one_two_three,
        two, two_three, two_three_four,
        three, three_four, three_four_five,
        four, four_five, four_five_one,
        five, five_one, five_one_two,
        one, one_two, one_two_three,
        two, two_three, two_three_four,
        three, three_four,
        four
    };
    CollectionAssert.AreEqual(expectedUniToTri, uniToTri);
}
/// <summary>
/// Extracts letter bigrams and trigrams from a word that contains non-ASCII
/// characters and checks how many n-grams are produced.
/// </summary>
public void TestExtractLetterNGramsAsList()
{
    var extractor = new NGramExtractor(Bigram, Trigram);

    // Materialized once so that the expected count and the extraction use the same letters.
    List<string> letters = "beşiktaş".ToCharArray().Select(x => x.ToString()).ToList();
    IEnumerable<string> tokens = letters;

    IList<NGram> actual = extractor.ExtractAsList(tokens);

    // A sequence of k tokens contains (k - 1) bigrams and (k - 2) trigrams.
    // Previously the result was never inspected, so the test could not fail on wrong output.
    int expectedCount = (letters.Count - 1) + (letters.Count - 2);
    Assert.That(actual.Count, Is.EqualTo(expectedCount));
}
/// <summary>
/// Checks the distinct n-grams returned by ExtractAsSet for unigram,
/// bigram and trigram extractors over the shared Tokens fixture.
/// </summary>
public void TestExtractAsSet()
{
    // Unigrams
    ISet<NGram> unigramSet = new NGramExtractor(Unigram).ExtractAsSet(Tokens);
    var expectedUnigrams = new[] {one, two, three, four, five};
    CollectionAssert.AreEquivalent(expectedUnigrams, unigramSet);

    // Bigrams
    ISet<NGram> bigramSet = new NGramExtractor(Bigram).ExtractAsSet(Tokens);
    var expectedBigrams = new[]
    {
        one_two, two_three, three_four, four_five, five_one
    };
    CollectionAssert.AreEquivalent(expectedBigrams, bigramSet);

    // Trigrams
    ISet<NGram> trigramSet = new NGramExtractor(Trigram).ExtractAsSet(Tokens);
    var expectedTrigrams = new[]
    {
        one_two_three, two_three_four, three_four_five, four_five_one, five_one_two
    };
    CollectionAssert.AreEquivalent(expectedTrigrams, trigramSet);
}
/// <summary>
/// Creates and returns a new NGramDictionary from a string.<br />
/// This string must be produced by the ToString() method of a NGramDictionary object.
/// </summary>
/// <remarks>
/// Line one carries the minimum and maximum n-gram sizes, separated by a tab.
/// Each later non-blank line carries an n-gram (white-space separated tokens), a tab,
/// and the frequency of the n-gram. Blank lines, such as the one left behind by a
/// trailing line break, are skipped. For repeated n-grams the first entry wins.
/// </remarks>
/// <param name="str">A string produced by the ToString() method of a NGramDictionary object.</param>
/// <returns>A new NGramDictionary object</returns>
/// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
/// <exception cref="FormatException">The header or a row is malformed, or a number cannot be parsed.</exception>
public static NGramDictionary DeserializeFrom(string str)
{
    if (str == null)
    {
        throw new ArgumentNullException("str");
    }

    string[] lines = str.Split('\n');

    // The header is split a single time; a '\r' left over from "\r\n" is removed first.
    string[] sizes = lines[0].TrimEnd('\r').Split('\t');
    if (sizes.Length < 2)
    {
        throw new FormatException("The first line must contain the minimum and maximum n-gram sizes separated by a tab.");
    }

    int minNGram = Int32.Parse(sizes[0]);
    int maxNGram = Int32.Parse(sizes[1]);

    IDictionary<NGram, int> nGrams = new Dictionary<NGram, int>();
    var extractor = new NGramExtractor(minNGram, maxNGram);

    foreach (string entry in lines.Skip(1))
    {
        // Without this check an empty line reached row[1] and raised IndexOutOfRangeException.
        if (entry.Trim().Length == 0)
        {
            continue;
        }

        string[] row = entry.TrimEnd('\r').Split('\t');
        if (row.Length < 2)
        {
            throw new FormatException("Each row must contain an n-gram and its frequency separated by a tab: '" + entry + "'");
        }

        // Passing a null char array splits the n-gram text on white-space characters.
        var nGram = new NGram(row[0].Split((char[]) null));
        int freq = Int32.Parse(row[1]);

        if (!nGrams.ContainsKey(nGram))
        {
            nGrams.Add(nGram, freq);
        }
    }

    return new NGramDictionary(extractor, nGrams);
}