/// <summary>
/// Creates and returns a new NGramDictionary from a string.<br />
/// This string must be produced by the ToString() method of a NGramDictionary object.
/// </summary>
/// <remarks>
/// The first line holds the minimum and maximum n-gram size, separated by a tab.
/// Every following line holds the whitespace-separated tokens of one n-gram, a tab
/// and the frequency of that n-gram. Blank lines (for example the one that results
/// from a trailing line break) are skipped. If the same n-gram appears on several
/// lines, the frequency of its first occurrence is kept.
/// </remarks>
/// <param name="str">A string produced by the ToString() method of a NGramDictionary object.</param>
/// <returns>A new NGramDictionary object</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="str"/> is null.</exception>
public static NGramDictionary DeserializeFrom(string str)
{
    if (str == null)
    {
        throw new ArgumentNullException("str");
    }

    string[] lines = str.Split('\n');

    // Header line: "<min>\t<max>". Split it once and reuse the parts.
    string[] header = lines[0].Split('\t');
    int minNGram = Int32.Parse(header[0]);
    int maxNGram = Int32.Parse(header[1]);

    IDictionary<NGram, int> nGrams = new Dictionary<NGram, int>();
    var extractor = new NGramExtractor(minNGram, maxNGram);

    foreach (string line in lines.Skip(1))
    {
        // A trailing line break leaves an empty (or "\r"-only) last element that has
        // no tab-separated frequency column; indexing row[1] on it would throw.
        if (line.Trim().Length == 0)
        {
            continue;
        }

        string[] row = line.Split('\t');

        // The explicit cast selects the char[] overload (split on any white space);
        // a bare null literal is ambiguous between Split overloads on newer runtimes.
        var nGram = new NGram(row[0].Split((char[]) null));
        int freq = Int32.Parse(row[1]);

        if (!nGrams.ContainsKey(nGram))
        {
            nGrams.Add(nGram, freq);
        }
    }

    return new NGramDictionary(extractor, nGrams);
}
/// <summary>
/// Looks up the frequency of the n-gram formed by the given tokens.
/// </summary>
/// <param name="nGramTokens">Tokens of the n-gram</param>
/// <returns>The value returned by the NGram-based GetFrequency overload for these tokens.</returns>
public int GetFrequency(params string[] nGramTokens)
{
    // Validation runs before the tokens are wrapped in an NGram.
    Validate(nGramTokens);

    return GetFrequency(new NGram(nGramTokens));
}
/// <summary>
/// Returns the frequency of the n-gram.
/// </summary>
/// <param name="nGram">The n-gram to look up.</param>
/// <returns>The stored frequency, or 0 if the n-gram is not contained in the dictionary.</returns>
public int GetFrequency(NGram nGram)
{
    // One lookup via TryGetValue instead of ContainsKey followed by the indexer.
    int frequency;
    return nGrams.TryGetValue(nGram, out frequency) ? frequency : 0;
}
/// <summary>
/// Builds the n-grams that start at <paramref name="index"/>, one for every window
/// size from minNGramSize up to maxNGramSize, limited by the number of tokens that
/// remain from <paramref name="index"/> onwards.
/// </summary>
/// <param name="tokens">The token sequence to take the n-grams from.</param>
/// <param name="index">Position of the first token of every returned n-gram.</param>
/// <returns>The n-grams in order of increasing window size; empty if no window size fits.</returns>
private IEnumerable<NGram> GetNGrams(IList<string> tokens, int index)
{
    var result = new List<NGram>();

    // The window can never extend past the end of the token list.
    int remainingTokens = tokens.Count() - index;
    int largestWindow = Math.Min(remainingTokens, maxNGramSize);

    int size = minNGramSize;
    while (size <= largestWindow)
    {
        result.Add(GetNGram(tokens, index, size));
        size++;
    }

    return result;
}
/// <summary>
/// Divides the dictionary frequency of <paramref name="nominatorNGram"/> by the
/// dictionary frequency of <paramref name="denominatorNGram"/>.
/// </summary>
/// <param name="denominatorNGram">N-gram whose frequency is the divisor.</param>
/// <param name="nominatorNGram">N-gram whose frequency is the dividend.</param>
/// <returns>The frequency ratio, or 0 when the divisor frequency is 0.</returns>
public double GetMLE(NGram denominatorNGram, NGram nominatorNGram)
{
    int dividend = nGramDictionary.GetFrequency(nominatorNGram);
    int divisor = nGramDictionary.GetFrequency(denominatorNGram);

    // A zero divisor frequency yields 0 rather than a division by zero.
    return divisor != 0 ? (double) dividend / divisor : 0;
}
/// <summary>
/// Computes a log-probability score for a sentence: the sum of the base-10 logarithms
/// of the MLE values of consecutive n-gram pairs extracted from the tokens.
/// When maxNGramSize is 1 the computation is delegated to
/// GetSentenceProbabilityForUnigrams.
/// </summary>
/// <param name="tokens">The tokens of the sentence. The list itself is not passed on; a copy is used.</param>
/// <returns>
/// The summed base-10 logarithms. A pair whose MLE is 0 contributes
/// Math.Log10(0), i.e. negative infinity.
/// </returns>
public double GetSentenceProbability(IList<string> tokens)
{
    if (maxNGramSize == 1)
    {
        return GetSentenceProbabilityForUnigrams(tokens);
    }

    // ToList() copies the tokens; the start/stop symbols are added to the copy.
    List<string> tokenList = tokens.ToList();
    AddStartStopSymbols(tokenList);

    var ext = new NGramExtractor(maxNGramSize - 1, maxNGramSize);
    IList<NGram> nGrams = ext.ExtractAsList(tokenList);

    // NOTE(review): presumably the extractor emits, per start position, the
    // (n-1)-gram followed by the n-gram, and the final element is an (n-1)-gram
    // without a matching n-gram - confirm against NGramExtractor.
    nGrams.RemoveAt(nGrams.Count - 1);

    double logP = 0;
    for (int i = 0; i < nGrams.Count; i += 2)
    {
        // nGrams[i] is passed as the denominator n-gram, nGrams[i + 1] as the nominator.
        double p = GetMLE(nGrams[i], nGrams[i + 1]);
        logP += Math.Log10(p);
    }

    return logP;
}
/// <summary>
/// Divides the dictionary frequency of the n-gram by tokenCount.
/// </summary>
/// <param name="nGram">The n-gram whose frequency is looked up.</param>
/// <returns>The frequency divided by tokenCount, computed in floating point.</returns>
public double GetUnigramMLE(NGram nGram)
{
    int frequency = nGramDictionary.GetFrequency(nGram);
    double total = (double) tokenCount;

    return frequency / total;
}
/// <summary>
/// Determines whether another n-gram has the same tokens in the same order.
/// </summary>
/// <param name="other">The n-gram to compare with; may be null.</param>
/// <returns>
/// true if <paramref name="other"/> is not null and its Tokens are sequence-equal
/// to the Tokens of this instance; otherwise false.
/// </returns>
private bool Equals(NGram other)
{
    // Equality with null is false by contract. ReferenceEquals is used so that
    // the check does not depend on any == operator defined for NGram.
    if (ReferenceEquals(other, null))
    {
        return false;
    }

    return Tokens.SequenceEqual(other.Tokens);
}
/// <summary>
/// Creates and returns a new NGramDictionary from a string.<br />
/// This string must be produced by the ToString() method of a NGramDictionary object.
/// </summary>
/// <remarks>
/// Line one contains the minimum and the maximum n-gram size, tab-separated.
/// Each later line contains the white-space-separated tokens of an n-gram, a tab,
/// and the n-gram's frequency. Lines that are empty or consist only of white space
/// (such as the remainder after a final line break) are ignored. When an n-gram is
/// listed more than once, only its first frequency is stored.
/// </remarks>
/// <param name="str">A string produced by the ToString() method of a NGramDictionary object.</param>
/// <returns>A new NGramDictionary object</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="str"/> is null.</exception>
public static NGramDictionary DeserializeFrom(string str)
{
    if (str == null)
    {
        throw new ArgumentNullException("str");
    }

    string[] lines = str.Split('\n');

    // Parse the "<min>\t<max>" header from a single split.
    string[] sizes = lines[0].Split('\t');
    int minNGram = Int32.Parse(sizes[0]);
    int maxNGram = Int32.Parse(sizes[1]);

    IDictionary<NGram, int> nGrams = new Dictionary<NGram, int>();
    var extractor = new NGramExtractor(minNGram, maxNGram);

    foreach (string line in lines.Skip(1))
    {
        // Without this check a blank line (e.g. after a trailing '\n') has no
        // frequency column and row[1] below would be out of range.
        if (line.Trim().Length == 0)
        {
            continue;
        }

        string[] row = line.Split('\t');

        // Cast the null separator so the white-space-splitting char[] overload is
        // chosen unambiguously on every framework version.
        var nGram = new NGram(row[0].Split((char[]) null));
        int freq = Int32.Parse(row[1]);

        if (!nGrams.ContainsKey(nGram))
        {
            nGrams.Add(nGram, freq);
        }
    }

    return new NGramDictionary(extractor, nGrams);
}
/// <summary>
/// Checks equality and hash codes of NGram instances: same tokens in the same order
/// must be equal with equal hash codes, and a different token order must be unequal.
/// Both the list-based and the params-based constructor are exercised.
/// </summary>
public void TestEquals()
{
    // List constructor, identical token order.
    var sameListA = new NGram(new List<string> {"one", "two", "three"});
    var sameListB = new NGram(new List<string> {"one", "two", "three"});
    Assert.AreEqual(sameListA, sameListB);
    Assert.AreEqual(sameListA.GetHashCode(), sameListB.GetHashCode());

    // List constructor, first two tokens swapped.
    // NOTE(review): differing hash codes for unequal objects are not guaranteed by
    // the general GetHashCode contract; this presumably pins the NGram hash being
    // order-sensitive - confirm.
    var swappedList = new NGram(new List<string> {"two", "one", "three"});
    var orderedList = new NGram(new List<string> {"one", "two", "three"});
    Assert.AreNotEqual(swappedList, orderedList);
    Assert.AreNotEqual(swappedList.GetHashCode(), orderedList.GetHashCode());

    // Params constructor, identical token order.
    var sameParamsA = new NGram("one", "two", "three");
    var sameParamsB = new NGram("one", "two", "three");
    Assert.AreEqual(sameParamsA, sameParamsB);
    Assert.AreEqual(sameParamsA.GetHashCode(), sameParamsB.GetHashCode());

    // Params constructor, first two tokens swapped.
    var swappedParams = new NGram("two", "one", "three");
    var orderedParams = new NGram("one", "two", "three");
    Assert.AreNotEqual(swappedParams, orderedParams);
    Assert.AreNotEqual(swappedParams.GetHashCode(), orderedParams.GetHashCode());
}
/// <summary>
/// Returns the ratio of two dictionary frequencies: that of
/// <paramref name="nominatorNGram"/> over that of <paramref name="denominatorNGram"/>.
/// </summary>
/// <param name="denominatorNGram">N-gram supplying the frequency to divide by.</param>
/// <param name="nominatorNGram">N-gram supplying the frequency to be divided.</param>
/// <returns>The ratio as a double; 0 if the frequency to divide by is 0.</returns>
public double GetMLE(NGram denominatorNGram, NGram nominatorNGram)
{
    int upperCount = nGramDictionary.GetFrequency(nominatorNGram);
    int lowerCount = nGramDictionary.GetFrequency(denominatorNGram);

    double ratio = 0;
    if (lowerCount != 0)
    {
        // Promote to double before dividing so the result is not truncated.
        ratio = upperCount / (double) lowerCount;
    }

    return ratio;
}
/// <summary>
/// Returns the frequency of the n-gram.
/// </summary>
/// <param name="nGram">The n-gram whose frequency is requested.</param>
/// <returns>The frequency stored for the n-gram, or 0 if there is no entry for it.</returns>
public int GetFrequency(NGram nGram)
{
    // TryGetValue performs the existence check and the read in a single lookup.
    int storedFrequency;
    if (nGrams.TryGetValue(nGram, out storedFrequency))
    {
        return storedFrequency;
    }

    return 0;
}
/// <summary>
/// Returns the frequency of the n-gram that is made up of the given tokens.
/// </summary>
/// <param name="nGramTokens">Tokens of the n-gram</param>
/// <returns>The frequency reported by the NGram-based GetFrequency overload.</returns>
public int GetFrequency(params string[] nGramTokens)
{
    // The tokens are validated first, then wrapped and looked up.
    Validate(nGramTokens);

    NGram key = new NGram(nGramTokens);
    int frequency = GetFrequency(key);
    return frequency;
}
/// <summary>
/// Compares this n-gram with another one token by token.
/// </summary>
/// <param name="other">The n-gram to compare with; may be null.</param>
/// <returns>
/// false if <paramref name="other"/> is null; otherwise whether the Tokens of both
/// n-grams contain equal elements in the same order.
/// </returns>
private bool Equals(NGram other)
{
    // Guard against null so the comparison returns false instead of throwing.
    // ReferenceEquals is independent of any == operator defined for NGram.
    return !ReferenceEquals(other, null) && Tokens.SequenceEqual(other.Tokens);
}
/// <summary>
/// Returns the dictionary frequency of the n-gram divided by tokenCount.
/// </summary>
/// <param name="nGram">The n-gram to look up in the dictionary.</param>
/// <returns>The quotient of the frequency and tokenCount as a double.</returns>
public double GetUnigramMLE(NGram nGram)
{
    // Convert the divisor to double so the division is not an integer division.
    double divisor = (double) tokenCount;
    return nGramDictionary.GetFrequency(nGram) / divisor;
}