/// <summary>
/// Scores <paramref name="sample"/> by summing the natural logs of the smoothed
/// probabilities of each of its n-grams and converting the sum back with <c>Math.Exp</c>.
/// </summary>
/// <param name="sample">The token sequence to score.</param>
/// <returns>
/// <c>0</c> when the model is empty or the log-sum is NaN; the log-sum itself when its
/// magnitude does not exceed 0.000001 (e.g. when no n-gram was visited); otherwise
/// the exponential of the log-sum.
/// </returns>
public double CalculateProbability(StringList sample)
{
    if (Count <= 0)
    {
        return 0d;
    }

    var logSum = 0d;
    foreach (var ngram in NGramUtils.GetNGrams(sample, n))
    {
        var history = NGramUtils.GetNMinusOneTokenFirst(ngram);

        // Models with more than 1,000,000 entries use stupid backoff;
        // smaller ones use Laplace smoothing.
        var score = Count > 1000000
            ? GetStupidBackoffProbability(ngram, history)
            : GetLaplaceSmoothingProbability(ngram, history);

        logSum += Math.Log(score);
    }

    if (double.IsNaN(logSum))
    {
        return 0d;
    }

    // A log-sum that is (almost) zero is returned unchanged instead of being exponentiated.
    return Math.Abs(logSum) > 0.000001 ? Math.Exp(logSum) : logSum;
}
/// <summary>
/// Verifies how many n-grams <c>NGramUtils.GetNGrams</c> yields for a four-token
/// sentence: three for size 2 and two for size 3.
/// </summary>
public void TestGetNGrams()
{
    var bigrams = NGramUtils.GetNGrams(new StringList("I", "saw", "brown", "fox"), 2);

    Assert.That(bigrams.Count, Is.EqualTo(3));

    var trigrams = NGramUtils.GetNGrams(new StringList("I", "saw", "brown", "fox"), 3);

    Assert.That(trigrams.Count, Is.EqualTo(2));
}
/// <summary>
/// Verifies trigram linear interpolation for ("N", "V", "STOP") on a corpus made of
/// two identical sentences, with all three lambdas set to 1/3.
/// </summary>
public void TestLinearInterpolation2()
{
    var set = new List<StringList>
    {
        new StringList("D", "N", "V", "STOP"),
        new StringList("D", "N", "V", "STOP")
    };
    var lambda = 1d / 3d;

    var d = NGramUtils.CalculateTrigramLinearInterpolationProbability("N", "V", "STOP", set, lambda, lambda, lambda);

    // Expected value, assuming the same maximum-likelihood estimates that produce the
    // sibling test's 0.5714...: (1/3) * (trigram 2/2 + bigram 2/2 + unigram 2/8) = 0.75.
    // The previous check (1 +/- 0.75) accepted any result between 0.25 and 1.75.
    Assert.That(d, Is.EqualTo(0.75d).Within(0.000000000001));
}
/// <summary>
/// Verifies trigram linear interpolation for ("the", "green", "book") on a
/// four-sentence corpus, with all three lambdas set to 1/3.
/// </summary>
public void TestLinearInterpolation()
{
    var corpus = new List<StringList>();
    corpus.Add(new StringList("the", "green", "book", "STOP"));
    corpus.Add(new StringList("my", "blue", "book", "STOP"));
    corpus.Add(new StringList("his", "green", "house", "STOP"));
    corpus.Add(new StringList("book", "STOP"));

    var weight = 1d / 3d;

    var interpolated = NGramUtils.CalculateTrigramLinearInterpolationProbability(
        "the", "green", "book", corpus, weight, weight, weight);

    Assert.That(interpolated, Is.EqualTo(0.5714285714285714d).Within(0.000000000001));
}
/// <summary>
/// Verifies <c>NGramUtils.CalculateNgramMLProbability</c> for two trigrams against a
/// small corpus of sentence-delimited token lists (including one list holding only
/// an empty string).
/// </summary>
public void TestNgramMLProbability()
{
    var corpus = new List<StringList>();
    corpus.Add(new StringList("<s>", "I", "am", "Sam", "</s>"));
    corpus.Add(new StringList("<s>", "Sam", "I", "am", "</s>"));
    corpus.Add(new StringList("<s>", "I", "do", "not", "like", "green", "eggs", "and", "ham", "</s>"));
    corpus.Add(new StringList(""));

    var iAmSam = NGramUtils.CalculateNgramMLProbability(new StringList("I", "am", "Sam"), corpus);

    Assert.That(iAmSam, Is.EqualTo(0.5d).Within(0.00001));

    var samIAm = NGramUtils.CalculateNgramMLProbability(new StringList("Sam", "I", "am"), corpus);

    Assert.That(samIAm, Is.EqualTo(1d).Within(0.00001));
}
/// <summary>
/// Computes a stupid-backoff score for <paramref name="ngram"/>.
/// </summary>
/// <param name="ngram">The n-gram being scored.</param>
/// <param name="nMinusOneToken">The context passed by the caller for <paramref name="ngram"/>; may be null or empty.</param>
/// <returns>
/// The n-gram count divided by the model's <c>Count</c> when there is no context;
/// the n-gram count divided by the context count when the n-gram has been seen;
/// otherwise 0.4 times the score of the n-gram returned by
/// <c>NGramUtils.GetNMinusOneTokenLast</c>.
/// </returns>
private double GetStupidBackoffProbability(StringList ngram, StringList nMinusOneToken)
{
    var ngramCount = GetCount(ngram);

    // No context left: fall back to the relative frequency over the whole model.
    if (nMinusOneToken == null || nMinusOneToken.Count == 0)
    {
        return (double)ngramCount / Count;
    }

    if (ngramCount <= 0)
    {
        // Unseen n-gram: back off to the reduced n-gram, discounted by 0.4.
        var reduced = NGramUtils.GetNMinusOneTokenLast(ngram);
        var reducedContext = NGramUtils.GetNMinusOneTokenFirst(reduced);
        return 0.4d * GetStupidBackoffProbability(reduced, reducedContext);
    }

    // Seen n-gram: maximum likelihood probability.
    var contextCount = (double)GetCount(nMinusOneToken);
    return ngramCount / contextCount;
}
/// <summary>
/// Computes a perplexity figure for <paramref name="lm"/> over <paramref name="testSet"/>.
/// </summary>
/// <remarks>
/// The reciprocals of the model's probability for every n-gram of every sentence are
/// multiplied together; the result is e raised to the natural log of that product
/// divided by the number of sentences in the test set.
/// </remarks>
/// <param name="lm">The language model used to score each n-gram.</param>
/// <param name="testSet">The sentences to evaluate.</param>
/// <param name="ngramSize">The n-gram size passed to <c>NGramUtils.GetNGrams</c>.</param>
/// <returns>
/// The perplexity, or <see cref="double.PositiveInfinity"/> when the log of the
/// accumulated product is infinite or NaN.
/// </returns>
public static double GetPerplexity(ILanguageModel lm, IList<StringList> testSet, int ngramSize)
{
    var product = new BigDecimal(1d);

    foreach (var sentence in testSet)
    {
        foreach (var ngram in NGramUtils.GetNGrams(sentence, ngramSize))
        {
            var score = lm.CalculateProbability(ngram);
            var reciprocal = new BigDecimal(1d).divide(new BigDecimal(score), MathContext.DECIMAL128);
            product = product.multiply(reciprocal);
        }
    }

    var logProduct = Math.Log(product.doubleValue());
    if (double.IsNaN(logProduct) || double.IsInfinity(logProduct))
    {
        // over/underflow -> too high perplexity
        return double.PositiveInfinity;
    }

    var exponent = new BigDecimal(logProduct)
        .divide(new BigDecimal(testSet.Count), MathContext.DECIMAL128)
        .doubleValue();

    return Math.Pow(Math.E, exponent);
}