/// <summary>
/// Calculates the probability of <paramref name="sample"/> by summing the
/// log-probabilities of its n-grams and exponentiating the total.
/// </summary>
/// <param name="sample">The token sequence to score.</param>
/// <returns>
/// <c>0</c> when <c>Count</c> is not positive, when <paramref name="sample"/>
/// yields no n-grams, or when the accumulated log-probability is NaN;
/// otherwise <c>exp</c> of the summed n-gram log-probabilities.
/// </returns>
public double CalculateProbability(StringList sample)
{
    // When Count exceeds this value, stupid backoff is used instead of Laplace smoothing.
    const int stupidBackoffThreshold = 1000000;

    if (Count <= 0)
    {
        return 0d;
    }

    var logProbability = 0d;
    var scoredAny = false;

    foreach (var ngram in NGramUtils.GetNGrams(sample, n))
    {
        var nMinusOneToken = NGramUtils.GetNMinusOneTokenFirst(ngram);

        var ngramProbability = Count > stupidBackoffThreshold
            ? GetStupidBackoffProbability(ngram, nMinusOneToken)
            : GetLaplaceSmoothingProbability(ngram, nMinusOneToken);

        // Accumulate in log space so a product of many small values does not underflow.
        logProbability += Math.Log(ngramProbability);
        scoredAny = true;
    }

    // Nothing was scored: keep the historical result of 0 rather than exp(0) == 1.
    if (!scoredAny)
    {
        return 0d;
    }

    if (double.IsNaN(logProbability))
    {
        return 0d;
    }

    // Always convert back from log space. The previous magnitude check
    // (|log p| > 1e-6) returned the raw log value, a tiny non-positive number,
    // for samples whose probability was close to 1.
    return Math.Exp(logProbability);
}
/// <summary>
/// Checks that <c>NGramUtils.GetNGrams</c> returns the expected number of
/// n-grams for a four-token sentence at sizes 2 and 3.
/// </summary>
public void TestGetNGrams()
{
    // Four tokens produce three bigrams.
    var bigrams = NGramUtils.GetNGrams(new StringList("I", "saw", "brown", "fox"), 2);
    Assert.That(bigrams.Count, Is.EqualTo(3));

    // The same four tokens produce two trigrams.
    var trigrams = NGramUtils.GetNGrams(new StringList("I", "saw", "brown", "fox"), 3);
    Assert.That(trigrams.Count, Is.EqualTo(2));
}
/// <summary>
/// Computes the perplexity of a language model over a test set.
/// </summary>
/// <remarks>
/// The inverse probabilities of every n-gram of every sentence are multiplied
/// together. The natural log of that product is divided by the number of
/// sentences in <paramref name="testSet"/> and the result is exponentiated.
/// </remarks>
/// <param name="lm">The language model used to score each n-gram.</param>
/// <param name="testSet">The sentences to evaluate.</param>
/// <param name="ngramSize">The n-gram size passed to <c>NGramUtils.GetNGrams</c>.</param>
/// <returns>
/// The perplexity, or <see cref="double.PositiveInfinity"/> when the log of the
/// accumulated product is infinite or NaN.
/// </returns>
public static double GetPerplexity(ILanguageModel lm, IList<StringList> testSet, int ngramSize)
{
    var inverseProduct = new BigDecimal(1d);

    foreach (var sentence in testSet)
    {
        foreach (var ngram in NGramUtils.GetNGrams(sentence, ngramSize))
        {
            var ngramProbability = lm.CalculateProbability(ngram);

            // NOTE(review): a zero probability or an empty test set leads to a
            // BigDecimal division by zero; behaviour depends on the BigDecimal
            // implementation, confirm.
            var numerator = new BigDecimal(1d);
            var denominator = new BigDecimal(ngramProbability);
            var inverse = numerator.divide(denominator, MathContext.DECIMAL128);

            inverseProduct = inverseProduct.multiply(inverse);
        }
    }

    var logOfProduct = Math.Log(inverseProduct.doubleValue());

    var isUsable = !double.IsInfinity(logOfProduct) && !double.IsNaN(logOfProduct);
    if (!isUsable)
    {
        // over/underflow -> too high perplexity
        return double.PositiveInfinity;
    }

    var logAsDecimal = new BigDecimal(logOfProduct);
    var sentenceCount = new BigDecimal(testSet.Count);
    var meanLog = logAsDecimal.divide(sentenceCount, MathContext.DECIMAL128);

    return Math.Pow(Math.E, meanLog.doubleValue());
}