/// <summary>
/// Computes the perplexity of <paramref name="model"/> over <paramref name="testCorpus"/>.
/// </summary>
/// <remarks>
/// Each sentence is scored word by word with <c>model.P(w-2, w-1, w)</c>; both history
/// slots start at <c>Constants.Start</c> for every sentence. A test word that is not a key
/// of <c>trainingCorpus.UniqueWords</c> is scored as <c>Constants.Unknown</c>.
/// The result is <c>2^(-L / N)</c>, where <c>L</c> is the sum of the base-2 logarithms of
/// all word probabilities and <c>N</c> is <c>testCorpus.TotalWordCount</c>.
/// NOTE(review): an empty test corpus (<c>TotalWordCount</c> of zero) is not guarded here;
/// the division would presumably produce a non-finite result - confirm against callers.
/// </remarks>
/// <param name="model">Language model supplying the conditional word probabilities.</param>
/// <param name="trainingCorpus">Training corpus whose <c>UniqueWords</c> defines the known vocabulary.</param>
/// <param name="testCorpus">Parsed test sentences to score.</param>
/// <param name="testStats">
/// Receives counters gathered during scoring: sentences, words and unknown words seen,
/// plus the sets of distinct scored words and distinct unknown surface forms.
/// </param>
/// <returns>The perplexity of the model on the test corpus.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the accumulated log probability of a sentence, or of the corpus so far,
/// is infinite or NaN (for example because the model returned a zero, negative or NaN
/// probability).
/// </exception>
public static double CalculatePerplexity(ILanguageModel model, CorpusParsingResult trainingCorpus, StringParsingResult testCorpus, out TestStats testStats)
{
    testStats = new TestStats
    {
        UniqueUnksFound = new HashSet<string>(),
        UniqueWordsFound = new HashSet<string>()
    };

    double logSumOfCorpus = 0;
    for (int k = 0; k < testCorpus.Sentences.Count; k++)
    {
        Sentence sentence = testCorpus.Sentences[k];
        double logOfSentence = 0;
        string previousWord = Constants.Start;
        string previousPreviousWord = Constants.Start;
        testStats.TotalSentencesFound++;

        for (int i = 0; i < sentence.Words.Length; i++)
        {
            string originalWord = sentence.Words[i];
            string calculatedWord = originalWord;

            // Words never seen in training are scored as the unknown token,
            // while the original surface form is remembered for the statistics.
            if (!trainingCorpus.UniqueWords.ContainsKey(originalWord))
            {
                calculatedWord = Constants.Unknown;
                testStats.TotalUnksFound++;
                testStats.UniqueUnksFound.Add(originalWord);
            }

            testStats.TotalWordsFound++;
            testStats.UniqueWordsFound.Add(calculatedWord);

            double modelP = model.P(previousPreviousWord, previousWord, calculatedWord);
            logOfSentence += Math.Log(modelP, 2);

            // Slide the two-word history window forward.
            previousPreviousWord = previousWord;
            previousWord = calculatedWord;
        }

        // Math.Log yields -Infinity for a zero probability and NaN for a negative or NaN
        // one; either would silently poison the final perplexity, so fail loudly instead.
        if (Double.IsInfinity(logOfSentence) || Double.IsNaN(logOfSentence))
        {
            throw new InvalidOperationException(string.Format(
                "Log probability of sentence {0} is not finite ({1}); the model returned a zero, negative or NaN probability.",
                k, logOfSentence));
        }

        logSumOfCorpus += logOfSentence;
        if (Double.IsInfinity(logSumOfCorpus) || Double.IsNaN(logSumOfCorpus))
        {
            throw new InvalidOperationException(string.Format(
                "Accumulated log probability of the corpus is not finite ({0}) after sentence {1}.",
                logSumOfCorpus, k));
        }

        // Progress output is only emitted for Problem1Model, every 100 sentences.
        if (model is Problem1Model && k % 100 == 0)
        {
            Console.WriteLine("Now at sentence {0}/{1}", k, testCorpus.Sentences.Count);
        }
    }

    double averageLogProbability = logSumOfCorpus / testCorpus.TotalWordCount;
    return Math.Pow(2, -averageLogProbability);
}
/// <summary>
/// Sums <c>model.P(w-2, w-1, w)</c> over every candidate next word for each admissible
/// two-word history drawn from <c>_twoDogSentencesCorpus.UniqueWords</c>, and optionally
/// asserts that each sum is 1.
/// </summary>
/// <param name="model">Language model whose probability function is checked.</param>
/// <param name="testIfWellDefined">
/// When <c>true</c>, every per-history sum is asserted to equal 1 within a small
/// tolerance; when <c>false</c>, the sums are only written to the debug output.
/// </param>
private void TestWellDefinedProbability(ILanguageModel model, bool testIfWellDefined)
{
    // A sum of doubles rarely lands exactly on 1.0, so compare with a tolerance
    // instead of exact equality.
    const double tolerance = 1e-9;

    // Verify the function for P is well defined for trigrams that exist
    foreach (var wordminus2 in _twoDogSentencesCorpus.UniqueWords.Keys)
    {
        // The stop symbol is never used as history.
        if (wordminus2 == Constants.Stop)
        {
            continue;
        }

        foreach (var wordminus1 in _twoDogSentencesCorpus.UniqueWords.Keys)
        {
            // Skip histories ending in the stop symbol, and histories where the start
            // symbol follows a word that is not itself the start symbol.
            if (wordminus1 == Constants.Stop || (wordminus2 != Constants.Start && wordminus1 == Constants.Start))
            {
                continue;
            }

            double total = 0;

            // The start symbol is excluded from the candidate next words.
            foreach (var word in _twoDogSentencesCorpus.UniqueWords.Keys.Where(w => w != Constants.Start))
            {
                double pml = model.P(wordminus2, wordminus1, word);
                if (pml > 0)
                {
                    total += pml;
                }
            }

            Debug.WriteLine("Next! Sum was {0}", total);
            if (testIfWellDefined)
            {
                total.Should().BeApproximately(1, tolerance);
            }
        }
    }
}