/// <summary>
/// Computes the base-2 perplexity of <paramref name="model"/> over <paramref name="testCorpus"/>.
/// </summary>
/// <remarks>
/// Every sentence is scored word by word with <c>model.P(previousPreviousWord, previousWord, word)</c>;
/// both history slots start as <c>Constants.Start</c> at the beginning of each sentence. Words that are
/// not keys of <c>trainingCorpus.UniqueWords</c> are scored as <c>Constants.Unknown</c>.
/// The result is <c>2^(-logSum / testCorpus.TotalWordCount)</c>, where <c>logSum</c> is the sum of the
/// base-2 logarithms of all word probabilities.
/// </remarks>
/// <param name="model">Language model that supplies the probability of a word given its two predecessors.</param>
/// <param name="trainingCorpus">Training corpus whose <c>UniqueWords</c> keys define the known vocabulary.</param>
/// <param name="testCorpus">Corpus whose sentences are scored.</param>
/// <param name="testStats">Receives the sentence, word and unknown-word counters gathered while scoring.</param>
/// <returns>The perplexity of the model on the test corpus.</returns>
/// <exception cref="InvalidOperationException">
/// The log-probability of a sentence, or the running sum over the corpus, is infinite.
/// </exception>
public static double CalculatePerplexity(ILanguageModel model, CorpusParsingResult trainingCorpus, StringParsingResult testCorpus, out TestStats testStats)
{
    testStats = new TestStats
    {
        UniqueUnksFound = new HashSet<string>(),
        UniqueWordsFound = new HashSet<string>()
    };

    double logSumOfCorpus = 0;

    for (int k = 0; k < testCorpus.Sentences.Count; k++)
    {
        Sentence sentence = testCorpus.Sentences[k];
        double logOfSentence = 0;

        // The trigram history is reset at every sentence boundary.
        string previousWord = Constants.Start;
        string previousPreviousWord = Constants.Start;
        testStats.TotalSentencesFound++;

        for (int i = 0; i < sentence.Words.Length; i++)
        {
            string originalWord = sentence.Words[i];
            string calculatedWord = originalWord;

            if (!trainingCorpus.UniqueWords.ContainsKey(originalWord))
            {
                // Out-of-vocabulary word: score it as the unknown token, but remember the surface form.
                calculatedWord = Constants.Unknown;
                testStats.TotalUnksFound++;
                testStats.UniqueUnksFound.Add(originalWord);
            }

            testStats.TotalWordsFound++;
            testStats.UniqueWordsFound.Add(calculatedWord);

            double modelP = model.P(previousPreviousWord, previousWord, calculatedWord);
            double logModelP = Math.Log(modelP, 2);
            logOfSentence += logModelP;

            // Slide the history window forward by one word.
            previousPreviousWord = previousWord;
            previousWord = calculatedWord;
        }

        // Math.Log yields negative infinity for a probability of zero, which would poison the whole sum.
        // NOTE(review): a NaN (e.g. from a negative probability) is not detected by these checks.
        if (Double.IsInfinity(logOfSentence))
        {
            throw new InvalidOperationException(string.Format(
                "The log-probability of test sentence {0} is infinite; the model most likely returned a zero probability for one of its words.",
                k));
        }

        logSumOfCorpus += logOfSentence;
        if (Double.IsInfinity(logSumOfCorpus))
        {
            throw new InvalidOperationException(string.Format(
                "The corpus log-probability sum became infinite after adding test sentence {0}.",
                k));
        }

        // Progress output is only emitted for Problem1Model, once every 100 sentences.
        if (model is Problem1Model && k % 100 == 0)
        {
            Console.WriteLine("Now at sentence {0}/{1}", k, testCorpus.Sentences.Count);
        }
    }

    // Average log-probability per word, normalised by the corpus' own word count.
    double sum = logSumOfCorpus / testCorpus.TotalWordCount;
    return Math.Pow(2, -1 * sum);
}
/// <summary>
/// Runs <c>Perplexity.CalculatePerplexity</c> for the given model and corpora, prints the resulting
/// perplexity followed by the collected test statistics to the console, and returns the perplexity.
/// </summary>
/// <param name="model">Language model to evaluate.</param>
/// <param name="trainingCorpus">Training corpus forwarded to the perplexity calculation.</param>
/// <param name="testCorpus">Test corpus forwarded to the perplexity calculation.</param>
/// <returns>The perplexity value produced by <c>Perplexity.CalculatePerplexity</c>.</returns>
private static double CalculatePerplexityWrapper(ILanguageModel model, CorpusParsingResult trainingCorpus, StringParsingResult testCorpus)
{
    Perplexity.TestStats collectedStats;
    double result = Perplexity.CalculatePerplexity(model, trainingCorpus, testCorpus, out collectedStats);

    // Report the headline number first, then the detailed statistics.
    Console.WriteLine("{0}\tPerplexity", result);
    Console.WriteLine("Test stats:");

    string statsReport = collectedStats.ToString();
    Console.WriteLine(statsReport);

    return result;
}
/// <summary>
/// Converts a configured share of the words that occur exactly once into unknown words by removing
/// them from <c>parsingResult.UniqueWords</c>, then registers <c>Constants.Unknown</c> as a known word
/// whose count is the number of words that were removed.
/// </summary>
/// <remarks>
/// The number of words to convert is the count of single-occurrence words multiplied by
/// <c>Configs.PercentageOfUnks</c>, rounded by <c>Convert.ToInt32</c> and clamped to the number of
/// single-occurrence words available.
/// NOTE(review): despite its name, <c>PercentageOfUnks</c> is used as a plain multiplier, so it is
/// presumably a fraction in the range 0..1 — confirm against its definition.
/// </remarks>
/// <param name="parsingResult">Parsing result whose known-word table is modified in place.</param>
private static void AddUnksToParsingResult(StringParsingResult parsingResult)
{
    // Only words that appear exactly once are candidates; snapshot them so the table can be mutated safely.
    List<KeyValuePair<string, int>> listOfWordsWithOneOccurrence = parsingResult.UniqueWords.Where(w => w.Value == 1).ToList();

    int requestedUnks = Convert.ToInt32(listOfWordsWithOneOccurrence.Count * Configs.PercentageOfUnks);

    // Clamp so that a share outside 0..1 (or rounding up) can never index past the candidate list.
    int numberOfUnksToAdd = Math.Max(0, Math.Min(requestedUnks, listOfWordsWithOneOccurrence.Count));

    // The way a word becomes an UNK is by removing it from the known words list.
    // Indices 0 .. numberOfUnksToAdd - 1 are used so that exactly numberOfUnksToAdd words are removed.
    for (int i = 0; i < numberOfUnksToAdd; i++)
    {
        parsingResult.UniqueWords.Remove(listOfWordsWithOneOccurrence[i].Key);
    }

    // Each removed word occurred once, so the unknown token stands for numberOfUnksToAdd occurrences.
    parsingResult.UniqueWords.Add(Constants.Unknown, numberOfUnksToAdd);
}