Пример #1
0
        /// <summary>
        /// Computes the perplexity of <paramref name="model"/> on <paramref name="testCorpus"/> and
        /// gathers word and sentence statistics while doing so.
        /// </summary>
        /// <remarks>
        /// Every word is scored with <c>model.P(w-2, w-1, w)</c>; each sentence starts with both
        /// context words set to <c>Constants.Start</c>. Words that are not keys of
        /// <c>trainingCorpus.UniqueWords</c> are replaced by <c>Constants.Unknown</c> before scoring.
        /// The result is <c>2^(-L / testCorpus.TotalWordCount)</c>, where <c>L</c> is the sum of the
        /// base-2 logarithms of all word probabilities.
        /// </remarks>
        /// <param name="model">Language model supplying the conditional word probabilities.</param>
        /// <param name="trainingCorpus">Corpus whose <c>UniqueWords</c> defines the known vocabulary.</param>
        /// <param name="testCorpus">Parsed sentences to evaluate.</param>
        /// <param name="testStats">
        /// Receives the counters collected during evaluation. <c>UniqueUnksFound</c> holds the raw
        /// out-of-vocabulary words, while <c>UniqueWordsFound</c> holds the words as scored (so unknown
        /// words are recorded there as <c>Constants.Unknown</c>).
        /// </param>
        /// <returns>The perplexity of the model over the test corpus.</returns>
        /// <exception cref="InvalidOperationException">
        /// The accumulated log-probability of a sentence, or of the corpus so far, is NaN or infinite.
        /// </exception>
        public static double CalculatePerplexity(ILanguageModel model, CorpusParsingResult trainingCorpus, StringParsingResult testCorpus, out TestStats testStats)
        {
            testStats = new TestStats
            {
                UniqueUnksFound = new HashSet<string>(),
                UniqueWordsFound = new HashSet<string>()
            };

            double logSumOfCorpus = 0;
            for (int k = 0; k < testCorpus.Sentences.Count; k++)
            {
                Sentence sentence = testCorpus.Sentences[k];
                double logOfSentence = 0;

                // Both context slots start at the sentence-start marker.
                string previousWord = Constants.Start;
                string previousPreviousWord = Constants.Start;

                testStats.TotalSentencesFound++;
                for (int i = 0; i < sentence.Words.Length; i++)
                {
                    string originalWord = sentence.Words[i];
                    string calculatedWord = originalWord;
                    if (!trainingCorpus.UniqueWords.ContainsKey(originalWord))
                    {
                        // Out-of-vocabulary: score the unknown token, but remember the raw word.
                        calculatedWord = Constants.Unknown;
                        testStats.TotalUnksFound++;
                        testStats.UniqueUnksFound.Add(originalWord);
                    }
                    testStats.TotalWordsFound++;
                    testStats.UniqueWordsFound.Add(calculatedWord);

                    double modelP = model.P(previousPreviousWord, previousWord, calculatedWord);
                    logOfSentence += Math.Log(modelP, 2);

                    // Slide the two-word context window forward.
                    previousPreviousWord = previousWord;
                    previousWord = calculatedWord;
                }

                // A zero probability yields -Infinity and a negative (or NaN) probability yields NaN;
                // either one would silently poison the final result, so both are rejected here.
                if (Double.IsNaN(logOfSentence) || Double.IsInfinity(logOfSentence))
                {
                    throw new InvalidOperationException(string.Format(
                        "Log-probability of sentence {0} is not finite ({1}); the model produced a probability whose base-2 logarithm is undefined or infinite.",
                        k,
                        logOfSentence));
                }

                logSumOfCorpus += logOfSentence;
                if (Double.IsNaN(logSumOfCorpus) || Double.IsInfinity(logSumOfCorpus))
                {
                    throw new InvalidOperationException(string.Format(
                        "Accumulated corpus log-probability is not finite ({0}) after sentence {1}.",
                        logSumOfCorpus,
                        k));
                }

                // Progress output is emitted only for Problem1Model, once every 100 sentences.
                if (model is Problem1Model && k % 100 == 0)
                {
                    Console.WriteLine("Now at sentence {0}/{1}", k, testCorpus.Sentences.Count);
                }
            }

            double sum = logSumOfCorpus / testCorpus.TotalWordCount;
            return Math.Pow(2, -1 * sum);
        }
        /// <summary>
        /// For every two-word context drawn from <c>_twoDogSentencesCorpus.UniqueWords.Keys</c>, sums
        /// <c>model.P(wordminus2, wordminus1, word)</c> over all candidate next words and, when
        /// requested, asserts that the sum equals 1.
        /// </summary>
        /// <param name="model">Language model whose probability function is being checked.</param>
        /// <param name="testIfWellDefined">
        /// When <c>true</c>, each per-context sum is asserted to be 1 (within a small floating-point
        /// tolerance); when <c>false</c>, the sums are only written to the debug output.
        /// </param>
        private void TestWellDefinedProbability(ILanguageModel model, bool testIfWellDefined)
        {
            // Sums of doubles accumulate rounding error, so compare against 1 with a tolerance
            // instead of exact equality.
            const double tolerance = 1e-9;

            // The start marker is never a candidate for the predicted word; the filtered list
            // does not depend on the context, so build it once.
            var candidateWords = _twoDogSentencesCorpus.UniqueWords.Keys
                .Where(w => w != Constants.Start)
                .ToList();

            // Verify the function for P is well defined for trigrams that exist
            foreach (var wordminus2 in _twoDogSentencesCorpus.UniqueWords.Keys)
            {
                if (wordminus2 == Constants.Stop)
                {
                    continue;
                }

                foreach (var wordminus1 in _twoDogSentencesCorpus.UniqueWords.Keys)
                {
                    // Skip contexts ending in the stop marker, and contexts where the start marker
                    // follows a word that is not itself the start marker.
                    if (wordminus1 == Constants.Stop || (wordminus2 != Constants.Start && wordminus1 == Constants.Start))
                    {
                        continue;
                    }

                    double total = 0;
                    foreach (var word in candidateWords)
                    {
                        double pml = model.P(wordminus2, wordminus1, word);

                        // Only strictly positive probabilities contribute to the sum.
                        if (pml > 0)
                        {
                            total += pml;
                        }
                    }
                    Debug.WriteLine("Next! Sum was {0}", total);
                    if (testIfWellDefined)
                    {
                        total.Should().BeApproximately(1, tolerance);
                    }
                }
            }
        }