Ejemplo n.º 1
0
        /// <summary>
        /// Computes the perplexity of <paramref name="model"/> over <paramref name="testCorpus"/> and
        /// collects counting statistics about the words that were scored.
        /// </summary>
        /// <remarks>
        /// Every test word that is not a key of <c>trainingCorpus.UniqueWords</c> is replaced by
        /// <c>Constants.Unknown</c> before it is scored. Each word is scored with
        /// <c>model.P(previousPreviousWord, previousWord, word)</c>; both history slots are reset to
        /// <c>Constants.Start</c> at the beginning of every sentence. The returned value is
        /// 2 raised to the negated sum of base-2 log probabilities divided by
        /// <c>testCorpus.TotalWordCount</c>.
        /// When <paramref name="model"/> is a <c>Problem1Model</c>, a progress line is written to the
        /// console every 100 sentences.
        /// </remarks>
        /// <param name="model">Language model whose <c>P</c> method supplies the word probabilities.</param>
        /// <param name="trainingCorpus">Training data; only its <c>UniqueWords</c> keys are consulted, to decide which test words are unknown.</param>
        /// <param name="testCorpus">Sentences to score; <c>TotalWordCount</c> is used as the normalizer.</param>
        /// <param name="testStats">
        /// Receives the totals of sentences, words and unknown words seen, the set of distinct raw
        /// words that were unknown, and the set of distinct words as scored (after unknown replacement).
        /// </param>
        /// <returns>The perplexity of the model on the test corpus.</returns>
        /// <exception cref="InvalidOperationException">
        /// The accumulated log probability of a sentence, or of the corpus so far, is NaN or infinite
        /// (for example because the model returned a zero or negative probability).
        /// </exception>
        public static double CalculatePerplexity(ILanguageModel model, CorpusParsingResult trainingCorpus, StringParsingResult testCorpus, out TestStats testStats)
        {
            testStats = new TestStats
            {
                UniqueUnksFound = new HashSet<string>(),
                UniqueWordsFound = new HashSet<string>()
            };

            double logSumOfCorpus = 0;
            for (int k = 0; k < testCorpus.Sentences.Count; k++)
            {
                Sentence sentence = testCorpus.Sentences[k];
                double logOfSentence = 0;

                // The trigram history starts from the start marker for every sentence.
                string previousWord = Constants.Start;
                string previousPreviousWord = Constants.Start;

                testStats.TotalSentencesFound++;
                for (int i = 0; i < sentence.Words.Length; i++)
                {
                    string rawWord = sentence.Words[i];
                    string calculatedWord = rawWord;
                    if (!trainingCorpus.UniqueWords.ContainsKey(rawWord))
                    {
                        // Words unseen in training are scored as the unknown token,
                        // but the raw word is remembered for the statistics.
                        calculatedWord = Constants.Unknown;
                        testStats.TotalUnksFound++;
                        testStats.UniqueUnksFound.Add(rawWord);
                    }
                    testStats.TotalWordsFound++;
                    testStats.UniqueWordsFound.Add(calculatedWord);

                    double modelP = model.P(previousPreviousWord, previousWord, calculatedWord);
                    double logModelP = Math.Log(modelP, 2);
                    logOfSentence += logModelP;

                    previousPreviousWord = previousWord;
                    previousWord = calculatedWord;
                }

                // Math.Log yields -Infinity for 0 and NaN for negative or NaN input; NaN also arises
                // from +Infinity + -Infinity. IsInfinity alone would let NaN leak into the result.
                if (Double.IsNaN(logOfSentence) || Double.IsInfinity(logOfSentence))
                {
                    throw new InvalidOperationException(
                        String.Format("Log probability of sentence {0} is not finite ({1}).", k, logOfSentence));
                }
                logSumOfCorpus += logOfSentence;
                if (Double.IsNaN(logSumOfCorpus) || Double.IsInfinity(logSumOfCorpus))
                {
                    throw new InvalidOperationException(
                        String.Format("Accumulated log probability is not finite ({0}) after sentence {1}.", logSumOfCorpus, k));
                }

                if (model is Problem1Model && k % 100 == 0)
                {
                    Console.WriteLine("Now at sentence {0}/{1}", k, testCorpus.Sentences.Count);
                }
            }

            double sum = logSumOfCorpus / testCorpus.TotalWordCount;
            return Math.Pow(2, -1*sum);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Calls <c>Perplexity.CalculatePerplexity</c> with the given model and corpora, writes the
        /// resulting perplexity and the collected test statistics to the console, and returns the
        /// perplexity.
        /// </summary>
        /// <param name="model">Language model to evaluate.</param>
        /// <param name="trainingCorpus">Training corpus forwarded to the perplexity calculation.</param>
        /// <param name="testCorpus">Test corpus forwarded to the perplexity calculation.</param>
        /// <returns>The perplexity value produced by <c>Perplexity.CalculatePerplexity</c>.</returns>
        private static double CalculatePerplexityWrapper(ILanguageModel model, CorpusParsingResult trainingCorpus, StringParsingResult testCorpus)
        {
            Perplexity.TestStats stats;
            double result = Perplexity.CalculatePerplexity(model, trainingCorpus, testCorpus, out stats);

            // Score line first, then a header followed by the statistics dump.
            Console.WriteLine("{0}\tPerplexity", result);
            Console.WriteLine("Test stats:");

            string statsReport = stats.ToString();
            Console.WriteLine(statsReport);

            return result;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Converts a configured fraction of the words that occur exactly once into unknown words by
        /// removing them from <c>parsingResult.UniqueWords</c>, then registers
        /// <c>Constants.Unknown</c> as a known word whose count is the number of words removed.
        /// </summary>
        /// <remarks>
        /// The number of words to remove is <c>Configs.PercentageOfUnks</c> times the number of
        /// single-occurrence words, converted with <c>Convert.ToInt32</c> and clamped to the range
        /// from zero to the number of single-occurrence words.
        /// </remarks>
        /// <param name="parsingResult">Parsing result whose <c>UniqueWords</c> collection is modified in place.</param>
        private static void AddUnksToParsingResult(StringParsingResult parsingResult)
        {
            List<KeyValuePair<string, int>> listOfWordsWithOneOccurrence = parsingResult.UniqueWords.Where(w => w.Value == 1).ToList();

            int numberOfUnksToAdd = Convert.ToInt32(listOfWordsWithOneOccurrence.Count * Configs.PercentageOfUnks);

            // Never index past the candidate list (e.g. when the configured fraction is 1 or more)
            // and never register a negative count.
            numberOfUnksToAdd = Math.Max(0, Math.Min(numberOfUnksToAdd, listOfWordsWithOneOccurrence.Count));

            // A word becomes an UNK by being removed from the known words list.
            // Only words that appear once are candidates; take the first numberOfUnksToAdd of them.
            for (int i = 0; i < numberOfUnksToAdd; i++)
            {
                parsingResult.UniqueWords.Remove(listOfWordsWithOneOccurrence[i].Key);
            }

            // Each removed word occurred exactly once, so the UNK token's count equals the number
            // of words removed. The counter is left untouched by the loop so this value is correct.
            parsingResult.UniqueWords.Add(Constants.Unknown, numberOfUnksToAdd);
        }