Example #1
0
        /// <summary>
        /// Builds a <c>Problem1Model</c> from <paramref name="trainingCorpus"/>, parses one portion of
        /// <paramref name="evaluatingCorpus"/> and returns the value produced by
        /// <c>CalculatePerplexityWrapper</c> for that model and text. Progress is written to the console.
        /// </summary>
        /// <param name="trainingCorpus">Parsed corpus used to construct the model.</param>
        /// <param name="evaluatingCorpus">Corpus that supplies the text to evaluate on.</param>
        /// <param name="development">
        /// True to evaluate on <c>evaluatingCorpus.Development</c>, false to evaluate on
        /// <c>evaluatingCorpus.Evaluation</c>.
        /// </param>
        /// <returns>The perplexity reported by <c>CalculatePerplexityWrapper</c>.</returns>
        private static double CalculateProblem1ModelPerplexityOnTestCorpus(CorpusParsingResult trainingCorpus, ReadCorpusResult evaluatingCorpus, bool development)
        {
            // Header lines identifying which corpora are involved in this run.
            Console.WriteLine("Calculating Perplexity after training on {0}", trainingCorpus.CorpusName);
            Console.WriteLine("Calculating perplexity for {0}", evaluatingCorpus.CorpusName);

            Console.WriteLine("{0}\tProblem", evaluatingCorpus.CorpusName.ToString());

            // The model is constructed before the evaluation text is parsed.
            ILanguageModel languageModel = new Problem1Model(trainingCorpus);
            Console.WriteLine("{0}\tModel", languageModel.GetModelName());

            // Pick the development or the evaluation split and parse it.
            StringParsingResult parsedTestText;
            if (development)
            {
                parsedTestText = CorpusParsing.ParseString(evaluatingCorpus.Development);
            }
            else
            {
                parsedTestText = CorpusParsing.ParseString(evaluatingCorpus.Evaluation);
            }

            double result = CalculatePerplexityWrapper(languageModel, trainingCorpus, parsedTestText);

            // Visual separator between consecutive runs.
            Console.WriteLine("============================================================");

            return result;
        }
        /// <summary>
        /// Parses the training text of <paramref name="readCorpus"/> and counts the unigrams, bigrams and
        /// trigrams found in its sentences.
        /// </summary>
        /// <remarks>
        /// Every sentence is padded with <c>Constants.Start</c> as the two preceding words and is followed by
        /// one <c>Constants.Stop</c> token. Words that are not keys of the parsed <c>UniqueWords</c> are
        /// counted as <c>Constants.Unknown</c>. For each sentence the start token is also counted once as a
        /// unigram and once as the (start, start) bigram; these padding entries are not included in
        /// <c>TotalUnigrams</c> / <c>TotalBigrams</c>.
        /// </remarks>
        /// <param name="readCorpus">
        /// Corpus whose <c>Training</c> text is parsed and whose <c>CorpusName</c> is copied to the result.
        /// </param>
        /// <param name="unkEnabled">
        /// When true, <c>AddUnksToParsingResult</c> is applied to the parsed text before counting.
        /// </param>
        /// <param name="postTrainWith">
        /// Optional extra text. When not null it is appended to the training text <c>Configs.X</c> times, so it
        /// is counted more often than the original training text.
        /// </param>
        /// <returns>
        /// A <c>CorpusParsingResult</c> holding the parsed sentences, the unique words, the n-gram count
        /// dictionaries and the n-gram totals.
        /// </returns>
        public static CorpusParsingResult ParseCorpus(ReadCorpusResult readCorpus, bool unkEnabled, string postTrainWith = null)
        {
            // If we have a post training corpus, then use it several times (count it more than original training)
            StringParsingResult parsingResult;
            if (postTrainWith != null)
            {
                // NOTE(review): the texts are concatenated without any separator; confirm that the training
                // text and postTrainWith end with a sentence/line break so adjacent words are not merged.
                StringBuilder sb = new StringBuilder(readCorpus.Training);
                for (int i = 0; i < Configs.X; i++)
                {
                    sb.Append(postTrainWith);
                }
                parsingResult = ParseString(sb.ToString());
            }
            else
            {
                parsingResult = ParseString(readCorpus.Training);
            }

            if (unkEnabled)
            {
                AddUnksToParsingResult(parsingResult);
            }

            // Running totals of counted n-grams (the per-sentence start padding is not part of these totals)
            int totalUnigrams = 0, totalBigrams = 0, totalTrigrams = 0;

            // Count of each word
            var unigrams = new Dictionary<string, int>();

            // Count of each (previous word, word) pair
            var bigrams = new Dictionary<Tuple<string, string>, int>();

            // Count of each (word before previous, previous word, word) triple
            var trigrams = new Dictionary<Tuple<string, string, string>, int>();

            // The (start, start) bigram key is the same for every sentence, so build it once
            var startBigramKey = new Tuple<string, string>(Constants.Start, Constants.Start);

            // Pass through all sentences and through all words, populating the n-gram counts
            foreach (var sentence in parsingResult.Sentences)
            {
                // Every sentence begins with two start tokens as context
                string previousWord = Constants.Start;
                string previousPreviousWord = Constants.Start;

                // Add start as a unigram and bigram before starting.
                // NOTE(review): presumably these serve as context counts for the first bigram/trigram of the
                // sentence - confirm against the models that consume them.
                IncrementNGramCount(unigrams, Constants.Start);
                IncrementNGramCount(bigrams, startBigramKey);

                // One extra iteration past the last word accounts for the stop token
                for (int i = 0; i < sentence.Words.Length + 1; i++)
                {
                    string word;
                    if (i == sentence.Words.Length)
                    {
                        word = Constants.Stop;
                    }
                    else if (parsingResult.UniqueWords.ContainsKey(sentence.Words[i]))
                    {
                        word = sentence.Words[i];
                    }
                    else
                    {
                        // Words missing from UniqueWords are counted as the unknown token
                        word = Constants.Unknown;
                    }

                    // Unigram
                    IncrementNGramCount(unigrams, word);
                    totalUnigrams++;

                    // Bigram
                    IncrementNGramCount(bigrams, new Tuple<string, string>(previousWord, word));
                    totalBigrams++;

                    // Trigram
                    IncrementNGramCount(trigrams, new Tuple<string, string, string>(previousPreviousWord, previousWord, word));
                    totalTrigrams++;

                    // Move to next
                    previousPreviousWord = previousWord;
                    previousWord = word;
                }
            }

            return new CorpusParsingResult()
            {
                Sentences = parsingResult.Sentences,
                Unigrams = unigrams,
                TotalWordCount = parsingResult.TotalWordCount,
                Bigrams = bigrams,
                Trigrams = trigrams,
                UniqueWords = parsingResult.UniqueWords,
                TotalUnigrams = totalUnigrams,
                TotalBigrams = totalBigrams,
                TotalTrigrams = totalTrigrams,
                CorpusName = readCorpus.CorpusName
            };
        }

        /// <summary>
        /// Adds one to the count stored under <paramref name="key"/>, treating a missing key as a count of zero.
        /// </summary>
        private static void IncrementNGramCount<TKey>(Dictionary<TKey, int> counts, TKey key)
        {
            int current;
            // TryGetValue leaves current at 0 when the key is absent, so a single lookup covers both cases
            counts.TryGetValue(key, out current);
            counts[key] = current + 1;
        }