/// <summary>
/// Builds a <c>Problem1Model</c> from the training corpus, evaluates it on one
/// split of the evaluating corpus and reports progress on the console.
/// </summary>
/// <param name="trainingCorpus">Parsed corpus the model is constructed from.</param>
/// <param name="evaluatingCorpus">Corpus whose development or evaluation text is scored.</param>
/// <param name="development">
/// <c>true</c> to score the <c>Development</c> text, <c>false</c> to score the <c>Evaluation</c> text.
/// </param>
/// <returns>The value returned by <c>CalculatePerplexityWrapper</c> for the selected text.</returns>
private static double CalculateProblem1ModelPerplexityOnTestCorpus(CorpusParsingResult trainingCorpus, ReadCorpusResult evaluatingCorpus, bool development)
{
    // Header lines identifying the training and evaluating corpora.
    Console.WriteLine("Calculating Perplexity after training on {0}", trainingCorpus.CorpusName);
    Console.WriteLine("Calculating perplexity for {0}", evaluatingCorpus.CorpusName);
    Console.WriteLine("{0}\tProblem", evaluatingCorpus.CorpusName.ToString());

    // Build the model and report its name.
    ILanguageModel languageModel = new Problem1Model(trainingCorpus);
    var modelName = languageModel.GetModelName();
    Console.WriteLine("{0}\tModel", modelName);

    // Pick the requested split and parse it into sentences.
    var textToScore = development ? evaluatingCorpus.Development : evaluatingCorpus.Evaluation;
    StringParsingResult parsedText = CorpusParsing.ParseString(textToScore);

    double result = CalculatePerplexityWrapper(languageModel, trainingCorpus, parsedText);

    // Visual separator between consecutive runs.
    Console.WriteLine("============================================================");
    return result;
}
/// <summary>
/// Parses the training text of a corpus and counts its unigrams, bigrams and trigrams.
/// </summary>
/// <remarks>
/// Every sentence is treated as if it were preceded by two <c>Constants.Start</c> symbols and
/// followed by one <c>Constants.Stop</c> symbol. For each sentence the Start unigram and the
/// (Start, Start) bigram are counted once in the dictionaries, but they are not included in
/// <c>TotalUnigrams</c> / <c>TotalBigrams</c>; the totals only count the real words plus Stop.
/// Words that are not present in the parsing result's <c>UniqueWords</c> are counted as
/// <c>Constants.Unknown</c>.
/// </remarks>
/// <param name="readCorpus">Corpus whose <c>Training</c> text is parsed and whose name is copied to the result.</param>
/// <param name="unkEnabled">When <c>true</c>, <c>AddUnksToParsingResult</c> is applied to the parsed text before counting.</param>
/// <param name="postTrainWith">
/// Optional extra text. When not <c>null</c> it is appended to the training text
/// <c>Configs.X</c> times before parsing, so that it weighs more than the original training text.
/// </param>
/// <returns>A <c>CorpusParsingResult</c> holding the sentences, the n-gram counts and their totals.</returns>
public static CorpusParsingResult ParseCorpus(ReadCorpusResult readCorpus, bool unkEnabled, string postTrainWith = null)
{
    // If we have a post training corpus, then use it several times (count it more than original training)
    StringParsingResult parsingResult;
    if (postTrainWith != null)
    {
        StringBuilder sb = new StringBuilder(readCorpus.Training);
        for (int i = 0; i < Configs.X; i++)
        {
            sb.Append(postTrainWith);
        }

        parsingResult = ParseString(sb.ToString());
    }
    else
    {
        parsingResult = ParseString(readCorpus.Training);
    }

    if (unkEnabled)
    {
        // NOTE(review): presumably this removes rare words from UniqueWords so they are
        // counted as Unknown below - confirm against AddUnksToParsingResult.
        AddUnksToParsingResult(parsingResult);
    }

    // Keep track of the count of unigrams, bigrams, trigrams
    int totalUnigrams = 0, totalBigrams = 0, totalTrigrams = 0;

    // Occurrence counts keyed by word, by (previous, word) and by (previousPrevious, previous, word).
    var unigrams = new Dictionary<string, int>();
    var bigrams = new Dictionary<Tuple<string, string>, int>();
    var trigrams = new Dictionary<Tuple<string, string, string>, int>();

    foreach (var sentence in parsingResult.Sentences)
    {
        // Each sentence starts with a (Start, Start) history.
        string previousWord = Constants.Start;
        string previousPreviousWord = Constants.Start;

        // Count the Start padding once per sentence; deliberately left out of the totals,
        // exactly as in the previous implementation.
        IncrementCount(unigrams, Constants.Start);
        IncrementCount(bigrams, new Tuple<string, string>(Constants.Start, Constants.Start));

        // One extra iteration past the last word accounts for the Stop symbol.
        for (int i = 0; i < sentence.Words.Length + 1; i++)
        {
            string word;
            if (i == sentence.Words.Length)
            {
                word = Constants.Stop;
            }
            else if (parsingResult.UniqueWords.ContainsKey(sentence.Words[i]))
            {
                word = sentence.Words[i];
            }
            else
            {
                // Word is not in the vocabulary: count it as UNK.
                word = Constants.Unknown;
            }

            // Unigram
            IncrementCount(unigrams, word);
            totalUnigrams++;

            // Bigram
            IncrementCount(bigrams, new Tuple<string, string>(previousWord, word));
            totalBigrams++;

            // Trigram
            IncrementCount(trigrams, new Tuple<string, string, string>(previousPreviousWord, previousWord, word));
            totalTrigrams++;

            // Slide the history window forward by one word.
            previousPreviousWord = previousWord;
            previousWord = word;
        }
    }

    return new CorpusParsingResult()
    {
        Sentences = parsingResult.Sentences,
        Unigrams = unigrams,
        TotalWordCount = parsingResult.TotalWordCount,
        Bigrams = bigrams,
        Trigrams = trigrams,
        UniqueWords = parsingResult.UniqueWords,
        TotalUnigrams = totalUnigrams,
        TotalBigrams = totalBigrams,
        TotalTrigrams = totalTrigrams,
        CorpusName = readCorpus.CorpusName
    };
}

/// <summary>
/// Adds one to the count stored under <paramref name="key"/>, treating a missing key as zero.
/// </summary>
private static void IncrementCount<TKey>(Dictionary<TKey, int> counts, TKey key)
{
    // TryGetValue leaves 'current' at 0 when the key is absent, so a single lookup
    // replaces the ContainsKey + indexer-read pair.
    int current;
    counts.TryGetValue(key, out current);
    counts[key] = current + 1;
}