/// <summary>
/// Run a single test for a single model
/// </summary>
/// <param name="sizeVocab">Size of the vocabulary</param>
/// <param name="numTopics">Number of topics</param>
/// <param name="trainWordsInTrainDoc">Lists of words in training documents used for training</param>
/// <param name="testWordsInTrainDoc">Lists of words in training documents used for testing</param>
/// <param name="alpha">Background pseudo-counts for distributions over topics</param>
/// <param name="beta">Background pseudo-counts for distributions over words</param>
/// <param name="shared">If true, uses the shared variable version of the model</param>
/// <param name="trueThetaTest">The true topic distributions for the documents in the test set</param>
/// <param name="wordsInTestDoc">Lists of words in test documents</param>
/// <param name="vocabulary">Vocabulary</param>
static void RunTest(
    int sizeVocab,
    int numTopics,
    Dictionary<int, int>[] trainWordsInTrainDoc,
    Dictionary<int, int>[] testWordsInTrainDoc,
    double alpha,
    double beta,
    bool shared,
    Dirichlet[] trueThetaTest,
    Dictionary<int, int>[] wordsInTestDoc,
    Dictionary<int, string> vocabulary = null)
{
    Stopwatch stopWatch = new Stopwatch();

    // The square root of the number of documents is optimal for memory
    int batchCount = (int)Math.Sqrt((double)trainWordsInTrainDoc.Length);
    Rand.Restart(5);

    ILDA model;
    LDAPredictionModel predictionModel;
    LDATopicInferenceModel topicInfModel;
    if (shared)
    {
        model = new LDAShared(batchCount, sizeVocab, numTopics);
        ((LDAShared)model).IterationsPerPass = Enumerable.Repeat(10, 5).ToArray();
    }
    else
    {
        model = new LDAModel(sizeVocab, numTopics);
        model.Engine.NumberOfIterations = 50;
    }

    Console.WriteLine("\n\n************************************");
    Console.WriteLine(String.Format("\nTraining {0}LDA model...\n", shared ? "batched " : "non-batched "));

    // Train the model - we will also get rough estimates of execution time and memory
    Dirichlet[] postTheta, postPhi;
    GC.Collect();
    PerformanceCounter memCounter = new PerformanceCounter("Memory", "Available MBytes");
    float preMem = memCounter.NextValue();
    stopWatch.Reset();
    stopWatch.Start();
    double logEvidence = model.Infer(trainWordsInTrainDoc, alpha, beta, out postTheta, out postPhi);
    stopWatch.Stop();
    float postMem = memCounter.NextValue();
    double approxMB = preMem - postMem;
    GC.KeepAlive(model); // Keep the model alive to this point (for the memory counter)
    Console.WriteLine(String.Format("Approximate memory usage: {0:F2} MB", approxMB));
    Console.WriteLine(String.Format("Approximate execution time (including model compilation): {0} seconds", stopWatch.ElapsedMilliseconds / 1000));

    // Calculate the average log evidence over the total number of training words
    int totalWords = trainWordsInTrainDoc.Sum(doc => doc.Sum(w => w.Value));
    Console.WriteLine("\nTotal number of training words = {0}", totalWords);
    Console.WriteLine(String.Format("Average log evidence of model: {0:F2}", logEvidence / (double)totalWords));

    if (vocabulary != null)
    {
        // Print out the top n words for each topic
        int numWordsToPrint = 20;
        for (int i = 0; i < postPhi.Length; i++)
        {
            double[] pc = postPhi[i].PseudoCount.ToArray();
            int[] wordIndices = new int[pc.Length];
            for (int j = 0; j < wordIndices.Length; j++)
            {
                wordIndices[j] = j;
            }

            // Sort word indices by ascending pseudo-count, then read the top words off the end
            Array.Sort(pc, wordIndices);
            Console.WriteLine("Top {0} words in topic {1}:", numWordsToPrint, i);
            int idx = wordIndices.Length;
            for (int j = 0; j < numWordsToPrint; j++)
            {
                Console.Write("\t{0}", vocabulary[wordIndices[--idx]]);
            }

            Console.WriteLine();
        }
    }

    if (testWordsInTrainDoc != null)
    {
        // Test on unseen words in the training documents
        Console.WriteLine("\n\nCalculating perplexity on test words in training documents...");
        predictionModel = new LDAPredictionModel(sizeVocab, numTopics);
        predictionModel.Engine.NumberOfIterations = 5;
        var predDist = predictionModel.Predict(postTheta, postPhi);
        var perplexity = Utilities.Perplexity(predDist, testWordsInTrainDoc);
        Console.WriteLine(String.Format("\nPerplexity = {0:F3}", perplexity));
    }

    if (wordsInTestDoc != null)
    {
        // Test on unseen documents. Note that the topic ids for the trained model will be a random
        // permutation of the topic ids for the ground truth
        Console.WriteLine("\n\nInferring topics for test documents...");
        topicInfModel = new LDATopicInferenceModel(sizeVocab, numTopics);
        topicInfModel.Engine.NumberOfIterations = 10;
        var inferredTopicDists = topicInfModel.InferTopic(alpha, postPhi, wordsInTestDoc);

        // Count how often each (inferred topic, true topic) pairing occurs across the test documents
        Dictionary<TopicPair, int> topicPairCounts = new Dictionary<TopicPair, int>();
        for (int i = 0; i < inferredTopicDists.Length; i++)
        {
            int infTopic = inferredTopicDists[i].PseudoCount.IndexOfMaximum();
            int trueTopic = trueThetaTest[i].PseudoCount.IndexOfMaximum();
            TopicPair tp = new TopicPair() { InferredTopic = infTopic, TrueTopic = trueTopic };
            if (!topicPairCounts.ContainsKey(tp))
            {
                topicPairCounts.Add(tp, 1);
            }
            else
            {
                topicPairCounts[tp] = topicPairCounts[tp] + 1;
            }
        }

        var correctCount = CountCorrectTopicPredictions(topicPairCounts, numTopics);
        Console.WriteLine(String.Format("Maximum inferred topic matches maximum true topic {0} times out of {1}", correctCount, inferredTopicDists.Length));
        Console.WriteLine("\nThis uses a greedy algorithm to determine the mapping from inferred topic indices to true topic indices");
        Console.WriteLine("\n************************************");
    }
}