/// <summary>
/// Run a single test for a single model
/// </summary>
/// <param name="sizeVocab">Size of the vocabulary</param>
/// <param name="numTopics">Number of topics</param>
/// <param name="trainWordsInTrainDoc">Lists of words in training documents used for training</param>
/// <param name="testWordsInTrainDoc">Lists of words in training documents used for testing</param>
/// <param name="alpha">Background pseudo-counts for distributions over topics</param>
/// <param name="beta">Background pseudo-counts for distributions over words</param>
/// <param name="shared">If true, uses shared variable version of the model</param>
/// <param name="trueThetaTest">The true topic distributions for the documents in the test set</param>
/// <param name="wordsInTestDoc">Lists of words in test documents</param>
/// <param name="vocabulary">Vocabulary</param>
static void RunTest(
    int sizeVocab,
    int numTopics,
    Dictionary<int, int>[] trainWordsInTrainDoc,
    Dictionary<int, int>[] testWordsInTrainDoc,
    double alpha,
    double beta,
    bool shared,
    Dirichlet[] trueThetaTest,
    Dictionary<int, int>[] wordsInTestDoc,
    Dictionary<int, string> vocabulary = null)
{
    Stopwatch stopWatch = new Stopwatch();

    // The square root of the number of documents is optimal for memory use
    int batchCount = (int)Math.Sqrt((double)trainWordsInTrainDoc.Length);
    Rand.Restart(5);

    ILDA model;
    LDAPredictionModel predictionModel;
    LDATopicInferenceModel topicInfModel;
    if (shared)
    {
        model = new LDAShared(batchCount, sizeVocab, numTopics);
        ((LDAShared)model).IterationsPerPass = Enumerable.Repeat(10, 5).ToArray();
    }
    else
    {
        model = new LDAModel(sizeVocab, numTopics);
        model.Engine.NumberOfIterations = 50;
    }

    Console.WriteLine("\n\n************************************");
    Console.WriteLine(String.Format("\nTraining {0}LDA model...\n", shared ? "batched " : "non-batched "));

    // Train the model - we will also get rough estimates of execution time and memory
    Dirichlet[] postTheta, postPhi;
    GC.Collect();
    PerformanceCounter memCounter = new PerformanceCounter("Memory", "Available MBytes");
    float preMem = memCounter.NextValue();
    stopWatch.Reset();
    stopWatch.Start();
    double logEvidence = model.Infer(trainWordsInTrainDoc, alpha, beta, out postTheta, out postPhi);
    stopWatch.Stop();
    float postMem = memCounter.NextValue();
    double approxMB = preMem - postMem;
    GC.KeepAlive(model); // Keep the model alive to this point (for the memory counter)
    Console.WriteLine(String.Format("Approximate memory usage: {0:F2} MB", approxMB));
    Console.WriteLine(String.Format("Approximate execution time (including model compilation): {0} seconds", stopWatch.ElapsedMilliseconds / 1000));

    // Calculate average log evidence over total training words
    int totalWords = trainWordsInTrainDoc.Sum(doc => doc.Sum(w => w.Value));
    Console.WriteLine("\nTotal number of training words = {0}", totalWords);
    Console.WriteLine(String.Format("Average log evidence of model: {0:F2}", logEvidence / (double)totalWords));

    if (vocabulary != null)
    {
        int numWordsToPrint = 20;

        // Print out the top n words for each topic
        for (int i = 0; i < postPhi.Length; i++)
        {
            double[] pc = postPhi[i].PseudoCount.ToArray();
            int[] wordIndices = new int[pc.Length];
            for (int j = 0; j < wordIndices.Length; j++)
            {
                wordIndices[j] = j;
            }

            // Sort pseudo-counts ascending, permuting the word indices alongside
            Array.Sort(pc, wordIndices);
            Console.WriteLine("Top {0} words in topic {1}:", numWordsToPrint, i);
            int idx = wordIndices.Length;
            for (int j = 0; j < numWordsToPrint; j++)
            {
                Console.Write("\t{0}", vocabulary[wordIndices[--idx]]);
            }

            Console.WriteLine();
        }
    }

    if (testWordsInTrainDoc != null)
    {
        // Test on unseen words in training documents
        Console.WriteLine("\n\nCalculating perplexity on test words in training documents...");
        predictionModel = new LDAPredictionModel(sizeVocab, numTopics);
        predictionModel.Engine.NumberOfIterations = 5;
        var predDist = predictionModel.Predict(postTheta, postPhi);
        var perplexity = Utilities.Perplexity(predDist, testWordsInTrainDoc);
        Console.WriteLine(String.Format("\nPerplexity = {0:F3}", perplexity));
    }

    if (wordsInTestDoc != null)
    {
        // Test on unseen documents. Note that topic ids for the trained model will be a random
        // permutation of the topic ids for the ground truth
        Console.WriteLine("\n\nInferring topics for test documents...");
        topicInfModel = new LDATopicInferenceModel(sizeVocab, numTopics);
        topicInfModel.Engine.NumberOfIterations = 10;
        var inferredTopicDists = topicInfModel.InferTopic(alpha, postPhi, wordsInTestDoc);
        Dictionary<TopicPair, int> topicPairCounts = new Dictionary<TopicPair, int>();
        for (int i = 0; i < inferredTopicDists.Length; i++)
        {
            int infTopic = inferredTopicDists[i].PseudoCount.IndexOfMaximum();
            int trueTopic = trueThetaTest[i].PseudoCount.IndexOfMaximum();
            TopicPair tp = new TopicPair() { InferredTopic = infTopic, TrueTopic = trueTopic };
            if (!topicPairCounts.ContainsKey(tp))
            {
                topicPairCounts.Add(tp, 1);
            }
            else
            {
                topicPairCounts[tp] = topicPairCounts[tp] + 1;
            }
        }

        var correctCount = CountCorrectTopicPredictions(topicPairCounts, numTopics);
        Console.WriteLine(String.Format("Maximum inferred topic matches maximum true topic {0} times out of {1}", correctCount, inferredTopicDists.Length));
        Console.WriteLine("\nThis uses a greedy algorithm to determine the mapping from inferred topic indices to true topic indices");
        Console.WriteLine("\n************************************");
    }
}
/// <summary>
/// Runs inference on the LDA model.
/// <para>
/// Words in documents are observed, topic distributions per document (<see cref="Theta"/>)
/// and word distributions per topic (<see cref="Phi"/>) are inferred.
/// </para>
/// </summary>
/// <param name="wordsInDoc">For each document, the unique word counts in the document</param>
/// <param name="alpha">Hyper-parameter for <see cref="Theta"/></param>
/// <param name="beta">Hyper-parameter for <see cref="Phi"/></param>
/// <param name="postTheta">Posterior marginals for <see cref="Theta"/></param>
/// <param name="postPhi">Posterior marginals for <see cref="Phi"/></param>
/// <returns>Log evidence - can be used for model selection.</returns>
public virtual double Infer(Dictionary<int, int>[] wordsInDoc, double alpha, double beta, out Dirichlet[] postTheta, out Dirichlet[] postPhi)
{
    int numDocs = wordsInDoc.Length;
    var thetaPosterior = new Dirichlet[numDocs];
    int numIters = Engine.NumberOfIterations;
    bool showProgress = Engine.ShowProgress;
    Engine.ShowProgress = false; // temporarily disable Infer.NET progress

    // Set up document index boundaries for each batch
    double numDocsPerBatch = ((double)numDocs) / NumBatches;
    if (numDocsPerBatch == 0)
    {
        numDocsPerBatch = 1;
    }

    int[] boundary = new int[NumBatches + 1];
    boundary[0] = 0;
    double currBoundary = 0.0;
    for (int batch = 1; batch <= NumBatches; batch++)
    {
        currBoundary += numDocsPerBatch;
        int bnd = (int)currBoundary;
        if (bnd > numDocs)
        {
            bnd = numDocs;
        }

        boundary[batch] = bnd;
    }

    boundary[NumBatches] = numDocs;
    PhiPrior.ObservedValue = new Dirichlet[NumTopics];
    for (int i = 0; i < NumTopics; i++)
    {
        PhiPrior.ObservedValue[i] = Dirichlet.Symmetric(SizeVocab, beta);
    }

    NumDocuments.ObservedValue = -1;
    try
    {
        for (int pass = 0; pass < NumPasses; pass++)
        {
            Engine.NumberOfIterations = IterationsPerPass[pass];
            if (showProgress)
            {
                Console.Write(String.Format(
                    "\nPass {0} ({1} iteration{2} per batch)",
                    pass, IterationsPerPass[pass], IterationsPerPass[pass] == 1 ? "" : "s"));
            }

            PhiDefModel.InferShared(EnginePhiDef, 0);
            for (int batch = 0; batch < NumBatches; batch++)
            {
                int startDoc = boundary[batch];
                int endDoc = boundary[batch + 1];
                if (startDoc >= numDocs)
                {
                    break;
                }

                int numDocsInThisBatch = endDoc - startDoc;

                // Set up the observed values
                if (NumDocuments.ObservedValue != numDocsInThisBatch)
                {
                    NumDocuments.ObservedValue = numDocsInThisBatch;
                    ThetaPrior.ObservedValue = new Dirichlet[numDocsInThisBatch];
                    for (int i = 0; i < numDocsInThisBatch; i++)
                    {
                        ThetaPrior.ObservedValue[i] = Dirichlet.Symmetric(NumTopics, alpha);
                    }
                }

                if (pass == 0)
                {
                    ThetaInit.ObservedValue = LDAModel.GetInitialisation(numDocsInThisBatch, NumTopics, ThetaSparsity);
                }
                else
                {
                    ThetaInit.ObservedValue = Util.ArrayInit(numDocsInThisBatch, d => new Dirichlet(thetaPosterior[d + startDoc]));
                }

                int[] numWordsInDocBatch = new int[numDocsInThisBatch];
                int[][] wordsInDocBatch = new int[numDocsInThisBatch][];
                double[][] wordCountsInDocBatch = new double[numDocsInThisBatch][];
                for (int i = 0, j = startDoc; j < endDoc; i++, j++)
                {
                    numWordsInDocBatch[i] = wordsInDoc[j].Count;
                    wordsInDocBatch[i] = wordsInDoc[j].Keys.ToArray();
                    ICollection<int> cnts = wordsInDoc[j].Values;
                    wordCountsInDocBatch[i] = new double[cnts.Count];
                    int k = 0;
                    foreach (int val in cnts)
                    {
                        wordCountsInDocBatch[i][k++] = (double)val;
                    }
                }

                NumWordsInDoc.ObservedValue = numWordsInDocBatch;
                Words.ObservedValue = wordsInDocBatch;
                WordCounts.ObservedValue = wordCountsInDocBatch;
                DocModel.InferShared(Engine, batch);
                var postThetaBatch = Engine.Infer<Dirichlet[]>(Theta);
                for (int i = 0, j = startDoc; j < endDoc; i++, j++)
                {
                    thetaPosterior[j] = postThetaBatch[i];
                }

                if (showProgress)
                {
                    if ((batch % 80) == 0)
                    {
                        Console.WriteLine("");
                    }

                    Console.Write(".");
                }
            }
        }
    }
    finally
    {
        Engine.NumberOfIterations = numIters;
        Engine.ShowProgress = showProgress;
    }

    if (showProgress)
    {
        Console.WriteLine();
    }

    postTheta = thetaPosterior;
    postPhi = Phi.Marginal<Dirichlet[]>();
    return Model.GetEvidenceForAll(PhiDefModel, DocModel);
}