Beispiel #1
0
		/// <summary>
		/// Run a single test for a single model
		/// </summary>
		/// <param name="sizeVocab">Size of the vocabulary</param>
		/// <param name="numTopics">Number of topics</param>
		/// <param name="trainWordsInTrainDoc">Lists of words in training documents used for training</param>
		/// <param name="testWordsInTrainDoc">Lists of words in training documents used for testing</param>
		/// <param name="alpha">Background pseudo-counts for distributions over topics</param>
		/// <param name="beta">Background pseudo-counts for distributions over words</param>
		/// <param name="shared">If true, uses shared variable version of the model</param>
		/// <param name="trueThetaTest">The true topic distributions for the documents in the test set</param>
		/// <param name="wordsInTestDoc">Lists of words in test documents</param>
		/// <param name="vocabulary">Vocabulary</param>
		static void RunTest(
			int sizeVocab, 
			int numTopics,
			Dictionary<int, int>[] trainWordsInTrainDoc, 
			Dictionary<int, int>[] testWordsInTrainDoc,
			double alpha, 
			double beta, 
			bool shared,
			Dirichlet[] trueThetaTest, 
			Dictionary<int, int>[] wordsInTestDoc,
			Dictionary<int, string> vocabulary = null
			)
		{
			Stopwatch stopWatch = new Stopwatch();
			// Square root of number of documents is the optimal for memory
			int batchCount = (int)Math.Sqrt((double)trainWordsInTrainDoc.Length);
			Rand.Restart(5);
			ILDA model;
			LDAPredictionModel predictionModel;
			LDATopicInferenceModel topicInfModel;
			if (shared)
			{
				model = new LDAShared(batchCount, sizeVocab, numTopics);
				((LDAShared)model).IterationsPerPass = Enumerable.Repeat(10, 5).ToArray();
			}
			else
			{
				model = new LDAModel(sizeVocab, numTopics);
				model.Engine.NumberOfIterations = 50;
			}

			Console.WriteLine("\n\n************************************");
			Console.WriteLine(
				String.Format("\nTraining {0}LDA model...\n",
				shared ? "batched " : "non-batched "));

			// Train the model - we will also get rough estimates of execution time and memory
			Dirichlet[] postTheta, postPhi;
			GC.Collect();
			PerformanceCounter memCounter = new PerformanceCounter("Memory", "Available MBytes");
			float preMem = memCounter.NextValue();
			stopWatch.Reset();
			stopWatch.Start();
			double logEvidence = model.Infer(trainWordsInTrainDoc, alpha, beta, out postTheta, out postPhi);
			stopWatch.Stop();
			float postMem = memCounter.NextValue();
			double approxMB = preMem - postMem;
			GC.KeepAlive(model); // Keep the model alive to this point (for the	memory counter)
			Console.WriteLine(String.Format("Approximate memory usage: {0:F2} MB", approxMB));
			Console.WriteLine(String.Format("Approximate execution time (including model compilation): {0} seconds", stopWatch.ElapsedMilliseconds/1000));

			// Calculate average log evidence over total training words
			int totalWords = trainWordsInTrainDoc.Sum(doc => doc.Sum(w => w.Value));
			Console.WriteLine("\nTotal number of training words = {0}", totalWords);
			Console.WriteLine(String.Format("Average log evidence of model: {0:F2}", logEvidence / (double)totalWords));

			if (vocabulary != null)
			{
				int numWordsToPrint = 20;
				// Print out the top n words for each topic
				for (int i = 0; i < postPhi.Length; i++)
				{
					double[] pc = postPhi[i].PseudoCount.ToArray();
					int[] wordIndices = new int[pc.Length];
					for (int j=0; j < wordIndices.Length; j++)
						wordIndices[j] = j;
					Array.Sort(pc, wordIndices);
					Console.WriteLine("Top {0} words in topic {1}:", numWordsToPrint, i);
					int idx = wordIndices.Length;
					for (int j = 0; j < numWordsToPrint; j++)
						Console.Write("\t{0}", vocabulary[wordIndices[--idx]]);
					Console.WriteLine();
				}
			}

			if (testWordsInTrainDoc != null)
			{
				// Test on unseen words in training documents
				Console.WriteLine("\n\nCalculating perplexity on test words in training documents...");
				predictionModel = new LDAPredictionModel(sizeVocab, numTopics);
				predictionModel.Engine.NumberOfIterations = 5;
				var predDist = predictionModel.Predict(postTheta, postPhi);
				var perplexity = Utilities.Perplexity(predDist, testWordsInTrainDoc);
				Console.WriteLine(String.Format("\nPerplexity = {0:F3}", perplexity));
			}

			if (wordsInTestDoc != null)
			{
				// Test on unseen documents. Note that topic ids for the trained model will be a random
				// permutation of the topic ids for the ground truth
				Console.WriteLine("\n\nInferring topics for test documents...");
				topicInfModel = new LDATopicInferenceModel(sizeVocab, numTopics);
				topicInfModel.Engine.NumberOfIterations = 10;
				var inferredTopicDists = topicInfModel.InferTopic(alpha, postPhi, wordsInTestDoc);
				Dictionary<TopicPair,int> topicPairCounts = new Dictionary<TopicPair, int>();
				for (int i = 0; i < inferredTopicDists.Length; i++)
				{
					int infTopic = inferredTopicDists[i].PseudoCount.IndexOfMaximum();
					int trueTopic = trueThetaTest[i].PseudoCount.IndexOfMaximum();
					TopicPair tp = new TopicPair() { InferredTopic = infTopic, TrueTopic = trueTopic };
					if (!topicPairCounts.ContainsKey(tp)) topicPairCounts.Add(tp, 1);
					else topicPairCounts[tp] = topicPairCounts[tp]+1;
				}
				var correctCount = CountCorrectTopicPredictions(topicPairCounts, numTopics);
				Console.WriteLine(String.Format("Maximum inferred topic matches maximum true topic {0} times out of {1}",
					correctCount, inferredTopicDists.Length));
				Console.WriteLine("\nThis uses a greedy algorithm to determine the mapping from inferred topic indices to true topic indices");
				Console.WriteLine("\n************************************");
			}
		}
Beispiel #2
0
        /// <summary>
        /// Run a single test for a single model
        /// </summary>
        /// <param name="sizeVocab">Size of the vocabulary</param>
        /// <param name="numTopics">Number of topics</param>
        /// <param name="trainWordsInTrainDoc">Lists of words in training documents used for training</param>
        /// <param name="testWordsInTrainDoc">Lists of words in training documents used for testing</param>
        /// <param name="alpha">Background pseudo-counts for distributions over topics</param>
        /// <param name="beta">Background pseudo-counts for distributions over words</param>
        /// <param name="shared">If true, uses shared variable version of the model</param>
        /// <param name="trueThetaTest">The true topic distributions for the documents in the test set</param>
        /// <param name="wordsInTestDoc">Lists of words in test documents</param>
        /// <param name="vocabulary">Vocabulary</param>
        static void RunTest(
            int sizeVocab,
            int numTopics,
            Dictionary <int, int>[] trainWordsInTrainDoc,
            Dictionary <int, int>[] testWordsInTrainDoc,
            double alpha,
            double beta,
            bool shared,
            Dirichlet[] trueThetaTest,
            Dictionary <int, int>[] wordsInTestDoc,
            Dictionary <int, string> vocabulary = null)
        {
            Stopwatch stopWatch = new Stopwatch();

            // Square root of number of documents is the optimal for memory
            int batchCount = (int)Math.Sqrt((double)trainWordsInTrainDoc.Length);

            Rand.Restart(5);
            ILDA model;
            LDAPredictionModel     predictionModel;
            LDATopicInferenceModel topicInfModel;

            if (shared)
            {
                model = new LDAShared(batchCount, sizeVocab, numTopics);
                ((LDAShared)model).IterationsPerPass = Enumerable.Repeat(10, 5).ToArray();
            }
            else
            {
                model = new LDAModel(sizeVocab, numTopics);
                model.Engine.NumberOfIterations = 50;
            }

            Console.WriteLine("\n\n************************************");
            Console.WriteLine(String.Format("\nTraining {0}LDA model...\n", shared ? "batched " : "non-batched "));

            // Train the model - we will also get rough estimates of execution time and memory
            Dirichlet[] postTheta, postPhi;
            GC.Collect();
            PerformanceCounter memCounter = new PerformanceCounter("Memory", "Available MBytes");
            float preMem = memCounter.NextValue();

            stopWatch.Reset();
            stopWatch.Start();
            double logEvidence = model.Infer(trainWordsInTrainDoc, alpha, beta, out postTheta, out postPhi);

            stopWatch.Stop();
            float  postMem  = memCounter.NextValue();
            double approxMB = preMem - postMem;

            GC.KeepAlive(model); // Keep the model alive to this point (for the memory counter)
            Console.WriteLine(String.Format("Approximate memory usage: {0:F2} MB", approxMB));
            Console.WriteLine(String.Format("Approximate execution time (including model compilation): {0} seconds", stopWatch.ElapsedMilliseconds / 1000));

            // Calculate average log evidence over total training words
            int totalWords = trainWordsInTrainDoc.Sum(doc => doc.Sum(w => w.Value));

            Console.WriteLine("\nTotal number of training words = {0}", totalWords);
            Console.WriteLine(String.Format("Average log evidence of model: {0:F2}", logEvidence / (double)totalWords));

            if (vocabulary != null)
            {
                int numWordsToPrint = 20;

                // Print out the top n words for each topic
                for (int i = 0; i < postPhi.Length; i++)
                {
                    double[] pc          = postPhi[i].PseudoCount.ToArray();
                    int[]    wordIndices = new int[pc.Length];
                    for (int j = 0; j < wordIndices.Length; j++)
                    {
                        wordIndices[j] = j;
                    }

                    Array.Sort(pc, wordIndices);
                    Console.WriteLine("Top {0} words in topic {1}:", numWordsToPrint, i);
                    int idx = wordIndices.Length;
                    for (int j = 0; j < numWordsToPrint; j++)
                    {
                        Console.Write("\t{0}", vocabulary[wordIndices[--idx]]);
                    }

                    Console.WriteLine();
                }
            }

            if (testWordsInTrainDoc != null)
            {
                // Test on unseen words in training documents
                Console.WriteLine("\n\nCalculating perplexity on test words in training documents...");
                predictionModel = new LDAPredictionModel(sizeVocab, numTopics);
                predictionModel.Engine.NumberOfIterations = 5;
                var predDist   = predictionModel.Predict(postTheta, postPhi);
                var perplexity = Utilities.Perplexity(predDist, testWordsInTrainDoc);
                Console.WriteLine(String.Format("\nPerplexity = {0:F3}", perplexity));
            }

            if (wordsInTestDoc != null)
            {
                // Test on unseen documents. Note that topic ids for the trained model will be a random
                // permutation of the topic ids for the ground truth
                Console.WriteLine("\n\nInferring topics for test documents...");
                topicInfModel = new LDATopicInferenceModel(sizeVocab, numTopics);
                topicInfModel.Engine.NumberOfIterations = 10;
                var inferredTopicDists = topicInfModel.InferTopic(alpha, postPhi, wordsInTestDoc);
                Dictionary <TopicPair, int> topicPairCounts = new Dictionary <TopicPair, int>();
                for (int i = 0; i < inferredTopicDists.Length; i++)
                {
                    int       infTopic  = inferredTopicDists[i].PseudoCount.IndexOfMaximum();
                    int       trueTopic = trueThetaTest[i].PseudoCount.IndexOfMaximum();
                    TopicPair tp        = new TopicPair()
                    {
                        InferredTopic = infTopic, TrueTopic = trueTopic
                    };
                    if (!topicPairCounts.ContainsKey(tp))
                    {
                        topicPairCounts.Add(tp, 1);
                    }
                    else
                    {
                        topicPairCounts[tp] = topicPairCounts[tp] + 1;
                    }
                }

                var correctCount = CountCorrectTopicPredictions(topicPairCounts, numTopics);
                Console.WriteLine(String.Format("Maximum inferred topic matches maximum true topic {0} times out of {1}", correctCount, inferredTopicDists.Length));
                Console.WriteLine("\nThis uses a greedy algorithm to determine the mapping from inferred topic indices to true topic indices");
                Console.WriteLine("\n************************************");
            }
        }