Example 1
        /// <summary>
        /// Run a single test for a single model
        /// </summary>
        /// <param name="sizeVocab">Size of the vocabulary</param>
        /// <param name="numTopics">Number of topics</param>
        /// <param name="trainWordsInTrainDoc">Lists of words in training documents used for training</param>
        /// <param name="testWordsInTrainDoc">Lists of words in training documents used for testing</param>
        /// <param name="alpha">Background pseudo-counts for distributions over topics</param>
        /// <param name="beta">Background pseudo-counts for distributions over words</param>
        /// <param name="shared">If true, uses shared variable version of the model</param>
        /// <param name="trueThetaTest">The true topic distributions for the documents in the test set</param>
        /// <param name="wordsInTestDoc">Lists of words in test documents</param>
        /// <param name="vocabulary">Vocabulary</param>
        static void RunTest(
            int sizeVocab,
            int numTopics,
            Dictionary <int, int>[] trainWordsInTrainDoc,
            Dictionary <int, int>[] testWordsInTrainDoc,
            double alpha,
            double beta,
            bool shared,
            Dirichlet[] trueThetaTest,
            Dictionary <int, int>[] wordsInTestDoc,
            Dictionary <int, string> vocabulary = null)
        {
            Stopwatch stopWatch = new Stopwatch();

            // Square root of number of documents is the optimal for memory
            int batchCount = (int)Math.Sqrt((double)trainWordsInTrainDoc.Length);

            Rand.Restart(5);
            ILDA model;
            LDAPredictionModel     predictionModel;
            LDATopicInferenceModel topicInfModel;

            if (shared)
            {
                model = new LDAShared(batchCount, sizeVocab, numTopics);
                ((LDAShared)model).IterationsPerPass = Enumerable.Repeat(10, 5).ToArray();
            }
            else
            {
                model = new LDAModel(sizeVocab, numTopics);
                model.Engine.NumberOfIterations = 50;
            }

            Console.WriteLine("\n\n************************************");
            Console.WriteLine(String.Format("\nTraining {0}LDA model...\n", shared ? "batched " : "non-batched "));

            // Train the model - we will also get rough estimates of execution time and memory
            GC.Collect();
            PerformanceCounter memCounter = new PerformanceCounter("Memory", "Available MBytes");
            float preMem = memCounter.NextValue();

            stopWatch.Reset();
            stopWatch.Start();
            double logEvidence = model.Infer(trainWordsInTrainDoc, alpha, beta, out Dirichlet[] postTheta, out Dirichlet[] postPhi);
Example 2
        /// <summary>
        /// Run a single test for a single model
        /// </summary>
        /// <param name="sizeVocab">Size of the vocabulary</param>
        /// <param name="numTopics">Number of topics</param>
        /// <param name="trainWordsInTrainDoc">Lists of words in training documents used for training</param>
        /// <param name="testWordsInTrainDoc">Lists of words in training documents used for testing; null skips the perplexity test</param>
        /// <param name="alpha">Background pseudo-counts for distributions over topics</param>
        /// <param name="beta">Background pseudo-counts for distributions over words</param>
        /// <param name="shared">If true, uses shared variable (batched) version of the model</param>
        /// <param name="trueThetaTest">The true topic distributions for the documents in the test set</param>
        /// <param name="wordsInTestDoc">Lists of words in test documents; null skips topic inference on unseen documents</param>
        /// <param name="vocabulary">Optional vocabulary; when supplied, the top words per topic are printed</param>
        static void RunTest(
            int sizeVocab,
            int numTopics,
            Dictionary <int, int>[] trainWordsInTrainDoc,
            Dictionary <int, int>[] testWordsInTrainDoc,
            double alpha,
            double beta,
            bool shared,
            Dirichlet[] trueThetaTest,
            Dictionary <int, int>[] wordsInTestDoc,
            Dictionary <int, string> vocabulary = null)
        {
            Stopwatch stopWatch = new Stopwatch();

            // Square root of number of documents is the optimal for memory
            int batchCount = (int)Math.Sqrt((double)trainWordsInTrainDoc.Length);

            // Fixed seed so results are reproducible across runs.
            Rand.Restart(5);
            ILDA model;
            LDAPredictionModel     predictionModel;
            LDATopicInferenceModel topicInfModel;

            if (shared)
            {
                model = new LDAShared(batchCount, sizeVocab, numTopics);
                ((LDAShared)model).IterationsPerPass = Enumerable.Repeat(10, 5).ToArray();
            }
            else
            {
                model = new LDAModel(sizeVocab, numTopics);
                model.Engine.NumberOfIterations = 50;
            }

            Console.WriteLine("\n\n************************************");
            Console.WriteLine("\nTraining {0}LDA model...\n", shared ? "batched " : "non-batched ");

            // Train the model - we will also get rough estimates of execution time and memory.
            // The explicit GC.Collect() establishes a clean baseline for the memory counter;
            // this is deliberate measurement scaffolding, not production practice.
            GC.Collect();
            // NOTE(review): PerformanceCounter is Windows-only — confirm target platforms.
            PerformanceCounter memCounter = new PerformanceCounter("Memory", "Available MBytes");
            float preMem = memCounter.NextValue();

            stopWatch.Restart();
            double logEvidence = model.Infer(trainWordsInTrainDoc, alpha, beta, out Dirichlet[] postTheta, out Dirichlet[] postPhi);

            stopWatch.Stop();
            float  postMem  = memCounter.NextValue();
            double approxMB = preMem - postMem; // available memory drops as the model allocates

            GC.KeepAlive(model); // Keep the model alive to this point (for the memory counter)
            Console.WriteLine("Approximate memory usage: {0:F2} MB", approxMB);
            // Elapsed.TotalSeconds fixes the integer truncation of ElapsedMilliseconds / 1000,
            // which reported "0 seconds" for any sub-second run.
            Console.WriteLine("Approximate execution time (including model compilation): {0:F2} seconds", stopWatch.Elapsed.TotalSeconds);

            // Calculate average log evidence over total training words
            int totalWords = trainWordsInTrainDoc.Sum(doc => doc.Sum(w => w.Value));

            Console.WriteLine("\nTotal number of training words = {0}", totalWords);
            Console.WriteLine("Average log evidence of model: {0:F2}", logEvidence / (double)totalWords);

            if (vocabulary != null)
            {
                int numWordsToPrint = 20;

                // Print out the top n words for each topic (largest pseudo-count first).
                for (int i = 0; i < postPhi.Length; i++)
                {
                    double[] pc          = postPhi[i].PseudoCount.ToArray();
                    int[]    wordIndices = Enumerable.Range(0, pc.Length).ToArray();

                    // Co-sort indices by pseudo-count (ascending), then read from the tail.
                    Array.Sort(pc, wordIndices);
                    Console.WriteLine("Top {0} words in topic {1}:", numWordsToPrint, i);
                    int idx = wordIndices.Length;
                    for (int j = 0; j < numWordsToPrint; j++)
                    {
                        Console.Write("\t{0}", vocabulary[wordIndices[--idx]]);
                    }

                    Console.WriteLine();
                }
            }

            if (testWordsInTrainDoc != null)
            {
                // Test on unseen words in training documents
                Console.WriteLine("\n\nCalculating perplexity on test words in training documents...");
                predictionModel = new LDAPredictionModel(sizeVocab, numTopics);
                predictionModel.Engine.NumberOfIterations = 5;
                var predDist   = predictionModel.Predict(postTheta, postPhi);
                var perplexity = Utilities.Perplexity(predDist, testWordsInTrainDoc);
                Console.WriteLine("\nPerplexity = {0:F3}", perplexity);
            }

            if (wordsInTestDoc != null)
            {
                // Test on unseen documents. Note that topic ids for the trained model will be a random
                // permutation of the topic ids for the ground truth
                Console.WriteLine("\n\nInferring topics for test documents...");
                topicInfModel = new LDATopicInferenceModel(sizeVocab, numTopics);
                topicInfModel.Engine.NumberOfIterations = 10;
                var inferredTopicDists = topicInfModel.InferTopic(alpha, postPhi, wordsInTestDoc);
                Dictionary <TopicPair, int> topicPairCounts = new Dictionary <TopicPair, int>();
                for (int i = 0; i < inferredTopicDists.Length; i++)
                {
                    int       infTopic  = inferredTopicDists[i].PseudoCount.IndexOfMaximum();
                    int       trueTopic = trueThetaTest[i].PseudoCount.IndexOfMaximum();
                    TopicPair tp        = new TopicPair()
                    {
                        InferredTopic = infTopic, TrueTopic = trueTopic
                    };

                    // Single-lookup upsert: TryGetValue leaves count == 0 for an absent key,
                    // replacing the ContainsKey + indexer double lookup.
                    topicPairCounts.TryGetValue(tp, out int count);
                    topicPairCounts[tp] = count + 1;
                }

                var correctCount = CountCorrectTopicPredictions(topicPairCounts, numTopics);
                Console.WriteLine("Maximum inferred topic matches maximum true topic {0} times out of {1}", correctCount, inferredTopicDists.Length);
                Console.WriteLine("\nThis uses a greedy algorithm to determine the mapping from inferred topic indices to true topic indices");
                Console.WriteLine("\n************************************");
            }
        }