Example #1
        /// <summary>
        /// Run a single test for a single model
        /// </summary>
        /// <param name="sizeVocab">Size of the vocabulary</param>
        /// <param name="numTopics">Number of topics</param>
        /// <param name="trainWordsInTrainDoc">Lists of words in training documents used for training</param>
        /// <param name="testWordsInTrainDoc">Lists of words in training documents used for testing</param>
        /// <param name="alpha">Background pseudo-counts for distributions over topics</param>
        /// <param name="beta">Background pseudo-counts for distributions over words</param>
        /// <param name="shared">If true, uses shared variable version of the model</param>
        /// <param name="trueThetaTest">The true topic distributions for the documents in the test set</param>
        /// <param name="wordsInTestDoc">Lists of words in test documents</param>
        /// <param name="vocabulary">Vocabulary</param>
        static void RunTest(
            int sizeVocab,
            int numTopics,
            Dictionary<int, int>[] trainWordsInTrainDoc,
            Dictionary<int, int>[] testWordsInTrainDoc,
            double alpha,
            double beta,
            bool shared,
            Dirichlet[] trueThetaTest,
            Dictionary<int, int>[] wordsInTestDoc,
            Dictionary<int, string> vocabulary = null)
        {
            Stopwatch stopWatch = new Stopwatch();

            // The square root of the number of documents is optimal for memory usage
            int batchCount = (int)Math.Sqrt((double)trainWordsInTrainDoc.Length);
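            // Illustrative numbers (not from the sample): 10,000 training documents
            // give batchCount = 100, i.e. roughly 100 documents per batch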

            Rand.Restart(5);
            ILDA model;
            LDAPredictionModel     predictionModel;
            LDATopicInferenceModel topicInfModel;

            if (shared)
            {
                model = new LDAShared(batchCount, sizeVocab, numTopics);
                ((LDAShared)model).IterationsPerPass = Enumerable.Repeat(10, 5).ToArray();
            }
            else
            {
                model = new LDAModel(sizeVocab, numTopics);
                model.Engine.NumberOfIterations = 50;
            }

            Console.WriteLine("\n\n************************************");
            Console.WriteLine(String.Format("\nTraining {0}LDA model...\n", shared ? "batched " : "non-batched "));

            // Train the model - we will also get rough estimates of execution time and memory
            Dirichlet[] postTheta, postPhi;
            GC.Collect();
            PerformanceCounter memCounter = new PerformanceCounter("Memory", "Available MBytes");
            float preMem = memCounter.NextValue();

            stopWatch.Reset();
            stopWatch.Start();
            double logEvidence = model.Infer(trainWordsInTrainDoc, alpha, beta, out postTheta, out postPhi);

            stopWatch.Stop();
            float  postMem  = memCounter.NextValue();
            double approxMB = preMem - postMem;

            GC.KeepAlive(model); // Keep the model alive to this point (for the memory counter)
            Console.WriteLine(String.Format("Approximate memory usage: {0:F2} MB", approxMB));
            Console.WriteLine(String.Format("Approximate execution time (including model compilation): {0} seconds", stopWatch.ElapsedMilliseconds / 1000));

            // Calculate average log evidence over total training words
            int totalWords = trainWordsInTrainDoc.Sum(doc => doc.Sum(w => w.Value));

            Console.WriteLine("\nTotal number of training words = {0}", totalWords);
            Console.WriteLine(String.Format("Average log evidence of model: {0:F2}", logEvidence / (double)totalWords));

            if (vocabulary != null)
            {
                int numWordsToPrint = 20;

                // Print out the top n words for each topic
                for (int i = 0; i < postPhi.Length; i++)
                {
                    double[] pc          = postPhi[i].PseudoCount.ToArray();
                    int[]    wordIndices = new int[pc.Length];
                    for (int j = 0; j < wordIndices.Length; j++)
                    {
                        wordIndices[j] = j;
                    }

                    // Sort word indices by ascending pseudo-count; the most probable words end up last
                    Array.Sort(pc, wordIndices);
                    Console.WriteLine("Top {0} words in topic {1}:", numWordsToPrint, i);
                    int idx = wordIndices.Length;
                    for (int j = 0; j < numWordsToPrint; j++)
                    {
                        Console.Write("\t{0}", vocabulary[wordIndices[--idx]]);
                    }

                    Console.WriteLine();
                }
            }

            if (testWordsInTrainDoc != null)
            {
                // Test on unseen words in training documents
                Console.WriteLine("\n\nCalculating perplexity on test words in training documents...");
                predictionModel = new LDAPredictionModel(sizeVocab, numTopics);
                predictionModel.Engine.NumberOfIterations = 5;
                var predDist   = predictionModel.Predict(postTheta, postPhi);
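                // Perplexity is exp(-average per-word log predictive probability); lower is
                // better, and a uniform predictor over the vocabulary would score sizeVocab
                // (assuming that is what Utilities.Perplexity computes)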
                var perplexity = Utilities.Perplexity(predDist, testWordsInTrainDoc);
                Console.WriteLine(String.Format("\nPerplexity = {0:F3}", perplexity));
            }

            if (wordsInTestDoc != null)
            {
                // Test on unseen documents. Note that topic ids for the trained model will be a random
                // permutation of the topic ids for the ground truth
                Console.WriteLine("\n\nInferring topics for test documents...");
                topicInfModel = new LDATopicInferenceModel(sizeVocab, numTopics);
                topicInfModel.Engine.NumberOfIterations = 10;
                var inferredTopicDists = topicInfModel.InferTopic(alpha, postPhi, wordsInTestDoc);
                Dictionary<TopicPair, int> topicPairCounts = new Dictionary<TopicPair, int>();
                for (int i = 0; i < inferredTopicDists.Length; i++)
                {
                    int       infTopic  = inferredTopicDists[i].PseudoCount.IndexOfMaximum();
                    int       trueTopic = trueThetaTest[i].PseudoCount.IndexOfMaximum();
                    TopicPair tp        = new TopicPair()
                    {
                        InferredTopic = infTopic, TrueTopic = trueTopic
                    };
                    if (!topicPairCounts.ContainsKey(tp))
                    {
                        topicPairCounts.Add(tp, 1);
                    }
                    else
                    {
                        topicPairCounts[tp] = topicPairCounts[tp] + 1;
                    }
                }

                var correctCount = CountCorrectTopicPredictions(topicPairCounts, numTopics);
                Console.WriteLine(String.Format("Maximum inferred topic matches maximum true topic {0} times out of {1}", correctCount, inferredTopicDists.Length));
                Console.WriteLine("\nThis uses a greedy algorithm to determine the mapping from inferred topic indices to true topic indices");
                Console.WriteLine("\n************************************");
            }
        }
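
A minimal sketch of how RunTest might be driven end to end. Everything below is illustrative: the random bag-of-words corpus, sizes, and hyper-parameters are assumptions rather than part of the sample, the held-out tests are skipped by passing null, and the Infer.NET LDA sample's types are assumed to be in scope.

        static void Main()
        {
            int sizeVocab = 200;  // assumed vocabulary size
            int numTopics = 5;    // assumed number of topics
            int numDocs   = 50;   // assumed corpus size
            Random rng    = new Random(12345);

            // Build one synthetic word-index -> count dictionary per document
            var trainWords = new Dictionary<int, int>[numDocs];
            for (int d = 0; d < numDocs; d++)
            {
                trainWords[d] = new Dictionary<int, int>();
                for (int n = 0; n < 100; n++)
                {
                    int w = rng.Next(sizeVocab);
                    trainWords[d][w] = trainWords[d].TryGetValue(w, out int c) ? c + 1 : 1;
                }
            }

            // Null test arguments skip the perplexity and topic-recovery sections
            RunTest(sizeVocab, numTopics, trainWords, null, 1.0, 0.1, false, null, null);
        }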
Example #2
        /// <summary>
        /// Runs inference on the LDA model.
        /// <para>
        /// Words in documents are observed, topic distributions per document (<see cref="Theta"/>)
        /// and word distributions per topic (<see cref="Phi"/>) are inferred.
        /// </para>
        /// </summary>
        /// <param name="wordsInDoc">For each document, the unique word counts in the document</param>
        /// <param name="alpha">Hyper-parameter for <see cref="Theta"/></param>
        /// <param name="beta">Hyper-parameter for <see cref="Phi"/></param>
        /// <param name="postTheta">Posterior marginals for <see cref="Theta"/></param>
        /// <param name="postPhi">Posterior marginals for <see cref="Phi"/></param>
        /// <returns>Log evidence - can be used for model selection.</returns>
        public virtual double Infer(Dictionary<int, int>[] wordsInDoc, double alpha, double beta, out Dirichlet[] postTheta, out Dirichlet[] postPhi)
        {
            int  numDocs        = wordsInDoc.Length;
            var  thetaPosterior = new Dirichlet[numDocs];
            int  numIters       = Engine.NumberOfIterations;
            bool showProgress   = Engine.ShowProgress;

            Engine.ShowProgress = false; // temporarily disable Infer.NET progress

            // Set up document index boundaries for each batch; ensure each batch spans
            // at least one document when there are more batches than documents
            double numDocsPerBatch = ((double)numDocs) / NumBatches;

            if (numDocsPerBatch < 1)
            {
                numDocsPerBatch = 1;
            }

            int[] boundary = new int[NumBatches + 1];
            boundary[0] = 0;
            double currBoundary = 0.0;

            for (int batch = 1; batch <= NumBatches; batch++)
            {
                currBoundary += numDocsPerBatch;
                int bnd = (int)currBoundary;
                if (bnd > numDocs)
                {
                    bnd = numDocs;
                }

                boundary[batch] = bnd;
            }

            boundary[NumBatches] = numDocs;
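            // Resulting layout, with hypothetical numbers: numDocs = 10 and NumBatches = 4
            // give numDocsPerBatch = 2.5 and boundary = [0, 2, 5, 7, 10], i.e. batches of
            // 2, 3, 2 and 3 documents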

            PhiPrior.ObservedValue = new Dirichlet[NumTopics];
            for (int i = 0; i < NumTopics; i++)
            {
                PhiPrior.ObservedValue[i] = Dirichlet.Symmetric(SizeVocab, beta);
            }
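            // Dirichlet.Symmetric(n, b) has every pseudo-count equal to b: a flat prior
            // over the vocabulary whose strength grows with b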

            // Sentinel: forces the per-batch observed values to be set on the first batch
            NumDocuments.ObservedValue = -1;
            try
            {
                for (int pass = 0; pass < NumPasses; pass++)
                {
                    Engine.NumberOfIterations = IterationsPerPass[pass];
                    if (showProgress)
                    {
                        Console.Write(
                            "\nPass {0} ({1} iteration{2} per batch)",
                            pass, IterationsPerPass[pass], IterationsPerPass[pass] == 1 ? "" : "s");
                    }

                    PhiDefModel.InferShared(EnginePhiDef, 0);
                    for (int batch = 0; batch < NumBatches; batch++)
                    {
                        int startDoc = boundary[batch];
                        int endDoc   = boundary[batch + 1];
                        if (startDoc >= numDocs)
                        {
                            break;
                        }

                        int numDocsInThisBatch = endDoc - startDoc;

                        // Set up the observed values
                        if (NumDocuments.ObservedValue != numDocsInThisBatch)
                        {
                            NumDocuments.ObservedValue = numDocsInThisBatch;

                            ThetaPrior.ObservedValue = new Dirichlet[numDocsInThisBatch];
                            for (int i = 0; i < numDocsInThisBatch; i++)
                            {
                                ThetaPrior.ObservedValue[i] = Dirichlet.Symmetric(NumTopics, alpha);
                            }
                        }
                        if (pass == 0)
                        {
                            ThetaInit.ObservedValue = LDAModel.GetInitialisation(numDocsInThisBatch, NumTopics, ThetaSparsity);
                        }
                        else
                        {
                            ThetaInit.ObservedValue = Util.ArrayInit(numDocsInThisBatch, d => new Dirichlet(thetaPosterior[d + startDoc]));
                        }

                        // Flatten each document's word -> count dictionary into the
                        // parallel arrays that the model observes
                        int[]      numWordsInDocBatch   = new int[numDocsInThisBatch];
                        int[][]    wordsInDocBatch      = new int[numDocsInThisBatch][];
                        double[][] wordCountsInDocBatch = new double[numDocsInThisBatch][];
                        for (int i = 0, j = startDoc; j < endDoc; i++, j++)
                        {
                            numWordsInDocBatch[i] = wordsInDoc[j].Count;
                            wordsInDocBatch[i]    = wordsInDoc[j].Keys.ToArray();
                            ICollection<int> cnts = wordsInDoc[j].Values;
                            wordCountsInDocBatch[i] = new double[cnts.Count];
                            int k = 0;
                            foreach (int val in cnts)
                            {
                                wordCountsInDocBatch[i][k++] = (double)val;
                            }
                        }

                        NumWordsInDoc.ObservedValue = numWordsInDocBatch;
                        Words.ObservedValue         = wordsInDocBatch;
                        WordCounts.ObservedValue    = wordCountsInDocBatch;

                        DocModel.InferShared(Engine, batch);
                        var postThetaBatch = Engine.Infer<Dirichlet[]>(Theta);
                        for (int i = 0, j = startDoc; j < endDoc; i++, j++)
                        {
                            thetaPosterior[j] = postThetaBatch[i];
                        }

                        if (showProgress)
                        {
                            // Wrap the progress dots every 80 batches
                            if ((batch % 80) == 0)
                            {
                                Console.WriteLine();
                            }

                            Console.Write(".");
                        }
                    }
                }
            }
            finally
            {
                Engine.NumberOfIterations = numIters;
                Engine.ShowProgress       = showProgress;
            }

            if (showProgress)
            {
                Console.WriteLine();
            }

            postTheta = thetaPosterior;
            postPhi   = Phi.Marginal<Dirichlet[]>();

            return Model.GetEvidenceForAll(PhiDefModel, DocModel);
        }
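
A minimal sketch of driving this method on the shared (batched) model directly. It assumes sizeVocab, numTopics, and a per-document word-count array wordsInDoc are already in scope; the batch count, pass schedule, and hyper-parameters below are illustrative assumptions.

            // 10 batches; constructor argument order follows Example #1
            LDAShared model = new LDAShared(10, sizeVocab, numTopics);

            // Five passes of ten iterations each, as in Example #1
            model.IterationsPerPass = Enumerable.Repeat(10, 5).ToArray();

            Dirichlet[] postTheta, postPhi;
            double logEvidence = model.Infer(wordsInDoc, 1.0, 0.1, out postTheta, out postPhi);

            // postTheta holds one Dirichlet per document (over topics);
            // postPhi holds one Dirichlet per topic (over the vocabulary)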