Esempio n. 1
0
        /// <summary>
        /// Makes sure the corpus vocabulary field is populated, creating it from the
        /// lines of the corpus vocabulary file the first time it is needed.
        /// </summary>
        /// <returns>
        /// true when the vocabulary field was already set, or when the vocabulary file
        /// exists and its lines were handed to CorpusVocabulary.NewInstance;
        /// false when the vocabulary file does not exist.
        /// </returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        private bool EnsureCorpusVocabulary()
        {
            if (this.disposed)
            {
                // The single-argument constructor interprets its argument as the object
                // name, so pass the name and the message separately.
                throw new ObjectDisposedException(this.GetType().Name, "LDA Object has already been disposed.");
            }

            // Already available: nothing to do.
            if (this.corpusVocabulary != null)
            {
                return(true);
            }

            if (!File.Exists(this.corpusVocabularyFileName))
            {
                StatusMessage.Write(string.Format("LDA.GetTopicAllocations: Error. Cannot find Corpus Vocabulary {0}", this.corpusVocabularyFileName));
                return(false);
            }

            StatusMessage.Write("LDA.GetTopicAllocations: Loading Corpus Vocabulary");
            this.corpusVocabulary = CorpusVocabulary.NewInstance(File.ReadLines(this.corpusVocabularyFileName));
            return(true);
        }
Esempio n. 2
0
        /// <summary>
        /// Initializes the module from the model configuration. Loads the corpus
        /// vocabulary, copies the model parameters, and then, for each derived data file
        /// (topic words allocation, word documents list map, topics info), tries to load
        /// it and builds it when loading fails.
        /// </summary>
        /// <param name="modelConfig">model config file</param>
        /// <returns>
        /// true if successful; false when the vocabulary cannot be loaded, when the
        /// configured vocabulary size differs from the loaded vocabulary, or when a
        /// derived data file can neither be loaded nor built.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="modelConfig"/> is null.</exception>
        public bool Initialize(LDAConfig modelConfig)
        {
            if (modelConfig == null)
            {
                throw new ArgumentNullException("modelConfig");
            }

            // initialize vocabularies
            StatusMessage.Write("Loading corpus vocabulary file...");

            this.corpusVocabulary = CorpusVocabulary.Load(modelConfig.CorpusVocabulary);
            if (this.corpusVocabulary == null)
            {
                return(false);
            }

            this.NumTopics = modelConfig.LDAParameters.NumTopics;
            this.NumWords  = modelConfig.ModelStatistics.VocabularySize;
            this.documentTopicAllocationsFile = modelConfig.DocumentTopicAllocations;
            this.numDocs   = modelConfig.ModelStatistics.DocumentCount;
            this.numPasses = modelConfig.LDAParameters.Passes;

            // Initialize array of topic metrics:  Coherence, Specificity, Distinctiveness
            // NOTE(review): sizeof(MetricType) is the size of the type in bytes, not the
            // number of metrics. Presumably MetricType is an enum, so the second dimension
            // may be larger than the number of metrics listed above. Left unchanged because
            // the enum's values are not visible here; confirm before resizing.
            this.topicMetrics = new double[this.NumTopics, sizeof(MetricType)];

            // sanity check
            if (this.NumWords != this.corpusVocabulary.Count())
            {
                StatusMessage.Write("Number of vocabularies mismatch. Check your parameters.");
                return(false);
            }

            // initialize topic words allocation
            this.wordTopicAllocationsFileName = modelConfig.WordTopicAllocations;
            string topicWordsAllocBinFile = this.wordTopicAllocationsFileName + BinFileExt;

            StatusMessage.Write("Loading topic words allocation bin file...");
            if (!this.LoadTopicWords(topicWordsAllocBinFile))
            {
                StatusMessage.Write("Generating topic words allocation bin file...");
                if (!this.BuildTopicWordsAllocation(this.wordTopicAllocationsFileName, topicWordsAllocBinFile))
                {
                    return(false);
                }
            }

            // initialize words documents list map
            // Path.Combine avoids the hard-coded '\' separator; the empty-string fallback
            // keeps a path without a directory component relative instead of rooting it.
            string featurizedDocsDir = Path.GetDirectoryName(modelConfig.FeaturizedDocuments) ?? string.Empty;
            string wordDocsListMapFile = Path.Combine(featurizedDocsDir, WordDocsListMapFileName);

            StatusMessage.Write("Loading word documents list bin file...");
            if (!this.LoadWordDocsListMap(wordDocsListMapFile))
            {
                StatusMessage.Write("Generating word documents list bin file...");
                if (!this.BuildWordDocsListMap(modelConfig.FeaturizedDocuments, wordDocsListMapFile))
                {
                    return(false);
                }
            }

            // init topics info
            string topicAllocationsDir = Path.GetDirectoryName(modelConfig.DocumentTopicAllocations) ?? string.Empty;
            string topicsInfoFile = Path.Combine(topicAllocationsDir, TopicsInfoFileName);

            if (!this.LoadTopicsInfo(topicsInfoFile))
            {
                if (!this.BuildTopicsInfo(topicsInfoFile))
                {
                    return(false);
                }
            }

            return(true);
        }