Пример #1
0
        /// <summary>
        /// Learns an LDA model by running vw.exe (looked up in the application base directory)
        /// with a command line assembled from <paramref name="ldaConfig"/>.
        /// </summary>
        /// <param name="ldaConfig">Supplies the input/output file paths, the LDA parameters and the model statistics that go into the command line.</param>
        /// <param name="copyFeaturizedDoc">
        /// When true, the featurized documents file is first copied into the directory of the
        /// word-topic allocations file, the copy is used as the input, and the copy together with
        /// its ".cache" companion is deleted once the command has finished.
        /// </param>
        /// <returns>false when the model file already exists and nothing was run; true otherwise.</returns>
        public static bool Learn(LDAConfig ldaConfig, bool copyFeaturizedDoc)
        {
            if (File.Exists(ldaConfig.Model))
            {
                StatusMessage.Write("Skipping, model already exists. " + ldaConfig.Model);
                return(false);
            }

            var featurizedDocFile = ldaConfig.FeaturizedDocuments;

            if (copyFeaturizedDoc)
            {
                // Path.Combine avoids producing a drive-root path ("\file") when the directory part is empty.
                var targetDirectory = Path.GetDirectoryName(ldaConfig.WordTopicAllocations) ?? string.Empty;
                featurizedDocFile = Path.Combine(targetDirectory, Path.GetFileName(ldaConfig.FeaturizedDocuments));

                // Deliberately outside the try/finally below: if the copy fails (for instance because the
                // destination already exists) no file was created by this call, so none must be deleted.
                File.Copy(ldaConfig.FeaturizedDocuments, featurizedDocFile);
            }

            StatusMessage.Write("Running VW to learn LDA...");

            var command = AppDomain.CurrentDomain.BaseDirectory + "vw.exe";

            // Floating-point values are rendered with the invariant culture so the decimal separator
            // is always '.', whatever the regional settings of the machine are.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            // NOTE(review): paths are passed unquoted, so a path containing spaces will be split into
            // several arguments — confirm how Console.RunCommand forwards the argument string before quoting.
            var args =
                " " + featurizedDocFile +
                " --hash strings" +
                " --lda " + ldaConfig.LDAParameters.NumTopics +
                " --lda_alpha " + Convert.ToString(ldaConfig.LDAParameters.Alpha, invariant) +
                " --lda_rho " + Convert.ToString(ldaConfig.LDAParameters.Rho, invariant) +
                " --lda_D " + ldaConfig.ModelStatistics.DocumentCount +
                " --minibatch " + ldaConfig.LDAParameters.Minibatch +
                " --power_t " + Convert.ToString(ldaConfig.LDAParameters.PowerT, invariant) +
                " --initial_t " + Convert.ToString(ldaConfig.LDAParameters.InitialT, invariant) +
                " -b " + (int)Math.Ceiling(Math.Log(ldaConfig.ModelStatistics.VocabularySize, 2.0)) + // Gets size of the hash table used to store the topic allocations for each word.
                " --passes " + ldaConfig.LDAParameters.Passes +
                " -c " +
                " --readable_model " + ldaConfig.WordTopicAllocations +
                " -p " + ldaConfig.DocumentTopicAllocations +
                " -f " + ldaConfig.Model;

            try
            {
                Console.RunCommand(command, args);
            }
            finally
            {
                // Remove the temporary copy even when the command throws, so that a later run
                // does not fail in File.Copy because of a stale leftover.
                if (copyFeaturizedDoc)
                {
                    ConsoleColor color;
                    FileManager.DeleteFile(featurizedDocFile, out color);
                    FileManager.DeleteFile(featurizedDocFile + ".cache", out color);
                }
            }

            return(true);
        }
Пример #2
0
        /// <summary>
        /// Loads every existing LDA config file (*LDAConfig.json) found under a source folder,
        /// re-targets it at the given model repository / training sample, writes the resulting
        /// config into the learning config folder and registers it in the caller's lists.
        /// </summary>
        /// <param name="modelRepositoryPath">Value stored in ModelRepositoryPath of every generated config.</param>
        /// <param name="trainingSampleName">Value stored in SampleName of every generated config.</param>
        /// <param name="defaultModelConfig">Template config; its Locale and Corpus are copied into every generated config.</param>
        /// <param name="SourceFolderOfLDAConfigFiles">Folder that is searched recursively for "*LDAConfig.json" files.</param>
        /// <param name="learningConfigFilesFolder">Folder the generated config files are written to.</param>
        /// <param name="listOfLDAConfigFilesForFeaturization">Receives one config file per distinct (min, max) word document frequency combination.</param>
        /// <param name="listOfLDAConfigFilesForTest">Receives the LDAConfigFile path of every generated config.</param>
        /// <returns>Paths of all config files written for training.</returns>
        public static List <string> LoadLDAConfigFiles(string modelRepositoryPath,
                                                       string trainingSampleName,
                                                       LDAConfig defaultModelConfig,
                                                       string SourceFolderOfLDAConfigFiles,
                                                       string learningConfigFilesFolder,
                                                       ref List <string> listOfLDAConfigFilesForFeaturization,
                                                       ref List <string> listOfLDAConfigFilesForTest)
        {
            List <string> listOfLDAConfigFilesForTraining = new List <string>();

            // (min, max) combinations that already have a featurization entry.
            // Anonymous types compare by value, so they can serve directly as set keys.
            var seenMinMaxCombinations = new HashSet <object>();

            // Get all ldaconfig.json files.
            var ldaConfigFiles = Directory.GetFiles(SourceFolderOfLDAConfigFiles, "*LDAConfig.json", SearchOption.AllDirectories);

            foreach (string ldaConfigFile in ldaConfigFiles)
            {
                // Load ldaconfig
                var config = Program.LoadLDAConfig(ldaConfigFile);

                var newConfig = new LDAConfig()
                {
                    LDAParameters           = config.LDAParameters,
                    FeaturizationParameters = config.FeaturizationParameters,
                    SampleName          = trainingSampleName,
                    ModelRepositoryPath = modelRepositoryPath,
                    Locale          = defaultModelConfig.Locale,
                    Corpus          = defaultModelConfig.Corpus,
                    ModelStatistics = new ModelStatistics()
                };

                // Only the first config of each (min, max) combination goes into the featurization list.
                // Previously every config was added here and the per-combination representatives were
                // appended a second time afterwards, which left duplicates in that list and required
                // reloading every config from disk twice just to compute the grouping key.
                bool minMaxAdded = !seenMinMaxCombinations.Add(new
                {
                    newConfig.FeaturizationParameters.MinWordDocumentFrequency,
                    newConfig.FeaturizationParameters.MaxRalativeWordDocumentFrequency
                });

                UpdateConfigFileLists(ref listOfLDAConfigFilesForTraining, ref listOfLDAConfigFilesForTest, ref listOfLDAConfigFilesForFeaturization, learningConfigFilesFolder, newConfig, ref minMaxAdded);
            }

            return(listOfLDAConfigFilesForTraining);
        }
Пример #3
0
        /// <summary>
        /// Load parameter ranges (from a single LDAParameters.tsv or from individual per-parameter files)
        /// and generate one LDAConfig file for every combination of the parameter values.
        /// </summary>
        /// <param name="modelRepositoryPath">The top folder where you want to save all the models that will be learned</param>
        /// <param name="folderOfParamRangeFiles">The folder where the parameter range files are located</param>
        /// <param name="trainingSampleName">the name of the training sample</param>
        /// <param name="defaultModelConfig">the template of LDAConfig file</param>
        /// <param name="configFilesFolder">The folder where LDAConfig files (for training) will be saved</param>
        /// <param name="listOfLDAConfigFilesForFeaturization">absolute paths for LDAConfig files (for featurization)</param>
        /// <param name="listOfLDAConfigFilesForTest">absolute paths for LDAConfig files (for metrics computation)</param>
        /// <returns>absolute paths for LDAConfig files (for training)</returns>
        public static List <string> GenerateLDAConfigFiles(string modelRepositoryPath,
                                                           string folderOfParamRangeFiles,
                                                           string trainingSampleName,
                                                           LDAConfig defaultModelConfig,
                                                           string configFilesFolder,
                                                           ref List <string> listOfLDAConfigFilesForFeaturization,
                                                           ref List <string> listOfLDAConfigFilesForTest)
        {
            var alphaRange             = new List <double>();
            var rhoRange               = new List <double>();
            var numOfTopicsRange       = new List <int>();
            var minibatchRange         = new List <int>();
            var powerTRange            = new List <double>();
            var initialTRange          = new List <double>();
            var passesRange            = new List <int>();
            var minWordDocFreqRange    = new List <int>();
            var maxRelWordDocFreqRange = new List <float>();

            // Load LDA parameters from a single file or individual files.
            string combinedParameterFile = Path.Combine(folderOfParamRangeFiles, "LDAParameters.tsv");

            if (File.Exists(combinedParameterFile))
            {
                // Index of the parameter announced by the most recent header line; -1 when none is pending.
                int parameterIndex = -1;

                foreach (var line in File.ReadLines(combinedParameterFile))
                {
                    // Invariant lower-casing keeps the header lookup independent of the current culture
                    // (a culture-sensitive ToLower maps 'I' to a dotless 'ı' under Turkish settings).
                    string lineContent = line.Trim().ToLowerInvariant();

                    int headerIndex;
                    if (PARAMETER_INDEX_DICTIONARY.TryGetValue(lineContent, out headerIndex))
                    {
                        parameterIndex = headerIndex;
                        continue;
                    }

                    switch (parameterIndex)
                    {
                    case 0:
                        alphaRange = Helper.ParseListOfValues <double>(lineContent);
                        break;

                    case 1:
                        rhoRange = Helper.ParseListOfValues <double>(lineContent);
                        break;

                    case 2:
                        minWordDocFreqRange = Helper.ParseListOfValues <int>(lineContent);
                        break;

                    case 3:
                        maxRelWordDocFreqRange = Helper.ParseListOfValues <float>(lineContent);
                        break;

                    case 4:
                        numOfTopicsRange = Helper.ParseListOfValues <int>(lineContent);
                        break;

                    case 5:
                        passesRange = Helper.ParseListOfValues <int>(lineContent);
                        break;

                    case 6:
                        minibatchRange = Helper.ParseListOfValues <int>(lineContent);
                        break;

                    case 7:
                        initialTRange = Helper.ParseListOfValues <double>(lineContent);
                        break;

                    case 8:
                        powerTRange = Helper.ParseListOfValues <double>(lineContent);
                        break;

                    default:
                        break;
                    }

                    // Only the single line that follows a header is treated as that parameter's values.
                    parameterIndex = -1;
                }
            }
            else
            {
                // The range files are machine-readable, so numbers are parsed with the invariant culture
                // ('.' as decimal separator) instead of whatever culture the process happens to run under.
                var invariant = System.Globalization.CultureInfo.InvariantCulture;
                Func <string, double> parseDouble = text => double.Parse(text, invariant);
                Func <string, int>    parseInt    = text => int.Parse(text, invariant);
                Func <string, float>  parseFloat  = text => float.Parse(text, invariant);

                alphaRange       = ReadParameterRange <double>(folderOfParamRangeFiles, "Alpha.txt", parseDouble);
                rhoRange         = ReadParameterRange <double>(folderOfParamRangeFiles, "Rho.txt", parseDouble);
                numOfTopicsRange = ReadParameterRange <int>(folderOfParamRangeFiles, "NumTopics.txt", parseInt);
                minibatchRange   = ReadParameterRange <int>(folderOfParamRangeFiles, "Minibatch.txt", parseInt);
                powerTRange      = ReadParameterRange <double>(folderOfParamRangeFiles, "PowerT.txt", parseDouble);
                initialTRange    = ReadParameterRange <double>(folderOfParamRangeFiles, "InitialT.txt", parseDouble);
                passesRange      = ReadParameterRange <int>(folderOfParamRangeFiles, "Passes.txt", parseInt);

                minWordDocFreqRange    = ReadParameterRange <int>(folderOfParamRangeFiles, "MinWordDocumentFrequency.txt", parseInt);
                maxRelWordDocFreqRange = ReadParameterRange <float>(folderOfParamRangeFiles, "MaxRalativeWordDocumentFrequency.txt", parseFloat);
            }

            List <string> listOfLDAConfigFilesForTraining = new List <string>();

            foreach (var min in minWordDocFreqRange)
            {
                foreach (var max in maxRelWordDocFreqRange)
                {
                    var f = new FeaturizationParameters()
                    {
                        MinWordDocumentFrequency         = min,
                        MaxRalativeWordDocumentFrequency = max
                    };

                    // a boolean flag ensuring each combination of min/max to be added into the list only once.
                    // It is re-created for every (min, max) pair, so no explicit reset is needed afterwards.
                    bool minMaxAdded = false;

                    foreach (var alpha in alphaRange)
                    {
                        foreach (var rho in rhoRange)
                        {
                            foreach (var numOfTopics in numOfTopicsRange)
                            {
                                foreach (var miniBatch in minibatchRange)
                                {
                                    foreach (var powerT in powerTRange)
                                    {
                                        foreach (var initialT in initialTRange)
                                        {
                                            foreach (var passes in passesRange)
                                            {
                                                var p = new LDAParameters()
                                                {
                                                    Alpha     = alpha,
                                                    Rho       = rho,
                                                    NumTopics = numOfTopics,
                                                    Minibatch = miniBatch,
                                                    PowerT    = powerT,
                                                    InitialT  = initialT,
                                                    Passes    = passes
                                                };

                                                var config = new LDAConfig()
                                                {
                                                    LDAParameters           = p,
                                                    FeaturizationParameters = f,
                                                    SampleName          = trainingSampleName,
                                                    ModelRepositoryPath = modelRepositoryPath,
                                                    Locale          = defaultModelConfig.Locale,
                                                    Corpus          = defaultModelConfig.Corpus,
                                                    ModelStatistics = new ModelStatistics()
                                                };

                                                UpdateConfigFileLists(ref listOfLDAConfigFilesForTraining, ref listOfLDAConfigFilesForTest, ref listOfLDAConfigFilesForFeaturization, configFilesFolder, config, ref minMaxAdded);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }  // foreach max
            }

            return(listOfLDAConfigFilesForTraining);
        }

        /// <summary>
        /// Reads one value per line from a parameter range file, ignoring blank lines.
        /// </summary>
        /// <param name="folder">Folder that contains the range file.</param>
        /// <param name="fileName">Name of the range file inside <paramref name="folder"/>.</param>
        /// <param name="parse">Converts one line of text into a value.</param>
        /// <returns>The parsed values in file order.</returns>
        private static List <T> ReadParameterRange <T>(string folder, string fileName, Func <string, T> parse)
        {
            return(File.ReadLines(Path.Combine(folder, fileName))
                   .Where(text => !string.IsNullOrWhiteSpace(text))
                   .Select(parse)
                   .ToList());
        }
Пример #4
0
        /// <summary>
        /// Writes the given LDAConfig as JSON into the learning config folder and registers it:
        /// the written path always goes into the training list, config.LDAConfigFile always goes
        /// into the test list, and the written path goes into the featurization list only while
        /// <paramref name="minMaxAdded"/> is still false (the flag is then set to true).
        /// </summary>
        /// <param name="listOfLDAConfigFilesForTraining">Receives the path of the written config file.</param>
        /// <param name="listOfLDAConfigFilesForTest">Receives config.LDAConfigFile.</param>
        /// <param name="listOfLDAConfigFilesForFeaturization">Receives the path of the written config file when the flag is false.</param>
        /// <param name="learningConfigFilesFolder">Folder the config file is written to.</param>
        /// <param name="config">The config to serialize; its modelName determines the file name.</param>
        /// <param name="minMaxAdded">In: whether a featurization entry already exists. Out: always true.</param>
        private static void UpdateConfigFileLists(ref List <string> listOfLDAConfigFilesForTraining, ref List <string> listOfLDAConfigFilesForTest, ref List <string> listOfLDAConfigFilesForFeaturization, string learningConfigFilesFolder, LDAConfig config, ref bool minMaxAdded)
        {
            string serializedConfig = JsonConvert.SerializeObject(config);
            string targetPath       = string.Format(@"{0}\{1}.LDAConfig.json", learningConfigFilesFolder, config.modelName);

            // Persist first: the lists are only touched once the file is actually on disk.
            File.WriteAllText(targetPath, serializedConfig);

            listOfLDAConfigFilesForTraining.Add(targetPath);
            listOfLDAConfigFilesForTest.Add(config.LDAConfigFile);

            if (minMaxAdded)
            {
                // A featurization entry was already registered through this flag.
                return;
            }

            listOfLDAConfigFilesForFeaturization.Add(targetPath);
            minMaxAdded = true;
        }
Пример #5
0
        /// <summary>
        /// Reads a tab-separated LDA parameter table (one parameter combination per row, first row is
        /// the column header) and generates one LDAConfig file per row.
        /// </summary>
        /// <param name="modelRepositoryPath">Value stored in ModelRepositoryPath of every generated config.</param>
        /// <param name="trainingSampleName">Value stored in SampleName of every generated config.</param>
        /// <param name="defaultModelConfig">Template config; its Locale and Corpus are copied into every generated config.</param>
        /// <param name="ldaParameterTablePath">Path of the parameter table file.</param>
        /// <param name="learningConfigFilesFolder">Folder the generated config files are written to.</param>
        /// <param name="listOfLDAConfigFilesForFeaturization">Receives one config file per distinct (min, max) combination.</param>
        /// <param name="listOfLDAConfigFilesForTest">Receives the LDAConfigFile path of every generated config.</param>
        /// <returns>Paths of all config files written for training.</returns>
        /// <exception cref="InvalidDataException">A non-blank row has fewer than the nine expected columns.</exception>
        public static List <string> LoadLDAParameterTable(string modelRepositoryPath,
                                                          string trainingSampleName,
                                                          LDAConfig defaultModelConfig,
                                                          string ldaParameterTablePath,
                                                          string learningConfigFilesFolder,
                                                          ref List <string> listOfLDAConfigFilesForFeaturization,
                                                          ref List <string> listOfLDAConfigFilesForTest)
        {
            // The default (LDA parameter) table has the following columns.
            // min	max	topicCount	alpha	rho	miniBatch	passes	initialT	powerT
            // E.g. \\ICE-Recommender\ModelRepository\TuneLDAParameters\RangesOfParams\LDAParameterTable.tsv
            const int expectedColumnCount = 9;

            List <string> listOfLDAConfigFilesForTraining = new List <string>();

            // Read and validate the whole table before any config file is written.
            var rows = new List <string[]>();

            foreach (var line in File.ReadLines(ldaParameterTablePath).Skip(1)) // skip column header
            {
                // Blank lines (typically a trailing one) used to surface as an opaque
                // ArgumentOutOfRangeException from the row indexer; they carry no data, so skip them.
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                var cells = line.Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);

                if (cells.Length < expectedColumnCount)
                {
                    throw new InvalidDataException(
                              "Expected at least " + expectedColumnCount + " tab-separated columns but found " + cells.Length +
                              " in '" + ldaParameterTablePath + "', line: " + line);
                }

                rows.Add(cells);
            }

            // Group all rows by <min, max>.
            var minMaxGroups = rows.GroupBy(row => new
            {
                min = Helper.GetValue <int>(row[0]),
                max = Helper.GetValue <float>(row[1])
            });

            foreach (var group in minMaxGroups)
            {
                var f = new FeaturizationParameters()
                {
                    MinWordDocumentFrequency         = group.Key.min,
                    MaxRalativeWordDocumentFrequency = group.Key.max
                };

                // a boolean flag ensuring each combination of min/max to be added into the list only once.
                // It is re-created for every group, so no explicit reset is needed afterwards.
                bool minMaxAdded = false;

                foreach (var row in group)
                {
                    int    numOfTopics = Helper.GetValue <int>(row[2]);
                    double alpha       = Helper.GetValue <double>(row[3]);
                    double rho         = Helper.GetValue <double>(row[4]);
                    int    miniBatch   = Helper.GetValue <int>(row[5]);
                    int    passes      = Helper.GetValue <int>(row[6]);
                    double initialT    = Helper.GetValue <double>(row[7]);
                    double powerT      = Helper.GetValue <double>(row[8]);

                    var p = new LDAParameters()
                    {
                        Alpha     = alpha,
                        Rho       = rho,
                        NumTopics = numOfTopics,
                        Minibatch = miniBatch,
                        PowerT    = powerT,
                        InitialT  = initialT,
                        Passes    = passes
                    };

                    var config = new LDAConfig()
                    {
                        LDAParameters           = p,
                        FeaturizationParameters = f,
                        SampleName          = trainingSampleName,
                        ModelRepositoryPath = modelRepositoryPath,
                        Locale          = defaultModelConfig.Locale,
                        Corpus          = defaultModelConfig.Corpus,
                        ModelStatistics = new ModelStatistics()
                    };

                    UpdateConfigFileLists(ref listOfLDAConfigFilesForTraining, ref listOfLDAConfigFilesForTest, ref listOfLDAConfigFilesForFeaturization, learningConfigFilesFolder, config, ref minMaxAdded);
                }
            }

            return(listOfLDAConfigFilesForTraining);
        }
Пример #6
0
        /// <summary>
        /// Prepares this instance from a model config: loads the corpus vocabulary, copies the
        /// model dimensions, and then loads each derived data file, building it when loading fails.
        /// </summary>
        /// <param name="modelConfig">model config providing file locations, LDA parameters and model statistics</param>
        /// <returns>true when every step succeeded; false as soon as one step fails</returns>
        public bool Initialize(LDAConfig modelConfig)
        {
            // Step 1: corpus vocabulary. Nothing else is attempted when it cannot be loaded.
            StatusMessage.Write("Loading corpus vocabulary file...");
            this.corpusVocabulary = CorpusVocabulary.Load(modelConfig.CorpusVocabulary);
            if (this.corpusVocabulary == null)
            {
                return(false);
            }

            // Step 2: copy the model dimensions out of the config.
            var ldaParameters = modelConfig.LDAParameters;
            var statistics    = modelConfig.ModelStatistics;

            this.NumTopics = ldaParameters.NumTopics;
            this.NumWords  = statistics.VocabularySize;
            this.documentTopicAllocationsFile = modelConfig.DocumentTopicAllocations;
            this.numDocs   = statistics.DocumentCount;
            this.numPasses = ldaParameters.Passes;

            // Per-topic metric table (Coherence, Specificity, Distinctiveness).
            // NOTE(review): sizeof(MetricType) is the size of the type in bytes, not the number of
            // metric kinds — confirm the second dimension is what the indexing code expects.
            this.topicMetrics = new double[this.NumTopics, sizeof(MetricType)];

            // The configured vocabulary size has to agree with the vocabulary that was loaded.
            bool vocabularySizeMatches = this.NumWords == this.corpusVocabulary.Count();
            if (!vocabularySizeMatches)
            {
                StatusMessage.Write("Number of vocabularies mismatch. Check your parameters.");
                return(false);
            }

            // Step 3: topic/word allocations — try the bin file, otherwise generate it.
            this.wordTopicAllocationsFileName = modelConfig.WordTopicAllocations;
            string allocationsBinFile = this.wordTopicAllocationsFileName + BinFileExt;

            StatusMessage.Write("Loading topic words allocation bin file...");
            bool topicWordsReady = this.LoadTopicWords(allocationsBinFile);
            if (!topicWordsReady)
            {
                StatusMessage.Write("Generating topic words allocation bin file...");
                topicWordsReady = this.BuildTopicWordsAllocation(this.wordTopicAllocationsFileName, allocationsBinFile);
            }
            if (!topicWordsReady)
            {
                return(false);
            }

            // Step 4: word -> documents list map, stored next to the featurized documents.
            string docsListMapFile = Path.GetDirectoryName(modelConfig.FeaturizedDocuments) + @"\" + WordDocsListMapFileName;

            StatusMessage.Write("Loading word documents list bin file...");
            bool wordDocsReady = this.LoadWordDocsListMap(docsListMapFile);
            if (!wordDocsReady)
            {
                StatusMessage.Write("Generating word documents list bin file...");
                wordDocsReady = this.BuildWordDocsListMap(modelConfig.FeaturizedDocuments, docsListMapFile);
            }
            if (!wordDocsReady)
            {
                return(false);
            }

            // Step 5: topics info, stored next to the document-topic allocations; built only when loading fails.
            var topicsInfoPath = Path.GetDirectoryName(modelConfig.DocumentTopicAllocations) + @"\" + TopicsInfoFileName;

            return(this.LoadTopicsInfo(topicsInfoPath) || this.BuildTopicsInfo(topicsInfoPath));
        }