/// <summary> /// Load individual parameter range files and generate List<LDAConfig> /// </summary> /// <param name="modelRepositoryPath">The top folder where you want to save all the models that will be learned</param> /// <param name="folderOfParamRangeFiles">The folder where the parameter range files are located</param> /// <param name="trainingSampleName">the name of the training sample</param> /// <param name="defaultModelConfig">the template of LDAConfig file</param> /// <param name="configFilesFolder">The folder where LDAConfig files (for training) will be saved</param> /// <param name="listOfLDAConfigFilesForFeaturization">absolute paths for LDAConfig files (for featurization)</param> /// <param name="listOfLDAConfigFilesForTest">absolute paths for LDAConfig files (for metrics computation)</param> /// <returns>absolute paths for LDAConfig files (for training)</returns> public static List <string> GenerateLDAConfigFiles(string modelRepositoryPath, string folderOfParamRangeFiles, string trainingSampleName, LDAConfig defaultModelConfig, string configFilesFolder, ref List <string> listOfLDAConfigFilesForFeaturization, ref List <string> listOfLDAConfigFilesForTest) { var alphaRange = new List <double>(); var rhoRange = new List <double>(); var numOfTopicsRange = new List <int>(); var minibatchRange = new List <int>(); var powerTRange = new List <double>(); var initialTRange = new List <double>(); var passesRange = new List <int>(); var minWordDocFreqRange = new List <int>(); var maxRelWordDocFreqRange = new List <float>(); // Load LDA parameters from a single file or individual files. int parameterIndex = -1; if (File.Exists(Path.Combine(folderOfParamRangeFiles, "LDAParameters.tsv"))) { foreach (var line in File.ReadLines(Path.Combine(folderOfParamRangeFiles, "LDAParameters.tsv"))) { string lineContent = line.Trim().ToLower(); if (PARAMETER_INDEX_DICTIONARY.ContainsKey(lineContent)) { parameterIndex = PARAMETER_INDEX_DICTIONARY[lineContent]; } else { switch (parameterIndex) { case 0: alphaRange = Helper.ParseListOfValues <double>(lineContent); break; case 1: rhoRange = Helper.ParseListOfValues <double>(lineContent); break; case 2: minWordDocFreqRange = Helper.ParseListOfValues <int>(lineContent); break; case 3: maxRelWordDocFreqRange = Helper.ParseListOfValues <float>(lineContent); break; case 4: numOfTopicsRange = Helper.ParseListOfValues <int>(lineContent); break; case 5: passesRange = Helper.ParseListOfValues <int>(lineContent); break; case 6: minibatchRange = Helper.ParseListOfValues <int>(lineContent); break; case 7: initialTRange = Helper.ParseListOfValues <double>(lineContent); break; case 8: powerTRange = Helper.ParseListOfValues <double>(lineContent); break; default: break; } parameterIndex = -1; } } } else { alphaRange = File.ReadLines(Path.Combine(folderOfParamRangeFiles, "Alpha.txt")).Select(Double.Parse).ToList(); rhoRange = File.ReadLines(Path.Combine(folderOfParamRangeFiles, "Rho.txt")).Select(Double.Parse).ToList(); numOfTopicsRange = File.ReadLines(Path.Combine(folderOfParamRangeFiles, "NumTopics.txt")).Select(int.Parse).ToList(); minibatchRange = File.ReadLines(Path.Combine(folderOfParamRangeFiles, "Minibatch.txt")).Select(int.Parse).ToList(); powerTRange = File.ReadLines(Path.Combine(folderOfParamRangeFiles, "PowerT.txt")).Select(Double.Parse).ToList(); initialTRange = File.ReadLines(Path.Combine(folderOfParamRangeFiles, "InitialT.txt")).Select(Double.Parse).ToList(); passesRange = File.ReadLines(Path.Combine(folderOfParamRangeFiles, "Passes.txt")).Select(int.Parse).ToList(); minWordDocFreqRange = File.ReadLines(Path.Combine(folderOfParamRangeFiles, "MinWordDocumentFrequency.txt")).Select(int.Parse).ToList(); maxRelWordDocFreqRange = File.ReadLines(Path.Combine(folderOfParamRangeFiles, "MaxRalativeWordDocumentFrequency.txt")).Select(Single.Parse).ToList(); } List <string> listOfLDAConfigFilesForTraining = new List <string>(); foreach (var min in minWordDocFreqRange) { foreach (var max in maxRelWordDocFreqRange) { var f = new FeaturizationParameters() { MinWordDocumentFrequency = min, MaxRalativeWordDocumentFrequency = max }; // a boolean flag ensuring each combination of min/max to be added into the list only once. bool minMaxAdded = false; foreach (var alpha in alphaRange) { foreach (var rho in rhoRange) { foreach (var numOfTopics in numOfTopicsRange) { foreach (var miniBatch in minibatchRange) { foreach (var powerT in powerTRange) { foreach (var initialT in initialTRange) { foreach (var passes in passesRange) { var p = new LDAParameters() { Alpha = alpha, Rho = rho, NumTopics = numOfTopics, Minibatch = miniBatch, PowerT = powerT, InitialT = initialT, Passes = passes }; var config = new LDAConfig() { LDAParameters = p, FeaturizationParameters = f, SampleName = trainingSampleName, ModelRepositoryPath = modelRepositoryPath, Locale = defaultModelConfig.Locale, Corpus = defaultModelConfig.Corpus, ModelStatistics = new ModelStatistics() }; UpdateConfigFileLists(ref listOfLDAConfigFilesForTraining, ref listOfLDAConfigFilesForTest, ref listOfLDAConfigFilesForFeaturization, configFilesFolder, config, ref minMaxAdded); } } } } } } } minMaxAdded = false; } // foreach max } return(listOfLDAConfigFilesForTraining); }
public static List <string> LoadLDAParameterTable(string modelRepositoryPath, string trainingSampleName, LDAConfig defaultModelConfig, string ldaParameterTablePath, string learningConfigFilesFolder, ref List <string> listOfLDAConfigFilesForFeaturization, ref List <string> listOfLDAConfigFilesForTest) { // Get all ldaconfig.json files. List <string> listOfLDAConfigFilesForTraining = new List <string>(); List <List <string> > lists = File.ReadLines(ldaParameterTablePath) .Skip(1) // skip column header .Select(line => line.Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries).ToList()) .ToList(); // Group all rows by <min, max>. var minMaxGroups = lists.GroupBy(list => new { min = Helper.GetValue <int>(list[0]), max = Helper.GetValue <float>(list[1]) }); foreach (var group in minMaxGroups) { var f = new FeaturizationParameters() { MinWordDocumentFrequency = group.Key.min, MaxRalativeWordDocumentFrequency = group.Key.max }; // a boolean flag ensuring each combination of min/max to be added into the list only once. bool minMaxAdded = false; foreach (var row in group) { // The default (LDA parameter) table has the following columns. // min max topicCount alpha rho miniBatch passes initialT powerT // E.g. \\ICE-Recommender\ModelRepository\TuneLDAParameters\RangesOfParams\LDAParameterTable.tsv int numOfTopics = Helper.GetValue <int>(row[2]); double alpha = Helper.GetValue <double>(row[3]); double rho = Helper.GetValue <double>(row[4]); int miniBatch = Helper.GetValue <int>(row[5]); int passes = Helper.GetValue <int>(row[6]); double initialT = Helper.GetValue <double>(row[7]); double powerT = Helper.GetValue <double>(row[8]); var p = new LDAParameters() { Alpha = alpha, Rho = rho, NumTopics = numOfTopics, Minibatch = miniBatch, PowerT = powerT, InitialT = initialT, Passes = passes }; var config = new LDAConfig() { LDAParameters = p, FeaturizationParameters = f, SampleName = trainingSampleName, ModelRepositoryPath = modelRepositoryPath, Locale = defaultModelConfig.Locale, Corpus = defaultModelConfig.Corpus, ModelStatistics = new ModelStatistics() }; UpdateConfigFileLists(ref listOfLDAConfigFilesForTraining, ref listOfLDAConfigFilesForTest, ref listOfLDAConfigFilesForFeaturization, learningConfigFilesFolder, config, ref minMaxAdded); } minMaxAdded = false; } return(listOfLDAConfigFilesForTraining); }