public void DistributedBuild(Metrics metrics, DataFeatureSampleRate dataFeatureSampleRate,
                             int maxTreeSize, int minNumSamples, int numIter, int cThreads, Random r)
{
    this.regressionTrees = new RegressionTree[numIter, this.boostTreeLoss.NumTreesPerIteration];
    this.tempSpace = new TempSpace(this.numSamples);
    DistributedBuildBoostTree(metrics, this.boostTreeLoss, dataFeatureSampleRate,
                              maxTreeSize, minNumSamples, numIter, cThreads, r);
    SaveBoostTree();
}
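// Illustrative sketch only: DistributedBuild allocates one RegressionTree per
// (iteration, tree-index) pair, where the multi-class loss uses one tree per class per
// iteration. The method below shows how such a [numIter, numClasses] matrix is consumed
// at prediction time in an additive ensemble; it is not part of this codebase, and the
// evalTree delegate is a hypothetical stand-in for RegressionTree's own prediction path.
private static float[] EnsembleScoreSketch(RegressionTree[,] trees, Func<RegressionTree, float> evalTree)
{
    int numIter = trees.GetLength(0);
    int numClasses = trees.GetLength(1);
    float[] score = new float[numClasses];
    for (int m = 0; m < numIter; m++)
        for (int k = 0; k < numClasses; k++)
            score[k] += evalTree(trees[m, k]); // additive ensemble: scores accumulate across iterations
    return score;
}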
/// <summary>
/// This method implements the main functionality of stochastic gradient boosting.
/// </summary>
private void BuildBoostTree(Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
                            int maxTreeSize, int minNumSamples, int numIter, int cThreads, Random r)
{
    float minValidationErr = 100;
    float[] funValueGain = new float[this.numSamples];

    //(1) compute the scores produced by the sub-model
    boostTreeLoss.ModelEval(this.subModel, this.labelFeatureDataCoded, this.subModelScore);

    //(2) compute the corresponding function values
    boostTreeLoss.ModelScoresToFuncValues();

    //(3) compute the metrics of the sub-model
    int m = optIter = 0;
    metrics.ComputeMetrics(boostTreeLoss.ModelScores, m, false);
#if VERBOSE
    Console.WriteLine(metrics.ResultsHeaderStr());
    Console.WriteLine(metrics.ResultsStr(m));
#endif

    //(4) create samplers to sub-sample the features and data during node splitting
    RandomSampler featureSampler = new RandomSampler(r);
    RandomSampler dataSampler = new RandomSampler(r);

    //(5) create the object that does node splitting
#if SINGLE_THREAD
    // single-threaded
    this.findSplit = new FindSplitSync();
#else
    // multi-threaded
    this.findSplit = new FindSplitAsync(cThreads);
#endif //SINGLE_THREAD

    //(6) iteratively build the boosted trees
    for (m = 0; m < numIter; m++)
    {
        // select a fraction of the data groups for this iteration
        float sampleRate = dataFeatureSampleRate.SampleDataGroupRate(m);
        DataSet workDataSet = this.labelFeatureDataCoded.DataGroups.GetDataPartition(DataPartitionType.Train, sampleRate, r);
        workDataSet.Sort(); // sorting gains some noticeable speedup

        // compute the pseudo-response of the current system
        boostTreeLoss.ComputePseudoResponse(workDataSet);

        // set the data and feature sampling rates for node splitting in this iteration
        featureSampler.SampleRate = dataFeatureSampleRate.SampleFeatureRate(m);
        dataSampler.SampleRate = dataFeatureSampleRate.SampleDataRate(m);

        // fit a residual model (regression trees) to the pseudo-response
        // to compensate for the error of the current system
        for (int k = 0; k < boostTreeLoss.NumTreesPerIteration; k++)
        {
            // only use the important data points if necessary
            int[] trimIndex = boostTreeLoss.TrimIndex(workDataSet, k, m);

            // build a regression tree that fits the pseudo-response
            this.regressionTrees[m, k] = new RegressionTree(this.labelFeatureDataCoded, boostTreeLoss, k, trimIndex,
                                                            dataSampler, featureSampler, maxTreeSize, minNumSamples,
                                                            this.findSplit, this.tempSpace);

            // compute the function values produced by the newly generated regression tree for all data points
            this.regressionTrees[m, k].PredictFunValue(this.labelFeatureDataCoded, ref funValueGain);

            // try a more global optimization - refine the leaf-node responses of the decision tree
            // by looking at all the training data points, instead of only the ones falling into each region;
            // here we estimate and apply a single global multiplicative factor to all leaf nodes
            float adjFactor = (m > 0) ? boostTreeLoss.ComputeResponseAdjust(funValueGain) : 1.0F;

            // apply the multiplicative factor to the leaf nodes of the newly generated regression tree
            this.regressionTrees[m, k].AdjustResponse(adjFactor);

            // update the function values of all data points given the new regression tree
            boostTreeLoss.AccFuncValueGain(funValueGain, adjFactor, k);
        }

        // compute the metrics of the current system
        boostTreeLoss.FuncValuesToModelScores();
        metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, false);
#if VERBOSE
        Console.WriteLine(metrics.ResultsStr(m + 1));
#endif
        // keep track of the best (minimal-error) iteration on the validation data set
        this.optIter = metrics.GetBest(DataPartitionType.Validation, ref minValidationErr);

        if ((m + 1) % 5 == 0) // save the tree every 5 iterations
            SaveBoostTree();
    }

    if (this.findSplit != null)
    {
        this.findSplit.Cleanup();
    }
}
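// The loop above is standard stochastic gradient boosting: each iteration fits trees to the
// pseudo-response (the negative gradient of the loss with respect to the current function
// values) and accumulates the trees' adjusted outputs back into those function values.
// The sketch below illustrates one such step for plain squared loss. It is a self-contained
// illustration under that assumed loss, not the BoostTreeLoss/RegressionTree path used here.
private static void SquaredLossBoostingStepSketch(float[] labels, float[] funcValues, float learnRate)
{
    int n = funcValues.Length;

    // (a) pseudo-response: the negative gradient of the loss at the current function values;
    //     for squared loss L = (y - F)^2 / 2 this is simply the residual y - F
    float[] pseudoResponse = new float[n];
    for (int i = 0; i < n; i++)
        pseudoResponse[i] = labels[i] - funcValues[i];

    // (b) a regression tree would be fit to pseudoResponse here; as a stand-in, use the
    //     pseudo-response itself as the tree's prediction (a perfect, unregularized fit)
    float[] treePrediction = pseudoResponse;

    // (c) accumulate the shrunken tree output into the function values,
    //     analogous to what AccFuncValueGain does above
    for (int i = 0; i < n; i++)
        funcValues[i] += learnRate * treePrediction[i];
}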
public void AddWeakLearner(RegressionTree[] candidateTree, float[] funValueGain, int m, Metrics metrics,
                           BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
                           int maxTreeSize, int minNumSamples, int cThreads, Random r)
{
    // accept the candidate trees for iteration m and update the function values of all
    // data points, reusing the adjustment factor stored when each candidate was built
    for (int i = 0; i < boostTreeLoss.NumTreesPerIteration; i++)
    {
        candidateTree[i].PredictFunValue(this.labelFeatureDataCoded, true, ref funValueGain);
        this.regressionTrees[m, i] = candidateTree[i];
        boostTreeLoss.AccFuncValueGain(funValueGain, candidateTree[i].AdjustFactor, i);
    }
}
public RegressionTree[] GetNextWeakLearner(int m, float[] funValueGain, Metrics metrics, BoostTreeLoss boostTreeLoss,
                                           DataFeatureSampleRate dataFeatureSampleRate, RandomSampler dataSampler,
                                           RandomSampler featureSampler, int maxTreeSize, int minNumSamples,
                                           int cThreads, Random r)
{
    // select a fraction of the data groups for this iteration
    float sampleRate = dataFeatureSampleRate.SampleDataGroupRate(m);
    DataSet workDataSet = this.labelFeatureDataCoded.DataGroups.GetDataPartition(DataPartitionType.Train, sampleRate, r);
    workDataSet.Sort(); // sorting gains some noticeable speedup

    // compute the pseudo-response of the current system
    boostTreeLoss.ComputePseudoResponse(workDataSet);

    // set the data and feature sampling rates for node splitting in this iteration
    featureSampler.SampleRate = dataFeatureSampleRate.SampleFeatureRate(m);
    dataSampler.SampleRate = dataFeatureSampleRate.SampleDataRate(m);

    // fit a residual model (regression trees) to the pseudo-response
    // to compensate for the error of the current system
    RegressionTree[] newTree = new RegressionTree[boostTreeLoss.NumTreesPerIteration];
    for (int k = 0; k < boostTreeLoss.NumTreesPerIteration; k++)
    {
        // only use the important data points if necessary
        int[] trimIndex = boostTreeLoss.TrimIndex(workDataSet, k, m);

        // build a regression tree that fits the pseudo-response
        newTree[k] = new RegressionTree(this.labelFeatureDataCoded, boostTreeLoss, k, trimIndex,
                                        dataSampler, featureSampler, maxTreeSize, minNumSamples,
                                        this.findSplit, this.tempSpace);

        // compute the function values produced by the newly generated regression tree for all data points
        newTree[k].PredictFunValue(this.labelFeatureDataCoded, ref funValueGain);

        // try a more global optimization - refine the leaf-node responses of the decision tree
        // by looking at all the training data points, instead of only the ones falling into each region;
        // here we estimate and apply a single global multiplicative factor to all leaf nodes
        float adjFactor = (m > 0) ? boostTreeLoss.ComputeResponseAdjust(funValueGain) : 1.0F;

        // apply the multiplicative factor to the leaf nodes of the newly generated regression tree
        newTree[k].AdjustResponse(adjFactor);
        newTree[k].AdjustFactor = adjFactor;
    }

    // return the k regression trees
    return newTree;
}
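// GetNextWeakLearner computes a single global multiplier (adjFactor) for all leaf nodes by
// looking at every training point rather than at each leaf's region alone. The exact rule
// lives in BoostTreeLoss.ComputeResponseAdjust; as an illustration of the idea, the sketch
// below shows the closed-form multiplier for squared loss, where
// argmin_a sum_i (r_i - a*h_i)^2 = sum(r*h) / sum(h*h). This is an assumed loss and a
// stand-alone helper, not the implementation used in this file.
private static float GlobalResponseAdjustSketch(float[] residuals, float[] treeOutputs)
{
    double num = 0.0, den = 0.0;
    for (int i = 0; i < residuals.Length; i++)
    {
        num += residuals[i] * treeOutputs[i]; // correlation of the tree output with the residual
        den += treeOutputs[i] * treeOutputs[i];
    }
    // guard the degenerate case of an all-zero tree; fall back to no adjustment
    return den > 0.0 ? (float)(num / den) : 1.0F;
}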
/// <summary>
/// This method implements the main functionality of stochastic gradient boosting, for distributed computing.
/// </summary>
private void DistributedBuildBoostTree(Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
                                       int maxTreeSize, int minNumSamples, int numIter, int cThreads, Random r)
{
    float minValidationErr = 100;
    float[] funValueGain = new float[this.numSamples];

    //(1) compute the scores produced by the sub-model
    boostTreeLoss.ModelEval(this.subModel, this.labelFeatureDataCoded, this.subModelScore);

    //(2) compute the corresponding function values
    boostTreeLoss.ModelScoresToFuncValues();

    //(3) compute the metrics of the sub-model
    int m = optIter = 0;
    metrics.ComputeMetrics(boostTreeLoss.ModelScores, m, false);
#if VERBOSE
    Console.WriteLine(metrics.ResultsHeaderStr());
    Console.WriteLine(metrics.ResultsStr(m));
#endif

    //(4) create samplers to sub-sample the features and data during node splitting
    RandomSampler featureSampler = new RandomSampler(r);
    RandomSampler dataSampler = new RandomSampler(r);

    //(5) create the object that does node splitting
#if SINGLE_THREAD
    // single-threaded
    this.findSplit = new FindSplitSync();
#else
    // multi-threaded
    this.findSplit = new FindSplitAsync(cThreads);
#endif //SINGLE_THREAD

    //(6) iteratively build the boosted trees
    for (m = 0; m < numIter; m++)
    {
        // get an array of regression trees (one per class k) for this iteration
        RegressionTree[] candidateTree = GetNextWeakLearner(m, funValueGain, metrics, boostTreeLoss, dataFeatureSampleRate,
                                                            dataSampler, featureSampler, maxTreeSize, minNumSamples, cThreads, r);
        AddWeakLearner(candidateTree, funValueGain, m, metrics, boostTreeLoss, dataFeatureSampleRate,
                       maxTreeSize, minNumSamples, cThreads, r);

        // compute the metrics of the current system
        boostTreeLoss.FuncValuesToModelScores();
        metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, false);
#if VERBOSE
        Console.WriteLine(metrics.ResultsStr(m + 1));
#endif
        // keep track of the best (minimal-error) iteration on the validation data set
        this.optIter = metrics.GetBest(DataPartitionType.Validation, ref minValidationErr);

        if ((m + 1) % 5 == 0) // save the tree every 5 iterations
            SaveBoostTree();
    }

    if (this.findSplit != null)
    {
        this.findSplit.Cleanup();
    }
}
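// The GetNextWeakLearner/AddWeakLearner split exists so that producing a candidate and
// committing it are separate steps. DistributedBuildBoostTree above simply chains them on
// one machine, but the same pair supports a coordinator that collects one candidate per
// worker and commits the winner everywhere. The sketch below shows that pattern under
// several assumptions: each worker holds its own data shard, the evaluateCandidate scoring
// hook is hypothetical, and sharing one funValueGain buffer and one sampler pair across
// workers is a simplification for illustration.
private static void CoordinatorRoundSketch(BoostTree[] workers, int m, float[] funValueGain,
    Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
    RandomSampler dataSampler, RandomSampler featureSampler, int maxTreeSize, int minNumSamples,
    int cThreads, Random r, Func<RegressionTree[], double> evaluateCandidate)
{
    // each worker proposes a candidate set of trees (one per class) for iteration m
    RegressionTree[] best = null;
    double bestScore = double.NegativeInfinity;
    foreach (BoostTree worker in workers)
    {
        RegressionTree[] candidate = worker.GetNextWeakLearner(m, funValueGain, metrics, boostTreeLoss,
            dataFeatureSampleRate, dataSampler, featureSampler, maxTreeSize, minNumSamples, cThreads, r);
        double score = evaluateCandidate(candidate); // hypothetical scoring hook, e.g. held-out error
        if (score > bestScore)
        {
            bestScore = score;
            best = candidate;
        }
    }

    // every worker commits the winning candidate so that all models stay in sync
    foreach (BoostTree worker in workers)
    {
        worker.AddWeakLearner(best, funValueGain, m, metrics, boostTreeLoss,
                              dataFeatureSampleRate, maxTreeSize, minNumSamples, cThreads, r);
    }
}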
/// <summary>
/// Main Program
/// </summary>
/// <param name="args">
/// Should contain five parameters in the order of:
/// file name for the training source data (DataProcess class),
/// file name for saving the tree,
/// tree size, i.e., maximum number of terminal nodes, usually 16 - 20,
/// learning rate, usually 0.02 - 0.06,
/// number of iterations, usually >500 (can be pruned later)
/// </param>
public static void Main(string[] args)
{
    try
    {
        TrainArgs cmd = new TrainArgs(args);

        Random r = null;
        if (cmd.seed == -1)
        {
            r = new Random();
        }
        else
        {
            r = new Random(cmd.seed);
        }

        string[] activeFeatureNames = null;
        // read and process only the subset of activated features specified in the activeFeatureFile
        if (cmd.activeFeatureFile != null)
        {
            activeFeatureNames = TsvFileLoader.ReadFeatureNames(cmd.activeFeatureFile);
        }

        // feature parser: a special module that understands MSN-style value encoding
        MsnFeatureParser featureParser = new MsnFeatureParser(activeFeatureNames);

        // the column name for the label
        string[] labelName = { cmd.labelName };

        // label/rating parser
        IParser<float> RateParser = new MsnLabelParser(labelName, cmd.labelNameValueFile);

        // data boundary: no boundary
        OnelineGroup noBoundary = new OnelineGroup();

        // load the coded data if it exists
        LabelFeatureDataCoded trainLabelFeatureDataCoded = (CLabelFeatureDataCoded)CLabelFeatureData.Load(cmd.trainFile, featureParser, RateParser, noBoundary, typeof(CLabelFeatureDataCoded), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);
        LabelFeatureData validLabelFeatureData = (CLabelFeatureData)CLabelFeatureData.Load(cmd.validFile, featureParser, RateParser, noBoundary, typeof(CLabelFeatureData), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);
        LabelFeatureData testLabelFeatureData = (CLabelFeatureData)CLabelFeatureData.Load(cmd.testFile, featureParser, RateParser, noBoundary, typeof(CLabelFeatureData), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);

        // build the composite data - an aggregated data object that keeps track of the train/valid/test data
        CLabelFeatureDataCodedComposite labelFeatureDataCoded = CLabelFeatureDataCodedComposite.Create(trainLabelFeatureDataCoded, validLabelFeatureData, testLabelFeatureData);

        // initial sub-model to boost on
        Model subModel = null;
        if (cmd.subModel != null && cmd.subModel.Length > 0)
        {
            if (cmd.cLayer > 0)
            {
                string[] layerNames = new string[cmd.cLayer];
                for (int i = 0; i < cmd.cLayer; i++)
                {
                    string num = "";
                    if (cmd.cLayer > 1)
                    {
                        num = (i + 1).ToString();
                    }
                    layerNames[i] = cmd.subModel + "layer" + num + ".txt";
                }
                subModel = new NNModel(layerNames);
            }
            else
            {
                string iniName = cmd.subModel + ".ini";
                subModel = NNModelMSN.Create(iniName);
            }
        }

        if (subModel != null && !subModel.SetFeatureNames(labelFeatureDataCoded.FeatureNames))
        {
            Console.WriteLine("Failed to initialize the specified submodel - training with an empty submodel");
            subModel = null;
        }

        LabelConverter labelConvert = new LabelConverterNull();
        LabelFeatureData subModelScore = null;
        McBoostTreeLoss boostTreeLoss = new McBoostTreeLoss(labelFeatureDataCoded, labelConvert, cmd.learnRate);
        BoostTree boostTree = new BoostTree(labelFeatureDataCoded, subModelScore, subModel, boostTreeLoss,
                                            cmd.binaryTreeFile, cmd.textTreeFile);

        // set up the error metrics that we would like to keep track of during testing
        DataPartitionType[] dataTypes = { DataPartitionType.Train, DataPartitionType.Validation, DataPartitionType.Test };

        // dp.LabelFeatureData is the data we are evaluating
        Metrics metrics = new ClassError(labelFeatureDataCoded, labelConvert, dataTypes);

        DataFeatureSampleRate dataFeatureSampleRate = new DataFeatureSampleRate(cmd.sampleFeatureRate, cmd.sampleDataRate, cmd.sampleDataGroupRate);

        boostTree.Build(metrics, dataFeatureSampleRate, cmd.cLeafNodes, cmd.minNumSamples, cmd.numIter, cmd.cThreads, r);

        metrics.SaveAllResults("TrainingErrHistory.txt");
    }
    catch (Exception exc)
    {
        Console.WriteLine(exc.Message);
    }
}
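// The doc comment on Main notes that a large numIter "can be pruned later": BuildBoostTree
// tracks optIter, the iteration with minimal validation error, and SaveAllResults writes
// the full error history. A post-training pruning pass could therefore keep only the first
// optIter iterations of trees. The sketch below shows that idea on a plain 2-D tree matrix;
// TruncateEnsemble is a hypothetical helper, not part of this codebase.
private static RegressionTree[,] TruncateEnsemble(RegressionTree[,] trees, int optIter, int numTreesPerIteration)
{
    // copy the first optIter iterations of trees; the later, over-fit iterations are discarded
    RegressionTree[,] pruned = new RegressionTree[optIter, numTreesPerIteration];
    for (int m = 0; m < optIter; m++)
        for (int k = 0; k < numTreesPerIteration; k++)
            pruned[m, k] = trees[m, k];
    return pruned;
}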