/// <summary>
/// This method implements the main functionality of stochastic gradient boosting:
/// it evaluates the sub-model, then iteratively fits regression trees to the
/// pseudo response and accumulates their (adjusted) function values.
/// </summary>
/// <param name="metrics">Receives the model scores after the sub-model evaluation and after every iteration; also queried for the best iteration on the validation partition.</param>
/// <param name="boostTreeLoss">Loss object used to evaluate the sub-model, compute the pseudo response, and accumulate function-value gains.</param>
/// <param name="dataFeatureSampleRate">Supplies the per-iteration sampling rates for data groups, features and data.</param>
/// <param name="maxTreeSize">Passed through to the <c>RegressionTree</c> constructor.</param>
/// <param name="minNumSamples">Passed through to the <c>RegressionTree</c> constructor.</param>
/// <param name="numIter">Number of boosting iterations to run.</param>
/// <param name="cThreads">Passed to <c>FindSplitAsync</c> when SINGLE_THREAD is not defined; unused otherwise.</param>
/// <param name="r">Random source shared by the samplers and the data-partition selection.</param>
private void BuildBoostTree(Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
    int maxTreeSize, int minNumSamples, int numIter,
    int cThreads, Random r)
{
    // Passed by ref to metrics.GetBest on every iteration.
    // NOTE(review): presumably the running best validation error - confirm against Metrics.GetBest.
    float minValidationErr = 100;

    float[] funValueGain = new float[this.numSamples];

    //(1) compute scores produced by the sub-model
    boostTreeLoss.ModelEval(this.subModel, this.labelFeatureDataCoded, this.subModelScore);

    //(2) compute the corresponding function values
    boostTreeLoss.ModelScoresToFuncValues();

    //(3) compute the metrics of the sub-model
    int m = optIter = 0;
    metrics.ComputeMetrics(boostTreeLoss.ModelScores, m, false);
#if VERBOSE
    Console.WriteLine(metrics.ResultsHeaderStr());
    Console.WriteLine(metrics.ResultsStr(m));
#endif

    //(4) create samplers to sub-sample the features and data during node splitting
    RandomSampler featureSampler = new RandomSampler(r);
    RandomSampler dataSampler = new RandomSampler(r);

    //(5) create the object that does node splitting
#if SINGLE_THREAD
    // single-threaded
    this.findSplit = new FindSplitSync();
#else
    // multi-threaded
    this.findSplit = new FindSplitAsync(cThreads);
#endif //SINGLE_THREAD

    //(6) Iteratively building boosted trees
    // The loop is wrapped in try/finally so that the splitter is cleaned up even when
    // an iteration throws; previously Cleanup() was skipped on any exception.
    try
    {
        for (m = 0; m < numIter; m++)
        {
            // selecting a fraction of data groups for each iteration
            float sampleRate = dataFeatureSampleRate.SampleDataGroupRate(m);
            DataSet workDataSet = this.labelFeatureDataCoded.DataGroups.GetDataPartition(DataPartitionType.Train, sampleRate, r);
            workDataSet.Sort(); // sorting gains some noticeable speedup.

            // compute the pseudo response of the current system
            boostTreeLoss.ComputePseudoResponse(workDataSet);

            // set the data and feature sampling rate for node splitting in this iteration
            featureSampler.SampleRate = dataFeatureSampleRate.SampleFeatureRate(m);
            dataSampler.SampleRate = dataFeatureSampleRate.SampleDataRate(m);

            // fit a residual model (regression trees) from the pseudo response
            // to compensate the error of the current system
            for (int k = 0; k < boostTreeLoss.NumTreesPerIteration; k++)
            {
                // only use the important data points if necessary
                int[] trimIndex = boostTreeLoss.TrimIndex(workDataSet, k, m);

                // build a regression tree according to the pseudo-response
                this.regressionTrees[m, k] = new RegressionTree(this.labelFeatureDataCoded, boostTreeLoss, k, trimIndex,
                                                                dataSampler, featureSampler,
                                                                maxTreeSize, minNumSamples, this.findSplit, this.tempSpace);

                // compute the function value of all data points produced by the newly generated regression tree
                this.regressionTrees[m, k].PredictFunValue(this.labelFeatureDataCoded, ref funValueGain);

                // try to do a more global optimization - refine the leaf node response of a decision tree
                // by looking at all the training data points, instead of only the ones falling into the region.
                // Here we estimate and apply a global multiplication factor for all leaf nodes;
                // the very first iteration (m == 0) is left unscaled.
                float adjFactor = (m > 0) ? boostTreeLoss.ComputeResponseAdjust(funValueGain) : 1.0F;

                // apply the multiplication factor to the leaf nodes of the newly generated regression tree
                this.regressionTrees[m, k].AdjustResponse(adjFactor);

                // update the function value for all data points given the new regression tree
                boostTreeLoss.AccFuncValueGain(funValueGain, adjFactor, k);
            }

            // compute the metrics of the current system
            boostTreeLoss.FuncValuesToModelScores();
            metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, false);
#if VERBOSE
            Console.WriteLine(metrics.ResultsStr(m + 1));
#endif
            // keep track of the best (minimal Error) iteration on the Validation data set
            this.optIter = metrics.GetBest(DataPartitionType.Validation, ref minValidationErr);

            // save the tree every 5 iterations
            if ((m + 1) % 5 == 0)
            {
                SaveBoostTree();
            }
        }
    }
    finally
    {
        if (this.findSplit != null)
        {
            this.findSplit.Cleanup();
        }
    }
}
/// <summary>
/// Scores <paramref name="labelFeatureData"/> with the sub-model plus the first
/// <paramref name="numIter"/> boosting iterations, accumulating the function values
/// of every regression tree into <paramref name="boostTreeLoss"/>.
/// </summary>
/// <param name="labelFeatureData">The data to score.</param>
/// <param name="numIter">Number of boosting iterations to apply; capped at <c>TotalIter</c>.</param>
/// <param name="boostTreeLoss">Loss object that is reset and then holds the accumulated function values / model scores.</param>
/// <param name="metrics">Optional. When non-null, metrics are computed for every iteration and the final scores are saved to "DataScores.txt".</param>
/// <param name="silent">If true, only the results of the last iteration are written to the console.</param>
public void Predict(LabelFeatureData labelFeatureData, int numIter,
    BoostTreeLoss boostTreeLoss, Metrics metrics, //reporting the error for each iteration if the following are set
    bool silent // If true, only report results on the last iteration
    )
{
    // never run more iterations than the model holds
    if (numIter > this.TotalIter)
    {
        numIter = this.TotalIter;
    }

    boostTreeLoss.Reset(labelFeatureData.NumDataPoint);

    //(1) scores of the sub-model
    boostTreeLoss.ModelEval(this.subModel, labelFeatureData, null);

    //(2) turn those scores into function values
    boostTreeLoss.ModelScoresToFuncValues();

    bool reportMetrics = (metrics != null);

    if (reportMetrics)
    {
        // iteration 0 == sub-model only; the last argument tells whether this is the recorded optIter
        metrics.ComputeMetrics(boostTreeLoss.ModelScores, 0, this.optIter == 0);
#if VERBOSE
        Console.WriteLine(metrics.ResultsHeaderStr());
        Console.WriteLine(metrics.ResultsStr(0));
#endif
    }

    //(3) add up the function values contributed by each boosted regression tree
    int pointCount = labelFeatureData.NumDataPoint;
    float[] treeGain = new float[pointCount];

#if GET_PER_DOC_PER_ITER_SCORES
    // rows 0 and 1 hold group id and label, rows 2.. hold one iteration each (transposed on output)
    float[][] saveScores = ArrayUtils.FloatMatrix(numIter+2, labelFeatureData.NumDataPoint);
    for (int i = 0; i < labelFeatureData.NumDataPoint; ++i)
    {
        saveScores[0][i] = labelFeatureData.GetGroupId(i);
        saveScores[1][i] = labelFeatureData.GetLabel(i);
    }
#endif

    for (int iter = 0; iter < numIter; iter++)
    {
        // evaluate every tree of this iteration and accumulate its output
        for (int treeIdx = 0; treeIdx < boostTreeLoss.NumTreesPerIteration; treeIdx++)
        {
            // an iteration whose first tree is missing contributes nothing
            if (this.regressionTrees[iter, 0] == null)
            {
                break;
            }
#if GET_PER_DOC_PER_ITER_SCORES
            this.regressionTrees[iter, treeIdx].PredictFunValueNKeepScores(labelFeatureData, this.Train2TestIdx, treeGain, saveScores[iter+2]);
#else
            this.regressionTrees[iter, treeIdx].PredictFunValue(labelFeatureData, this.Train2TestIdx, treeGain);
#endif
            boostTreeLoss.AccFuncValueGain(treeGain, 1.0f, treeIdx);
        }

        if (reportMetrics)
        {
            // metrics of the system after this iteration
            boostTreeLoss.FuncValuesToModelScores();
            metrics.ComputeMetrics(boostTreeLoss.ModelScores, iter + 1, this.optIter == iter + 1);

            bool isLastIteration = (iter == numIter - 1);
            if (isLastIteration || !silent)
            {
                Console.WriteLine(metrics.ResultsStr(iter + 1));
            }
        }
    }

#if GET_PER_DOC_PER_ITER_SCORES
    using (StreamWriter sw = new StreamWriter("allScores.tsv"))
    {
        // header row (no trailing tab)
        sw.Write("m:QueryID\tm:Rating");
        for (int col = 1; col < numIter+1; ++col)
        {
            sw.Write("\tFtr_" + col.ToString("0000"));
        }
        sw.WriteLine();

        // one row per data point: query id, label, then the per-iteration scores
        for (int doc = 0; doc < labelFeatureData.NumDataPoint; ++doc)
        {
            sw.Write("{0}\t{1}", saveScores[0][doc], saveScores[1][doc]);
            for (int row = 2; row < numIter + 2; ++row)
            {
                sw.Write("\t{0:G6}", saveScores[row][doc]);
            }
            sw.WriteLine();
        }
    }
#endif

    if (reportMetrics)
    {
        metrics.SaveScores("DataScores.txt", boostTreeLoss.ModelScores);
    }
    else
    {
        // without metrics the scores were never refreshed inside the loop, so do it once here
        boostTreeLoss.FuncValuesToModelScores();
    }
}
/// <summary>
/// This method implements the main functionality of stochastic gradient boosting, for distributed computing.
/// Each iteration obtains a candidate set of trees from <c>GetNextWeakLearner</c> and adds it
/// to the model through <c>AddWeakLearner</c>.
/// </summary>
/// <param name="metrics">Receives the model scores after the sub-model evaluation and after every iteration; also queried for the best iteration on the validation partition.</param>
/// <param name="boostTreeLoss">Loss object used to evaluate the sub-model and convert between function values and model scores.</param>
/// <param name="dataFeatureSampleRate">Sampling-rate provider, forwarded to the weak-learner helpers.</param>
/// <param name="maxTreeSize">Forwarded to the weak-learner helpers.</param>
/// <param name="minNumSamples">Forwarded to the weak-learner helpers.</param>
/// <param name="numIter">Number of boosting iterations to run.</param>
/// <param name="cThreads">Passed to <c>FindSplitAsync</c> when SINGLE_THREAD is not defined, and forwarded to the weak-learner helpers.</param>
/// <param name="r">Random source shared by the samplers and forwarded to the weak-learner helpers.</param>
private void DistributedBuildBoostTree(Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
    int maxTreeSize, int minNumSamples, int numIter,
    int cThreads, Random r)
{
    // Passed by ref to metrics.GetBest on every iteration.
    // NOTE(review): presumably the running best validation error - confirm against Metrics.GetBest.
    float minValidationErr = 100;

    float[] funValueGain = new float[this.numSamples];

    //(1) compute scores produced by the sub-model
    boostTreeLoss.ModelEval(this.subModel, this.labelFeatureDataCoded, this.subModelScore);

    //(2) compute the corresponding function values
    boostTreeLoss.ModelScoresToFuncValues();

    //(3) compute the metrics of the sub-model
    int m = optIter = 0;
    metrics.ComputeMetrics(boostTreeLoss.ModelScores, m, false);
#if VERBOSE
    Console.WriteLine(metrics.ResultsHeaderStr());
    Console.WriteLine(metrics.ResultsStr(m));
#endif

    //(4) create samplers to sub-sample the features and data during node splitting
    RandomSampler featureSampler = new RandomSampler(r);
    RandomSampler dataSampler = new RandomSampler(r);

    //(5) create the object that does node splitting
#if SINGLE_THREAD
    // single-threaded
    this.findSplit = new FindSplitSync();
#else
    // multi-threaded
    this.findSplit = new FindSplitAsync(cThreads);
#endif //SINGLE_THREAD

    //(6) Iteratively building boosted trees
    // The loop is wrapped in try/finally so that the splitter is cleaned up even when
    // an iteration throws; previously Cleanup() was skipped on any exception.
    try
    {
        for (m = 0; m < numIter; m++)
        {
            // returns array of regression trees (one per class k) for this iteration
            RegressionTree[] candidateTree = GetNextWeakLearner(m, funValueGain, metrics, boostTreeLoss, dataFeatureSampleRate, dataSampler, featureSampler, maxTreeSize, minNumSamples, cThreads, r);

            AddWeakLearner(candidateTree, funValueGain, m, metrics, boostTreeLoss, dataFeatureSampleRate, maxTreeSize, minNumSamples, cThreads, r);

            // compute the metrics of the current system
            boostTreeLoss.FuncValuesToModelScores();
            metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, false);
#if VERBOSE
            Console.WriteLine(metrics.ResultsStr(m + 1));
#endif
            // keep track of the best (minimal Error) iteration on the Validation data set
            this.optIter = metrics.GetBest(DataPartitionType.Validation, ref minValidationErr);

            // save the tree every 5 iterations
            if ((m + 1) % 5 == 0)
            {
                SaveBoostTree();
            }
        }
    }
    finally
    {
        if (this.findSplit != null)
        {
            this.findSplit.Cleanup();
        }
    }
}