/// <summary>
/// This method implements the main functionality of stochastic gradient boosting:
/// it evaluates the initial sub-model, then iteratively fits regression trees to the
/// loss pseudo-response, adjusts their leaf values, and accumulates them into the model.
/// </summary>
/// <param name="metrics">Tracks train/validation error per iteration.</param>
/// <param name="boostTreeLoss">Loss object holding model scores, function values and pseudo-responses.</param>
/// <param name="dataFeatureSampleRate">Supplies per-iteration data/feature/group sampling rates.</param>
/// <param name="maxTreeSize">Maximum number of leaf nodes per regression tree.</param>
/// <param name="minNumSamples">Minimum data points required to split a node.</param>
/// <param name="numIter">Number of boosting iterations.</param>
/// <param name="cThreads">Thread count for the multi-threaded split finder.</param>
/// <param name="r">Random source shared by all samplers.</param>
private void BuildBoostTree(Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
                            int maxTreeSize, int minNumSamples, int numIter, int cThreads, Random r)
{
    // NOTE(review): assumes the validation error metric starts below 100 so the first
    // GetBest call always improves on this sentinel — confirm; float.MaxValue would be safer.
    float minValidationErr = 100;
    // Per-data-point function-value contribution of the most recent tree.
    float[] funValueGain = new float[this.numSamples];

    //(1) compute scores produced by the sub-model
    boostTreeLoss.ModelEval(this.subModel, this.labelFeatureDataCoded, this.subModelScore);

    //(2) compute the corresponding function values
    boostTreeLoss.ModelScoresToFuncValues();

    //(3) compute the metrics of the sub-model (iteration 0 = sub-model alone)
    int m = optIter = 0;
    metrics.ComputeMetrics(boostTreeLoss.ModelScores, m, false);
#if VERBOSE
    Console.WriteLine(metrics.ResultsHeaderStr());
    Console.WriteLine(metrics.ResultsStr(m));
#endif

    //(4) create samplers to sub-sample the features and data during node splitting
    RandomSampler featureSampler = new RandomSampler(r);
    RandomSampler dataSampler = new RandomSampler(r);

    //(5) create the object that does node splitting
#if SINGLE_THREAD
    // single-threaded
    this.findSplit = new FindSplitSync();
#else
    // multi-threaded
    this.findSplit = new FindSplitAsync(cThreads);
#endif //SINGLE_THREAD

    //(6) iteratively build boosted trees
    for (m = 0; m < numIter; m++)
    {
        // select a fraction of data groups for each iteration
        float sampleRate = dataFeatureSampleRate.SampleDataGroupRate(m);
        DataSet workDataSet = this.labelFeatureDataCoded.DataGroups.GetDataPartition(DataPartitionType.Train, sampleRate, r);
        workDataSet.Sort(); // sorting gains some noticeable speedup.

        // compute the pseudo response (gradient of the loss) of the current system
        boostTreeLoss.ComputePseudoResponse(workDataSet);

        // set the data and feature sampling rates for node splitting in this iteration
        featureSampler.SampleRate = dataFeatureSampleRate.SampleFeatureRate(m);
        dataSampler.SampleRate = dataFeatureSampleRate.SampleDataRate(m);

        // fit a residual model (one regression tree per class/output k) from the pseudo
        // response to compensate the error of the current system
        for (int k = 0; k < boostTreeLoss.NumTreesPerIteration; k++)
        {
            // only use the important data points if necessary
            int[] trimIndex = boostTreeLoss.TrimIndex(workDataSet, k, m);

            // build a regression tree according to the pseudo-response
            this.regressionTrees[m, k] = new RegressionTree(this.labelFeatureDataCoded, boostTreeLoss, k, trimIndex, dataSampler, featureSampler, maxTreeSize, minNumSamples, this.findSplit, this.tempSpace);

            // compute the function value of all data points produced by the newly generated regression tree
            this.regressionTrees[m, k].PredictFunValue(this.labelFeatureDataCoded, ref funValueGain);

            // try a more global optimization - refine the leaf node response of the tree
            // by looking at all the training data points, instead of only the ones falling
            // into the region: estimate and apply one global multiplicative factor for all
            // leaf nodes (skipped on the first iteration, where the factor is fixed at 1).
            float adjFactor = (m>0) ? boostTreeLoss.ComputeResponseAdjust(funValueGain) : 1.0F;

            // apply the multiplication factor to the leaf nodes of the newly generated regression tree
            this.regressionTrees[m, k].AdjustResponse(adjFactor);

            // update the function value for all data points given the new regression tree
            boostTreeLoss.AccFuncValueGain(funValueGain, adjFactor, k);
        }

        // compute the metrics of the current system (iteration index m+1: offset by the sub-model)
        boostTreeLoss.FuncValuesToModelScores();
        metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, false);
#if VERBOSE
        Console.WriteLine(metrics.ResultsStr(m+1));
#endif
        // keep track of the best (minimal error) iteration on the validation data set
        this.optIter = metrics.GetBest(DataPartitionType.Validation, ref minValidationErr);

        if ((m+1) % 5 == 0) // save the tree every 5 iterations
            SaveBoostTree();
    }

    if (this.findSplit != null)
    {
        this.findSplit.Cleanup();
    }
}
/// <summary>
/// This method implements the main functionality of stochastic gradient boosting for
/// distributed computing: the per-iteration tree fitting is factored into
/// GetNextWeakLearner (candidate generation) and AddWeakLearner (accumulation), but the
/// overall loop mirrors the single-machine BuildBoostTree.
/// </summary>
/// <param name="metrics">Tracks train/validation error per iteration.</param>
/// <param name="boostTreeLoss">Loss object holding model scores, function values and pseudo-responses.</param>
/// <param name="dataFeatureSampleRate">Supplies per-iteration data/feature/group sampling rates.</param>
/// <param name="maxTreeSize">Maximum number of leaf nodes per regression tree.</param>
/// <param name="minNumSamples">Minimum data points required to split a node.</param>
/// <param name="numIter">Number of boosting iterations.</param>
/// <param name="cThreads">Thread count for the multi-threaded split finder.</param>
/// <param name="r">Random source shared by all samplers.</param>
private void DistributedBuildBoostTree(Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
                                       int maxTreeSize, int minNumSamples, int numIter, int cThreads, Random r)
{
    // NOTE(review): same sentinel assumption as BuildBoostTree — validation error is
    // expected to start below 100; confirm the metric's range.
    float minValidationErr = 100;
    // Per-data-point function-value contribution of the most recent tree.
    float[] funValueGain = new float[this.numSamples];

    //(1) compute scores produced by the sub-model
    boostTreeLoss.ModelEval(this.subModel, this.labelFeatureDataCoded, this.subModelScore);

    //(2) compute the corresponding function values
    boostTreeLoss.ModelScoresToFuncValues();

    //(3) compute the metrics of the sub-model (iteration 0 = sub-model alone)
    int m = optIter = 0;
    metrics.ComputeMetrics(boostTreeLoss.ModelScores, m, false);
#if VERBOSE
    Console.WriteLine(metrics.ResultsHeaderStr());
    Console.WriteLine(metrics.ResultsStr(m));
#endif

    //(4) create samplers to sub-sample the features and data during node splitting
    RandomSampler featureSampler = new RandomSampler(r);
    RandomSampler dataSampler = new RandomSampler(r);

    //(5) create the object that does node splitting
#if SINGLE_THREAD
    // single-threaded
    this.findSplit = new FindSplitSync();
#else
    // multi-threaded
    this.findSplit = new FindSplitAsync(cThreads);
#endif //SINGLE_THREAD

    //(6) iteratively build boosted trees
    for (m = 0; m < numIter; m++)
    {
        // returns array of regression trees (one per class k) for this iteration
        RegressionTree[] candidateTree = GetNextWeakLearner(m, funValueGain, metrics, boostTreeLoss, dataFeatureSampleRate, dataSampler, featureSampler, maxTreeSize, minNumSamples, cThreads, r);

        // fold the candidate trees into the current model (updates function values)
        AddWeakLearner(candidateTree, funValueGain, m, metrics, boostTreeLoss, dataFeatureSampleRate, maxTreeSize, minNumSamples, cThreads, r);

        // compute the metrics of the current system (iteration index m+1: offset by the sub-model)
        boostTreeLoss.FuncValuesToModelScores();
        metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, false);
#if VERBOSE
        Console.WriteLine(metrics.ResultsStr(m + 1));
#endif
        // keep track of the best (minimal error) iteration on the validation data set
        this.optIter = metrics.GetBest(DataPartitionType.Validation, ref minValidationErr);

        if ((m + 1) % 5 == 0) // save the tree every 5 iterations
            SaveBoostTree();
    }

    if (this.findSplit != null)
    {
        this.findSplit.Cleanup();
    }
}
/// <summary>
/// Fits the candidate weak learners for boosting iteration <paramref name="m"/>:
/// one regression tree per class/output, trained on the pseudo-response of the
/// current model over a sampled subset of the training groups. Leaf responses are
/// globally rescaled and the scale factor is recorded on each tree.
/// </summary>
/// <param name="m">Zero-based boosting iteration index.</param>
/// <param name="funValueGain">Scratch buffer receiving each tree's per-point function-value gain.</param>
/// <param name="metrics">Unused here; kept for signature parity with the driver loop.</param>
/// <param name="boostTreeLoss">Loss object providing pseudo-responses and tree count.</param>
/// <param name="dataFeatureSampleRate">Per-iteration sampling-rate source.</param>
/// <param name="dataSampler">Sampler used to sub-sample data points at each node split.</param>
/// <param name="featureSampler">Sampler used to sub-sample features at each node split.</param>
/// <param name="maxTreeSize">Maximum number of leaf nodes per tree.</param>
/// <param name="minNumSamples">Minimum data points required to split a node.</param>
/// <param name="cThreads">Unused here; kept for signature parity with the driver loop.</param>
/// <param name="r">Random source for the data-group partition.</param>
/// <returns>The newly fitted regression trees, one per class for this iteration.</returns>
public RegressionTree[] GetNextWeakLearner(int m, float[] funValueGain, Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate, RandomSampler dataSampler, RandomSampler featureSampler, int maxTreeSize, int minNumSamples, int cThreads, Random r)
{
    // Draw this iteration's fraction of training data groups; sorted data makes
    // node splitting noticeably faster.
    float groupRate = dataFeatureSampleRate.SampleDataGroupRate(m);
    DataSet sampledData = this.labelFeatureDataCoded.DataGroups.GetDataPartition(DataPartitionType.Train, groupRate, r);
    sampledData.Sort();

    // Pseudo-response (loss gradient) of the current system on the sampled data.
    boostTreeLoss.ComputePseudoResponse(sampledData);

    // Refresh the per-iteration feature/data sampling rates used during splitting.
    featureSampler.SampleRate = dataFeatureSampleRate.SampleFeatureRate(m);
    dataSampler.SampleRate = dataFeatureSampleRate.SampleDataRate(m);

    // Fit one residual tree per class to compensate the current system's error.
    int treeCount = boostTreeLoss.NumTreesPerIteration;
    RegressionTree[] learners = new RegressionTree[treeCount];
    for (int t = 0; t < treeCount; t++)
    {
        // Optionally restrict fitting to the most important data points.
        int[] activeIndex = boostTreeLoss.TrimIndex(sampledData, t, m);

        // Grow a regression tree on the pseudo-response of class t.
        learners[t] = new RegressionTree(this.labelFeatureDataCoded, boostTreeLoss, t, activeIndex,
                                         dataSampler, featureSampler, maxTreeSize, minNumSamples,
                                         this.findSplit, this.tempSpace);

        // Function-value gain this tree contributes for every data point.
        learners[t].PredictFunValue(this.labelFeatureDataCoded, ref funValueGain);

        // Global refinement: estimate a single multiplicative factor over all training
        // points (not just those in each leaf's region) and apply it to every leaf.
        // The first iteration keeps the factor at 1.
        float leafScale = (m > 0) ? boostTreeLoss.ComputeResponseAdjust(funValueGain) : 1.0F;
        learners[t].AdjustResponse(leafScale);
        learners[t].AdjustFactor = leafScale;
    }

    // Hand the per-class trees back to the caller for accumulation.
    return learners;
}
/// <summary>
/// Grows a single regression tree by greedy best-first node splitting: starting from a
/// root containing all work points, it repeatedly finds the terminal node with the
/// highest split gain, splits it into two children, and stops when maxTreeSize - 1
/// splits are made or no node yields a valid split. Finally, each terminal node's
/// region value is computed from the loss object.
/// </summary>
/// <param name="boostTreeLoss">Supplies the pseudo-response to fit and the leaf-response computation.</param>
/// <param name="iTree">Index of the class/output whose pseudo-response this tree fits.</param>
/// <param name="findSplit">Strategy object that searches for the best split of a node.</param>
/// <param name="featureSampler">Sampler shuffled per node to sub-sample candidate features.</param>
/// <param name="dataSampler">Sampler shuffled per node to sub-sample candidate data points.</param>
private void BuildRegressionTree(BoostTreeLoss boostTreeLoss, int iTree, IFindSplit findSplit, RandomSampler featureSampler, RandomSampler dataSampler)
{
    this.responses = boostTreeLoss.PseudoResponse(iTree);

    // Root node holds every index of the work set; the tree array is laid out so that
    // split i produces children at slots 2i+1 and 2i+2 (at most 2*maxTreeSize-1 nodes).
    TreeNode root = new TreeNode();
    root.isTerminal = true;
    root.dataPoints = Vector.IndexArray(this.workIndex.Length);
    this.tree = new TreeNode[2 * maxTreeSize - 1];
    this.tree[0] = root;

    for (int i = 0; i < maxTreeSize - 1; i++)
    {
        float maxGain = -1;
        int bestRegion = -1;
        TreeNode leftNode = new TreeNode();
        TreeNode rightNode = new TreeNode();

        // qiangwu: compute the best split for new nodes.
        // We only need to explore the last two nodes because they and only they are new
        // (older terminals keep their cached gain from earlier passes), i.e.
        // for (int j = 2*i; j >= 0; j--)
        for (int j = 0; j < 2 * i + 1; j++)
        {
            TreeNode curNode = this.tree[j];
            // qiangwu: (assert curNode.split<0 && curNode.isTerminal) <==> (2*i-1 <= j <= 2*i)
            if (curNode.split < 0 && curNode.isTerminal && curNode.dataPoints.Length >= this.minNumSamples)
            {
                // Re-shuffle the samplers so each node sees a fresh data/feature subset.
                dataSampler.Shuffle(curNode.dataPoints.Length);
                featureSampler.Shuffle(this.numFeatures);
                Split bestSplit = findSplit.FindBestSplit(this.labelFeatureDataCoded, this.responses, curNode.dataPoints, this.workIndex, featureSampler, dataSampler, this.minNumSamples);

                // qiangwu: the only way (bestSplit.feature < 0) happens is because
                // this.dataColRange[dim]=1 for all dimensions, i.e. the values of all
                // data points in every dimension are the same (or in one bin).
                if (bestSplit.feature >= 0)
                {
                    curNode.split = bestSplit.feature;
                    curNode.gain = (float)bestSplit.gain;
                    curNode.splitValueCoded = bestSplit.iThresh + 0.2F; // add 0.2 to avoid boundary check or floating point rounding
                    curNode.splitValue = this.labelFeatureDataCoded.ConvertToOrigData(curNode.split, curNode.splitValueCoded);
                    //SplitOneDim(curNode.dataPoints, regionSplitDim, regionSplitPoint, out curNode.leftPoints, out curNode.rightPoints);
                }
            }
            if (curNode.gain > maxGain)
            {
                maxGain = curNode.gain;
                bestRegion = j;
            }
        }

        // No splittable terminal remains: the tree is finished early.
        if (bestRegion == -1)
            break;

        // Materialize the winning split into two child terminals.
        TreeNode bestNode = this.tree[bestRegion];
        SplitOneDim(bestNode.dataPoints, bestNode.split, (int)bestNode.splitValueCoded, out bestNode.leftPoints, out bestNode.rightPoints);

        leftNode.isTerminal = true;
        leftNode.parent = bestRegion;
        leftNode.dataPoints = bestNode.leftPoints;

        rightNode.isTerminal = true;
        rightNode.parent = bestRegion;
        rightNode.dataPoints = bestNode.rightPoints;

        this.tree[2 * i + 1] = leftNode;
        this.tree[2 * i + 2] = rightNode;

        // Accumulate feature importance by the gain of the split on that feature.
        this.featureImportance[bestNode.split] += bestNode.gain;

        bestNode.leftChild = 2 * i + 1;
        bestNode.rightChild = 2 * i + 2;
        bestNode.isTerminal = false;
        bestNode.gain = -1; // internal nodes must never win the best-region scan again

        // Release the interior node's point arrays; they now live on the children.
        // (Fixed: removed the GC.Collect() that forced a full blocking collection on
        // every single split — the GC reclaims these arrays on its own, and forcing it
        // inside the hot loop was a severe throughput hit.)
        bestNode.dataPoints = null;
        bestNode.leftPoints = null;
        bestNode.rightPoints = null;
    }

    // qiangwu: compute the response of each terminal region (leaf).
    for (int i = 0; i < this.tree.Length; i++)
    {
        if (this.tree[i] != null && this.tree[i].isTerminal)
        {
            Debug.Assert(this.tree[i].dataPoints.Length >= this.minNumSamples, "Regression Tree split has problems");
            float v = boostTreeLoss.Response(this.tree[i].dataPoints, this.workIndex, iTree);

            // Round the regional value to 5 decimal places to remove/alleviate
            // floating-point precision differences so that different algorithms
            // produce the same model/results.
#if ROUND
            this.tree[i].regionValue = (float)Math.Round(v, 5);
#else
            this.tree[i].regionValue = v;
#endif //ROUND

            // Leaf point arrays are no longer needed once the response is computed.
            // (Fixed: removed the per-leaf GC.Collect() here for the same reason as above.)
            this.tree[i].dataPoints = null;
            this.tree[i].leftPoints = null;
            this.tree[i].rightPoints = null;
        }
    }
}
/// <summary>
/// Constructs and immediately fits a regression tree to the pseudo-response of
/// class/output <paramref name="iTree"/> over the points selected by
/// <paramref name="workIndex"/>.
/// </summary>
/// <param name="labelFeatureDataCoded">Coded training data (features and labels).</param>
/// <param name="boostTreeLoss">Loss object providing the pseudo-response and leaf values.</param>
/// <param name="iTree">Index of the class/output this tree is fitted to.</param>
/// <param name="workIndex">Indices of the data points to train on.</param>
/// <param name="featureSampler">Sampler for feature sub-sampling during splitting.</param>
/// <param name="dataSampler">Sampler for data sub-sampling during splitting.</param>
/// <param name="maxTreeSize">Maximum number of leaf nodes.</param>
/// <param name="minNumSamples">Minimum data points required to split a node.</param>
/// <param name="findSplit">Strategy object that searches for the best node split.</param>
/// <param name="tempSpace">Reusable scratch buffers for tree construction.</param>
public RegressionTree(LabelFeatureDataCoded labelFeatureDataCoded, BoostTreeLoss boostTreeLoss, int iTree, int[] workIndex,
                      RandomSampler featureSampler, RandomSampler dataSampler,
                      int maxTreeSize, int minNumSamples, IFindSplit findSplit, TempSpace tempSpace)
{
    this.labelFeatureDataCoded = labelFeatureDataCoded;
    this.workIndex = workIndex;
    this.numFeatures = labelFeatureDataCoded.NumFeatures;
    this.maxTreeSize = maxTreeSize;
    this.featureImportance = new float[this.numFeatures];
    this.minNumSamples = minNumSamples;

    // distributed setting: leaf-response scale defaults to 1 until AdjustResponse is applied
    this.adjustFactor = 1.0F;

    InitTempSpace(tempSpace);

    // NOTE(review): the samplers are forwarded here in swapped order relative to
    // BuildRegressionTree's (featureSampler, dataSampler) parameter list, and the
    // visible caller (BuildBoostTree) also passes (dataSampler, featureSampler) into
    // this constructor's (featureSampler, dataSampler) parameters — the two swaps
    // cancel out, so behavior is correct. Confirm every call site before "fixing"
    // either order.
    BuildRegressionTree(boostTreeLoss, iTree, findSplit, dataSampler, featureSampler);

    // Fixed: removed the trailing GC.Collect() ("hope for the best!!!") — forcing a
    // full blocking collection after every tree is a severe throughput hit and the
    // GC reclaims construction temporaries on its own.
}
/// <summary>
/// Single-threaded split search: scans every sampled feature on the calling thread
/// and returns the best split found for the given node's data points.
/// </summary>
/// <param name="labelFeatureDataCoded">Coded training data.</param>
/// <param name="responses">Pseudo-response values the split quality is measured against.</param>
/// <param name="dataPoints">Indices of the data points in the node being split.</param>
/// <param name="workIndex">Work-set index mapping for the coded data.</param>
/// <param name="featureSampler">Feature sub-sampler; its SampleSize bounds the feature scan.</param>
/// <param name="dataSampler">Data sub-sampler consulted during the scan.</param>
/// <param name="minNumSamples">Minimum points each side of a split must retain.</param>
/// <returns>The best split found (its feature is negative if no valid split exists).</returns>
public Split FindBestSplit(LabelFeatureDataCoded labelFeatureDataCoded, float[] responses, int[] dataPoints, int[] workIndex, RandomSampler featureSampler, RandomSampler dataSampler, int minNumSamples)
{
    // Cover the whole sampled-feature range [0, SampleSize) in one synchronous pass.
    int featureEnd = featureSampler.SampleSize;
    findSplitObj.SetData(labelFeatureDataCoded, responses, dataPoints, workIndex,
                         0, featureEnd, featureSampler, dataSampler, minNumSamples);
    findSplitObj.Find();

    Split winner = findSplitObj.bestSplit;
    return winner;
}
/// <summary>
/// Loads the inputs for one split-search pass: the coded data and responses, the
/// node's data points, the half-open feature range [iStart, iEnd) this worker will
/// scan, the samplers, and the minimum-samples constraint. All values are simply
/// stored on the object for a subsequent Find() call.
/// </summary>
/// <param name="LabelFeatureDataCoded">Coded training data.</param>
/// <param name="responses">Pseudo-response values for split evaluation.</param>
/// <param name="dataPoints">Indices of the data points in the node being split.</param>
/// <param name="workIndex">Work-set index mapping for the coded data.</param>
/// <param name="iStart">First (inclusive) sampled-feature index to scan.</param>
/// <param name="iEnd">Last (exclusive) sampled-feature index to scan.</param>
/// <param name="featureSampler">Feature sub-sampler.</param>
/// <param name="dataSampler">Data sub-sampler.</param>
/// <param name="minNumSamples">Minimum points each side of a split must retain.</param>
public void SetData(LabelFeatureDataCoded LabelFeatureDataCoded, float[] responses, int[] dataPoints, int[] workIndex, int iStart, int iEnd, RandomSampler featureSampler, RandomSampler dataSampler, int minNumSamples)
{
    // Feature range assigned to this worker.
    this.iStart = iStart;
    this.iEnd = iEnd;

    // Split constraint and samplers.
    this.minNumSamples = minNumSamples;
    this.featureSampler = featureSampler;
    this.dataSampler = dataSampler;

    // Node membership and index mapping.
    this.dataPoints = dataPoints;
    this.workIndex = workIndex;

    // Training data and the responses the split quality is measured against.
    this.LabelFeatureDataCoded = LabelFeatureDataCoded;
    this.responses = responses;
}
/// <summary>
/// Multi-threaded split search: farms out one sampled feature at a time to a pool of
/// worker threads, merging each finished worker's best split into the running best
/// before handing it the next feature, then drains all workers and merges their final
/// results. Returns the overall best split.
/// </summary>
/// <param name="labelFeatureDataCoded">Coded training data.</param>
/// <param name="responses">Pseudo-response values the split quality is measured against.</param>
/// <param name="dataPoints">Indices of the data points in the node being split.</param>
/// <param name="workIndex">Work-set index mapping for the coded data.</param>
/// <param name="featureSampler">Feature sub-sampler; its SampleSize bounds the feature scan.</param>
/// <param name="dataSampler">Data sub-sampler consulted during the scan.</param>
/// <param name="minNumSamples">Minimum points each side of a split must retain.</param>
/// <returns>The best split found across all worker threads.</returns>
public Split FindBestSplit(LabelFeatureDataCoded labelFeatureDataCoded, float[] responses, int[] dataPoints, int[] workIndex, RandomSampler featureSampler, RandomSampler dataSampler, int minNumSamples)
{
    // NOTE(review): bestSplit is not visibly reset here between calls — presumably
    // InitThreads() does that; confirm, otherwise a stale best from a previous node
    // could leak into this search.
    InitThreads();

    // Dispatch one sampled feature per idle worker. Each iteration: wait for any
    // worker to finish, harvest its result, then re-arm it with feature i.
    for (int i = 0; i < featureSampler.SampleSize; i++)
    {
        // wait for any of the threads to finish
        int iThread = WaitHandle.WaitAny(this.DoneEvents);
        DoneEvents[iThread].Reset();
        FindSplitObj_Thread threadObj = findSplitThreadObjList[iThread];

        // update the bestSplit given the result of the just-finished thread
        // (only the dispatcher thread touches bestSplit, so no lock is needed here)
        if (threadObj.bestSplit > bestSplit)
        {
            threadObj.bestSplit.CopyTo(bestSplit);
        }

        // assign feature i (half-open range [i, i+1)) to the worker
        threadObj.SetData(labelFeatureDataCoded, responses, dataPoints, workIndex, i, i+1, featureSampler, dataSampler, minNumSamples);

        // set the thread into motion
        StartEvents[iThread].Set();
    }

    // All features dispatched: wait for every worker to go idle, then merge the
    // results that were produced after each worker's last harvest above.
    WaitHandle.WaitAll(DoneEvents);
    for (int i = 0; i < this.cThreads; i++)
    {
        FindSplitObj_Thread threadObj = findSplitThreadObjList[i];
        if (threadObj.bestSplit > bestSplit)
        {
            threadObj.bestSplit.CopyTo(bestSplit);
        }
    }
    return bestSplit;
}