Example #1
0
        /// <summary>
        /// Creates a boosted-tree trainer over coded training data.
        /// The data are preprocessed (quantized) to avoid sorting.
        /// </summary>
        /// <param name="labelFeatureDataCoded">Coded labels/features; also supplies the feature names. Must not be null.</param>
        /// <param name="subModelScore">Stored as-is together with the coded data before <c>UnpackData</c> is called.</param>
        /// <param name="subModel">Sub-model stored for later evaluation.</param>
        /// <param name="boostTreeLoss">Loss object stored on the instance.</param>
        /// <param name="saveTreeBinFile">Path stored as the binary save target.</param>
        /// <param name="saveTreeTextFile">Path stored as the text save target; the XML path is this value with ".xml" appended.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="labelFeatureDataCoded"/> is null.</exception>
        public BoostTree(LabelFeatureDataCoded labelFeatureDataCoded, LabelFeatureData subModelScore,
                        Model subModel, BoostTreeLoss boostTreeLoss,
                        string saveTreeBinFile, string saveTreeTextFile)
        {
            // The coded data is dereferenced below (FeatureNames), so a null argument could never
            // work; fail up front with a descriptive exception instead of a NullReferenceException
            // raised after the instance has been partially initialised.
            if (labelFeatureDataCoded == null)
            {
                throw new System.ArgumentNullException("labelFeatureDataCoded");
            }

            this.labelFeatureDataCoded = labelFeatureDataCoded;
            this.subModelScore = subModelScore;

            // Called after the two data fields are set and before the remaining fields are assigned;
            // that ordering is preserved from the original implementation.
            UnpackData();

            this.subModel = subModel;
            this.boostTreeLoss = boostTreeLoss;
            this.featureNames = labelFeatureDataCoded.FeatureNames;

            this.saveTreeBinFile = saveTreeBinFile;
            this.saveTreeTextFile = saveTreeTextFile;
            this.saveTreeXmlFile = saveTreeTextFile + ".xml";
        }
Example #2
0
 /// <summary>
 /// Convenience overload: forwards to the five-argument <c>Predict</c> with no metrics
 /// object (<c>null</c>) and with the silent flag turned off.
 /// </summary>
 /// <param name="testData">Data to score.</param>
 /// <param name="numIter">Number of boosting iterations to apply.</param>
 /// <param name="boostTreeLoss">Loss object forwarded unchanged.</param>
 public void Predict(LabelFeatureData testData, int numIter, BoostTreeLoss boostTreeLoss)
 {
     const bool silent = false;

     Predict(testData, numIter, boostTreeLoss, null, silent);
 }
Example #3
0
        /// <summary>
        /// Evaluates the sub-model on <paramref name="labelFeatureData"/> and then accumulates the
        /// function values of the regression trees of the first <paramref name="numIter"/> iterations
        /// into <paramref name="boostTreeLoss"/>.
        /// </summary>
        /// <param name="labelFeatureData">Data to score.</param>
        /// <param name="numIter">Requested number of iterations; capped at <c>TotalIter</c>.</param>
        /// <param name="boostTreeLoss">Loss object that holds function values and model scores; it is reset first.</param>
        /// <param name="metrics">When non-null, metrics are computed after every iteration and the final scores are saved to "DataScores.txt".</param>
        /// <param name="silent">When true, only the last iteration's results line is printed.</param>
        public void Predict(LabelFeatureData labelFeatureData, int numIter,
                            BoostTreeLoss boostTreeLoss,
                            Metrics metrics,
                            bool silent)
        {
            // Never run more iterations than the model holds.
            if (this.TotalIter < numIter)
            {
                numIter = this.TotalIter;
            }

            boostTreeLoss.Reset(labelFeatureData.NumDataPoint);

            // Step 1: evaluate the sub-model on the data.
            boostTreeLoss.ModelEval(this.subModel, labelFeatureData, null);

            // Step 2: turn the sub-model scores into function values.
            boostTreeLoss.ModelScoresToFuncValues();

            bool reportMetrics = metrics != null;
            if (reportMetrics)
            {
                // Iteration id 0 corresponds to the sub-model alone.
                metrics.ComputeMetrics(boostTreeLoss.ModelScores, 0, this.optIter == 0);
#if VERBOSE
                Console.WriteLine(metrics.ResultsHeaderStr());
                Console.WriteLine(metrics.ResultsStr(0));
#endif
            }

            // Step 3: add the contribution of every boosted regression tree.
            int pointCount = labelFeatureData.NumDataPoint;
            float[] treeGain = new float[pointCount];

#if GET_PER_DOC_PER_ITER_SCORES
            // Row 0 = group id, row 1 = label, rows 2.. = one row per iteration (transposed when printed).
            float[][] perIterScores = ArrayUtils.FloatMatrix(numIter+2, labelFeatureData.NumDataPoint);
            for (int doc = 0; doc < labelFeatureData.NumDataPoint; ++doc)
            {
                perIterScores[0][doc] = labelFeatureData.GetGroupId(doc);
                perIterScores[1][doc] = labelFeatureData.GetLabel(doc);
            }
#endif

            for (int iter = 0; iter < numIter; iter++)
            {
                for (int treeIdx = 0; treeIdx < boostTreeLoss.NumTreesPerIteration; treeIdx++)
                {
                    // The check looks at slot 0 of this iteration (not slot treeIdx): if the
                    // iteration has no first tree, none of its trees are applied.
                    if (this.regressionTrees[iter, 0] == null)
                    {
                        break;
                    }
#if GET_PER_DOC_PER_ITER_SCORES
                    this.regressionTrees[iter, treeIdx].PredictFunValueNKeepScores(labelFeatureData, this.Train2TestIdx, treeGain, perIterScores[iter+2]);
#else
                    this.regressionTrees[iter, treeIdx].PredictFunValue(labelFeatureData, this.Train2TestIdx, treeGain);
#endif
                    boostTreeLoss.AccFuncValueGain(treeGain, 1.0f, treeIdx);
                }

                if (!reportMetrics)
                {
                    continue;
                }

                // Metrics of the system after this iteration (ids are 1-based; 0 is the sub-model).
                int iterId = iter + 1;
                boostTreeLoss.FuncValuesToModelScores();
                metrics.ComputeMetrics(boostTreeLoss.ModelScores, iterId, this.optIter == iterId);

                bool isLastIteration = iter == numIter - 1;
                if (isLastIteration || !silent)
                {
                    Console.WriteLine(metrics.ResultsStr(iterId));
                }
            }

#if GET_PER_DOC_PER_ITER_SCORES
            using (StreamWriter writer = new StreamWriter("allScores.tsv"))
            {
                // Header line (no trailing tab).
                writer.Write("m:QueryID\tm:Rating");
                for (int col = 1; col < numIter+1; ++col)
                    writer.Write("\tFtr_" + col.ToString("0000"));
                writer.WriteLine();
                for (int doc = 0; doc < labelFeatureData.NumDataPoint; ++doc)
                {
                    // Query id and label first, then one score per iteration.
                    writer.Write("{0}\t{1}", perIterScores[0][doc], perIterScores[1][doc]);
                    for (int row = 2; row < numIter + 2; ++row)
                        writer.Write("\t{0:G6}", perIterScores[row][doc]);
                    writer.WriteLine();
                }
            }
#endif

            if (reportMetrics)
            {
                metrics.SaveScores("DataScores.txt", boostTreeLoss.ModelScores);
            }
            else
            {
                // Without metrics the scores were never refreshed inside the loop; do it once here.
                boostTreeLoss.FuncValuesToModelScores();
            }
        }
Example #4
0
        /// <summary>
        /// Scores a candidate set of trees without committing them: for each tree the current model
        /// scores plus the tree's prediction (scaled by its <c>AdjustFactor</c>) are computed, metrics
        /// are evaluated under <paramref name="id"/>, and one training-partition metric is returned.
        /// </summary>
        /// <param name="candidateTree">One candidate tree per tree slot of an iteration.</param>
        /// <param name="funValueGain">Scratch buffer passed by reference to each tree's prediction call.</param>
        /// <param name="metrics">Metrics object used to evaluate the candidate scores.</param>
        /// <param name="boostTreeLoss">Supplies the tree count per iteration and the current model scores.</param>
        /// <param name="id">Identifier under which the metrics are computed and looked up.</param>
        /// <returns>Entry [Train][2] of the metrics result matrix for <paramref name="id"/>.</returns>
        public double EvaluateWeakLearner(RegressionTree[] candidateTree, float[] funValueGain, Metrics metrics, BoostTreeLoss boostTreeLoss, int id)
        {
            // One score row per tree, sized by the incoming buffer. Freshly allocated float
            // arrays are already zero-filled, so no explicit clearing is needed.
            float[][] candidateScores = new float[boostTreeLoss.NumTreesPerIteration][];
            int rowLength = funValueGain.Length;
            for (int row = 0; row < boostTreeLoss.NumTreesPerIteration; row++)
            {
                candidateScores[row] = new float[rowLength];
            }

            for (int treeIdx = 0; treeIdx < boostTreeLoss.NumTreesPerIteration; treeIdx++)
            {
                RegressionTree tree = candidateTree[treeIdx];
                tree.PredictFunValue(this.labelFeatureDataCoded, true, ref funValueGain);

                // kms: hard-coded for k=0 (no classification); a bit of a hack that really only
                // works for non-classification losses. Should become a per-loss evaluation.
                for (int p = 0; p < funValueGain.Length; p++)
                {
                    candidateScores[treeIdx][p] = boostTreeLoss.ModelScores[treeIdx][p] + (funValueGain[p] * tree.AdjustFactor);
                }
            }

            // The id must be unique per evaluation; callers currently use M + m + 1.
            metrics.ComputeMetrics(candidateScores, id, false);

            // Column 2 is assumed to be NDCGPairwise (NDCGPairwiseType = 2).
            return metrics.ResultsStrMatrix(id)[(int)DataPartitionType.Train][2];
        }
Example #5
0
        /// <summary>
        /// This method implements the main functionality of stochastic gradient boosting.
        /// </summary>
        /// <remarks>
        /// Evaluates the sub-model, then for each of <paramref name="numIter"/> iterations samples the
        /// training data, computes the pseudo response, fits one regression tree per tree slot,
        /// accumulates the tree outputs, recomputes metrics and tracks the best validation iteration
        /// in <c>optIter</c>. The model is saved every 5 iterations. The split finder is cleaned up
        /// even when an iteration throws.
        /// </remarks>
        /// <param name="metrics">Metrics object updated after the sub-model and after every iteration.</param>
        /// <param name="boostTreeLoss">Loss object holding function values, scores and pseudo responses.</param>
        /// <param name="dataFeatureSampleRate">Per-iteration sampling rates for data groups, data points and features.</param>
        /// <param name="maxTreeSize">Forwarded to the <c>RegressionTree</c> constructor.</param>
        /// <param name="minNumSamples">Forwarded to the <c>RegressionTree</c> constructor.</param>
        /// <param name="numIter">Number of boosting iterations to run.</param>
        /// <param name="cThreads">Thread count handed to the multi-threaded split finder.</param>
        /// <param name="r">Random source shared by the samplers and the data partitioning.</param>
        private void BuildBoostTree(Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
                                    int maxTreeSize, int minNumSamples, int numIter,
                                    int cThreads, Random r)
        {
            float minValidationErr = 100;

            float[] funValueGain = new float[this.numSamples];

            //(1) compute scores produced by the sub-model
            boostTreeLoss.ModelEval(this.subModel, this.labelFeatureDataCoded, this.subModelScore);

            //(2) compute the corresponding function values;
            boostTreeLoss.ModelScoresToFuncValues();

            //(3) compute the metrics of the sub-model
            int m = optIter = 0;
            metrics.ComputeMetrics(boostTreeLoss.ModelScores, m, false);

#if VERBOSE
            Console.WriteLine(metrics.ResultsHeaderStr());
            Console.WriteLine(metrics.ResultsStr(m));
#endif
            //(4) create samplers to sub-sample the features and data during node splitting
            RandomSampler featureSampler = new RandomSampler(r);
            RandomSampler dataSampler = new RandomSampler(r);

            //(5) create the object that does node splitting
#if SINGLE_THREAD
            // single-threaded
             this.findSplit = new FindSplitSync();
#else
            // multi-threaded
            this.findSplit = new FindSplitAsync(cThreads);
#endif //SINGLE_THREAD

            // The split finder must be cleaned up on every exit path; without the finally an
            // exception in any iteration would skip Cleanup() (presumably leaving the worker
            // threads of the multi-threaded finder alive).
            try
            {
                //(6) Iteratively building boosted trees
                for (m = 0; m < numIter; m++)
                {
                    // selecting a fraction of data groups for each iteration
                    float sampleRate = dataFeatureSampleRate.SampleDataGroupRate(m);
                    DataSet workDataSet = this.labelFeatureDataCoded.DataGroups.GetDataPartition(DataPartitionType.Train, sampleRate, r);
                    workDataSet.Sort();  // sorting gains some noticeable speedup.

                    // compute the pseudo response of the current system
                    boostTreeLoss.ComputePseudoResponse(workDataSet);

                    //set the data and feature sampling rate for node splitting in this iteration
                    featureSampler.SampleRate = dataFeatureSampleRate.SampleFeatureRate(m);
                    dataSampler.SampleRate = dataFeatureSampleRate.SampleDataRate(m);

                    // fit a residual model (regression trees) from the pseudo response
                    // to compensate the error of the current system
                    for (int k = 0; k < boostTreeLoss.NumTreesPerIteration; k++)
                    {
                        //only use the important data points if necessary
                        int[] trimIndex = boostTreeLoss.TrimIndex(workDataSet, k, m);

                        //build a regression tree according to the pseudo-response
                        this.regressionTrees[m, k] = new RegressionTree(this.labelFeatureDataCoded, boostTreeLoss, k, trimIndex,
                                                                        dataSampler, featureSampler, maxTreeSize, minNumSamples, this.findSplit, this.tempSpace);

                        //compute the function value of all data points produced by the newly generated regression tree
                        this.regressionTrees[m, k].PredictFunValue(this.labelFeatureDataCoded, ref funValueGain);

                        //try to do a more global optimization - refine the leaf node response of a decision tree
                        //by looking at all the training data points, instead of only the ones falling into the region.
                        //Here we estimate and apply a global multiplication factor for all leaf nodes
                        //(the very first iteration always uses a factor of 1).
                        float adjFactor = (m>0) ? boostTreeLoss.ComputeResponseAdjust(funValueGain) : 1.0F;

                        //apply the multiplication factor to the leaf nodes of the newly generated regression tree
                        this.regressionTrees[m, k].AdjustResponse(adjFactor);

                        //update the function value for all data points given the new regression tree
                        boostTreeLoss.AccFuncValueGain(funValueGain, adjFactor, k);
                    }

                    //compute the metrics of the current system
                    boostTreeLoss.FuncValuesToModelScores();
                    metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, false);
#if VERBOSE
                    Console.WriteLine(metrics.ResultsStr(m+1));
#endif
                    //keep track of the best (minimal Error) iteration on the Validation data set
                    this.optIter = metrics.GetBest(DataPartitionType.Validation, ref minValidationErr);

                    if ((m+1) % 5 == 0)  // save the tree every 5 iterations
                        SaveBoostTree();
                }
            }
            finally
            {
                if (this.findSplit != null)
                {
                    this.findSplit.Cleanup();
                }
            }
        }
Example #6
0
        /// <summary>
        /// Commits the candidate trees as iteration <paramref name="m"/> of the model: each tree is
        /// stored in <c>regressionTrees[m, slot]</c> and its prediction, scaled by the tree's
        /// <c>AdjustFactor</c>, is accumulated into <paramref name="boostTreeLoss"/>.
        /// </summary>
        /// <param name="candidateTree">One tree per tree slot of the iteration.</param>
        /// <param name="funValueGain">Scratch buffer handed by reference to each tree's prediction call.</param>
        /// <param name="m">Iteration index the trees are stored under.</param>
        /// <param name="metrics">Not used by this method.</param>
        /// <param name="boostTreeLoss">Supplies the tree count and receives the accumulated gains.</param>
        /// <param name="dataFeatureSampleRate">Not used by this method.</param>
        /// <param name="maxTreeSize">Not used by this method.</param>
        /// <param name="minNumSamples">Not used by this method.</param>
        /// <param name="cThreads">Not used by this method.</param>
        /// <param name="r">Not used by this method.</param>
        public void AddWeakLearner(RegressionTree[] candidateTree, float[] funValueGain, int m, Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate, int maxTreeSize, int minNumSamples, int cThreads, Random r)
        {
            for (int slot = 0; slot < boostTreeLoss.NumTreesPerIteration; slot++)
            {
                RegressionTree accepted = candidateTree[slot];

                // Recompute the tree's output for all data points into the scratch buffer.
                accepted.PredictFunValue(this.labelFeatureDataCoded, true, ref funValueGain);

                // Record the tree, then fold its (scaled) output into the running function values.
                this.regressionTrees[m, slot] = accepted;
                boostTreeLoss.AccFuncValueGain(funValueGain, accepted.AdjustFactor, slot);
            }
        }
Example #7
0
        /// <summary>
        /// Builds the candidate regression trees for iteration <paramref name="m"/> without adding
        /// them to the model: samples the training data, computes the pseudo response, fits one tree
        /// per tree slot and stores each tree's response adjustment factor on the tree.
        /// </summary>
        /// <param name="m">Iteration index; selects the sampling rates, and iteration 0 always uses factor 1.</param>
        /// <param name="funValueGain">Scratch buffer handed by reference to each tree's prediction call.</param>
        /// <param name="metrics">Not used by this method.</param>
        /// <param name="boostTreeLoss">Loss object providing pseudo responses, trim indices and the adjustment factor.</param>
        /// <param name="dataFeatureSampleRate">Per-iteration sampling rates.</param>
        /// <param name="dataSampler">Sampler whose rate is set to the data rate of this iteration.</param>
        /// <param name="featureSampler">Sampler whose rate is set to the feature rate of this iteration.</param>
        /// <param name="maxTreeSize">Forwarded to the <c>RegressionTree</c> constructor.</param>
        /// <param name="minNumSamples">Forwarded to the <c>RegressionTree</c> constructor.</param>
        /// <param name="cThreads">Not used by this method.</param>
        /// <param name="r">Random source used for the data partition.</param>
        /// <returns>The newly built trees, one per tree slot.</returns>
        public RegressionTree[] GetNextWeakLearner(int m, float[] funValueGain, Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate, RandomSampler dataSampler, RandomSampler featureSampler,
                                    int maxTreeSize, int minNumSamples, int cThreads, Random r)
        {
            // Pick this iteration's fraction of training data groups; sorting it gives a noticeable speedup.
            float groupRate = dataFeatureSampleRate.SampleDataGroupRate(m);
            DataSet iterationData = this.labelFeatureDataCoded.DataGroups.GetDataPartition(DataPartitionType.Train, groupRate, r);
            iterationData.Sort();

            // Pseudo response of the current system on the sampled data.
            boostTreeLoss.ComputePseudoResponse(iterationData);

            // Sampling rates used while splitting nodes in this iteration.
            featureSampler.SampleRate = dataFeatureSampleRate.SampleFeatureRate(m);
            dataSampler.SampleRate = dataFeatureSampleRate.SampleDataRate(m);

            // Fit one residual tree per slot against the pseudo response.
            RegressionTree[] learners = new RegressionTree[boostTreeLoss.NumTreesPerIteration];

            for (int slot = 0; slot < boostTreeLoss.NumTreesPerIteration; slot++)
            {
                // Restrict to the important data points when the loss asks for it.
                int[] trimIndex = boostTreeLoss.TrimIndex(iterationData, slot, m);

                RegressionTree tree = new RegressionTree(this.labelFeatureDataCoded, boostTreeLoss, slot, trimIndex,
                                                         dataSampler, featureSampler, maxTreeSize, minNumSamples, this.findSplit, this.tempSpace);
                learners[slot] = tree;

                // Output of the new tree on all data points.
                tree.PredictFunValue(this.labelFeatureDataCoded, ref funValueGain);

                // Global refinement: one multiplicative factor for all leaves, estimated from all
                // training points rather than only those in each region. Iteration 0 keeps factor 1.
                float scale = 1.0F;
                if (m > 0)
                {
                    scale = boostTreeLoss.ComputeResponseAdjust(funValueGain);
                }

                // Apply the factor to the leaves and remember it on the tree.
                tree.AdjustResponse(scale);
                tree.AdjustFactor = scale;
            }

            return learners;
        }
Example #8
0
        /// <summary>
        /// This method implements the main functionality of stochastic gradient boosting, for distributed computing.
        /// </summary>
        /// <remarks>
        /// Evaluates the sub-model, then for each of <paramref name="numIter"/> iterations obtains
        /// candidate trees from <c>GetNextWeakLearner</c>, commits them with <c>AddWeakLearner</c>,
        /// recomputes metrics and tracks the best validation iteration in <c>optIter</c>. The model is
        /// saved every 5 iterations. The split finder is cleaned up even when an iteration throws.
        /// </remarks>
        /// <param name="metrics">Metrics object updated after the sub-model and after every iteration.</param>
        /// <param name="boostTreeLoss">Loss object holding function values and scores.</param>
        /// <param name="dataFeatureSampleRate">Per-iteration sampling rates, forwarded to the helpers.</param>
        /// <param name="maxTreeSize">Forwarded to the helpers.</param>
        /// <param name="minNumSamples">Forwarded to the helpers.</param>
        /// <param name="numIter">Number of boosting iterations to run.</param>
        /// <param name="cThreads">Thread count handed to the multi-threaded split finder.</param>
        /// <param name="r">Random source shared by the samplers and the helpers.</param>
        private void DistributedBuildBoostTree(Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
                                    int maxTreeSize, int minNumSamples, int numIter,
                                    int cThreads, Random r)
        {
            float minValidationErr = 100;

            float[] funValueGain = new float[this.numSamples];

            //(1) compute scores produced by the sub-model
            boostTreeLoss.ModelEval(this.subModel, this.labelFeatureDataCoded, this.subModelScore);

            //(2) compute the corresponding function values;
            boostTreeLoss.ModelScoresToFuncValues();

            //(3) compute the metrics of the sub-model
            int m = optIter = 0;
            metrics.ComputeMetrics(boostTreeLoss.ModelScores, m, false);
#if VERBOSE
            Console.WriteLine(metrics.ResultsHeaderStr());
            Console.WriteLine(metrics.ResultsStr(m));
#endif
            //(4) create samplers to sub-sample the features and data during node splitting
            RandomSampler featureSampler = new RandomSampler(r);
            RandomSampler dataSampler = new RandomSampler(r);

            //(5) create the object that does node splitting
#if SINGLE_THREAD
            // single-threaded
             this.findSplit = new FindSplitSync();
#else
            // multi-threaded
            this.findSplit = new FindSplitAsync(cThreads);
#endif //SINGLE_THREAD

            // The split finder must be cleaned up on every exit path; without the finally an
            // exception in any iteration would skip Cleanup() (presumably leaving the worker
            // threads of the multi-threaded finder alive).
            try
            {
                //(6) Iteratively building boosted trees
                for (m = 0; m < numIter; m++)
                {
                    //returns array of regression trees (one per class k) for this iteration
                    RegressionTree[] candidateTree = GetNextWeakLearner(m, funValueGain, metrics,boostTreeLoss,dataFeatureSampleRate, dataSampler, featureSampler, maxTreeSize,minNumSamples,cThreads,r);

                    //store the candidates as iteration m and accumulate their function values
                    AddWeakLearner(candidateTree, funValueGain, m, metrics, boostTreeLoss, dataFeatureSampleRate, maxTreeSize, minNumSamples, cThreads, r);

                    //compute the metrics of the current system
                    boostTreeLoss.FuncValuesToModelScores();
                    metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, false);
#if VERBOSE
                    Console.WriteLine(metrics.ResultsStr(m + 1));
#endif
                    //keep track of the best (minimal Error) iteration on the Validation data set
                    this.optIter = metrics.GetBest(DataPartitionType.Validation, ref minValidationErr);

                    if ((m + 1) % 5 == 0)  // save the tree every 5 iterations
                        SaveBoostTree();
                }
            }
            finally
            {
                if (this.findSplit != null)
                {
                    this.findSplit.Cleanup();
                }
            }
        }
Example #9
0
        /// <summary>
        /// Grows this regression tree greedily: starting from a single root that holds all working
        /// data points, each round finds the best split for every eligible terminal node, splits the
        /// node with the largest gain into two new terminal nodes, and finally assigns a response
        /// value to every terminal node.
        /// </summary>
        /// <param name="boostTreeLoss">Supplies the pseudo response for tree <paramref name="iTree"/> and the per-region response values.</param>
        /// <param name="iTree">Index of the tree within the iteration, passed through to <paramref name="boostTreeLoss"/>.</param>
        /// <param name="findSplit">Strategy object used to search for the best split of a node.</param>
        /// <param name="featureSampler">Sampler shuffled over the feature count before each split search.</param>
        /// <param name="dataSampler">Sampler shuffled over the node's data-point count before each split search.</param>
        private void BuildRegressionTree(BoostTreeLoss boostTreeLoss, int iTree, IFindSplit findSplit, RandomSampler featureSampler, RandomSampler dataSampler)
        {
            this.responses = boostTreeLoss.PseudoResponse(iTree);

            // The root starts as a terminal node covering every entry of workIndex.
            TreeNode root = new TreeNode();
            root.isTerminal = true;
            root.dataPoints = Vector.IndexArray(this.workIndex.Length);

            // Room for maxTreeSize - 1 splits: the root plus two new nodes per split.
            this.tree = new TreeNode[2 * maxTreeSize - 1];
            this.tree[0] = root;

            // Round i performs at most one split and places the children at slots 2i+1 and 2i+2.
            for (int i = 0; i < maxTreeSize - 1; i++)
            {
                // maxGain starts at -1, the same value written to a node's gain once it has been
                // split (see below), so already-split nodes can never win the strict comparison.
                float maxGain = -1;
                int bestRegion = -1;

                TreeNode leftNode = new TreeNode();
                TreeNode rightNode = new TreeNode();

                //qiangwu: compute the best split for new nodes
                //         We only need to explore the last two nodes because they are and only they are new nodes i.e.
                //         for (int j = 2*i; j >= 0; j--)
                // All nodes created so far (slots 0 .. 2i) are scanned.
                for (int j = 0; j < 2 * i + 1; j++)
                {
                    TreeNode curNode = this.tree[j];

                    //qiangwu: (assert curNode.split<0 && curNode.isTerminal) <==> (2*i-1 <= j <= 2*i)
                    // A split is searched only for terminal nodes that have no split recorded yet and
                    // hold at least minNumSamples points.
                    // NOTE(review): this relies on the initial values of TreeNode.split and
                    // TreeNode.gain, which are not visible here — presumably both negative; confirm.
                    if (curNode.split<0 && curNode.isTerminal && curNode.dataPoints.Length >= this.minNumSamples)
                    {
                        // Reshuffle both samplers before every split search.
                        dataSampler.Shuffle(curNode.dataPoints.Length);
                        featureSampler.Shuffle(this.numFeatures);

                        Split bestSplit = findSplit.FindBestSplit(this.labelFeatureDataCoded, this.responses, curNode.dataPoints, this.workIndex, featureSampler, dataSampler, this.minNumSamples);

                        //qiangwu: the only way (bestSplit.feature < 0), i.e. not splitting, is because this.dataColRange[dim]=1 for all
                        //         dimensions. I.e. the values all of data points in every dimension are the same (or in one bin)
                        if (bestSplit.feature >= 0)
                        {
                            // Cache the split on the node so it is not recomputed in later rounds.
                            curNode.split = bestSplit.feature;
                            curNode.gain = (float)bestSplit.gain;
                            curNode.splitValueCoded = bestSplit.iThresh + 0.2F; // add 0.2 to avoid boundary check or floating point rounding
                            curNode.splitValue = this.labelFeatureDataCoded.ConvertToOrigData(curNode.split, curNode.splitValueCoded);
                            //SplitOneDim(curNode.dataPoints, regionSplitDim, regionSplitPoint, out curNode.leftPoints, out curNode.rightPoints);
                        }
                    }
                    // Track the node with the largest gain seen in this round.
                    if (curNode.gain > maxGain)
                    {
                        maxGain = curNode.gain;
                        bestRegion = j;
                    }
                }

                // No node had a gain above -1: nothing left to split, stop growing.
                if (bestRegion == -1)
                    break;

                TreeNode bestNode = this.tree[bestRegion];

                // Partition the winning node's points using the integer part of the coded threshold.
                SplitOneDim(bestNode.dataPoints, bestNode.split, (int)bestNode.splitValueCoded, out bestNode.leftPoints, out bestNode.rightPoints);

                leftNode.isTerminal = true; leftNode.parent = bestRegion;
                leftNode.dataPoints = bestNode.leftPoints;

                rightNode.isTerminal = true; rightNode.parent = bestRegion;
                rightNode.dataPoints = bestNode.rightPoints;

                this.tree[2 * i + 1] = leftNode; this.tree[2 * i + 2] = rightNode;

                // Credit the gain of this split to the feature it was made on.
                this.featureImportance[bestNode.split] += bestNode.gain;

                // Turn the split node into an internal node and release its point lists.
                bestNode.leftChild = 2 * i + 1;
                bestNode.rightChild = 2 * i + 2;
                bestNode.isTerminal = false;
                bestNode.gain = -1;
                bestNode.dataPoints = null;
                bestNode.leftPoints = null;
                bestNode.rightPoints = null;
                GC.Collect(); // hope for the best.
            }

            //qiangwu: compute the response of newly created region (node)
            // Unused trailing slots of the array are null when growth stopped early.
            for (int i = 0; i < this.tree.Length; i++)
            {
                if (this.tree[i] != null && this.tree[i].isTerminal)
                {
                    // Debug-only check that every leaf holds at least minNumSamples points.
                    Debug.Assert(this.tree[i].dataPoints.Length >= this.minNumSamples, "Regression Tree split has problems");
                    float v = boostTreeLoss.Response(this.tree[i].dataPoints, this.workIndex, iTree);
                    //round the regional value to 5 decimal point
                    //to remove/alleviate the differences due to floating point precision
                    //so that different algorithms produces the same model/results
            #if ROUND
                    this.tree[i].regionValue = (float)Math.Round(v, 5);
            #else
                    this.tree[i].regionValue = v;
            #endif //ROUND
                    // The point lists are no longer needed once the leaf value is known.
                    this.tree[i].dataPoints = null;
                    this.tree[i].leftPoints = null;
                    this.tree[i].rightPoints = null;
                    GC.Collect();
                }
            }
        }
Example #10
0
        /// <summary>
        /// Creates a regression tree and immediately builds it from the pseudo response of
        /// <paramref name="boostTreeLoss"/> for tree slot <paramref name="iTree"/>.
        /// </summary>
        /// <param name="labelFeatureDataCoded">Coded training data; also supplies the feature count.</param>
        /// <param name="boostTreeLoss">Loss object forwarded to <c>BuildRegressionTree</c>.</param>
        /// <param name="iTree">Tree slot index forwarded to <c>BuildRegressionTree</c>.</param>
        /// <param name="workIndex">Index array stored on the tree and used while building it.</param>
        /// <param name="featureSampler">See the remarks on the sampler argument order.</param>
        /// <param name="dataSampler">See the remarks on the sampler argument order.</param>
        /// <param name="maxTreeSize">Stored on the tree; bounds the number of splits.</param>
        /// <param name="minNumSamples">Stored on the tree; minimum number of points for a node to be split.</param>
        /// <param name="findSplit">Split search strategy forwarded to <c>BuildRegressionTree</c>.</param>
        /// <param name="tempSpace">Scratch space handed to <c>InitTempSpace</c>.</param>
        /// <remarks>
        /// NOTE(review): the two sampler parameters are forwarded to <c>BuildRegressionTree</c> in the
        /// opposite order of its (featureSampler, dataSampler) signature. The callers visible in this
        /// code base pass (dataSampler, featureSampler) positionally to this constructor, so the two
        /// swaps cancel out. Do not "fix" one site without the other.
        /// </remarks>
        public RegressionTree(LabelFeatureDataCoded labelFeatureDataCoded, BoostTreeLoss boostTreeLoss, int iTree, int[] workIndex,
                              RandomSampler featureSampler, RandomSampler dataSampler,
                              int maxTreeSize, int minNumSamples,
                              IFindSplit findSplit, TempSpace tempSpace)
        {
            // Size limits for the tree.
            this.maxTreeSize = maxTreeSize;
            this.minNumSamples = minNumSamples;

            // Training data and the subset of it this tree works on.
            this.workIndex = workIndex;
            this.labelFeatureDataCoded = labelFeatureDataCoded;

            // One importance accumulator per feature.
            this.numFeatures = labelFeatureDataCoded.NumFeatures;
            this.featureImportance = new float[this.numFeatures];

            // Distributed setting: the adjustment factor starts neutral.
            this.adjustFactor = 1.0F;

            InitTempSpace(tempSpace);

            // Argument order is intentionally (dataSampler, featureSampler) — see remarks.
            BuildRegressionTree(boostTreeLoss, iTree, findSplit, dataSampler, featureSampler);

            GC.Collect(); // hope for the best!!!
        }