Beispiel #1
0
        public void DistributedBuild(Metrics metrics, DataFeatureSampleRate dataFeatureSampleRate,
                          int maxTreeSize, int minNumSamples, int numIter,
                          int cThreads, Random r)
        {
            this.regressionTrees = new RegressionTree[numIter, this.boostTreeLoss.NumTreesPerIteration];

            this.tempSpace = new TempSpace(this.numSamples);

            DistributedBuildBoostTree(metrics, this.boostTreeLoss, dataFeatureSampleRate, maxTreeSize, minNumSamples, numIter, cThreads, r);
            SaveBoostTree();
        }
Beispiel #2
0
        /// <summary>
        /// This method implements the main functionality of stochastic gradient boosting
        /// </summary>
        private void BuildBoostTree(Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
                                    int maxTreeSize, int minNumSamples, int numIter,
                                    int cThreads, Random r)
        {
            float minValidationErr = 100;

            float[] funValueGain = new float[this.numSamples];

            //(1) compute scores produced by the sub-model
            boostTreeLoss.ModelEval(this.subModel, this.labelFeatureDataCoded, this.subModelScore);

            //(2) compute the corresponding function values;
            boostTreeLoss.ModelScoresToFuncValues();

            //(3) compute the metrics of the sub-model
            int m = optIter = 0;
            metrics.ComputeMetrics(boostTreeLoss.ModelScores, m, false);

#if VERBOSE
            Console.WriteLine(metrics.ResultsHeaderStr());
            Console.WriteLine(metrics.ResultsStr(m));
#endif
            //(4) creat samplers to sub-sampl the features and data during node spliting
            RandomSampler featureSampler = new RandomSampler(r);
            RandomSampler dataSampler = new RandomSampler(r);

            //(5) creat the object that does node splitting
#if SINGLE_THREAD
            // single-threaded
             this.findSplit = new FindSplitSync();
#else
            // multi-threaded
            this.findSplit = new FindSplitAsync(cThreads);
#endif //SINGLE_THREAD

            //(6) Iteratively building boosted trees
            for (m = 0; m < numIter; m++)
            {
                // selecting a fraction of data groups for each iteration
                float sampleRate = dataFeatureSampleRate.SampleDataGroupRate(m);
                DataSet workDataSet = this.labelFeatureDataCoded.DataGroups.GetDataPartition(DataPartitionType.Train, sampleRate, r);
                workDataSet.Sort();  // sorting gains some noticable speedup.

                // compute the pseudo response of the current system
                boostTreeLoss.ComputePseudoResponse(workDataSet);

                //set the data and feature sampling rate for node spliting in this iteration
                featureSampler.SampleRate = dataFeatureSampleRate.SampleFeatureRate(m);
                dataSampler.SampleRate = dataFeatureSampleRate.SampleDataRate(m);

                // fit a residual model (regression trees) from the pesuso response
                // to compensate the error of the current system
                for (int k = 0; k < boostTreeLoss.NumTreesPerIteration; k++)
                {
                    //only use the important data points if necessary
                    int[] trimIndex = boostTreeLoss.TrimIndex(workDataSet, k, m);

                    //build a regression tree according to the pseduo-response
                    this.regressionTrees[m, k] = new RegressionTree(this.labelFeatureDataCoded, boostTreeLoss, k, trimIndex,
                                                                    dataSampler, featureSampler, maxTreeSize, minNumSamples, this.findSplit, this.tempSpace);

                    //compute the function value of all data points produced by the newly generated regression tree
                    this.regressionTrees[m, k].PredictFunValue(this.labelFeatureDataCoded, ref funValueGain);

                    //try to do a more global optimalization - refine the leaf node response of a decision tree
                    //by looking at all the training data points, instead of only the ones falling into the regaion.
                    //Here we are estimate and apply a global mutiplication factor for all leaf nodes
                    float adjFactor = (m>0) ? boostTreeLoss.ComputeResponseAdjust(funValueGain) : 1.0F;

                    //apply the multiplication factor to the leaf nodes of the newly generated regression tree
                    this.regressionTrees[m, k].AdjustResponse(adjFactor);

                    //update the function value for all data points given the new regression tree
                    boostTreeLoss.AccFuncValueGain(funValueGain, adjFactor, k);
                }

                //compute the metrics of the current system
                boostTreeLoss.FuncValuesToModelScores();
                metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, false);
#if VERBOSE
                Console.WriteLine(metrics.ResultsStr(m+1));
#endif
                //keep track of the best (minimal Error) iteration on the Validation data set
                this.optIter = metrics.GetBest(DataPartitionType.Validation, ref minValidationErr);

                if ((m+1) % 5 == 0)  // save the tree every 5 iterations
                    SaveBoostTree();
            }

            if (this.findSplit != null)
            {
                this.findSplit.Cleanup();
            }
        }
Beispiel #3
0
        public void AddWeakLearner(RegressionTree[] candidateTree, float[] funValueGain, int m, Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate, int maxTreeSize, int minNumSamples, int cThreads, Random r)
        {
            //update the function value for all data points given the new regression tree
            for (int i = 0; i < boostTreeLoss.NumTreesPerIteration; i++)
            {
                candidateTree[i].PredictFunValue(this.labelFeatureDataCoded, true, ref funValueGain);

                this.regressionTrees[m, i] = candidateTree[i];
                boostTreeLoss.AccFuncValueGain(funValueGain, candidateTree[i].AdjustFactor, i);
            }
        }
Beispiel #4
0
        public RegressionTree[] GetNextWeakLearner(int m, float[] funValueGain, Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate, RandomSampler dataSampler, RandomSampler featureSampler,
                                    int maxTreeSize, int minNumSamples, int cThreads, Random r)
        {
            // select a fraction of data groups for this iteration
            float sampleRate = dataFeatureSampleRate.SampleDataGroupRate(m);
            DataSet workDataSet = this.labelFeatureDataCoded.DataGroups.GetDataPartition(DataPartitionType.Train, sampleRate, r);
            workDataSet.Sort();  // sorting gains some noticable speedup.

            // compute the pseudo response of the current system
            boostTreeLoss.ComputePseudoResponse(workDataSet);

            //set the data and feature sampling rate for node spliting in this iteration
            featureSampler.SampleRate = dataFeatureSampleRate.SampleFeatureRate(m);
            dataSampler.SampleRate = dataFeatureSampleRate.SampleDataRate(m);

            // fit a residual model (regression trees) from the pseudo response
            // to compensate the error of the current system

            RegressionTree[] newTree = new RegressionTree[boostTreeLoss.NumTreesPerIteration];

            for (int k = 0; k < boostTreeLoss.NumTreesPerIteration; k++)
            {
                //only use the important data points if necessary
                int[] trimIndex = boostTreeLoss.TrimIndex(workDataSet, k, m);

                //build a regression tree according to the pseduo-response
                newTree[k] = new RegressionTree(this.labelFeatureDataCoded, boostTreeLoss, k, trimIndex,
                                                                dataSampler, featureSampler, maxTreeSize, minNumSamples, this.findSplit, this.tempSpace);

                //compute the function value of all data points produced by the newly generated regression tree
                newTree[k].PredictFunValue(this.labelFeatureDataCoded, ref funValueGain);

                //try to do a more global optimalization - refine the leaf node response of a decision tree
                //by looking at all the training data points, instead of only the ones falling into the regaion.
                //Here we are estimate and apply a global mutiplication factor for all leaf nodes
                float adjFactor = (m > 0) ? boostTreeLoss.ComputeResponseAdjust(funValueGain) : 1.0F;

                //apply the multiplication factor to the leaf nodes of the newly generated regression tree
                newTree[k].AdjustResponse(adjFactor);
                newTree[k].AdjustFactor = adjFactor;
            }

            //return the k regression trees
            return newTree;
        }
Beispiel #5
0
        /// <summary>
        /// This method implements the main functionality of stochastic gradient boosting, for distributed computing
        /// </summary>
        private void DistributedBuildBoostTree(Metrics metrics, BoostTreeLoss boostTreeLoss, DataFeatureSampleRate dataFeatureSampleRate,
                                    int maxTreeSize, int minNumSamples, int numIter,
                                    int cThreads, Random r)
        {
            float minValidationErr = 100;

            float[] funValueGain = new float[this.numSamples];

            //(1) compute scores produced by the sub-model
            boostTreeLoss.ModelEval(this.subModel, this.labelFeatureDataCoded, this.subModelScore);

            //(2) compute the corresponding function values;
            boostTreeLoss.ModelScoresToFuncValues();

            //(3) compute the metrics of the sub-model
            int m = optIter = 0;
            metrics.ComputeMetrics(boostTreeLoss.ModelScores, m, false);
#if VERBOSE
            Console.WriteLine(metrics.ResultsHeaderStr());
            Console.WriteLine(metrics.ResultsStr(m));
#endif
            //(4) creat samplers to sub-sampl the features and data during node spliting
            RandomSampler featureSampler = new RandomSampler(r);
            RandomSampler dataSampler = new RandomSampler(r);

            //(5) creat the object that does node splitting
#if SINGLE_THREAD
            // single-threaded
             this.findSplit = new FindSplitSync();
#else
            // multi-threaded
            this.findSplit = new FindSplitAsync(cThreads);
#endif //SINGLE_THREAD

            //(6) Iteratively building boosted trees
            for (m = 0; m < numIter; m++)
            {
                //returns array of regression trees (one per class k) for this iteration
                RegressionTree[] candidateTree = GetNextWeakLearner(m, funValueGain, metrics,boostTreeLoss,dataFeatureSampleRate, dataSampler, featureSampler, maxTreeSize,minNumSamples,cThreads,r);

                AddWeakLearner(candidateTree, funValueGain, m, metrics, boostTreeLoss, dataFeatureSampleRate, maxTreeSize, minNumSamples, cThreads, r);

                //compute the metrics of the current system
                boostTreeLoss.FuncValuesToModelScores();
                metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, false);
#if VERBOSE
                Console.WriteLine(metrics.ResultsStr(m + 1));
#endif
                //keep track of the best (minimal Error) iteration on the Validation data set
                this.optIter = metrics.GetBest(DataPartitionType.Validation, ref minValidationErr);

                if ((m + 1) % 5 == 0)  // save the tree every 5 iterations
                    SaveBoostTree();
            }

            if (this.findSplit != null)
            {
                this.findSplit.Cleanup();
            }
        }
Beispiel #6
0
        /// <summary>
        /// Main Program 
        /// </summary>
        /// <param name="args">
        /// Should contain five parameters in the order of 
        /// file name for training source data (DataProcess class),
        /// file name for saving the tree
        /// tree size, i.e., maximum number of terminal nodes, usually 16 - 20
        /// learning rate, usually 0.02 - 0.06
        /// number of iterations, usallay >500(can be pruned later)
        /// </param>
        public static void Main(string[] args)
        {
            try
            {
                TrainArgs cmd = new TrainArgs(args);

                Random r = null;
                if (cmd.seed == -1)
                {
                    r = new Random();
                }
                else
                {
                    r = new Random(cmd.seed);
                }

                string[] activeFeatureNames = null;
                //read and process only a subset of activated features as specified in the activeFeatureFile
                if (cmd.activeFeatureFile != null)
                {
                    activeFeatureNames = TsvFileLoader.ReadFeatureNames(cmd.activeFeatureFile);
                }

                //feature parser: special module that understand MSN style value encoding
                MsnFeatureParser featureParser = new MsnFeatureParser(activeFeatureNames);

                //the column name for label:
                string[] labelName = { cmd.labelName };
                //label/rating parser:
                IParser<float> RateParser = new MsnLabelParser(labelName, cmd.labelNameValueFile);

                //data boundary: no boundary
                OnelineGroup noBounadry = new OnelineGroup();

                //Load coded data if exist
                LabelFeatureDataCoded trainLabelFeatureDataCoded = (CLabelFeatureDataCoded)CLabelFeatureData.Load(cmd.trainFile, featureParser, RateParser, noBounadry, typeof(CLabelFeatureDataCoded), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);
                LabelFeatureData validLabelFeatureData = (CLabelFeatureData)CLabelFeatureData.Load(cmd.validFile, featureParser, RateParser, noBounadry, typeof(CLabelFeatureData), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);
                LabelFeatureData testLabelFeatureData = (CLabelFeatureData)CLabelFeatureData.Load(cmd.testFile, featureParser, RateParser, noBounadry, typeof(CLabelFeatureData), activeFeatureNames, cmd.cThreads, cmd.SparseCoded);

                //build composite data - an aggregated data object that keeps tract of train/valid/test data
                CLabelFeatureDataCodedComposite labelFeatureDataCoded = CLabelFeatureDataCodedComposite.Create(trainLabelFeatureDataCoded, validLabelFeatureData, testLabelFeatureData);

                //initial submodel to boost on
                Model subModel = null;
                if (cmd.subModel != null && cmd.subModel.Length > 0)
                {
                    if (cmd.cLayer > 0)
                    {
                        string[] layerNames = new string[cmd.cLayer];
                        for (int i = 0; i < cmd.cLayer; i++)
                        {
                            string num = "";
                            if (cmd.cLayer > 1)
                            {
                                num = (i + 1).ToString();
                            }
                            layerNames[i] = cmd.subModel + "layer" + num + ".txt";
                        }
                        subModel = new NNModel(layerNames);
                    }
                    else
                    {
                        string iniName = cmd.subModel + ".ini";
                        subModel = NNModelMSN.Create(iniName);
                    }
                }

                if (subModel != null && !subModel.SetFeatureNames(labelFeatureDataCoded.FeatureNames))
                {
                    Console.WriteLine("Fail to initialize specified submodel - training with empty submodel");
                    subModel = null;
                }

                LabelConverter labelConvert = new LabelConverterNull();

                LabelFeatureData subModelScore = null;
                McBoostTreeLoss boostTreeLoss = new McBoostTreeLoss(labelFeatureDataCoded, labelConvert, cmd.learnRate);
                BoostTree boostTree = new BoostTree(labelFeatureDataCoded, subModelScore, subModel, boostTreeLoss,
                                                cmd.binaryTreeFile, cmd.textTreeFile);

                //set up the error metrics that we like to keep tract of during testing
                DataPartitionType[] dataTypes = { DataPartitionType.Train, DataPartitionType.Validation, DataPartitionType.Test };

                //dp.LabelFeatureData is the data we are evaluating
                Metrics metrics = new ClassError(labelFeatureDataCoded, labelConvert, dataTypes);

                DataFeatureSampleRate dataFeatureSampleRate = new DataFeatureSampleRate(cmd.sampleFeatureRate, cmd.sampleDataRate, cmd.sampleDataGroupRate);

                boostTree.Build(metrics, dataFeatureSampleRate, cmd.cLeafNodes, cmd.minNumSamples, cmd.numIter, cmd.cThreads, r);

                metrics.SaveAllResults("TrainingErrHistory.txt");

            }
            catch (Exception exc)
            {
                Console.WriteLine(exc.Message);
            }
        }