Example #1
        /// <summary>
        /// Converts a text dump of XGBoost trees to an array of GBMTrees.
        /// </summary>
        /// <param name="textTrees">Text dumps of the XGBoost trees, one tree per element</param>
        /// <returns>Array of GBMTrees corresponding to the text dumps</returns>
        public static GBMTree[] FromXGBoostTextTreesToGBMTrees(string[] textTrees)
        {
            var trees = new GBMTree[textTrees.Length];

            for (int i = 0; i < textTrees.Length; i++)
            {
                trees[i] = ConvertXGBoostTextTreeToGBMTree(textTrees[i]);
            }

            return trees;
        }
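
A minimal call-site sketch for the converter above. The enclosing static class name (XGBoostTreeConverter) and the exact layout of the text dumps are assumptions, not confirmed by the snippet; the method itself only requires one text dump per tree.

        // Hedged sketch: the class name and the dump format shown here are assumptions.
        string[] textTrees =
        {
            "0:[f0<0.5] yes=1,no=2,missing=1\n\t1:leaf=0.1\n\t2:leaf=-0.1",
            "0:[f1<1.5] yes=1,no=2,missing=1\n\t1:leaf=0.05\n\t2:leaf=-0.05",
        };

        // One GBMTree per text dump.
        GBMTree[] trees = XGBoostTreeConverter.FromXGBoostTextTreesToGBMTrees(textTrees);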
Example #2
        /// <summary>
        /// A series of regression trees is fitted stage-wise on the residuals of the previous tree.
        /// </summary>
        /// <param name="observations">Training observations</param>
        /// <param name="targets">Training targets</param>
        /// <param name="indices">Indices of the observations to use for training</param>
        /// <returns>RegressionGradientBoostModel</returns>
        public RegressionGradientBoostModel Learn(F64Matrix observations, double[] targets,
                                                  int[] indices)
        {
            Checks.VerifyObservationsAndTargets(observations, targets);
            Checks.VerifyIndices(indices, observations, targets);

            var rows            = observations.RowCount;
            var orderedElements = CreateOrderedElements(observations, rows);

            var inSample = targets.Select(t => false).ToArray();

            indices.ForEach(i => inSample[i] = true);
            var workIndices = indices.ToArray();

            var trees = new GBMTree[m_iterations];

            var initialLoss = m_loss.InitialLoss(targets, inSample);
            var predictions = targets.Select(t => initialLoss).ToArray();
            var residuals   = new double[targets.Length];

            var predictWork = new double[observations.RowCount];

            for (int iteration = 0; iteration < m_iterations; iteration++)
            {
                m_loss.UpdateResiduals(targets, predictions, residuals, inSample);

                var sampleSize = targets.Length;
                if (m_subSampleRatio != 1.0)
                {
                    sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
                    var currentInSample = Sample(sampleSize, workIndices, targets.Length);

                    trees[iteration] = m_learner.Learn(observations, targets, residuals,
                                                       predictions, orderedElements, currentInSample);
                }
                else
                {
                    trees[iteration] = m_learner.Learn(observations, targets, residuals,
                                                       predictions, orderedElements, inSample);
                }

                trees[iteration].Predict(observations, predictWork);
                for (int i = 0; i < predictWork.Length; i++)
                {
                    predictions[i] += m_learningRate * predictWork[i];
                }
            }

            return new RegressionGradientBoostModel(trees, m_learningRate, initialLoss,
                                                    observations.ColumnCount);
        }
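
For context, a minimal call-site sketch for the Learn overload above, assuming SharpLearning's F64Matrix(double[], rows, cols) constructor and a default-constructed RegressionGradientBoostLearner; the data values are invented.

        // Hedged sketch: four observations with two features each.
        var observations = new F64Matrix(new[] { 1.0, 2.0,
                                                 3.0, 4.0,
                                                 5.0, 6.0,
                                                 7.0, 8.0 }, 4, 2);
        var targets = new[] { 0.5, 1.5, 2.5, 3.5 };

        // The indices overload trains on a subset of the rows.
        var indices = new[] { 0, 1, 2 };

        var learner = new RegressionGradientBoostLearner();
        var model = learner.Learn(observations, targets, indices);
        var predictions = model.Predict(observations);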
Example #3
        /// <summary>
        /// Learns a RegressionGradientBoostModel with early stopping.
        /// The parameter earlyStoppingRounds controls how often the validation error is measured.
        /// The model with the lowest validation error, that is, the best number of iterations (trees), is returned.
        /// The number of iterations used is equal to the number of trees in the resulting model.
        /// The method used for early stopping is based on the article:
        /// http://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf
        /// </summary>
        /// <param name="trainingObservations">Observations used for training</param>
        /// <param name="trainingTargets">Targets used for training</param>
        /// <param name="validationObservations">Observations used for measuring the validation error</param>
        /// <param name="validationTargets">Targets used for measuring the validation error</param>
        /// <param name="metric">The metric to use for early stopping</param>
        /// <param name="earlyStoppingRounds">How often the validation error is checked to estimate the best number of iterations.</param>
        /// <returns>RegressionGradientBoostModel with the best number of trees found by early stopping. The number of iterations will equal the number of trees in the model</returns>
        public RegressionGradientBoostModel LearnWithEarlyStopping(
            F64Matrix trainingObservations,
            double[] trainingTargets,
            F64Matrix validationObservations,
            double[] validationTargets,
            IMetric<double, double> metric,
            int earlyStoppingRounds)
        {
            if (earlyStoppingRounds >= m_iterations)
            {
                throw new ArgumentException("Number of iterations " + m_iterations +
                                            " must be larger than earlyStoppingRounds " + earlyStoppingRounds);
            }

            Checks.VerifyObservationsAndTargets(trainingObservations, trainingTargets);
            Checks.VerifyObservationsAndTargets(validationObservations, validationTargets);

            var rows            = trainingObservations.RowCount;
            var orderedElements = CreateOrderedElements(trainingObservations, rows);

            var inSample = trainingTargets.Select(t => false).ToArray();
            var indices  = Enumerable.Range(0, trainingTargets.Length).ToArray();

            indices.ForEach(i => inSample[i] = true);
            var workIndices = indices.ToArray();

            var trees = new GBMTree[m_iterations];

            var initialLoss = m_loss.InitialLoss(trainingTargets, inSample);
            var predictions = trainingTargets.Select(t => initialLoss).ToArray();
            var residuals   = new double[trainingTargets.Length];

            var bestIterationCount = 0;
            var currentBestError   = double.MaxValue;

            var predictWork = new double[trainingObservations.RowCount];

            for (int iteration = 0; iteration < m_iterations; iteration++)
            {
                m_loss.UpdateResiduals(trainingTargets, predictions, residuals, inSample);

                var sampleSize = trainingTargets.Length;
                if (m_subSampleRatio != 1.0)
                {
                    sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
                    var currentInSample = Sample(sampleSize, workIndices, trainingTargets.Length);

                    trees[iteration] = m_learner.Learn(trainingObservations, trainingTargets, residuals,
                                                       predictions, orderedElements, currentInSample);
                }
                else
                {
                    trees[iteration] = m_learner.Learn(trainingObservations, trainingTargets, residuals,
                                                       predictions, orderedElements, inSample);
                }

                trees[iteration].Predict(trainingObservations, predictWork);
                for (int i = 0; i < predictWork.Length; i++)
                {
                    predictions[i] += m_learningRate * predictWork[i];
                }

                // Every earlyStoppingRounds iterations, measure the validation error
                // and track the iteration count with the lowest error, so the model
                // with the best number of iterations (trees) can be returned.
                if ((iteration % earlyStoppingRounds) == 0)
                {
                    var model = new RegressionGradientBoostModel(trees.Take(iteration).ToArray(),
                                                                 m_learningRate, initialLoss, trainingObservations.ColumnCount);

                    var validPredictions = model.Predict(validationObservations);
                    var error            = metric.Error(validationTargets, validPredictions);

                    Trace.WriteLine("Iteration " + (iteration + 1) + " Validation Error: " + error);

                    if (currentBestError > error)
                    {
                        currentBestError   = error;
                        bestIterationCount = iteration;
                    }
                }
            }

            return new RegressionGradientBoostModel(trees.Take(bestIterationCount).ToArray(),
                                                    m_learningRate, initialLoss, trainingObservations.ColumnCount);
        }
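
A hedged usage sketch for the early-stopping overload. MeanSquaredErrorRegressionMetric is assumed as the IMetric<double, double> implementation, and the learner's constructor parameter name is an assumption; a real scenario would use a proper train/validation split instead of the invented values below.

        // Hedged sketch: tiny invented train/validation sets.
        var trainX = new F64Matrix(new[] { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 }, 3, 2);
        var trainY = new[] { 1.0, 2.0, 3.0 };
        var validX = new F64Matrix(new[] { 1.5, 2.5, 3.5, 4.5 }, 2, 2);
        var validY = new[] { 1.5, 2.5 };

        var metric = new MeanSquaredErrorRegressionMetric();
        var learner = new RegressionGradientBoostLearner(iterations: 100);

        // The validation error is measured every 10 iterations; the model with
        // the lowest measured error is returned.
        var model = learner.LearnWithEarlyStopping(trainX, trainY, validX, validY,
                                                   metric, earlyStoppingRounds: 10);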
Example #4
        /// <summary>
        /// Learns a ClassificationGradientBoostModel with early stopping.
        /// The parameter earlyStoppingRounds controls how often the validation error is measured.
        /// The model with the lowest validation error, that is, the best number of iterations (trees), is returned.
        /// The number of iterations used is equal to the number of trees in the resulting model.
        /// The method used for early stopping is based on the article:
        /// http://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf
        /// </summary>
        /// <param name="trainingObservations">Observations used for training</param>
        /// <param name="trainingTargets">Targets used for training</param>
        /// <param name="validationObservations">Observations used for measuring the validation error</param>
        /// <param name="validationTargets">Targets used for measuring the validation error</param>
        /// <param name="metric">The metric to use for early stopping</param>
        /// <param name="earlyStoppingRounds">How often the validation error is checked to estimate the best number of iterations.</param>
        /// <returns>ClassificationGradientBoostModel with the best number of trees found by early stopping. The number of iterations will equal the number of trees in the model</returns>
        public ClassificationGradientBoostModel LearnWithEarlyStopping(
            F64Matrix trainingObservations,
            double[] trainingTargets,
            F64Matrix validationObservations,
            double[] validationTargets,
            IMetric<double, ProbabilityPrediction> metric,
            int earlyStoppingRounds)
        {
            if (earlyStoppingRounds >= m_iterations)
            {
                throw new ArgumentException("Number of iterations " + m_iterations +
                                            " must be larger than earlyStoppingRounds " + earlyStoppingRounds);
            }

            var rows            = trainingObservations.RowCount;
            var orderedElements = CreateOrderedElements(trainingObservations, rows);

            var inSample = trainingTargets.Select(t => false).ToArray();
            var indices  = Enumerable.Range(0, trainingTargets.Length).ToArray();

            indices.ForEach(i => inSample[i] = true);
            var workIndices = indices.ToArray();

            var uniqueTargets = trainingTargets.Distinct().OrderBy(v => v).ToArray();
            var initialLoss   = m_loss.InitialLoss(trainingTargets, inSample);

            double[][]  oneVsAllTargets = null;
            double[][]  predictions     = null;
            double[][]  residuals       = null;
            GBMTree[][] trees           = null;

            if (uniqueTargets.Length == 2) // Binary case - only need to fit to one class and use (1.0 - probability)
            {
                trees       = new GBMTree[][] { new GBMTree[m_iterations] };
                predictions = new double[][] { trainingTargets.Select(_ => initialLoss).ToArray() };
                residuals   = new double[][] { new double[trainingTargets.Length] };

                oneVsAllTargets = new double[1][];
                var target = uniqueTargets[0];
                oneVsAllTargets[0] = trainingTargets.Select(t => t == target ? 1.0 : 0.0).ToArray();
            }
            else // multi-class case - use oneVsAll strategy and fit probability for each class
            {
                trees       = new GBMTree[uniqueTargets.Length][];
                predictions = uniqueTargets.Select(_ => trainingTargets.Select(t => initialLoss).ToArray())
                              .ToArray();
                residuals = uniqueTargets.Select(_ => new double[trainingTargets.Length])
                            .ToArray();

                oneVsAllTargets = new double[uniqueTargets.Length][];
                for (int i = 0; i < uniqueTargets.Length; i++)
                {
                    var target = uniqueTargets[i];
                    oneVsAllTargets[i] = trainingTargets.Select(t => t == target ? 1.0 : 0.0).ToArray();
                    trees[i]           = new GBMTree[m_iterations];
                }
            }

            var bestIterationCount = 0;
            var currentBestError   = double.MaxValue;

            for (int iteration = 0; iteration < m_iterations; iteration++)
            {
                for (int itarget = 0; itarget < trees.Length; itarget++)
                {
                    m_loss.UpdateResiduals(oneVsAllTargets[itarget], predictions[itarget],
                                           residuals[itarget], inSample);

                    var sampleSize = trainingTargets.Length;
                    if (m_subSampleRatio != 1.0)
                    {
                        sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
                        var currentInSample = Sample(sampleSize, workIndices, trainingTargets.Length);

                        trees[itarget][iteration] = m_learner.Learn(trainingObservations, oneVsAllTargets[itarget],
                                                                    residuals[itarget], predictions[itarget], orderedElements, currentInSample);
                    }
                    else
                    {
                        trees[itarget][iteration] = m_learner.Learn(trainingObservations, oneVsAllTargets[itarget],
                                                                    residuals[itarget], predictions[itarget], orderedElements, inSample);
                    }

                    var predict = trees[itarget][iteration].Predict(trainingObservations);
                    for (int i = 0; i < predict.Length; i++)
                    {
                        predictions[itarget][i] += m_learningRate * predict[i];
                    }
                }

                // Every earlyStoppingRounds iterations, measure the validation error
                // and track the iteration count with the lowest error, so the model
                // with the best number of iterations (trees) can be returned.
                if (iteration % earlyStoppingRounds == 0)
                {
                    var model = new ClassificationGradientBoostModel(
                        trees.Select(t => t.Take(iteration).ToArray()).ToArray(),
                        uniqueTargets, m_learningRate, initialLoss, trainingObservations.ColumnCount);

                    var validPredictions = model.PredictProbability(validationObservations);
                    var error            = metric.Error(validationTargets, validPredictions);

                    Trace.WriteLine("Iteration " + (iteration + 1) + " Validation Error: " + error);

                    if (currentBestError >= error)
                    {
                        currentBestError   = error;
                        bestIterationCount = iteration;
                    }
                }
            }

            return new ClassificationGradientBoostModel(
                trees.Select(t => t.Take(bestIterationCount).ToArray()).ToArray(),
                uniqueTargets, m_learningRate, initialLoss, trainingObservations.ColumnCount);
        }
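
A parallel sketch for the classification variant, assuming LogLossClassificationProbabilityMetric as the IMetric<double, ProbabilityPrediction> implementation; the data values and the learner's constructor parameter name are assumptions.

        // Hedged sketch: binary targets (0/1) with invented feature values.
        var trainX = new F64Matrix(new[] { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0 }, 4, 2);
        var trainY = new[] { 0.0, 1.0, 0.0, 1.0 };
        var validX = new F64Matrix(new[] { 1.5, 2.5, 3.5, 4.5 }, 2, 2);
        var validY = new[] { 0.0, 1.0 };

        var metric = new LogLossClassificationProbabilityMetric();
        var learner = new ClassificationGradientBoostLearner(iterations: 100);

        var model = learner.LearnWithEarlyStopping(trainX, trainY, validX, validY,
                                                   metric, earlyStoppingRounds: 10);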
Example #5
        /// <summary>
        /// A series of regression trees is fitted stage-wise on the residuals of the previous stage.
        /// </summary>
        /// <param name="observations">Training observations</param>
        /// <param name="targets">Training targets</param>
        /// <param name="indices">Indices of the observations to use for training</param>
        /// <returns>ClassificationGradientBoostModel</returns>
        public ClassificationGradientBoostModel Learn(F64Matrix observations, double[] targets,
                                                      int[] indices)
        {
            Checks.VerifyObservationsAndTargets(observations, targets);
            Checks.VerifyIndices(indices, observations, targets);

            var rows            = observations.RowCount;
            var orderedElements = CreateOrderedElements(observations, rows);

            var inSample = targets.Select(t => false).ToArray();

            indices.ForEach(i => inSample[i] = true);
            var workIndices = indices.ToArray();

            var uniqueTargets = targets.Distinct().OrderBy(v => v).ToArray();
            var initialLoss   = m_loss.InitialLoss(targets, inSample);

            double[][]  oneVsAllTargets = null;
            double[][]  predictions     = null;
            double[][]  residuals       = null;
            GBMTree[][] trees           = null;

            if (uniqueTargets.Length == 2) // Binary case - only need to fit to one class and use (1.0 - probability)
            {
                trees       = new GBMTree[][] { new GBMTree[m_iterations] };
                predictions = new double[][] { targets.Select(_ => initialLoss).ToArray() };
                residuals   = new double[][] { new double[targets.Length] };

                oneVsAllTargets = new double[1][];
                var target = uniqueTargets[0];
                oneVsAllTargets[0] = targets.Select(t => t == target ? 1.0 : 0.0).ToArray();
            }
            else // multi-class case - use oneVsAll strategy and fit probability for each class
            {
                trees       = new GBMTree[uniqueTargets.Length][];
                predictions = uniqueTargets.Select(_ => targets.Select(t => initialLoss).ToArray())
                              .ToArray();
                residuals = uniqueTargets.Select(_ => new double[targets.Length])
                            .ToArray();

                oneVsAllTargets = new double[uniqueTargets.Length][];
                for (int i = 0; i < uniqueTargets.Length; i++)
                {
                    var target = uniqueTargets[i];
                    oneVsAllTargets[i] = targets.Select(t => t == target ? 1.0 : 0.0).ToArray();
                    trees[i]           = new GBMTree[m_iterations];
                }
            }

            var predictWork = new double[observations.RowCount];

            for (int iteration = 0; iteration < m_iterations; iteration++)
            {
                for (int itarget = 0; itarget < trees.Length; itarget++)
                {
                    m_loss.UpdateResiduals(oneVsAllTargets[itarget], predictions[itarget],
                                           residuals[itarget], inSample);

                    var sampleSize = targets.Length;
                    if (m_subSampleRatio != 1.0)
                    {
                        sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
                        var currentInSample = Sample(sampleSize, workIndices, targets.Length);

                        trees[itarget][iteration] = m_learner.Learn(observations, oneVsAllTargets[itarget],
                                                                    residuals[itarget], predictions[itarget], orderedElements, currentInSample);
                    }
                    else
                    {
                        trees[itarget][iteration] = m_learner.Learn(observations, oneVsAllTargets[itarget],
                                                                    residuals[itarget], predictions[itarget], orderedElements, inSample);
                    }

                    trees[itarget][iteration].Predict(observations, predictWork);
                    for (int i = 0; i < predictWork.Length; i++)
                    {
                        predictions[itarget][i] += m_learningRate * predictWork[i];
                    }
                }
            }

            return new ClassificationGradientBoostModel(trees, uniqueTargets, m_learningRate,
                                                        initialLoss, observations.ColumnCount);
        }
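
Finally, a minimal call-site sketch for the classification Learn overload above; as before, the learner construction and the data values are assumptions.

        // Hedged sketch: three distinct targets to exercise the one-vs-all branch.
        var observations = new F64Matrix(new[] { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 }, 3, 2);
        var targets = new[] { 0.0, 1.0, 2.0 };
        var indices = new[] { 0, 1, 2 }; // train on all rows

        var learner = new ClassificationGradientBoostLearner();
        var model = learner.Learn(observations, targets, indices);

        // One probability per unique target value, per row.
        var probabilities = model.PredictProbability(observations);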