/// <summary>
/// Converts an array of XGBoost text tree dumps to an array of GBMTrees.
/// </summary>
/// <param name="textTrees">Text dumps of XGBoost trees, one tree per entry.</param>
/// <returns>The converted GBMTrees, in the same order as the input.</returns>
public static GBMTree[] FromXGBoostTextTreesToGBMTrees(string[] textTrees)
{
    var trees = new GBMTree[textTrees.Length];
    for (int i = 0; i < textTrees.Length; i++)
    {
        trees[i] = ConvertXGBoostTextTreeToGBMTree(textTrees[i]);
    }

    return trees;
}
/// <summary>
/// A series of regression trees are fitted stage wise on the residuals of the previous tree.
/// </summary>
/// <param name="observations">Training observations.</param>
/// <param name="targets">Training targets.</param>
/// <param name="indices">Indices of the observations to use for training.</param>
/// <returns>The fitted RegressionGradientBoostModel.</returns>
public RegressionGradientBoostModel Learn(F64Matrix observations, double[] targets, int[] indices)
{
    Checks.VerifyObservationsAndTargets(observations, targets);
    Checks.VerifyIndices(indices, observations, targets);

    var rows = observations.RowCount;
    var orderedElements = CreateOrderedElements(observations, rows);

    // Mark which observations participate in training.
    var inSample = targets.Select(t => false).ToArray();
    indices.ForEach(i => inSample[i] = true);
    var workIndices = indices.ToArray();

    var trees = new GBMTree[m_iterations];

    var initialLoss = m_loss.InitialLoss(targets, inSample);
    var predictions = targets.Select(t => initialLoss).ToArray();
    var residuals = new double[targets.Length];

    // Reused prediction buffer to avoid allocating per iteration.
    var predictWork = new double[observations.RowCount];
    for (int iteration = 0; iteration < m_iterations; iteration++)
    {
        m_loss.UpdateResiduals(targets, predictions, residuals, inSample);

        if (m_subSampleRatio != 1.0)
        {
            // Stochastic gradient boosting: fit each tree on a random subsample.
            var sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
            var currentInSample = Sample(sampleSize, workIndices, targets.Length);

            trees[iteration] = m_learner.Learn(observations, targets, residuals,
                predictions, orderedElements, currentInSample);
        }
        else
        {
            trees[iteration] = m_learner.Learn(observations, targets, residuals,
                predictions, orderedElements, inSample);
        }

        // Add the shrunken contribution of the new tree to the ensemble predictions.
        trees[iteration].Predict(observations, predictWork);
        for (int i = 0; i < predictWork.Length; i++)
        {
            predictions[i] += m_learningRate * predictWork[i];
        }
    }

    return new RegressionGradientBoostModel(trees, m_learningRate,
        initialLoss, observations.ColumnCount);
}
/// <summary>
/// Learns a RegressionGradientBoostModel with early stopping.
/// The parameter earlyStoppingRounds controls how often the validation error is measured.
/// The model with the lowest measured validation error, using the corresponding number of
/// iterations (trees), is returned.
/// The number of iterations used is equal to the number of trees in the resulting model.
/// The method used for early stopping is based on the article:
/// http://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf
/// </summary>
/// <param name="trainingObservations">Observations used for training.</param>
/// <param name="trainingTargets">Targets used for training.</param>
/// <param name="validationObservations">Observations used for measuring the validation error.</param>
/// <param name="validationTargets">Targets used for measuring the validation error.</param>
/// <param name="metric">The metric to use for early stopping.</param>
/// <param name="earlyStoppingRounds">This controls how often the validation error is checked to estimate the best number of iterations.</param>
/// <returns>RegressionGradientBoostModel with early stopping.
/// The number of iterations will equal the number of trees in the model.</returns>
public RegressionGradientBoostModel LearnWithEarlyStopping(
    F64Matrix trainingObservations, double[] trainingTargets,
    F64Matrix validationObservations, double[] validationTargets,
    IMetric<double, double> metric, int earlyStoppingRounds)
{
    if (earlyStoppingRounds >= m_iterations)
    {
        throw new ArgumentException("earlyStoppingRounds: " + earlyStoppingRounds +
            " must be smaller than the number of iterations: " + m_iterations);
    }

    Checks.VerifyObservationsAndTargets(trainingObservations, trainingTargets);
    Checks.VerifyObservationsAndTargets(validationObservations, validationTargets);

    var rows = trainingObservations.RowCount;
    var orderedElements = CreateOrderedElements(trainingObservations, rows);

    // All training observations are in-sample; early stopping is driven by the
    // separate validation set.
    var inSample = trainingTargets.Select(t => false).ToArray();
    var indices = Enumerable.Range(0, trainingTargets.Length).ToArray();
    indices.ForEach(i => inSample[i] = true);
    var workIndices = indices.ToArray();

    var trees = new GBMTree[m_iterations];

    var initialLoss = m_loss.InitialLoss(trainingTargets, inSample);
    var predictions = trainingTargets.Select(t => initialLoss).ToArray();
    var residuals = new double[trainingTargets.Length];

    var bestIterationCount = 0;
    var currentBestError = double.MaxValue;

    // Reused prediction buffer to avoid allocating per iteration.
    var predictWork = new double[trainingObservations.RowCount];
    for (int iteration = 0; iteration < m_iterations; iteration++)
    {
        m_loss.UpdateResiduals(trainingTargets, predictions, residuals, inSample);

        if (m_subSampleRatio != 1.0)
        {
            // Stochastic gradient boosting: fit each tree on a random subsample.
            var sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
            var currentInSample = Sample(sampleSize, workIndices, trainingTargets.Length);

            trees[iteration] = m_learner.Learn(trainingObservations, trainingTargets, residuals,
                predictions, orderedElements, currentInSample);
        }
        else
        {
            trees[iteration] = m_learner.Learn(trainingObservations, trainingTargets, residuals,
                predictions, orderedElements, inSample);
        }

        trees[iteration].Predict(trainingObservations, predictWork);
        for (int i = 0; i < predictWork.Length; i++)
        {
            predictions[i] += m_learningRate * predictWork[i];
        }

        // Every earlyStoppingRounds iterations, measure the validation error and
        // remember the iteration count with the lowest error seen so far.
        if ((iteration % earlyStoppingRounds) == 0)
        {
            // Include the tree fitted in this iteration (iteration + 1 trees in total).
            var model = new RegressionGradientBoostModel(trees.Take(iteration + 1).ToArray(),
                m_learningRate, initialLoss, trainingObservations.ColumnCount);

            var validPredictions = model.Predict(validationObservations);
            var error = metric.Error(validationTargets, validPredictions);

            Trace.WriteLine("Iteration " + (iteration + 1) + " Validation Error: " + error);

            if (currentBestError > error)
            {
                currentBestError = error;
                bestIterationCount = iteration + 1;
            }
        }
    }

    return new RegressionGradientBoostModel(trees.Take(bestIterationCount).ToArray(),
        m_learningRate, initialLoss, trainingObservations.ColumnCount);
}
/// <summary>
/// Learns a ClassificationGradientBoostModel with early stopping.
/// The parameter earlyStoppingRounds controls how often the validation error is measured.
/// The model with the lowest measured validation error, using the corresponding number of
/// iterations (trees), is returned.
/// The number of iterations used is equal to the number of trees in the resulting model.
/// The method used for early stopping is based on the article:
/// http://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf
/// </summary>
/// <param name="trainingObservations">Observations used for training.</param>
/// <param name="trainingTargets">Targets used for training.</param>
/// <param name="validationObservations">Observations used for measuring the validation error.</param>
/// <param name="validationTargets">Targets used for measuring the validation error.</param>
/// <param name="metric">The metric to use for early stopping.</param>
/// <param name="earlyStoppingRounds">This controls how often the validation error is checked to estimate the best number of iterations.</param>
/// <returns>ClassificationGradientBoostModel with early stopping.
/// The number of iterations will equal the number of trees in the model.</returns>
public ClassificationGradientBoostModel LearnWithEarlyStopping(
    F64Matrix trainingObservations, double[] trainingTargets,
    F64Matrix validationObservations, double[] validationTargets,
    IMetric<double, ProbabilityPrediction> metric, int earlyStoppingRounds)
{
    if (earlyStoppingRounds >= m_iterations)
    {
        throw new ArgumentException("earlyStoppingRounds: " + earlyStoppingRounds +
            " must be smaller than the number of iterations: " + m_iterations);
    }

    // Validate inputs, consistent with the regression learner's early stopping overload.
    Checks.VerifyObservationsAndTargets(trainingObservations, trainingTargets);
    Checks.VerifyObservationsAndTargets(validationObservations, validationTargets);

    var rows = trainingObservations.RowCount;
    var orderedElements = CreateOrderedElements(trainingObservations, rows);

    // All training observations are in-sample; early stopping is driven by the
    // separate validation set.
    var inSample = trainingTargets.Select(t => false).ToArray();
    var indices = Enumerable.Range(0, trainingTargets.Length).ToArray();
    indices.ForEach(i => inSample[i] = true);
    var workIndices = indices.ToArray();

    var uniqueTargets = trainingTargets.Distinct().OrderBy(v => v).ToArray();
    var initialLoss = m_loss.InitialLoss(trainingTargets, inSample);

    double[][] oneVsAllTargets = null;
    double[][] predictions = null;
    double[][] residuals = null;
    GBMTree[][] trees = null;

    if (uniqueTargets.Length == 2) // Binary case - only need to fit to one class and use (1.0 - probability)
    {
        trees = new GBMTree[][] { new GBMTree[m_iterations] };
        predictions = new double[][] { trainingTargets.Select(_ => initialLoss).ToArray() };
        residuals = new double[][] { new double[trainingTargets.Length] };

        oneVsAllTargets = new double[1][];
        var target = uniqueTargets[0];
        oneVsAllTargets[0] = trainingTargets.Select(t => t == target ? 1.0 : 0.0).ToArray();
    }
    else // multi-class case - use oneVsAll strategy and fit probability for each class
    {
        trees = new GBMTree[uniqueTargets.Length][];
        predictions = uniqueTargets.Select(_ => trainingTargets.Select(t => initialLoss).ToArray())
            .ToArray();
        residuals = uniqueTargets.Select(_ => new double[trainingTargets.Length])
            .ToArray();

        oneVsAllTargets = new double[uniqueTargets.Length][];
        for (int i = 0; i < uniqueTargets.Length; i++)
        {
            var target = uniqueTargets[i];
            oneVsAllTargets[i] = trainingTargets.Select(t => t == target ? 1.0 : 0.0).ToArray();
            trees[i] = new GBMTree[m_iterations];
        }
    }

    var bestIterationCount = 0;
    var currentBestError = double.MaxValue;

    // Reused prediction buffer to avoid allocating a new array for every tree,
    // matching the regression learner.
    var predictWork = new double[trainingObservations.RowCount];
    for (int iteration = 0; iteration < m_iterations; iteration++)
    {
        for (int itarget = 0; itarget < trees.Length; itarget++)
        {
            m_loss.UpdateResiduals(oneVsAllTargets[itarget], predictions[itarget],
                residuals[itarget], inSample);

            if (m_subSampleRatio != 1.0)
            {
                // Stochastic gradient boosting: fit each tree on a random subsample.
                var sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
                var currentInSample = Sample(sampleSize, workIndices, trainingTargets.Length);

                trees[itarget][iteration] = m_learner.Learn(trainingObservations, oneVsAllTargets[itarget],
                    residuals[itarget], predictions[itarget], orderedElements, currentInSample);
            }
            else
            {
                trees[itarget][iteration] = m_learner.Learn(trainingObservations, oneVsAllTargets[itarget],
                    residuals[itarget], predictions[itarget], orderedElements, inSample);
            }

            trees[itarget][iteration].Predict(trainingObservations, predictWork);
            for (int i = 0; i < predictWork.Length; i++)
            {
                predictions[itarget][i] += m_learningRate * predictWork[i];
            }
        }

        // Every earlyStoppingRounds iterations, measure the validation error and
        // remember the iteration count with the lowest error seen so far.
        if (iteration % earlyStoppingRounds == 0)
        {
            // Include the trees fitted in this iteration (iteration + 1 trees per class).
            var model = new ClassificationGradientBoostModel(
                trees.Select(t => t.Take(iteration + 1).ToArray()).ToArray(),
                uniqueTargets, m_learningRate, initialLoss,
                trainingObservations.ColumnCount);

            var validPredictions = model.PredictProbability(validationObservations);
            var error = metric.Error(validationTargets, validPredictions);

            Trace.WriteLine("Iteration " + (iteration + 1) + " Validation Error: " + error);

            // Strict comparison (consistent with the regression learner): on ties,
            // prefer the earlier iteration and hence the smaller ensemble.
            if (currentBestError > error)
            {
                currentBestError = error;
                bestIterationCount = iteration + 1;
            }
        }
    }

    return new ClassificationGradientBoostModel(
        trees.Select(t => t.Take(bestIterationCount).ToArray()).ToArray(),
        uniqueTargets, m_learningRate, initialLoss,
        trainingObservations.ColumnCount);
}
/// <summary>
/// A series of regression trees are fitted stage wise on the residuals of the previous stage.
/// Classification uses a one-vs-all strategy; in the binary case only a single
/// class is fitted and the other probability is (1.0 - probability).
/// </summary>
/// <param name="observations">Training observations.</param>
/// <param name="targets">Training targets (class labels).</param>
/// <param name="indices">Indices of the observations to use for training.</param>
/// <returns>The fitted ClassificationGradientBoostModel.</returns>
public ClassificationGradientBoostModel Learn(F64Matrix observations, double[] targets, int[] indices)
{
    Checks.VerifyObservationsAndTargets(observations, targets);
    Checks.VerifyIndices(indices, observations, targets);

    var rows = observations.RowCount;
    var orderedElements = CreateOrderedElements(observations, rows);

    // Mark which observations participate in training.
    var inSample = targets.Select(t => false).ToArray();
    indices.ForEach(i => inSample[i] = true);
    var workIndices = indices.ToArray();

    var uniqueTargets = targets.Distinct().OrderBy(v => v).ToArray();
    var initialLoss = m_loss.InitialLoss(targets, inSample);

    double[][] oneVsAllTargets = null;
    double[][] predictions = null;
    double[][] residuals = null;
    GBMTree[][] trees = null;

    if (uniqueTargets.Length == 2) // Binary case - only need to fit to one class and use (1.0 - probability)
    {
        trees = new GBMTree[][] { new GBMTree[m_iterations] };
        predictions = new double[][] { targets.Select(_ => initialLoss).ToArray() };
        residuals = new double[][] { new double[targets.Length] };

        oneVsAllTargets = new double[1][];
        var target = uniqueTargets[0];
        oneVsAllTargets[0] = targets.Select(t => t == target ? 1.0 : 0.0).ToArray();
    }
    else // multi-class case - use oneVsAll strategy and fit probability for each class
    {
        trees = new GBMTree[uniqueTargets.Length][];
        predictions = uniqueTargets.Select(_ => targets.Select(t => initialLoss).ToArray())
            .ToArray();
        residuals = uniqueTargets.Select(_ => new double[targets.Length])
            .ToArray();

        oneVsAllTargets = new double[uniqueTargets.Length][];
        for (int i = 0; i < uniqueTargets.Length; i++)
        {
            var target = uniqueTargets[i];
            oneVsAllTargets[i] = targets.Select(t => t == target ? 1.0 : 0.0).ToArray();
            trees[i] = new GBMTree[m_iterations];
        }
    }

    // Reused prediction buffer to avoid allocating per tree.
    var predictWork = new double[observations.RowCount];
    for (int iteration = 0; iteration < m_iterations; iteration++)
    {
        for (int itarget = 0; itarget < trees.Length; itarget++)
        {
            m_loss.UpdateResiduals(oneVsAllTargets[itarget], predictions[itarget],
                residuals[itarget], inSample);

            if (m_subSampleRatio != 1.0)
            {
                // Stochastic gradient boosting: fit each tree on a random subsample.
                var sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
                var currentInSample = Sample(sampleSize, workIndices, targets.Length);

                trees[itarget][iteration] = m_learner.Learn(observations, oneVsAllTargets[itarget],
                    residuals[itarget], predictions[itarget], orderedElements, currentInSample);
            }
            else
            {
                trees[itarget][iteration] = m_learner.Learn(observations, oneVsAllTargets[itarget],
                    residuals[itarget], predictions[itarget], orderedElements, inSample);
            }

            // Add the shrunken contribution of the new tree to this class' predictions.
            trees[itarget][iteration].Predict(observations, predictWork);
            for (int i = 0; i < predictWork.Length; i++)
            {
                predictions[itarget][i] += m_learningRate * predictWork[i];
            }
        }
    }

    return new ClassificationGradientBoostModel(trees, uniqueTargets,
        m_learningRate, initialLoss, observations.ColumnCount);
}