/// <summary>
/// Produces the points of a learning curve for the given learner using the supplied
/// training/validation split. Each point holds the sample size together with the
/// training and validation scores measured at that size.
/// </summary>
/// <param name="learner">Learner used to fit a model for every sample size.</param>
/// <param name="observations">Feature matrix.</param>
/// <param name="targets">Target values corresponding to the observations.</param>
/// <param name="trainingIndices">Indices that should be used for training.</param>
/// <param name="validationIndices">Indices that should be used for validation.</param>
/// <returns>One LearningCurvePoint per configured sample percentage.</returns>
public List<LearningCurvePoint> Calculate(IIndexedLearner<TPrediction> learner,
    F64Matrix observations, double[] targets,
    int[] trainingIndices, int[] validationIndices)
{
    var curvePoints = new List<LearningCurvePoint>();

    var validationTargets = targets.GetIndices(validationIndices);
    var validationPredictions = new TPrediction[validationTargets.Length];

    foreach (var samplePercentage in m_samplePercentages)
    {
        if (samplePercentage <= 0.0 || samplePercentage > 1.0)
        {
            throw new ArgumentException("Sample percentage must be larger than 0.0 and smaller than or equal to 1.0");
        }

        var sampleSize = (int)Math.Round(samplePercentage * trainingIndices.Length);
        if (sampleSize <= 0)
        {
            throw new ArgumentException("Sample percentage " + samplePercentage +
                " too small for training set size " + trainingIndices.Length);
        }

        var accumulatedTrainError = 0.0;
        var accumulatedValidationError = 0.0;
        var trainingPredictions = new TPrediction[sampleSize];

        // Average the errors over several shuffled samples of the same size
        // to reduce the variance of each curve point.
        for (int shuffle = 0; shuffle < m_numberOfShufflesPrSample; shuffle++)
        {
            var sampleIndices = m_indexedSampler.Sample(targets, sampleSize, trainingIndices);
            var model = learner.Learn(observations, targets, sampleIndices);

            for (int i = 0; i < trainingPredictions.Length; i++)
            {
                trainingPredictions[i] = model.Predict(observations.Row(sampleIndices[i]));
            }

            for (int i = 0; i < validationIndices.Length; i++)
            {
                validationPredictions[i] = model.Predict(observations.Row(validationIndices[i]));
            }

            var sampleTargets = targets.GetIndices(sampleIndices);
            accumulatedTrainError += m_metric.Error(sampleTargets, trainingPredictions);
            accumulatedValidationError += m_metric.Error(validationTargets, validationPredictions);

            ModelDisposer.DisposeIfDisposable(model);
        }

        curvePoints.Add(new LearningCurvePoint(sampleSize,
            accumulatedTrainError / m_numberOfShufflesPrSample,
            accumulatedValidationError / m_numberOfShufflesPrSample));
    }

    return curvePoints;
}
/// <summary>
/// Evaluates every remaining model as a candidate addition to the current ensemble
/// and permanently adds the one that lowers the combined error the most.
/// If no candidate improves on currentBestError, the selection is left unchanged.
/// </summary>
/// <param name="crossValidatedModelPredictions">Cross validated predictions; one prediction array per model.</param>
/// <param name="targets">Corresponding targets.</param>
/// <param name="currentBestError">Error of the ensemble before the addition.</param>
/// <returns>The lowest error found (unchanged if no candidate improved it).</returns>
double SelectNextModelToAdd(ProbabilityPrediction[][] crossValidatedModelPredictions,
    double[] targets, double currentBestError)
{
    var rows = crossValidatedModelPredictions.First().Length;
    var candidateCount = m_selectedModelIndices.Count + 1;

    var candidateModelMatrix = new ProbabilityPrediction[candidateCount][];
    var candidatePredictions = new ProbabilityPrediction[rows];
    var candidateModelIndices = new int[candidateCount];

    var bestError = currentBestError;
    var bestIndex = -1;

    foreach (var index in m_remainingModelIndices)
    {
        // Candidate ensemble = already selected models + the model under test.
        m_selectedModelIndices.CopyTo(candidateModelIndices);
        candidateModelIndices[candidateModelIndices.Length - 1] = index;

        for (int i = 0; i < candidateModelIndices.Length; i++)
        {
            candidateModelMatrix[i] = crossValidatedModelPredictions[candidateModelIndices[i]];
        }

        m_ensembleStrategy.Combine(candidateModelMatrix, candidatePredictions);

        var error = m_metric.Error(targets, candidatePredictions);
        if (error < bestError)
        {
            bestError = error;
            bestIndex = index;
        }
    }

    if (bestIndex != -1)
    {
        m_selectedModelIndices.Add(bestIndex);
        // Without replacement a model can only be selected once.
        if (!m_selectWithReplacement)
        {
            m_remainingModelIndices.Remove(bestIndex);
        }
    }

    return bestError;
}
/// <summary>
/// Evaluates every remaining model as a candidate addition to the current ensemble
/// and permanently adds the one that lowers the combined error the most.
/// If no candidate improves on currentBestError, the selection is left unchanged.
/// </summary>
/// <param name="crossValidatedModelPredictions">Cross validated predictions; one column per model.</param>
/// <param name="targets">Corresponding targets.</param>
/// <param name="currentBestError">Error of the ensemble before the addition.</param>
/// <returns>The lowest error found (unchanged if no candidate improved it).</returns>
double SelectNextModelToAdd(F64Matrix crossValidatedModelPredictions,
    double[] targets, double currentBestError)
{
    var rows = crossValidatedModelPredictions.RowCount;
    var candidateCount = m_selectedModelIndices.Count + 1;

    var candidateModelMatrix = new F64Matrix(rows, candidateCount);
    var candidatePredictions = new double[rows];
    var candidateModelIndices = new int[candidateCount];

    var bestError = currentBestError;
    var bestIndex = -1;

    foreach (var index in m_remainingModelIndices)
    {
        // Candidate ensemble = already selected models + the model under test.
        m_selectedModelIndices.CopyTo(candidateModelIndices);
        candidateModelIndices[candidateModelIndices.Length - 1] = index;

        crossValidatedModelPredictions.Columns(candidateModelIndices, candidateModelMatrix);
        m_ensembleStrategy.Combine(candidateModelMatrix, candidatePredictions);

        var error = m_metric.Error(targets, candidatePredictions);
        if (error < bestError)
        {
            bestError = error;
            bestIndex = index;
        }
    }

    if (bestIndex != -1)
    {
        m_selectedModelIndices.Add(bestIndex);
        // Without replacement a model can only be selected once.
        if (!m_selectWithReplacement)
        {
            m_remainingModelIndices.Remove(bestIndex);
        }
    }

    return bestError;
}
/// <summary>
/// Evaluates the removal of each remaining model from the ensemble and permanently
/// removes the one whose removal gives the lowest combined error.
/// Backward elimination must remove one model per call, so the search starts from
/// double.MaxValue rather than from currentBestError; starting from currentBestError
/// (the previous behavior) left bestIndex at -1 whenever no removal improved the
/// current error, making List.Remove(-1) a silent no-op and stalling the elimination.
/// This also makes the method consistent with the F64Matrix overload.
/// </summary>
/// <param name="crossValidatedModelPredictions">Cross validated predictions; one prediction array per model.</param>
/// <param name="targets">Corresponding targets.</param>
/// <param name="currentBestError">Unused; kept for interface symmetry with SelectNextModelToAdd.</param>
/// <returns>The lowest error found among the candidate removals.</returns>
double SelectNextModelToRemove(ProbabilityPrediction[][] crossValidatedModelPredictions,
    double[] targets, double currentBestError)
{
    var rows = crossValidatedModelPredictions.First().Length;
    var candidateCount = m_remainingModelIndices.Count - 1;

    var candidateModelMatrix = new ProbabilityPrediction[candidateCount][];
    var candidatePredictions = new ProbabilityPrediction[rows];
    var candidateModelIndices = new int[candidateCount];

    // Fixed: was "currentBestError", which could prevent any removal (see summary).
    var bestError = double.MaxValue;
    var bestIndex = -1;

    foreach (var index in m_remainingModelIndices)
    {
        // Candidate ensemble = all remaining models except the one under test.
        var candidateIndex = 0;
        for (int i = 0; i < m_remainingModelIndices.Count; i++)
        {
            var curIndex = m_remainingModelIndices[i];
            if (curIndex != index)
            {
                candidateModelIndices[candidateIndex++] = curIndex;
            }
        }

        for (int i = 0; i < candidateModelIndices.Length; i++)
        {
            candidateModelMatrix[i] = crossValidatedModelPredictions[candidateModelIndices[i]];
        }

        m_ensembleStrategy.Combine(candidateModelMatrix, candidatePredictions);

        var error = m_metric.Error(targets, candidatePredictions);
        if (error < bestError)
        {
            bestError = error;
            bestIndex = index;
        }
    }

    // Guard: List<int>.Remove(-1) would silently do nothing; only remove a real index
    // (bestIndex can stay -1 only if every candidate error failed the comparison, e.g. NaN).
    if (bestIndex != -1)
    {
        m_remainingModelIndices.Remove(bestIndex);
    }

    return bestError;
}
/// <summary>
/// Iterative random selection of ensemble models.
/// In each iteration a random set of models is combined and scored; the
/// lowest-error set seen over all iterations is returned.
/// (The summary previously said "Greedy forward selection", which did not match
/// the implementation - the body draws random candidate sets, exactly like the
/// F64Matrix overload.)
/// </summary>
/// <param name="crossValidatedModelPredictions">cross validated predictions from multiple models.
/// Each row in the matrix corresponds to predictions from a separate model</param>
/// <param name="targets">Corresponding targets</param>
/// <returns>The indices of the selected model</returns>
public int[] Select(ProbabilityPrediction[][] crossValidatedModelPredictions, double[] targets)
{
    if (crossValidatedModelPredictions.Length < m_numberOfModelsToSelect)
    {
        // Message typo fixed ("Availible" -> "Available") to match the F64Matrix overload.
        throw new ArgumentException("Available models: " + crossValidatedModelPredictions.Length +
            " is smaller than number of models to select: " + m_numberOfModelsToSelect);
    }

    m_allIndices = Enumerable.Range(0, crossValidatedModelPredictions.Length).ToArray();

    var rows = crossValidatedModelPredictions.First().Length;
    var candidateModelMatrix = new ProbabilityPrediction[m_numberOfModelsToSelect][];
    var candidatePredictions = new ProbabilityPrediction[rows];
    var candidateModelIndices = new int[m_numberOfModelsToSelect];
    var bestModelIndices = new int[m_numberOfModelsToSelect];
    var bestError = double.MaxValue;

    for (int i = 0; i < m_iterations; i++)
    {
        // Draw a new random candidate set and keep it if it improves the error.
        SelectNextRandomIndices(candidateModelIndices);

        for (int j = 0; j < candidateModelIndices.Length; j++)
        {
            candidateModelMatrix[j] = crossValidatedModelPredictions[candidateModelIndices[j]];
        }

        m_ensembleStrategy.Combine(candidateModelMatrix, candidatePredictions);
        var error = m_metric.Error(targets, candidatePredictions);

        if (error < bestError)
        {
            bestError = error;
            candidateModelIndices.CopyTo(bestModelIndices, 0);
            Trace.WriteLine("Models selected: " + bestModelIndices.Length + ": " + error);
        }
    }

    // bestModelIndices is already an array; the redundant ToArray() was dropped.
    Trace.WriteLine("Selected model indices: " + string.Join(", ", bestModelIndices));
    return bestModelIndices;
}
/// <summary>
/// Evaluates the removal of each remaining model from the ensemble and permanently
/// removes the one whose removal gives the lowest combined error.
/// </summary>
/// <param name="crossValidatedModelPredictions">Cross validated predictions; one column per model.</param>
/// <param name="targets">Corresponding targets.</param>
/// <param name="currentBestError">Unused; kept for interface symmetry with SelectNextModelToAdd.</param>
/// <returns>The lowest error found among the candidate removals.</returns>
double SelectNextModelToRemove(F64Matrix crossValidatedModelPredictions,
    double[] targets, double currentBestError)
{
    var rows = crossValidatedModelPredictions.RowCount;
    var candidateCount = m_remainingModelIndices.Count - 1;

    var candidateModelMatrix = new F64Matrix(rows, candidateCount);
    var candidatePredictions = new double[rows];
    var candidateModelIndices = new int[candidateCount];

    var bestError = double.MaxValue;
    var bestIndex = -1;

    foreach (var index in m_remainingModelIndices)
    {
        // Candidate ensemble = all remaining models except the one under test.
        var candidateIndex = 0;
        foreach (var remainingIndex in m_remainingModelIndices)
        {
            if (remainingIndex != index)
            {
                candidateModelIndices[candidateIndex++] = remainingIndex;
            }
        }

        crossValidatedModelPredictions.Columns(candidateModelIndices, candidateModelMatrix);
        m_ensembleStrategy.Combine(candidateModelMatrix, candidatePredictions);

        var error = m_metric.Error(targets, candidatePredictions);
        if (error < bestError)
        {
            bestError = error;
            bestIndex = index;
        }
    }

    m_remainingModelIndices.Remove(bestIndex);
    return bestError;
}
/// <summary>
/// Iterative random selection of ensemble models.
/// In each iteration a random set of models is combined and scored; the
/// lowest-error set seen over all iterations is returned.
/// </summary>
/// <param name="crossValidatedModelPredictions">cross validated predictions from multiple models.
/// Each column in the matrix corresponds to predictions from a separate model</param>
/// <param name="targets">Corresponding targets</param>
/// <returns>The indices of the selected model</returns>
public int[] Select(F64Matrix crossValidatedModelPredictions, double[] targets)
{
    var availableModels = crossValidatedModelPredictions.ColumnCount;
    if (availableModels < m_numberOfModelsToSelect)
    {
        throw new ArgumentException("Available models: " + availableModels +
            " is smaller than number of models to select: " + m_numberOfModelsToSelect);
    }

    m_allIndices = Enumerable.Range(0, availableModels).ToArray();

    var rows = crossValidatedModelPredictions.RowCount;
    var bestModelIndices = new int[m_numberOfModelsToSelect];
    var candidateModelIndices = new int[m_numberOfModelsToSelect];
    var candidateModelMatrix = new F64Matrix(rows, m_numberOfModelsToSelect);
    var candidatePredictions = new double[rows];
    var bestError = double.MaxValue;

    for (int iteration = 0; iteration < m_iterations; iteration++)
    {
        // Draw a new random candidate set and keep it if it improves the error.
        SelectNextRandomIndices(candidateModelIndices);
        crossValidatedModelPredictions.Columns(candidateModelIndices, candidateModelMatrix);
        m_ensembleStrategy.Combine(candidateModelMatrix, candidatePredictions);

        var error = m_metric.Error(targets, candidatePredictions);
        if (error < bestError)
        {
            bestError = error;
            candidateModelIndices.CopyTo(bestModelIndices, 0);
            Trace.WriteLine("Models selected: " + bestModelIndices.Length + ": " + error);
        }
    }

    Trace.WriteLine("Selected model indices: " + string.Join(", ", bestModelIndices.ToArray()));
    return bestModelIndices;
}
/// <summary>
/// Learns a RegressionGradientBoostModel with early stopping.
/// Every earlyStoppingRounds iterations the ensemble built so far is evaluated on the
/// validation set, and the iteration count with the lowest validation error is tracked.
/// All m_iterations trees are trained; the returned model contains only the trees from
/// the best-scoring checkpoint, so the number of iterations used equals the number of
/// trees in the resulting model.
/// The method used for early stopping is based on the article:
/// http://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf
/// </summary>
/// <param name="trainingObservations">Observations used for training</param>
/// <param name="trainingTargets">Targets used for training</param>
/// <param name="validationObservations">Observations used to measure the validation error</param>
/// <param name="validationTargets">Targets used to measure the validation error</param>
/// <param name="metric">The metric to use for early stopping</param>
/// <param name="earlyStoppingRounds">This controls how often the validation error is checked to estimate the best number of iterations.</param>
/// <returns>RegressionGradientBoostModel with early stopping.
/// The number of iterations will equal the number of trees in the model</returns>
public RegressionGradientBoostModel LearnWithEarlyStopping(
    F64Matrix trainingObservations, double[] trainingTargets,
    F64Matrix validationObservations, double[] validationTargets,
    IMetric<double, double> metric, int earlyStoppingRounds)
{
    if (earlyStoppingRounds >= m_iterations)
    {
        throw new ArgumentException("Number of iterations " + m_iterations +
            " is smaller than earlyStoppingRounds " + earlyStoppingRounds);
    }

    Checks.VerifyObservationsAndTargets(trainingObservations, trainingTargets);
    Checks.VerifyObservationsAndTargets(validationObservations, validationTargets);

    var rows = trainingObservations.RowCount;
    var orderedElements = CreateOrderedElements(trainingObservations, rows);

    // All training rows start in-sample; sub-sampling below replaces this per iteration.
    var inSample = trainingTargets.Select(t => false).ToArray();
    var indices = Enumerable.Range(0, trainingTargets.Length).ToArray();
    indices.ForEach(i => inSample[i] = true);
    var workIndices = indices.ToArray();

    var trees = new GBMTree[m_iterations];

    var initialLoss = m_loss.InitialLoss(trainingTargets, inSample);
    var predictions = trainingTargets.Select(t => initialLoss).ToArray();
    var residuals = new double[trainingTargets.Length];

    var bestIterationCount = 0;
    var currentBestError = double.MaxValue; // fixed: identifier was misspelled "currentBedstError"

    var predictWork = new double[trainingObservations.RowCount];

    for (int iteration = 0; iteration < m_iterations; iteration++)
    {
        m_loss.UpdateResiduals(trainingTargets, predictions, residuals, inSample);

        // (A dead initial assignment to sampleSize was removed; the value is only
        // needed when sub-sampling is enabled.)
        if (m_subSampleRatio != 1.0)
        {
            var sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
            var currentInSample = Sample(sampleSize, workIndices, trainingTargets.Length);

            trees[iteration] = m_learner.Learn(trainingObservations, trainingTargets,
                residuals, predictions, orderedElements, currentInSample);
        }
        else
        {
            trees[iteration] = m_learner.Learn(trainingObservations, trainingTargets,
                residuals, predictions, orderedElements, inSample);
        }

        trees[iteration].Predict(trainingObservations, predictWork);
        for (int i = 0; i < predictWork.Length; i++)
        {
            predictions[i] += m_learningRate * predictWork[i];
        }

        // Check the validation error every earlyStoppingRounds iterations and
        // remember the iteration count with the lowest error seen so far.
        if ((iteration % earlyStoppingRounds) == 0)
        {
            // Fixed off-by-one: Take(iteration + 1) includes the tree trained in this
            // iteration. Previously Take(iteration) evaluated an empty ensemble at
            // iteration 0 and the most recent tree was never part of any checkpoint.
            // The tree count now matches the 1-based iteration number traced below.
            var model = new RegressionGradientBoostModel(
                trees.Take(iteration + 1).ToArray(),
                m_learningRate, initialLoss, trainingObservations.ColumnCount);

            var validPredictions = model.Predict(validationObservations);
            var error = metric.Error(validationTargets, validPredictions);

            Trace.WriteLine("Iteration " + (iteration + 1) + " Validation Error: " + error);

            if (currentBestError > error)
            {
                currentBestError = error;
                bestIterationCount = iteration + 1;
            }
        }
    }

    return new RegressionGradientBoostModel(
        trees.Take(bestIterationCount).ToArray(),
        m_learningRate, initialLoss, trainingObservations.ColumnCount);
}
/// <summary>
/// Learns a ClassificationGradientBoostModel with early stopping.
/// Every earlyStoppingRounds iterations the ensemble built so far is evaluated on the
/// validation set, and the iteration count with the lowest validation error is tracked.
/// All m_iterations trees are trained; the returned model contains only the trees from
/// the best-scoring checkpoint, so the number of iterations used equals the number of
/// trees in the resulting model.
/// The method used for early stopping is based on the article:
/// http://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf
/// </summary>
/// <param name="trainingObservations">Observations used for training</param>
/// <param name="trainingTargets">Targets used for training</param>
/// <param name="validationObservations">Observations used to measure the validation error</param>
/// <param name="validationTargets">Targets used to measure the validation error</param>
/// <param name="metric">The metric to use for early stopping</param>
/// <param name="earlyStoppingRounds">This controls how often the validation error is checked to estimate the best number of iterations</param>
/// <returns>ClassificationGradientBoostModel with early stopping.
/// The number of iterations will equal the number of trees in the model</returns>
public ClassificationGradientBoostModel LearnWithEarlyStopping(
    F64Matrix trainingObservations, double[] trainingTargets,
    F64Matrix validationObservations, double[] validationTargets,
    IMetric<double, ProbabilityPrediction> metric, int earlyStoppingRounds)
{
    if (earlyStoppingRounds >= m_iterations)
    {
        throw new ArgumentException("Number of iterations " + m_iterations +
            " is smaller than earlyStoppingRounds " + earlyStoppingRounds);
    }

    // NOTE(review): unlike the regression overload, this method does not call
    // Checks.VerifyObservationsAndTargets on its inputs - consider aligning the two.

    var rows = trainingObservations.RowCount;
    var orderedElements = CreateOrderedElements(trainingObservations, rows);

    // All training rows start in-sample; sub-sampling below replaces this per iteration.
    var inSample = trainingTargets.Select(t => false).ToArray();
    var indices = Enumerable.Range(0, trainingTargets.Length).ToArray();
    indices.ForEach(i => inSample[i] = true);
    var workIndices = indices.ToArray();

    var uniqueTargets = trainingTargets.Distinct().OrderBy(v => v).ToArray();
    var initialLoss = m_loss.InitialLoss(trainingTargets, inSample);

    double[][] oneVsAllTargets = null;
    double[][] predictions = null;
    double[][] residuals = null;
    GBMTree[][] trees = null;

    if (uniqueTargets.Length == 2)
    {
        // Binary case - only need to fit to one class and use (1.0 - probability).
        trees = new GBMTree[][] { new GBMTree[m_iterations] };
        predictions = new double[][] { trainingTargets.Select(_ => initialLoss).ToArray() };
        residuals = new double[][] { new double[trainingTargets.Length] };

        oneVsAllTargets = new double[1][];
        var target = uniqueTargets[0];
        oneVsAllTargets[0] = trainingTargets.Select(t => t == target ? 1.0 : 0.0).ToArray();
    }
    else
    {
        // Multi-class case - use one-vs-all strategy and fit a probability for each class.
        trees = new GBMTree[uniqueTargets.Length][];
        predictions = uniqueTargets.Select(_ => trainingTargets.Select(t => initialLoss).ToArray())
            .ToArray();
        residuals = uniqueTargets.Select(_ => new double[trainingTargets.Length])
            .ToArray();

        oneVsAllTargets = new double[uniqueTargets.Length][];
        for (int i = 0; i < uniqueTargets.Length; i++)
        {
            var target = uniqueTargets[i];
            oneVsAllTargets[i] = trainingTargets.Select(t => t == target ? 1.0 : 0.0).ToArray();
            trees[i] = new GBMTree[m_iterations];
        }
    }

    var bestIterationCount = 0;
    var currentBestError = double.MaxValue; // fixed: identifier was misspelled "currentBedstError"

    for (int iteration = 0; iteration < m_iterations; iteration++)
    {
        for (int itarget = 0; itarget < trees.Length; itarget++)
        {
            m_loss.UpdateResiduals(oneVsAllTargets[itarget], predictions[itarget],
                residuals[itarget], inSample);

            // (A dead initial assignment to sampleSize was removed; the value is only
            // needed when sub-sampling is enabled.)
            if (m_subSampleRatio != 1.0)
            {
                var sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
                var currentInSample = Sample(sampleSize, workIndices, trainingTargets.Length);

                trees[itarget][iteration] = m_learner.Learn(trainingObservations, oneVsAllTargets[itarget],
                    residuals[itarget], predictions[itarget], orderedElements, currentInSample);
            }
            else
            {
                trees[itarget][iteration] = m_learner.Learn(trainingObservations, oneVsAllTargets[itarget],
                    residuals[itarget], predictions[itarget], orderedElements, inSample);
            }

            var predict = trees[itarget][iteration].Predict(trainingObservations);
            for (int i = 0; i < predict.Length; i++)
            {
                predictions[itarget][i] += m_learningRate * predict[i];
            }
        }

        // Check the validation error every earlyStoppingRounds iterations and
        // remember the iteration count with the lowest error seen so far.
        if (iteration % earlyStoppingRounds == 0)
        {
            // Fixed off-by-one: Take(iteration + 1) includes the trees trained in this
            // iteration. Previously Take(iteration) evaluated an empty ensemble at
            // iteration 0 and the most recent trees were never part of any checkpoint.
            var model = new ClassificationGradientBoostModel(
                trees.Select(t => t.Take(iteration + 1).ToArray()).ToArray(),
                uniqueTargets, m_learningRate, initialLoss,
                trainingObservations.ColumnCount);

            var validPredictions = model.PredictProbability(validationObservations);
            var error = metric.Error(validationTargets, validPredictions);

            Trace.WriteLine("Iteration " + (iteration + 1) + " Validation Error: " + error);

            // ">=" keeps the latest checkpoint on ties, as in the original
            // (the regression overload uses ">").
            if (currentBestError >= error)
            {
                currentBestError = error;
                bestIterationCount = iteration + 1;
            }
        }
    }

    return new ClassificationGradientBoostModel(
        trees.Select(t => t.Take(bestIterationCount).ToArray()).ToArray(),
        uniqueTargets, m_learningRate, initialLoss,
        trainingObservations.ColumnCount);
}