/// <summary>
        /// Computes one learning-curve point per configured sample percentage.
        /// For each percentage a subset of the training indices is sampled, a model is
        /// learned, and training/validation errors are averaged over the configured
        /// number of shuffles.
        /// </summary>
        /// <param name="learner">Learner used to fit a model for every sampled subset</param>
        /// <param name="observations">Feature matrix</param>
        /// <param name="targets">Target values</param>
        /// <param name="trainingIndices">Indices that should be used for training</param>
        /// <param name="validationIndices">Indices that should be used for validation</param>
        /// <returns>List of learning curve points (sample size, training score, validation score)</returns>
        public List <LearningCurvePoint> Calculate(IIndexedLearner <TPrediction> learner,
                                                   F64Matrix observations, double[] targets, int[] trainingIndices, int[] validationIndices)
        {
            var curve = new List <LearningCurvePoint>();

            var validationTargets = targets.GetIndices(validationIndices);
            var validationPredictions = new TPrediction[validationTargets.Length];

            foreach (var percentage in m_samplePercentages)
            {
                if (percentage <= 0.0 || percentage > 1.0)
                {
                    throw new ArgumentException("Sample percentage must be larger than 0.0 and smaller than or equal to 1.0");
                }

                var sampleSize = (int)Math.Round(percentage * (double)trainingIndices.Length);
                if (sampleSize <= 0)
                {
                    throw new ArgumentException("Sample percentage " + percentage +
                                                " too small for training set size " + trainingIndices.Length);
                }

                var trainErrorSum = 0.0;
                var validationErrorSum = 0.0;
                var trainingPredictions = new TPrediction[sampleSize];

                for (int shuffle = 0; shuffle < m_numberOfShufflesPrSample; shuffle++)
                {
                    // Fresh random subset of the training indices for every shuffle.
                    var sampleIndices = m_indexedSampler.Sample(targets, sampleSize, trainingIndices);
                    var model = learner.Learn(observations, targets, sampleIndices);

                    for (int row = 0; row < sampleSize; row++)
                    {
                        trainingPredictions[row] = model.Predict(observations.Row(sampleIndices[row]));
                    }

                    for (int row = 0; row < validationIndices.Length; row++)
                    {
                        validationPredictions[row] = model.Predict(observations.Row(validationIndices[row]));
                    }

                    trainErrorSum      += m_metric.Error(targets.GetIndices(sampleIndices), trainingPredictions);
                    validationErrorSum += m_metric.Error(validationTargets, validationPredictions);

                    ModelDisposer.DisposeIfDisposable(model);
                }

                // Average the errors over the shuffles for this sample size.
                curve.Add(new LearningCurvePoint(sampleSize,
                    trainErrorSum / m_numberOfShufflesPrSample,
                    validationErrorSum / m_numberOfShufflesPrSample));
            }

            return curve;
        }
        /// <summary>
        /// Greedily evaluates each remaining model as an addition to the current selection
        /// and permanently adds the one that lowers the combined ensemble error the most.
        /// If no candidate improves on currentBestError, the selection is left unchanged.
        /// </summary>
        /// <returns>The (possibly unchanged) best error</returns>
        double SelectNextModelToAdd(ProbabilityPrediction[][] crossValidatedModelPredictions,
            double[] targets,
            double currentBestError)
        {
            var candidateCount = m_selectedModelIndices.Count + 1;
            var rowCount = crossValidatedModelPredictions.First().Length;

            var candidateMatrix = new ProbabilityPrediction[candidateCount][];
            var combined = new ProbabilityPrediction[rowCount];
            var candidateIndices = new int[candidateCount];

            var bestError = currentBestError;
            var bestModel = -1;

            foreach (var candidate in m_remainingModelIndices)
            {
                // Current selection plus the candidate in the last slot.
                m_selectedModelIndices.CopyTo(candidateIndices);
                candidateIndices[candidateCount - 1] = candidate;

                for (int i = 0; i < candidateCount; i++)
                {
                    candidateMatrix[i] = crossValidatedModelPredictions[candidateIndices[i]];
                }

                m_ensembleStrategy.Combine(candidateMatrix, combined);
                var error = m_metric.Error(targets, combined);

                if (error < bestError)
                {
                    bestError = error;
                    bestModel = candidate;
                }
            }

            if (bestModel != -1)
            {
                m_selectedModelIndices.Add(bestModel);

                if (!m_selectWithReplacement)
                {
                    m_remainingModelIndices.Remove(bestModel);
                }
            }

            return bestError;
        }
        /// <summary>
        /// Greedily evaluates each remaining model as an addition to the current selection
        /// and permanently adds the one that lowers the combined ensemble error the most.
        /// If no candidate improves on currentBestError, the selection is left unchanged.
        /// </summary>
        /// <returns>The (possibly unchanged) best error</returns>
        double SelectNextModelToAdd(F64Matrix crossValidatedModelPredictions, double[] targets, double currentBestError)
        {
            var candidateCount = m_selectedModelIndices.Count + 1;
            var rowCount = crossValidatedModelPredictions.RowCount;

            var candidateMatrix = new F64Matrix(rowCount, candidateCount);
            var combined = new double[rowCount];
            var candidateIndices = new int[candidateCount];

            var bestError = currentBestError;
            var bestModel = -1;

            foreach (var candidate in m_remainingModelIndices)
            {
                // Current selection plus the candidate in the last slot.
                m_selectedModelIndices.CopyTo(candidateIndices);
                candidateIndices[candidateCount - 1] = candidate;

                crossValidatedModelPredictions.Columns(candidateIndices, candidateMatrix);

                m_ensembleStrategy.Combine(candidateMatrix, combined);
                var error = m_metric.Error(targets, combined);

                if (error < bestError)
                {
                    bestError = error;
                    bestModel = candidate;
                }
            }

            if (bestModel != -1)
            {
                m_selectedModelIndices.Add(bestModel);

                if (!m_selectWithReplacement)
                {
                    m_remainingModelIndices.Remove(bestModel);
                }
            }

            return bestError;
        }
        /// <summary>
        /// Evaluates removing each remaining model in turn and removes the one whose
        /// absence yields the lowest combined ensemble error.
        /// NOTE(review): the search starts from currentBestError, so when no removal
        /// improves on it the removal index stays -1 and Remove(-1) is a no-op — unlike
        /// the F64Matrix overload, which starts from double.MaxValue and always removes
        /// a model. Confirm which behavior is intended.
        /// </summary>
        /// <returns>The (possibly unchanged) best error</returns>
        double SelectNextModelToRemove(ProbabilityPrediction[][] crossValidatedModelPredictions,
                                       double[] targets,
                                       double currentBestError)
        {
            var candidateCount = m_remainingModelIndices.Count - 1;
            var rowCount = crossValidatedModelPredictions.First().Length;

            var candidateMatrix = new ProbabilityPrediction[candidateCount][];
            var combined = new ProbabilityPrediction[rowCount];
            var candidateIndices = new int[candidateCount];

            var bestError = currentBestError;
            var removalCandidate = -1;

            foreach (var excluded in m_remainingModelIndices)
            {
                // Index set of every remaining model except the excluded one.
                var write = 0;
                foreach (var remaining in m_remainingModelIndices)
                {
                    if (remaining != excluded)
                    {
                        candidateIndices[write++] = remaining;
                    }
                }

                for (int i = 0; i < candidateCount; i++)
                {
                    candidateMatrix[i] = crossValidatedModelPredictions[candidateIndices[i]];
                }

                m_ensembleStrategy.Combine(candidateMatrix, combined);
                var error = m_metric.Error(targets, combined);

                if (error < bestError)
                {
                    bestError = error;
                    removalCandidate = excluded;
                }
            }

            m_remainingModelIndices.Remove(removalCandidate);

            return bestError;
        }
        /// <summary>
        /// Iterative random selection of ensemble models: repeatedly draws a random set
        /// of model indices and keeps the set with the lowest combined error.
        /// (Summary fixed — previous doc said "greedy forward selection", but the body
        /// samples random index sets via SelectNextRandomIndices.)
        /// </summary>
        /// <param name="crossValidatedModelPredictions">cross validated predictions from multiple models.
        /// Each row in the matrix corresponds to predictions from a separate model</param>
        /// <param name="targets">Corresponding targets</param>
        /// <returns>The indices of the selected models</returns>
        public int[] Select(ProbabilityPrediction[][] crossValidatedModelPredictions, double[] targets)
        {
            if (crossValidatedModelPredictions.Length < m_numberOfModelsToSelect)
            {
                // Message typo fixed ("Availible" -> "Available") to match the F64Matrix overload.
                throw new ArgumentException("Available models: " + crossValidatedModelPredictions.Length +
                                            " is smaller than number of models to select: " + m_numberOfModelsToSelect);
            }

            m_allIndices = Enumerable.Range(0, crossValidatedModelPredictions.Length).ToArray();

            var rows = crossValidatedModelPredictions.First().Length;
            var candidateModelMatrix  = new ProbabilityPrediction[m_numberOfModelsToSelect][];
            var candidatePredictions  = new ProbabilityPrediction[rows];
            var candidateModelIndices = new int[m_numberOfModelsToSelect];
            var bestModelIndices      = new int[m_numberOfModelsToSelect];

            var bestError = double.MaxValue;

            for (int i = 0; i < m_iterations; i++)
            {
                // Draw a fresh random candidate set and score the combined ensemble.
                SelectNextRandomIndices(candidateModelIndices);

                for (int j = 0; j < candidateModelIndices.Length; j++)
                {
                    candidateModelMatrix[j] = crossValidatedModelPredictions[candidateModelIndices[j]];
                }

                m_ensembleStrategy.Combine(candidateModelMatrix, candidatePredictions);
                var error = m_metric.Error(targets, candidatePredictions);

                if (error < bestError)
                {
                    bestError = error;
                    candidateModelIndices.CopyTo(bestModelIndices, 0);
                    Trace.WriteLine("Models selected: " + bestModelIndices.Length + ": " + error);
                }
            }

            // bestModelIndices is already an array; the redundant ToArray() copy was removed.
            Trace.WriteLine("Selected model indices: " + string.Join(", ", bestModelIndices));

            return bestModelIndices;
        }
        /// <summary>
        /// Evaluates removing each remaining model in turn and removes the one whose
        /// absence yields the lowest combined ensemble error.
        /// NOTE(review): bestError starts from double.MaxValue here, so the
        /// currentBestError parameter is never read and one model is always removed.
        /// The ProbabilityPrediction overload instead starts from currentBestError and
        /// may remove nothing — confirm which behavior is intended.
        /// </summary>
        /// <returns>The best error found among the removal candidates</returns>
        double SelectNextModelToRemove(F64Matrix crossValidatedModelPredictions,
                                       double[] targets,
                                       double currentBestError)
        {
            var candidateCount = m_remainingModelIndices.Count - 1;
            var rowCount = crossValidatedModelPredictions.RowCount;

            var candidateMatrix = new F64Matrix(rowCount, candidateCount);
            var combined = new double[rowCount];
            var candidateIndices = new int[candidateCount];

            var bestError = double.MaxValue;
            var removalCandidate = -1;

            foreach (var excluded in m_remainingModelIndices)
            {
                // Index set of every remaining model except the excluded one.
                var write = 0;
                foreach (var remaining in m_remainingModelIndices)
                {
                    if (remaining != excluded)
                    {
                        candidateIndices[write++] = remaining;
                    }
                }

                crossValidatedModelPredictions.Columns(candidateIndices, candidateMatrix);

                m_ensembleStrategy.Combine(candidateMatrix, combined);
                var error = m_metric.Error(targets, combined);

                if (error < bestError)
                {
                    bestError = error;
                    removalCandidate = excluded;
                }
            }

            m_remainingModelIndices.Remove(removalCandidate);

            return bestError;
        }
        // Exemple #7 (0)  -- scraping residue, commented out so the file parses
        /// <summary>
        /// Iterative random selection of ensemble models: repeatedly draws a random set
        /// of model indices and keeps the set with the lowest combined error.
        /// </summary>
        /// <param name="crossValidatedModelPredictions">cross validated predictions from multiple models.
        /// Each column in the matrix corresponds to predictions from a separate model</param>
        /// <param name="targets">Corresponding targets</param>
        /// <returns>The indices of the selected model</returns>
        public int[] Select(F64Matrix crossValidatedModelPredictions, double[] targets)
        {
            var availableModels = crossValidatedModelPredictions.ColumnCount;
            if (availableModels < m_numberOfModelsToSelect)
            {
                throw new ArgumentException("Available models: " + availableModels +
                                            " is smaller than number of models to select: " + m_numberOfModelsToSelect);
            }

            m_allIndices = Enumerable.Range(0, availableModels).ToArray();

            var rowCount = crossValidatedModelPredictions.RowCount;
            var bestIndices = new int[m_numberOfModelsToSelect];
            var candidateIndices = new int[m_numberOfModelsToSelect];
            var candidateMatrix = new F64Matrix(rowCount, m_numberOfModelsToSelect);
            var combined = new double[rowCount];

            var bestError = double.MaxValue;

            for (int iteration = 0; iteration < m_iterations; iteration++)
            {
                // Draw a fresh random candidate set and score the combined ensemble.
                SelectNextRandomIndices(candidateIndices);
                crossValidatedModelPredictions.Columns(candidateIndices, candidateMatrix);

                m_ensembleStrategy.Combine(candidateMatrix, combined);
                var error = m_metric.Error(targets, combined);

                if (error < bestError)
                {
                    bestError = error;
                    candidateIndices.CopyTo(bestIndices, 0);
                    Trace.WriteLine("Models selected: " + bestIndices.Length + ": " + error);
                }
            }

            Trace.WriteLine("Selected model indices: " + string.Join(", ", bestIndices));

            return bestIndices;
        }
        // Exemple #8 (0)  -- scraping residue, commented out so the file parses
        /// <summary>
        /// Learns a RegressionGradientBoostModel with early stopping.
        /// The parameter earlyStoppingRounds controls how often the validation error is measured.
        /// If the validation error has increased, the learning is stopped and the model with the best number of iterations (trees) is returned.
        /// The number of iterations used is equal to the number of trees in the resulting model.
        /// The method used for early stopping is based on the article:
        /// http://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf
        /// </summary>
        /// <param name="trainingObservations"></param>
        /// <param name="trainingTargets"></param>
        /// <param name="validationObservations"></param>
        /// <param name="validationTargets"></param>
        /// <param name="metric">The metric to use for early stopping</param>
        /// <param name="earlyStoppingRounds">This controls how often the validation error is checked to estimate the best number of iterations.</param>
        /// <returns>RegressionGradientBoostModel with early stopping. The number of iterations will equal the number of trees in the model</returns>
        public RegressionGradientBoostModel LearnWithEarlyStopping(
            F64Matrix trainingObservations,
            double[] trainingTargets,
            F64Matrix validationObservations,
            double[] validationTargets,
            IMetric <double, double> metric,
            int earlyStoppingRounds)
        {
            if (earlyStoppingRounds >= m_iterations)
            {
                throw new ArgumentException("Number of iterations " + m_iterations +
                                            " is smaller than earlyStoppingRounds " + earlyStoppingRounds);
            }

            Checks.VerifyObservationsAndTargets(trainingObservations, trainingTargets);
            Checks.VerifyObservationsAndTargets(validationObservations, validationTargets);

            var rows            = trainingObservations.RowCount;
            var orderedElements = CreateOrderedElements(trainingObservations, rows);

            // All training rows participate; inSample marks row membership for the loss/learner.
            var inSample = trainingTargets.Select(t => false).ToArray();
            var indices  = Enumerable.Range(0, trainingTargets.Length).ToArray();

            indices.ForEach(i => inSample[i] = true);
            var workIndices = indices.ToArray();

            var trees = new GBMTree[m_iterations];

            // Every prediction starts at the loss function's initial value.
            var initialLoss = m_loss.InitialLoss(trainingTargets, inSample);
            var predictions = trainingTargets.Select(t => initialLoss).ToArray();
            var residuals   = new double[trainingTargets.Length];

            var bestIterationCount = 0;
            var currentBestError   = double.MaxValue; // renamed from "currentBedstError" (typo)

            var predictWork = new double[trainingObservations.RowCount];

            for (int iteration = 0; iteration < m_iterations; iteration++)
            {
                m_loss.UpdateResiduals(trainingTargets, predictions, residuals, inSample);

                var sampleSize = trainingTargets.Length;
                if (m_subSampleRatio != 1.0)
                {
                    // Stochastic gradient boosting: fit this tree on a random subsample.
                    sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
                    var currentInSample = Sample(sampleSize, workIndices, trainingTargets.Length);

                    trees[iteration] = m_learner.Learn(trainingObservations, trainingTargets, residuals,
                                                       predictions, orderedElements, currentInSample);
                }
                else
                {
                    trees[iteration] = m_learner.Learn(trainingObservations, trainingTargets, residuals,
                                                       predictions, orderedElements, inSample);
                }

                // Accumulate the new tree's learning-rate-scaled contribution.
                trees[iteration].Predict(trainingObservations, predictWork);
                for (int i = 0; i < predictWork.Length; i++)
                {
                    predictions[i] += m_learningRate * predictWork[i];
                }

                // When using early stopping, Check that the validation error is not increasing between earlyStoppingRounds
                // If the validation error has increased, stop the learning and return the model with the best number of iterations (trees).
                if ((iteration % earlyStoppingRounds) == 0)
                {
                    // NOTE(review): Take(iteration) evaluates the model *without* the tree fitted
                    // this round (an empty model at iteration 0) — confirm the off-by-one is intended.
                    var model = new RegressionGradientBoostModel(trees.Take(iteration).ToArray(),
                                                                 m_learningRate, initialLoss, trainingObservations.ColumnCount);

                    var validPredictions = model.Predict(validationObservations);
                    var error            = metric.Error(validationTargets, validPredictions);

                    Trace.WriteLine("Iteration " + (iteration + 1) + " Validation Error: " + error);

                    if (currentBestError > error)
                    {
                        currentBestError   = error;
                        bestIterationCount = iteration;
                    }
                }
            }

            // NOTE(review): despite the doc, the loop never breaks early; all m_iterations
            // trees are fitted and the best-scoring prefix is returned.
            return new RegressionGradientBoostModel(trees.Take(bestIterationCount).ToArray(),
                                                    m_learningRate, initialLoss, trainingObservations.ColumnCount);
        }
        // Exemple #9 (0)  -- scraping residue, commented out so the file parses
        /// <summary>
        /// Learns a ClassificationGradientBoostModel with early stopping.
        /// The parameter earlyStoppingRounds controls how often the validation error is measured.
        /// If the validation error has increased, the learning is stopped and the model with the best number of iterations (trees) is returned.
        /// The number of iterations used is equal to the number of trees in the resulting model.
        /// The method used for early stopping is based on the article:
        /// http://page.mi.fu-berlin.de/prechelt/Biblio/stop_tricks1997.pdf
        /// </summary>
        /// <param name="trainingObservations"></param>
        /// <param name="trainingTargets"></param>
        /// <param name="validationObservations"></param>
        /// <param name="validationTargets"></param>
        /// <param name="metric">The metric to use for early stopping</param>
        /// <param name="earlyStoppingRounds">This controls how often the validation error is checked to estimate the best number of iterations</param>
        /// <returns>ClassificationGradientBoostModel with early stopping. The number of iterations will equal the number of trees in the model</returns>
        public ClassificationGradientBoostModel LearnWithEarlyStopping(
            F64Matrix trainingObservations,
            double[] trainingTargets,
            F64Matrix validationObservations,
            double[] validationTargets,
            IMetric <double, ProbabilityPrediction> metric,
            int earlyStoppingRounds)
        {
            if (earlyStoppingRounds >= m_iterations)
            {
                throw new ArgumentException("Number of iterations " + m_iterations +
                                            " is smaller than earlyStoppingRounds " + earlyStoppingRounds);
            }

            // NOTE(review): unlike the regression overload, observations/targets are not
            // validated here (Checks.VerifyObservationsAndTargets) — confirm if intentional.
            var rows            = trainingObservations.RowCount;
            var orderedElements = CreateOrderedElements(trainingObservations, rows);

            // All training rows participate; inSample marks row membership for the loss/learner.
            var inSample = trainingTargets.Select(t => false).ToArray();
            var indices  = Enumerable.Range(0, trainingTargets.Length).ToArray();

            indices.ForEach(i => inSample[i] = true);
            var workIndices = indices.ToArray();

            var uniqueTargets = trainingTargets.Distinct().OrderBy(v => v).ToArray();
            var initialLoss   = m_loss.InitialLoss(trainingTargets, inSample);

            // One entry per fitted class: binary fits a single class, multi-class fits one-vs-all.
            double[][]  oneVsAllTargets = null;
            double[][]  predictions     = null;
            double[][]  residuals       = null;
            GBMTree[][] trees           = null;

            if (uniqueTargets.Length == 2) // Binary case - only need to fit to one class and use (1.0 - probability)
            {
                trees       = new GBMTree[][] { new GBMTree[m_iterations] };
                predictions = new double[][] { trainingTargets.Select(_ => initialLoss).ToArray() };
                residuals   = new double[][] { new double[trainingTargets.Length] };

                oneVsAllTargets = new double[1][];
                var target = uniqueTargets[0];
                oneVsAllTargets[0] = trainingTargets.Select(t => t == target ? 1.0 : 0.0).ToArray();
            }
            else // multi-class case - use oneVsAll strategy and fit probability for each class
            {
                trees       = new GBMTree[uniqueTargets.Length][];
                predictions = uniqueTargets.Select(_ => trainingTargets.Select(t => initialLoss).ToArray())
                              .ToArray();
                residuals = uniqueTargets.Select(_ => new double[trainingTargets.Length])
                            .ToArray();

                oneVsAllTargets = new double[uniqueTargets.Length][];
                for (int i = 0; i < uniqueTargets.Length; i++)
                {
                    var target = uniqueTargets[i];
                    oneVsAllTargets[i] = trainingTargets.Select(t => t == target ? 1.0 : 0.0).ToArray();
                    trees[i]           = new GBMTree[m_iterations];
                }
            }

            var bestIterationCount = 0;
            var currentBestError   = double.MaxValue; // renamed from "currentBedstError" (typo)

            for (int iteration = 0; iteration < m_iterations; iteration++)
            {
                // Fit one tree per one-vs-all target each boosting round.
                for (int itarget = 0; itarget < trees.Length; itarget++)
                {
                    m_loss.UpdateResiduals(oneVsAllTargets[itarget], predictions[itarget],
                                           residuals[itarget], inSample);

                    var sampleSize = trainingTargets.Length;
                    if (m_subSampleRatio != 1.0)
                    {
                        // Stochastic gradient boosting: fit this tree on a random subsample.
                        sampleSize = (int)Math.Round(m_subSampleRatio * workIndices.Length);
                        var currentInSample = Sample(sampleSize, workIndices, trainingTargets.Length);

                        trees[itarget][iteration] = m_learner.Learn(trainingObservations, oneVsAllTargets[itarget],
                                                                    residuals[itarget], predictions[itarget], orderedElements, currentInSample);
                    }
                    else
                    {
                        trees[itarget][iteration] = m_learner.Learn(trainingObservations, oneVsAllTargets[itarget],
                                                                    residuals[itarget], predictions[itarget], orderedElements, inSample);
                    }

                    // Accumulate the new tree's learning-rate-scaled contribution.
                    var predict = trees[itarget][iteration].Predict(trainingObservations);
                    for (int i = 0; i < predict.Length; i++)
                    {
                        predictions[itarget][i] += m_learningRate * predict[i];
                    }
                }

                // When using early stopping, Check that the validation error is not increasing between earlyStoppingRounds
                // If the validation error has increased, stop the learning and return the model with the best number of iterations (trees).
                if (iteration % earlyStoppingRounds == 0)
                {
                    // NOTE(review): Take(iteration) evaluates the model *without* this round's trees
                    // (an empty model at iteration 0) — confirm the off-by-one is intended.
                    var model = new ClassificationGradientBoostModel(
                        trees.Select(t => t.Take(iteration).ToArray()).ToArray(),
                        uniqueTargets, m_learningRate, initialLoss, trainingObservations.ColumnCount);

                    var validPredictions = model.PredictProbability(validationObservations);
                    var error            = metric.Error(validationTargets, validPredictions);

                    Trace.WriteLine("Iteration " + (iteration + 1) + " Validation Error: " + error);

                    // NOTE(review): uses >= here while the regression overload uses > —
                    // ties therefore prefer the later iteration; confirm this is intended.
                    if (currentBestError >= error)
                    {
                        currentBestError   = error;
                        bestIterationCount = iteration;
                    }
                }
            }

            // NOTE(review): despite the doc, the loop never breaks early; all m_iterations
            // rounds run and the best-scoring prefix of trees is returned.
            return new ClassificationGradientBoostModel(
                       trees.Select(t => t.Take(bestIterationCount).ToArray()).ToArray(),
                       uniqueTargets, m_learningRate, initialLoss, trainingObservations.ColumnCount);
        }