        public void RegressionDecisionTreeLearner_Learn_Reuse_No_Valid_Split()
        {
            var parser       = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
            var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
            var targets      = parser.EnumerateRows("T").ToF64Vector();
            var rows         = targets.Length;

            var sut = new RegressionDecisionTreeLearner();

            // train initial model.
            sut.Learn(observations, targets);

            // reuse the learner with a smaller dataset that provides no valid split.
            var onlyUniqueTargetValue     = 1.0;
            var onlyOneUniqueObservations = (F64Matrix)observations.Rows(0, 1, 2, 3, 4);
            var onlyOneUniqueTargets      = Enumerable.Range(0, onlyOneUniqueObservations.RowCount).Select(v => onlyUniqueTargetValue).ToArray();
            var model = sut.Learn(onlyOneUniqueObservations, onlyOneUniqueTargets);

            var predictions = model.Predict(onlyOneUniqueObservations);

            // no valid split, so the model should always return the onlyUniqueTargetValue.
            for (int i = 0; i < predictions.Length; i++)
            {
                Assert.AreEqual(onlyUniqueTargetValue, predictions[i], 0.0001);
            }
        }
        public void Hyper_Parameter_Tuning()
        {
            #region Read data

            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read classification targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            #endregion

            // metric to minimize
            var metric = new MeanSquaredErrorRegressionMetric();

            // Parameter ranges for the optimizer
            var parameters = new ParameterBounds[]
            {
                new ParameterBounds(min: 1, max: 100, transform: Transform.Linear), // maximumTreeDepth
                new ParameterBounds(min: 1, max: 16, transform: Transform.Linear),  // minimumSplitSize
            };

            // create random search optimizer
            var optimizer = new RandomSearchOptimizer(parameters, iterations: 30, runParallel: true);

            // other available optimizers
            // GridSearchOptimizer
            // GlobalizedBoundedNelderMeadOptimizer
            // ParticleSwarmOptimizer
            // BayesianOptimizer

            // function to minimize
            Func <double[], OptimizerResult> minimize = p =>
            {
                var cv          = new RandomCrossValidation <double>(crossValidationFolds: 5, seed: 42);
                var optlearner  = new RegressionDecisionTreeLearner(maximumTreeDepth: (int)p[0], minimumSplitSize: (int)p[1]);
                var predictions = cv.CrossValidate(optlearner, observations, targets);
                var error       = metric.Error(targets, predictions);
                Trace.WriteLine("Error: " + error);
                return(new OptimizerResult(p, error));
            };

            // run optimizer
            var result         = optimizer.OptimizeBest(minimize);
            var bestParameters = result.ParameterSet;

            Trace.WriteLine("Result: " + result.Error);

            // create learner with found parameters
            var learner = new RegressionDecisionTreeLearner(maximumTreeDepth: (int)bestParameters[0], minimumSplitSize: (int)bestParameters[1]);

            // learn model with found parameters
            var model = learner.Learn(observations, targets);
        }
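        // Sketch (not part of the original samples): validating the tuned parameters on a
        // held-out test set, reusing only types already shown in this file. The
        // bestParameters values below are placeholders standing in for the result of
        // optimizer.OptimizeBest(minimize) from Hyper_Parameter_Tuning above.
        public void Hyper_Parameter_Tuning_HoldOut_Validation_Sketch()
        {
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            var observations = parser.EnumerateRows(c => c != targetName).ToF64Matrix();
            var targets      = parser.EnumerateRows(targetName).ToF64Vector();

            // split before tuning so the test set stays unseen during parameter selection
            var splitter = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24);
            var split    = splitter.SplitSet(observations, targets);

            // placeholder values; in practice these come from result.ParameterSet
            var bestParameters = new double[] { 10, 4 };

            var learner = new RegressionDecisionTreeLearner(maximumTreeDepth: (int)bestParameters[0], minimumSplitSize: (int)bestParameters[1]);
            var model   = learner.Learn(split.TrainingSet.Observations, split.TrainingSet.Targets);

            var metric          = new MeanSquaredErrorRegressionMetric();
            var testPredictions = model.Predict(split.TestSet.Observations);

            Trace.WriteLine("Hold-out test error: " + metric.Error(split.TestSet.Targets, testPredictions));
        }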
        public void RegressionModel_Predict()
        {
            #region learner creation

            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read regression targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            // create learner
            var learner = new RegressionDecisionTreeLearner();
            #endregion

            // learns a RegressionDecisionTreeModel
            var model = learner.Learn(observations, targets);

            // predict all observations
            var predictions = model.Predict(observations);

            // predict single observation
            var prediction = model.Predict(observations.Row(0));
        }
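        // Follow-up sketch (not part of the original samples): measuring the in-sample
        // error of the predictions from RegressionModel_Predict with the metric used
        // elsewhere in this file. In-sample error is optimistic; see the train/test split
        // and cross-validation examples below for proper estimates.
        public void RegressionModel_Predict_Measure_Error_Sketch()
        {
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            var observations = parser.EnumerateRows(c => c != targetName).ToF64Matrix();
            var targets      = parser.EnumerateRows(targetName).ToF64Vector();

            var model       = new RegressionDecisionTreeLearner().Learn(observations, targets);
            var predictions = model.Predict(observations);

            var metric = new MeanSquaredErrorRegressionMetric();
            Trace.WriteLine("Training (in-sample) error: " + metric.Error(targets, predictions));
        }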
        public void RegressionDecisionTreeModel_Save()
        {
            var(observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var writer = new StringWriter();

            sut.Save(() => writer);

            Assert.AreEqual(m_regressionDecisionTreeModelString, writer.ToString());
        }
        public void RegressionDecisionTreeLearner_Learn_Reuse_No_Valid_Split()
        {
            var(observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var sut = new RegressionDecisionTreeLearner();

            // train initial model.
            sut.Learn(observations, targets);

            // reuse the learner with a smaller dataset that provides no valid split.
            var onlyUniqueTargetValue     = 1.0;
            var onlyOneUniqueObservations = (F64Matrix)observations.Rows(0, 1, 2, 3, 4);
            var onlyOneUniqueTargets      = Enumerable.Range(0, onlyOneUniqueObservations.RowCount).Select(v => onlyUniqueTargetValue).ToArray();
            var model = sut.Learn(onlyOneUniqueObservations, onlyOneUniqueTargets);

            var predictions = model.Predict(onlyOneUniqueObservations);

            // no valid split, so the model should always return the onlyUniqueTargetValue.
            for (int i = 0; i < predictions.Length; i++)
            {
                Assert.AreEqual(onlyUniqueTargetValue, predictions[i], 0.0001);
            }
        }
        public void RegressionDecisionTreeModel_Predict_Multiple()
        {
            var(observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var predictions = sut.Predict(observations);

            var evaluator = new MeanSquaredErrorRegressionMetric();
            var error     = evaluator.Error(targets, predictions);

            Assert.AreEqual(0.032120286249559482, error, 0.0000001);
        }
        private static double RegressionDecisionTreeLearner_Learn(int treeDepth)
        {
            var(observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var sut = new RegressionDecisionTreeLearner(treeDepth, 4, 2, 0.1, 42);

            var model = sut.Learn(observations, targets);

            var predictions = model.Predict(observations);

            var evaluator = new MeanSquaredErrorRegressionMetric();
            var error     = evaluator.Error(targets, predictions);

            return(error);
        }
        public void RegressionDecisionTreeModel_Save()
        {
            var parser       = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
            var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
            var targets      = parser.EnumerateRows("T").ToF64Vector();

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var writer = new StringWriter();

            sut.Save(() => writer);

            Assert.AreEqual(RegressionDecisionTreeModelString, writer.ToString());
        }
        private double RegressionDecisionTreeLearner_Learn_Weighted(int treeDepth, double weight)
        {
            var(observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var sut     = new RegressionDecisionTreeLearner(treeDepth, 4, 2, 0.1, 42);
            var weights = targets.Select(v => Weight(v, weight)).ToArray();
            var model   = sut.Learn(observations, targets, weights);

            var predictions = model.Predict(observations);

            var evaluator = new MeanSquaredErrorRegressionMetric();
            var error     = evaluator.Error(targets, predictions);

            return(error);
        }
        public void RegressionDecisionTreeModel_Predict_Multiple_Indexed()
        {
            var(observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var indices     = new int[] { 0, 3, 4, 5, 6, 7, 8, 9, 20, 21 };
            var predictions = sut.Predict(observations, indices);

            var indexedTargets = targets.GetIndices(indices);
            var evaluator      = new MeanSquaredErrorRegressionMetric();
            var error          = evaluator.Error(indexedTargets, predictions);

            Assert.AreEqual(0.023821615502626264, error, 0.0000001);
        }
        public void RegressionDecisionTreeModel_Predict_Multiple()
        {
            var parser       = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
            var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
            var targets      = parser.EnumerateRows("T").ToF64Vector();
            var rows         = targets.Length;

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var predictions = sut.Predict(observations);

            var evaluator = new MeanSquaredErrorRegressionMetric();
            var error     = evaluator.Error(targets, predictions);

            Assert.AreEqual(0.032120286249559482, error, 0.0000001);
        }
        RegressionDecisionTreeModel CreateTree(F64Matrix observations, double[] targets, int[] indices, Random random)
        {
            var learner = new RegressionDecisionTreeLearner(m_maximumTreeDepth, m_minimumSplitSize, m_featuresPrSplit,
                                                            m_minimumInformationGain, random.Next());

            var treeIndicesLength = (int)Math.Round(m_subSampleRatio * (double)indices.Length);
            var treeIndices       = new int[treeIndicesLength];

            for (int j = 0; j < treeIndicesLength; j++)
            {
                treeIndices[j] = indices[random.Next(indices.Length)];
            }

            var model = learner.Learn(observations, targets, treeIndices);

            return(model);
        }
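        // Hypothetical illustration (not part of the original class): driving a
        // CreateTree-style helper from a simple bagging loop and averaging the per-tree
        // predictions. The tree count and seed are arbitrary; the m_* fields used by
        // CreateTree are assumed to be configured elsewhere in the class.
        double[] PredictWithBaggedTrees_Sketch(F64Matrix observations, double[] targets, int trees)
        {
            var random  = new Random(42);
            var indices = Enumerable.Range(0, targets.Length).ToArray();

            var models = Enumerable.Range(0, trees)
                .Select(t => CreateTree(observations, targets, indices, random))
                .ToList();

            // average the per-tree predictions row by row
            var predictions = new double[targets.Length];
            for (int i = 0; i < predictions.Length; i++)
            {
                predictions[i] = models.Average(m => m.Predict(observations.Row(i)));
            }

            return(predictions);
        }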
        public void RegressionDecisionTreeModel_GetVariableImportance()
        {
            var(observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var featureNameToIndex = new Dictionary <string, int> {
                { "F1", 0 }, { "F2", 1 }
            };

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var actual   = sut.GetVariableImportance(featureNameToIndex);
            var expected = new Dictionary <string, double> {
                { "F2", 100.0 }, { "F1", 0.0 }
            };

            CollectionAssert.AreEqual(expected, actual);
        }
        private double RegressionDecisionTreeLearner_Learn_Weighted(int treeDepth, double weight)
        {
            var parser       = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
            var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
            var targets      = parser.EnumerateRows("T").ToF64Vector();
            var rows         = targets.Length;

            var sut     = new RegressionDecisionTreeLearner(treeDepth, 4, 2, 0.1, 42);
            var weights = targets.Select(v => Weight(v, weight)).ToArray();
            var model   = sut.Learn(observations, targets, weights);

            var predictions = model.Predict(observations);

            var evaluator = new MeanSquaredErrorRegressionMetric();
            var error     = evaluator.Error(targets, predictions);

            return(error);
        }
        public void TrainingTestSplitter_SplitSet()
        {
            #region Read data

            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix (all columns different from the targetName)
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            #endregion

            // creates the training/test splitter; observations are shuffled randomly
            var splitter = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24);

            var trainingTestSplit = splitter.SplitSet(observations, targets);
            var trainingSet       = trainingTestSplit.TrainingSet;
            var testSet           = trainingTestSplit.TestSet;

            var learner = new RegressionDecisionTreeLearner();
            var model   = learner.Learn(trainingSet.Observations, trainingSet.Targets);

            // predict test set
            var testPredictions = model.Predict(testSet.Observations);

            // metric for measuring model error
            var metric = new MeanSquaredErrorRegressionMetric();

            // The test set provides an estimate of how the model will perform on unseen data
            Trace.WriteLine("Test error: " + metric.Error(testSet.Targets, testPredictions));

            // predict training set for comparison
            var trainingPredictions = model.Predict(trainingSet.Observations);

            // The training set is NOT a good estimate of how well the model will perform on unseen data.
            Trace.WriteLine("Training error: " + metric.Error(trainingSet.Targets, trainingPredictions));
        }
        public void RegressionDecisionTreeModel_Predict_Multiple_Indexed()
        {
            var parser       = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
            var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
            var targets      = parser.EnumerateRows("T").ToF64Vector();
            var rows         = targets.Length;

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var indices     = new int[] { 0, 3, 4, 5, 6, 7, 8, 9, 20, 21 };
            var predictions = sut.Predict(observations, indices);

            var indexedTargets = targets.GetIndices(indices);
            var evaluator      = new MeanSquaredErrorRegressionMetric();
            var error          = evaluator.Error(indexedTargets, predictions);

            Assert.AreEqual(0.023821615502626264, error, 0.0000001);
        }
        public void RegressionDecisionTreeModel_Predict_Single()
        {
            var(observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var rows        = targets.Length;
            var predictions = new double[rows];

            for (int i = 0; i < rows; i++)
            {
                predictions[i] = sut.Predict(observations.Row(i));
            }

            var evaluator = new MeanSquaredErrorRegressionMetric();
            var error     = evaluator.Error(targets, predictions);

            Assert.AreEqual(0.032120286249559482, error, 0.0000001);
        }
        public void RegressionDecisionTreeModel_GetVariableImportance()
        {
            var parser             = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
            var observations       = parser.EnumerateRows("F1", "F2").ToF64Matrix();
            var targets            = parser.EnumerateRows("T").ToF64Vector();
            var rows               = targets.Length;
            var featureNameToIndex = new Dictionary <string, int> {
                { "F1", 0 }, { "F2", 1 }
            };

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var actual   = sut.GetVariableImportance(featureNameToIndex);
            var expected = new Dictionary <string, double> {
                { "F2", 100.0 }, { "F1", 0.0 }
            };

            CollectionAssert.AreEqual(expected, actual);
        }
        public void RegressionModel_FeatureImportance()
        {
            #region learner creation

            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read regression targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            // create learner
            var learner = new RegressionDecisionTreeLearner();
            #endregion

            // learns a RegressionDecisionTreeModel
            var model = learner.Learn(observations, targets);

            // raw feature importance
            var rawImportance = model.GetRawVariableImportance();

            // normalized and named feature importance
            var featureNameToIndex = parser.EnumerateRows(c => c != targetName).First().ColumnNameToIndex;
            var importance         = model.GetVariableImportance(featureNameToIndex);

            // trace normalized importances
            var importanceCsv = new StringBuilder();
            importanceCsv.Append("FeatureName;Importance");
            foreach (var feature in importance)
            {
                importanceCsv.AppendLine();
                importanceCsv.Append(feature.Key + ";" + feature.Value);
            }

            Trace.WriteLine(importanceCsv);
        }
        public void RegressionDecisionTreeModel_GetRawVariableImportance()
        {
            var(observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var featureNameToIndex = new Dictionary <string, int> {
                { "F1", 0 }, { "F2", 1 }
            };

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var actual   = sut.GetRawVariableImportance();
            var expected = new double[] { 0.0, 364.56356850440511 };

            Assert.AreEqual(expected.Length, actual.Length);

            for (int i = 0; i < expected.Length; i++)
            {
                Assert.AreEqual(expected[i], actual[i], 0.000001);
            }
        }
        public void CrossValidation_CrossValidate()
        {
            #region Read data

            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix (all columns different from the targetName)
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            #endregion

            // creates the cross-validator; observations are shuffled randomly
            var cv = new RandomCrossValidation <double>(crossValidationFolds: 5, seed: 42);

            // create learner
            var learner = new RegressionDecisionTreeLearner();

            // cross-validated predictions
            var cvPredictions = cv.CrossValidate(learner, observations, targets);

            // metric for measuring model error
            var metric = new MeanSquaredErrorRegressionMetric();

            // cross-validation provides an estimate of how the model will perform on unseen data
            Trace.WriteLine("Cross-validation error: " + metric.Error(targets, cvPredictions));

            // train and predict training set for comparison.
            var predictions = learner.Learn(observations, targets).Predict(observations);

            // The training set is NOT a good estimate of how well the model will perform on unseen data.
            Trace.WriteLine("Training error: " + metric.Error(targets, predictions));
        }
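        // Sketch (not part of the original samples): reusing the same cross-validation
        // setup to compare two candidate maximumTreeDepth values. The depths chosen here
        // are arbitrary examples, not recommended settings.
        public void CrossValidation_Compare_TreeDepths_Sketch()
        {
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            var observations = parser.EnumerateRows(c => c != targetName).ToF64Matrix();
            var targets      = parser.EnumerateRows(targetName).ToF64Vector();

            var metric = new MeanSquaredErrorRegressionMetric();

            foreach (var depth in new[] { 5, 15 })
            {
                var cv            = new RandomCrossValidation <double>(crossValidationFolds: 5, seed: 42);
                var learner       = new RegressionDecisionTreeLearner(maximumTreeDepth: depth, minimumSplitSize: 4);
                var cvPredictions = cv.CrossValidate(learner, observations, targets);

                Trace.WriteLine("maximumTreeDepth " + depth + " cross-validation error: " + metric.Error(targets, cvPredictions));
            }
        }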
        public void RegressionDecisionTreeModel_GetRawVariableImportance()
        {
            var parser             = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
            var observations       = parser.EnumerateRows("F1", "F2").ToF64Matrix();
            var targets            = parser.EnumerateRows("T").ToF64Vector();
            var rows               = targets.Length;
            var featureNameToIndex = new Dictionary <string, int> {
                { "F1", 0 }, { "F2", 1 }
            };

            var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
            var sut     = learner.Learn(observations, targets);

            var actual   = sut.GetRawVariableImportance();
            var expected = new double[] { 0.0, 364.56356850440511 };

            Assert.AreEqual(expected.Length, actual.Length);

            for (int i = 0; i < expected.Length; i++)
            {
                Assert.AreEqual(expected[i], actual[i], 0.000001);
            }
        }
        public void RegressionModel_Save_Load()
        {
            #region learner creation

            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read regression targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            // create learner
            var learner = new RegressionDecisionTreeLearner();

            #endregion

            // learns a RegressionDecisionTreeModel
            var model = learner.Learn(observations, targets);

            var writer = new StringWriter();
            model.Save(() => writer);

            // save to file
            //model.Save(() => new StreamWriter(filePath));

            var text        = writer.ToString();
            var loadedModel = RegressionDecisionTreeModel.Load(() => new StringReader(text));

            // load from file
            //RegressionDecisionTreeModel.Load(() => new StreamReader(filePath));
        }
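        // Follow-up sketch (not part of the original samples): verifying that the model
        // reloaded from text produces the same predictions as the model that was saved.
        public void RegressionModel_Save_Load_RoundTrip_Sketch()
        {
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            var observations = parser.EnumerateRows(c => c != targetName).ToF64Matrix();
            var targets      = parser.EnumerateRows(targetName).ToF64Vector();

            var model = new RegressionDecisionTreeLearner().Learn(observations, targets);

            var writer = new StringWriter();
            model.Save(() => writer);

            var loadedModel = RegressionDecisionTreeModel.Load(() => new StringReader(writer.ToString()));

            var original = model.Predict(observations);
            var reloaded = loadedModel.Predict(observations);

            for (int i = 0; i < original.Length; i++)
            {
                Assert.AreEqual(original[i], reloaded[i], 0.000001);
            }
        }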
        bool Boost(F64Matrix observations, double[] targets, int[] indices, int iteration)
        {
            m_sampler.Sample(indices, m_sampleWeights, m_sampleIndices);

            var model = m_modelLearner.Learn(observations, targets,
                                             m_sampleIndices); // weighted sampling is used instead of weights in training


            var predictions = model.Predict(observations, indices);

            for (int i = 0; i < predictions.Length; i++)
            {
                var index = indices[i];
                m_workErrors[index] = Math.Abs(m_indexedTargets[i] - predictions[i]);
            }

            var maxError = m_workErrors.Max();

            for (int i = 0; i < m_workErrors.Length; i++)
            {
                var error = m_workErrors[i];

                if (maxError != 0.0)
                {
                    error = error / maxError;
                }

                switch (m_loss)
                {
                case AdaBoostRegressionLoss.Linear:
                    break;

                case AdaBoostRegressionLoss.Squared:
                    error = error * error;
                    break;

                case AdaBoostRegressionLoss.Exponential:
                    error = 1.0 - Math.Exp(-error);
                    break;

                default:
                    throw new ArgumentException("Unsupported loss type");
                }

                m_workErrors[i] = error;
            }

            var modelError = m_workErrors.WeightedMean(m_sampleWeights, indices);

            if (modelError <= 0.0)
            {
                m_modelErrors.Add(0.0);
                m_modelWeights.Add(1.0);
                m_models.Add(model);
                return(true);
            }
            else if (modelError >= 0.5)
            {
                return(false);
            }

            var beta = modelError / (1.0 - modelError);

            var modelWeight = m_learningRate * Math.Log(1.0 / beta);

            // Only update the sample weights if this is not the last iteration
            if (iteration != m_iterations - 1)
            {
                for (int i = 0; i < indices.Length; i++)
                {
                    var index        = indices[i];
                    var sampleWeight = m_sampleWeights[index];
                    var error        = m_workErrors[index];
                    m_sampleWeights[index] = sampleWeight * Math.Pow(beta, (1.0 - error) * m_learningRate);
                }
            }

            m_modelErrors.Add(modelError);
            m_modelWeights.Add(modelWeight);
            m_models.Add(model);

            return(true);
        }
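        // Worked numeric sketch (not part of the original class) of the AdaBoost.R2-style
        // update performed in Boost above. For an assumed modelError of 0.2 and a learning
        // rate of 1.0: beta = 0.2 / 0.8 = 0.25, the model weight is log(1 / 0.25) ~= 1.386,
        // and a perfectly predicted sample (error = 0) gets its weight multiplied by 0.25,
        // shrinking it relative to poorly predicted samples.
        static void Boost_WeightUpdate_Sketch()
        {
            var learningRate = 1.0; // assumed
            var modelError   = 0.2; // assumed weighted mean of the per-sample errors

            var beta        = modelError / (1.0 - modelError);     // 0.25
            var modelWeight = learningRate * Math.Log(1.0 / beta); // ~1.386

            var perSampleError = 0.0; // a perfectly predicted sample
            var weightFactor   = Math.Pow(beta, (1.0 - perSampleError) * learningRate); // 0.25

            Trace.WriteLine("beta: " + beta + ", modelWeight: " + modelWeight + ", weightFactor: " + weightFactor);
        }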