public void RegressionDecisionTreeLearner_Learn_Reuse_No_Valid_Split()
{
    // Verifies a learner instance can be reused on new, smaller data that
    // offers no valid split, without stale state from the first fit.
    // Fix: removed the unused local `rows` (targets.Length was never used).
    var parser = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
    var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
    var targets = parser.EnumerateRows("T").ToF64Vector();

    var sut = new RegressionDecisionTreeLearner();

    // train initial model.
    sut.Learn(observations, targets);

    // reuse learner, with smaller data that provides no valid split.
    var onlyUniqueTargetValue = 1.0;
    var onlyOneUniqueObservations = (F64Matrix)observations.Rows(0, 1, 2, 3, 4);
    var onlyOneUniquetargets = Enumerable.Range(0, onlyOneUniqueObservations.RowCount)
        .Select(v => onlyUniqueTargetValue).ToArray();

    var model = sut.Learn(onlyOneUniqueObservations, onlyOneUniquetargets);
    var predictions = model.Predict(onlyOneUniqueObservations);

    // no valid split, so should result in the model always returning the onlyUniqueTargetValue.
    for (int i = 0; i < predictions.Length; i++)
    {
        Assert.AreEqual(onlyUniqueTargetValue, predictions[i], 0.0001);
    }
}
public void Hyper_Parameter_Tuning()
{
    // Example: tune maximumTreeDepth and minimumSplitSize of a regression
    // decision tree with random search, minimizing cross-validated MSE.
    // Fix: renamed the misspelled local `paramers` to `parameterRanges`.
    #region Read data
    // Use StreamReader(filepath) when running from filesystem
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // read feature matrix
    var observations = parser.EnumerateRows(c => c != targetName)
        .ToF64Matrix();

    // read classification targets
    var targets = parser.EnumerateRows(targetName)
        .ToF64Vector();
    #endregion

    // metric to minimize
    var metric = new MeanSquaredErrorRegressionMetric();

    // Parameter ranges for the optimizer
    var parameterRanges = new ParameterBounds[]
    {
        new ParameterBounds(min: 1, max: 100, transform: Transform.Linear), // maximumTreeDepth
        new ParameterBounds(min: 1, max: 16, transform: Transform.Linear), // minimumSplitSize
    };

    // create random search optimizer
    var optimizer = new RandomSearchOptimizer(parameterRanges, iterations: 30, runParallel: true);

    // other available optimizers
    // GridSearchOptimizer
    // GlobalizedBoundedNelderMeadOptimizer
    // ParticleSwarmOptimizer
    // BayesianOptimizer

    // function to minimize
    Func<double[], OptimizerResult> minimize = p =>
    {
        var cv = new RandomCrossValidation<double>(crossValidationFolds: 5, seed: 42);
        var optlearner = new RegressionDecisionTreeLearner(maximumTreeDepth: (int)p[0],
            minimumSplitSize: (int)p[1]);

        var predictions = cv.CrossValidate(optlearner, observations, targets);
        var error = metric.Error(targets, predictions);
        Trace.WriteLine("Error: " + error);

        return new OptimizerResult(p, error);
    };

    // run optimizer
    var result = optimizer.OptimizeBest(minimize);
    var bestParameters = result.ParameterSet;

    Trace.WriteLine("Result: " + result.Error);

    // create learner with found parameters
    var learner = new RegressionDecisionTreeLearner(maximumTreeDepth: (int)bestParameters[0],
        minimumSplitSize: (int)bestParameters[1]);

    // learn model with found parameters
    var model = learner.Learn(observations, targets);
}
public void RegressionModel_Predict()
{
    // Example: train a regression tree, then predict both a full matrix
    // and a single observation row.
    #region learner creation
    // Use StreamReader(filepath) when running from filesystem
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // feature matrix: every column except the target
    var featureMatrix = parser.EnumerateRows(c => c != targetName)
        .ToF64Matrix();

    // regression targets
    var targetVector = parser.EnumerateRows(targetName)
        .ToF64Vector();

    // learner with default hyperparameters
    var treeLearner = new RegressionDecisionTreeLearner();
    #endregion

    // learns a RegressionDecisionTreeModel
    var trainedModel = treeLearner.Learn(featureMatrix, targetVector);

    // batch prediction over all observations
    var batchPredictions = trainedModel.Predict(featureMatrix);

    // single-row prediction
    var singlePrediction = trainedModel.Predict(featureMatrix.Row(0));
}
public void TimeSeriesCrossValidation_Validate_InitialTrainingSize_Is_Larger_Than_Obsevations_Length()
{
    // Calls Validate with an initial training size larger than the data set;
    // presumably expected to throw (expected-exception attribute not visible
    // in this chunk — confirm against the test class).
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var treeLearner = new RegressionDecisionTreeLearner();
    var crossValidation = new TimeSeriesCrossValidation<double>(initialTrainingSize: 300);

    var timeSeriesPredictions = crossValidation.Validate(treeLearner, observations, targets);
}
public void TimeSeriesCrossValidation_Validate_Observations_And_Targets_Length_Does_Not_Match()
{
    // Truncates targets so their length no longer matches the observation
    // row count; presumably expected to throw (expected-exception attribute
    // not visible in this chunk — confirm against the test class).
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();
    targets = targets.Take(100).ToArray();

    var treeLearner = new RegressionDecisionTreeLearner();
    var crossValidation = new TimeSeriesCrossValidation<double>(initialTrainingSize: 5);

    var timeSeriesPredictions = crossValidation.Validate(treeLearner, observations, targets);
}
double CrossValidate(int folds)
{
    // Runs seeded random k-fold cross-validation with a default regression
    // tree and returns the resulting mean squared error.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var crossValidation = new RandomCrossValidation<double>(folds, 42);
    var treeLearner = new RegressionDecisionTreeLearner();
    var crossValidatedPredictions = crossValidation.CrossValidate(treeLearner, observations, targets);

    var metric = new MeanSquaredErrorRegressionMetric();
    return metric.Error(targets, crossValidatedPredictions);
}
public void RegressionDecisionTreeModel_Save()
{
    // Serializes a trained model to a string and compares it against the
    // expected snapshot held in m_regressionDecisionTreeModelString.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var treeLearner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var trainedModel = treeLearner.Learn(observations, targets);

    var writer = new StringWriter();
    trainedModel.Save(() => writer);

    Assert.AreEqual(m_regressionDecisionTreeModelString, writer.ToString());
}
public void RegressionDecisionTreeModel_Predict_Multiple()
{
    // Predicts the full training set in one call and pins the resulting
    // mean squared error to a known value.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var treeLearner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var trainedModel = treeLearner.Learn(observations, targets);
    var predictions = trainedModel.Predict(observations);

    var metric = new MeanSquaredErrorRegressionMetric();
    var actualError = metric.Error(targets, predictions);

    Assert.AreEqual(0.032120286249559482, actualError, 0.0000001);
}
public void TimeSeriesCrossValidation_Validate_MaxTrainingSetSize_And_RetrainInterval()
{
    // Validates time-series CV with a capped training-set size and a
    // retrain interval, pinning the resulting MSE to a known value.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var treeLearner = new RegressionDecisionTreeLearner();
    var crossValidation = new TimeSeriesCrossValidation<double>(
        initialTrainingSize: 5, maxTrainingSetSize: 30, retrainInterval: 5);

    var timeSeriesPredictions = crossValidation.Validate(treeLearner, observations, targets);
    var timeSeriesTargets = crossValidation.GetValidationTargets(targets);

    var metric = new MeanSquaredErrorRegressionMetric();
    var actualError = metric.Error(timeSeriesTargets, timeSeriesPredictions);

    Assert.AreEqual(0.13010151998135897, actualError, 0.00001);
}
public void TimeSeriesCrossValidation_Validate()
{
    // Baseline time-series CV run with default learner settings,
    // pinning the resulting MSE to a known value.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var treeLearner = new RegressionDecisionTreeLearner();
    var crossValidation = new TimeSeriesCrossValidation<double>(initialTrainingSize: 5);

    var timeSeriesPredictions = crossValidation.Validate(treeLearner, observations, targets);
    var timeSeriesTargets = crossValidation.GetValidationTargets(targets);

    var metric = new MeanSquaredErrorRegressionMetric();
    var actualError = metric.Error(timeSeriesTargets, timeSeriesPredictions);

    Assert.AreEqual(0.098690664447830825, actualError, 0.00001);
}
private static double RegressionDecisionTreeLearner_Learn(int treeDepth)
{
    // Trains a tree at the requested depth and returns the training-set MSE.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var treeLearner = new RegressionDecisionTreeLearner(treeDepth, 4, 2, 0.1, 42);
    var trainedModel = treeLearner.Learn(observations, targets);
    var predictions = trainedModel.Predict(observations);

    var metric = new MeanSquaredErrorRegressionMetric();
    return metric.Error(targets, predictions);
}
private double RegressionDecisionTreeLearner_Learn_Weighted(int treeDepth, double weight)
{
    // Trains a tree with per-sample weights derived from each target value
    // (via the Weight helper) and returns the training-set MSE.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var sampleWeights = targets.Select(t => Weight(t, weight)).ToArray();

    var treeLearner = new RegressionDecisionTreeLearner(treeDepth, 4, 2, 0.1, 42);
    var trainedModel = treeLearner.Learn(observations, targets, sampleWeights);
    var predictions = trainedModel.Predict(observations);

    var metric = new MeanSquaredErrorRegressionMetric();
    return metric.Error(targets, predictions);
}
public void RegressionDecisionTreeModel_Save()
{
    // Serializes a trained model to a string and compares it against the
    // expected snapshot in RegressionDecisionTreeModelString.
    var parser = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
    var featureMatrix = parser.EnumerateRows("F1", "F2").ToF64Matrix();
    var targetVector = parser.EnumerateRows("T").ToF64Vector();

    var treeLearner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var trainedModel = treeLearner.Learn(featureMatrix, targetVector);

    var writer = new StringWriter();
    trainedModel.Save(() => writer);

    Assert.AreEqual(RegressionDecisionTreeModelString, writer.ToString());
}
public void RegressionDecisionTreeModel_Predict_Multiple_Indexed()
{
    // Predicts only the rows named by an index set and pins the MSE over
    // the matching target subset to a known value.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var treeLearner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var trainedModel = treeLearner.Learn(observations, targets);

    var rowIndices = new int[] { 0, 3, 4, 5, 6, 7, 8, 9, 20, 21 };
    var predictions = trainedModel.Predict(observations, rowIndices);
    var selectedTargets = targets.GetIndices(rowIndices);

    var metric = new MeanSquaredErrorRegressionMetric();
    var actualError = metric.Error(selectedTargets, predictions);

    Assert.AreEqual(0.023821615502626264, actualError, 0.0000001);
}
double CrossValidate_Provide_Indices(int folds)
{
    // Cross-validates only the first half of the data by supplying explicit
    // indices and a pre-allocated prediction buffer; returns the MSE on that half.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var halfRowCount = targets.Length / 2;
    var rowIndices = Enumerable.Range(0, halfRowCount).ToArray();
    var predictions = new double[halfRowCount];

    var crossValidation = new RandomCrossValidation<double>(folds, 42);
    var treeLearner = new RegressionDecisionTreeLearner();
    crossValidation.CrossValidate(treeLearner, observations, targets, rowIndices, predictions);

    var metric = new MeanSquaredErrorRegressionMetric();
    return metric.Error(targets.Take(halfRowCount).ToArray(), predictions);
}
public void RegressionDecisionTreeModel_Predict_Multiple()
{
    // Predicts the full training set and pins the MSE to a known value.
    // Fix: removed the unused local `rows` (targets.Length was never used).
    var parser = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
    var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
    var targets = parser.EnumerateRows("T").ToF64Vector();

    var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var sut = learner.Learn(observations, targets);

    var predictions = sut.Predict(observations);

    var evaluator = new MeanSquaredErrorRegressionMetric();
    var error = evaluator.Error(targets, predictions);

    Assert.AreEqual(0.032120286249559482, error, 0.0000001);
}
RegressionDecisionTreeModel CreateTree(F64Matrix observations, double[] targets, int[] indices, Random random)
{
    // Trains one tree on a bootstrap-style sample (with replacement) drawn
    // from `indices`, sized by m_subSampleRatio.
    // NOTE: the learner must be constructed before sampling — both consume
    // `random`, and the draw order determines the result.
    var treeLearner = new RegressionDecisionTreeLearner(m_maximumTreeDepth, m_minimumSplitSize,
        m_featuresPrSplit, m_minimumInformationGain, random.Next());

    var sampleSize = (int)Math.Round(m_subSampleRatio * (double)indices.Length);
    var sampledIndices = new int[sampleSize];
    for (int i = 0; i < sampleSize; i++)
    {
        sampledIndices[i] = indices[random.Next(indices.Length)];
    }

    return treeLearner.Learn(observations, targets, sampledIndices);
}
private double RegressionDecisionTreeLearner_Learn_Weighted(int treeDepth, double weight)
{
    // Trains a tree with per-sample weights derived from each target value
    // (via the Weight helper) and returns the training-set MSE.
    // Fix: removed the unused local `rows` (targets.Length was never used).
    var parser = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
    var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
    var targets = parser.EnumerateRows("T").ToF64Vector();

    var sut = new RegressionDecisionTreeLearner(treeDepth, 4, 2, 0.1, 42);

    var weights = targets.Select(v => Weight(v, weight)).ToArray();
    var model = sut.Learn(observations, targets, weights);

    var predictions = model.Predict(observations);

    var evaluator = new MeanSquaredErrorRegressionMetric();
    var error = evaluator.Error(targets, predictions);

    return error;
}
public void RegressionDecisionTreeModel_GetVariableImportance()
{
    // Checks the named, normalized variable importances: F2 carries all of
    // the importance (100.0), F1 none (0.0).
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var treeLearner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var trainedModel = treeLearner.Learn(observations, targets);

    var featureNameToIndex = new Dictionary<string, int> { { "F1", 0 }, { "F2", 1 } };
    var actual = trainedModel.GetVariableImportance(featureNameToIndex);

    var expected = new Dictionary<string, double> { { "F2", 100.0 }, { "F1", 0.0 } };
    CollectionAssert.AreEqual(expected, actual);
}
public void LearningCurves_Calculate()
{
    // Example: compute learning curves for a depth-limited regression tree
    // over increasing training-sample percentages, then trace the CSV output.
    #region Read data
    // Use StreamReader(filepath) when running from filesystem
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // feature matrix: every column except the target
    var featureMatrix = parser.EnumerateRows(c => c != targetName)
        .ToF64Matrix();

    // regression targets
    var targetVector = parser.EnumerateRows(targetName)
        .ToF64Vector();
    #endregion

    // metric for measuring model error
    var errorMetric = new MeanSquaredErrorRegressionMetric();

    // calculator shuffles observations randomly before sampling
    var learningCurveCalculator = new RandomShuffleLearningCurvesCalculator<double>(
        errorMetric,
        samplePercentages: new double[] { 0.05, 0.1, 0.2, 0.4, 0.8, 1.0 },
        trainingPercentage: 0.7,
        numberOfShufflesPrSample: 5);

    // depth-limited learner
    var treeLearner = new RegressionDecisionTreeLearner(maximumTreeDepth: 5);

    // calculate learning curve
    var learningCurve = learningCurveCalculator.Calculate(treeLearner,
        featureMatrix, targetVector);

    // write to csv
    var writer = new StringWriter();
    learningCurve.Write(() => writer);

    // trace result
    // Plotting the learning curves will help determine if the model has high bias or high variance.
    // This information can be used to determine what to try next in order to improve the model.
    Trace.WriteLine(writer.ToString());

    // alternatively, write to file
    //learningCurve.Write(() => new StreamWriter(filePath));
}
public void RegressionDecisionTreeModel_Predict_Multiple_Indexed()
{
    // Predicts only the rows named by an index set and pins the MSE over
    // the matching target subset to a known value.
    // Fix: removed the unused local `rows` (targets.Length was never used).
    var parser = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
    var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
    var targets = parser.EnumerateRows("T").ToF64Vector();

    var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var sut = learner.Learn(observations, targets);

    var indices = new int[] { 0, 3, 4, 5, 6, 7, 8, 9, 20, 21 };
    var predictions = sut.Predict(observations, indices);
    var indexedTargets = targets.GetIndices(indices);

    var evaluator = new MeanSquaredErrorRegressionMetric();
    var error = evaluator.Error(indexedTargets, predictions);

    Assert.AreEqual(0.023821615502626264, error, 0.0000001);
}
public void TrainingTestSplitter_SplitSet()
{
    // Example: random training/test split, train on the training set, then
    // compare test-set error (honest estimate) against training-set error.
    #region Read data
    // Use StreamReader(filepath) when running from filesystem
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // feature matrix: all columns different from the targetName
    var featureMatrix = parser.EnumerateRows(c => c != targetName)
        .ToF64Matrix();

    // targets
    var targetVector = parser.EnumerateRows(targetName)
        .ToF64Vector();
    #endregion

    // splitter shuffles observations randomly before splitting
    var splitter = new RandomTrainingTestIndexSplitter<double>(
        trainingPercentage: 0.7, seed: 24);

    var split = splitter.SplitSet(featureMatrix, targetVector);
    var trainingSet = split.TrainingSet;
    var testSet = split.TestSet;

    var treeLearner = new RegressionDecisionTreeLearner();
    var trainedModel = treeLearner.Learn(trainingSet.Observations, trainingSet.Targets);

    // predict test set
    var testPredictions = trainedModel.Predict(testSet.Observations);

    // metric for measuring model error
    var errorMetric = new MeanSquaredErrorRegressionMetric();

    // The test set provides an estimate on how the model will perform on unseen data
    Trace.WriteLine("Test error: " + errorMetric.Error(testSet.Targets, testPredictions));

    // predict training set for comparison
    var trainingPredictions = trainedModel.Predict(trainingSet.Observations);

    // The training set is NOT a good estimate of how well the model will perform on unseen data.
    Trace.WriteLine("Training error: " + errorMetric.Error(trainingSet.Targets, trainingPredictions));
}
public void RegressionDecisionTreeModel_GetVariableImportance()
{
    // Checks the named, normalized variable importances: F2 carries all of
    // the importance (100.0), F1 none (0.0).
    // Fix: removed the unused local `rows` (targets.Length was never used).
    var parser = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
    var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
    var targets = parser.EnumerateRows("T").ToF64Vector();

    var featureNameToIndex = new Dictionary<string, int> { { "F1", 0 }, { "F2", 1 } };

    var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var sut = learner.Learn(observations, targets);

    var actual = sut.GetVariableImportance(featureNameToIndex);
    var expected = new Dictionary<string, double> { { "F2", 100.0 }, { "F1", 0.0 } };

    CollectionAssert.AreEqual(expected, actual);
}
public void RegressionDecisionTreeModel_Predict_Single()
{
    // Predicts each observation row one at a time (single-row Predict
    // overload) and pins the aggregate MSE to a known value.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var treeLearner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var trainedModel = treeLearner.Learn(observations, targets);

    var rowCount = targets.Length;
    var predictions = new double[rowCount];
    for (int row = 0; row < rowCount; row++)
    {
        predictions[row] = trainedModel.Predict(observations.Row(row));
    }

    var metric = new MeanSquaredErrorRegressionMetric();
    var actualError = metric.Error(targets, predictions);

    Assert.AreEqual(0.032120286249559482, actualError, 0.0000001);
}
public void RegressionModel_FeatureImportance()
{
    // Example: extract raw and normalized (named) feature importances from a
    // trained regression tree, then trace them as CSV.
    #region learner creation
    // Use StreamReader(filepath) when running from filesystem
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // feature matrix: every column except the target
    var featureMatrix = parser.EnumerateRows(c => c != targetName)
        .ToF64Matrix();

    // regression targets
    var targetVector = parser.EnumerateRows(targetName)
        .ToF64Vector();

    // learner with default hyperparameters
    var treeLearner = new RegressionDecisionTreeLearner();
    #endregion

    // learns a RegressionDecisionTreeModel
    var trainedModel = treeLearner.Learn(featureMatrix, targetVector);

    // raw feature importance
    var rawImportance = trainedModel.GetRawVariableImportance();

    // normalized and named feature importance; column map comes from the parser
    var featureNameToIndex = parser.EnumerateRows(c => c != targetName)
        .First().ColumnNameToIndex;
    var namedImportance = trainedModel.GetVariableImportance(featureNameToIndex);

    // trace normalized importances as CSV
    var importanceCsv = new StringBuilder();
    importanceCsv.Append("FeatureName;Importance");
    foreach (var feature in namedImportance)
    {
        importanceCsv.AppendLine();
        importanceCsv.Append(feature.Key + ";" + feature.Value);
    }

    Trace.WriteLine(importanceCsv);
}
public void RegressionDecisionTreeModel_GetRawVariableImportance()
{
    // Checks the raw (unnormalized) variable importances element-by-element.
    // Fix: removed the unused local `featureNameToIndex` —
    // GetRawVariableImportance takes no name map.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var sut = learner.Learn(observations, targets);

    var actual = sut.GetRawVariableImportance();
    var expected = new double[] { 0.0, 364.56356850440511 };

    Assert.AreEqual(expected.Length, actual.Length);
    for (int i = 0; i < expected.Length; i++)
    {
        Assert.AreEqual(expected[i], actual[i], 0.000001);
    }
}
public void CrossValidation_CrossValidate()
{
    // Example: compare cross-validated error (honest estimate) against
    // training-set error for a default regression tree.
    #region Read data
    // Use StreamReader(filepath) when running from filesystem
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // feature matrix: all columns different from the targetName
    var featureMatrix = parser.EnumerateRows(c => c != targetName)
        .ToF64Matrix();

    // targets
    var targetVector = parser.EnumerateRows(targetName)
        .ToF64Vector();
    #endregion

    // cross validator shuffles observations randomly
    var crossValidation = new RandomCrossValidation<double>(crossValidationFolds: 5, seed: 42);

    // learner with default hyperparameters
    var treeLearner = new RegressionDecisionTreeLearner();

    // cross-validated predictions
    var cvPredictions = crossValidation.CrossValidate(treeLearner, featureMatrix, targetVector);

    // metric for measuring model error
    var errorMetric = new MeanSquaredErrorRegressionMetric();

    // cross-validation provides an estimate on how the model will perform on unseen data
    Trace.WriteLine("Cross-validation error: " + errorMetric.Error(targetVector, cvPredictions));

    // train and predict training set for comparison.
    var trainingPredictions = treeLearner.Learn(featureMatrix, targetVector).Predict(featureMatrix);

    // The training set is NOT a good estimate of how well the model will perform on unseen data.
    Trace.WriteLine("Training error: " + errorMetric.Error(targetVector, trainingPredictions));
}
public void RegressionDecisionTreeLearner_Learn_Reuse_No_Valid_Split()
{
    // Verifies a learner instance can be reused on new, smaller data that
    // offers no valid split, without stale state from the first fit.
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var sut = new RegressionDecisionTreeLearner();

    // initial fit on the full data set.
    sut.Learn(observations, targets);

    // second fit: a five-row subset with a single constant target, so no
    // split can improve the fit.
    var constantTarget = 1.0;
    var subsetObservations = (F64Matrix)observations.Rows(0, 1, 2, 3, 4);
    var constantTargets = Enumerable.Repeat(constantTarget, subsetObservations.RowCount).ToArray();

    var model = sut.Learn(subsetObservations, constantTargets);
    var predictions = model.Predict(subsetObservations);

    // with no valid split the model must always return the constant target.
    for (int i = 0; i < predictions.Length; i++)
    {
        Assert.AreEqual(constantTarget, predictions[i], 0.0001);
    }
}
public void RegressionDecisionTreeModel_GetRawVariableImportance()
{
    // Checks the raw (unnormalized) variable importances element-by-element.
    // Fix: removed two unused locals — `rows` (never read) and
    // `featureNameToIndex` (GetRawVariableImportance takes no name map).
    var parser = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
    var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
    var targets = parser.EnumerateRows("T").ToF64Vector();

    var learner = new RegressionDecisionTreeLearner(100, 4, 2, 0.1, 42);
    var sut = learner.Learn(observations, targets);

    var actual = sut.GetRawVariableImportance();
    var expected = new double[] { 0.0, 364.56356850440511 };

    Assert.AreEqual(expected.Length, actual.Length);
    for (int i = 0; i < expected.Length; i++)
    {
        Assert.AreEqual(expected[i], actual[i], 0.000001);
    }
}
public void RegressionModel_Save_Load()
{
    // Example: round-trip a trained RegressionDecisionTreeModel through
    // string serialization (Save) and deserialization (Load).
    #region learner creation
    // Use StreamReader(filepath) when running from filesystem
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // feature matrix: every column except the target
    var featureMatrix = parser.EnumerateRows(c => c != targetName)
        .ToF64Matrix();

    // regression targets
    var targetVector = parser.EnumerateRows(targetName)
        .ToF64Vector();

    // learner with default hyperparameters
    var treeLearner = new RegressionDecisionTreeLearner();
    #endregion

    // learns a RegressionDecisionTreeModel
    var trainedModel = treeLearner.Learn(featureMatrix, targetVector);

    // serialize the model to a string
    var writer = new StringWriter();
    trainedModel.Save(() => writer);

    // save to file
    //model.Save(() => new StreamWriter(filePath));

    // deserialize the model back from the string
    var serialized = writer.ToString();
    var loadedModel = RegressionDecisionTreeModel.Load(() => new StringReader(serialized));

    // load from file
    //RegressionDecisionTreeModel.Load(() => new StreamReader(filePath));
}