Example #1
    private float GetAUROC(F64Matrix observations, double[] targets)
    {
        // print the raw data being used for classification
        PrintArray(observations);
        PrintVector(targets);

        // split the data into training and test set
        var splitter          = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.5);
        var trainingTestSplit = splitter.SplitSet(observations, targets);
        var trainSet          = trainingTestSplit.TrainingSet;
        var testSet           = trainingTestSplit.TestSet;

        // train the model
        var learner = new ClassificationRandomForestLearner();
        var model   = learner.Learn(trainSet.Observations, trainSet.Targets);

        // make the predictions from the test set
        var testPredictions = model.PredictProbability(testSet.Observations);

        // create the metric and measure the error
        var metric    = new RocAucClassificationProbabilityMetric(1);
        var testError = (float)metric.Error(testSet.Targets, testPredictions);

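        // SharpLearning metrics report error, here presumably 1 - AUC, so a value
        // below 0.5 is flipped to return the AUC itself (hence the method name).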
        if (testError < .5f)
        {
            testError = 1f - testError;
        }

        return testError;
    }
Example #2
        public void RandomForest_Default_Parameters_Save_Load_Model_Using_Static_Methods()
        {
            #region read and split data
            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read regression targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            // Create the training/test splitter.
            // Since this is a regression problem, we use the random training/test index splitter;
            // 30 % of the data is used for the test set.
            var splitter = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24);

            var trainingTestSplit = splitter.SplitSet(observations, targets);
            var trainSet          = trainingTestSplit.TrainingSet;
            var testSet           = trainingTestSplit.TestSet;
            #endregion

            // create learner with default parameters
            var learner = new RegressionRandomForestLearner(trees: 100);

            // learn the model
            var model = learner.Learn(trainSet.Observations, trainSet.Targets);

            // predict the training and test set.
            var trainPredictions = model.Predict(trainSet.Observations);
            var testPredictions  = model.Predict(testSet.Observations);

            // since this is a regression problem we are using square error as metric
            // for evaluating how well the model performs.
            var metric = new MeanSquaredErrorRegressionMetric();

            // measure the error on training and test set.
            var trainError = metric.Error(trainSet.Targets, trainPredictions);
            var testError  = metric.Error(testSet.Targets, testPredictions);

            TraceTrainingAndTestError(trainError, testError);

            // Save the model. In the file system, use new StreamWriter(filePath);
            // the default format is xml.
            var savedModel = new StringWriter();
            model.Save(() => savedModel);

            // Load the model. In the file system, use new StreamReader(filePath);
            // the default format is xml.
            var loadedModel = RegressionForestModel.Load(() => new StringReader(savedModel.ToString()));
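
            // A quick sanity check (not in the original test): the reloaded model
            // should produce the same predictions as the in-memory model.
            var loadedPredictions = loadedModel.Predict(testSet.Observations);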
        }
Example #3
        public void RandomTrainingTestIndexSplitter_Split()
        {
            var sut = new RandomTrainingTestIndexSplitter <double>(0.8, 42);

            var targets = new double[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };

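            // with trainingPercentage 0.8 and 10 targets, 8 indices go to the
            // training set and 2 to the test set; seed 42 fixes the shuffle.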
            var actual   = sut.Split(targets);
            var expected = new TrainingTestIndexSplit(new int[] { 9, 0, 4, 2, 5, 7, 3, 8 },
                                                      new int[] { 1, 6 });

            Assert.AreEqual(expected, actual);
        }
Example #4
        public void RegressionGradientBoostLearner_LearnWithEarlyStopping_ToFewIterations()
        {
            var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var splitter = new RandomTrainingTestIndexSplitter <double>(0.6, 1234);
            var split    = splitter.SplitSet(observations, targets);

            var sut       = new RegressionSquareLossGradientBoostLearner(5, 0.1, 3, 1, 1e-6, 1.0, 0, false);
            var evaluator = new MeanSquaredErrorRegressionMetric();

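            // presumably expected to fail: the learner allows only 5 boosting
            // iterations, which is not enough relative to the 5 early-stopping
            // rounds this test passes to LearnWithEarlyStopping.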
            var model = sut.LearnWithEarlyStopping(split.TrainingSet.Observations, split.TrainingSet.Targets,
                                                   split.TestSet.Observations, split.TestSet.Targets, evaluator, 5);
        }
Example #5
        public void RegressionGradientBoostLearner_LearnWithEarlyStopping_ToFewIterations()
        {
            var parser       = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
            var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
            var targets      = parser.EnumerateRows("T").ToF64Vector();

            var splitter = new RandomTrainingTestIndexSplitter <double>(0.6, 1234);
            var split    = splitter.SplitSet(observations, targets);

            var sut       = new RegressionSquareLossGradientBoostLearner(5, 0.1, 3, 1, 1e-6, 1.0, 0, false);
            var evaluator = new MeanSquaredErrorRegressionMetric();

            var model = sut.LearnWithEarlyStopping(split.TrainingSet.Observations, split.TrainingSet.Targets,
                                                   split.TestSet.Observations, split.TestSet.Targets, evaluator, 5);
        }
Example #6
        public void RegressionGradientBoostLearner_LearnWithEarlyStopping()
        {
            var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

            var splitter = new RandomTrainingTestIndexSplitter <double>(0.6, 1234);
            var split    = splitter.SplitSet(observations, targets);

            var sut       = new RegressionSquareLossGradientBoostLearner(1000, 0.1, 3, 1, 1e-6, 1.0, 0, false);
            var evaluator = new MeanSquaredErrorRegressionMetric();

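            // early stopping monitors the validation error (the last argument, 5,
            // presumably controls how often it is checked) and halts once it stops
            // improving; only 40 of the maximum 1000 trees are kept (see the asserts below).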
            var model = sut.LearnWithEarlyStopping(split.TrainingSet.Observations, split.TrainingSet.Targets,
                                                   split.TestSet.Observations, split.TestSet.Targets, evaluator, 5);

            var predictions = model.Predict(split.TestSet.Observations);
            var actual      = evaluator.Error(split.TestSet.Targets, predictions);

            Assert.AreEqual(0.061035472792879512, actual, 0.000001);
            Assert.AreEqual(40, model.Trees.Length);
        }
Example #7
        public void TrainingTestSplitter_SplitSet()
        {
            #region Read data

            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix (all columns different from the targetName)
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            #endregion

            // create the training/test splitter; observations are shuffled randomly
            var splitter = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24);

            var trainingTestSplit = splitter.SplitSet(observations, targets);
            var trainingSet       = trainingTestSplit.TrainingSet;
            var testSet           = trainingTestSplit.TestSet;

            var learner = new RegressionDecisionTreeLearner();
            var model   = learner.Learn(trainingSet.Observations, trainingSet.Targets);

            // predict test set
            var testPredictions = model.Predict(testSet.Observations);

            // metric for measuring model error
            var metric = new MeanSquaredErrorRegressionMetric();

            // The test set provides an estimate on how the model will perform on unseen data
            Trace.WriteLine("Test error: " + metric.Error(testSet.Targets, testPredictions));

            // predict training set for comparison
            var trainingPredictions = model.Predict(trainingSet.Observations);

            // The training set is NOT a good estimate of how well the model will perform on unseen data.
            Trace.WriteLine("Training error: " + metric.Error(trainingSet.Targets, trainingPredictions));
        }
Example #8
        public void RandomForest_Default_Parameters()
        {
            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read regression targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            // Create the training/test splitter.
            // Since this is a regression problem, we use the random training/test index splitter;
            // 30 % of the data is used for the test set.
            var splitter = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24);

            var trainingTestSplit = splitter.SplitSet(observations, targets);
            var trainSet          = trainingTestSplit.TrainingSet;
            var testSet           = trainingTestSplit.TestSet;

            // Create the learner and learn the model.
            var learner = new RegressionRandomForestLearner(trees: 100);
            var model   = learner.Learn(trainSet.Observations, trainSet.Targets);

            // predict the training and test set.
            var trainPredictions = model.Predict(trainSet.Observations);
            var testPredictions  = model.Predict(testSet.Observations);

            // since this is a regression problem we are using square error as metric
            // for evaluating how well the model performs.
            var metric = new MeanSquaredErrorRegressionMetric();

            // measure the error on training and test set.
            var trainError = metric.Error(trainSet.Targets, trainPredictions);
            var testError  = metric.Error(testSet.Targets, testPredictions);

            TraceTrainingAndTestError(trainError, testError);
        }
Example #9
        public void RegressionGradientBoostLearner_LearnWithEarlyStopping()
        {
            var parser       = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
            var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
            var targets      = parser.EnumerateRows("T").ToF64Vector();

            var splitter = new RandomTrainingTestIndexSplitter <double>(0.6, 1234);
            var split    = splitter.SplitSet(observations, targets);

            var sut       = new RegressionSquareLossGradientBoostLearner(1000, 0.1, 3, 1, 1e-6, 1.0, 0, false);
            var evaluator = new MeanSquaredErrorRegressionMetric();

            var model = sut.LearnWithEarlyStopping(split.TrainingSet.Observations, split.TrainingSet.Targets,
                                                   split.TestSet.Observations, split.TestSet.Targets, evaluator, 5);

            var predictions = model.Predict(split.TestSet.Observations);
            var actual      = evaluator.Error(split.TestSet.Targets, predictions);

            Assert.AreEqual(0.061035472792879512, actual, 0.000001);
            Assert.AreEqual(40, model.Trees.Length);
        }
Example #10
        public void LearningCurvesCalculator_Calculate_Indices_Provided()
        {
            var splitter = new RandomTrainingTestIndexSplitter <double>(0.8, 42);

            var sut = new LearningCurvesCalculator <double>(splitter, new RandomIndexSampler <double>(42),
                                                            new MeanSquaredErrorRegressionMetric(), new double[] { 0.2, 0.8 });

            var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();
            var indexSplits = splitter.Split(targets);

            var actual = sut.Calculate(new RegressionDecisionTreeLearner(),
                                       observations, targets, indexSplits.TrainingIndices, indexSplits.TestIndices);

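            // each LearningCurvePoint holds (sample size, training error, validation error);
            // sample sizes 32 and 128 correspond to the 0.2 and 0.8 ratios above.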
            var expected = new List <LearningCurvePoint>()
            {
                new LearningCurvePoint(32, 0, 0.141565953928265),
                new LearningCurvePoint(128, 0.0, 0.068970597423950036)
            };

            CollectionAssert.AreEqual(expected, actual);
        }
Example #11
        public void LearningCurvesCalculator_Calculate_Indices_Provided()
        {
            var splitter = new RandomTrainingTestIndexSplitter <double>(0.8, 42);

            var sut = new LearningCurvesCalculator <double>(splitter, new RandomIndexSampler <double>(42),
                                                            new MeanSquaredErrorRegressionMetric(), new double[] { 0.2, 0.8 });

            var targetName   = "T";
            var parser       = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
            var observations = parser.EnumerateRows(v => !v.Contains(targetName)).ToF64Matrix();
            var targets      = parser.EnumerateRows(targetName).ToF64Vector();
            var indexSplits  = splitter.Split(targets);

            var actual = sut.Calculate(new RegressionDecisionTreeLearner(),
                                       observations, targets, indexSplits.TrainingIndices, indexSplits.TestIndices);

            var expected = new List <LearningCurvePoint>()
            {
                new LearningCurvePoint(32, 0, 0.141565953928265),
                new LearningCurvePoint(128, 0.0, 0.068970597423950036)
            };

            CollectionAssert.AreEqual(expected, actual);
        }
Example #12
        public void GradientBoost_Optimize_Hyperparameters()
        {
            #region read and split data
            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read regression targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            // Create the training/test splitter.
            // Since this is a regression problem, we use the random training/test index splitter;
            // 30 % of the data is used for the test set.
            var splitter = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24);

            var trainingTestSplit = splitter.SplitSet(observations, targets);
            var trainSet          = trainingTestSplit.TrainingSet;
            var testSet           = trainingTestSplit.TestSet;
            #endregion

            // since this is a regression problem we are using square error as metric
            // for evaluating how well the model performs.
            var metric = new MeanSquaredErrorRegressionMetric();

            // Usually better results can be achieved by tuning a gradient boost learner

            var numberOfFeatures = trainSet.Observations.ColumnCount;

            // Parameter specs for the optimizer to search over.
            var parameters = new IParameterSpec[]
            {
                new MinMaxParameterSpec(min: 80, max: 300,
                                        transform: Transform.Linear, parameterType: ParameterType.Discrete), // iterations

                new MinMaxParameterSpec(min: 0.02, max:  0.2,
                                        transform: Transform.Logarithmic, parameterType: ParameterType.Continuous), // learning rate

                new MinMaxParameterSpec(min: 8, max: 15,
                                        transform: Transform.Linear, parameterType: ParameterType.Discrete), // maximumTreeDepth

                new MinMaxParameterSpec(min: 0.5, max: 0.9,
                                        transform: Transform.Linear, parameterType: ParameterType.Continuous), // subSampleRatio

                new MinMaxParameterSpec(min: 1, max: numberOfFeatures,
                                        transform: Transform.Linear, parameterType: ParameterType.Discrete), // featuresPrSplit
            };

            // Further split the training data to have a validation set to measure
            // how well the model generalizes to unseen data during the optimization.
            var validationSplit = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24)
                                  .SplitSet(trainSet.Observations, trainSet.Targets);


            // Define optimizer objective (function to minimize)
            Func <double[], OptimizerResult> minimize = p =>
            {
                // create the candidate learner using the current optimization parameters.
                var candidateLearner = new RegressionSquareLossGradientBoostLearner(
                    iterations: (int)p[0],
                    learningRate: p[1],
                    maximumTreeDepth: (int)p[2],
                    subSampleRatio: p[3],
                    featuresPrSplit: (int)p[4],
                    runParallel: false);

                var candidateModel = candidateLearner.Learn(validationSplit.TrainingSet.Observations,
                                                            validationSplit.TrainingSet.Targets);

                var validationPredictions = candidateModel.Predict(validationSplit.TestSet.Observations);
                var candidateError        = metric.Error(validationSplit.TestSet.Targets, validationPredictions);

                // trace current error
                Trace.WriteLine(string.Format("Candidate Error: {0:0.0000}, Candidate Parameters: {1}",
                                              candidateError, string.Join(", ", p)));

                return new OptimizerResult(p, candidateError);
            };

            // create random search optimizer
            var optimizer = new RandomSearchOptimizer(parameters, iterations: 30, runParallel: true);
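            // the optimizer draws 30 random parameter sets from the specs above
            // and scores each one with the minimize function.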

            // find best hyperparameters
            var result = optimizer.OptimizeBest(minimize);
            var best   = result.ParameterSet;

            // create the final learner using the best hyperparameters.
            var learner = new RegressionSquareLossGradientBoostLearner(
                iterations: (int)best[0],
                learningRate: best[1],
                maximumTreeDepth: (int)best[2],
                subSampleRatio: best[3],
                featuresPrSplit: (int)best[4],
                runParallel: false);

            // learn model with found parameters
            var model = learner.Learn(trainSet.Observations, trainSet.Targets);

            // predict the training and test set.
            var trainPredictions = model.Predict(trainSet.Observations);
            var testPredictions  = model.Predict(testSet.Observations);

            // measure the error on training and test set.
            var trainError = metric.Error(trainSet.Targets, trainPredictions);
            var testError  = metric.Error(testSet.Targets, testPredictions);

            // Trace the hyperparameters found by the optimizer.
            Trace.WriteLine(string.Format("Found parameters, iterations: {0}, learning rate: {1:0.000}, maximumTreeDepth: {2}, subSampleRatio: {3:0.000}, featuresPrSplit: {4}",
                                          (int)best[0], best[1], (int)best[2], best[3], (int)best[4]));
            TraceTrainingAndTestError(trainError, testError);
        }
Example #13
        public void Predict(int iterations = DefaultNNIterations, int targetOffset = 1, string targetName = DefaultTargetName, bool pauseAtEnd = false)
        {
            _iterations   = iterations;
            _targetName   = targetName;
            _targetOffset = targetOffset;

            Program.StatusLogger.Info($"Iterations: {_iterations}");
            Program.StatusLogger.Info($"Target: {_targetName}");
            Program.StatusLogger.Info($"Offset: {_targetOffset}");

            var data = new ConcurrentDictionary <int, ModelData>();

            if (File.Exists(Path()))
            {
                data = JsonConvert.DeserializeObject <ConcurrentDictionary <int, ModelData> >(File.ReadAllText(Path()));
                //data = TypeSerializer.DeserializeFromReader<ConcurrentDictionary<int, ModelData>>(new StreamReader(Path()));

                Program.StatusLogger.Info("Cached data was loaded.");
            }
            else
            {
                //http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/UK-HPI-full-file-2019-07.csv
                var header      = File.ReadLines("UK-HPI-full-file-2019-07.csv").First();
                var columnNames = header.Split(",");

                var parser = new CsvParser(() => new StringReader(File.ReadAllText("UK-HPI-full-file-2019-07.csv")), ',', false, true);

                var creditData          = _creditDataExtractor.Extract();
                var populationData      = _populationDataExtractor.Extract();
                var otherPopulationData = _otherPopulationDataExtractor.Extract();
                var densityData         = _londonDensityDataExtractor.Extract();
                var gvaData             = _gvaDataExtractor.Extract();

                var featureRows = parser.EnumerateRows().ToArray();
                var targets     = parser.EnumerateRows(_targetName).ToArray();

                string previousKey = null;

                for (int i = 0; i < featureRows.Length; i++)
                {
                    var item = featureRows[i];
                    var key  = item.GetValue("RegionName");
                    var date = DateTime.ParseExact(item.GetValue("Date"), "dd/MM/yyyy", new CultureInfo("en-GB"), DateTimeStyles.AssumeLocal);

                    if (key != previousKey)
                    {
                        Program.StatusLogger.Info($"Processing {key}");
                    }
                    previousKey = key;

                    var regionFeatures = item.GetValues(columnNames.Except(excludeColumns).ToArray()).Select(s => ParseRowValue(s));

                    var creditDataKey = _creditDataExtractor.GetKey(date, creditData.Keys.ToArray());
                    if (!creditData.ContainsKey(creditDataKey))
                    {
                        regionFeatures = regionFeatures.Concat(Enumerable.Repeat(-1d, creditData.Values.First().Length));
                        Trace.WriteLine($"Credit data not found: {creditDataKey}");
                    }
                    else
                    {
                        regionFeatures = regionFeatures.Concat(creditData[creditDataKey]);
                    }

                    var modelData = new ModelData
                    {
                        Name           = key,
                        Code           = item.GetValue("AreaCode"),
                        Date           = date,
                        Observations   = regionFeatures.ToArray(),
                        OriginalTarget = ParseTarget(item.GetValue(_targetName))
                    };

                    modelData.Observations = modelData.Observations
                                             .Concat(_populationDataExtractor.Get(populationData, modelData))
                                             .Concat(_londonDensityDataExtractor.Get(densityData, modelData))
                                             .Concat(_otherPopulationDataExtractor.Get(otherPopulationData, modelData))
                                             .Concat(_gvaDataExtractor.Get(gvaData, modelData))
                                             .ToArray();

                    data.TryAdd(i, modelData);
                }

                _targetCalculator.Calculate(data, _targetOffset);


                //TypeSerializer.SerializeToWriter<ConcurrentDictionary<int, ModelData>>(data, new StreamWriter(Path()));
                var json = JsonConvert.SerializeObject(data, Formatting.Indented);
                File.WriteAllText(Path(), json);
            }

            var itemCount = 0;

            Parallel.ForEach(data.OrderBy(o => o.Value.Date).GroupBy(g => g.Value.Name).AsParallel(), new ParallelOptions {
                MaxDegreeOfParallelism = -1
            }, (grouping) =>
            {
                var lastDate       = grouping.Last().Value.Date;
                var dataWithTarget = grouping.Where(s => s.Value.OriginalTarget.HasValue && s.Value.Target != -1);

                if (dataWithTarget.Any())
                {
                    var allObservations = dataWithTarget.Select(s => s.Value.Observations).ToArray();
                    var allTargets      = dataWithTarget.Select(s => s.Value.Target).ToArray();

                    //var validation = new TimeSeriesCrossValidation<double>((int)(allObservationsExceptLast.RowCount * 0.8), 0, 1);
                    //var validationPredictions = validation.Validate((IIndexedLearner<double>)learner, allObservationsExceptLast, allTargetsExceptLast);
                    //var crossMetric = new MeanSquaredErrorRegressionMetric();
                    //var crossError = crossMetric.Error(validation.GetValidationTargets(allTargetsExceptLast), validationPredictions);
                    //_totalCrossError += crossError;
                    var meanZeroTransformer  = new MeanZeroFeatureTransformer();
                    var minMaxTransformer    = new MinMaxTransformer(0d, 1d);
                    var lastObservations     = grouping.Last().Value.Observations;
                    F64Matrix allTransformed = minMaxTransformer.Transform(meanZeroTransformer.Transform(allObservations.Append(lastObservations).ToArray()));
                    var transformed          = new F64Matrix(allTransformed.Rows(Enumerable.Range(0, allTransformed.RowCount - 1).ToArray()).Data(), allTransformed.RowCount - 1, allTransformed.ColumnCount);

                    var splitter = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24);

                    var trainingTestSplit = splitter.SplitSet(transformed, allTargets);
                    transformed           = trainingTestSplit.TrainingSet.Observations;
                    var testSet           = trainingTestSplit.TestSet;

                    //var learner = GetRandomForest();
                    //var learner = GetAda();
                    //var learner = GetNeuralNet(grouping.First().Value.Observations.Length, transformed.RowCount);
                    var learner = GetEnsemble(grouping.First().Value.Observations.Length, transformed.RowCount);

                    Program.StatusLogger.Info("Learning commenced " + grouping.First().Value.Name);

                    var model = learner.Learn(transformed, trainingTestSplit.TrainingSet.Targets);

                    Program.StatusLogger.Info("Learning completed " + grouping.First().Value.Name);

                    if (model.GetRawVariableImportance().Any(a => a > 0))
                    {
                        var importanceSummary = string.Join(",\r\n", model.GetRawVariableImportance().Select((d, i) => i.ToString() + ":" + d.ToString()));
                        Program.StatusLogger.Info("Raw variable importance:\r\n" + importanceSummary);
                    }

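                    // the last row of allTransformed is the most recent observation,
                    // appended above but excluded from training; predicting it gives
                    // the next-period estimate.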
                    var lastTransformed = allTransformed.Row(transformed.RowCount);
                    var prediction      = model.Predict(lastTransformed);

                    //var before = item.Value.Item2[transformed.RowCount - _targetOffset - 1];
                    var change = -1; //Math.Round(prediction / before, 2);

                    var testPrediction = model.Predict(testSet.Observations);

                    var metric       = new MeanSquaredErrorRegressionMetric();
                    var error        = metric.Error(testSet.Targets, testPrediction);
                    var averageError = 0d;
                    lock (Locker)
                    {
                        _totalError += error;
                        itemCount++;
                        averageError = Math.Round(_totalError / itemCount, 3);
                    }
                    var isLondon = London.Contains(grouping.First().Value.Name);

                    var message = $"TotalError: {Math.Round(_totalError, 3)}, AverageError: {averageError}, Target: {_targetName}, Offset: {_targetOffset}, Region: {grouping.First().Value.Name}, London: {isLondon}, Error: {Math.Round(error, 3)}, Next: {Math.Round(prediction, 3)}, Change: {change}";

                    Program.Logger.Info(message);
                }
            });

            if (pauseAtEnd)
            {
                Console.WriteLine("Press any key to continue");
                Console.ReadKey();
            }
        }
Example #14
        public void RegressionEnsembleLearner()
        {
            #region read and split data
            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read regression targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            // Create the training/test splitter.
            // Since this is a regression problem, we use the random training/test index splitter;
            // 30 % of the data is used for the test set.
            var splitter = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24);

            var trainingTestSplit = splitter.SplitSet(observations, targets);
            var trainSet          = trainingTestSplit.TrainingSet;
            var testSet           = trainingTestSplit.TestSet;
            #endregion

            // create the list of learners to include in the ensemble
            var ensembleLearners = new IIndexedLearner <double>[]
            {
                new RegressionAdaBoostLearner(maximumTreeDepth: 15),
                new RegressionRandomForestLearner(runParallel: false),
                new RegressionSquareLossGradientBoostLearner(iterations:  198, learningRate: 0.028, maximumTreeDepth: 12,
                                                             subSampleRatio: 0.559, featuresPrSplit: 10, runParallel: false)
            };

            // create the ensemble learner
            var learner = new RegressionEnsembleLearner(learners: ensembleLearners);

            // the ensemble learner combines all the provided learners
            // into a single ensemble model.
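            // By default the individual predictions are presumably combined by
            // averaging (a mean ensemble strategy).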
            var model = learner.Learn(trainSet.Observations, trainSet.Targets);

            // predict the training and test set.
            var trainPredictions = model.Predict(trainSet.Observations);
            var testPredictions  = model.Predict(testSet.Observations);

            // since this is a regression problem we are using square error as metric
            // for evaluating how well the model performs.
            var metric = new MeanSquaredErrorRegressionMetric();

            // measure the error on training and test set.
            var trainError = metric.Error(trainSet.Targets, trainPredictions);
            var testError  = metric.Error(testSet.Targets, testPredictions);

            // The ensemble model achieves a lower test error
            // than any of the individual models:

            // RegressionAdaBoostLearner: 0.4005
            // RegressionRandomForestLearner: 0.4037
            // RegressionSquareLossGradientBoostLearner: 0.3936
            TraceTrainingAndTestError(trainError, testError);
        }
Example #15
        public void TrainingTestIndexSplitterExtensions_SplitSet_Observations_Targets_Row_Differ()
        {
            var splitter = new RandomTrainingTestIndexSplitter <double>(0.6, 32);

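            // presumably expected to throw: the observation matrix has 10 rows
            // while the targets vector has only 8 elements.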
            splitter.SplitSet(new F64Matrix(10, 2), new double[8]);
        }
Example #16
        public static double FitGBT(double[] pred_Features)
        {
            var parser     = new CsvParser(() => new StreamReader("dataset.csv"), separator: ',');
            var targetName = "Y";

            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read regression targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            var metric = new MeanSquaredErrorRegressionMetric();

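            // bounds for the random search: each row is { min, max } for one
            // hyperparameter, in the order consumed by the minimize function below.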
            var parameters = new double[][]
            {
                new double[] { 80, 300 },                     // iterations (min: 80, max: 300)
                new double[] { 0.02, 0.2 },                   // learning rate (min: 0.02, max: 0.2)
                new double[] { 8, 15 },                       // maximumTreeDepth (min: 8, max: 15)
                new double[] { 0.5, 0.9 },                    // subSampleRatio (min: 0.5, max: 0.9)
                new double[] { 1, observations.ColumnCount }, // featuresPrSplit (min: 1, max: numberOfFeatures)
            };


            var validationSplit = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24)
                                  .SplitSet(observations, targets);

            Func <double[], OptimizerResult> minimize = p =>
            {
                // create the candidate learner using the current optimization parameters

                var candidateLearner = new RegressionSquareLossGradientBoostLearner(
                    iterations: (int)p[0],
                    learningRate: p[1],
                    maximumTreeDepth: (int)p[2],
                    subSampleRatio: p[3],
                    featuresPrSplit: (int)p[4],
                    runParallel: false);

                var candidateModel = candidateLearner.Learn(validationSplit.TrainingSet.Observations,
                                                            validationSplit.TrainingSet.Targets);

                var validationPredictions = candidateModel.Predict(validationSplit.TestSet.Observations);
                var candidateError        = metric.Error(validationSplit.TestSet.Targets, validationPredictions);

                return new OptimizerResult(p, candidateError);
            };

            // Hyper-parameter tuning
            var optimizer = new RandomSearchOptimizer(parameters, iterations: 30, runParallel: true);

            var result = optimizer.OptimizeBest(minimize);
            var best   = result.ParameterSet;

            var learner = new RegressionSquareLossGradientBoostLearner(
                iterations: (int)best[0],
                learningRate: best[1],
                maximumTreeDepth: (int)best[2],
                subSampleRatio: best[3],
                featuresPrSplit: (int)best[4],
                runParallel: false);

            var model      = learner.Learn(observations, targets);
            var prediction = model.Predict(pred_Features);

            return prediction;
        }
Example #17
        public void RandomForest_Default_Parameters_Variable_Importance()
        {
            #region read and split data
            // Use StreamReader(filepath) when running from filesystem
            var parser     = new CsvParser(() => new StringReader(Resources.winequality_white));
            var targetName = "quality";

            // read feature matrix
            var observations = parser.EnumerateRows(c => c != targetName)
                               .ToF64Matrix();

            // read regression targets
            var targets = parser.EnumerateRows(targetName)
                          .ToF64Vector();

            // Create the training/test splitter.
            // Since this is a regression problem, we use the random training/test index splitter;
            // 30 % of the data is used for the test set.
            var splitter = new RandomTrainingTestIndexSplitter <double>(trainingPercentage: 0.7, seed: 24);

            var trainingTestSplit = splitter.SplitSet(observations, targets);
            var trainSet          = trainingTestSplit.TrainingSet;
            var testSet           = trainingTestSplit.TestSet;
            #endregion

            // create learner with default parameters
            var learner = new RegressionRandomForestLearner(trees: 100);

            // learn the model
            var model = learner.Learn(trainSet.Observations, trainSet.Targets);

            // predict the training and test set.
            var trainPredictions = model.Predict(trainSet.Observations);
            var testPredictions  = model.Predict(testSet.Observations);

            // since this is a regression problem we are using square error as metric
            // for evaluating how well the model performs.
            var metric = new MeanSquaredErrorRegressionMetric();

            // measure the error on training and test set.
            var trainError = metric.Error(trainSet.Targets, trainPredictions);
            var testError  = metric.Error(testSet.Targets, testPredictions);

            TraceTrainingAndTestError(trainError, testError);

            // the variable importance requires the featureNameToIndex
            // from the data set. This mapping describes the relation
            // from column name to index in the feature matrix.
            var featureNameToIndex = parser.EnumerateRows(c => c != targetName)
                                     .First().ColumnNameToIndex;

            // Get the variable importance from the model.
            // Variable importance is a measure made by the model
            // of how important each feature is.
            var importances = model.GetVariableImportance(featureNameToIndex);

            // trace normalized importances as csv.
            var importanceCsv = new StringBuilder();
            importanceCsv.Append("FeatureName;Importance");
            foreach (var feature in importances)
            {
                importanceCsv.AppendLine();
                importanceCsv.Append(string.Format("{0};{1:0.00}",
                                                   feature.Key, feature.Value));
            }

            Trace.WriteLine(importanceCsv);
        }