// Trains a random forest classifier on a random 50/50 split of the data and
// returns the ROC AUC measured on the held-out half, folded onto [0.5, 1].
private float GetAUROC(F64Matrix observations, double[] targets)
{
    // Dump the raw classification data for inspection.
    PrintArray(observations);
    PrintVector(targets);

    // Randomly split: half the rows for training, half for testing.
    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.5);
    var split = indexSplitter.SplitSet(observations, targets);

    // Fit the forest on the training half.
    var forestLearner = new ClassificationRandomForestLearner();
    var forest = forestLearner.Learn(split.TrainingSet.Observations, split.TrainingSet.Targets);

    // Probability predictions are required by the ROC AUC metric.
    var probabilities = forest.PredictProbability(split.TestSet.Observations);

    // Measure AUC for the positive class (label 1).
    var aucMetric = new RocAucClassificationProbabilityMetric(1);
    var score = (float)aucMetric.Error(split.TestSet.Targets, probabilities);

    // Fold scores below 0.5 so the result always lands on the upper half.
    return score < .5f ? 1f - score : score;
}
// Trains a default-parameter random forest regressor on the wine-quality data,
// traces its errors, then round-trips the model through Save/Load (xml by default).
public void RandomForest_Default_Parameters_Save_Load_Model_Using_Static_Methods()
{
    #region read and split data
    // Use StreamReader(filepath) when running from the filesystem.
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // Feature matrix: every column except the target.
    var observations = parser.EnumerateRows(c => c != targetName).ToF64Matrix();

    // Regression targets.
    var targets = parser.EnumerateRows(targetName).ToF64Vector();

    // Regression problem, so a random training/test split is used;
    // 30 % of the rows go to the test set.
    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24);
    var split = indexSplitter.SplitSet(observations, targets);
    var trainSet = split.TrainingSet;
    var testSet = split.TestSet;
    #endregion

    // Learner with default parameters.
    var learner = new RegressionRandomForestLearner(trees: 100);
    var model = learner.Learn(trainSet.Observations, trainSet.Targets);

    // Predict both partitions.
    var trainPredictions = model.Predict(trainSet.Observations);
    var testPredictions = model.Predict(testSet.Observations);

    // Mean squared error is the evaluation metric for this regression problem.
    var metric = new MeanSquaredErrorRegressionMetric();
    var trainError = metric.Error(trainSet.Targets, trainPredictions);
    var testError = metric.Error(testSet.Targets, testPredictions);
    TraceTrainingAndTestError(trainError, testError);

    // Save the model; in the filesystem use new StreamWriter(filePath).
    var savedModel = new StringWriter();
    model.Save(() => savedModel);

    // Load it back; in the filesystem use new StreamReader(filePath).
    var loadedModel = RegressionForestModel.Load(() => new StringReader(savedModel.ToString()));
}
// Early stopping with only 5 boosting iterations but 5 early-stopping rounds —
// exercises the "too few iterations" path of LearnWithEarlyStopping.
public void RegressionGradientBoostLearner_LearnWithEarlyStopping_ToFewIterations()
{
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(0.6, 1234);
    var sets = indexSplitter.SplitSet(observations, targets);
    var training = sets.TrainingSet;
    var test = sets.TestSet;

    var sut = new RegressionSquareLossGradientBoostLearner(5, 0.1, 3, 1, 1e-6, 1.0, 0, false);
    var evaluator = new MeanSquaredErrorRegressionMetric();

    var model = sut.LearnWithEarlyStopping(
        training.Observations, training.Targets,
        test.Observations, test.Targets,
        evaluator, 5);
}
// Early stopping with only 5 boosting iterations but 5 early-stopping rounds —
// exercises the "too few iterations" path of LearnWithEarlyStopping.
public void RegressionGradientBoostLearner_LearnWithEarlyStopping_ToFewIterations()
{
    // Load the decision-tree sample data from the embedded resource.
    var parser = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
    var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
    var targets = parser.EnumerateRows("T").ToF64Vector();

    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(0.6, 1234);
    var sets = indexSplitter.SplitSet(observations, targets);
    var training = sets.TrainingSet;
    var test = sets.TestSet;

    var sut = new RegressionSquareLossGradientBoostLearner(5, 0.1, 3, 1, 1e-6, 1.0, 0, false);
    var evaluator = new MeanSquaredErrorRegressionMetric();

    var model = sut.LearnWithEarlyStopping(
        training.Observations, training.Targets,
        test.Observations, test.Targets,
        evaluator, 5);
}
// Demonstrates a random training/test split on the wine-quality data and
// contrasts the honest test-set error with the optimistic training-set error.
public void TrainingTestSplitter_SplitSet()
{
    #region Read data
    // Use StreamReader(filepath) when running from the filesystem.
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // Feature matrix: every column different from the target name.
    var observations = parser.EnumerateRows(c => c != targetName).ToF64Matrix();

    // Targets.
    var targets = parser.EnumerateRows(targetName).ToF64Vector();
    #endregion

    // Observations are shuffled randomly before the split.
    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24);
    var split = indexSplitter.SplitSet(observations, targets);
    var trainingSet = split.TrainingSet;
    var testSet = split.TestSet;

    var learner = new RegressionDecisionTreeLearner();
    var model = learner.Learn(trainingSet.Observations, trainingSet.Targets);

    // Predict the test set.
    var testPredictions = model.Predict(testSet.Observations);

    // Metric for measuring model error.
    var metric = new MeanSquaredErrorRegressionMetric();

    // The test set provides an estimate of how the model will perform on unseen data.
    Trace.WriteLine("Test error: " + metric.Error(testSet.Targets, testPredictions));

    // Predict the training set for comparison.
    var trainingPredictions = model.Predict(trainingSet.Observations);

    // The training set is NOT a good estimate of performance on unseen data.
    Trace.WriteLine("Training error: " + metric.Error(trainingSet.Targets, trainingPredictions));
}
// Verifies that early stopping halts boosting well before the 1000-iteration
// budget (at 40 trees) and that the resulting test error matches the baseline.
public void RegressionGradientBoostLearner_LearnWithEarlyStopping()
{
    var (observations, targets) = DataSetUtilities.LoadDecisionTreeDataSet();

    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(0.6, 1234);
    var sets = indexSplitter.SplitSet(observations, targets);
    var training = sets.TrainingSet;
    var test = sets.TestSet;

    var sut = new RegressionSquareLossGradientBoostLearner(1000, 0.1, 3, 1, 1e-6, 1.0, 0, false);
    var evaluator = new MeanSquaredErrorRegressionMetric();

    var model = sut.LearnWithEarlyStopping(
        training.Observations, training.Targets,
        test.Observations, test.Targets,
        evaluator, 5);

    var predictions = model.Predict(test.Observations);
    var actual = evaluator.Error(test.Targets, predictions);

    Assert.AreEqual(0.061035472792879512, actual, 0.000001);
    Assert.AreEqual(40, model.Trees.Length);
}
// Trains a default-parameter random forest regressor on the wine-quality
// data and traces training and test mean squared error.
public void RandomForest_Default_Parameters()
{
    // Use StreamReader(filepath) when running from the filesystem.
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // Feature matrix: every column except the target.
    var observations = parser.EnumerateRows(c => c != targetName).ToF64Matrix();

    // Regression targets.
    var targets = parser.EnumerateRows(targetName).ToF64Vector();

    // Regression problem, so a random training/test split is used;
    // 30 % of the rows go to the test set.
    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24);
    var split = indexSplitter.SplitSet(observations, targets);
    var trainSet = split.TrainingSet;
    var testSet = split.TestSet;

    // Create the learner and learn the model.
    var learner = new RegressionRandomForestLearner(trees: 100);
    var model = learner.Learn(trainSet.Observations, trainSet.Targets);

    // Predict both partitions.
    var trainPredictions = model.Predict(trainSet.Observations);
    var testPredictions = model.Predict(testSet.Observations);

    // Mean squared error is the evaluation metric for this regression problem.
    var metric = new MeanSquaredErrorRegressionMetric();
    var trainError = metric.Error(trainSet.Targets, trainPredictions);
    var testError = metric.Error(testSet.Targets, testPredictions);

    TraceTrainingAndTestError(trainError, testError);
}
// Verifies that early stopping halts boosting well before the 1000-iteration
// budget (at 40 trees) and that the resulting test error matches the baseline.
public void RegressionGradientBoostLearner_LearnWithEarlyStopping()
{
    // Load the decision-tree sample data from the embedded resource.
    var parser = new CsvParser(() => new StringReader(Resources.DecisionTreeData));
    var observations = parser.EnumerateRows("F1", "F2").ToF64Matrix();
    var targets = parser.EnumerateRows("T").ToF64Vector();

    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(0.6, 1234);
    var sets = indexSplitter.SplitSet(observations, targets);
    var training = sets.TrainingSet;
    var test = sets.TestSet;

    var sut = new RegressionSquareLossGradientBoostLearner(1000, 0.1, 3, 1, 1e-6, 1.0, 0, false);
    var evaluator = new MeanSquaredErrorRegressionMetric();

    var model = sut.LearnWithEarlyStopping(
        training.Observations, training.Targets,
        test.Observations, test.Targets,
        evaluator, 5);

    var predictions = model.Predict(test.Observations);
    var actual = evaluator.Error(test.Targets, predictions);

    Assert.AreEqual(0.061035472792879512, actual, 0.000001);
    Assert.AreEqual(40, model.Trees.Length);
}
// SplitSet is given 10 observation rows but only 8 targets; the mismatch
// is expected to be rejected (the test's expected-exception attribute is
// declared on the method elsewhere).
public void TrainingTestIndexSplitterExtensions_SplitSet_Observations_Targets_Row_Differ()
{
    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(0.6, 32);

    indexSplitter.SplitSet(new F64Matrix(10, 2), new double[8]);
}
// Tunes a gradient boost regressor with random search over five
// hyperparameters, then retrains with the best set and traces the errors.
public void GradientBoost_Optimize_Hyperparameters()
{
    #region read and split data
    // Use StreamReader(filepath) when running from the filesystem.
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // Feature matrix: every column except the target.
    var observations = parser.EnumerateRows(c => c != targetName).ToF64Matrix();

    // Regression targets.
    var targets = parser.EnumerateRows(targetName).ToF64Vector();

    // Regression problem, so a random training/test split is used;
    // 30 % of the rows go to the test set.
    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24);
    var split = indexSplitter.SplitSet(observations, targets);
    var trainSet = split.TrainingSet;
    var testSet = split.TestSet;
    #endregion

    // Mean squared error is the evaluation metric for this regression problem.
    var metric = new MeanSquaredErrorRegressionMetric();

    // Gradient boost usually benefits from hyperparameter tuning.
    var numberOfFeatures = trainSet.Observations.ColumnCount;

    // Search ranges for the optimizer, one spec per learner hyperparameter.
    var parameters = new IParameterSpec[]
    {
        new MinMaxParameterSpec(min: 80, max: 300,
            transform: Transform.Linear, parameterType: ParameterType.Discrete), // iterations
        new MinMaxParameterSpec(min: 0.02, max: 0.2,
            transform: Transform.Logarithmic, parameterType: ParameterType.Continuous), // learning rate
        new MinMaxParameterSpec(min: 8, max: 15,
            transform: Transform.Linear, parameterType: ParameterType.Discrete), // maximumTreeDepth
        new MinMaxParameterSpec(min: 0.5, max: 0.9,
            transform: Transform.Linear, parameterType: ParameterType.Continuous), // subSampleRatio
        new MinMaxParameterSpec(min: 1, max: numberOfFeatures,
            transform: Transform.Linear, parameterType: ParameterType.Discrete), // featuresPrSplit
    };

    // Split the training data again so a validation set measures
    // generalization during optimization; the test set stays untouched.
    var validationSplit = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24)
        .SplitSet(trainSet.Observations, trainSet.Targets);

    // Objective function the optimizer minimizes: validation error for
    // a learner built from candidate parameters p.
    Func<double[], OptimizerResult> minimize = p =>
    {
        var candidateLearner = new RegressionSquareLossGradientBoostLearner(
            iterations: (int)p[0],
            learningRate: p[1],
            maximumTreeDepth: (int)p[2],
            subSampleRatio: p[3],
            featuresPrSplit: (int)p[4],
            runParallel: false);

        var candidateModel = candidateLearner.Learn(
            validationSplit.TrainingSet.Observations,
            validationSplit.TrainingSet.Targets);

        var validationPredictions = candidateModel.Predict(validationSplit.TestSet.Observations);
        var candidateError = metric.Error(validationSplit.TestSet.Targets, validationPredictions);

        // Trace the candidate's error and parameters.
        Trace.WriteLine(string.Format("Candidate Error: {0:0.0000}, Candidate Parameters: {1}",
            candidateError, string.Join(", ", p)));

        return new OptimizerResult(p, candidateError);
    };

    // Random search over the parameter space.
    var optimizer = new RandomSearchOptimizer(parameters, iterations: 30, runParallel: true);

    // Run the search and take the best parameter set found.
    var result = optimizer.OptimizeBest(minimize);
    var best = result.ParameterSet;

    // Final learner built from the best hyperparameters.
    var learner = new RegressionSquareLossGradientBoostLearner(
        iterations: (int)best[0],
        learningRate: best[1],
        maximumTreeDepth: (int)best[2],
        subSampleRatio: best[3],
        featuresPrSplit: (int)best[4],
        runParallel: false);

    // Retrain on the full training set with the found parameters.
    var model = learner.Learn(trainSet.Observations, trainSet.Targets);

    // Predict both partitions.
    var trainPredictions = model.Predict(trainSet.Observations);
    var testPredictions = model.Predict(testSet.Observations);

    // Measure the error on training and test set.
    var trainError = metric.Error(trainSet.Targets, trainPredictions);
    var testError = metric.Error(testSet.Targets, testPredictions);

    // Report the hyperparameters the optimizer selected.
    Trace.WriteLine(string.Format("Found parameters, iterations: {0}, learning rate {1:0.000}: maximumTreeDepth: {2}, subSampleRatio {3:0.000}, featuresPrSplit: {4} ",
        (int)best[0], best[1], (int)best[2], best[3], (int)best[4]));

    TraceTrainingAndTestError(trainError, testError);
}
// Builds (or loads from a json cache) the regional UK house-price feature set,
// then trains one ensemble model per region in parallel and logs each region's
// test error plus a next-period prediction.
// NOTE(review): reads/writes instance state (_iterations, _targetName,
// _targetOffset, _totalError, extractors) declared elsewhere in the class —
// not safe to call concurrently with itself.
public void Predict(int iterations = DefaultNNIterations, int targetOffset = 1, string targetName = DefaultTargetName, bool pauseAtEnd = false)
{
    _iterations = iterations;
    _targetName = targetName;
    _targetOffset = targetOffset;

    Program.StatusLogger.Info($"Iterations: {_iterations}");
    Program.StatusLogger.Info($"Target: {_targetName}");
    Program.StatusLogger.Info($"Offset: {_targetOffset}");

    var data = new ConcurrentDictionary<int, ModelData>();

    if (File.Exists(Path()))
    {
        // Reuse the feature set produced by a previous run instead of re-extracting.
        data = JsonConvert.DeserializeObject<ConcurrentDictionary<int, ModelData>>(File.ReadAllText(Path()));
        //data = TypeSerializer.DeserializeFromReader<ConcurrentDictionary<int, ModelData>>(new StreamReader(Path()));
        Program.StatusLogger.Info("Cached data was loaded.");
    }
    else
    {
        // Source file:
        //http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/UK-HPI-full-file-2019-07.csv
        var header = File.ReadLines("UK-HPI-full-file-2019-07.csv").First();
        var columnNames = header.Split(",");
        var parser = new CsvParser(() => new StringReader(File.ReadAllText("UK-HPI-full-file-2019-07.csv")), ',', false, true);

        // Auxiliary data sets joined onto each region/date row below.
        var creditData = _creditDataExtractor.Extract();
        var populationData = _populationDataExtractor.Extract();
        var otherPopulationData = _otherPopulationDataExtractor.Extract();
        var densityData = _londonDensityDataExtractor.Extract();
        var gvaData = _gvaDataExtractor.Extract();

        var featureRows = parser.EnumerateRows().ToArray();
        var targets = parser.EnumerateRows(_targetName).ToArray();

        // Tracks region changes purely for progress logging.
        string previousKey = null;
        for (int i = 0; i < featureRows.Length; i++)
        {
            var item = featureRows[i];
            var key = item.GetValue("RegionName");
            // HPI dates are dd/MM/yyyy in en-GB format.
            var date = DateTime.ParseExact(item.GetValue("Date"), "dd/MM/yyyy", new CultureInfo("en-GB"), DateTimeStyles.AssumeLocal);

            if (key != previousKey)
            {
                Program.StatusLogger.Info($"Processing {key}");
            }
            previousKey = key;

            // All non-excluded columns parsed to doubles form the base features.
            var regionFeatures = item.GetValues(columnNames.Except(excludeColumns).ToArray()).Select(s => ParseRowValue(s));

            var creditDataKey = _creditDataExtractor.GetKey(date, creditData.Keys.ToArray());
            if (!creditData.ContainsKey(creditDataKey))
            {
                // Pad with -1 sentinels so every row keeps the same feature width.
                regionFeatures = regionFeatures.Concat(Enumerable.Repeat(-1d, creditData.Values.First().Length));
                Trace.WriteLine($"Credit data not found: {creditDataKey}");
            }
            else
            {
                regionFeatures = regionFeatures.Concat(creditData[creditDataKey]);
            }

            var modelData = new ModelData
            {
                Name = key,
                Code = item.GetValue("AreaCode"),
                Date = date,
                Observations = regionFeatures.ToArray(),
                OriginalTarget = ParseTarget(item.GetValue(_targetName))
            };

            // Append the remaining auxiliary features for this region/date.
            modelData.Observations = modelData.Observations
                .Concat(_populationDataExtractor.Get(populationData, modelData))
                .Concat(_londonDensityDataExtractor.Get(densityData, modelData))
                .Concat(_otherPopulationDataExtractor.Get(otherPopulationData, modelData))
                .Concat(_gvaDataExtractor.Get(gvaData, modelData))
                .ToArray();

            data.TryAdd(i, modelData);
        }

        // Derive the shifted prediction targets from the raw values.
        _targetCalculator.Calculate(data, _targetOffset);

        //TypeSerializer.SerializeToWriter<ConcurrentDictionary<int, ModelData>>(data, new StreamWriter(Path()));
        // Cache the extracted feature set for future runs.
        var json = JsonConvert.SerializeObject(data, Formatting.Indented);
        File.WriteAllText(Path(), json);
    }

    // Count of regions processed so far; guarded by Locker below.
    var itemCount = 0;

    // One model per region, trained in parallel across regions.
    Parallel.ForEach(data.OrderBy(o => o.Value.Date).GroupBy(g => g.Value.Name).AsParallel(), new ParallelOptions { MaxDegreeOfParallelism = -1 }, (grouping) =>
    {
        var lastDate = grouping.Last().Value.Date;
        // Only rows with a usable target take part in training/evaluation.
        var dataWithTarget = grouping.Where(s => s.Value.OriginalTarget.HasValue && s.Value.Target != -1);

        if (dataWithTarget.Any())
        {
            var allObservations = dataWithTarget.Select(s => s.Value.Observations).ToArray();
            var allTargets = dataWithTarget.Select(s => s.Value.Target).ToArray();

            //var validation = new TimeSeriesCrossValidation<double>((int)(allObservationsExceptLast.RowCount * 0.8), 0, 1);
            //var validationPredictions = validation.Validate((IIndexedLearner<double>)learner, allObservationsExceptLast, allTargetsExceptLast);
            //var crossMetric = new MeanSquaredErrorRegressionMetric();
            //var crossError = crossMetric.Error(validation.GetValidationTargets(allTargetsExceptLast), validationPredictions);
            //_totalCrossError += crossError;

            // Normalize features: mean-zero then scale into [0, 1]. The latest
            // (target-less) row is appended so it is transformed consistently.
            var meanZeroTransformer = new MeanZeroFeatureTransformer();
            var minMaxTransformer = new MinMaxTransformer(0d, 1d);
            var lastObservations = grouping.Last().Value.Observations;
            F64Matrix allTransformed = minMaxTransformer.Transform(meanZeroTransformer.Transform(allObservations.Append(lastObservations).ToArray()));
            // Drop the appended last row again to get the training/test matrix.
            var transformed = new F64Matrix(allTransformed.Rows(Enumerable.Range(0, allTransformed.RowCount - 1).ToArray()).Data(), allTransformed.RowCount - 1, allTransformed.ColumnCount);

            var splitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24);
            var trainingTestSplit = splitter.SplitSet(transformed, allTargets);
            // Reuse the variable for the training partition from here on.
            transformed = trainingTestSplit.TrainingSet.Observations;
            var testSet = trainingTestSplit.TestSet;

            //var learner = GetRandomForest();
            //var learner = GetAda();
            //var learner = GetNeuralNet(grouping.First().Value.Observations.Length, transformed.RowCount);
            var learner = GetEnsemble(grouping.First().Value.Observations.Length, transformed.RowCount);

            Program.StatusLogger.Info("Learning commenced " + grouping.First().Value.Name);
            var model = learner.Learn(transformed, trainingTestSplit.TrainingSet.Targets);
            Program.StatusLogger.Info("Learning completed " + grouping.First().Value.Name);

            if (model.GetRawVariableImportance().Any(a => a > 0))
            {
                var importanceSummary = string.Join(",\r\n", model.GetRawVariableImportance().Select((d, i) => i.ToString() + ":" + d.ToString()));
                Program.StatusLogger.Info("Raw variable importance:\r\n" + importanceSummary);
            }

            // Predict the next period from the held-back latest row.
            var lastTransformed = allTransformed.Row(transformed.RowCount);
            var prediction = model.Predict(lastTransformed);
            //var before = item.Value.Item2[transformed.RowCount - _targetOffset - 1];
            var change = -1; //Math.Round(prediction / before, 2);

            var testPrediction = model.Predict(testSet.Observations);
            var metric = new MeanSquaredErrorRegressionMetric();
            var error = metric.Error(testSet.Targets, testPrediction);

            // Accumulate the running error totals under the shared lock.
            var averageError = 0d;
            lock (Locker)
            {
                _totalError += error;
                itemCount++;
                averageError = Math.Round(_totalError / itemCount, 3);
            }

            var isLondon = London.Contains(grouping.First().Value.Name);
            var message = $"TotalError: {Math.Round(_totalError, 3)}, AverageError: {averageError}, Target: {_targetName}, Offset: {_targetOffset}, Region: {grouping.First().Value.Name}, London: {isLondon}, Error: {Math.Round(error, 3)}, Next: {Math.Round(prediction, 3)}, Change: {change}";
            Program.Logger.Info(message);
        }
    });

    if (pauseAtEnd)
    {
        Console.WriteLine("Press any key to continue");
        Console.ReadKey();
    }
}
// Combines AdaBoost, random forest, and gradient boost regressors into a
// single ensemble and traces its training and test error.
public void RegressionEnsembleLearner()
{
    #region read and split data
    // Use StreamReader(filepath) when running from the filesystem.
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // Feature matrix: every column except the target.
    var observations = parser.EnumerateRows(c => c != targetName).ToF64Matrix();

    // Regression targets.
    var targets = parser.EnumerateRows(targetName).ToF64Vector();

    // Regression problem, so a random training/test split is used;
    // 30 % of the rows go to the test set.
    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24);
    var split = indexSplitter.SplitSet(observations, targets);
    var trainSet = split.TrainingSet;
    var testSet = split.TestSet;
    #endregion

    // Learners to include in the ensemble.
    var ensembleLearners = new IIndexedLearner<double>[]
    {
        new RegressionAdaBoostLearner(maximumTreeDepth: 15),
        new RegressionRandomForestLearner(runParallel: false),
        new RegressionSquareLossGradientBoostLearner(iterations: 198, learningRate: 0.028,
            maximumTreeDepth: 12, subSampleRatio: 0.559, featuresPrSplit: 10, runParallel: false)
    };

    // The ensemble learner combines all the provided learners
    // into a single ensemble model.
    var learner = new RegressionEnsembleLearner(learners: ensembleLearners);
    var model = learner.Learn(trainSet.Observations, trainSet.Targets);

    // Predict both partitions.
    var trainPredictions = model.Predict(trainSet.Observations);
    var testPredictions = model.Predict(testSet.Observations);

    // Mean squared error is the evaluation metric for this regression problem.
    var metric = new MeanSquaredErrorRegressionMetric();
    var trainError = metric.Error(trainSet.Targets, trainPredictions);
    var testError = metric.Error(testSet.Targets, testPredictions);

    // The ensemble model achieves a lower test error
    // than any of the individual models:
    // RegressionAdaBoostLearner: 0.4005
    // RegressionRandomForestLearner: 0.4037
    // RegressionSquareLossGradientBoostLearner: 0.3936
    TraceTrainingAndTestError(trainError, testError);
}
// Trains a default-parameter random forest regressor, traces its errors,
// and then traces the model's normalized variable importances as csv.
public void RandomForest_Default_Parameters_Variable_Importance()
{
    #region read and split data
    // Use StreamReader(filepath) when running from the filesystem.
    var parser = new CsvParser(() => new StringReader(Resources.winequality_white));
    var targetName = "quality";

    // Feature matrix: every column except the target.
    var observations = parser.EnumerateRows(c => c != targetName).ToF64Matrix();

    // Regression targets.
    var targets = parser.EnumerateRows(targetName).ToF64Vector();

    // Regression problem, so a random training/test split is used;
    // 30 % of the rows go to the test set.
    var indexSplitter = new RandomTrainingTestIndexSplitter<double>(trainingPercentage: 0.7, seed: 24);
    var split = indexSplitter.SplitSet(observations, targets);
    var trainSet = split.TrainingSet;
    var testSet = split.TestSet;
    #endregion

    // Learner with default parameters.
    var learner = new RegressionRandomForestLearner(trees: 100);
    var model = learner.Learn(trainSet.Observations, trainSet.Targets);

    // Predict both partitions.
    var trainPredictions = model.Predict(trainSet.Observations);
    var testPredictions = model.Predict(testSet.Observations);

    // Mean squared error is the evaluation metric for this regression problem.
    var metric = new MeanSquaredErrorRegressionMetric();
    var trainError = metric.Error(trainSet.Targets, trainPredictions);
    var testError = metric.Error(testSet.Targets, testPredictions);
    TraceTrainingAndTestError(trainError, testError);

    // Variable importance needs the column-name-to-index mapping
    // of the feature matrix, taken from the data set.
    var featureNameToIndex = parser.EnumerateRows(c => c != targetName)
        .First().ColumnNameToIndex;

    // Importance is the model's measure of how much each feature matters.
    var importances = model.GetVariableImportance(featureNameToIndex);

    // Trace normalized importances as csv.
    var importanceCsv = new StringBuilder();
    importanceCsv.Append("FeatureName;Importance");
    foreach (var feature in importances)
    {
        importanceCsv.AppendLine();
        importanceCsv.Append(string.Format("{0};{1:0.00}", feature.Key, feature.Value));
    }

    Trace.WriteLine(importanceCsv);
}