FastForest( this SweepableRegressionTrainers trainers, string labelColumnName = "Label", string featureColumnName = "Features", SweepableOption <FastForestRegressionTrainer.Options> optionSweeper = null, FastForestRegressionTrainer.Options defaultOption = null) { var context = trainers.Context; if (optionSweeper == null) { optionSweeper = FastForestRegressionTrainerSweepableOptions.Default; } optionSweeper.SetDefaultOption(defaultOption); return(context.AutoML().CreateSweepableEstimator( (context, option) => { option.LabelColumnName = labelColumnName; option.FeatureColumnName = featureColumnName; return context.Regression.Trainers.FastForest(option); }, optionSweeper, new string[] { labelColumnName, featureColumnName }, new string[] { Score }, nameof(FastForestRegressionTrainer))); }
/// <summary>
/// Creates a <see cref="FastForestRegressionTrainer"/> configured through advanced options; the trainer
/// predicts a target using a decision tree regression model.
/// </summary>
/// <param name="catalog">The <see cref="RegressionCatalog"/> trainer catalog this method extends.</param>
/// <param name="options">Trainer options.</param>
/// <returns>
/// A new <see cref="FastForestRegressionTrainer"/> built from the catalog's environment and
/// <paramref name="options"/>.
/// </returns>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FastForestRegression](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/FastForestWithOptions.cs)]
/// ]]>
/// </format>
/// </example>
public static FastForestRegressionTrainer FastForest(
    this RegressionCatalog.RegressionTrainers catalog,
    FastForestRegressionTrainer.Options options)
{
    // Both arguments are checked before the catalog is used to look up the environment.
    Contracts.CheckValue(catalog, nameof(catalog));
    Contracts.CheckValue(options, nameof(options));

    return new FastForestRegressionTrainer(CatalogUtils.GetEnvironment(catalog), options);
}
/// <summary>
/// Fits a pipeline (fast forest regression featurizer, column concatenation, SDCA regression), checks its
/// error metrics, then saves the model to a temporary file, loads it back, and verifies that the loaded
/// model yields exactly the same metrics.
/// </summary>
public void TestSaveAndLoadTreeFeaturizer()
{
    int sampleCount = 200;
    var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(sampleCount).ToList();
    var trainingView = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

    // Configuration of the forest that backs the featurizer.
    var forestOptions = new FastForestRegressionTrainer.Options
    {
        NumberOfThreads = 1,
        NumberOfTrees = 10,
        NumberOfLeaves = 4,
        MinimumExampleCountPerLeaf = 10,
        FeatureColumnName = "Features",
        LabelColumnName = "Label"
    };

    // Configuration of the featurizer itself: one input column and three output columns.
    var featurizerOptions = new FastForestRegressionFeaturizationEstimator.Options()
    {
        InputColumnName = "Features",
        TreesColumnName = "Trees",
        LeavesColumnName = "Leaves",
        PathsColumnName = "Paths",
        TrainerOptions = forestOptions
    };

    // Stages are created in the same order as a single fluent chain would create them.
    var featurizer = ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions);
    var withCombinedFeatures = featurizer.Append(
        ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"));
    var pipeline = withCombinedFeatures.Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));

    var trainedModel = pipeline.Fit(trainingView);
    var trainedMetrics = ML.Regression.Evaluate(trainedModel.Transform(trainingView));

    Assert.True(trainedMetrics.MeanAbsoluteError < 0.25);
    Assert.True(trainedMetrics.MeanSquaredError < 0.1);

    // Round-trip the trained model through a temporary file.
    ITransformer reloadedModel = null;
    var tempPath = Path.GetTempFileName();
    using (var handle = new SimpleFileHandle(Env, tempPath, true, true))
    {
        // The write stream is closed before the read stream is opened.
        using (var writeStream = handle.CreateWriteStream())
        {
            ML.Model.Save(trainedModel, null, writeStream);
        }

        using (var readStream = handle.OpenReadStream())
        {
            // The loaded input schema is not needed by this test.
            reloadedModel = ML.Model.Load(readStream, out _);
        }
    }

    var reloadedMetrics = ML.Regression.Evaluate(reloadedModel.Transform(trainingView));

    // The reloaded model must reproduce the trained model's metrics exactly.
    Assert.Equal(trainedMetrics.MeanAbsoluteError, reloadedMetrics.MeanAbsoluteError);
    Assert.Equal(trainedMetrics.MeanSquaredError, reloadedMetrics.MeanSquaredError);
}
/// <summary>
/// Builds a fast forest regression trainer from the given option values.
/// </summary>
/// <param name="context">The <see cref="MLContext"/> whose regression trainer catalog creates the estimator.</param>
/// <param name="param">Supplies the tree count, feature fraction and column names copied into the trainer options.</param>
/// <returns>The estimator returned by the regression catalog's <c>FastForest</c> method.</returns>
public override IEstimator<ITransformer> BuildFromOption(MLContext context, FastForestOption param)
{
    var trainerOptions = new FastForestRegressionTrainer.Options();

    // Values copied one-to-one from the supplied option.
    trainerOptions.NumberOfTrees = param.NumberOfTrees;
    trainerOptions.FeatureFraction = param.FeatureFraction;
    trainerOptions.LabelColumnName = param.LabelColumnName;
    trainerOptions.FeatureColumnName = param.FeatureColumnName;
    trainerOptions.ExampleWeightColumnName = param.ExampleWeightColumnName;

    // The thread count comes from AutoMlUtils rather than from the option.
    trainerOptions.NumberOfThreads = AutoMlUtils.GetNumberOfThreadFromEnvrionment();

    return context.Regression.Trainers.FastForest(trainerOptions);
}
/// <summary>
/// Trains a multiclass pipeline whose features come solely from a fast forest regression featurizer
/// (trees, leaves and paths columns) and checks that macro and micro accuracy on the held-out half of
/// the data both exceed 0.6.
/// </summary>
public void TreeEnsembleFeaturizingPipelineMulticlass()
{
    int exampleCount = 1000;
    var examples = SamplesUtils.DatasetUtils.GenerateRandomMulticlassClassificationExamples(exampleCount).ToList();
    var fullView = ML.Data.Cache(ML.Data.LoadFromEnumerable(examples));

    // The forest is trained against the float copy of the label produced by the custom mapping below.
    var forestOptions = new FastForestRegressionTrainer.Options
    {
        NumberOfThreads = 1,
        NumberOfTrees = 10,
        NumberOfLeaves = 4,
        MinimumExampleCountPerLeaf = 10,
        FeatureColumnName = "Features",
        LabelColumnName = "FloatLabel",
        ShuffleLabels = true
    };

    var featurizerOptions = new FastForestRegressionFeaturizationEstimator.Options()
    {
        InputColumnName = "Features",
        TreesColumnName = "Trees",
        LeavesColumnName = "Leaves",
        PathsColumnName = "Paths",
        TrainerOptions = forestOptions
    };

    // Key value 0 is mapped to NaN; every other key value is shifted down by one.
    Action<RowWithKey, RowWithFloat> keyToFloat = (keyRow, floatRow) =>
    {
        if (keyRow.KeyLabel == 0)
        {
            floatRow.FloatLabel = float.NaN;
        }
        else
        {
            floatRow.FloatLabel = keyRow.KeyLabel - 1;
        }
    };

    // Half of the rows are used for training, the other half for evaluation.
    var split = ML.Data.TrainTestSplit(fullView, 0.5);

    var pipeline = ML.Transforms.Conversion.MapValueToKey("KeyLabel", "Label")
        .Append(ML.Transforms.CustomMapping(keyToFloat, "KeyLabel"))
        .Append(ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions))
        .Append(ML.Transforms.Concatenate("CombinedFeatures", "Trees", "Leaves", "Paths"))
        .Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("KeyLabel", "CombinedFeatures"));

    var trainedModel = pipeline.Fit(split.TrainSet);
    var scoredTestData = trainedModel.Transform(split.TestSet);
    var accuracy = ML.MulticlassClassification.Evaluate(scoredTestData, labelColumnName: "KeyLabel");

    Assert.True(accuracy.MacroAccuracy > 0.6);
    Assert.True(accuracy.MicroAccuracy > 0.6);
}
/// <summary>
/// Builds the training pipeline: one-hot encodes the team, match date and opponent columns, concatenates
/// them with "IsHome" into "Features", and appends a fast forest regression trainer.
/// </summary>
/// <returns>The estimator chain ending in the fast forest regression trainer.</returns>
private static EstimatorChain<RegressionPredictionTransformer<FastForestRegressionModelParameters>> BuildPipeline()
{
    var forestOptions = new FastForestRegressionTrainer.Options()
    {
        NumberOfTrees = 500,
        MinimumExampleCountPerLeaf = 100
    };

    // Data preparation: each stage is created and appended in the same order as a single fluent chain.
    var teamEncoder = mlContext.Transforms.Categorical.OneHotEncoding("TeamEncoded", nameof(Match.Team));
    var withMatchDate = teamEncoder.Append(
        mlContext.Transforms.Categorical.OneHotEncoding("MatchDateEncoded", nameof(Match.MatchDate)));
    var withOpponent = withMatchDate.Append(
        mlContext.Transforms.Categorical.OneHotEncoding("OpponentEncoded", nameof(Match.Opponent)));
    var withFeatures = withOpponent.Append(
        mlContext.Transforms.Concatenate(
            "Features", "IsHome", "TeamEncoded", "OpponentEncoded", "MatchDateEncoded"));

    // The trainer is the last stage of the chain.
    return withFeatures.Append(mlContext.Regression.Trainers.FastForest(forestOptions));
}
/// <summary>
/// Fits a pipeline made of a fast forest regression featurizer, a concatenation of the original and
/// tree-derived columns, and an SDCA regression trainer, then checks the resulting error metrics on the
/// training data against fixed thresholds.
/// </summary>
public void TestFastForestRegressionFeaturizationInPipeline()
{
    int sampleCount = 200;
    var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(sampleCount).ToList();
    var trainingView = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

    // Forest configuration used by the featurizer.
    var forestOptions = new FastForestRegressionTrainer.Options
    {
        NumberOfThreads = 1,
        NumberOfTrees = 10,
        NumberOfLeaves = 4,
        MinimumExampleCountPerLeaf = 10,
        FeatureColumnName = "Features",
        LabelColumnName = "Label"
    };

    var featurizerOptions = new FastForestRegressionFeaturizationEstimator.Options()
    {
        InputColumnName = "Features",
        TreesColumnName = "Trees",
        LeavesColumnName = "Leaves",
        PathsColumnName = "Paths",
        TrainerOptions = forestOptions
    };

    // Stages are created in the same order as a single fluent chain would create them.
    var featurizer = ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions);
    var withCombinedFeatures = featurizer.Append(
        ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"));
    var pipeline = withCombinedFeatures.Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));

    var trainedModel = pipeline.Fit(trainingView);
    var scored = trainedModel.Transform(trainingView);
    var regressionMetrics = ML.Regression.Evaluate(scored);

    Assert.True(regressionMetrics.MeanAbsoluteError < 0.25);
    Assert.True(regressionMetrics.MeanSquaredError < 0.1);
}
/// <summary>
/// Fits a pipeline whose featurizer reads a copied feature column, verifies its metrics, checks that the
/// model gives identical metrics after a save/load round trip through a temporary file, and finally
/// confirms that a second pipeline with an added binning normalizer produces different metrics.
/// </summary>
public void TestSaveAndLoadDoubleTreeFeaturizer()
{
    int sampleCount = 200;
    var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(sampleCount).ToList();
    var trainingView = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

    var forestOptions = new FastForestRegressionTrainer.Options
    {
        NumberOfThreads = 1,
        NumberOfTrees = 10,
        NumberOfLeaves = 4,
        MinimumExampleCountPerLeaf = 10,
        FeatureColumnName = "Features",
        LabelColumnName = "Label"
    };

    // The trainer options name "Features" as the feature column, while the featurizer's input column is
    // "CopiedFeatures".
    var featurizerOptions = new FastForestRegressionFeaturizationEstimator.Options()
    {
        InputColumnName = "CopiedFeatures",
        TrainerOptions = forestOptions,
        TreesColumnName = "OhMyTrees",
        LeavesColumnName = "OhMyLeaves",
        PathsColumnName = "OhMyPaths"
    };

    var firstPipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features")
        .Append(ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions))
        .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths"))
        .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));

    var firstModel = firstPipeline.Fit(trainingView);
    var firstMetrics = ML.Regression.Evaluate(firstModel.Transform(trainingView));

    Assert.True(firstMetrics.MeanAbsoluteError < 0.25);
    Assert.True(firstMetrics.MeanSquaredError < 0.1);

    // Write the trained model to a temporary file, then read it back.
    ITransformer reloadedModel = null;
    var tempPath = Path.GetTempFileName();
    using (var handle = new SimpleFileHandle(Env, tempPath, true, true))
    {
        // The write stream is closed before the read stream is opened.
        using (var writeStream = handle.CreateWriteStream())
        {
            ML.Model.Save(firstModel, null, writeStream);
        }

        using (var readStream = handle.OpenReadStream())
        {
            // The loaded input schema is not needed by this test.
            reloadedModel = ML.Model.Load(readStream, out _);
        }
    }

    // Score the same data with the reloaded model.
    var reloadedMetrics = ML.Regression.Evaluate(reloadedModel.Transform(trainingView));

    // The reloaded model must reproduce the trained model's metrics exactly.
    Assert.Equal(firstMetrics.MeanAbsoluteError, reloadedMetrics.MeanAbsoluteError);
    Assert.Equal(firstMetrics.MeanSquaredError, reloadedMetrics.MeanSquaredError);

    // Same stages as the first pipeline plus bin-based normalization of "CopiedFeatures" ahead of the
    // featurizer.
    var secondPipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features")
        .Append(ML.Transforms.NormalizeBinning("CopiedFeatures"))
        .Append(ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions))
        .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths"))
        .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));

    var secondModel = secondPipeline.Fit(trainingView);
    var secondMetrics = ML.Regression.Evaluate(secondModel.Transform(trainingView));

    // Because of the added normalization step, the second pipeline is expected to score differently
    // from the first one.
    Assert.NotEqual(firstMetrics.MeanAbsoluteError, secondMetrics.MeanAbsoluteError);
    Assert.NotEqual(firstMetrics.MeanSquaredError, secondMetrics.MeanSquaredError);
}
/// <summary>
/// Initializes the estimator: forwards <paramref name="env"/> and <paramref name="options"/> to the base
/// constructor and keeps the trainer options carried by <paramref name="options"/>.
/// </summary>
/// <param name="env">The host environment passed to the base constructor.</param>
/// <param name="options">Featurizer options; its <c>TrainerOptions</c> value is stored on this instance.</param>
internal FastForestRegressionFeaturizationEstimator(IHostEnvironment env, Options options)
    : base(env, options)
    => _trainerOptions = options.TrainerOptions;
// This example requires installation of additional NuGet package
// <a href="https://www.nuget.org/packages/Microsoft.ML.FastTree/">Microsoft.ML.FastTree</a>.
public static void Example()
{
    // The ML.NET context serves as the catalog of operations, the exception tracking and logging
    // facility, and the source of randomness. A fixed seed keeps this example's output deterministic.
    var mlContext = new MLContext(seed: 0);

    // Generate the training points and expose them as an IDataView, the data type consumed by the
    // ML.NET API.
    var trainingPoints = GenerateRandomDataPoints(1000);
    var trainingView = mlContext.Data.LoadFromEnumerable(trainingPoints);

    // Configure the trainer.
    var trainerOptions = new FastForestRegressionTrainer.Options
    {
        LabelColumnName = nameof(DataPoint.Label),
        FeatureColumnName = nameof(DataPoint.Features),
        // Use only 80% of the features to reduce over-fitting.
        FeatureFraction = 0.8,
        // Penalize the first use of a feature to obtain a simpler model.
        FeatureFirstUsePenalty = 0.1,
        // Lower the number of trees to 50.
        NumberOfTrees = 50
    };

    // Create the trainer and fit it to the training data.
    var trainer = mlContext.Regression.Trainers.FastForest(trainerOptions);
    var trainedModel = trainer.Fit(trainingView);

    // Build a test set with a different random seed so that it differs from the training data.
    var testView = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(5, seed: 123));

    // Score the test set.
    var scoredTestView = trainedModel.Transform(testView);

    // Materialize the scored rows as a list of Prediction objects.
    var scoredRows = mlContext.Data
        .CreateEnumerable<Prediction>(scoredTestView, reuseRowObject: false)
        .ToList();

    // Print each of the 5 predictions next to its actual label for comparison.
    foreach (var row in scoredRows)
    {
        Console.WriteLine($"Label: {row.Label:F3}, Prediction: {row.Score:F3}");
    }

    // Expected output:
    //   Label: 0.985, Prediction: 0.866
    //   Label: 0.155, Prediction: 0.171
    //   Label: 0.515, Prediction: 0.470
    //   Label: 0.566, Prediction: 0.476
    //   Label: 0.096, Prediction: 0.140

    // Compute and print the overall regression metrics.
    var regressionMetrics = mlContext.Regression.Evaluate(scoredTestView);
    PrintMetrics(regressionMetrics);

    // Expected output:
    //   Mean Absolute Error: 0.06
    //   Mean Squared Error: 0.01
    //   Root Mean Squared Error: 0.07
    //   RSquared: 0.95 (closer to 1 is better. The worst case is 0)
}
// This example requires installation of additional NuGet package
// <a href="https://www.nuget.org/packages/Microsoft.ML.FastTree/">Microsoft.ML.FastTree</a>.
public static void Example()
{
    // The ML.NET context serves as the catalog of operations, the exception
    // tracking and logging facility, and the source of randomness. A fixed
    // seed keeps this example's output deterministic.
    var mlContext = new MLContext(seed: 0);

    // Generate the data points and keep them in a list so individual points
    // can be indexed when printing below.
    var originalPoints = GenerateRandomDataPoints(100).ToList();

    // Expose the list as an IDataView, the data type consumed by the ML.NET
    // API.
    var inputView = mlContext.Data.LoadFromEnumerable(originalPoints);

    // ML.NET does not cache data sets by default, so data that is read from
    // a file and accessed many times can be slow because of expensive
    // featurization and disk operations. When the data fits into memory,
    // caching it there avoids that cost, which helps most with iterative
    // algorithms that make many passes over the data.
    inputView = mlContext.Data.Cache(inputView);

    // Names of the featurizer's input and output columns.
    string labelColumnName = nameof(DataPoint.Label);
    string featureColumnName = nameof(DataPoint.Features);
    string treesColumnName = nameof(TransformedDataPoint.Trees);
    string leavesColumnName = nameof(TransformedDataPoint.Leaves);
    string pathsColumnName = nameof(TransformedDataPoint.Paths);

    // Configuration of the trainer that produces the tree-based model.
    var forestOptions = new FastForestRegressionTrainer.Options
    {
        // Use only 80% of the features to reduce over-fitting.
        FeatureFraction = 0.8,
        // Penalize the first use of a feature to obtain a simpler model.
        FeatureFirstUsePenalty = 0.1,
        // Lower the number of trees to 3.
        NumberOfTrees = 3,
        // Number of leaves in each tree.
        NumberOfLeaves = 6,
        LabelColumnName = labelColumnName,
        FeatureColumnName = featureColumnName
    };

    // Configuration of the tree-based featurizer.
    var featurizerOptions = new FastForestRegressionFeaturizationEstimator.Options
    {
        InputColumnName = featureColumnName,
        TreesColumnName = treesColumnName,
        LeavesColumnName = leavesColumnName,
        PathsColumnName = pathsColumnName,
        TrainerOptions = forestOptions
    };

    // Create the featurizer and fit it.
    var featurizer = mlContext.Transforms.FeaturizeByFastForestRegression(featurizerOptions);
    var fittedFeaturizer = featurizer.Fit(inputView);

    // Apply the fitted featurizer to the same data it was trained on.
    var featurizedView = fittedFeaturizer.Transform(inputView);

    // Materialize the result as a list in which each element corresponds to
    // one row of the IDataView.
    var featurizedPoints = mlContext.Data
        .CreateEnumerable<TransformedDataPoint>(featurizedView, reuseRowObject: false)
        .ToList();

    // Show how the first 3 data points were transformed.
    for (int index = 0; index < 3; ++index)
    {
        var original = originalPoints[index];
        var featurized = featurizedPoints[index];

        var featureText = String.Join(",", original.Features);
        Console.WriteLine("The original feature vector [" + featureText +
            "] is transformed to three different " +
            "tree-based feature vectors:");

        var treesText = String.Join(",", featurized.Trees);
        Console.WriteLine(" Trees' output values: [" + treesText + "].");

        var leavesText = String.Join(",", featurized.Leaves);
        Console.WriteLine(" Leave IDs' 0-1 representation: [" + leavesText + "].");

        var pathsText = String.Join(",", featurized.Paths);
        Console.WriteLine(" Paths IDs' 0-1 representation: [" + pathsText + "].");
    }

    // Expected output:
    //   The original feature vector[1.543569, 1.494266, 1.284405] is
    //   transformed to three different tree - based feature vectors:
    //     Trees' output values: [0.7291142,0.7825329,0.8764582].
    //     Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0].
    //     Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1].
    //   The original feature vector[0.764918, 1.11206, 0.648211] is
    //   transformed to three different tree - based feature vectors:
    //     Trees' output values: [0.3802337,0.584159,0.5648927].
    //     Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0].
    //     Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,1,0,0].
    //   The original feature vector[1.251254, 1.269456, 1.444864] is
    //   transformed to three different tree - based feature vectors:
    //     Trees' output values: [0.7591804,0.7825329,0.7443035].
    //     Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1].
    //     Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1].
}