Example #1
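        // Registers FastForestRegressionTrainer as a sweepable estimator: the caller may supply a custom
        // option sweeper and default options; otherwise the default sweepable options are used.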
        FastForest(
            this SweepableRegressionTrainers trainers,
            string labelColumnName   = "Label",
            string featureColumnName = "Features",
            SweepableOption<FastForestRegressionTrainer.Options> optionSweeper = null,
            FastForestRegressionTrainer.Options defaultOption = null)
        {
            var context = trainers.Context;

            if (optionSweeper == null)
            {
                optionSweeper = FastForestRegressionTrainerSweepableOptions.Default;
            }

            optionSweeper.SetDefaultOption(defaultOption);

            return context.AutoML().CreateSweepableEstimator(
                (ctx, option) =>
                {
                    option.LabelColumnName = labelColumnName;
                    option.FeatureColumnName = featureColumnName;

                    return ctx.Regression.Trainers.FastForest(option);
                },
                optionSweeper,
                new string[] { labelColumnName, featureColumnName },
                new string[] { Score },
                nameof(FastForestRegressionTrainer));
        }
Example #2
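        // Catalog extension that validates its arguments and creates the trainer from advanced options.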
        /// <summary>
        /// Create <see cref="FastForestRegressionTrainer"/> with advanced options, which predicts a target using a decision tree regression model.
        /// </summary>
        /// <param name="catalog">The <see cref="RegressionCatalog"/>.</param>
        /// <param name="options">Trainer options.</param>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[FastForestRegression](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/FastForestWithOptions.cs)]
        /// ]]>
        /// </format>
        /// </example>
        public static FastForestRegressionTrainer FastForest(this RegressionCatalog.RegressionTrainers catalog,
                                                             FastForestRegressionTrainer.Options options)
        {
            Contracts.CheckValue(catalog, nameof(catalog));
            Contracts.CheckValue(options, nameof(options));

            var env = CatalogUtils.GetEnvironment(catalog);

            return new FastForestRegressionTrainer(env, options);
        }
Example #3
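        // Trains a FastForest-based tree featurizer inside an SDCA regression pipeline, saves the model
        // to disk, reloads it, and verifies that the reloaded model reproduces the same metrics.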
        public void TestSaveAndLoadTreeFeaturizer()
        {
            int dataPointCount = 200;
            var data           = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList();
            var dataView       = ML.Data.LoadFromEnumerable(data);

            dataView = ML.Data.Cache(dataView);

            var trainerOptions = new FastForestRegressionTrainer.Options
            {
                NumberOfThreads            = 1,
                NumberOfTrees              = 10,
                NumberOfLeaves             = 4,
                MinimumExampleCountPerLeaf = 10,
                FeatureColumnName          = "Features",
                LabelColumnName            = "Label"
            };

            var options = new FastForestRegressionFeaturizationEstimator.Options()
            {
                InputColumnName  = "Features",
                TreesColumnName  = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName  = "Paths",
                TrainerOptions   = trainerOptions
            };

            var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options)
                           .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                           .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));
            var model      = pipeline.Fit(dataView);
            var prediction = model.Transform(dataView);
            var metrics    = ML.Regression.Evaluate(prediction);

            Assert.True(metrics.MeanAbsoluteError < 0.25);
            Assert.True(metrics.MeanSquaredError < 0.1);

            // Save the trained model into file.
            ITransformer loadedModel = null;
            var          tempPath    = Path.GetTempFileName();

            using (var file = new SimpleFileHandle(Env, tempPath, true, true))
            {
                using (var fs = file.CreateWriteStream())
                    ML.Model.Save(model, null, fs);

                using (var fs = file.OpenReadStream())
                    loadedModel = ML.Model.Load(fs, out var schema);
            }
            var loadedPrediction = loadedModel.Transform(dataView);
            var loadedMetrics    = ML.Regression.Evaluate(loadedPrediction);

            Assert.Equal(metrics.MeanAbsoluteError, loadedMetrics.MeanAbsoluteError);
            Assert.Equal(metrics.MeanSquaredError, loadedMetrics.MeanSquaredError);
        }
Example #4
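        // Translates a sampled FastForestOption from the AutoML search space into concrete trainer
        // options and builds the corresponding estimator.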
        public override IEstimator<ITransformer> BuildFromOption(MLContext context, FastForestOption param)
        {
            var option = new FastForestRegressionTrainer.Options()
            {
                NumberOfTrees           = param.NumberOfTrees,
                FeatureFraction         = param.FeatureFraction,
                LabelColumnName         = param.LabelColumnName,
                FeatureColumnName       = param.FeatureColumnName,
                ExampleWeightColumnName = param.ExampleWeightColumnName,
                NumberOfThreads         = AutoMlUtils.GetNumberOfThreadFromEnvrionment(),
            };

            return context.Regression.Trainers.FastForest(option);
        }
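For reference, `BuildFromOption` only reads a handful of members from its `FastForestOption` argument. Below is a minimal sketch of such a parameter type, assuming plain properties; the actual AutoML search-space class may define more members and attach range attributes, and the defaults shown are illustrative only.

        // Hypothetical minimal shape of the search-space sample consumed above;
        // only the members that BuildFromOption reads are listed.
        public class FastForestOption
        {
            public int NumberOfTrees { get; set; } = 100;
            public double FeatureFraction { get; set; } = 1.0;
            public string LabelColumnName { get; set; } = "Label";
            public string FeatureColumnName { get; set; } = "Features";
            public string ExampleWeightColumnName { get; set; }
        }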
Example #5
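        // Converts a key-typed multiclass label to a float label, featurizes the data with a FastForest
        // regression tree ensemble, and trains a maximum-entropy classifier on the resulting tree features.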
        public void TreeEnsembleFeaturizingPipelineMulticlass()
        {
            int dataPointCount = 1000;
            var data           = SamplesUtils.DatasetUtils.GenerateRandomMulticlassClassificationExamples(dataPointCount).ToList();
            var dataView       = ML.Data.LoadFromEnumerable(data);

            dataView = ML.Data.Cache(dataView);

            var trainerOptions = new FastForestRegressionTrainer.Options
            {
                NumberOfThreads            = 1,
                NumberOfTrees              = 10,
                NumberOfLeaves             = 4,
                MinimumExampleCountPerLeaf = 10,
                FeatureColumnName          = "Features",
                LabelColumnName            = "FloatLabel",
                ShuffleLabels              = true
            };

            var options = new FastForestRegressionFeaturizationEstimator.Options()
            {
                InputColumnName  = "Features",
                TreesColumnName  = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName  = "Paths",
                TrainerOptions   = trainerOptions
            };

            Action<RowWithKey, RowWithFloat> actionConvertKeyToFloat = (RowWithKey rowWithKey, RowWithFloat rowWithFloat) =>
            {
                rowWithFloat.FloatLabel = rowWithKey.KeyLabel == 0 ? float.NaN : rowWithKey.KeyLabel - 1;
            };

            var split     = ML.Data.TrainTestSplit(dataView, 0.5);
            var trainData = split.TrainSet;
            var testData  = split.TestSet;

            var pipeline = ML.Transforms.Conversion.MapValueToKey("KeyLabel", "Label")
                           .Append(ML.Transforms.CustomMapping(actionConvertKeyToFloat, "KeyLabel"))
                           .Append(ML.Transforms.FeaturizeByFastForestRegression(options))
                           .Append(ML.Transforms.Concatenate("CombinedFeatures", "Trees", "Leaves", "Paths"))
                           .Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("KeyLabel", "CombinedFeatures"));

            var model      = pipeline.Fit(trainData);
            var prediction = model.Transform(testData);
            var metrics    = ML.MulticlassClassification.Evaluate(prediction, labelColumnName: "KeyLabel");

            Assert.True(metrics.MacroAccuracy > 0.6);
            Assert.True(metrics.MicroAccuracy > 0.6);
        }
Example #6
        private static EstimatorChain<RegressionPredictionTransformer<FastForestRegressionModelParameters>> BuildPipeline()
        {
            var options = new FastForestRegressionTrainer.Options()
            {
                NumberOfTrees = 500,
                MinimumExampleCountPerLeaf = 100
            };

            // Data process configuration with pipeline data transformations
            var trainingPipeline =
                mlContext.Transforms.Categorical
                .OneHotEncoding("TeamEncoded", nameof(Match.Team))
                .Append(mlContext.Transforms.Categorical.OneHotEncoding(
                            "MatchDateEncoded", nameof(Match.MatchDate)))
                .Append(mlContext.Transforms.Categorical.OneHotEncoding(
                            "OpponentEncoded", nameof(Match.Opponent)))
                .Append(mlContext.Transforms.Concatenate(
                            "Features",
                            "IsHome", "TeamEncoded", "OpponentEncoded", "MatchDateEncoded"))
                .Append(mlContext.Regression.Trainers.FastForest(options));

            return trainingPipeline;
        }
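The `BuildPipeline` example above refers to a `Match` input class (via `nameof`) and to an `IsHome` column and a label column that are not part of this listing. The sketch below shows what such a class might look like; the property types and the `Goals` label are assumptions for illustration only.

        // Hypothetical input row for the pipeline above. Team, MatchDate, Opponent and IsHome
        // are implied by the pipeline; the Goals label column is assumed.
        public class Match
        {
            public float IsHome { get; set; }
            public string Team { get; set; }
            public string MatchDate { get; set; }
            public string Opponent { get; set; }

            [Microsoft.ML.Data.ColumnName("Label")]
            public float Goals { get; set; }
        }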
Example #7
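        // Same tree-featurization pipeline as Example #3, but without the save/load round trip:
        // only the in-memory metrics are checked.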
        public void TestFastForestRegressionFeaturizationInPipeline()
        {
            int dataPointCount = 200;
            var data           = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList();
            var dataView       = ML.Data.LoadFromEnumerable(data);

            dataView = ML.Data.Cache(dataView);

            var trainerOptions = new FastForestRegressionTrainer.Options
            {
                NumberOfThreads            = 1,
                NumberOfTrees              = 10,
                NumberOfLeaves             = 4,
                MinimumExampleCountPerLeaf = 10,
                FeatureColumnName          = "Features",
                LabelColumnName            = "Label"
            };

            var options = new FastForestRegressionFeaturizationEstimator.Options()
            {
                InputColumnName  = "Features",
                TreesColumnName  = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName  = "Paths",
                TrainerOptions   = trainerOptions
            };

            var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options)
                           .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                           .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));
            var model      = pipeline.Fit(dataView);
            var prediction = model.Transform(dataView);
            var metrics    = ML.Regression.Evaluate(prediction);

            Assert.True(metrics.MeanAbsoluteError < 0.25);
            Assert.True(metrics.MeanSquaredError < 0.1);
        }
Example #8
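        // Applies the tree featurizer to a copied feature column, round-trips the model through a file,
        // and then retrains on bin-normalized copies to confirm that the metrics change.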
        public void TestSaveAndLoadDoubleTreeFeaturizer()
        {
            int dataPointCount = 200;
            var data           = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList();
            var dataView       = ML.Data.LoadFromEnumerable(data);

            dataView = ML.Data.Cache(dataView);

            var trainerOptions = new FastForestRegressionTrainer.Options
            {
                NumberOfThreads            = 1,
                NumberOfTrees              = 10,
                NumberOfLeaves             = 4,
                MinimumExampleCountPerLeaf = 10,
                FeatureColumnName          = "Features",
                LabelColumnName            = "Label"
            };

            // Trains tree featurization on "Features" and applies on "CopiedFeatures".
            var options = new FastForestRegressionFeaturizationEstimator.Options()
            {
                InputColumnName  = "CopiedFeatures",
                TrainerOptions   = trainerOptions,
                TreesColumnName  = "OhMyTrees",
                LeavesColumnName = "OhMyLeaves",
                PathsColumnName  = "OhMyPaths"
            };

            var pipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features")
                           .Append(ML.Transforms.FeaturizeByFastForestRegression(options))
                           .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths"))
                           .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));
            var model      = pipeline.Fit(dataView);
            var prediction = model.Transform(dataView);
            var metrics    = ML.Regression.Evaluate(prediction);

            Assert.True(metrics.MeanAbsoluteError < 0.25);
            Assert.True(metrics.MeanSquaredError < 0.1);

            // Save the trained model into file and then load it back.
            ITransformer loadedModel = null;
            var          tempPath    = Path.GetTempFileName();

            using (var file = new SimpleFileHandle(Env, tempPath, true, true))
            {
                using (var fs = file.CreateWriteStream())
                    ML.Model.Save(model, null, fs);

                using (var fs = file.OpenReadStream())
                    loadedModel = ML.Model.Load(fs, out var schema);
            }

            // Compute prediction using the loaded model.
            var loadedPrediction = loadedModel.Transform(dataView);
            var loadedMetrics    = ML.Regression.Evaluate(loadedPrediction);

            // Check if the loaded model produces the same result as the trained model.
            Assert.Equal(metrics.MeanAbsoluteError, loadedMetrics.MeanAbsoluteError);
            Assert.Equal(metrics.MeanSquaredError, loadedMetrics.MeanSquaredError);

            var secondPipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features")
                                 .Append(ML.Transforms.NormalizeBinning("CopiedFeatures"))
                                 .Append(ML.Transforms.FeaturizeByFastForestRegression(options))
                                 .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths"))
                                 .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));
            var secondModel      = secondPipeline.Fit(dataView);
            var secondPrediction = secondModel.Transform(dataView);
            var secondMetrics    = ML.Regression.Evaluate(secondPrediction);

            // The second pipeline trains a tree featurizer on a bin-based normalized feature, so the second pipeline
            // is different from the first pipeline.
            Assert.NotEqual(metrics.MeanAbsoluteError, secondMetrics.MeanAbsoluteError);
            Assert.NotEqual(metrics.MeanSquaredError, secondMetrics.MeanSquaredError);
        }
Example #9
        internal FastForestRegressionFeaturizationEstimator(IHostEnvironment env, Options options)
            : base(env, options)
        {
            _trainerOptions = options.TrainerOptions;
        }
Example #10
        // This example requires installation of additional NuGet package
        // <a href="https://www.nuget.org/packages/Microsoft.ML.FastTree/">Microsoft.ML.FastTree</a>.
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // Setting the seed to a fixed number in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Create a list of training data points.
            var dataPoints = GenerateRandomDataPoints(1000);

            // Convert the list of data points to an IDataView object, which is consumable by ML.NET API.
            var trainingData = mlContext.Data.LoadFromEnumerable(dataPoints);

            // Define trainer options.
            var options = new FastForestRegressionTrainer.Options
            {
                LabelColumnName   = nameof(DataPoint.Label),
                FeatureColumnName = nameof(DataPoint.Features),
                // Only use 80% of features to reduce over-fitting.
                FeatureFraction = 0.8,
                // Create a simpler model by penalizing usage of new features.
                FeatureFirstUsePenalty = 0.1,
                // Reduce the number of trees to 50.
                NumberOfTrees = 50
            };

            // Define the trainer.
            var pipeline = mlContext.Regression.Trainers.FastForest(options);

            // Train the model.
            var model = pipeline.Fit(trainingData);

            // Create testing data. Use different random seed to make it different from training data.
            var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(5, seed: 123));

            // Run the model on test data set.
            var transformedTestData = model.Transform(testData);

            // Convert IDataView object to a list.
            var predictions = mlContext.Data.CreateEnumerable<Prediction>(transformedTestData, reuseRowObject: false).ToList();

            // Look at 5 predictions for the Label, side by side with the actual Label for comparison.
            foreach (var p in predictions)
            {
                Console.WriteLine($"Label: {p.Label:F3}, Prediction: {p.Score:F3}");
            }

            // Expected output:
            //   Label: 0.985, Prediction: 0.866
            //   Label: 0.155, Prediction: 0.171
            //   Label: 0.515, Prediction: 0.470
            //   Label: 0.566, Prediction: 0.476
            //   Label: 0.096, Prediction: 0.140

            // Evaluate the overall metrics
            var metrics = mlContext.Regression.Evaluate(transformedTestData);

            PrintMetrics(metrics);

            // Expected output:
            //   Mean Absolute Error: 0.06
            //   Mean Squared Error: 0.01
            //   Root Mean Squared Error: 0.07
            //   RSquared: 0.95 (closer to 1 is better. The worst case is 0)
        }
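The `CreateEnumerable<Prediction>` call above relies on an output class defined elsewhere in the sample; a sketch of its assumed shape:

        // Assumed output row for CreateEnumerable<Prediction> above.
        private class Prediction
        {
            // Original, ground-truth regression target.
            public float Label { get; set; }
            // Value predicted by the FastForest model.
            public float Score { get; set; }
        }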
Example #11
        // This example requires installation of additional NuGet package
        // <a href="https://www.nuget.org/packages/Microsoft.ML.FastTree/">Microsoft.ML.FastTree</a>.
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for
            // exception tracking and logging, as a catalog of available operations
            // and as the source of randomness. Setting the seed to a fixed number
            // in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Create a list of training data points.
            var dataPoints = GenerateRandomDataPoints(100).ToList();

            // Convert the list of data points to an IDataView object, which is
            // consumable by ML.NET API.
            var dataView = mlContext.Data.LoadFromEnumerable(dataPoints);

            // ML.NET doesn't cache data set by default. Therefore, if one reads a
            // data set from a file and accesses it many times, it can be slow due
            // to expensive featurization and disk operations. When the considered
            // data can fit into memory, a solution is to cache the data in memory.
            // Caching is especially helpful when working with iterative algorithms
            // which needs many data passes.
            dataView = mlContext.Data.Cache(dataView);

            // Define input and output columns of tree-based featurizer.
            string labelColumnName   = nameof(DataPoint.Label);
            string featureColumnName = nameof(DataPoint.Features);
            string treesColumnName   = nameof(TransformedDataPoint.Trees);
            string leavesColumnName  = nameof(TransformedDataPoint.Leaves);
            string pathsColumnName   = nameof(TransformedDataPoint.Paths);

            // Define the configuration of the trainer used to train a tree-based
            // model.
            var trainerOptions = new FastForestRegressionTrainer.Options
            {
                // Only use 80% of features to reduce over-fitting.
                FeatureFraction = 0.8,
                // Create a simpler model by penalizing usage of new features.
                FeatureFirstUsePenalty = 0.1,
                // Reduce the number of trees to 3.
                NumberOfTrees = 3,
                // Number of leaves per tree.
                NumberOfLeaves    = 6,
                LabelColumnName   = labelColumnName,
                FeatureColumnName = featureColumnName
            };

            // Define the tree-based featurizer's configuration.
            var options = new FastForestRegressionFeaturizationEstimator.Options
            {
                InputColumnName  = featureColumnName,
                TreesColumnName  = treesColumnName,
                LeavesColumnName = leavesColumnName,
                PathsColumnName  = pathsColumnName,
                TrainerOptions   = trainerOptions
            };

            // Define the featurizer.
            var pipeline = mlContext.Transforms.FeaturizeByFastForestRegression(
                options);

            // Train the model.
            var model = pipeline.Fit(dataView);

            // Apply the trained featurizer to the same data set to produce the
            // tree-based feature columns.
            var transformed = model.Transform(dataView);

            // Convert IDataView object to a list. Each element in the resulted list
            // corresponds to a row in the IDataView.
            var transformedDataPoints = mlContext.Data.CreateEnumerable<
                TransformedDataPoint>(transformed, false).ToList();

            // Print out the transformation of the first 3 data points.
            for (int i = 0; i < 3; ++i)
            {
                var dataPoint            = dataPoints[i];
                var transformedDataPoint = transformedDataPoints[i];
                Console.WriteLine("The original feature vector [" + String.Join(",",
                                                                                dataPoint.Features) + "] is transformed to three different " +
                                  "tree-based feature vectors:");

                Console.WriteLine("  Trees' output values: [" + String.Join(",",
                                                                            transformedDataPoint.Trees) + "].");

                Console.WriteLine("  Leave IDs' 0-1 representation: [" + String
                                  .Join(",", transformedDataPoint.Leaves) + "].");

                Console.WriteLine("  Paths IDs' 0-1 representation: [" + String
                                  .Join(",", transformedDataPoint.Paths) + "].");
            }

            // Expected output:
            //   The original feature vector [1.543569, 1.494266, 1.284405] is
            //   transformed to three different tree-based feature vectors:
            //     Trees' output values: [0.7291142,0.7825329,0.8764582].
            //     Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0].
            //     Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1].
            //   The original feature vector [0.764918, 1.11206, 0.648211] is
            //   transformed to three different tree-based feature vectors:
            //     Trees' output values: [0.3802337,0.584159,0.5648927].
            //     Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0].
            //     Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,1,0,0].
            //   The original feature vector [1.251254, 1.269456, 1.444864] is
            //   transformed to three different tree-based feature vectors:
            //     Trees' output values: [0.7591804,0.7825329,0.7443035].
            //     Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1].
            //     Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1].
        }
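The `Trees`, `Leaves` and `Paths` column names above come from a `TransformedDataPoint` class defined elsewhere in the sample. Below is a sketch of its assumed shape, where the three vectors hold the per-tree output values, the one-hot leaf indicators and the one-hot path indicators produced by the featurizer; deriving from `DataPoint` is also an assumption.

        // Assumed output row for the tree featurizer above; vector lengths depend on
        // NumberOfTrees and NumberOfLeaves in the trainer options.
        private class TransformedDataPoint : DataPoint
        {
            public float[] Trees { get; set; }
            public float[] Leaves { get; set; }
            public float[] Paths { get; set; }
        }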