// Checks that the fast forest binary featurizer works as one stage of a longer pipeline:
// its three outputs (trees, leaves, paths) are concatenated with the raw features and the
// combined vector is handed to an SDCA logistic regression trainer. The fitted model is
// scored on the same data it was trained on and the metrics are compared to fixed bounds.
public void TestFastForestBinaryFeaturizationInPipeline()
{
    const string labelColumn = "Label";
    const string featureColumn = "Features";
    const string treesColumn = "Trees";
    const string leavesColumn = "Leaves";
    const string pathsColumn = "Paths";
    const string combinedColumn = "CombinedFeatures";

    // Build the in-memory data set and wrap it in a cached data view.
    int sampleCount = 200;
    var samples = SamplesUtils.DatasetUtils
        .GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(sampleCount)
        .ToList();
    var trainingData = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

    // Featurizer configuration, including the settings of the forest trainer it wraps.
    var featurizerOptions = new FastForestBinaryFeaturizationEstimator.Options()
    {
        InputColumnName = featureColumn,
        TreesColumnName = treesColumn,
        LeavesColumnName = leavesColumn,
        PathsColumnName = pathsColumn,
        TrainerOptions = new FastForestBinaryTrainer.Options
        {
            NumberOfThreads = 1,
            NumberOfTrees = 10,
            NumberOfLeaves = 4,
            MinimumExampleCountPerLeaf = 10,
            FeatureColumnName = featureColumn,
            LabelColumnName = labelColumn
        }
    };

    // The stages are created and appended in the same order as a single fluent chain would
    // evaluate them: featurizer, concatenation, first append, trainer, second append.
    var featurizer = ML.Transforms.FeaturizeByFastForestBinary(featurizerOptions);
    var featurizerWithCombinedColumn = featurizer.Append(
        ML.Transforms.Concatenate(combinedColumn, featureColumn, treesColumn, leavesColumn, pathsColumn));
    var pipeline = featurizerWithCombinedColumn.Append(
        ML.BinaryClassification.Trainers.SdcaLogisticRegression(labelColumn, combinedColumn));

    // Fit, score the training data, and evaluate.
    var fittedModel = pipeline.Fit(trainingData);
    var scored = fittedModel.Transform(trainingData);
    var metrics = ML.BinaryClassification.Evaluate(scored);

    Assert.True(metrics.Accuracy > 0.97);
    Assert.True(metrics.LogLoss < 0.07);
    Assert.True(metrics.AreaUnderPrecisionRecallCurve > 0.98);
}
// This example requires installation of additional NuGet package
// <a href="https://www.nuget.org/packages/Microsoft.ML.FastTree/">Microsoft.ML.FastTree</a>.
public static void Example()
{
    // The MLContext is the starting point for ML.NET operations: it is used
    // for exception tracking and logging, exposes the catalog of available
    // operations, and is the source of randomness. A fixed seed is passed so
    // that the output of this example is deterministic.
    var mlContext = new MLContext(seed: 0);

    // Generate the data points that will be transformed.
    var dataPoints = GenerateRandomDataPoints(100).ToList();

    // Wrap the list in an IDataView, the data type consumed by the ML.NET
    // API.
    var dataView = mlContext.Data.LoadFromEnumerable(dataPoints);

    // ML.NET does not cache data sets by default, so a data set that is read
    // from a file and accessed many times can be slow because of expensive
    // featurization and disk operations. When the data fits into memory,
    // caching it there is a solution; this is especially helpful for
    // iterative algorithms that need many passes over the data.
    dataView = mlContext.Data.Cache(dataView);

    // Names of the input and output columns of the tree-based featurizer.
    const string labelColumn = nameof(DataPoint.Label);
    const string featureColumn = nameof(DataPoint.Features);
    const string treesColumn = nameof(TransformedDataPoint.Trees);
    const string leavesColumn = nameof(TransformedDataPoint.Leaves);
    const string pathsColumn = nameof(TransformedDataPoint.Paths);

    // Configuration of the tree-based featurizer, together with the
    // configuration of the trainer used to train the tree-based model.
    var featurizerOptions = new FastForestBinaryFeaturizationEstimator.Options
    {
        InputColumnName = featureColumn,
        TreesColumnName = treesColumn,
        LeavesColumnName = leavesColumn,
        PathsColumnName = pathsColumn,
        TrainerOptions = new FastForestBinaryTrainer.Options
        {
            // Create a simpler model by penalizing usage of new features.
            FeatureFirstUsePenalty = 0.1,
            // Reduce the number of trees to 3.
            NumberOfTrees = 3,
            // Number of leaves per tree.
            NumberOfLeaves = 6,
            // Feature column name.
            FeatureColumnName = featureColumn,
            // Label column name.
            LabelColumnName = labelColumn
        }
    };

    // Define the featurizer, train it, and apply the trained transformer
    // to the same data set.
    var featurizer = mlContext.Transforms.FeaturizeByFastForestBinary(featurizerOptions);
    var trainedFeaturizer = featurizer.Fit(dataView);
    var transformed = trainedFeaturizer.Transform(dataView);

    // Turn the IDataView back into a list; each element of the list
    // corresponds to one row of the IDataView.
    var transformedDataPoints = mlContext.Data
        .CreateEnumerable<TransformedDataPoint>(transformed, false)
        .ToList();

    // Print the transformation of the first 3 data points.
    for (int row = 0; row < 3; ++row)
    {
        var original = dataPoints[row];
        var featurized = transformedDataPoints[row];

        string rawFeatures = string.Join(",", original.Features);
        Console.WriteLine($"The original feature vector [{rawFeatures}] is transformed to three different tree-based feature vectors:");

        string treeOutputs = string.Join(",", featurized.Trees);
        Console.WriteLine($" Trees' output values: [{treeOutputs}].");

        string leafIndicators = string.Join(",", featurized.Leaves);
        Console.WriteLine($" Leave IDs' 0-1 representation: [{leafIndicators}].");

        string pathIndicators = string.Join(",", featurized.Paths);
        Console.WriteLine($" Paths IDs' 0-1 representation: [{pathIndicators}].");
    }

    // Expected output:
    // The original feature vector [0.8173254,0.7680227,0.5581612] is
    // transformed to three different tree-based feature vectors:
    // Trees' output values: [0.1111111,0.8823529].
    // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0].
    // Paths IDs' 0-1 representation: [1,1,1,1,1,1,0,1,0].
    // The original feature vector [0.5888848,0.9360271,0.4721779] is
    // transformed to three different tree-based feature vectors:
    // Trees' output values: [0.4545455,0.8].
    // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1].
    // Paths IDs' 0-1 representation: [1,1,1,1,0,1,0,1,1].
    // The original feature vector [0.2737045,0.2919063,0.4673147] is
    // transformed to three different tree-based feature vectors:
    // Trees' output values: [0.4545455,0.1111111].
    // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0].
    // Paths IDs' 0-1 representation: [1,1,1,1,0,1,0,1,1].
}
/// <summary>
/// Create <see cref="FastForestBinaryFeaturizationEstimator"/>, which uses <see cref="FastForestBinaryTrainer"/>
/// to train <see cref="TreeEnsembleModelParameters"/> to create tree-based features.
/// </summary>
/// <param name="catalog">The context <see cref="TransformsCatalog"/> to create
/// <see cref="FastForestBinaryFeaturizationEstimator"/>. It is validated with <c>Contracts.CheckValue</c>
/// and supplies the environment the estimator is constructed with.</param>
/// <param name="options">The options to configure <see cref="FastForestBinaryFeaturizationEstimator"/>.
/// See <see cref="FastForestBinaryFeaturizationEstimator.Options"/> and
/// <see cref="TreeEnsembleFeaturizationEstimatorBase.OptionsBase"/> for available settings.
/// The value is passed to the estimator constructor unchanged.</param>
/// <returns>A new <see cref="FastForestBinaryFeaturizationEstimator"/> built from the catalog's environment
/// and <paramref name="options"/>.</returns>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FeaturizeByFastForestBinary](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs)]
/// ]]>
/// </format>
/// </example>
public static FastForestBinaryFeaturizationEstimator FeaturizeByFastForestBinary(
    this TransformsCatalog catalog,
    FastForestBinaryFeaturizationEstimator.Options options)
{
    // The catalog must be validated before its environment is requested.
    Contracts.CheckValue(catalog, nameof(catalog));

    return new FastForestBinaryFeaturizationEstimator(CatalogUtils.GetEnvironment(catalog), options);
}