public void TreeEnsembleFeaturizingPipeline()
{
    // Build a small, cached binary-classification data set.
    int dataPointCount = 200;
    var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList();
    var dataView = ML.Data.Cache(ML.Data.LoadFromEnumerable(data));

    // Tree model whose trees will be extracted to construct a tree featurizer.
    var treeTrainer = ML.BinaryClassification.Trainers.FastTree(
        new FastTreeBinaryTrainer.Options
        {
            NumberOfThreads = 1,
            NumberOfTrees = 10,
            NumberOfLeaves = 4,
            MinimumExampleCountPerLeaf = 10
        });

    // Train the tree model; its sub-model seeds the TreeEnsembleFeaturizationEstimator.
    var treeModel = treeTrainer.Fit(dataView);
    var predicted = treeModel.Transform(dataView);

    // Configure the pretrained-tree featurizer: it reads "Features" and emits
    // three tree-derived columns ("Trees", "Leaves", "Paths").
    var featurizerOptions = new PretrainedTreeFeaturizationEstimator.Options()
    {
        InputColumnName = "Features",
        TreesColumnName = "Trees",
        LeavesColumnName = "Leaves",
        PathsColumnName = "Paths",
        ModelParameters = treeModel.Model.SubModel
    };

    // Concatenate the original features with the tree-derived ones, then train a linear model on the union.
    var pipeline = ML.Transforms.FeaturizeByPretrainTreeEnsemble(featurizerOptions)
        .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
        .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures"));

    var augmentedModel = pipeline.Fit(dataView);
    var augmentedScores = augmentedModel.Transform(dataView);
    var metrics = ML.BinaryClassification.Evaluate(augmentedScores);

    // Baseline: the same linear trainer on the raw features alone.
    var baselineTrainer = ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "Features");
    var baselineModel = baselineTrainer.Fit(dataView);
    var baselineScores = baselineModel.Transform(dataView);
    var naiveMetrics = ML.BinaryClassification.Evaluate(baselineScores);

    // The linear model trained with tree features should strictly beat the baseline.
    Assert.True(metrics.Accuracy > naiveMetrics.Accuracy);
    Assert.True(metrics.LogLoss < naiveMetrics.LogLoss);
    Assert.True(metrics.AreaUnderPrecisionRecallCurve > naiveMetrics.AreaUnderPrecisionRecallCurve);
}
/// <summary>
/// Create <see cref="PretrainedTreeFeaturizationEstimator"/>, which produces tree-based features
/// given a <see cref="TreeEnsembleModelParameters"/>.
/// </summary>
/// <param name="catalog">The context <see cref="TransformsCatalog"/> to create <see cref="PretrainedTreeFeaturizationEstimator"/>.</param>
/// <param name="options">The options to configure <see cref="PretrainedTreeFeaturizationEstimator"/>. See <see cref="PretrainedTreeFeaturizationEstimator.Options"/> and
/// <see cref="TreeEnsembleFeaturizationEstimatorBase.OptionsBase"/> for available settings.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FeaturizeByPretrainTreeEnsemble](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs)]
/// ]]>
/// </format>
/// </example>
public static PretrainedTreeFeaturizationEstimator FeaturizeByPretrainTreeEnsemble(this TransformsCatalog catalog,
    PretrainedTreeFeaturizationEstimator.Options options)
{
    Contracts.CheckValue(catalog, nameof(catalog));
    return new PretrainedTreeFeaturizationEstimator(CatalogUtils.GetEnvironment(catalog), options);
}
public void TestPretrainedTreeFeaturizationEstimator()
{
    // Create a small cached binary-classification data set.
    int dataPointCount = 20;
    var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList();
    var dataView = ML.Data.LoadFromEnumerable(data);
    dataView = ML.Data.Cache(dataView);

    // Define a tree model whose trees will be extracted to construct a tree featurizer.
    var trainer = ML.BinaryClassification.Trainers.FastTree(
        new FastTreeBinaryTrainer.Options
        {
            NumberOfThreads = 1,
            NumberOfTrees = 1,
            NumberOfLeaves = 4,
            MinimumExampleCountPerLeaf = 1
        });

    // Train the defined tree model.
    var model = trainer.Fit(dataView);
    var predicted = model.Transform(dataView);

    // From the trained tree model, a mapper of tree featurizer is created.
    string featureColumnName = "Features";
    string treesColumnName = "MyTrees";   // A tree-based feature column.
    string leavesColumnName = "MyLeaves"; // A tree-based feature column.
    string pathsColumnName = "MyPaths";   // A tree-based feature column.
    var options = new PretrainedTreeFeaturizationEstimator.Options()
    {
        InputColumnName = featureColumnName,
        ModelParameters = model.Model.SubModel,
        TreesColumnName = treesColumnName,
        LeavesColumnName = leavesColumnName,
        PathsColumnName = pathsColumnName
    };
    var treeFeaturizer = ML.Transforms.FeaturizeByPretrainTreeEnsemble(options).Fit(dataView);

    // Apply TreeEnsembleFeaturizer to the input data.
    var transformed = treeFeaturizer.Transform(dataView);

    // Extract the outputs of TreeEnsembleFeaturizer.
    var features = transformed.GetColumn<float[]>(featureColumnName).ToArray();
    var leafValues = transformed.GetColumn<float[]>(treesColumnName).ToArray();
    var leafIds = transformed.GetColumn<float[]>(leavesColumnName).ToArray();
    var paths = transformed.GetColumn<float[]>(pathsColumnName).ToArray();

    // Check if the TreeEnsembleFeaturizer produces values matching the tree model's
    // own leaf/path computation for every data point.
    List<int> path = null;
    for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex)
    {
        int treeIndex = 0; // Only one tree was trained (NumberOfTrees = 1).

        // Use the actual feature-vector length rather than a hard-coded constant so the
        // check stays valid if the generated data's feature dimension ever changes.
        var featureVector = new VBuffer<float>(features[dataPointIndex].Length, features[dataPointIndex]);
        var leafId = model.Model.SubModel.GetLeaf(treeIndex, featureVector, ref path);

        // Read the leaf value from the same tree we routed through (was a literal 0,
        // which only coincidentally equals treeIndex).
        var leafValue = model.Model.SubModel.GetLeafValue(treeIndex, leafId);

        // The "Trees" column carries the reached leaf's value.
        Assert.Equal(leafValues[dataPointIndex][treeIndex], leafValue);
        // The "Leaves" column is a 0-1 indicator of the reached leaf.
        Assert.Equal(1.0, leafIds[dataPointIndex][leafId]);
        // The "Paths" column marks every internal node visited on the way to the leaf.
        foreach (var nodeId in path)
            Assert.Equal(1.0, paths[dataPointIndex][nodeId]);
    }
}
public static void Example()
{
    // Number of examples to generate.
    int dataPointCount = 200;

    // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
    // as a catalog of available operations and as the source of randomness.
    // A fixed seed keeps this example's output deterministic.
    var mlContext = new MLContext(seed: 0);

    // Generate training data and wrap it in an IDataView, which the ML.NET API consumes.
    var dataPoints = GenerateRandomDataPoints(dataPointCount).ToList();
    var dataView = mlContext.Data.LoadFromEnumerable(dataPoints);

    // Column names used by the tree-based featurizer, taken from the data-point classes.
    string labelColumnName = nameof(DataPoint.Label);
    string featureColumnName = nameof(DataPoint.Features);
    string treesColumnName = nameof(TransformedDataPoint.Trees);
    string leavesColumnName = nameof(TransformedDataPoint.Leaves);
    string pathsColumnName = nameof(TransformedDataPoint.Paths);

    // Define a tree model whose trees will be extracted to construct a tree featurizer.
    var trainer = mlContext.BinaryClassification.Trainers.FastTree(
        new FastTreeBinaryTrainer.Options
        {
            NumberOfThreads = 1,
            NumberOfTrees = 1,
            NumberOfLeaves = 4,
            MinimumExampleCountPerLeaf = 1,
            FeatureColumnName = featureColumnName,
            LabelColumnName = labelColumnName
        });

    // Train the tree model.
    var model = trainer.Fit(dataView);
    var predicted = model.Transform(dataView);

    // Configure the tree-based featurizer around the trained (pretrained) tree model.
    var options = new PretrainedTreeFeaturizationEstimator.Options()
    {
        InputColumnName = featureColumnName,
        ModelParameters = model.Model.SubModel, // Pretrained tree model.
        TreesColumnName = treesColumnName,
        LeavesColumnName = leavesColumnName,
        PathsColumnName = pathsColumnName
    };

    // Fit the featurizer. No actual training happens here because a pretrained model is supplied.
    var treeFeaturizer = mlContext.Transforms.FeaturizeByPretrainTreeEnsemble(options).Fit(dataView);

    // Apply TreeEnsembleFeaturizer to the input data and materialize the result,
    // one TransformedDataPoint per IDataView row.
    var transformed = treeFeaturizer.Transform(dataView);
    var transformedDataPoints = mlContext.Data.CreateEnumerable<TransformedDataPoint>(transformed, false).ToList();

    // Print the transformation of the first 3 data points.
    for (int i = 0; i < 3; ++i)
    {
        var dataPoint = dataPoints[i];
        var transformedDataPoint = transformedDataPoints[i];
        Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:");
        Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}].");
        Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}].");
        Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}].");
    }

    // Expected output:
    //   The original feature vector[0.8173254, 0.7680227, 0.5581612] is transformed to three different tree - based feature vectors:
    //     Trees' output values: [0.4172185].
    //     Leave IDs' 0-1 representation: [1,0,0,0].
    //     Paths IDs' 0-1 representation: [1,1,1].
    //   The original feature vector[0.7588848, 1.106027, 0.6421779] is transformed to three different tree - based feature vectors:
    //     Trees' output values: [-1].
    //     Leave IDs' 0-1 representation: [0,0,1,0].
    //     Paths IDs' 0-1 representation: [1,1,0].
    //   The original feature vector[0.2737045, 0.2919063, 0.4673147] is transformed to three different tree - based feature vectors:
    //     Trees' output values: [0.4172185].
    //     Leave IDs' 0-1 representation: [1,0,0,0].
    //     Paths IDs' 0-1 representation: [1,1,1].
    //
    // Note that the trained model contains only one tree:
    //
    //           Node 0
    //           /    \
    //          /   Leaf -2
    //       Node 1
    //       /    \
    //      /   Leaf -3
    //   Node 2
    //   /    \
    //  /   Leaf -4
    // Leaf -1
    //
    // Thus, if a data point reaches the leaf indexed by -1, its 0-1 path representation may be
    // [1,1,1] because that data point went through all of Node 0, Node 1, and Node 2.
}