Esempio n. 1
0
        /// <summary>
        /// Trains a linear model on the original features concatenated with the outputs of a
        /// pretrained tree featurizer, trains the same linear model on the original features only,
        /// and asserts that the former scores better on the training data.
        /// </summary>
        public void TreeEnsembleFeaturizingPipeline()
        {
            // Generate the samples and wrap them in a cached data view.
            const int sampleCount = 200;
            var samples = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(sampleCount).ToList();
            var trainingData = ML.Data.Cache(ML.Data.LoadFromEnumerable(samples));

            // Configure the tree model whose trees will back the featurizer.
            var treeOptions = new FastTreeBinaryTrainer.Options
            {
                NumberOfThreads = 1,
                NumberOfTrees = 10,
                NumberOfLeaves = 4,
                MinimumExampleCountPerLeaf = 10
            };
            var treeTrainer = ML.BinaryClassification.Trainers.FastTree(treeOptions);

            // Fit the tree model; its sub-model is handed to the featurizer below.
            var treeModel = treeTrainer.Fit(trainingData);
            var treeScored = treeModel.Transform(trainingData);

            // Featurizer configuration: reads "Features" and writes the three tree-based columns.
            var featurizerOptions = new PretrainedTreeFeaturizationEstimator.Options()
            {
                InputColumnName = "Features",
                TreesColumnName = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName = "Paths",
                ModelParameters = treeModel.Model.SubModel
            };

            // Featurize, concatenate the original and tree-based columns, then train a linear model on the result.
            var featurizer = ML.Transforms.FeaturizeByPretrainTreeEnsemble(featurizerOptions);
            var withCombinedFeatures = featurizer.Append(
                ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"));
            var treePipeline = withCombinedFeatures.Append(
                ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures"));

            var treeFeatureModel = treePipeline.Fit(trainingData);
            var treeFeatureMetrics = ML.BinaryClassification.Evaluate(treeFeatureModel.Transform(trainingData));

            // Baseline: the same linear trainer fed with the original features only.
            var baselineTrainer = ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "Features");
            var baselineModel = baselineTrainer.Fit(trainingData);
            var baselineMetrics = ML.BinaryClassification.Evaluate(baselineModel.Transform(trainingData));

            // The model that sees tree features is expected to beat the baseline on every metric checked.
            Assert.True(treeFeatureMetrics.Accuracy > baselineMetrics.Accuracy);
            Assert.True(treeFeatureMetrics.LogLoss < baselineMetrics.LogLoss);
            Assert.True(treeFeatureMetrics.AreaUnderPrecisionRecallCurve > baselineMetrics.AreaUnderPrecisionRecallCurve);
        }
Esempio n. 2
0
        /// <summary>
        /// Creates a <see cref="PretrainedTreeFeaturizationEstimator"/>, which produces tree-based features given a <see cref="TreeEnsembleModelParameters"/>.
        /// </summary>
        /// <param name="catalog">The <see cref="TransformsCatalog"/> context used to create the <see cref="PretrainedTreeFeaturizationEstimator"/>. Must not be null.</param>
        /// <param name="options">The options that configure the <see cref="PretrainedTreeFeaturizationEstimator"/>. See <see cref="PretrainedTreeFeaturizationEstimator.Options"/> and
        /// <see cref="TreeEnsembleFeaturizationEstimatorBase.OptionsBase"/> for available settings.</param>
        /// <returns>A new <see cref="PretrainedTreeFeaturizationEstimator"/> built from the catalog's environment and <paramref name="options"/>.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        /// [!code-csharp[FeaturizeByPretrainTreeEnsemble](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs)]
        /// ]]>
        /// </format>
        /// </example>
        public static PretrainedTreeFeaturizationEstimator FeaturizeByPretrainTreeEnsemble(this TransformsCatalog catalog,
                                                                                           PretrainedTreeFeaturizationEstimator.Options options)
        {
            // Validate the catalog before asking it for its environment.
            Contracts.CheckValue(catalog, nameof(catalog));

            return new PretrainedTreeFeaturizationEstimator(CatalogUtils.GetEnvironment(catalog), options);
        }
Esempio n. 3
0
        /// <summary>
        /// Trains a single-tree FastTree model, builds a pretrained tree featurizer from it, and checks
        /// that the featurizer's tree-value, leaf-indicator and path-indicator columns agree with the
        /// leaf, leaf value and path obtained directly from the trained model for every data point.
        /// </summary>
        public void TestPretrainedTreeFeaturizationEstimator()
        {
            // Create data set
            int dataPointCount = 20;
            var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList();
            var dataView = ML.Data.LoadFromEnumerable(data);

            dataView = ML.Data.Cache(dataView);

            // Define a tree model whose trees will be extracted to construct a tree featurizer.
            var trainer = ML.BinaryClassification.Trainers.FastTree(
                new FastTreeBinaryTrainer.Options
                {
                    NumberOfThreads = 1,
                    NumberOfTrees = 1,
                    NumberOfLeaves = 4,
                    MinimumExampleCountPerLeaf = 1
                });

            // Train the defined tree model.
            var model = trainer.Fit(dataView);
            var predicted = model.Transform(dataView);

            // From the trained tree model, a mapper of tree featurizer is created.
            string featureColumnName = "Features";
            string treesColumnName = "MyTrees";   // a tree-based feature column.
            string leavesColumnName = "MyLeaves"; // a tree-based feature column.
            string pathsColumnName = "MyPaths";   // a tree-based feature column.
            var options = new PretrainedTreeFeaturizationEstimator.Options()
            {
                InputColumnName = featureColumnName,
                ModelParameters = model.Model.SubModel,
                TreesColumnName = treesColumnName,
                LeavesColumnName = leavesColumnName,
                PathsColumnName = pathsColumnName
            };
            var treeFeaturizer = ML.Transforms.FeaturizeByPretrainTreeEnsemble(options).Fit(dataView);

            // Apply TreeEnsembleFeaturizer to the input data.
            var transformed = treeFeaturizer.Transform(dataView);

            // Extract the outputs of TreeEnsembleFeaturizer.
            var features = transformed.GetColumn<float[]>(featureColumnName).ToArray();
            var leafValues = transformed.GetColumn<float[]>(treesColumnName).ToArray();
            var leafIds = transformed.GetColumn<float[]>(leavesColumnName).ToArray();
            var paths = transformed.GetColumn<float[]>(pathsColumnName).ToArray();

            // Check if the TreeEnsembleFeaturizer produces the expected values.
            // The trainer above is configured with NumberOfTrees = 1, so only tree 0 is inspected.
            const int treeIndex = 0;
            List<int> path = null;

            for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex)
            {
                // Size the dense buffer from the row itself instead of hard-coding the feature dimension.
                var featureValues = features[dataPointIndex];
                var leafId = model.Model.SubModel.GetLeaf(treeIndex, new VBuffer<float>(featureValues.Length, featureValues), ref path);

                // Query the same tree that GetLeaf was evaluated on.
                var leafValue = model.Model.SubModel.GetLeafValue(treeIndex, leafId);

                // xUnit convention: expected value first, value under test second.
                Assert.Equal(leafValue, leafValues[dataPointIndex][treeIndex]);
                Assert.Equal(1.0, leafIds[dataPointIndex][leafId]);
                foreach (var nodeId in path)
                {
                    Assert.Equal(1.0, paths[dataPointIndex][nodeId]);
                }
            }
        }
        /// <summary>
        /// Sample: trains a single-tree FastTree model, builds a pretrained tree featurizer from it,
        /// applies the featurizer to the training data and prints the tree-based features of the
        /// first three data points.
        /// </summary>
        public static void Example()
        {
            // Number of data points to generate and number of transformed rows to print.
            const int sampleCount = 200;
            const int previewCount = 3;

            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // The seed is fixed in this example to make outputs deterministic.
            var context = new MLContext(seed: 0);

            // Generate the training data points and expose them as an IDataView, which is consumable by ML.NET API.
            var samples = GenerateRandomDataPoints(sampleCount).ToList();
            var trainingData = context.Data.LoadFromEnumerable(samples);

            // Names of the input and output columns of the tree-based featurizer.
            string labelColumnName = nameof(DataPoint.Label);
            string featureColumnName = nameof(DataPoint.Features);
            string treesColumnName = nameof(TransformedDataPoint.Trees);
            string leavesColumnName = nameof(TransformedDataPoint.Leaves);
            string pathsColumnName = nameof(TransformedDataPoint.Paths);

            // Configure the tree model whose trees will be extracted to construct the tree featurizer.
            var treeOptions = new FastTreeBinaryTrainer.Options
            {
                NumberOfThreads = 1,
                NumberOfTrees = 1,
                NumberOfLeaves = 4,
                MinimumExampleCountPerLeaf = 1,
                FeatureColumnName = featureColumnName,
                LabelColumnName = labelColumnName
            };
            var treeTrainer = context.BinaryClassification.Trainers.FastTree(treeOptions);

            // Train the tree model.
            var treeModel = treeTrainer.Fit(trainingData);
            var treeScored = treeModel.Transform(trainingData);

            // Configure the tree-based featurizer around the pretrained tree model.
            var featurizerOptions = new PretrainedTreeFeaturizationEstimator.Options()
            {
                InputColumnName = featureColumnName,
                ModelParameters = treeModel.Model.SubModel,
                TreesColumnName = treesColumnName,
                LeavesColumnName = leavesColumnName,
                PathsColumnName = pathsColumnName
            };

            // Fit the featurizer. It doesn't perform actual training because a pretrained model is provided.
            var featurizerEstimator = context.Transforms.FeaturizeByPretrainTreeEnsemble(featurizerOptions);
            var featurizer = featurizerEstimator.Fit(trainingData);

            // Apply the featurizer to the input data.
            var featurized = featurizer.Transform(trainingData);

            // Materialize the result; each list element corresponds to one row of the IDataView.
            var featurizedSamples = context.Data
                .CreateEnumerable<TransformedDataPoint>(featurized, reuseRowObject: false)
                .ToList();

            // Print out the transformation of the first few data points.
            for (int row = 0; row < previewCount; ++row)
            {
                var original = samples[row];
                var featurizedRow = featurizedSamples[row];

                Console.WriteLine($"The original feature vector [{String.Join(",", original.Features)}] is transformed to three different tree-based feature vectors:");
                Console.WriteLine($"  Trees' output values: [{String.Join(",", featurizedRow.Trees)}].");
                Console.WriteLine($"  Leave IDs' 0-1 representation: [{String.Join(",", featurizedRow.Leaves)}].");
                Console.WriteLine($"  Paths IDs' 0-1 representation: [{String.Join(",", featurizedRow.Paths)}].");
            }

            // Expected output:
            //  The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors:
            //    Trees' output values: [0.4172185].
            //    Leave IDs' 0-1 representation: [1,0,0,0].
            //    Paths IDs' 0-1 representation: [1,1,1].
            //  The original feature vector [0.7588848,1.106027,0.6421779] is transformed to three different tree-based feature vectors:
            //    Trees' output values: [-1].
            //    Leave IDs' 0-1 representation: [0,0,1,0].
            //    Paths IDs' 0-1 representation: [1,1,0].
            //  The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors:
            //    Trees' output values: [0.4172185].
            //    Leave IDs' 0-1 representation: [1,0,0,0].
            //    Paths IDs' 0-1 representation: [1,1,1].
            //
            //   Note that the trained model contains only one tree.
            //
            //            Node 0
            //            /    \
            //           /    Leaf -2
            //         Node 1
            //         /    \
            //        /    Leaf -3
            //      Node 2
            //      /    \
            //     /    Leaf -4
            //   Leaf -1
            //
            //   Thus, if a data point reaches the leaf indexed by -1, its 0-1 path representation may be [1,1,1]
            //   because that data point went through all of Node 0, Node 1, and Node 2.
        }