public static void FeatureContributionCalculationTransform_Regression()
        {
            // Downloading the Housing regression dataset from github.com/dotnet/machinelearning.
            // This will create a housing.txt file in the filesystem.
            // You can open this file, if you want to see the data.
            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            var mlContext = new MLContext();

            // Step 1: Read the data as an IDataView.
            // First, we define the reader: specify the data columns and where to find them in the text file.
            var reader = mlContext.Data.CreateTextReader(
                columns: new[]
                {
                    new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
                    new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
                    new TextLoader.Column("PercentResidential", DataKind.R4, 2),
                    new TextLoader.Column("PercentNonRetail", DataKind.R4, 3),
                    new TextLoader.Column("CharlesRiver", DataKind.R4, 4),
                    new TextLoader.Column("NitricOxides", DataKind.R4, 5),
                    new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6),
                    new TextLoader.Column("PercentPre40s", DataKind.R4, 7),
                    new TextLoader.Column("EmploymentDistance", DataKind.R4, 8),
                    new TextLoader.Column("HighwayDistance", DataKind.R4, 9),
                    new TextLoader.Column("TaxRate", DataKind.R4, 10),
                    new TextLoader.Column("TeacherRatio", DataKind.R4, 11),
                },
                hasHeader: true);

            // Read the data
            var data = reader.Read(dataFile);

            // Step 2: Pipeline.
            // Concatenate the features to create a "Features" vector.
            // Then append a linear model, setting the "MedianHomeValue" column as the label of the dataset
            // and the "Features" column produced by the concatenation as the features column.
            var transformPipeline = mlContext.Transforms.Concatenate("Features",
                "CrimesPerCapita", "PercentResidential", "PercentNonRetail", "CharlesRiver",
                "NitricOxides", "RoomsPerDwelling", "PercentPre40s", "EmploymentDistance",
                "HighwayDistance", "TaxRate", "TeacherRatio");
            var learner = mlContext.Regression.Trainers.StochasticDualCoordinateAscent(
                labelColumn: "MedianHomeValue", featureColumn: "Features");

            var transformedData = transformPipeline.Fit(data).Transform(data);

            var model = learner.Fit(transformedData);

            // Create a Feature Contribution Calculator that computes the contributions
            // of all 11 features, without normalizing the contribution scores.
            var args = new FeatureContributionCalculationTransform.Arguments()
            {
                Top       = 11,
                Normalize = false
            };
            var featureContributionCalculator = FeatureContributionCalculationTransform.Create(mlContext, args, transformedData, model.Model, model.FeatureColumn);

            // Let's extract the weights from the linear model to use as a comparison
            var weights = new VBuffer<float>();

            model.Model.GetFeatureWeights(ref weights);

            // Let's now walk through the first ten records and see which feature drove each value the most.
            // Get prediction scores and contributions.
            var scoringEnumerator = featureContributionCalculator.AsEnumerable<HousingRegressionScoreAndContribution>(mlContext, true).GetEnumerator();
            int index = 0;

            Console.WriteLine("Label\tScore\tBiggestFeature\tValue\tWeight\tContribution\tPercent");
            while (scoringEnumerator.MoveNext() && index < 10)
            {
                var row = scoringEnumerator.Current;

                // Get the index of the feature with the biggest contribution.
                var featureOfInterest = GetMostContributingFeature(row.FeatureContributions);

                // And the corresponding information about the feature.
                var value = row.Features[featureOfInterest];
                var contribution = row.FeatureContributions[featureOfInterest];
                var percentContribution = 100 * contribution / row.Score;
                // Offset the schema index by one, since the label is the first column.
                var name = data.Schema[(int)(featureOfInterest + 1)].Name;
                var weight = weights.GetValues()[featureOfInterest];

                Console.WriteLine("{0:0.00}\t{1:0.00}\t{2}\t{3:0.00}\t{4:0.00}\t{5:0.00}\t{6:0.00}",
                                  row.MedianHomeValue,
                                  row.Score,
                                  name,
                                  value,
                                  weight,
                                  contribution,
                                  percentContribution
                                  );

                index++;
            }

            // For bulk scoring, the ApplyToData API can also be used
            var scoredData = featureContributionCalculator.ApplyToData(mlContext, transformedData);
            var preview    = scoredData.Preview(100);
        }
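        // The sample above references a HousingRegressionScoreAndContribution class and a
        // GetMostContributingFeature helper that are not shown in the snippet. Below is a
        // minimal sketch of what they could look like; the exact definitions in the original
        // sample may differ (the VectorType sizes assume the 11 concatenated features).
        private sealed class HousingRegressionScoreAndContribution
        {
            public float MedianHomeValue { get; set; }

            [VectorType(11)]
            public float[] Features { get; set; }

            public float Score { get; set; }

            [VectorType(11)]
            public float[] FeatureContributions { get; set; }
        }

        private static int GetMostContributingFeature(float[] featureContributions)
        {
            // Return the index of the contribution with the largest absolute value.
            int index = 0;
            var currentValue = float.NegativeInfinity;
            for (int i = 0; i < featureContributions.Length; i++)
            {
                if (Math.Abs(featureContributions[i]) > currentValue)
                {
                    currentValue = Math.Abs(featureContributions[i]);
                    index = i;
                }
            }
            return index;
        }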
Example #2
        public void TestFeatureImportance()
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray = new float[numberOfInstances],
            x1Array     = new float[numberOfInstances],
            x2Array     = new float[numberOfInstances],
            x3Array     = new float[numberOfInstances],
            x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);
                // Note that the label does not depend on x4Rand.
                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2Important", NumberType.Float, x2Array);
            bldr.AddColumn("X3", NumberType.Float, x3Array);
            bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .AppendCacheCheckpoint(ML)
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = ML.Regression.Trainers.OrdinaryLeastSquares().Fit(data);
            var args  = new FeatureContributionCalculationTransform.Arguments()
            {
                Bottom = 10,
                Top    = 10
            };
            var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

            // Get prediction scores and contributions
            var enumerator = output.AsEnumerable<ScoreAndContribution>(Env, true).GetEnumerator();
            ScoreAndContribution row = null;
            var expectedValues = new List<float[]>();

            expectedValues.Add(new float[4] {
                0.06319684F, 1, 0.1386623F, 4.46209469E-06F
            });
            expectedValues.Add(new float[4] {
                0.03841561F, 1, 0.1633037F, 2.68303256E-06F
            });
            expectedValues.Add(new float[4] {
                0.12006103F, 1, 0.254072F, 1.18671605E-05F
            });
            expectedValues.Add(new float[4] {
                0.20861618F, 0.99999994F, 0.407312155F, 6.963478E-05F
            });
            expectedValues.Add(new float[4] {
                0.024050576F, 0.99999994F, 0.31106182F, 8.456762E-06F
            });
            int index = 0;

            while (enumerator.MoveNext() && index < expectedValues.Count)
            {
                row = enumerator.Current;
                // Compare to a precision of 6 digits, because single-precision floats carry only about 7 significant digits.
                Assert.Equal(expectedValues[index][0], row.FeatureContributions[0], 6);
                Assert.Equal(expectedValues[index][1], row.FeatureContributions[1], 6);
                Assert.Equal(expectedValues[index][2], row.FeatureContributions[2], 6);
                Assert.Equal(expectedValues[index++][3], row.FeatureContributions[3], 6);
            }

            Done();
        }
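        // The tests in this file enumerate rows as a ScoreAndContribution class that is not
        // shown in the snippet. A minimal sketch, assuming one property per pipeline column
        // and the four concatenated features:
        private sealed class ScoreAndContribution
        {
            public float X1 { get; set; }
            public float X2Important { get; set; }
            public float X3 { get; set; }
            public float X4Rand { get; set; }
            public float Score { get; set; }

            [VectorType(4)]
            public float[] FeatureContributions { get; set; }
        }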
Example #3
        /// <summary>
        /// Features: x1, x2, x3, xRand; y = 10*x1 + 20*x2 + 5.5*x3 + e. xRand is random, and the label y does not depend on it.
        /// The test verifies that feature contribution scores are output along with a score for the predicted data.
        /// </summary>
        private void TestFeatureContribution(
            ITrainerEstimator<ISingleFeaturePredictionTransformer<IPredictor>, IPredictor> trainer,
            List<float[]> expectedValues,
            int precision = 6)
        {
            // Setup synthetic dataset.
            const int numInstances = 1000;
            const int numFeatures  = 4;

            var rand = new Random(10);

            float[]   yArray       = new float[numInstances];
            float[][] xArray       = new float[numFeatures][];
            int[]     xRangeArray  = new[] { 1000, 10000, 5000, 1000 };
            float[]   xWeightArray = new[] {
                10,
                20, // Most important feature with high weight. Should have the highest contribution.
                5.5f,
                0,  // Least important feature. Should have the least contribution.
            };

            for (var instanceIndex = 0; instanceIndex < numInstances; instanceIndex++)
            {
                for (int featureIndex = 0; featureIndex < numFeatures; featureIndex++)
                {
                    if (xArray[featureIndex] == null)
                    {
                        xArray[featureIndex] = new float[numInstances];
                    }
                    xArray[featureIndex][instanceIndex] = rand.Next(xRangeArray[featureIndex]);
                    yArray[instanceIndex] += xArray[featureIndex][instanceIndex] * xWeightArray[featureIndex];
                }

                var noise = rand.Next(50);
                yArray[instanceIndex] += noise;
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, xArray[0]);
            bldr.AddColumn("X2Important", NumberType.Float, xArray[1]);
            bldr.AddColumn("X3", NumberType.Float, xArray[2]);
            bldr.AddColumn("X4Rand", NumberType.Float, xArray[3]);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .AppendCacheCheckpoint(ML)
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = trainer.Fit(data);
            var args  = new FeatureContributionCalculationTransform.Arguments()
            {
                Bottom = 10,
                Top    = 10
            };
            var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

            var transformedOutput = output.AsEnumerable<ScoreAndContribution>(Env, true);
            int rowIndex = 0;

            foreach (var row in transformedOutput.Take(expectedValues.Count))
            {
                var expectedValue = expectedValues[rowIndex++];
                for (int i = 0; i < numFeatures; i++)
                {
                    Assert.Equal(expectedValue[i], row.FeatureContributions[i], precision);
                }
            }

            Done();
        }
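        // A hypothetical caller of the parameterized helper above: supply a trainer and the
        // per-row contribution baselines expected for that trainer's model. The method name
        // and the empty baseline list are placeholders, not values from the original test
        // suite, and the trainer may need adapting to the helper's generic signature.
        public void TestOlsFeatureContribution()
        {
            var expectedValues = new List<float[]>
            {
                // Expected contributions for X1, X2Important, X3, X4Rand would go here.
            };
            TestFeatureContribution(ML.Regression.Trainers.OrdinaryLeastSquares(), expectedValues);
        }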
        public void TestFeatureImportance()
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray = new float[numberOfInstances],
            x1Array     = new float[numberOfInstances],
            x2Array     = new float[numberOfInstances],
            x3Array     = new float[numberOfInstances],
            x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);
                // Note that the label does not depend on x4Rand.
                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2Important", NumberType.Float, x2Array);
            bldr.AddColumn("X3", NumberType.Float, x3Array);
            bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
            var args  = new FeatureContributionCalculationTransform.Arguments()
            {
                Bottom = 10,
                Top    = 10
            };
            var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

            // Get prediction scores and contributions
            var enumerator = output.AsEnumerable<ScoreAndContribution>(Env, true).GetEnumerator();
            ScoreAndContribution row = null;
            var expectedValues = new List<float[]>();

            expectedValues.Add(new float[4] {
                0.15640761F, 1, 0.155862764F, 0.07276783F
            });
            expectedValues.Add(new float[4] {
                0.09507586F, 1, 0.1835608F, 0.0437548943F
            });
            expectedValues.Add(new float[4] {
                0.297142357F, 1, 0.2855884F, 0.193529665F
            });
            expectedValues.Add(new float[4] {
                0.45465675F, 0.8805887F, 0.4031663F, 1
            });
            expectedValues.Add(new float[4] {
                0.0595234372F, 0.99999994F, 0.349647522F, 0.137912869F
            });
            int index = 0;

            while (enumerator.MoveNext() && index < expectedValues.Count)
            {
                row = enumerator.Current;
                // Exact equality is intentional here; the baselines presumably come from a deterministic run.
                Assert.Equal(expectedValues[index][0], row.FeatureContributions[0]);
                Assert.Equal(expectedValues[index][1], row.FeatureContributions[1]);
                Assert.Equal(expectedValues[index][2], row.FeatureContributions[2]);
                Assert.Equal(expectedValues[index++][3], row.FeatureContributions[3]);
            }

            Done();
        }