public static void FeatureContributionCalculationTransform_Regression()
{
    // Downloading the dataset from github.com/dotnet/machinelearning.
    // This will create a sentiment.tsv file in the filesystem.
    // You can open this file, if you want to see the data.
    string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

    // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
    // as a catalog of available operations and as the source of randomness.
    var mlContext = new MLContext();

    // Step 1: Read the data as an IDataView.
    // First, we define the reader: specify the data columns and where to find them in the text file.
    // Column 0 is the label ("MedianHomeValue"); columns 1-11 are the features.
    var reader = mlContext.Data.CreateTextReader(
        columns: new[]
        {
            new TextLoader.Column("MedianHomeValue", DataKind.R4, 0),
            new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1),
            new TextLoader.Column("PercentResidental", DataKind.R4, 2),
            new TextLoader.Column("PercentNonRetail", DataKind.R4, 3),
            new TextLoader.Column("CharlesRiver", DataKind.R4, 4),
            new TextLoader.Column("NitricOxides", DataKind.R4, 5),
            new TextLoader.Column("RoomsPerDwelling", DataKind.R4, 6),
            new TextLoader.Column("PercentPre40s", DataKind.R4, 7),
            new TextLoader.Column("EmploymentDistance", DataKind.R4, 8),
            new TextLoader.Column("HighwayDistance", DataKind.R4, 9),
            new TextLoader.Column("TaxRate", DataKind.R4, 10),
            new TextLoader.Column("TeacherRatio", DataKind.R4, 11),
        },
        hasHeader: true
    );

    // Read the data
    var data = reader.Read(dataFile);

    // Step 2: Pipeline.
    // Concatenate the features to create a Feature vector.
    // Then append a linear model, setting the "MedianHomeValue" column as the label of the dataset,
    // the "Features" column produced by concatenation as the features column.
    var transformPipeline = mlContext.Transforms.Concatenate("Features", "CrimesPerCapita", "PercentResidental",
        "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", "PercentPre40s",
        "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio");
    var learner = mlContext.Regression.Trainers.StochasticDualCoordinateAscent(
        labelColumn: "MedianHomeValue", featureColumn: "Features");
    var transformedData = transformPipeline.Fit(data).Transform(data);
    var model = learner.Fit(transformedData);

    // Create a Feature Contribution Calculator:
    // calculate the feature contributions for all 11 features,
    // and don't normalize the contribution scores.
    var args = new FeatureContributionCalculationTransform.Arguments()
    {
        Top = 11,
        Normalize = false
    };
    var featureContributionCalculator = FeatureContributionCalculationTransform.Create(mlContext, args, transformedData, model.Model, model.FeatureColumn);

    // Let's extract the weights from the linear model to use as a comparison
    var weights = new VBuffer<float>();
    model.Model.GetFeatureWeights(ref weights);

    // Let's now walk through the first ten records and see which feature drove the values the most.
    // Get prediction scores and contributions.
    var scoringEnumerator = featureContributionCalculator.AsEnumerable<HousingRegressionScoreAndContribution>(mlContext, true).GetEnumerator();
    int index = 0;
    Console.WriteLine("Label\tScore\tBiggestFeature\tValue\tWeight\tContribution\tPercent");
    while (scoringEnumerator.MoveNext() && index < 10)
    {
        var row = scoringEnumerator.Current;

        // Get the feature index with the biggest contribution.
        var featureOfInterest = GetMostContributingFeature(row.FeatureContributions);

        // And the corresponding information about the feature.
        var value = row.Features[featureOfInterest];
        var contribution = row.FeatureContributions[featureOfInterest];
        var percentContribution = 100 * contribution / row.Score;
        // +1 skips schema column 0 (the "MedianHomeValue" label) so the
        // contribution index lines up with the feature columns.
        var name = data.Schema[(int)(featureOfInterest + 1)].Name;
        var weight = weights.GetValues()[featureOfInterest];

        Console.WriteLine("{0:0.00}\t{1:0.00}\t{2}\t{3:0.00}\t{4:0.00}\t{5:0.00}\t{6:0.00}",
            row.MedianHomeValue,
            row.Score,
            name,
            value,
            weight,
            contribution,
            percentContribution
        );

        index++;
    }

    // For bulk scoring, the ApplyToData API can also be used.
    var scoredData = featureContributionCalculator.ApplyToData(mlContext, transformedData);
    var preview = scoredData.Preview(100);
}
public void TestFeatureImportance()
{
    // Setup synthetic dataset: y = 10*x1 + 20*x2 + 5.5*x3 + noise.
    // X4Rand does not influence the label at all.
    const int numberOfInstances = 1000;
    var rand = new Random(10);
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x2Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances],
        x4RandArray = new float[numberOfInstances];

    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x2Important = rand.Next(10000);
        x2Array[i] = x2Important;
        var x3 = rand.Next(5000);
        x3Array[i] = x3;
        var x4Rand = rand.Next(1000);
        x4RandArray[i] = x4Rand;

        var noise = rand.Next(50);
        yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2Important", NumberType.Float, x2Array);
    bldr.AddColumn("X3", NumberType.Float, x3Array);
    bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .AppendCacheCheckpoint(ML)
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = ML.Regression.Trainers.OrdinaryLeastSquares().Fit(data);
    var args = new FeatureContributionCalculationTransform.Arguments() { Bottom = 10, Top = 10 };
    var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

    // Get prediction scores and contributions.
    var enumerator = output.AsEnumerable<ScoreAndContribution>(Env, true).GetEnumerator();
    var expectedValues = new List<float[]>();
    expectedValues.Add(new float[4] { 0.06319684F, 1, 0.1386623F, 4.46209469E-06F });
    expectedValues.Add(new float[4] { 0.03841561F, 1, 0.1633037F, 2.68303256E-06F });
    expectedValues.Add(new float[4] { 0.12006103F, 1, 0.254072F, 1.18671605E-05F });
    expectedValues.Add(new float[4] { 0.20861618F, 0.99999994F, 0.407312155F, 6.963478E-05F });
    expectedValues.Add(new float[4] { 0.024050576F, 0.99999994F, 0.31106182F, 8.456762E-06F });

    int index = 0;
    while (enumerator.MoveNext() && index < expectedValues.Count)
    {
        var row = enumerator.Current;
        // Compare to 6 decimal places: floats only carry ~7 significant digits,
        // so exact equality would be brittle.
        Assert.Equal(expectedValues[index][0], row.FeatureContributions[0], 6);
        Assert.Equal(expectedValues[index][1], row.FeatureContributions[1], 6);
        Assert.Equal(expectedValues[index][2], row.FeatureContributions[2], 6);
        Assert.Equal(expectedValues[index++][3], row.FeatureContributions[3], 6);
    }

    Done();
}
/// <summary>
/// Features: x1, x2, x3, xRand; y = 10*x1 + 20*x2 + 5.5*x3 + e. xRand is random and the
/// label y does NOT depend on it (its weight in <c>xWeightArray</c> is 0), so it should
/// receive the smallest contribution score.
/// Test verifies that feature contribution scores are output along with a score for predicted data,
/// and compares the first <paramref name="expectedValues"/>.Count rows against the expected
/// contributions to the given <paramref name="precision"/> (decimal places).
/// </summary>
private void TestFeatureContribution(
    ITrainerEstimator<ISingleFeaturePredictionTransformer<IPredictor>, IPredictor> trainer,
    List<float[]> expectedValues,
    int precision = 6)
{
    // Setup synthetic dataset.
    const int numInstances = 1000;
    const int numFeatures = 4;

    var rand = new Random(10);
    float[] yArray = new float[numInstances];
    float[][] xArray = new float[numFeatures][];
    int[] xRangeArray = new[] { 1000, 10000, 5000, 1000 };
    float[] xWeightArray = new[]
    {
        10,
        20, // Most important feature with high weight. Should have the highest contribution.
        5.5f,
        0, // Least important feature. Should have the least contribution.
    };

    // Allocate every feature column once up front instead of re-checking for null
    // on each of the 1000 instances. The Random call order below is unchanged.
    for (int featureIndex = 0; featureIndex < numFeatures; featureIndex++)
    {
        xArray[featureIndex] = new float[numInstances];
    }

    for (var instanceIndex = 0; instanceIndex < numInstances; instanceIndex++)
    {
        for (int featureIndex = 0; featureIndex < numFeatures; featureIndex++)
        {
            xArray[featureIndex][instanceIndex] = rand.Next(xRangeArray[featureIndex]);
            yArray[instanceIndex] += xArray[featureIndex][instanceIndex] * xWeightArray[featureIndex];
        }
        var noise = rand.Next(50);
        yArray[instanceIndex] += noise;
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, xArray[0]);
    bldr.AddColumn("X2Important", NumberType.Float, xArray[1]);
    bldr.AddColumn("X3", NumberType.Float, xArray[2]);
    bldr.AddColumn("X4Rand", NumberType.Float, xArray[3]);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .AppendCacheCheckpoint(ML)
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = trainer.Fit(data);
    var args = new FeatureContributionCalculationTransform.Arguments() { Bottom = 10, Top = 10 };
    var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

    var transformedOutput = output.AsEnumerable<ScoreAndContribution>(Env, true);
    int rowIndex = 0;
    foreach (var row in transformedOutput.Take(expectedValues.Count))
    {
        var expectedValue = expectedValues[rowIndex++];
        for (int i = 0; i < numFeatures; i++)
        {
            Assert.Equal(expectedValue[i], row.FeatureContributions[i], precision);
        }
    }

    Done();
}
public void TestFeatureImportance()
{
    // Setup synthetic dataset: y = 10*x1 + 20*x2 + 5.5*x3 + noise.
    // X4Rand does not influence the label at all.
    const int numberOfInstances = 1000;
    var rand = new Random(10);
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x2Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances],
        x4RandArray = new float[numberOfInstances];

    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x2Important = rand.Next(10000);
        x2Array[i] = x2Important;
        var x3 = rand.Next(5000);
        x3Array[i] = x3;
        var x4Rand = rand.Next(1000);
        x4RandArray[i] = x4Rand;

        var noise = rand.Next(50);
        yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2Important", NumberType.Float, x2Array);
    bldr.AddColumn("X3", NumberType.Float, x3Array);
    bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
    var args = new FeatureContributionCalculationTransform.Arguments() { Bottom = 10, Top = 10 };
    var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

    // Get prediction scores and contributions.
    var enumerator = output.AsEnumerable<ScoreAndContribution>(Env, true).GetEnumerator();
    var expectedValues = new List<float[]>();
    expectedValues.Add(new float[4] { 0.15640761F, 1, 0.155862764F, 0.07276783F });
    expectedValues.Add(new float[4] { 0.09507586F, 1, 0.1835608F, 0.0437548943F });
    expectedValues.Add(new float[4] { 0.297142357F, 1, 0.2855884F, 0.193529665F });
    expectedValues.Add(new float[4] { 0.45465675F, 0.8805887F, 0.4031663F, 1 });
    expectedValues.Add(new float[4] { 0.0595234372F, 0.99999994F, 0.349647522F, 0.137912869F });

    int index = 0;
    while (enumerator.MoveNext() && index < expectedValues.Count)
    {
        var row = enumerator.Current;
        // Exact floating-point equality (==) is brittle; compare to 6 decimal places,
        // consistent with the other feature-importance tests in this suite.
        Assert.Equal(expectedValues[index][0], row.FeatureContributions[0], 6);
        Assert.Equal(expectedValues[index][1], row.FeatureContributions[1], 6);
        Assert.Equal(expectedValues[index][2], row.FeatureContributions[2], 6);
        Assert.Equal(expectedValues[index++][3], row.FeatureContributions[3], 6);
    }

    Done();
}