/// <summary>
/// Exercises the <c>Transposer</c> over a synthetic data view containing a mix of
/// vector and scalar columns, both sparse-ish and dense-ish, checking schema index
/// mapping and transposed contents (via TransposeCheckHelper) with and without
/// forced saving.
/// </summary>
public void TransposerTest()
{
    const int rowCount = 1000;
    Random rgen = new Random(0); // Fixed seed: generated data must be deterministic.
    ArrayDataViewBuilder builder = new ArrayDataViewBuilder(Env);
    // A is to check the splitting of a sparse-ish column.
    var dataA = GenerateHelper(rowCount, 0.1, rgen, () => (int)rgen.Next(), 50, 5, 10, 15);
    dataA[rowCount / 2] = new VBuffer<int>(50, 0, null, null); // Coverage for the null vbuffer case.
    builder.AddColumn("A", NumberType.I4, dataA);
    // B is to check the splitting of a dense-ish column.
    builder.AddColumn("B", NumberType.R8, GenerateHelper(rowCount, 0.8, rgen, rgen.NextDouble, 50, 0, 25, 49));
    // C is to just have some column we do nothing with.
    builder.AddColumn("C", NumberType.I2, GenerateHelper(rowCount, 0.1, rgen, () => (short)1, 30, 3, 10, 24));
    // D is to check some column we don't have to split because it's sufficiently small.
    builder.AddColumn("D", NumberType.R8, GenerateHelper(rowCount, 0.1, rgen, rgen.NextDouble, 3, 1));
    // E is to check a sparse scalar column.
    builder.AddColumn("E", NumberType.U4, GenerateHelper(rowCount, 0.1, rgen, () => (uint)rgen.Next(int.MinValue, int.MaxValue)));
    // F is to check a dense-ish scalar column.
    builder.AddColumn("F", NumberType.I4, GenerateHelper(rowCount, 0.8, rgen, () => rgen.Next()));
    IDataView view = builder.GetDataView();
    // Do not force save. This will have a mix of passthrough and saved columns. Note that duplicate
    // specification of "D" to test that specifying a column twice has no ill effects.
    string[] names = { "B", "A", "E", "D", "F", "D" };
    using (Transposer trans = Transposer.Create(Env, view, false, names))
    {
        // Before checking the contents, check the names.
        // The transposer is expected to keep every column at the same index as the source view.
        for (int i = 0; i < names.Length; ++i)
        {
            int index;
            Assert.True(trans.Schema.TryGetColumnIndex(names[i], out index), $"Transpose schema couldn't find column '{names[i]}'");
            int trueIndex;
            bool result = view.Schema.TryGetColumnIndex(names[i], out trueIndex);
            Contracts.Assert(result);
            Assert.True(trueIndex == index, $"Transpose schema had column '{names[i]}' at unexpected index");
        }
        // Check the contents
        Assert.Null(trans.TransposeSchema.GetSlotType(2)); // C check to see that it's not transposable.
        TransposeCheckHelper<int>(view, 0, trans);    // A check.
        TransposeCheckHelper<Double>(view, 1, trans); // B check.
        TransposeCheckHelper<Double>(view, 3, trans); // D check.
        TransposeCheckHelper<uint>(view, 4, trans);   // E check.
        TransposeCheckHelper<int>(view, 5, trans);    // F check.
    }

    // Force save. Recheck columns that would have previously been passthrough columns.
    // The primary benefit of this check is that we check the binary saving / loading
    // functionality of scalars which are otherwise always must necessarily be
    // passthrough. Also exercise the select by index functionality while we're at it.
    using (Transposer trans = Transposer.Create(Env, view, true, 3, 5, 4))
    {
        // Check to see that A, B, and C were not transposed somehow.
        Assert.Null(trans.TransposeSchema.GetSlotType(0));
        Assert.Null(trans.TransposeSchema.GetSlotType(1));
        Assert.Null(trans.TransposeSchema.GetSlotType(2));
        TransposeCheckHelper<Double>(view, 3, trans); // D check.
        TransposeCheckHelper<uint>(view, 4, trans);   // E check.
        TransposeCheckHelper<int>(view, 5, trans);    // F check.
    }
}
/// <summary>
/// Features: x1, x2vBuff (sparse vector), x3.
/// y = 10*x1 + 10*x2vBuff + 20*x3 + e.
/// Within x2vBuff, the 2nd slot will be sparse most of the time.
/// The 2nd slot of x2vBuff has the least importance: evaluation metrics do not change a lot when this slot is permuted.
/// x3 has the biggest importance (largest weight times largest range; see TestSparseSGD's expectations).
/// </summary>
private IDataView GetSparseDataset(TaskType task = TaskType.Regression)
{
    // Setup synthetic dataset.
    const int numberOfInstances = 10000;
    var rand = new Random(10); // Fixed seed: generated data must be deterministic.
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances];
    VBuffer<float>[] vbArray = new VBuffer<float>[numberOfInstances];
    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x3Important = rand.Next(10000);
        x3Array[i] = x3Important;
        VBuffer<float> vb;
        // Every 10th row is fully dense (all 4 slots); otherwise slot 1 is left implicit (sparse zero).
        if (i % 10 != 0)
        {
            vb = new VBuffer<float>(4, 3, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 2, 3 });
        }
        else
        {
            vb = new VBuffer<float>(4, 4, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 1, 2, 3 });
        }
        vbArray[i] = vb;
        // Each vector slot contributes with weight 10 to the label.
        float vbSum = 0;
        foreach (var vbValue in vb.DenseValues())
        {
            vbSum += vbValue * 10;
        }
        var noise = rand.Next(50);
        yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
    }

    // If binary classification, modify the labels
    if (task == TaskType.BinaryClassification ||
        task == TaskType.MulticlassClassification)
    {
        GetBinaryClassificationLabels(yArray);
    }
    else if (task == TaskType.Ranking)
    {
        GetRankingLabels(yArray);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2VBuffer", NumberType.Float, vbArray);
    bldr.AddColumn("X3Important", NumberType.Float, x3Array);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    if (task == TaskType.Ranking)
    {
        bldr.AddColumn("GroupId", NumberType.U4, CreateGroupIds(yArray.Length));
    }
    var srcDV = bldr.GetDataView();
    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
        .Append(ML.Transforms.Normalize("Features"));

    // Create a keytype for Ranking
    if (task == TaskType.Ranking)
    {
        return (pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
            .Fit(srcDV).Transform(srcDV));
    }
    return (pipeline.Fit(srcDV).Transform(srcDV));
}
/// <summary>
/// Features: x1, x2, x3, xRand; y = 10*x1 + 20*x2 + 5.5*x3 + e; xRand is random and the
/// label y is not dependent on xRand (its weight is 0 below).
/// Test verifies that feature contribution scores are output along with a score for predicted data.
/// </summary>
/// <param name="trainer">The trainer to fit on the synthetic data.</param>
/// <param name="expectedValues">Expected per-row feature contributions; one float[4] per checked row.</param>
/// <param name="precision">Decimal digits of precision for the comparisons (float carries about 7 significant digits).</param>
private void TestFeatureContribution(
    ITrainerEstimator<ISingleFeaturePredictionTransformer<IPredictor>, IPredictor> trainer,
    List<float[]> expectedValues,
    int precision = 6)
{
    // Setup synthetic dataset.
    const int numInstances = 1000;
    const int numFeatures = 4;
    var rand = new Random(10); // Fixed seed: generated data must be deterministic.
    float[] yArray = new float[numInstances];
    float[][] xArray = new float[numFeatures][];
    // Per-feature value ranges for rand.Next.
    int[] xRangeArray = new[] { 1000, 10000, 5000, 1000 };
    float[] xWeightArray = new[] {
        10,
        20, // Most important feature with high weight. Should have the highest contribution.
        5.5f,
        0, // Least important feature. Should have the least contribution.
    };
    for (var instanceIndex = 0; instanceIndex < numInstances; instanceIndex++)
    {
        for (int featureIndex = 0; featureIndex < numFeatures; featureIndex++)
        {
            // Lazily allocate each feature column on first use.
            if (xArray[featureIndex] == null)
            {
                xArray[featureIndex] = new float[numInstances];
            }
            xArray[featureIndex][instanceIndex] = rand.Next(xRangeArray[featureIndex]);
            yArray[instanceIndex] += xArray[featureIndex][instanceIndex] * xWeightArray[featureIndex];
        }
        var noise = rand.Next(50);
        yArray[instanceIndex] += noise;
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, xArray[0]);
    bldr.AddColumn("X2Important", NumberType.Float, xArray[1]);
    bldr.AddColumn("X3", NumberType.Float, xArray[2]);
    bldr.AddColumn("X4Rand", NumberType.Float, xArray[3]);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();
    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .AppendCacheCheckpoint(ML)
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = trainer.Fit(data);
    var args = new FeatureContributionCalculationTransform.Arguments()
    {
        Bottom = 10,
        Top = 10
    };
    var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);
    var transformedOutput = output.AsEnumerable<ScoreAndContribution>(Env, true);
    int rowIndex = 0;
    // Compare only the first expectedValues.Count rows against the provided baselines.
    foreach (var row in transformedOutput.Take(expectedValues.Count))
    {
        var expectedValue = expectedValues[rowIndex++];
        for (int i = 0; i < numFeatures; i++)
        {
            Assert.Equal(expectedValue[i], row.FeatureContributions[i], precision);
        }
    }
    Done();
}
/// <summary>
/// Combines per-fold evaluation outputs into a single result: concatenates per-instance
/// metric views, averages/combines overall metrics, reconciles and concatenates confusion
/// matrices (when present), and collects warnings — including a new warning when columns
/// of variable length were detected.
/// </summary>
/// <param name="env">Host environment used for asserts and to construct transforms/views.</param>
/// <param name="input">The per-fold metrics, confusion matrices, warnings and column role names.</param>
/// <returns>The combined per-instance metrics, overall metrics, confusion matrix and warnings.</returns>
public static CombinedOutput CombineMetrics(IHostEnvironment env, CombineMetricsInput input)
{
    var eval = GetEvaluator(env, input.Kind);
    // Bind the label/weight/group/name roles for each per-instance view, then concatenate them.
    // Columns whose vector length varied across folds are reported back in
    // variableSizeVectorColumnNames rather than concatenated.
    var perInst = EvaluateUtils.ConcatenatePerInstanceDataViews(env, eval, true, true,
        input.PerInstanceMetrics.Select(idv => new RoleMappedData(idv, opt: true,
            RoleMappedSchema.ColumnRole.Label.Bind(input.LabelColumn),
            RoleMappedSchema.ColumnRole.Weight.Bind(input.WeightColumn.Value),
            RoleMappedSchema.ColumnRole.Group.Bind(input.GroupColumn),
            RoleMappedSchema.ColumnRole.Name.Bind(input.NameColumn.Value))).ToArray(),
        out var variableSizeVectorColumnNames);

    var warnings = input.Warnings != null ? new List<IDataView>(input.Warnings) : new List<IDataView>();
    if (variableSizeVectorColumnNames.Length > 0)
    {
        // Surface the dropped variable-length columns as an additional warning row.
        var dvBldr = new ArrayDataViewBuilder(env);
        var warn = $"Detected columns of variable length: {string.Join(", ", variableSizeVectorColumnNames)}." +
            $" Consider setting collateMetrics- for meaningful per-Folds results.";
        dvBldr.AddColumn(MetricKinds.ColumnNames.WarningText, TextType.Instance, warn.AsMemory());
        warnings.Add(dvBldr.GetDataView());
    }

    env.Assert(Utils.Size(perInst) == 1);
    var overall = eval.GetOverallResults(input.OverallMetrics);
    overall = EvaluateUtils.CombineFoldMetricsDataViews(env, overall, input.OverallMetrics.Length);

    IDataView conf = null;
    if (Utils.Size(input.ConfusionMatrix) > 0)
    {
        // Reconciling slot names appends a fresh Count column; the original becomes hidden.
        EvaluateUtils.ReconcileSlotNames<double>(env, input.ConfusionMatrix, MetricKinds.ColumnNames.Count, NumberType.R8);
        for (int i = 0; i < input.ConfusionMatrix.Length; i++)
        {
            var idv = input.ConfusionMatrix[i];
            // Find the old Count column and drop it.
            for (int col = 0; col < idv.Schema.ColumnCount; col++)
            {
                if (idv.Schema[col].IsHidden &&
                    idv.Schema.GetColumnName(col).Equals(MetricKinds.ColumnNames.Count))
                {
                    input.ConfusionMatrix[i] = new ChooseColumnsByIndexTransform(env,
                        new ChooseColumnsByIndexTransform.Arguments() { Drop = true, Index = new[] { col } }, idv);
                    break;
                }
            }
        }
        conf = EvaluateUtils.ConcatenateOverallMetrics(env, input.ConfusionMatrix);
    }

    // Stack all warning views into a single view (null when there are none).
    var warningsIdv = warnings.Count > 0 ? AppendRowsDataView.Create(env, warnings[0].Schema, warnings.ToArray()) : null;
    return (new CombinedOutput()
    {
        PerInstanceMetrics = perInst[0],
        OverallMetrics = overall,
        ConfusionMatrix = conf,
        Warnings = warningsIdv
    });
}
/// <summary>
/// Features: x1, x2, x3, xRand; y = 10*x1 + 20*x2 + 5.5*x3 + e; xRand is random and the
/// label y is not dependent on xRand.
/// xRand has the least importance: evaluation metrics do not change a lot when xRand is permuted.
/// x2 has the biggest importance.
/// </summary>
private IDataView GetDenseDataset(TaskType task = TaskType.Regression)
{
    Contracts.Assert(task != TaskType.Clustering, $"TaskType {nameof(TaskType.Clustering)} not supported.");

    // Setup synthetic dataset.
    const int numberOfInstances = 1000;
    var rand = new Random(10); // Fixed seed: generated data must be deterministic.
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x2Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances],
        x4RandArray = new float[numberOfInstances];
    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x2Important = rand.Next(10000);
        x2Array[i] = x2Important;
        var x3 = rand.Next(5000);
        x3Array[i] = x3;
        var x4Rand = rand.Next(1000);
        x4RandArray[i] = x4Rand;
        var noise = rand.Next(50);
        // Note: x4Rand deliberately does not appear in the label.
        yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
    }

    // If binary classification, modify the labels
    if (task == TaskType.BinaryClassification ||
        task == TaskType.MulticlassClassification)
    {
        GetBinaryClassificationLabels(yArray);
    }
    else if (task == TaskType.Ranking)
    {
        GetRankingLabels(yArray);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2Important", NumberType.Float, x2Array);
    bldr.AddColumn("X3", NumberType.Float, x3Array);
    bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    if (task == TaskType.Ranking)
    {
        bldr.AddColumn("GroupId", NumberType.U4, CreateGroupIds(yArray.Length));
    }
    var srcDV = bldr.GetDataView();
    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .Append(ML.Transforms.Normalize("Features"));

    // Create a keytype for Ranking
    if (task == TaskType.Ranking)
    {
        return (pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
            .Fit(srcDV).Transform(srcDV));
    }
    return (pipeline.Fit(srcDV).Transform(srcDV));
}
/// <summary>
/// Features: x1, x2, x3, xRand; y = 10*x1 + 20*x2 + 5.5*x3 + e; xRand is random and the
/// label y is not dependent on xRand.
/// xRand has the least importance: evaluation metrics do not change a lot when xRand is permuted.
/// x2 has the biggest importance.
/// </summary>
private IDataView GetDenseDataset(TaskType task = TaskType.Regression)
{
    // Setup synthetic dataset.
    const int numberOfInstances = 1000;
    var rand = new Random(10); // Fixed seed: generated data must be deterministic.
    float[] yArray = new float[numberOfInstances];
    float[] x1Array = new float[numberOfInstances];
    float[] x2Array = new float[numberOfInstances];
    float[] x3Array = new float[numberOfInstances];
    float[] x4RandArray = new float[numberOfInstances];

    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x2Important = rand.Next(10000);
        x2Array[i] = x2Important;
        var x3 = rand.Next(5000);
        x3Array[i] = x3;
        var x4Rand = rand.Next(1000);
        x4RandArray[i] = x4Rand;
        var noise = rand.Next(50);
        // Note: x4Rand deliberately does not appear in the label.
        yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
    }

    // If binary classification, modify the labels
    if (task == TaskType.BinaryClassification ||
        task == TaskType.MulticlassClassification)
    {
        GetBinaryClassificationLabels(yArray);
    }
    else if (task == TaskType.Ranking)
    {
        GetRankingLabels(yArray);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberDataViewType.Single, x1Array);
    bldr.AddColumn("X2Important", NumberDataViewType.Single, x2Array);
    bldr.AddColumn("X3", NumberDataViewType.Single, x3Array);
    bldr.AddColumn("X4Rand", NumberDataViewType.Single, x4RandArray);
    bldr.AddColumn("Label", NumberDataViewType.Single, yArray);
    if (task == TaskType.Ranking)
    {
        bldr.AddColumn("GroupId", NumberDataViewType.UInt32, CreateGroupIds(yArray.Length));
    }
    var srcDV = bldr.GetDataView();
    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .Append(ML.Transforms.NormalizeMinMax("Features"));

    // Convert the label column to the type each task expects (bool for binary
    // classification, key for multiclass; ranking additionally gets a GroupId key).
    if (task == TaskType.BinaryClassification)
    {
        return (pipeline.Append(ML.Transforms.Conversion.ConvertType("Label", outputKind: DataKind.Boolean))
            .Fit(srcDV).Transform(srcDV));
    }
    else if (task == TaskType.MulticlassClassification)
    {
        return (pipeline.Append(ML.Transforms.Conversion.MapValueToKey("Label"))
            .Fit(srcDV).Transform(srcDV));
    }
    else if (task == TaskType.Ranking)
    {
        return (pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
            .Fit(srcDV).Transform(srcDV));
    }
    return (pipeline.Fit(srcDV).Transform(srcDV));
}
/// <summary>
/// Trains the stacking meta-model: scores the training data with every base model, uses the
/// base-model predictions as the features of a new data view, and trains an instance of
/// <c>BasePredictorType</c> on that view, storing the result in <c>Meta</c>.
/// May only be called once.
/// </summary>
/// <param name="models">Base models; each predictor must implement IValueMapper.</param>
/// <param name="data">Training data supplying features and labels.</param>
/// <param name="env">Host environment used for registration, checks and channels.</param>
public void Train(List<FeatureSubsetModel<IPredictorProducing<TOutput>>> models, RoleMappedData data, IHostEnvironment env)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(Stacking.LoadName);
    host.CheckValue(models, nameof(models));
    host.CheckValue(data, nameof(data));

    using (var ch = host.Start("Training stacked model"))
    {
        ch.Check(Meta == null, "Train called multiple times");
        ch.Check(BasePredictorType != null);

        // One value mapper per base model; used below to score each cursored row.
        var maps = new ValueMapper<VBuffer<Single>, TOutput>[models.Count];
        for (int i = 0; i < maps.Length; i++)
        {
            Contracts.Assert(models[i].Predictor is IValueMapper);
            var m = (IValueMapper)models[i].Predictor;
            maps[i] = m.GetMapper<VBuffer<Single>, TOutput>();
        }

        // REVIEW: Should implement this better....
        // Buffers grow geometrically via Utils.EnsureSize and are trimmed to 'count' afterwards.
        var labels = new Single[100];
        var features = new VBuffer<Single>[100];
        int count = 0;
        // REVIEW: Should this include bad values or filter them?
        using (var cursor = new FloatLabelCursor(data, CursOpt.AllFeatures | CursOpt.AllLabels))
        {
            TOutput[] predictions = new TOutput[maps.Length];
            var vBuffers = new VBuffer<Single>[maps.Length];
            while (cursor.MoveNext())
            {
                // Score the current row with every base model in parallel; iteration i
                // writes only to its own slots of vBuffers/predictions.
                Parallel.For(0, maps.Length, i =>
                {
                    var model = models[i];
                    if (model.SelectedFeatures != null)
                    {
                        // This model was trained on a feature subset: project first, then score.
                        EnsembleUtils.SelectFeatures(ref cursor.Features, model.SelectedFeatures, model.Cardinality, ref vBuffers[i]);
                        maps[i](ref vBuffers[i], ref predictions[i]);
                    }
                    else
                    {
                        maps[i](ref cursor.Features, ref predictions[i]);
                    }
                });
                Utils.EnsureSize(ref labels, count + 1);
                Utils.EnsureSize(ref features, count + 1);
                labels[count] = cursor.Label;
                FillFeatureBuffer(predictions, ref features[count]);
                count++;
            }
        }

        ch.Info("The number of instances used for stacking trainer is {0}", count);

        // Build the meta-training view: labels plus base-model predictions as features.
        var bldr = new ArrayDataViewBuilder(host);
        Array.Resize(ref labels, count);
        Array.Resize(ref features, count);
        bldr.AddColumn(DefaultColumnNames.Label, NumberType.Float, labels);
        bldr.AddColumn(DefaultColumnNames.Features, NumberType.Float, features);

        var view = bldr.GetDataView();
        var rmd = new RoleMappedData(view, DefaultColumnNames.Label, DefaultColumnNames.Features);

        var trainer = BasePredictorType.CreateInstance(host);
        if (trainer.Info.NeedNormalization)
        {
            ch.Warning("The trainer specified for stacking wants normalization, but we do not currently allow this.");
        }
        Meta = trainer.Train(rmd);
        CheckMeta();

        ch.Done();
    }
}
/// <summary>
/// Helper function that builds an IDataView with a key column and a value column from
/// parallel sequences of keys and non-vector values.
/// </summary>
/// <param name="env">Host environment used to construct the data view.</param>
/// <param name="keys">Keys, one per row.</param>
/// <param name="values">Values, one per row, aligned with <paramref name="keys"/>.</param>
/// <param name="keyColumnName">Name of the key column.</param>
/// <param name="valueColumnName">Name of the value column.</param>
/// <param name="treatValuesAsKeyTypes">If true, the value column is emitted as a key-typed
/// column whose key-value metadata is the distinct set of values.</param>
internal static IDataView CreateDataView<TKey, TValue>(IHostEnvironment env,
    IEnumerable<TKey> keys,
    IEnumerable<TValue> values,
    string keyColumnName,
    string valueColumnName,
    bool treatValuesAsKeyTypes)
{
    var keyType = GetPrimitiveType(typeof(TKey), out bool isKeyVectorType);
    var valueType = GetPrimitiveType(typeof(TValue), out bool isValueVectorType);
    var dataViewBuilder = new ArrayDataViewBuilder(env);
    AddColumnWrapper(dataViewBuilder, keyColumnName, keyType, keys.ToArray());
    if (treatValuesAsKeyTypes)
    {
        // Materialize the sequence exactly once. The previous implementation
        // re-enumerated 'values' repeatedly (Count()/ElementAt(i) in a loop is O(n^2)
        // and evaluates a lazy sequence many times over).
        TValue[] valueArray = values.ToArray();

        // When treating the values as KeyTypes, generate the unique set of values.
        // This is used for generating the metadata of the column. HashSet.Add ignores
        // duplicates, so no Contains pre-check is needed; insertion order is preserved
        // the same way the original sequential Add loop did.
        var valueSet = new HashSet<TValue>(valueArray);
        var metaKeys = valueSet.ToArray();

        // Key values are treated in one of two ways:
        // If the values are of type uint or ulong, these values are used directly as the key types and no new keys are created.
        // Otherwise, key values are generated as uints starting from 1, since 0 is the missing key.
        if (valueType.RawType == typeof(uint))
        {
            uint[] indices = valueArray.Select(x => Convert.ToUInt32(x)).ToArray();
            dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), (ulong)metaKeys.Length, indices);
        }
        else if (valueType.RawType == typeof(ulong))
        {
            ulong[] indices = valueArray.Select(x => Convert.ToUInt64(x)).ToArray();
            dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), (ulong)metaKeys.Length, indices);
        }
        else
        {
            // When generating the indices, two values that are the same are assigned
            // the same index. The dictionary maintains uniqueness; 'indices' holds the
            // full per-row list (same length as values).
            var keyTypeValueMapping = new Dictionary<TValue, uint>();
            uint[] indices = new uint[valueArray.Length];
            // Start the index at 1; 0 is reserved for the missing key value.
            uint nextIndex = 1;
            for (int i = 0; i < valueArray.Length; ++i)
            {
                TValue value = valueArray[i];
                // Single dictionary lookup instead of ContainsKey + indexer.
                if (!keyTypeValueMapping.TryGetValue(value, out uint keyValue))
                {
                    keyValue = nextIndex++;
                    keyTypeValueMapping.Add(value, keyValue);
                }
                indices[i] = keyValue;
            }
            dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), (ulong)metaKeys.Length, indices);
        }
    }
    else
    {
        AddColumnWrapper(dataViewBuilder, valueColumnName, valueType, values.ToArray());
    }
    return (dataViewBuilder.GetDataView());
}
/// <summary>
/// Checks the FeatureContributionCalculationTransform output of an OrdinaryLeastSquares
/// regression model against baseline contribution values for the first five rows.
/// </summary>
public void TestFeatureImportance()
{
    // Setup synthetic dataset.
    const int numberOfInstances = 1000;
    var rand = new Random(10); // Fixed seed: generated data must be deterministic.
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x2Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances],
        x4RandArray = new float[numberOfInstances];
    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x2Important = rand.Next(10000);
        x2Array[i] = x2Important;
        var x3 = rand.Next(5000);
        x3Array[i] = x3;
        var x4Rand = rand.Next(1000);
        x4RandArray[i] = x4Rand;
        var noise = rand.Next(50);
        // Note: x4Rand deliberately does not appear in the label.
        yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2Important", NumberType.Float, x2Array);
    bldr.AddColumn("X3", NumberType.Float, x3Array);
    bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();
    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .AppendCacheCheckpoint(ML)
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = ML.Regression.Trainers.OrdinaryLeastSquares().Fit(data);
    var args = new FeatureContributionCalculationTransform.Arguments()
    {
        Bottom = 10,
        Top = 10
    };
    var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

    // Get prediction scores and contributions
    var enumerator = output.AsEnumerable<ScoreAndContribution>(Env, true).GetEnumerator();
    ScoreAndContribution row = null;
    // Baseline contribution values for the first five rows.
    var expectedValues = new List<float[]>();
    expectedValues.Add(new float[4] { 0.06319684F, 1, 0.1386623F, 4.46209469E-06F });
    expectedValues.Add(new float[4] { 0.03841561F, 1, 0.1633037F, 2.68303256E-06F });
    expectedValues.Add(new float[4] { 0.12006103F, 1, 0.254072F, 1.18671605E-05F });
    expectedValues.Add(new float[4] { 0.20861618F, 0.99999994F, 0.407312155F, 6.963478E-05F });
    expectedValues.Add(new float[4] { 0.024050576F, 0.99999994F, 0.31106182F, 8.456762E-06F });
    int index = 0;
    while (enumerator.MoveNext() && index < expectedValues.Count)
    {
        row = enumerator.Current;
        // Compare at a precision of 6 decimal digits, since float carries only about 7
        // significant digits.
        Assert.Equal(expectedValues[index][0], row.FeatureContributions[0], 6);
        Assert.Equal(expectedValues[index][1], row.FeatureContributions[1], 6);
        Assert.Equal(expectedValues[index][2], row.FeatureContributions[2], 6);
        Assert.Equal(expectedValues[index++][3], row.FeatureContributions[3], 6);
    }
    Done();
}
/// <summary>
/// Checks the FeatureContributionCalculationTransform output of an OnlineGradientDescent
/// regression model against baseline contribution values for the first five rows.
/// </summary>
public void TestFeatureImportance()
{
    // Setup synthetic dataset.
    const int numberOfInstances = 1000;
    var rand = new Random(10); // Fixed seed: generated data must be deterministic.
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x2Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances],
        x4RandArray = new float[numberOfInstances];
    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x2Important = rand.Next(10000);
        x2Array[i] = x2Important;
        var x3 = rand.Next(5000);
        x3Array[i] = x3;
        var x4Rand = rand.Next(1000);
        x4RandArray[i] = x4Rand;
        var noise = rand.Next(50);
        // Note: x4Rand deliberately does not appear in the label.
        yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2Important", NumberType.Float, x2Array);
    bldr.AddColumn("X3", NumberType.Float, x3Array);
    bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();
    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
    var args = new FeatureContributionCalculationTransform.Arguments()
    {
        Bottom = 10,
        Top = 10
    };
    var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

    // Get prediction scores and contributions
    var enumerator = output.AsEnumerable<ScoreAndContribution>(Env, true).GetEnumerator();
    ScoreAndContribution row = null;
    // Baseline contribution values for the first five rows.
    var expectedValues = new List<float[]>();
    expectedValues.Add(new float[4] { 0.15640761F, 1, 0.155862764F, 0.07276783F });
    expectedValues.Add(new float[4] { 0.09507586F, 1, 0.1835608F, 0.0437548943F });
    expectedValues.Add(new float[4] { 0.297142357F, 1, 0.2855884F, 0.193529665F });
    expectedValues.Add(new float[4] { 0.45465675F, 0.8805887F, 0.4031663F, 1 });
    expectedValues.Add(new float[4] { 0.0595234372F, 0.99999994F, 0.349647522F, 0.137912869F });
    int index = 0;
    while (enumerator.MoveNext() && index < expectedValues.Count)
    {
        row = enumerator.Current;
        // Compare at a precision of 6 decimal digits rather than exact float equality:
        // bitwise == on floats is brittle across platforms/JITs, and the sibling
        // OrdinaryLeastSquares test already compares at this precision. Exact equality
        // implies precision-6 equality, so this only makes the test less fragile.
        Assert.Equal(expectedValues[index][0], row.FeatureContributions[0], 6);
        Assert.Equal(expectedValues[index][1], row.FeatureContributions[1], 6);
        Assert.Equal(expectedValues[index][2], row.FeatureContributions[2], 6);
        Assert.Equal(expectedValues[index++][3], row.FeatureContributions[3], 6);
    }
    Done();
}
/// <summary>
/// Regression PFI sanity check on a dense synthetic dataset: permuting the heavily
/// weighted X2Important column must move every metric the most, while permuting the
/// unused X4Rand column must move them the least.
/// </summary>
public void TestDenseSGD()
{
    // Build a synthetic regression dataset with a fixed seed so results are repeatable.
    const int instanceCount = 1000;
    var rng = new Random(10);
    var labels = new float[instanceCount];
    var x1Values = new float[instanceCount];
    var x2Values = new float[instanceCount];
    var x3Values = new float[instanceCount];
    var x4Values = new float[instanceCount];
    for (int i = 0; i < instanceCount; i++)
    {
        // The order of rng.Next calls fixes the generated data; do not reorder.
        int x1 = rng.Next(1000);
        x1Values[i] = x1;
        int x2 = rng.Next(10000);
        x2Values[i] = x2;
        int x3 = rng.Next(5000);
        x3Values[i] = x3;
        int x4 = rng.Next(1000);
        x4Values[i] = x4;
        int noise = rng.Next(50);
        // The label depends on X1, X2 (dominant) and X3 — but never on X4.
        labels[i] = (float)(10 * x1 + 20 * x2 + 5.5 * x3 + noise);
    }

    // Assemble the data view.
    var builder = new ArrayDataViewBuilder(Env);
    builder.AddColumn("X1", NumberType.Float, x1Values);
    builder.AddColumn("X2Important", NumberType.Float, x2Values);
    builder.AddColumn("X3", NumberType.Float, x3Values);
    builder.AddColumn("X4Rand", NumberType.Float, x4Values);
    builder.AddColumn("Label", NumberType.Float, labels);
    var sourceData = builder.GetDataView();

    // Concatenate and normalize the features, train an OGD regressor, then compute PFI.
    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .Append(ML.Transforms.Normalize("Features"));
    var transformedData = pipeline.Fit(sourceData).Transform(sourceData);
    var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(transformedData);
    var pfi = ML.Regression.PermutationFeatureImportance(model, transformedData);

    // PFI slot indices: X1 = 0, X2Important = 1, X3 = 2, X4Rand = 3.
    // Loss-style metrics (lower is better): largest delta marks the most important feature.
    Assert.Equal(3, MinDeltaIndex(pfi, m => m.L1));
    Assert.Equal(1, MaxDeltaIndex(pfi, m => m.L1));
    Assert.Equal(3, MinDeltaIndex(pfi, m => m.L2));
    Assert.Equal(1, MaxDeltaIndex(pfi, m => m.L2));
    Assert.Equal(3, MinDeltaIndex(pfi, m => m.Rms));
    Assert.Equal(1, MaxDeltaIndex(pfi, m => m.Rms));
    // Score-style metrics (higher is better): smallest delta marks the most important feature.
    Assert.Equal(3, MaxDeltaIndex(pfi, m => m.RSquared));
    Assert.Equal(1, MinDeltaIndex(pfi, m => m.RSquared));

    Done();
}
/// <summary>
/// Regression PFI check on a sparse synthetic dataset: the mostly-sparse slot 1 of
/// X2VBuffer should have minimal impact on the metrics when permuted, while
/// X3Important should have maximal impact.
/// </summary>
public void TestSparseSGD()
{
    // Setup synthetic dataset.
    const int numberOfInstances = 10000;
    var rand = new Random(10); // Fixed seed: generated data must be deterministic.
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances];
    VBuffer<float>[] vbArray = new VBuffer<float>[numberOfInstances];
    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x3Important = rand.Next(10000);
        x3Array[i] = x3Important;
        VBuffer<float> vb;
        // Every 10th row is fully dense (all 4 slots); otherwise slot 1 is left implicit (sparse zero).
        if (i % 10 != 0)
        {
            vb = new VBuffer<float>(4, 3, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 2, 3 });
        }
        else
        {
            vb = new VBuffer<float>(4, 4, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 1, 2, 3 });
        }
        vbArray[i] = vb;
        // Each vector slot contributes with weight 10 to the label.
        float vbSum = 0;
        foreach (var vbValue in vb.DenseValues())
        {
            vbSum += vbValue * 10;
        }
        var noise = rand.Next(50);
        yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2VBuffer", NumberType.Float, vbArray);
    bldr.AddColumn("X3Important", NumberType.Float, x3Array);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();
    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
    var results = ML.Regression.PermutationFeatureImportance(model, data);

    // Pfi Indices:
    //   X1: 0
    //   X2VBuffer-Slot-0: 1
    //   X2VBuffer-Slot-1: 2
    //   X2VBuffer-Slot-2: 3
    //   X2VBuffer-Slot-3: 4
    //   X3Important: 5

    // Permuted X2VBuffer-Slot-1 slot (f2) should have min impact on SGD metrics, X3Important -- max impact.
    // For the following metrics lower is better, so maximum delta means more important feature, and vice versa
    Assert.True(MinDeltaIndex(results, m => m.L1) == 2);
    Assert.True(MaxDeltaIndex(results, m => m.L1) == 5);
    Assert.True(MinDeltaIndex(results, m => m.L2) == 2);
    Assert.True(MaxDeltaIndex(results, m => m.L2) == 5);
    Assert.True(MinDeltaIndex(results, m => m.Rms) == 2);
    Assert.True(MaxDeltaIndex(results, m => m.Rms) == 5);
    // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
    Assert.True(MaxDeltaIndex(results, m => m.RSquared) == 2);
    Assert.True(MinDeltaIndex(results, m => m.RSquared) == 5);

    // Signal test completion; the sibling TestDenseSGD calls this and it was missing here.
    Done();
}