public void TestLDATransform()
{
    var builder = new ArrayDataViewBuilder(Env);
    var data = new[]
    {
        new[] { (Float)1.0, (Float)0.0, (Float)0.0 },
        new[] { (Float)0.0, (Float)1.0, (Float)0.0 },
        new[] { (Float)0.0, (Float)0.0, (Float)1.0 },
    };
    builder.AddColumn("F1V", NumberType.Float, data);
    var srcView = builder.GetDataView();

    LdaTransform.Column col = new LdaTransform.Column();
    col.Source = "F1V";
    col.NumTopic = 3;
    col.NumSummaryTermPerTopic = 3;
    col.AlphaSum = 3;
    col.NumThreads = 1;
    col.ResetRandomGenerator = true;
    LdaTransform.Arguments args = new LdaTransform.Arguments();
    args.Column = new LdaTransform.Column[] { col };

    LdaTransform ldaTransform = new LdaTransform(Env, args, srcView);

    using (var cursor = ldaTransform.GetRowCursor(c => true))
    {
        var resultGetter = cursor.GetGetter<VBuffer<Float>>(1);
        VBuffer<Float> resultFirstRow = new VBuffer<Float>();
        VBuffer<Float> resultSecondRow = new VBuffer<Float>();
        VBuffer<Float> resultThirdRow = new VBuffer<Float>();

        Assert.True(cursor.MoveNext());
        resultGetter(ref resultFirstRow);
        Assert.True(cursor.MoveNext());
        resultGetter(ref resultSecondRow);
        Assert.True(cursor.MoveNext());
        resultGetter(ref resultThirdRow);
        Assert.False(cursor.MoveNext());

        Assert.True(resultFirstRow.Length == 3);
        Assert.True(resultFirstRow.GetItemOrDefault(0) == 0);
        Assert.True(resultFirstRow.GetItemOrDefault(2) == 0);
        Assert.True(resultFirstRow.GetItemOrDefault(1) == 1.0);
        Assert.True(resultSecondRow.Length == 3);
        Assert.True(resultSecondRow.GetItemOrDefault(0) == 0);
        Assert.True(resultSecondRow.GetItemOrDefault(2) == 0);
        Assert.True(resultSecondRow.GetItemOrDefault(1) == 1.0);
        Assert.True(resultThirdRow.Length == 3);
        Assert.True(resultThirdRow.GetItemOrDefault(0) == 0);
        Assert.True(resultThirdRow.GetItemOrDefault(1) == 0);
        Assert.True(resultThirdRow.GetItemOrDefault(2) == 1.0);
    }
}
public void RangeFilterTest()
{
    var builder = new ArrayDataViewBuilder(ML);
    builder.AddColumn("Strings", new[] { "foo", "bar", "baz" });
    builder.AddColumn("Floats", NumberType.R4, new float[] { 1, 2, 3 });
    var data = builder.GetDataView();

    var data1 = ML.Data.FilterByColumn(data, "Floats", upperBound: 2.8);
    var cnt = data1.GetColumn<float>(ML, "Floats").Count();
    Assert.Equal(2L, cnt);

    data = ML.Transforms.Conversion.Hash("Key", "Strings", hashBits: 20).Fit(data).Transform(data);
    var data2 = ML.Data.FilterByKeyColumnFraction(data, "Key", upperBound: 0.5);
    cnt = data2.GetColumn<float>(ML, "Floats").Count();
    Assert.Equal(1L, cnt);
}
private FastForestRegressionModelParameters FitModel(IEnumerable<IRunResult> previousRuns)
{
    Single[] targets = new Single[previousRuns.Count()];
    Single[][] features = new Single[previousRuns.Count()][];

    int i = 0;
    foreach (RunResult r in previousRuns)
    {
        features[i] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_host, _sweepParameters, r.ParameterSet, true);
        targets[i] = (float)r.MetricValue;
        i++;
    }

    ArrayDataViewBuilder dvBuilder = new ArrayDataViewBuilder(_host);
    dvBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single, targets);
    dvBuilder.AddColumn(DefaultColumnNames.Features, NumberDataViewType.Single, features);

    IDataView view = dvBuilder.GetDataView();
    _host.Assert(view.GetRowCount() == targets.Length, "This data view will have as many rows as there have been evaluations");

    using (IChannel ch = _host.Start("Single training"))
    {
        // Set relevant random forest arguments and train the random forest.
        var trainer = new FastForestRegressionTrainer(_host, new FastForestRegressionTrainer.Options
        {
            FeatureFraction = _args.SplitRatio,
            NumberOfTrees = _args.NumOfTrees,
            MinimumExampleCountPerLeaf = _args.NMinForSplit,
            LabelColumnName = DefaultColumnNames.Label,
            FeatureColumnName = DefaultColumnNames.Features,
        });
        var predictor = trainer.Fit(view);

        // Return the random forest predictor.
        return predictor.Model;
    }
}
private FastForestRegressionPredictor FitModel(IEnumerable<IRunResult> previousRuns)
{
    Single[] targets = new Single[previousRuns.Count()];
    Single[][] features = new Single[previousRuns.Count()][];

    int i = 0;
    foreach (RunResult r in previousRuns)
    {
        features[i] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_host, _sweepParameters, r.ParameterSet, true);
        targets[i] = (Float)r.MetricValue;
        i++;
    }

    ArrayDataViewBuilder dvBuilder = new ArrayDataViewBuilder(_host);
    dvBuilder.AddColumn("Label", NumberType.Float, targets);
    dvBuilder.AddColumn("Features", NumberType.Float, features);

    IDataView view = dvBuilder.GetDataView();
    _host.Assert(view.GetRowCount() == targets.Length, "This data view will have as many rows as there have been evaluations");
    RoleMappedData data = TrainUtils.CreateExamples(view, "Label", "Features");

    using (IChannel ch = _host.Start("Single training"))
    {
        // Set relevant random forest arguments.
        FastForestRegression.Arguments args = new FastForestRegression.Arguments();
        args.FeatureFraction = _args.SplitRatio;
        args.NumTrees = _args.NumOfTrees;
        args.MinDocumentsInLeafs = _args.NMinForSplit;

        // Train random forest.
        FastForestRegression trainer = new FastForestRegression(_host, args);
        trainer.Train(data);
        FastForestRegressionPredictor predictor = trainer.CreatePredictor();

        // Return random forest predictor.
        ch.Done();
        return predictor;
    }
}
public void TextColumnDimensionsTest()
{
    var context = new MLContext();
    var dataBuilder = new ArrayDataViewBuilder(context);
    dataBuilder.AddColumn("categorical", new string[] { "0", "1", "0", "1", "0", "1", "2", "2", "0", "1" });
    dataBuilder.AddColumn("text", new string[] { "0", "1", "0", "1", "0", "1", "2", "2", "0", "1" });
    var data = dataBuilder.GetDataView();

    var dimensions = DatasetDimensionsApi.CalcColumnDimensions(context, data, new[]
    {
        new PurposeInference.Column(0, ColumnPurpose.CategoricalFeature),
        new PurposeInference.Column(1, ColumnPurpose.TextFeature),
    });

    Assert.NotNull(dimensions);
    Assert.Equal(2, dimensions.Length);
    Assert.Equal(3, dimensions[0].Cardinality);
    Assert.Null(dimensions[1].Cardinality);
    Assert.Null(dimensions[0].HasMissing);
    Assert.Null(dimensions[1].HasMissing);
}
public void FloatColumnDimensionsTest()
{
    var context = new MLContext();
    var dataBuilder = new ArrayDataViewBuilder(context);
    dataBuilder.AddColumn("NoNan", NumberDataViewType.Single, new float[] { 0, 1, 0, 1, 0 });
    dataBuilder.AddColumn("Nan", NumberDataViewType.Single, new float[] { 0, 1, 0, 1, float.NaN });
    var data = dataBuilder.GetDataView();

    var dimensions = DatasetDimensionsApi.CalcColumnDimensions(context, data, new[]
    {
        new PurposeInference.Column(0, ColumnPurpose.NumericFeature),
        new PurposeInference.Column(1, ColumnPurpose.NumericFeature),
    });

    Assert.NotNull(dimensions);
    Assert.Equal(2, dimensions.Length);
    Assert.Null(dimensions[0].Cardinality);
    Assert.Null(dimensions[1].Cardinality);
    Assert.False(dimensions[0].HasMissing);
    Assert.True(dimensions[1].HasMissing);
}
public IDataView GetSummaryDataView(RoleMappedSchema schema)
{
    var bldr = new ArrayDataViewBuilder(Host);

    var cols = new VBuffer<Float>[_rank + 1];
    var names = new string[_rank + 1];
    for (var i = 0; i < _rank; ++i)
    {
        names[i] = "EigenVector" + i;
        cols[i] = _eigenVectors[i];
    }
    names[_rank] = "MeanVector";
    cols[_rank] = _mean;

    bldr.AddColumn("VectorName", names);
    bldr.AddColumn("VectorData", NumberType.R4, cols);
    return bldr.GetDataView();
}
public void ValidateEmptyValidationDataThrows()
{
    // Training data
    var dataViewBuilder = new ArrayDataViewBuilder(new MLContext());
    dataViewBuilder.AddColumn("Number", NumberDataViewType.Single, 0f);
    dataViewBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single, 0f);
    var trainingData = dataViewBuilder.GetDataView();

    // Validation data
    var schemaBuilder = new DataViewSchema.Builder();
    schemaBuilder.AddColumn("Number", NumberDataViewType.Single);
    schemaBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single);
    var schema = schemaBuilder.ToSchema();
    var validationData = DataViewTestFixture.BuildDummyDataView(schema, createDummyRow: false);

    var ex = Assert.Throws<ArgumentException>(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(trainingData,
        new ColumnInformation(), validationData, TaskKind.Regression));
    Assert.StartsWith("Validation data has 0 rows", ex.Message);
}
public void Setup()
{
    var ctx = new MLContext(1);
    var builder = new ArrayDataViewBuilder(ctx);
    int[] values = new int[Length];
    for (int i = 0; i < values.Length; ++i)
    {
        values[i] = i;
    }
    builder.AddColumn("A", NumberDataViewType.Int32, values);
    var dv = builder.GetDataView();
    var cacheDv = ctx.Data.Cache(dv);

    var col = cacheDv.Schema.GetColumnOrNull("A").Value;
    // First do one pass through.
    using (var cursor = cacheDv.GetRowCursor(col))
    {
        var getter = cursor.GetGetter<int>(col);
        int val = 0;
        int count = 0;
        while (cursor.MoveNext())
        {
            getter(ref val);
            if (val != cursor.Position)
            {
                throw new Exception($"Unexpected value {val} at {cursor.Position}");
            }
            count++;
        }
        if (count != Length)
        {
            throw new Exception($"Expected {Length} values in cache but only saw {count}");
        }
    }
    _cacheDataView = cacheDv;

    // Only needed for seeker, but may as well set it.
    _positions = new long[Length];
    var rand = new Random(0);
    for (int i = 0; i < _positions.Length; ++i)
    {
        _positions[i] = rand.Next(Length);
    }

    _col = _cacheDataView.Schema["A"];
    _seeker = ((IRowSeekable)_cacheDataView).GetSeeker(colIndex => colIndex == _col.Index);
    _seekerGetter = _seeker.GetGetter<int>(_col);
}
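// Illustrative sketch, not part of the original benchmark: shows what the seeker initialized in Setup is for.
// It assumes ML.NET's internal RowSeeker contract, where MoveTo(long) positions the seeker on a row and
// returns whether that row exists, and it reuses the _positions, _seeker, and _seekerGetter fields from Setup.
public void SeekerRandomAccess()
{
    int value = 0;
    foreach (long position in _positions)
    {
        if (!_seeker.MoveTo(position))
        {
            throw new Exception($"Could not seek to row {position}");
        }
        // Column "A" stores the row index, so the value read should equal the position sought.
        _seekerGetter(ref value);
        if (value != position)
        {
            throw new Exception($"Unexpected value {value} at {position}");
        }
    }
}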
/// <summary>
/// Save schema associations of role/column-name in <paramref name="rep"/>.
/// </summary>
internal static void SaveRoleMappings(IHostEnvironment env, IChannel ch, RoleMappedSchema schema, RepositoryWriter rep)
{
    // REVIEW: Should we also save this stuff, for instance, in some portion of the
    // score command or transform?
    Contracts.AssertValue(env);
    env.AssertValue(ch);
    ch.AssertValue(schema);

    ArrayDataViewBuilder builder = new ArrayDataViewBuilder(env);

    List<string> rolesList = new List<string>();
    List<string> columnNamesList = new List<string>();
    // OrderBy is stable, so there is no danger in it "reordering" columns
    // when a role is filled by multiple columns.
    foreach (var role in schema.GetColumnRoleNames().OrderBy(r => r.Key.Value))
    {
        rolesList.Add(role.Key.Value);
        columnNamesList.Add(role.Value);
    }
    builder.AddColumn("Role", rolesList.ToArray());
    builder.AddColumn("Column", columnNamesList.ToArray());

    using (var entry = rep.CreateEntry(DirTrainingInfo, RoleMappingFile))
    {
        // REVIEW: It seems very important that we have the role mappings
        // be easily human interpretable and even manipulable, but relying on the
        // text saver/loader means that special characters like '\n' won't be reinterpretable.
        // On the other hand, no one is such a big lunatic that they will actually
        // ever go ahead and do something so stupid as that.
        var saver = new TextSaver(env, new TextSaver.Arguments() { Dense = true, Silent = true });
        var view = builder.GetDataView();
        saver.SaveData(entry.Stream, view, Utils.GetIdentityPermutation(view.Schema.ColumnCount));
    }
}
public void ValidateExperimentExecuteArgsTrainValidColCountMismatch()
{
    var context = new MLContext();

    var trainDataBuilder = new ArrayDataViewBuilder(context);
    trainDataBuilder.AddColumn("0", NumberDataViewType.Single, new float[] { 1 });
    trainDataBuilder.AddColumn("1", new string[] { "1" });
    var trainData = trainDataBuilder.GetDataView();

    var validDataBuilder = new ArrayDataViewBuilder(context);
    validDataBuilder.AddColumn("0", NumberDataViewType.Single, new float[] { 1 });
    var validData = validDataBuilder.GetDataView();

    var ex = Assert.Throws<ArgumentException>(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(trainData,
        new ColumnInformation() { LabelColumnName = "0" }, validData, TaskKind.Regression));
    Assert.StartsWith("Training data and validation data schemas do not match. Train data has '2' columns,and validation data has '1' columns.", ex.Message);
}
private FastForestRegressionModelParameters FitModel(IEnumerable<IRunResult> previousRuns)
{
    Single[] targets = new Single[previousRuns.Count()];
    Single[][] features = new Single[previousRuns.Count()][];

    int i = 0;
    foreach (RunResult r in previousRuns)
    {
        features[i] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_sweepParameters, r.ParameterSet, true);
        targets[i] = (Float)r.MetricValue;
        i++;
    }

    ArrayDataViewBuilder dvBuilder = new ArrayDataViewBuilder(_context);
    dvBuilder.AddColumn(DefaultColumnNames.Label, NumberType.Float, targets);
    dvBuilder.AddColumn(DefaultColumnNames.Features, NumberType.Float, features);

    IDataView data = dvBuilder.GetDataView();
    AutoMlUtils.Assert(data.GetRowCount() == targets.Length, "This data view will have as many rows as there have been evaluations");

    // Set relevant random forest arguments and train the random forest.
    var trainer = new FastForestRegression(_context, DefaultColumnNames.Label, DefaultColumnNames.Features, advancedSettings: s =>
    {
        s.FeatureFraction = _args.SplitRatio;
        s.NumTrees = _args.NumOfTrees;
        s.MinDocumentsInLeafs = _args.NMinForSplit;
    });
    var predictor = trainer.Train(data).Model;

    // Return the random forest predictor.
    return predictor;
}
public void LdaWorkoutEstimatorCore()
{
    var ml = new MLContext();
    var builder = new ArrayDataViewBuilder(Env);
    var data = new[]
    {
        new[] { (float)1.0, (float)0.0, (float)0.0 },
        new[] { (float)0.0, (float)1.0, (float)0.0 },
        new[] { (float)0.0, (float)0.0, (float)1.0 },
    };
    builder.AddColumn("F1V", NumberDataViewType.Single, data);
    var srcView = builder.GetDataView();

    var est = ml.Transforms.Text.LatentDirichletAllocation("F1V");
    TestEstimatorCore(est, srcView);
}
public void TransposerSaverLoaderTest()
{
    const int rowCount = 1000;
    Random rgen = new Random(1);
    ArrayDataViewBuilder builder = new ArrayDataViewBuilder(Env);

    // A is to check the splitting of a sparse-ish column.
    var dataA = GenerateHelper(rowCount, 0.1, rgen, () => (int)rgen.Next(), 50, 5, 10, 15);
    dataA[rowCount / 2] = new VBuffer<int>(50, 0, null, null); // Coverage for the null vbuffer case.
    builder.AddColumn("A", NumberDataViewType.Int32, dataA);
    // B is to check the splitting of a dense-ish column.
    builder.AddColumn("B", NumberDataViewType.Double, GenerateHelper(rowCount, 0.8, rgen, rgen.NextDouble, 50, 0, 25, 49));
    // C is to just have some column we do nothing with.
    builder.AddColumn("C", NumberDataViewType.Int16, GenerateHelper(rowCount, 0.1, rgen, () => (short)1, 30, 3, 10, 24));
    // D is to check some column we don't have to split because it's sufficiently small.
    builder.AddColumn("D", NumberDataViewType.Double, GenerateHelper(rowCount, 0.1, rgen, rgen.NextDouble, 3, 1));
    // E is to check a sparse scalar column.
    builder.AddColumn("E", NumberDataViewType.UInt32, GenerateHelper(rowCount, 0.1, rgen, () => (uint)rgen.Next(int.MinValue, int.MaxValue)));
    // F is to check a dense-ish scalar column.
    builder.AddColumn("F", NumberDataViewType.Int32, GenerateHelper(rowCount, 0.8, rgen, () => (int)rgen.Next()));

    IDataView view = builder.GetDataView();

    IMultiStreamSource src;
    using (MemoryStream mem = new MemoryStream())
    {
        TransposeSaver saver = new TransposeSaver(Env, new TransposeSaver.Arguments());
        saver.SaveData(mem, view, Utils.GetIdentityPermutation(view.Schema.Count));
        src = new BytesStreamSource(mem.ToArray());
    }
    TransposeLoader loader = new TransposeLoader(Env, new TransposeLoader.Arguments(), src);

    // First check whether this as an IDataView yields the same values.
    CheckSameValues(view, loader);

    TransposeCheckHelper<int>(view, 0, loader);    // A
    TransposeCheckHelper<Double>(view, 1, loader); // B
    TransposeCheckHelper<short>(view, 2, loader);  // C
    TransposeCheckHelper<Double>(view, 3, loader); // D
    TransposeCheckHelper<uint>(view, 4, loader);   // E
    TransposeCheckHelper<int>(view, 5, loader);    // F

    Done();
}
public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input)
{
    var autoMlState = input.State as AutoInference.AutoMlMlState;
    if (autoMlState == null)
    {
        throw env.Except("The state must be a valid AutoMlState.");
    }

    // Create results output dataview
    var rows = autoMlState.GetAllEvaluatedPipelines().Select(p => p.ToResultRow()).ToList();
    IDataView outputView;
    var col1 = new KeyValuePair<string, ColumnType>("Graph", TextType.Instance);
    var col2 = new KeyValuePair<string, ColumnType>("MetricValue", PrimitiveType.FromKind(DataKind.R8));
    var col3 = new KeyValuePair<string, ColumnType>("PipelineId", TextType.Instance);
    var col4 = new KeyValuePair<string, ColumnType>("TrainingMetricValue", PrimitiveType.FromKind(DataKind.R8));
    var col5 = new KeyValuePair<string, ColumnType>("FirstInput", TextType.Instance);
    var col6 = new KeyValuePair<string, ColumnType>("PredictorModel", TextType.Instance);

    if (rows.Count == 0)
    {
        var host = env.Register("ExtractSweepResult");
        outputView = new EmptyDataView(host, SimpleSchemaUtils.Create(host, col1, col2, col3, col4, col5, col6));
    }
    else
    {
        var builder = new ArrayDataViewBuilder(env);
        builder.AddColumn(col1.Key, (PrimitiveType)col1.Value, rows.Select(r => r.GraphJson.AsMemory()).ToArray());
        builder.AddColumn(col2.Key, (PrimitiveType)col2.Value, rows.Select(r => r.MetricValue).ToArray());
        builder.AddColumn(col3.Key, (PrimitiveType)col3.Value, rows.Select(r => r.PipelineId.AsMemory()).ToArray());
        builder.AddColumn(col4.Key, (PrimitiveType)col4.Value, rows.Select(r => r.TrainingMetricValue).ToArray());
        builder.AddColumn(col5.Key, (PrimitiveType)col5.Value, rows.Select(r => r.FirstInput.AsMemory()).ToArray());
        builder.AddColumn(col6.Key, (PrimitiveType)col6.Value, rows.Select(r => r.PredictorModel.AsMemory()).ToArray());
        outputView = builder.GetDataView();
    }

    return new Output { Results = outputView, State = autoMlState };
}
/// <summary>
/// Features: x1, x2VBuffer (a sparse vector), x3.
/// y = 10*x1 + 10*x2VBuffer + 20*x3 + e.
/// Within the x2VBuffer feature, the 2nd slot is sparse most of the time.
/// The 2nd slot of x2VBuffer therefore has the least importance: evaluation metrics do not change much when this slot is permuted.
/// x3 has the biggest importance.
/// </summary>
private IDataView GetSparseDataset(TaskType task = TaskType.Regression, int numberOfInstances = 1000)
{
    // Setup synthetic dataset.
    var rand = new Random(10);
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances];

    VBuffer<float>[] vbArray = new VBuffer<float>[numberOfInstances];

    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x3Important = rand.Next(10000);
        x3Array[i] = x3Important;

        VBuffer<float> vb;
        if (i % 10 != 0)
        {
            vb = new VBuffer<float>(4, 3, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 2, 3 });
        }
        else
        {
            vb = new VBuffer<float>(4, 4, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 1, 2, 3 });
        }
        vbArray[i] = vb;

        float vbSum = 0;
        foreach (var vbValue in vb.DenseValues())
        {
            vbSum += vbValue * 10;
        }

        var noise = rand.Next(50);
        yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
    }

    // If binary classification, modify the labels
    if (task == TaskType.BinaryClassification || task == TaskType.MulticlassClassification)
    {
        GetBinaryClassificationLabels(yArray);
    }
    else if (task == TaskType.Ranking)
    {
        GetRankingLabels(yArray);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2VBuffer", NumberType.Float, vbArray);
    bldr.AddColumn("X3Important", NumberType.Float, x3Array);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    if (task == TaskType.Ranking)
    {
        bldr.AddColumn("GroupId", NumberType.U4, CreateGroupIds(yArray.Length));
    }
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
        .Append(ML.Transforms.Normalize("Features"));

    // Create a keytype for Ranking
    if (task == TaskType.Ranking)
    {
        return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
            .Fit(srcDV).Transform(srcDV);
    }

    return pipeline.Fit(srcDV).Transform(srcDV);
}
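// Illustrative sketch, not part of the original test code: how the sparse VBuffer constructor used above maps
// to a dense vector. new VBuffer<float>(length: 4, count: 3, values, indices: { 0, 2, 3 }) represents a length-4
// vector whose slot 1 is implicitly zero, which is why that slot carries the least importance in the dataset.
// Assumes a using directive for System.Linq.
VBuffer<float> sparse = new VBuffer<float>(4, 3, new float[] { 7f, 8f, 9f }, new int[] { 0, 2, 3 });
float[] dense = sparse.DenseValues().ToArray(); // { 7, 0, 8, 9 }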
public void TestFeatureImportance()
{
    // Setup synthetic dataset.
    const int numberOfInstances = 1000;
    var rand = new Random(10);
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x2Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances],
        x4RandArray = new float[numberOfInstances];

    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x2Important = rand.Next(10000);
        x2Array[i] = x2Important;
        var x3 = rand.Next(5000);
        x3Array[i] = x3;
        var x4Rand = rand.Next(1000);
        x4RandArray[i] = x4Rand;

        var noise = rand.Next(50);
        yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2Important", NumberType.Float, x2Array);
    bldr.AddColumn("X3", NumberType.Float, x3Array);
    bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .AppendCacheCheckpoint(ML)
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = ML.Regression.Trainers.OrdinaryLeastSquares().Fit(data);

    var args = new FeatureContributionCalculationTransform.Arguments()
    {
        Bottom = 10,
        Top = 10
    };
    var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

    // Get prediction scores and contributions
    var enumerator = output.AsEnumerable<ScoreAndContribution>(Env, true).GetEnumerator();
    ScoreAndContribution row = null;
    var expectedValues = new List<float[]>();
    expectedValues.Add(new float[4] { 0.06319684F, 1, 0.1386623F, 4.46209469E-06F });
    expectedValues.Add(new float[4] { 0.03841561F, 1, 0.1633037F, 2.68303256E-06F });
    expectedValues.Add(new float[4] { 0.12006103F, 1, 0.254072F, 1.18671605E-05F });
    expectedValues.Add(new float[4] { 0.20861618F, 0.99999994F, 0.407312155F, 6.963478E-05F });
    expectedValues.Add(new float[4] { 0.024050576F, 0.99999994F, 0.31106182F, 8.456762E-06F });
    int index = 0;
    while (enumerator.MoveNext() && index < expectedValues.Count)
    {
        row = enumerator.Current;
        // Compare to 6 decimal places, since single-precision floats carry only about 7 significant digits.
        Assert.Equal(expectedValues[index][0], row.FeatureContributions[0], 6);
        Assert.Equal(expectedValues[index][1], row.FeatureContributions[1], 6);
        Assert.Equal(expectedValues[index][2], row.FeatureContributions[2], 6);
        Assert.Equal(expectedValues[index++][3], row.FeatureContributions[3], 6);
    }

    Done();
}
public static CombinedOutput CombineMetrics(IHostEnvironment env, CombineMetricsInput input)
{
    var eval = GetEvaluator(env, input.Kind);

    var perInst = EvaluateUtils.ConcatenatePerInstanceDataViews(env, eval, true, true, input.PerInstanceMetrics.Select(
        idv => RoleMappedData.CreateOpt(idv, new[]
        {
            RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, input.LabelColumn),
            RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Weight, input.WeightColumn.Value),
            RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Group, input.GroupColumn.Value)
        })).ToArray(),
        out var variableSizeVectorColumnNames);

    var warnings = input.Warnings != null ? new List<IDataView>(input.Warnings) : new List<IDataView>();
    if (variableSizeVectorColumnNames.Length > 0)
    {
        var dvBldr = new ArrayDataViewBuilder(env);
        var warn = $"Detected columns of variable length: {string.Join(", ", variableSizeVectorColumnNames)}." +
            $" Consider setting collateMetrics- for meaningful per-Folds results.";
        dvBldr.AddColumn(MetricKinds.ColumnNames.WarningText, TextType.Instance, new DvText(warn));
        warnings.Add(dvBldr.GetDataView());
    }

    env.Assert(Utils.Size(perInst) == 1);

    var overall = eval.GetOverallResults(input.OverallMetrics);
    overall = EvaluateUtils.CombineFoldMetricsDataViews(env, overall, input.OverallMetrics.Length);

    IDataView conf = null;
    if (Utils.Size(input.ConfusionMatrix) > 0)
    {
        EvaluateUtils.ReconcileSlotNames<double>(env, input.ConfusionMatrix, MetricKinds.ColumnNames.Count, NumberType.R8);

        for (int i = 0; i < input.ConfusionMatrix.Length; i++)
        {
            var idv = input.ConfusionMatrix[i];
            // Find the old Count column and drop it.
            for (int col = 0; col < idv.Schema.ColumnCount; col++)
            {
                if (idv.Schema.IsHidden(col) &&
                    idv.Schema.GetColumnName(col).Equals(MetricKinds.ColumnNames.Count))
                {
                    input.ConfusionMatrix[i] = new ChooseColumnsByIndexTransform(env,
                        new ChooseColumnsByIndexTransform.Arguments() { Drop = true, Index = new[] { col } }, idv);
                    break;
                }
            }
        }

        conf = EvaluateUtils.ConcatenateOverallMetrics(env, input.ConfusionMatrix);
    }

    var warningsIdv = warnings.Count > 0
        ? AppendRowsDataView.Create(env, warnings[0].Schema, warnings.ToArray())
        : null;
    return new CombinedOutput()
    {
        PerInstanceMetrics = perInst[0],
        OverallMetrics = overall,
        ConfusionMatrix = conf,
        Warnings = warningsIdv
    };
}
/// <summary>
/// Features: x1, x2, x3, xRand; y = 10*x1 + 20*x2 + 5.5*x3 + e. xRand is random and the label y does not depend on it.
/// xRand has the least importance: evaluation metrics do not change much when xRand is permuted.
/// x2 has the biggest importance.
/// </summary>
private IDataView GetDenseDataset(TaskType task = TaskType.Regression)
{
    // Setup synthetic dataset.
    const int numberOfInstances = 1000;
    var rand = new Random(10);
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x2Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances],
        x4RandArray = new float[numberOfInstances];

    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x2Important = rand.Next(10000);
        x2Array[i] = x2Important;
        var x3 = rand.Next(5000);
        x3Array[i] = x3;
        var x4Rand = rand.Next(1000);
        x4RandArray[i] = x4Rand;

        var noise = rand.Next(50);
        yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
    }

    // If binary classification, modify the labels
    if (task == TaskType.BinaryClassification || task == TaskType.MulticlassClassification)
    {
        GetBinaryClassificationLabels(yArray);
    }
    else if (task == TaskType.Ranking)
    {
        GetRankingLabels(yArray);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberDataViewType.Single, x1Array);
    bldr.AddColumn("X2Important", NumberDataViewType.Single, x2Array);
    bldr.AddColumn("X3", NumberDataViewType.Single, x3Array);
    bldr.AddColumn("X4Rand", NumberDataViewType.Single, x4RandArray);
    bldr.AddColumn("Label", NumberDataViewType.Single, yArray);
    if (task == TaskType.Ranking)
    {
        bldr.AddColumn("GroupId", NumberDataViewType.UInt32, CreateGroupIds(yArray.Length));
    }
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .Append(ML.Transforms.Normalize("Features"));

    // Create a keytype for Ranking
    if (task == TaskType.Ranking)
    {
        return pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
            .Fit(srcDV).Transform(srcDV);
    }

    return pipeline.Fit(srcDV).Transform(srcDV);
}
/// <summary>
/// Utility method used to represent a tree ensemble as an <see cref="IDataView"/>.
/// Every row in the <see cref="IDataView"/> corresponds to a node in the tree ensemble. The columns are the fields for each node.
/// The column TreeID specifies which tree the node belongs to. The <see cref="QuantileRegressionTree"/> gets
/// special treatment since it has some additional fields (<see cref="QuantileRegressionTree.GetLeafSamplesAt(int)"/>
/// and <see cref="QuantileRegressionTree.GetLeafSampleWeightsAt(int)"/>).
/// </summary>
public static IDataView RegressionTreeEnsembleAsIDataView(IHost host, double bias, IReadOnlyList<double> treeWeights, IReadOnlyList<RegressionTreeBase> trees)
{
    var builder = new ArrayDataViewBuilder(host);
    var numberOfRows = trees.Select(tree => tree.NumberOfNodes).Sum() + trees.Select(tree => tree.NumberOfLeaves).Sum();

    var treeWeightsList = new List<double>();
    var treeId = new List<int>();
    var isLeaf = new List<ReadOnlyMemory<char>>();
    var leftChild = new List<int>();
    var rightChild = new List<int>();
    var numericalSplitFeatureIndexes = new List<int>();
    var numericalSplitThresholds = new List<float>();
    var categoricalSplitFlags = new List<bool>();
    var leafValues = new List<double>();
    var splitGains = new List<double>();
    var categoricalSplitFeatures = new List<VBuffer<int>>();
    var categoricalCategoricalSplitFeatureRange = new List<VBuffer<int>>();

    for (int i = 0; i < trees.Count; i++)
    {
        // TreeWeights column. The TreeWeight value will be repeated for all the nodes in the same tree in the IDataView.
        treeWeightsList.AddRange(Enumerable.Repeat(treeWeights[i], trees[i].NumberOfNodes + trees[i].NumberOfLeaves));
        // Tree id indicates which tree the node belongs to.
        treeId.AddRange(Enumerable.Repeat(i, trees[i].NumberOfNodes + trees[i].NumberOfLeaves));
        // IsLeaf column indicates if node is a leaf node.
        isLeaf.AddRange(Enumerable.Repeat(new ReadOnlyMemory<char>("Tree node".ToCharArray()), trees[i].NumberOfNodes));
        isLeaf.AddRange(Enumerable.Repeat(new ReadOnlyMemory<char>("Leaf node".ToCharArray()), trees[i].NumberOfLeaves));
        // LeftChild column.
        leftChild.AddRange(trees[i].LeftChild.AsEnumerable());
        leftChild.AddRange(Enumerable.Repeat(0, trees[i].NumberOfLeaves));
        // RightChild column.
        rightChild.AddRange(trees[i].RightChild.AsEnumerable());
        rightChild.AddRange(Enumerable.Repeat(0, trees[i].NumberOfLeaves));
        // NumericalSplitFeatureIndexes column.
        numericalSplitFeatureIndexes.AddRange(trees[i].NumericalSplitFeatureIndexes.AsEnumerable());
        numericalSplitFeatureIndexes.AddRange(Enumerable.Repeat(0, trees[i].NumberOfLeaves));
        // NumericalSplitThresholds column.
        numericalSplitThresholds.AddRange(trees[i].NumericalSplitThresholds.AsEnumerable());
        numericalSplitThresholds.AddRange(Enumerable.Repeat(0f, trees[i].NumberOfLeaves));
        // CategoricalSplitFlags column.
        categoricalSplitFlags.AddRange(trees[i].CategoricalSplitFlags.AsEnumerable());
        categoricalSplitFlags.AddRange(Enumerable.Repeat(false, trees[i].NumberOfLeaves));
        // LeafValues column.
        leafValues.AddRange(Enumerable.Repeat(0d, trees[i].NumberOfNodes));
        leafValues.AddRange(trees[i].LeafValues.AsEnumerable());
        // SplitGains column.
        splitGains.AddRange(trees[i].SplitGains.AsEnumerable());
        splitGains.AddRange(Enumerable.Repeat(0d, trees[i].NumberOfLeaves));

        for (int j = 0; j < trees[i].NumberOfNodes; j++)
        {
            // CategoricalSplitFeatures column.
            var categoricalSplitFeaturesArray = trees[i].GetCategoricalSplitFeaturesAt(j).ToArray();
            categoricalSplitFeatures.Add(new VBuffer<int>(categoricalSplitFeaturesArray.Length, categoricalSplitFeaturesArray));
            var len = trees[i].GetCategoricalSplitFeaturesAt(j).ToArray().Length;
            // CategoricalCategoricalSplitFeatureRange column.
            var categoricalCategoricalSplitFeatureRangeArray = trees[i].GetCategoricalCategoricalSplitFeatureRangeAt(j).ToArray();
            categoricalCategoricalSplitFeatureRange.Add(new VBuffer<int>(categoricalCategoricalSplitFeatureRangeArray.Length, categoricalCategoricalSplitFeatureRangeArray));
            len = trees[i].GetCategoricalCategoricalSplitFeatureRangeAt(j).ToArray().Length;
        }
        categoricalSplitFeatures.AddRange(Enumerable.Repeat(new VBuffer<int>(), trees[i].NumberOfLeaves));
        categoricalCategoricalSplitFeatureRange.AddRange(Enumerable.Repeat(new VBuffer<int>(), trees[i].NumberOfLeaves));
    }

    // Bias column. This will be a repeated value for all rows in the resulting IDataView.
    builder.AddColumn("Bias", NumberDataViewType.Double, Enumerable.Repeat(bias, numberOfRows).ToArray());
    builder.AddColumn("TreeWeights", NumberDataViewType.Double, treeWeightsList.ToArray());
    builder.AddColumn("TreeID", NumberDataViewType.Int32, treeId.ToArray());
    builder.AddColumn("IsLeaf", TextDataViewType.Instance, isLeaf.ToArray());
    builder.AddColumn(nameof(RegressionTreeBase.LeftChild), NumberDataViewType.Int32, leftChild.ToArray());
    builder.AddColumn(nameof(RegressionTreeBase.RightChild), NumberDataViewType.Int32, rightChild.ToArray());
    builder.AddColumn(nameof(RegressionTreeBase.NumericalSplitFeatureIndexes), NumberDataViewType.Int32, numericalSplitFeatureIndexes.ToArray());
    builder.AddColumn(nameof(RegressionTreeBase.NumericalSplitThresholds), NumberDataViewType.Single, numericalSplitThresholds.ToArray());
    builder.AddColumn(nameof(RegressionTreeBase.CategoricalSplitFlags), BooleanDataViewType.Instance, categoricalSplitFlags.ToArray());
    builder.AddColumn(nameof(RegressionTreeBase.LeafValues), NumberDataViewType.Double, leafValues.ToArray());
    builder.AddColumn(nameof(RegressionTreeBase.SplitGains), NumberDataViewType.Double, splitGains.ToArray());
    builder.AddColumn("CategoricalSplitFeatures", NumberDataViewType.Int32, categoricalSplitFeatures.ToArray());
    builder.AddColumn("CategoricalCategoricalSplitFeatureRange", NumberDataViewType.Int32, categoricalCategoricalSplitFeatureRange.ToArray());

    // If the input tree array is a quantile regression tree we need to add two more columns.
    var quantileTrees = trees as IReadOnlyList<QuantileRegressionTree>;
    if (quantileTrees != null)
    {
        // LeafSamples column.
        var leafSamples = new List<VBuffer<double>>();
        // LeafSampleWeights column.
        var leafSampleWeights = new List<VBuffer<double>>();
        for (int i = 0; i < quantileTrees.Count; i++)
        {
            leafSamples.AddRange(Enumerable.Repeat(new VBuffer<double>(), quantileTrees[i].NumberOfNodes));
            leafSampleWeights.AddRange(Enumerable.Repeat(new VBuffer<double>(), quantileTrees[i].NumberOfNodes));
            for (int j = 0; j < quantileTrees[i].NumberOfLeaves; j++)
            {
                var leafSamplesArray = quantileTrees[i].GetLeafSamplesAt(j).ToArray();
                leafSamples.Add(new VBuffer<double>(leafSamplesArray.Length, leafSamplesArray));
                var len = quantileTrees[i].GetLeafSamplesAt(j).ToArray().Length;
                var leafSampleWeightsArray = quantileTrees[i].GetLeafSampleWeightsAt(j).ToArray();
                leafSampleWeights.Add(new VBuffer<double>(leafSampleWeightsArray.Length, leafSampleWeightsArray));
                len = quantileTrees[i].GetLeafSampleWeightsAt(j).ToArray().Length;
            }
        }
        builder.AddColumn("LeafSamples", NumberDataViewType.Double, leafSamples.ToArray());
        builder.AddColumn("LeafSampleWeights", NumberDataViewType.Double, leafSampleWeights.ToArray());
    }

    var data = builder.GetDataView();
    return data;
}
/// <summary>
/// Features: x1, x2, x3, xRand; y = 10*x1 + 20*x2 + 5.5*x3 + e. xRand is random and the label y does not depend on it.
/// The test verifies that feature contribution scores are output along with a score for the predicted data.
/// </summary>
private void TestFeatureContribution(
    ITrainerEstimator<ISingleFeaturePredictionTransformer<IPredictor>, IPredictor> trainer,
    List<float[]> expectedValues,
    int precision = 6)
{
    // Setup synthetic dataset.
    const int numInstances = 1000;
    const int numFeatures = 4;

    var rand = new Random(10);
    float[] yArray = new float[numInstances];
    float[][] xArray = new float[numFeatures][];
    int[] xRangeArray = new[] { 1000, 10000, 5000, 1000 };
    float[] xWeightArray = new[]
    {
        10,
        20, // Most important feature with high weight. Should have the highest contribution.
        5.5f,
        0, // Least important feature. Should have the least contribution.
    };

    for (var instanceIndex = 0; instanceIndex < numInstances; instanceIndex++)
    {
        for (int featureIndex = 0; featureIndex < numFeatures; featureIndex++)
        {
            if (xArray[featureIndex] == null)
            {
                xArray[featureIndex] = new float[numInstances];
            }
            xArray[featureIndex][instanceIndex] = rand.Next(xRangeArray[featureIndex]);
            yArray[instanceIndex] += xArray[featureIndex][instanceIndex] * xWeightArray[featureIndex];
        }

        var noise = rand.Next(50);
        yArray[instanceIndex] += noise;
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, xArray[0]);
    bldr.AddColumn("X2Important", NumberType.Float, xArray[1]);
    bldr.AddColumn("X3", NumberType.Float, xArray[2]);
    bldr.AddColumn("X4Rand", NumberType.Float, xArray[3]);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .AppendCacheCheckpoint(ML)
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = trainer.Fit(data);

    var args = new FeatureContributionCalculationTransform.Arguments()
    {
        Bottom = 10,
        Top = 10
    };
    var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);
    var transformedOutput = output.AsEnumerable<ScoreAndContribution>(Env, true);

    int rowIndex = 0;
    foreach (var row in transformedOutput.Take(expectedValues.Count))
    {
        var expectedValue = expectedValues[rowIndex++];
        for (int i = 0; i < numFeatures; i++)
        {
            Assert.Equal(expectedValue[i], row.FeatureContributions[i], precision);
        }
    }

    Done();
}
/// <summary>
/// Helper function that builds the IDataView given a list of keys and non-vector values.
/// </summary>
internal static IDataView CreateDataView<TKey, TValue>(IHostEnvironment env,
    IEnumerable<TKey> keys,
    IEnumerable<TValue> values,
    string keyColumnName,
    string valueColumnName,
    bool treatValuesAsKeyTypes)
{
    var keyType = GetPrimitiveType(typeof(TKey), out bool isKeyVectorType);
    var valueType = GetPrimitiveType(typeof(TValue), out bool isValueVectorType);

    var dataViewBuilder = new ArrayDataViewBuilder(env);
    dataViewBuilder.AddColumn(keyColumnName, keyType, keys.ToArray());
    if (treatValuesAsKeyTypes)
    {
        // When treating the values as KeyTypes, generate the unique
        // set of values. This is used for generating the metadata of
        // the column.
        HashSet<TValue> valueSet = new HashSet<TValue>();
        HashSet<TKey> keySet = new HashSet<TKey>();
        for (int i = 0; i < values.Count(); ++i)
        {
            var v = values.ElementAt(i);
            if (valueSet.Contains(v))
            {
                continue;
            }
            valueSet.Add(v);

            var k = keys.ElementAt(i);
            keySet.Add(k);
        }
        var metaKeys = keySet.ToArray();

        // Key values are treated in one of two ways:
        // If the values are of type uint or ulong, they are used directly as the key types and no new keys are created.
        // Otherwise, key values are generated as uints starting from 1, since 0 is reserved for the missing key.
        if (valueType.RawKind == DataKind.U4)
        {
            uint[] indices = values.Select((x) => Convert.ToUInt32(x)).ToArray();
            dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), 0, metaKeys.Length, indices);
        }
        else if (valueType.RawKind == DataKind.U8)
        {
            ulong[] indices = values.Select((x) => Convert.ToUInt64(x)).ToArray();
            dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), 0, metaKeys.Length, indices);
        }
        else
        {
            // When generating the indices, values are deduplicated: two values that are equal are assigned
            // the same index. The dictionary maintains that uniqueness, while indices holds the full list of
            // assigned indices (one per value, so it has the same length as values).
            Dictionary<TValue, uint> keyTypeValueMapping = new Dictionary<TValue, uint>();
            uint[] indices = new uint[values.Count()];
            // Start the index at 1.
            uint index = 1;
            for (int i = 0; i < values.Count(); ++i)
            {
                TValue value = values.ElementAt(i);
                if (!keyTypeValueMapping.ContainsKey(value))
                {
                    keyTypeValueMapping.Add(value, index);
                    index++;
                }

                var keyValue = keyTypeValueMapping[value];
                indices[i] = keyValue;
            }

            dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), 0, metaKeys.Count(), indices);
        }
    }
    else
    {
        dataViewBuilder.AddColumn(valueColumnName, valueType, values.ToArray());
    }

    return dataViewBuilder.GetDataView();
}
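// Illustrative usage sketch, not from the original source: builds a two-column lookup data view where the
// value column is emitted as a key type. It assumes an available IHostEnvironment (here called env) and that
// the helper above is visible to the caller (it is internal). Because the values are strings rather than
// uint/ulong, they are deduplicated and assigned key indices starting at 1, with 0 reserved for the missing key.
string[] keys = new[] { "Seattle", "Redmond", "Portland" };
string[] values = new[] { "WA", "WA", "OR" };
IDataView lookup = CreateDataView(env, keys, values,
    keyColumnName: "City", valueColumnName: "State", treatValuesAsKeyTypes: true);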
public void Train(List<FeatureSubsetModel<IPredictorProducing<TOutput>>> models, RoleMappedData data, IHostEnvironment env)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(Stacking.LoadName);
    host.CheckValue(models, nameof(models));
    host.CheckValue(data, nameof(data));

    using (var ch = host.Start("Training stacked model"))
    {
        ch.Check(Meta == null, "Train called multiple times");
        ch.Check(BasePredictorType != null);

        var maps = new ValueMapper<VBuffer<Single>, TOutput>[models.Count];
        for (int i = 0; i < maps.Length; i++)
        {
            Contracts.Assert(models[i].Predictor is IValueMapper);
            var m = (IValueMapper)models[i].Predictor;
            maps[i] = m.GetMapper<VBuffer<Single>, TOutput>();
        }

        // REVIEW: Should implement this better....
        var labels = new Single[100];
        var features = new VBuffer<Single>[100];
        int count = 0;
        // REVIEW: Should this include bad values or filter them?
        using (var cursor = new FloatLabelCursor(data, CursOpt.AllFeatures | CursOpt.AllLabels))
        {
            TOutput[] predictions = new TOutput[maps.Length];
            var vBuffers = new VBuffer<Single>[maps.Length];
            while (cursor.MoveNext())
            {
                Parallel.For(0, maps.Length, i =>
                {
                    var model = models[i];
                    if (model.SelectedFeatures != null)
                    {
                        EnsembleUtils.SelectFeatures(ref cursor.Features, model.SelectedFeatures, model.Cardinality, ref vBuffers[i]);
                        maps[i](ref vBuffers[i], ref predictions[i]);
                    }
                    else
                    {
                        maps[i](ref cursor.Features, ref predictions[i]);
                    }
                });

                Utils.EnsureSize(ref labels, count + 1);
                Utils.EnsureSize(ref features, count + 1);
                labels[count] = cursor.Label;
                FillFeatureBuffer(predictions, ref features[count]);
                count++;
            }
        }

        ch.Info("The number of instances used for stacking trainer is {0}", count);

        var bldr = new ArrayDataViewBuilder(host);
        Array.Resize(ref labels, count);
        Array.Resize(ref features, count);
        bldr.AddColumn(DefaultColumnNames.Label, NumberType.Float, labels);
        bldr.AddColumn(DefaultColumnNames.Features, NumberType.Float, features);

        var view = bldr.GetDataView();
        var rmd = new RoleMappedData(view, DefaultColumnNames.Label, DefaultColumnNames.Features);

        var trainer = BasePredictorType.CreateComponent(host);
        if (trainer.Info.NeedNormalization)
        {
            ch.Warning("The trainer specified for stacking wants normalization, but we do not currently allow this.");
        }
        Meta = trainer.Train(rmd);
        CheckMeta();
    }
}
public void TestDenseSGD()
{
    // Setup synthetic dataset.
    const int numberOfInstances = 1000;
    var rand = new Random(10);
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x2Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances],
        x4RandArray = new float[numberOfInstances];

    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x2Important = rand.Next(10000);
        x2Array[i] = x2Important;
        var x3 = rand.Next(5000);
        x3Array[i] = x3;
        var x4Rand = rand.Next(1000);
        x4RandArray[i] = x4Rand;

        var noise = rand.Next(50);
        yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2Important", NumberType.Float, x2Array);
    bldr.AddColumn("X3", NumberType.Float, x3Array);
    bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
    var pfi = ML.Regression.PermutationFeatureImportance(model, data);

    // Pfi Indices:
    // X1: 0
    // X2Important: 1
    // X3: 2
    // X4Rand: 3

    // For the following metrics lower is better, so maximum delta means more important feature, and vice versa
    Assert.True(MinDeltaIndex(pfi, m => m.L1) == 3);
    Assert.True(MaxDeltaIndex(pfi, m => m.L1) == 1);
    Assert.True(MinDeltaIndex(pfi, m => m.L2) == 3);
    Assert.True(MaxDeltaIndex(pfi, m => m.L2) == 1);
    Assert.True(MinDeltaIndex(pfi, m => m.Rms) == 3);
    Assert.True(MaxDeltaIndex(pfi, m => m.Rms) == 1);

    // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
    Assert.True(MaxDeltaIndex(pfi, m => m.RSquared) == 3);
    Assert.True(MinDeltaIndex(pfi, m => m.RSquared) == 1);

    Done();
}
public void TestFeatureImportance()
{
    // Setup synthetic dataset.
    const int numberOfInstances = 1000;
    var rand = new Random(10);
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x2Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances],
        x4RandArray = new float[numberOfInstances];

    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x2Important = rand.Next(10000);
        x2Array[i] = x2Important;
        var x3 = rand.Next(5000);
        x3Array[i] = x3;
        var x4Rand = rand.Next(1000);
        x4RandArray[i] = x4Rand;

        var noise = rand.Next(50);
        yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2Important", NumberType.Float, x2Array);
    bldr.AddColumn("X3", NumberType.Float, x3Array);
    bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);

    var args = new FeatureContributionCalculationTransform.Arguments()
    {
        Bottom = 10,
        Top = 10
    };
    var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

    // Get prediction scores and contributions
    var enumerator = output.AsEnumerable<ScoreAndContribution>(Env, true).GetEnumerator();
    ScoreAndContribution row = null;
    var expectedValues = new List<float[]>();
    expectedValues.Add(new float[4] { 0.15640761F, 1, 0.155862764F, 0.07276783F });
    expectedValues.Add(new float[4] { 0.09507586F, 1, 0.1835608F, 0.0437548943F });
    expectedValues.Add(new float[4] { 0.297142357F, 1, 0.2855884F, 0.193529665F });
    expectedValues.Add(new float[4] { 0.45465675F, 0.8805887F, 0.4031663F, 1 });
    expectedValues.Add(new float[4] { 0.0595234372F, 0.99999994F, 0.349647522F, 0.137912869F });
    int index = 0;
    while (enumerator.MoveNext() && index < expectedValues.Count)
    {
        row = enumerator.Current;
        Assert.True(row.FeatureContributions[0] == expectedValues[index][0]);
        Assert.True(row.FeatureContributions[1] == expectedValues[index][1]);
        Assert.True(row.FeatureContributions[2] == expectedValues[index][2]);
        Assert.True(row.FeatureContributions[3] == expectedValues[index++][3]);
    }

    Done();
}
public void TestSparseSGD()
{
    // Setup synthetic dataset.
    const int numberOfInstances = 10000;
    var rand = new Random(10);
    float[] yArray = new float[numberOfInstances],
        x1Array = new float[numberOfInstances],
        x3Array = new float[numberOfInstances];

    VBuffer<float>[] vbArray = new VBuffer<float>[numberOfInstances];

    for (var i = 0; i < numberOfInstances; i++)
    {
        var x1 = rand.Next(1000);
        x1Array[i] = x1;
        var x3Important = rand.Next(10000);
        x3Array[i] = x3Important;

        VBuffer<float> vb;
        if (i % 10 != 0)
        {
            vb = new VBuffer<float>(4, 3, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 2, 3 });
        }
        else
        {
            vb = new VBuffer<float>(4, 4, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 1, 2, 3 });
        }
        vbArray[i] = vb;

        float vbSum = 0;
        foreach (var vbValue in vb.DenseValues())
        {
            vbSum += vbValue * 10;
        }

        var noise = rand.Next(50);
        yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
    }

    // Create data view.
    var bldr = new ArrayDataViewBuilder(Env);
    bldr.AddColumn("X1", NumberType.Float, x1Array);
    bldr.AddColumn("X2VBuffer", NumberType.Float, vbArray);
    bldr.AddColumn("X3Important", NumberType.Float, x3Array);
    bldr.AddColumn("Label", NumberType.Float, yArray);
    var srcDV = bldr.GetDataView();

    var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
        .Append(ML.Transforms.Normalize("Features"));
    var data = pipeline.Fit(srcDV).Transform(srcDV);
    var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
    var results = ML.Regression.PermutationFeatureImportance(model, data);

    // Pfi Indices:
    // X1: 0
    // X2VBuffer-Slot-0: 1
    // X2VBuffer-Slot-1: 2
    // X2VBuffer-Slot-2: 3
    // X2VBuffer-Slot-3: 4
    // X3Important: 5

    // Permuting X2VBuffer-Slot-1 (feature index 2) should have the minimum impact on the SGD metrics; X3Important should have the maximum impact.
    // For the following metrics lower is better, so maximum delta means more important feature, and vice versa
    Assert.True(MinDeltaIndex(results, m => m.L1) == 2);
    Assert.True(MaxDeltaIndex(results, m => m.L1) == 5);
    Assert.True(MinDeltaIndex(results, m => m.L2) == 2);
    Assert.True(MaxDeltaIndex(results, m => m.L2) == 5);
    Assert.True(MinDeltaIndex(results, m => m.Rms) == 2);
    Assert.True(MaxDeltaIndex(results, m => m.Rms) == 5);

    // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
    Assert.True(MaxDeltaIndex(results, m => m.RSquared) == 2);
    Assert.True(MinDeltaIndex(results, m => m.RSquared) == 5);
}
public void TransposerTest()
{
    const int rowCount = 1000;
    Random rgen = new Random(0);
    ArrayDataViewBuilder builder = new ArrayDataViewBuilder(Env);

    // A is to check the splitting of a sparse-ish column.
    var dataA = GenerateHelper(rowCount, 0.1, rgen, () => (int)rgen.Next(), 50, 5, 10, 15);
    dataA[rowCount / 2] = new VBuffer<int>(50, 0, null, null); // Coverage for the null vbuffer case.
    builder.AddColumn("A", NumberDataViewType.Int32, dataA);
    // B is to check the splitting of a dense-ish column.
    builder.AddColumn("B", NumberDataViewType.Double, GenerateHelper(rowCount, 0.8, rgen, rgen.NextDouble, 50, 0, 25, 49));
    // C is to just have some column we do nothing with.
    builder.AddColumn("C", NumberDataViewType.Int16, GenerateHelper(rowCount, 0.1, rgen, () => (short)1, 30, 3, 10, 24));
    // D is to check some column we don't have to split because it's sufficiently small.
    builder.AddColumn("D", NumberDataViewType.Double, GenerateHelper(rowCount, 0.1, rgen, rgen.NextDouble, 3, 1));
    // E is to check a sparse scalar column.
    builder.AddColumn("E", NumberDataViewType.UInt32, GenerateHelper(rowCount, 0.1, rgen, () => (uint)rgen.Next(int.MinValue, int.MaxValue)));
    // F is to check a dense-ish scalar column.
    builder.AddColumn("F", NumberDataViewType.Int32, GenerateHelper(rowCount, 0.8, rgen, () => rgen.Next()));

    IDataView view = builder.GetDataView();

    // Do not force save. This will have a mix of passthrough and saved columns. Note the duplicate
    // specification of "D", to test that specifying a column twice has no ill effects.
    string[] names = { "B", "A", "E", "D", "F", "D" };
    using (Transposer trans = Transposer.Create(Env, view, false, names))
    {
        // Before checking the contents, check the names.
        for (int i = 0; i < names.Length; ++i)
        {
            int index;
            Assert.True(trans.Schema.TryGetColumnIndex(names[i], out index), $"Transpose schema couldn't find column '{names[i]}'");
            int trueIndex;
            bool result = view.Schema.TryGetColumnIndex(names[i], out trueIndex);
            Contracts.Assert(result);
            Assert.True(trueIndex == index, $"Transpose schema had column '{names[i]}' at unexpected index");
        }
        // Check the contents
        Assert.Null(((ITransposeDataView)trans).GetSlotType(2)); // C check to see that it's not transposable.
        TransposeCheckHelper<int>(view, 0, trans);    // A check.
        TransposeCheckHelper<Double>(view, 1, trans); // B check.
        TransposeCheckHelper<Double>(view, 3, trans); // D check.
        TransposeCheckHelper<uint>(view, 4, trans);   // E check.
        TransposeCheckHelper<int>(view, 5, trans);    // F check.
    }

    // Force save. Recheck columns that would have previously been passthrough columns.
    // The primary benefit of this check is that we exercise the binary saving/loading functionality
    // of scalars, which otherwise must necessarily be passthrough. Also exercise the select-by-index
    // functionality while we're at it.
    using (Transposer trans = Transposer.Create(Env, view, true, 3, 5, 4))
    {
        // Check to see that A, B, and C were not transposed somehow.
        var itdv = (ITransposeDataView)trans;
        Assert.Null(itdv.GetSlotType(0));
        Assert.Null(itdv.GetSlotType(1));
        Assert.Null(itdv.GetSlotType(2));
        TransposeCheckHelper<Double>(view, 3, trans); // D check.
        TransposeCheckHelper<uint>(view, 4, trans);   // E check.
        TransposeCheckHelper<int>(view, 5, trans);    // F check.
    }
}