Example No. 1
        public void LdaWorkoutEstimatorCore()
        {
            var ml = new MLContext();

            var builder = new ArrayDataViewBuilder(Env);
            var data    = new[]
            {
                new[] { (float)1.0, (float)0.0, (float)0.0 },
                new[] { (float)0.0, (float)1.0, (float)0.0 },
                new[] { (float)0.0, (float)0.0, (float)1.0 },
            };

            builder.AddColumn("F1V", NumberType.Float, data);
            var srcView = builder.GetDataView();

            var est = ml.Transforms.Text.LatentDirichletAllocation("F1V");

            TestEstimatorCore(est, srcView);
        }
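
For reference, the same estimator can be exercised without the internal ArrayDataViewBuilder and TestEstimatorCore test helpers. Below is a minimal sketch using ML.NET's public in-memory API; the LdaRow class and the numberOfTopics value are illustrative assumptions, not part of the original test.

        // Minimal sketch using the public API (names below are assumptions).
        public class LdaRow
        {
            [VectorType(3)]
            public float[] F1V { get; set; }
        }

        public void LdaViaPublicApi()
        {
            var ml = new MLContext(seed: 0);
            var rows = new[]
            {
                new LdaRow { F1V = new[] { 1f, 0f, 0f } },
                new LdaRow { F1V = new[] { 0f, 1f, 0f } },
                new LdaRow { F1V = new[] { 0f, 0f, 1f } },
            };
            var view = ml.Data.LoadFromEnumerable(rows);

            // Fit and apply the LDA estimator directly instead of going through TestEstimatorCore.
            var model = ml.Transforms.Text.LatentDirichletAllocation("F1V", numberOfTopics: 3).Fit(view);
            var transformed = model.Transform(view);
        }
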
Example No. 2
        public void RangeFilterTest()
        {
            var builder = new ArrayDataViewBuilder(ML);

            builder.AddColumn("Strings", new[] { "foo", "bar", "baz" });
            builder.AddColumn("Floats", NumberType.R4, new float[] { 1, 2, 3 });
            var data = builder.GetDataView();

            var data1 = ML.Data.FilterByColumn(data, "Floats", upperBound: 2.8);
            var cnt   = data1.GetColumn <float>(ML, "Floats").Count();

            Assert.Equal(2L, cnt);

            data = ML.Transforms.Conversion.Hash("Key", "Strings", hashBits: 20).Fit(data).Transform(data);
            var data2 = ML.Data.FilterByKeyColumnFraction(data, "Key", upperBound: 0.5);

            cnt = data2.GetColumn <float>(ML, "Floats").Count();
            Assert.Equal(1L, cnt);
        }
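
In later ML.NET releases these data operations live under slightly different names. A hedged sketch of the equivalent calls, continuing from the example's `ML` and `data` variables and assuming the public FilterRowsByColumn, FilterRowsByKeyColumnFraction, and parameterless GetColumn<T> APIs:

        // Equivalent filtering via the public DataOperationsCatalog (assumed API names).
        var filtered = ML.Data.FilterRowsByColumn(data, "Floats", upperBound: 2.8);
        var count    = filtered.GetColumn<float>("Floats").Count();

        var hashed      = ML.Transforms.Conversion.Hash("Key", "Strings", numberOfBits: 20).Fit(data).Transform(data);
        var keyFiltered = ML.Data.FilterRowsByKeyColumnFraction(hashed, "Key", upperBound: 0.5);
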
Example No. 3
        private FastForestRegressionModelParameters FitModel(IEnumerable <IRunResult> previousRuns)
        {
            Single[]   targets  = new Single[previousRuns.Count()];
            Single[][] features = new Single[previousRuns.Count()][];

            int i = 0;

            foreach (RunResult r in previousRuns)
            {
                features[i] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_host, _sweepParameters, r.ParameterSet, true);
                targets[i]  = (float)r.MetricValue;
                i++;
            }

            ArrayDataViewBuilder dvBuilder = new ArrayDataViewBuilder(_host);

            dvBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single, targets);
            dvBuilder.AddColumn(DefaultColumnNames.Features, NumberDataViewType.Single, features);

            IDataView view = dvBuilder.GetDataView();

            _host.Assert(view.GetRowCount() == targets.Length, "This data view will have as many rows as there have been evaluations");

            using (IChannel ch = _host.Start("Single training"))
            {
                // Set relevant random forest arguments.
                // Train random forest.
                var trainer = new FastForestRegressionTrainer(_host,
                                                              new FastForestRegressionTrainer.Options
                {
                    FeatureFraction            = _args.SplitRatio,
                    NumberOfTrees              = _args.NumOfTrees,
                    MinimumExampleCountPerLeaf = _args.NMinForSplit,
                    LabelColumnName            = DefaultColumnNames.Label,
                    FeatureColumnName          = DefaultColumnNames.Features,
                });
                var predictor = trainer.Fit(view);

                // Return random forest predictor.
                return(predictor.Model);
            }
        }
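
The pattern above (pack the run history into parallel Label/Features arrays, then build an IDataView) can also be expressed with the public in-memory API. A sketch, where the RunRow class and its declared vector size are assumptions for illustration:

        // Sketch: the same Label/Features view via LoadFromEnumerable (RunRow is hypothetical).
        public class RunRow
        {
            public float Label { get; set; }
            [VectorType(10)] // must match the sweeper's parameter count
            public float[] Features { get; set; }
        }

        IDataView BuildView(MLContext ml, float[] targets, float[][] features)
        {
            var rows = targets.Select((t, i) => new RunRow { Label = t, Features = features[i] });
            return ml.Data.LoadFromEnumerable(rows);
        }
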
Example No. 4
        private FastForestRegressionPredictor FitModel(IEnumerable <IRunResult> previousRuns)
        {
            Single[]   targets  = new Single[previousRuns.Count()];
            Single[][] features = new Single[previousRuns.Count()][];

            int i = 0;

            foreach (RunResult r in previousRuns)
            {
                features[i] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_host, _sweepParameters, r.ParameterSet, true);
                targets[i]  = (Float)r.MetricValue;
                i++;
            }

            ArrayDataViewBuilder dvBuilder = new ArrayDataViewBuilder(_host);

            dvBuilder.AddColumn("Label", NumberType.Float, targets);
            dvBuilder.AddColumn("Features", NumberType.Float, features);

            IDataView view = dvBuilder.GetDataView();

            _host.Assert(view.GetRowCount() == targets.Length, "This data view will have as many rows as there have been evaluations");
            RoleMappedData data = TrainUtils.CreateExamples(view, "Label", "Features");

            using (IChannel ch = _host.Start("Single training"))
            {
                // Set relevant random forest arguments.
                FastForestRegression.Arguments args = new FastForestRegression.Arguments();
                args.FeatureFraction     = _args.SplitRatio;
                args.NumTrees            = _args.NumOfTrees;
                args.MinDocumentsInLeafs = _args.NMinForSplit;

                // Train random forest.
                FastForestRegression trainer = new FastForestRegression(_host, args);
                trainer.Train(data);
                FastForestRegressionPredictor predictor = trainer.CreatePredictor();

                // Return random forest predictor.
                ch.Done();
                return(predictor);
            }
        }
Example No. 5
        public void TextColumnDimensionsTest()
        {
            var context     = new MLContext();
            var dataBuilder = new ArrayDataViewBuilder(context);

            dataBuilder.AddColumn("categorical", new string[] { "0", "1", "0", "1", "0", "1", "2", "2", "0", "1" });
            dataBuilder.AddColumn("text", new string[] { "0", "1", "0", "1", "0", "1", "2", "2", "0", "1" });
            var data       = dataBuilder.GetDataView();
            var dimensions = DatasetDimensionsApi.CalcColumnDimensions(context, data, new[] {
                new PurposeInference.Column(0, ColumnPurpose.CategoricalFeature),
                new PurposeInference.Column(1, ColumnPurpose.TextFeature),
            });

            Assert.NotNull(dimensions);
            Assert.Equal(2, dimensions.Length);
            Assert.Equal(3, dimensions[0].Cardinality);
            Assert.Null(dimensions[1].Cardinality);
            Assert.Null(dimensions[0].HasMissing);
            Assert.Null(dimensions[1].HasMissing);
        }
Example No. 6
        public void FloatColumnDimensionsTest()
        {
            var context     = new MLContext();
            var dataBuilder = new ArrayDataViewBuilder(context);

            dataBuilder.AddColumn("NoNan", NumberDataViewType.Single, new float[] { 0, 1, 0, 1, 0 });
            dataBuilder.AddColumn("Nan", NumberDataViewType.Single, new float[] { 0, 1, 0, 1, float.NaN });
            var data       = dataBuilder.GetDataView();
            var dimensions = DatasetDimensionsApi.CalcColumnDimensions(context, data, new[] {
                new PurposeInference.Column(0, ColumnPurpose.NumericFeature),
                new PurposeInference.Column(1, ColumnPurpose.NumericFeature),
            });

            Assert.NotNull(dimensions);
            Assert.Equal(2, dimensions.Length);
            Assert.Null(dimensions[0].Cardinality);
            Assert.Null(dimensions[1].Cardinality);
            Assert.False(dimensions[0].HasMissing);
            Assert.True(dimensions[1].HasMissing);
        }
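
The HasMissing flags computed by DatasetDimensionsApi can be sanity-checked directly against the column data. A small sketch continuing from the example's `data` variable, assuming the public GetColumn<T> extension method:

        // Verify the missing-value flags by scanning the columns.
        var noNanHasMissing = data.GetColumn<float>("NoNan").Any(float.IsNaN); // expected: false
        var nanHasMissing   = data.GetColumn<float>("Nan").Any(float.IsNaN);   // expected: true
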
Example No. 7
        public IDataView GetSummaryDataView(RoleMappedSchema schema)
        {
            var bldr = new ArrayDataViewBuilder(Host);

            var cols  = new VBuffer <Float> [_rank + 1];
            var names = new string[_rank + 1];

            for (var i = 0; i < _rank; ++i)
            {
                names[i] = "EigenVector" + i;
                cols[i]  = _eigenVectors[i];
            }
            names[_rank] = "MeanVector";
            cols[_rank]  = _mean;

            bldr.AddColumn("VectorName", names);
            bldr.AddColumn("VectorData", NumberType.R4, cols);

            return(bldr.GetDataView());
        }
Example No. 8
        public void Setup()
        {
            var ctx = new MLContext();
            var builder = new ArrayDataViewBuilder(ctx);
            int[] values = new int[Length];
            for (int i = 0; i < values.Length; ++i)
                values[i] = i;
            builder.AddColumn("A", NumberType.I4, values);
            var dv = builder.GetDataView();
            var cacheDv = ctx.Data.Cache(dv);

            var col = cacheDv.Schema.GetColumnOrNull("A").Value;
            // First do one pass through.
            using (var cursor = cacheDv.GetRowCursor(col))
            {
                var getter = cursor.GetGetter<int>(col.Index);
                int val = 0;
                int count = 0;
                while (cursor.MoveNext())
                {
                    getter(ref val);
                    if (val != cursor.Position)
                        throw new Exception($"Unexpected value {val} at {cursor.Position}");
                    count++;
                }
                if (count != Length)
                    throw new Exception($"Expected {Length} values in cache but only saw {count}");
            }
            _cacheDataView = cacheDv;

            // Only needed for seeker, but may as well set it.
            _positions = new long[Length];
            var rand = new Random(0);
            for (int i = 0; i < _positions.Length; ++i)
                _positions[i] = rand.Next(Length);

            _col = _cacheDataView.Schema["A"];
            _seeker = ((IRowSeekable)_cacheDataView).GetSeeker(colIndex => colIndex == _col.Index);
            _seekerGetter = _seeker.GetGetter<int>(_col.Index);
        }
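
In current ML.NET versions the index-based GetRowCursor and GetGetter overloads used above are replaced by DataViewSchema.Column-based ones. A hedged sketch of the same verification pass, reusing the example's `cacheDv`:

            // The same read loop with the column-based cursor API (assumed overloads).
            var column = cacheDv.Schema["A"];
            using (var cursor = cacheDv.GetRowCursor(new[] { column }))
            {
                var getter = cursor.GetGetter<int>(column);
                int val = 0;
                while (cursor.MoveNext())
                {
                    getter(ref val);
                    if (val != cursor.Position)
                        throw new Exception($"Unexpected value {val} at {cursor.Position}");
                }
            }
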
Example No. 9
        public void ValidateEmptyValidationDataThrows()
        {
            // Training data
            var dataViewBuilder = new ArrayDataViewBuilder(new MLContext());

            dataViewBuilder.AddColumn("Number", NumberDataViewType.Single, 0f);
            dataViewBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single, 0f);
            var trainingData = dataViewBuilder.GetDataView();

            // Validation data
            var schemaBuilder = new DataViewSchema.Builder();

            schemaBuilder.AddColumn("Number", NumberDataViewType.Single);
            schemaBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single);
            var schema         = schemaBuilder.ToSchema();
            var validationData = DataViewTestFixture.BuildDummyDataView(schema, createDummyRow: false);

            var ex = Assert.Throws <ArgumentException>(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(trainingData, new ColumnInformation(),
                                                                                                                   validationData, TaskKind.Regression));

            Assert.StartsWith("Validation data has 0 rows", ex.Message);
        }
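
DataViewTestFixture.BuildDummyDataView is an internal test helper. With the public API, a zero-row IDataView with a fixed schema can be obtained by loading an empty enumerable of a row class (a sketch; the Row class is assumed):

        // Sketch: an empty IDataView carrying the same two columns.
        public class Row
        {
            public float Number { get; set; }
            public float Label { get; set; }
        }

        var emptyValidationData = new MLContext().Data.LoadFromEnumerable(Enumerable.Empty<Row>());
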
Example No. 10
        public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input)
        {
            var autoMlState = input.State as AutoInference.AutoMlMlState;

            if (autoMlState == null)
            {
                throw env.Except("The state must be a valid AutoMlState.");
            }
            // Create results output dataview
            var       rows = autoMlState.GetAllEvaluatedPipelines().Select(p => p.ToResultRow()).ToList();
            IDataView outputView;
            var       col1 = new KeyValuePair <string, ColumnType>("Graph", TextType.Instance);
            var       col2 = new KeyValuePair <string, ColumnType>("MetricValue", PrimitiveType.FromKind(DataKind.R8));
            var       col3 = new KeyValuePair <string, ColumnType>("PipelineId", TextType.Instance);
            var       col4 = new KeyValuePair <string, ColumnType>("TrainingMetricValue", PrimitiveType.FromKind(DataKind.R8));
            var       col5 = new KeyValuePair <string, ColumnType>("FirstInput", TextType.Instance);
            var       col6 = new KeyValuePair <string, ColumnType>("PredictorModel", TextType.Instance);

            if (rows.Count == 0)
            {
                var host = env.Register("ExtractSweepResult");
                outputView = new EmptyDataView(host, SimpleSchemaUtils.Create(host, col1, col2, col3, col4, col5, col6));
            }
            else
            {
                var builder = new ArrayDataViewBuilder(env);
                builder.AddColumn(col1.Key, (PrimitiveType)col1.Value, rows.Select(r => r.GraphJson.AsMemory()).ToArray());
                builder.AddColumn(col2.Key, (PrimitiveType)col2.Value, rows.Select(r => r.MetricValue).ToArray());
                builder.AddColumn(col3.Key, (PrimitiveType)col3.Value, rows.Select(r => r.PipelineId.AsMemory()).ToArray());
                builder.AddColumn(col4.Key, (PrimitiveType)col4.Value, rows.Select(r => r.TrainingMetricValue).ToArray());
                builder.AddColumn(col5.Key, (PrimitiveType)col5.Value, rows.Select(r => r.FirstInput.AsMemory()).ToArray());
                builder.AddColumn(col6.Key, (PrimitiveType)col6.Value, rows.Select(r => r.PredictorModel.AsMemory()).ToArray());
                outputView = builder.GetDataView();
            }
            return(new Output {
                Results = outputView, State = autoMlState
            });
        }
Example No. 11
        /// <summary>
        /// Save schema associations of role/column-name in <paramref name="rep"/>.
        /// </summary>
        internal static void SaveRoleMappings(IHostEnvironment env, IChannel ch, RoleMappedSchema schema, RepositoryWriter rep)
        {
            // REVIEW: Should we also save this stuff, for instance, in some portion of the
            // score command or transform?
            Contracts.AssertValue(env);
            env.AssertValue(ch);
            ch.AssertValue(schema);

            ArrayDataViewBuilder builder = new ArrayDataViewBuilder(env);

            List <string> rolesList       = new List <string>();
            List <string> columnNamesList = new List <string>();

            // OrderBy is stable, so there is no danger in it "reordering" columns
            // when a role is filled by multiple columns.
            foreach (var role in schema.GetColumnRoleNames().OrderBy(r => r.Key.Value))
            {
                rolesList.Add(role.Key.Value);
                columnNamesList.Add(role.Value);
            }
            builder.AddColumn("Role", rolesList.ToArray());
            builder.AddColumn("Column", columnNamesList.ToArray());

            using (var entry = rep.CreateEntry(DirTrainingInfo, RoleMappingFile))
            {
                // REVIEW: It seems very important that we have the role mappings
                // be easily human interpretable and even manipulable, but relying on the
                // text saver/loader means that special characters like '\n' won't be reinterpretable.
                // On the other hand, no one is such a big lunatic that they will actually
                // ever go ahead and do something so stupid as that.
                var saver = new TextSaver(env, new TextSaver.Arguments()
                {
                    Dense = true, Silent = true
                });
                var view = builder.GetDataView();
                saver.SaveData(entry.Stream, view, Utils.GetIdentityPermutation(view.Schema.ColumnCount));
            }
        }
Example No. 12
        public void ValidateExperimentExecuteArgsTrainValidColCountMismatch()
        {
            var context = new MLContext();

            var trainDataBuilder = new ArrayDataViewBuilder(context);

            trainDataBuilder.AddColumn("0", NumberDataViewType.Single, new float[] { 1 });
            trainDataBuilder.AddColumn("1", new string[] { "1" });
            var trainData = trainDataBuilder.GetDataView();

            var validDataBuilder = new ArrayDataViewBuilder(context);

            validDataBuilder.AddColumn("0", NumberDataViewType.Single, new float[] { 1 });
            var validData = validDataBuilder.GetDataView();

            var ex = Assert.Throws <ArgumentException>(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(trainData,
                                                                                                                   new ColumnInformation()
            {
                LabelColumnName = "0"
            }, validData, TaskKind.Regression));

            Assert.StartsWith("Training data and validation data schemas do not match. Train data has '2' columns,and validation data has '1' columns.", ex.Message);
        }
Example No. 13
        private FastForestRegressionModelParameters FitModel(IEnumerable <IRunResult> previousRuns)
        {
            Single[]   targets  = new Single[previousRuns.Count()];
            Single[][] features = new Single[previousRuns.Count()][];

            int i = 0;

            foreach (RunResult r in previousRuns)
            {
                features[i] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_sweepParameters, r.ParameterSet, true);
                targets[i]  = (Float)r.MetricValue;
                i++;
            }

            ArrayDataViewBuilder dvBuilder = new ArrayDataViewBuilder(_context);

            dvBuilder.AddColumn(DefaultColumnNames.Label, NumberType.Float, targets);
            dvBuilder.AddColumn(DefaultColumnNames.Features, NumberType.Float, features);

            IDataView data = dvBuilder.GetDataView();

            AutoMlUtils.Assert(data.GetRowCount() == targets.Length, "This data view will have as many rows as there have been evaluations");

            // Set relevant random forest arguments.
            // Train random forest.
            var trainer = new FastForestRegression(_context, DefaultColumnNames.Label, DefaultColumnNames.Features, advancedSettings: s =>
            {
                s.FeatureFraction     = _args.SplitRatio;
                s.NumTrees            = _args.NumOfTrees;
                s.MinDocumentsInLeafs = _args.NMinForSplit;
            });
            var predictor = trainer.Train(data).Model;

            // Return random forest predictor.
            return(predictor);
        }
Example No. 14
        public void TestSparseSGD()
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 10000;
            var       rand = new Random(10);

            float[] yArray = new float[numberOfInstances],
            x1Array = new float[numberOfInstances],
            x3Array = new float[numberOfInstances];

            VBuffer <float>[] vbArray = new VBuffer <float> [numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x3Important = rand.Next(10000);
                x3Array[i] = x3Important;

                VBuffer <float> vb;

                if (i % 10 != 0)
                {
                    vb = new VBuffer <float>(4, 3, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 2, 3 });
                }
                else
                {
                    vb = new VBuffer <float>(4, 4, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 1, 2, 3 });
                }

                vbArray[i] = vb;

                float vbSum = 0;
                foreach (var vbValue in vb.DenseValues())
                {
                    vbSum += vbValue * 10;
                }

                var noise = rand.Next(50);
                yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2VBuffer", NumberType.Float, vbArray);
            bldr.AddColumn("X3Important", NumberType.Float, x3Array);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
                           .Append(ML.Transforms.Normalize("Features"));
            var data    = pipeline.Fit(srcDV).Transform(srcDV);
            var model   = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
            var results = ML.Regression.PermutationFeatureImportance(model, data);

            // Pfi Indices:
            // X1: 0
            // X2VBuffer-Slot-0: 1
            // X2VBuffer-Slot-1: 2
            // X2VBuffer-Slot-2: 3
            // X2VBuffer-Slot-3: 4
            // X3Important: 5

            // Permuted X2VBuffer-Slot-1 (f2) should have the minimum impact on the SGD metrics; X3Important the maximum impact.
            // For the following metrics lower is better, so maximum delta means more important feature, and vice versa
            Assert.True(MinDeltaIndex(results, m => m.L1) == 2);
            Assert.True(MaxDeltaIndex(results, m => m.L1) == 5);

            Assert.True(MinDeltaIndex(results, m => m.L2) == 2);
            Assert.True(MaxDeltaIndex(results, m => m.L2) == 5);

            Assert.True(MinDeltaIndex(results, m => m.Rms) == 2);
            Assert.True(MaxDeltaIndex(results, m => m.Rms) == 5);

            // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
            Assert.True(MaxDeltaIndex(results, m => m.RSquared) == 2);
            Assert.True(MinDeltaIndex(results, m => m.RSquared) == 5);
        }
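
The MinDeltaIndex and MaxDeltaIndex helpers used in the assertions are not shown in this excerpt. A plausible sketch, under the assumption that the metric selector returns a double and that the PFI results implement IReadOnlyList<T>:

        // Hypothetical helpers: index of the feature whose permutation changed the metric least / most.
        private static int MinDeltaIndex<T>(IReadOnlyList<T> pfi, Func<T, double> metric)
            => Enumerable.Range(0, pfi.Count).OrderBy(i => metric(pfi[i])).First();

        private static int MaxDeltaIndex<T>(IReadOnlyList<T> pfi, Func<T, double> metric)
            => Enumerable.Range(0, pfi.Count).OrderByDescending(i => metric(pfi[i])).First();
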
Example No. 15
        public void TestFeatureImportance()
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray = new float[numberOfInstances],
            x1Array     = new float[numberOfInstances],
            x2Array     = new float[numberOfInstances],
            x3Array     = new float[numberOfInstances],
            x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);
                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2Important", NumberType.Float, x2Array);
            bldr.AddColumn("X3", NumberType.Float, x3Array);
            bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
            var args  = new FeatureContributionCalculationTransform.Arguments()
            {
                Bottom = 10,
                Top    = 10
            };
            var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

            // Get prediction scores and contributions
            var enumerator           = output.AsEnumerable <ScoreAndContribution>(Env, true).GetEnumerator();
            ScoreAndContribution row = null;
            var expectedValues       = new List <float[]>();

            expectedValues.Add(new float[4] {
                0.15640761F, 1, 0.155862764F, 0.07276783F
            });
            expectedValues.Add(new float[4] {
                0.09507586F, 1, 0.1835608F, 0.0437548943F
            });
            expectedValues.Add(new float[4] {
                0.297142357F, 1, 0.2855884F, 0.193529665F
            });
            expectedValues.Add(new float[4] {
                0.45465675F, 0.8805887F, 0.4031663F, 1
            });
            expectedValues.Add(new float[4] {
                0.0595234372F, 0.99999994F, 0.349647522F, 0.137912869F
            });
            int index = 0;

            while (enumerator.MoveNext() && index < expectedValues.Count)
            {
                row = enumerator.Current;
                Assert.True(row.FeatureContributions[0] == expectedValues[index][0]);
                Assert.True(row.FeatureContributions[1] == expectedValues[index][1]);
                Assert.True(row.FeatureContributions[2] == expectedValues[index][2]);
                Assert.True(row.FeatureContributions[3] == expectedValues[index++][3]);
            }

            Done();
        }
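
The ScoreAndContribution row class consumed through AsEnumerable is also defined elsewhere. Given how it is used, a plausible shape (property names and vector size are assumptions driven by the output columns):

        // Hypothetical row class for reading the transform's output.
        private sealed class ScoreAndContribution
        {
            public float Score { get; set; }
            [VectorType(4)]
            public float[] FeatureContributions { get; set; }
        }
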
Example No. 16
        /// <summary>
        /// Features: x1, x2vBuff (sparse vector), x3.
        /// y = 10*x1 + 10*x2vBuff + 20*x3 + e.
        /// Within the x2vBuff feature, the 2nd slot is sparse most of the time.
        /// The 2nd slot of x2vBuff has the least importance: evaluation metrics do not change much when this slot is permuted.
        /// x3 has the biggest importance.
        /// </summary>
        private IDataView GetSparseDataset(TaskType task = TaskType.Regression, int numberOfInstances = 1000)
        {
            // Setup synthetic dataset.
            var rand = new Random(10);

            float[] yArray = new float[numberOfInstances],
            x1Array = new float[numberOfInstances],
            x3Array = new float[numberOfInstances];

            VBuffer <float>[] vbArray = new VBuffer <float> [numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x3Important = rand.Next(10000);
                x3Array[i] = x3Important;

                VBuffer <float> vb;

                if (i % 10 != 0)
                {
                    vb = new VBuffer <float>(4, 3, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 2, 3 });
                }
                else
                {
                    vb = new VBuffer <float>(4, 4, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 1, 2, 3 });
                }

                vbArray[i] = vb;

                float vbSum = 0;
                foreach (var vbValue in vb.DenseValues())
                {
                    vbSum += vbValue * 10;
                }

                var noise = rand.Next(50);
                yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
            }

            // If binary classification, modify the labels
            if (task == TaskType.BinaryClassification ||
                task == TaskType.MulticlassClassification)
            {
                GetBinaryClassificationLabels(yArray);
            }
            else if (task == TaskType.Ranking)
            {
                GetRankingLabels(yArray);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2VBuffer", NumberType.Float, vbArray);
            bldr.AddColumn("X3Important", NumberType.Float, x3Array);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            if (task == TaskType.Ranking)
            {
                bldr.AddColumn("GroupId", NumberType.U4, CreateGroupIds(yArray.Length));
            }
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
                           .Append(ML.Transforms.Normalize("Features"));

            // Create a keytype for Ranking
            if (task == TaskType.Ranking)
            {
                return(pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
                       .Fit(srcDV).Transform(srcDV));
            }

            return(pipeline.Fit(srcDV).Transform(srcDV));
        }
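
GetBinaryClassificationLabels, GetRankingLabels, and CreateGroupIds are helpers defined elsewhere in the test class. Below are minimal sketches consistent with how they are called; the median threshold and the group size are assumptions:

        // Hypothetical implementations of the label/group helpers used above.
        private static void GetBinaryClassificationLabels(float[] y)
        {
            var median = y.OrderBy(v => v).ElementAt(y.Length / 2);
            for (int i = 0; i < y.Length; i++)
                y[i] = y[i] > median ? 1 : 0; // 1 = positive class
        }

        private static uint[] CreateGroupIds(int count, int groupSize = 10)
        {
            var ids = new uint[count];
            for (int i = 0; i < count; i++)
                ids[i] = (uint)(i / groupSize); // contiguous rows share a query/group id
            return ids;
        }
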
Example No. 17
        public static CombinedOutput CombineMetrics(IHostEnvironment env, CombineMetricsInput input)
        {
            var eval = GetEvaluator(env, input.Kind);

            var perInst = EvaluateUtils.ConcatenatePerInstanceDataViews(env, eval, true, true, input.PerInstanceMetrics.Select(
                                                                            idv => RoleMappedData.CreateOpt(idv, new[]
            {
                RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, input.LabelColumn),
                RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Weight, input.WeightColumn.Value),
                RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Group, input.GroupColumn.Value)
            })).ToArray(),
                                                                        out var variableSizeVectorColumnNames);

            var warnings = input.Warnings != null ? new List <IDataView>(input.Warnings) : new List <IDataView>();

            if (variableSizeVectorColumnNames.Length > 0)
            {
                var dvBldr = new ArrayDataViewBuilder(env);
                var warn   = $"Detected columns of variable length: {string.Join(", ", variableSizeVectorColumnNames)}." +
                             $" Consider setting collateMetrics- for meaningful per-Folds results.";
                dvBldr.AddColumn(MetricKinds.ColumnNames.WarningText, TextType.Instance, new DvText(warn));
                warnings.Add(dvBldr.GetDataView());
            }

            env.Assert(Utils.Size(perInst) == 1);

            var overall = eval.GetOverallResults(input.OverallMetrics);

            overall = EvaluateUtils.CombineFoldMetricsDataViews(env, overall, input.OverallMetrics.Length);

            IDataView conf = null;

            if (Utils.Size(input.ConfusionMatrix) > 0)
            {
                EvaluateUtils.ReconcileSlotNames <double>(env, input.ConfusionMatrix, MetricKinds.ColumnNames.Count, NumberType.R8);

                for (int i = 0; i < input.ConfusionMatrix.Length; i++)
                {
                    var idv = input.ConfusionMatrix[i];
                    // Find the old Count column and drop it.
                    for (int col = 0; col < idv.Schema.ColumnCount; col++)
                    {
                        if (idv.Schema.IsHidden(col) &&
                            idv.Schema.GetColumnName(col).Equals(MetricKinds.ColumnNames.Count))
                        {
                            input.ConfusionMatrix[i] = new ChooseColumnsByIndexTransform(env,
                                                                                         new ChooseColumnsByIndexTransform.Arguments()
                            {
                                Drop = true, Index = new[] { col }
                            }, idv);
                            break;
                        }
                    }
                }

                conf = EvaluateUtils.ConcatenateOverallMetrics(env, input.ConfusionMatrix);
            }

            var warningsIdv = warnings.Count > 0 ? AppendRowsDataView.Create(env, warnings[0].Schema, warnings.ToArray()) : null;

            return(new CombinedOutput()
            {
                PerInstanceMetrics = perInst[0],
                OverallMetrics = overall,
                ConfusionMatrix = conf,
                Warnings = warningsIdv
            });
        }
Example No. 18
        /// <summary>
        /// Features: x1, x2, x3, xRand; y = 10*x1 + 20*x2 + 5.5*x3 + e. xRand is random, and the label y does not depend on xRand.
        /// xRand has the least importance: evaluation metrics do not change much when xRand is permuted.
        /// x2 has the biggest importance.
        /// </summary>
        private IDataView GetDenseDataset(TaskType task = TaskType.Regression)
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray = new float[numberOfInstances],
            x1Array     = new float[numberOfInstances],
            x2Array     = new float[numberOfInstances],
            x3Array     = new float[numberOfInstances],
            x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);

                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // If binary classification, modify the labels
            if (task == TaskType.BinaryClassification ||
                task == TaskType.MulticlassClassification)
            {
                GetBinaryClassificationLabels(yArray);
            }
            else if (task == TaskType.Ranking)
            {
                GetRankingLabels(yArray);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberDataViewType.Single, x1Array);
            bldr.AddColumn("X2Important", NumberDataViewType.Single, x2Array);
            bldr.AddColumn("X3", NumberDataViewType.Single, x3Array);
            bldr.AddColumn("X4Rand", NumberDataViewType.Single, x4RandArray);
            bldr.AddColumn("Label", NumberDataViewType.Single, yArray);
            if (task == TaskType.Ranking)
            {
                bldr.AddColumn("GroupId", NumberDataViewType.UInt32, CreateGroupIds(yArray.Length));
            }
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .Append(ML.Transforms.Normalize("Features"));

            // Create a keytype for Ranking
            if (task == TaskType.Ranking)
            {
                return(pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
                       .Fit(srcDV).Transform(srcDV));
            }

            return(pipeline.Fit(srcDV).Transform(srcDV));
        }
Example No. 19
        public void TestFeatureImportance()
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray = new float[numberOfInstances],
            x1Array     = new float[numberOfInstances],
            x2Array     = new float[numberOfInstances],
            x3Array     = new float[numberOfInstances],
            x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);
                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2Important", NumberType.Float, x2Array);
            bldr.AddColumn("X3", NumberType.Float, x3Array);
            bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .AppendCacheCheckpoint(ML)
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = ML.Regression.Trainers.OrdinaryLeastSquares().Fit(data);
            var args  = new FeatureContributionCalculationTransform.Arguments()
            {
                Bottom = 10,
                Top    = 10
            };
            var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

            // Get prediction scores and contributions
            var enumerator           = output.AsEnumerable <ScoreAndContribution>(Env, true).GetEnumerator();
            ScoreAndContribution row = null;
            var expectedValues       = new List <float[]>();

            expectedValues.Add(new float[4] {
                0.06319684F, 1, 0.1386623F, 4.46209469E-06F
            });
            expectedValues.Add(new float[4] {
                0.03841561F, 1, 0.1633037F, 2.68303256E-06F
            });
            expectedValues.Add(new float[4] {
                0.12006103F, 1, 0.254072F, 1.18671605E-05F
            });
            expectedValues.Add(new float[4] {
                0.20861618F, 0.99999994F, 0.407312155F, 6.963478E-05F
            });
            expectedValues.Add(new float[4] {
                0.024050576F, 0.99999994F, 0.31106182F, 8.456762E-06F
            });
            int index = 0;

            while (enumerator.MoveNext() && index < expectedValues.Count)
            {
                row = enumerator.Current;
                // Compare to 6 decimal places, since single-precision floats carry only about 7 significant digits.
                Assert.Equal(expectedValues[index][0], row.FeatureContributions[0], 6);
                Assert.Equal(expectedValues[index][1], row.FeatureContributions[1], 6);
                Assert.Equal(expectedValues[index][2], row.FeatureContributions[2], 6);
                Assert.Equal(expectedValues[index++][3], row.FeatureContributions[3], 6);
            }

            Done();
        }
Example No. 20
        public void TransposerTest()
        {
            const int            rowCount = 1000;
            Random               rgen     = new Random(0);
            ArrayDataViewBuilder builder  = new ArrayDataViewBuilder(Env);

            // A is to check the splitting of a sparse-ish column.
            var dataA = GenerateHelper(rowCount, 0.1, rgen, () => (int)rgen.Next(), 50, 5, 10, 15);

            dataA[rowCount / 2] = new VBuffer <int>(50, 0, null, null); // Coverage for the null vbuffer case.
            builder.AddColumn("A", NumberDataViewType.Int32, dataA);
            // B is to check the splitting of a dense-ish column.
            builder.AddColumn("B", NumberDataViewType.Double, GenerateHelper(rowCount, 0.8, rgen, rgen.NextDouble, 50, 0, 25, 49));
            // C is to just have some column we do nothing with.
            builder.AddColumn("C", NumberDataViewType.Int16, GenerateHelper(rowCount, 0.1, rgen, () => (short)1, 30, 3, 10, 24));
            // D is to check some column we don't have to split because it's sufficiently small.
            builder.AddColumn("D", NumberDataViewType.Double, GenerateHelper(rowCount, 0.1, rgen, rgen.NextDouble, 3, 1));
            // E is to check a sparse scalar column.
            builder.AddColumn("E", NumberDataViewType.UInt32, GenerateHelper(rowCount, 0.1, rgen, () => (uint)rgen.Next(int.MinValue, int.MaxValue)));
            // F is to check a dense-ish scalar column.
            builder.AddColumn("F", NumberDataViewType.Int32, GenerateHelper(rowCount, 0.8, rgen, () => rgen.Next()));

            IDataView view = builder.GetDataView();

            // Do not force save. This will have a mix of passthrough and saved columns. Note the
            // duplicate specification of "D", to test that specifying a column twice has no ill effects.
            string[] names = { "B", "A", "E", "D", "F", "D" };
            using (Transposer trans = Transposer.Create(Env, view, false, names))
            {
                // Before checking the contents, check the names.
                for (int i = 0; i < names.Length; ++i)
                {
                    int index;
                    Assert.True(trans.Schema.TryGetColumnIndex(names[i], out index), $"Transpose schema couldn't find column '{names[i]}'");
                    int  trueIndex;
                    bool result = view.Schema.TryGetColumnIndex(names[i], out trueIndex);
                    Contracts.Assert(result);
                    Assert.True(trueIndex == index, $"Transpose schema had column '{names[i]}' at unexpected index");
                }
                // Check the contents
                Assert.Null(((ITransposeDataView)trans).GetSlotType(2)); // C check to see that it's not transposable.
                TransposeCheckHelper <int>(view, 0, trans);              // A check.
                TransposeCheckHelper <Double>(view, 1, trans);           // B check.
                TransposeCheckHelper <Double>(view, 3, trans);           // D check.
                TransposeCheckHelper <uint>(view, 4, trans);             // E check.
                TransposeCheckHelper <int>(view, 5, trans);              // F check.
            }

            // Force save. Recheck columns that would have previously been passthrough columns.
            // The primary benefit of this check is that we exercise the binary saving / loading
            // functionality of scalars, which otherwise would always be passthrough.
            // Also exercise the select-by-index functionality while we're at it.
            using (Transposer trans = Transposer.Create(Env, view, true, 3, 5, 4))
            {
                // Check to see that A, B, and C were not transposed somehow.
                var itdv = (ITransposeDataView)trans;
                Assert.Null(itdv.GetSlotType(0));
                Assert.Null(itdv.GetSlotType(1));
                Assert.Null(itdv.GetSlotType(2));
                TransposeCheckHelper <Double>(view, 3, trans); // D check.
                TransposeCheckHelper <uint>(view, 4, trans);   // E check.
                TransposeCheckHelper <int>(view, 5, trans);    // F check.
            }
        }
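
GenerateHelper is another helper not reproduced here. From the call sites, its vector overload presumably emits rowCount VBuffers of the given length in which each slot is populated with the stated density, with some slots forced dense. A sketch under those assumptions (the signature is inferred, not the original):

        // Hypothetical vector overload of GenerateHelper (signature inferred from the call sites).
        private static VBuffer<T>[] GenerateHelper<T>(int rowCount, double density, Random rgen,
            Func<T> generator, int len, params int[] forceDenseSlots)
        {
            var result = new VBuffer<T>[rowCount];
            for (int r = 0; r < rowCount; ++r)
            {
                var indices = new List<int>();
                var values  = new List<T>();
                for (int s = 0; s < len; ++s)
                {
                    if (forceDenseSlots.Contains(s) || rgen.NextDouble() < density)
                    {
                        indices.Add(s);
                        values.Add(generator());
                    }
                }
                result[r] = new VBuffer<T>(len, values.Count, values.ToArray(), indices.ToArray());
            }
            return result;
        }
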
Example No. 21
        /// <summary>
        /// Features: x1, x2, x3, xRand; y = 10*x1 + 20*x2 + 5.5*x3 + e. xRand is random, and the label y does not depend on xRand.
        /// Test verifies that feature contribution scores are outputted along with a score for predicted data.
        /// </summary>
        private void TestFeatureContribution(
            ITrainerEstimator <ISingleFeaturePredictionTransformer <IPredictor>, IPredictor> trainer,
            List <float[]> expectedValues,
            int precision = 6)
        {
            // Setup synthetic dataset.
            const int numInstances = 1000;
            const int numFeatures  = 4;

            var rand = new Random(10);

            float[]   yArray       = new float[numInstances];
            float[][] xArray       = new float[numFeatures][];
            int[]     xRangeArray  = new[] { 1000, 10000, 5000, 1000 };
            float[]   xWeightArray = new[] {
                10,
                20, // Most important feature with high weight. Should have the highest contribution.
                5.5f,
                0,  // Least important feature. Should have the least contribution.
            };

            for (var instanceIndex = 0; instanceIndex < numInstances; instanceIndex++)
            {
                for (int featureIndex = 0; featureIndex < numFeatures; featureIndex++)
                {
                    if (xArray[featureIndex] == null)
                    {
                        xArray[featureIndex] = new float[numInstances];
                    }
                    xArray[featureIndex][instanceIndex] = rand.Next(xRangeArray[featureIndex]);
                    yArray[instanceIndex] += xArray[featureIndex][instanceIndex] * xWeightArray[featureIndex];
                }

                var noise = rand.Next(50);
                yArray[instanceIndex] += noise;
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, xArray[0]);
            bldr.AddColumn("X2Important", NumberType.Float, xArray[1]);
            bldr.AddColumn("X3", NumberType.Float, xArray[2]);
            bldr.AddColumn("X4Rand", NumberType.Float, xArray[3]);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .AppendCacheCheckpoint(ML)
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = trainer.Fit(data);
            var args  = new FeatureContributionCalculationTransform.Arguments()
            {
                Bottom = 10,
                Top    = 10
            };
            var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

            var transformedOutput = output.AsEnumerable <ScoreAndContribution>(Env, true);
            int rowIndex          = 0;

            foreach (var row in transformedOutput.Take(expectedValues.Count))
            {
                var expectedValue = expectedValues[rowIndex++];
                for (int i = 0; i < numFeatures; i++)
                {
                    Assert.Equal(expectedValue[i], row.FeatureContributions[i], precision);
                }
            }

            Done();
        }
Example No. 22
        /// <summary>
        /// Helper function that builds the IDataView given a list of keys and non-vector values
        /// </summary>
        internal static IDataView CreateDataView <TKey, TValue>(IHostEnvironment env,
                                                                IEnumerable <TKey> keys,
                                                                IEnumerable <TValue> values,
                                                                string keyColumnName,
                                                                string valueColumnName,
                                                                bool treatValuesAsKeyTypes)
        {
            var keyType   = GetPrimitiveType(typeof(TKey), out bool isKeyVectorType);
            var valueType = GetPrimitiveType(typeof(TValue), out bool isValueVectorType);

            var dataViewBuilder = new ArrayDataViewBuilder(env);

            dataViewBuilder.AddColumn(keyColumnName, keyType, keys.ToArray());
            if (treatValuesAsKeyTypes)
            {
                // When treating the values as KeyTypes, generate the unique
                // set of values. This is used for generating the metadata of
                // the column.
                HashSet <TValue> valueSet = new HashSet <TValue>();
                HashSet <TKey>   keySet   = new HashSet <TKey>();
                for (int i = 0; i < values.Count(); ++i)
                {
                    var v = values.ElementAt(i);
                    if (valueSet.Contains(v))
                    {
                        continue;
                    }
                    valueSet.Add(v);

                    var k = keys.ElementAt(i);
                    keySet.Add(k);
                }
                var metaKeys = keySet.ToArray();

                // Key values are treated in one of two ways:
                // If the values are of type uint or ulong, they are used directly as the key types and no new keys are created.
                // If the values are not uint or ulong, key values are generated as uints starting from 1, since 0 is reserved for the missing key.
                if (valueType.RawKind == DataKind.U4)
                {
                    uint[] indices = values.Select((x) => Convert.ToUInt32(x)).ToArray();
                    dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), 0, metaKeys.Length, indices);
                }
                else if (valueType.RawKind == DataKind.U8)
                {
                    ulong[] indices = values.Select((x) => Convert.ToUInt64(x)).ToArray();
                    dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), 0, metaKeys.Length, indices);
                }
                else
                {
                    // When generating the indices, treat each value as being unique, i.e. two values that are the same will
                    // be assigned the same index. The dictionary is used to maintain uniqueness, indices will contain
                    // the full list of indices (equal to the same length of values).
                    Dictionary <TValue, uint> keyTypeValueMapping = new Dictionary <TValue, uint>();
                    uint[] indices = new uint[values.Count()];
                    // Start the index at 1
                    uint index = 1;
                    for (int i = 0; i < values.Count(); ++i)
                    {
                        TValue value = values.ElementAt(i);
                        if (!keyTypeValueMapping.ContainsKey(value))
                        {
                            keyTypeValueMapping.Add(value, index);
                            index++;
                        }

                        var keyValue = keyTypeValueMapping[value];
                        indices[i] = keyValue;
                    }

                    dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), 0, metaKeys.Count(), indices);
                }
            }
            else
            {
                dataViewBuilder.AddColumn(valueColumnName, valueType, values.ToArray());
            }

            return(dataViewBuilder.GetDataView());
        }
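
GetKeyValueGetter and GetPrimitiveType are private helpers not included in this excerpt. A sketch of what GetKeyValueGetter plausibly does, building the KeyValues annotation getter from the unique key set (the implementation here is an assumption):

        // Hypothetical KeyValues-annotation getter: maps key indices back to display values.
        private static ValueGetter<VBuffer<ReadOnlyMemory<char>>> GetKeyValueGetter<TKey>(TKey[] metaKeys)
        {
            return (ref VBuffer<ReadOnlyMemory<char>> dst) =>
            {
                var editor = VBufferEditor.Create(ref dst, metaKeys.Length);
                for (int i = 0; i < metaKeys.Length; i++)
                    editor.Values[i] = metaKeys[i].ToString().AsMemory();
                dst = editor.Commit();
            };
        }
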
Example No. 23
        public void Train(List <FeatureSubsetModel <IPredictorProducing <TOutput> > > models, RoleMappedData data, IHostEnvironment env)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register(Stacking.LoadName);

            host.CheckValue(models, nameof(models));
            host.CheckValue(data, nameof(data));

            using (var ch = host.Start("Training stacked model"))
            {
                ch.Check(Meta == null, "Train called multiple times");
                ch.Check(BasePredictorType != null);

                var maps = new ValueMapper <VBuffer <Single>, TOutput> [models.Count];
                for (int i = 0; i < maps.Length; i++)
                {
                    Contracts.Assert(models[i].Predictor is IValueMapper);
                    var m = (IValueMapper)models[i].Predictor;
                    maps[i] = m.GetMapper <VBuffer <Single>, TOutput>();
                }

                // REVIEW: Should implement this better....
                var labels   = new Single[100];
                var features = new VBuffer <Single> [100];
                int count    = 0;
                // REVIEW: Should this include bad values or filter them?
                using (var cursor = new FloatLabelCursor(data, CursOpt.AllFeatures | CursOpt.AllLabels))
                {
                    TOutput[] predictions = new TOutput[maps.Length];
                    var       vBuffers    = new VBuffer <Single> [maps.Length];
                    while (cursor.MoveNext())
                    {
                        Parallel.For(0, maps.Length, i =>
                        {
                            var model = models[i];
                            if (model.SelectedFeatures != null)
                            {
                                EnsembleUtils.SelectFeatures(ref cursor.Features, model.SelectedFeatures, model.Cardinality, ref vBuffers[i]);
                                maps[i](ref vBuffers[i], ref predictions[i]);
                            }
                            else
                            {
                                maps[i](ref cursor.Features, ref predictions[i]);
                            }
                        });

                        Utils.EnsureSize(ref labels, count + 1);
                        Utils.EnsureSize(ref features, count + 1);
                        labels[count] = cursor.Label;
                        FillFeatureBuffer(predictions, ref features[count]);
                        count++;
                    }
                }

                ch.Info("The number of instances used for stacking trainer is {0}", count);

                var bldr = new ArrayDataViewBuilder(host);
                Array.Resize(ref labels, count);
                Array.Resize(ref features, count);
                bldr.AddColumn(DefaultColumnNames.Label, NumberType.Float, labels);
                bldr.AddColumn(DefaultColumnNames.Features, NumberType.Float, features);

                var view = bldr.GetDataView();
                var rmd  = new RoleMappedData(view, DefaultColumnNames.Label, DefaultColumnNames.Features);

                var trainer = BasePredictorType.CreateComponent(host);
                if (trainer.Info.NeedNormalization)
                {
                    ch.Warning("The trainer specified for stacking wants normalization, but we do not currently allow this.");
                }
                Meta = trainer.Train(rmd);
                CheckMeta();
            }
        }
Example No. 24
        public void TestDenseSGD()
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray = new float[numberOfInstances],
            x1Array     = new float[numberOfInstances],
            x2Array     = new float[numberOfInstances],
            x3Array     = new float[numberOfInstances],
            x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);
                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2Important", NumberType.Float, x2Array);
            bldr.AddColumn("X3", NumberType.Float, x3Array);
            bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
            var pfi   = ML.Regression.PermutationFeatureImportance(model, data);

            // Pfi Indices:
            // X1: 0
            // X2Important: 1
            // X3: 2
            // X4Rand: 3

            // For the following metrics lower is better, so maximum delta means more important feature, and vice versa
            Assert.True(MinDeltaIndex(pfi, m => m.L1) == 3);
            Assert.True(MaxDeltaIndex(pfi, m => m.L1) == 1);

            Assert.True(MinDeltaIndex(pfi, m => m.L2) == 3);
            Assert.True(MaxDeltaIndex(pfi, m => m.L2) == 1);

            Assert.True(MinDeltaIndex(pfi, m => m.Rms) == 3);
            Assert.True(MaxDeltaIndex(pfi, m => m.Rms) == 1);

            // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
            Assert.True(MaxDeltaIndex(pfi, m => m.RSquared) == 3);
            Assert.True(MinDeltaIndex(pfi, m => m.RSquared) == 1);

            Done();
        }
Example No. 25
        public void TestLDATransform()
        {
            var builder = new ArrayDataViewBuilder(Env);
            var data    = new[]
            {
                new[] { (Float)1.0, (Float)0.0, (Float)0.0 },
                new[] { (Float)0.0, (Float)1.0, (Float)0.0 },
                new[] { (Float)0.0, (Float)0.0, (Float)1.0 },
            };

            builder.AddColumn("F1V", NumberType.Float, data);

            var srcView = builder.GetDataView();

            LdaTransform.Column col = new LdaTransform.Column();
            col.Source   = "F1V";
            col.NumTopic = 3;
            col.NumSummaryTermPerTopic = 3;
            col.AlphaSum             = 3;
            col.NumThreads           = 1;
            col.ResetRandomGenerator = true;
            LdaTransform.Arguments args = new LdaTransform.Arguments();
            args.Column = new LdaTransform.Column[] { col };

            LdaTransform ldaTransform = new LdaTransform(Env, args, srcView);

            using (var cursor = ldaTransform.GetRowCursor(c => true))
            {
                var             resultGetter    = cursor.GetGetter <VBuffer <Float> >(1);
                VBuffer <Float> resultFirstRow  = new VBuffer <Float>();
                VBuffer <Float> resultSecondRow = new VBuffer <Float>();
                VBuffer <Float> resultThirdRow  = new VBuffer <Float>();

                Assert.True(cursor.MoveNext());
                resultGetter(ref resultFirstRow);
                Assert.True(cursor.MoveNext());
                resultGetter(ref resultSecondRow);
                Assert.True(cursor.MoveNext());
                resultGetter(ref resultThirdRow);
                Assert.False(cursor.MoveNext());

                Assert.True(resultFirstRow.Length == 3);
                Assert.True(resultFirstRow.GetItemOrDefault(0) == 0);
                Assert.True(resultFirstRow.GetItemOrDefault(2) == 0);
                Assert.True(resultFirstRow.GetItemOrDefault(1) == 1.0);
                Assert.True(resultSecondRow.Length == 3);
                Assert.True(resultSecondRow.GetItemOrDefault(0) == 0);
                Assert.True(resultSecondRow.GetItemOrDefault(2) == 0);
                Assert.True(resultSecondRow.GetItemOrDefault(1) == 1.0);
                Assert.True(resultThirdRow.Length == 3);
                Assert.True(resultThirdRow.GetItemOrDefault(0) == 0);
                Assert.True(resultThirdRow.GetItemOrDefault(1) == 0);
                Assert.True(resultThirdRow.GetItemOrDefault(2) == 1.0);
            }

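            // Read the output a second time: ResetRandomGenerator is set to true
            // above, so a fresh cursor should reproduce exactly the same topic
            // assignments, which the repeated assertions below verify.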
            using (var cursor = ldaTransform.GetRowCursor(c => true))
            {
                var             resultGetter    = cursor.GetGetter <VBuffer <Float> >(1);
                VBuffer <Float> resultFirstRow  = new VBuffer <Float>();
                VBuffer <Float> resultSecondRow = new VBuffer <Float>();
                VBuffer <Float> resultThirdRow  = new VBuffer <Float>();

                Assert.True(cursor.MoveNext());
                resultGetter(ref resultFirstRow);
                Assert.True(cursor.MoveNext());
                resultGetter(ref resultSecondRow);
                Assert.True(cursor.MoveNext());
                resultGetter(ref resultThirdRow);
                Assert.False(cursor.MoveNext());

                Assert.True(resultFirstRow.Length == 3);
                Assert.True(resultFirstRow.GetItemOrDefault(0) == 0);
                Assert.True(resultFirstRow.GetItemOrDefault(2) == 0);
                Assert.True(resultFirstRow.GetItemOrDefault(1) == 1.0);
                Assert.True(resultSecondRow.Length == 3);
                Assert.True(resultSecondRow.GetItemOrDefault(0) == 0);
                Assert.True(resultSecondRow.GetItemOrDefault(2) == 0);
                Assert.True(resultSecondRow.GetItemOrDefault(1) == 1.0);
                Assert.True(resultThirdRow.Length == 3);
                Assert.True(resultThirdRow.GetItemOrDefault(0) == 0);
                Assert.True(resultThirdRow.GetItemOrDefault(1) == 0);
                Assert.True(resultThirdRow.GetItemOrDefault(2) == 1.0);
            }
        }
        /// <summary>
        /// Utility method used to represent a tree ensemble as an <see cref="IDataView"/>.
        /// Every row in the <see cref="IDataView"/> corresponds to a node in the tree ensemble. The columns are the fields for each node.
        /// The column TreeID specifies which tree the node belongs to. The <see cref="QuantileRegressionTree"/> gets
        /// special treatment since it exposes additional per-leaf data (<see cref="QuantileRegressionTree.GetLeafSamplesAt(int)"/>
        /// and <see cref="QuantileRegressionTree.GetLeafSampleWeightsAt(int)"/>).
        /// </summary>
        public static IDataView RegressionTreeEnsembleAsIDataView(IHost host, double bias, IReadOnlyList <double> treeWeights, IReadOnlyList <RegressionTreeBase> trees)
        {
            var builder      = new ArrayDataViewBuilder(host);
            var numberOfRows = trees.Select(tree => tree.NumberOfNodes).Sum() + trees.Select(tree => tree.NumberOfLeaves).Sum();

            var treeWeightsList = new List <double>();
            var treeId          = new List <int>();
            var isLeaf          = new List <ReadOnlyMemory <char> >();
            var leftChild       = new List <int>();
            var rightChild      = new List <int>();
            var numericalSplitFeatureIndexes = new List <int>();
            var numericalSplitThresholds     = new List <float>();
            var categoricalSplitFlags        = new List <bool>();
            var leafValues = new List <double>();
            var splitGains = new List <double>();
            var categoricalSplitFeatures = new List <VBuffer <int> >();
            var categoricalCategoricalSplitFeatureRange = new List <VBuffer <int> >();

            for (int i = 0; i < trees.Count; i++)
            {
                // TreeWeights column. The TreeWeight value will be repeated for all the nodes in the same tree in the IDataView.
                treeWeightsList.AddRange(Enumerable.Repeat(treeWeights[i], trees[i].NumberOfNodes + trees[i].NumberOfLeaves));

                // Tree id indicates which tree the node belongs to.
                treeId.AddRange(Enumerable.Repeat(i, trees[i].NumberOfNodes + trees[i].NumberOfLeaves));

                // IsLeaf column indicates if node is a leaf node.
                isLeaf.AddRange(Enumerable.Repeat(new ReadOnlyMemory <char>("Tree node".ToCharArray()), trees[i].NumberOfNodes));
                isLeaf.AddRange(Enumerable.Repeat(new ReadOnlyMemory <char>("Leaf node".ToCharArray()), trees[i].NumberOfLeaves));

                // LeftChild column.
                leftChild.AddRange(trees[i].LeftChild.AsEnumerable());
                leftChild.AddRange(Enumerable.Repeat(0, trees[i].NumberOfLeaves));

                // RightChild column.
                rightChild.AddRange(trees[i].RightChild.AsEnumerable());
                rightChild.AddRange(Enumerable.Repeat(0, trees[i].NumberOfLeaves));

                // NumericalSplitFeatureIndexes column.
                numericalSplitFeatureIndexes.AddRange(trees[i].NumericalSplitFeatureIndexes.AsEnumerable());
                numericalSplitFeatureIndexes.AddRange(Enumerable.Repeat(0, trees[i].NumberOfLeaves));

                // NumericalSplitThresholds column.
                numericalSplitThresholds.AddRange(trees[i].NumericalSplitThresholds.AsEnumerable());
                numericalSplitThresholds.AddRange(Enumerable.Repeat(0f, trees[i].NumberOfLeaves));

                // CategoricalSplitFlags column.
                categoricalSplitFlags.AddRange(trees[i].CategoricalSplitFlags.AsEnumerable());
                categoricalSplitFlags.AddRange(Enumerable.Repeat(false, trees[i].NumberOfLeaves));

                // LeafValues column.
                leafValues.AddRange(Enumerable.Repeat(0d, trees[i].NumberOfNodes));
                leafValues.AddRange(trees[i].LeafValues.AsEnumerable());

                // SplitGains column.
                splitGains.AddRange(trees[i].SplitGains.AsEnumerable());
                splitGains.AddRange(Enumerable.Repeat(0d, trees[i].NumberOfLeaves));

                for (int j = 0; j < trees[i].NumberOfNodes; j++)
                {
                    // CategoricalSplitFeatures column.
                    var categoricalSplitFeaturesArray = trees[i].GetCategoricalSplitFeaturesAt(j).ToArray();
                    categoricalSplitFeatures.Add(new VBuffer <int>(categoricalSplitFeaturesArray.Length, categoricalSplitFeaturesArray));

                    // CategoricalCategoricalSplitFeatureRange column.
                    var categoricalCategoricalSplitFeatureRangeArray = trees[i].GetCategoricalCategoricalSplitFeatureRangeAt(j).ToArray();
                    categoricalCategoricalSplitFeatureRange.Add(new VBuffer <int>(categoricalCategoricalSplitFeatureRangeArray.Length, categoricalCategoricalSplitFeatureRangeArray));
                }

                categoricalSplitFeatures.AddRange(Enumerable.Repeat(new VBuffer <int>(), trees[i].NumberOfLeaves));
                categoricalCategoricalSplitFeatureRange.AddRange(Enumerable.Repeat(new VBuffer <int>(), trees[i].NumberOfLeaves));
            }

            // Bias column. This will be a repeated value for all rows in the resulting IDataView.
            builder.AddColumn("Bias", NumberDataViewType.Double, Enumerable.Repeat(bias, numberOfRows).ToArray());
            builder.AddColumn("TreeWeights", NumberDataViewType.Double, treeWeightsList.ToArray());
            builder.AddColumn("TreeID", NumberDataViewType.Int32, treeId.ToArray());
            builder.AddColumn("IsLeaf", TextDataViewType.Instance, isLeaf.ToArray());
            builder.AddColumn(nameof(RegressionTreeBase.LeftChild), NumberDataViewType.Int32, leftChild.ToArray());
            builder.AddColumn(nameof(RegressionTreeBase.RightChild), NumberDataViewType.Int32, rightChild.ToArray());
            builder.AddColumn(nameof(RegressionTreeBase.NumericalSplitFeatureIndexes), NumberDataViewType.Int32, numericalSplitFeatureIndexes.ToArray());
            builder.AddColumn(nameof(RegressionTreeBase.NumericalSplitThresholds), NumberDataViewType.Single, numericalSplitThresholds.ToArray());
            builder.AddColumn(nameof(RegressionTreeBase.CategoricalSplitFlags), BooleanDataViewType.Instance, categoricalSplitFlags.ToArray());
            builder.AddColumn(nameof(RegressionTreeBase.LeafValues), NumberDataViewType.Double, leafValues.ToArray());
            builder.AddColumn(nameof(RegressionTreeBase.SplitGains), NumberDataViewType.Double, splitGains.ToArray());
            builder.AddColumn("CategoricalSplitFeatures", NumberDataViewType.Int32, categoricalSplitFeatures.ToArray());
            builder.AddColumn("CategoricalCategoricalSplitFeatureRange", NumberDataViewType.Int32, categoricalCategoricalSplitFeatureRange.ToArray());

            // If the input tree array is a quantile regression tree we need to add two more columns.
            var quantileTrees = trees as IReadOnlyList <QuantileRegressionTree>;

            if (quantileTrees != null)
            {
                // LeafSamples column.
                var leafSamples = new List <VBuffer <double> >();

                // LeafSampleWeights column.
                var leafSampleWeights = new List <VBuffer <double> >();
                for (int i = 0; i < quantileTrees.Count; i++)
                {
                    leafSamples.AddRange(Enumerable.Repeat(new VBuffer <double>(), quantileTrees[i].NumberOfNodes));
                    leafSampleWeights.AddRange(Enumerable.Repeat(new VBuffer <double>(), quantileTrees[i].NumberOfNodes));
                    for (int j = 0; j < quantileTrees[i].NumberOfLeaves; j++)
                    {
                        var leafSamplesArray = quantileTrees[i].GetLeafSamplesAt(j).ToArray();
                        leafSamples.Add(new VBuffer <double>(leafSamplesArray.Length, leafSamplesArray));

                        var leafSampleWeightsArray = quantileTrees[i].GetLeafSampleWeightsAt(j).ToArray();
                        leafSampleWeights.Add(new VBuffer <double>(leafSampleWeightsArray.Length, leafSampleWeightsArray));
                    }
                }

                builder.AddColumn("LeafSamples", NumberDataViewType.Double, leafSamples.ToArray());
                builder.AddColumn("LeafSampleWeights", NumberDataViewType.Double, leafSampleWeights.ToArray());
            }

            var data = builder.GetDataView();

            return(data);
        }
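        // A usage sketch for the utility above, not taken from the original source:
        // it assumes a FastTree regression model has already been trained, so that
        // its FastTreeRegressionModelParameters expose a TrainedTreeEnsemble with
        // Bias, TreeWeights and Trees, which map directly onto the parameters of
        // RegressionTreeEnsembleAsIDataView. The names trainedModel and host are
        // illustrative placeholders.
        //
        //     var ensemble = trainedModel.TrainedTreeEnsemble;
        //     IDataView treeView = RegressionTreeEnsembleAsIDataView(
        //         host,
        //         ensemble.Bias,
        //         ensemble.TreeWeights,
        //         ensemble.Trees);
        //     // Each row of treeView now describes a single node or leaf of the
        //     // ensemble; the TreeID column says which tree the row belongs to.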