public void TransposerTest()
        {
            const int            rowCount = 1000;
            Random               rgen     = new Random(0);
            ArrayDataViewBuilder builder  = new ArrayDataViewBuilder(Env);

            // A is to check the splitting of a sparse-ish column.
            var dataA = GenerateHelper(rowCount, 0.1, rgen, () => (int)rgen.Next(), 50, 5, 10, 15);

            dataA[rowCount / 2] = new VBuffer<int>(50, 0, null, null); // Coverage for the null vbuffer case.
            builder.AddColumn("A", NumberType.I4, dataA);
            // B is to check the splitting of a dense-ish column.
            builder.AddColumn("B", NumberType.R8, GenerateHelper(rowCount, 0.8, rgen, rgen.NextDouble, 50, 0, 25, 49));
            // C is to just have some column we do nothing with.
            builder.AddColumn("C", NumberType.I2, GenerateHelper(rowCount, 0.1, rgen, () => (short)1, 30, 3, 10, 24));
            // D is to check some column we don't have to split because it's sufficiently small.
            builder.AddColumn("D", NumberType.R8, GenerateHelper(rowCount, 0.1, rgen, rgen.NextDouble, 3, 1));
            // E is to check a sparse scalar column.
            builder.AddColumn("E", NumberType.U4, GenerateHelper(rowCount, 0.1, rgen, () => (uint)rgen.Next(int.MinValue, int.MaxValue)));
            // F is to check a dense-ish scalar column.
            builder.AddColumn("F", NumberType.I4, GenerateHelper(rowCount, 0.8, rgen, () => rgen.Next()));

            IDataView view = builder.GetDataView();

            // Do not force save. This will have a mix of passthrough and saved columns. Note the duplicate
            // specification of "D", which tests that specifying a column twice has no ill effects.
            string[] names = { "B", "A", "E", "D", "F", "D" };
            using (Transposer trans = Transposer.Create(Env, view, false, names))
            {
                // Before checking the contents, check the names.
                for (int i = 0; i < names.Length; ++i)
                {
                    int index;
                    Assert.True(trans.Schema.TryGetColumnIndex(names[i], out index), $"Transpose schema couldn't find column '{names[i]}'");
                    int  trueIndex;
                    bool result = view.Schema.TryGetColumnIndex(names[i], out trueIndex);
                    Contracts.Assert(result);
                    Assert.True(trueIndex == index, $"Transpose schema had column '{names[i]}' at unexpected index");
                }
                // Check the contents
                Assert.Null(trans.TransposeSchema.GetSlotType(2)); // C check, to see that it's not transposable.
                TransposeCheckHelper<int>(view, 0, trans);         // A check.
                TransposeCheckHelper<Double>(view, 1, trans);      // B check.
                TransposeCheckHelper<Double>(view, 3, trans);      // D check.
                TransposeCheckHelper<uint>(view, 4, trans);        // E check.
                TransposeCheckHelper<int>(view, 5, trans);         // F check.
            }

            // Force save. Recheck columns that would have previously been passthrough columns.
            // The primary benefit of this check is that we exercise the binary saving/loading
            // functionality of scalars, which otherwise must always be passthrough. Also exercise
            // the select-by-index functionality while we're at it.
            using (Transposer trans = Transposer.Create(Env, view, true, 3, 5, 4))
            {
                // Check to see that A, B, and C were not transposed somehow.
                Assert.Null(trans.TransposeSchema.GetSlotType(0));
                Assert.Null(trans.TransposeSchema.GetSlotType(1));
                Assert.Null(trans.TransposeSchema.GetSlotType(2));
                TransposeCheckHelper<Double>(view, 3, trans); // D check.
                TransposeCheckHelper<uint>(view, 4, trans);   // E check.
                TransposeCheckHelper<int>(view, 5, trans);    // F check.
            }
        }
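GenerateHelper is called throughout the test above but is not shown on this page. The following is a minimal sketch inferred from the call sites only, assuming a scalar overload (rowCount, density, rng, generator) and a vector overload whose trailing params list slots that are always populated; the actual helper in the test fixture may differ.

        // Hypothetical sketch, inferred from the call sites above; not the real helper.
        // Scalar overload: each row gets a generated value with probability `density`.
        private static T[] GenerateHelper<T>(int rowCount, double density, Random rgen, Func<T> generator)
        {
            var values = new T[rowCount];
            for (int i = 0; i < rowCount; i++)
            {
                if (rgen.NextDouble() < density)
                    values[i] = generator();
            }
            return values;
        }

        // Vector overload: rows are VBuffers of length `len`; slots listed in
        // `alwaysDense` are always populated, the rest with probability `density`.
        private static VBuffer<T>[] GenerateHelper<T>(int rowCount, double density, Random rgen,
            Func<T> generator, int len, params int[] alwaysDense)
        {
            var rows = new VBuffer<T>[rowCount];
            for (int i = 0; i < rowCount; i++)
            {
                var indices = new List<int>();
                var values  = new List<T>();
                for (int slot = 0; slot < len; slot++)
                {
                    if (Array.IndexOf(alwaysDense, slot) >= 0 || rgen.NextDouble() < density)
                    {
                        indices.Add(slot);
                        values.Add(generator());
                    }
                }
                rows[i] = new VBuffer<T>(len, indices.Count, values.ToArray(), indices.ToArray());
            }
            return rows;
        }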
Example #2
        /// <summary>
        /// Features: x1, x2VBuff (sparse vector), x3.
        /// y = 10*x1 + 10*x2VBuff + 20*x3 + e.
        /// Within the x2VBuff feature, the 2nd slot is sparse most of the time.
        /// The 2nd slot of x2VBuff therefore has the least importance: evaluation metrics do not change much when it is permuted.
        /// x3 has the biggest importance.
        /// </summary>
        private IDataView GetSparseDataset(TaskType task = TaskType.Regression)
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 10000;
            var       rand = new Random(10);

            float[] yArray  = new float[numberOfInstances];
            float[] x1Array = new float[numberOfInstances];
            float[] x3Array = new float[numberOfInstances];

            VBuffer<float>[] vbArray = new VBuffer<float>[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x3Important = rand.Next(10000);
                x3Array[i] = x3Important;

                VBuffer<float> vb;

                if (i % 10 != 0)
                {
                    vb = new VBuffer<float>(4, 3, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 2, 3 });
                }
                else
                {
                    vb = new VBuffer<float>(4, 4, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 1, 2, 3 });
                }

                vbArray[i] = vb;

                float vbSum = 0;
                foreach (var vbValue in vb.DenseValues())
                {
                    vbSum += vbValue * 10;
                }

                var noise = rand.Next(50);
                yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
            }

            // If binary classification, modify the labels
            if (task == TaskType.BinaryClassification ||
                task == TaskType.MulticlassClassification)
            {
                GetBinaryClassificationLabels(yArray);
            }
            else if (task == TaskType.Ranking)
            {
                GetRankingLabels(yArray);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2VBuffer", NumberType.Float, vbArray);
            bldr.AddColumn("X3Important", NumberType.Float, x3Array);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            if (task == TaskType.Ranking)
            {
                bldr.AddColumn("GroupId", NumberType.U4, CreateGroupIds(yArray.Length));
            }
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
                           .Append(ML.Transforms.Normalize("Features"));

            // Create a keytype for Ranking
            if (task == TaskType.Ranking)
            {
                return(pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
                       .Fit(srcDV).Transform(srcDV));
            }

            return(pipeline.Fit(srcDV).Transform(srcDV));
        }
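A quick reading aid for the sparse constructor used above (illustration only, not part of the original test): VBuffer's sparse constructor takes (length, count, values, indices), so the 3-entry buffer built for most rows densifies with an implicit zero in slot 1.

            // Illustration only: how the sparse constructor above densifies.
            var vb = new VBuffer<float>(4, 3, new float[] { 7f, 8f, 9f }, new int[] { 0, 2, 3 });
            // vb.DenseValues() yields 7, 0, 8, 9 -- slot 1 is implicitly zero,
            // which is why the 2nd slot carries the least signal in this dataset.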
Example #3
        /// <summary>
        /// Features: x1, x2, x3, x4Rand; y = 10*x1 + 20*x2 + 5.5*x3 + e. x4Rand is random, so the label y does not depend on it.
        /// Verifies that feature contribution scores are output along with a score for each predicted row.
        /// </summary>
        private void TestFeatureContribution(
            ITrainerEstimator<ISingleFeaturePredictionTransformer<IPredictor>, IPredictor> trainer,
            List<float[]> expectedValues,
            int precision = 6)
        {
            // Setup synthetic dataset.
            const int numInstances = 1000;
            const int numFeatures  = 4;

            var rand = new Random(10);

            float[]   yArray       = new float[numInstances];
            float[][] xArray       = new float[numFeatures][];
            int[]     xRangeArray  = new[] { 1000, 10000, 5000, 1000 };
            float[]   xWeightArray = new[] {
                10,
                20, // Most important feature with high weight. Should have the highest contribution.
                5.5f,
                0,  // Least important feature. Should have the least contribution.
            };

            for (var instanceIndex = 0; instanceIndex < numInstances; instanceIndex++)
            {
                for (int featureIndex = 0; featureIndex < numFeatures; featureIndex++)
                {
                    if (xArray[featureIndex] == null)
                    {
                        xArray[featureIndex] = new float[numInstances];
                    }
                    xArray[featureIndex][instanceIndex] = rand.Next(xRangeArray[featureIndex]);
                    yArray[instanceIndex] += xArray[featureIndex][instanceIndex] * xWeightArray[featureIndex];
                }

                var noise = rand.Next(50);
                yArray[instanceIndex] += noise;
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, xArray[0]);
            bldr.AddColumn("X2Important", NumberType.Float, xArray[1]);
            bldr.AddColumn("X3", NumberType.Float, xArray[2]);
            bldr.AddColumn("X4Rand", NumberType.Float, xArray[3]);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .AppendCacheCheckpoint(ML)
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = trainer.Fit(data);
            var args  = new FeatureContributionCalculationTransform.Arguments()
            {
                Bottom = 10,
                Top    = 10
            };
            var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

            var transformedOutput = output.AsEnumerable<ScoreAndContribution>(Env, true);
            int rowIndex          = 0;

            foreach (var row in transformedOutput.Take(expectedValues.Count))
            {
                var expectedValue = expectedValues[rowIndex++];
                for (int i = 0; i < numFeatures; i++)
                {
                    Assert.Equal(expectedValue[i], row.FeatureContributions[i], precision);
                }
            }

            Done();
        }
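The ScoreAndContribution class bound by AsEnumerable above is not shown on this page. A plausible minimal shape is sketched below, assuming the output columns are named Score and FeatureContributions (AsEnumerable binds columns to fields by name); the real class in the test fixture may carry more columns.

        // Hypothetical row class for AsEnumerable; member names must match the
        // output schema's column names.
        private sealed class ScoreAndContribution
        {
            public float Score { get; set; }
            [VectorType(4)]
            public float[] FeatureContributions { get; set; }
        }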
Example #4
        public static CombinedOutput CombineMetrics(IHostEnvironment env, CombineMetricsInput input)
        {
            var eval = GetEvaluator(env, input.Kind);

            var perInst = EvaluateUtils.ConcatenatePerInstanceDataViews(env, eval, true, true,
                input.PerInstanceMetrics.Select(idv => new RoleMappedData(idv, opt: true,
                    RoleMappedSchema.ColumnRole.Label.Bind(input.LabelColumn),
                    RoleMappedSchema.ColumnRole.Weight.Bind(input.WeightColumn.Value),
                    RoleMappedSchema.ColumnRole.Group.Bind(input.GroupColumn),
                    RoleMappedSchema.ColumnRole.Name.Bind(input.NameColumn.Value))).ToArray(),
                out var variableSizeVectorColumnNames);

            var warnings = input.Warnings != null ? new List<IDataView>(input.Warnings) : new List<IDataView>();

            if (variableSizeVectorColumnNames.Length > 0)
            {
                var dvBldr = new ArrayDataViewBuilder(env);
                var warn   = $"Detected columns of variable length: {string.Join(", ", variableSizeVectorColumnNames)}." +
                             $" Consider setting collateMetrics to false for meaningful per-fold results.";
                dvBldr.AddColumn(MetricKinds.ColumnNames.WarningText, TextType.Instance, warn.AsMemory());
                warnings.Add(dvBldr.GetDataView());
            }

            env.Assert(Utils.Size(perInst) == 1);

            var overall = eval.GetOverallResults(input.OverallMetrics);

            overall = EvaluateUtils.CombineFoldMetricsDataViews(env, overall, input.OverallMetrics.Length);

            IDataView conf = null;

            if (Utils.Size(input.ConfusionMatrix) > 0)
            {
                EvaluateUtils.ReconcileSlotNames<double>(env, input.ConfusionMatrix, MetricKinds.ColumnNames.Count, NumberType.R8);

                for (int i = 0; i < input.ConfusionMatrix.Length; i++)
                {
                    var idv = input.ConfusionMatrix[i];
                    // Find the old Count column and drop it.
                    for (int col = 0; col < idv.Schema.ColumnCount; col++)
                    {
                        if (idv.Schema[col].IsHidden &&
                            idv.Schema.GetColumnName(col).Equals(MetricKinds.ColumnNames.Count))
                        {
                            input.ConfusionMatrix[i] = new ChooseColumnsByIndexTransform(env,
                                new ChooseColumnsByIndexTransform.Arguments() { Drop = true, Index = new[] { col } }, idv);
                            break;
                        }
                    }
                }

                conf = EvaluateUtils.ConcatenateOverallMetrics(env, input.ConfusionMatrix);
            }

            var warningsIdv = warnings.Count > 0 ? AppendRowsDataView.Create(env, warnings[0].Schema, warnings.ToArray()) : null;

            return(new CombinedOutput()
            {
                PerInstanceMetrics = perInst[0],
                OverallMetrics = overall,
                ConfusionMatrix = conf,
                Warnings = warningsIdv
            });
        }
Example #5
        /// <summary>
        /// Features: x1, x2, x3, x4Rand; y = 10*x1 + 20*x2 + 5.5*x3 + e. x4Rand is random, so the label y does not depend on it.
        /// x4Rand has the least importance: evaluation metrics do not change much when x4Rand is permuted.
        /// x2 has the biggest importance.
        /// </summary>
        private IDataView GetDenseDataset(TaskType task = TaskType.Regression)
        {
            Contracts.Assert(task != TaskType.Clustering, $"TaskType {nameof(TaskType.Clustering)} not supported.");

            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray      = new float[numberOfInstances];
            float[] x1Array     = new float[numberOfInstances];
            float[] x2Array     = new float[numberOfInstances];
            float[] x3Array     = new float[numberOfInstances];
            float[] x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);

                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // If binary classification, modify the labels
            if (task == TaskType.BinaryClassification ||
                task == TaskType.MulticlassClassification)
            {
                GetBinaryClassificationLabels(yArray);
            }
            else if (task == TaskType.Ranking)
            {
                GetRankingLabels(yArray);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2Important", NumberType.Float, x2Array);
            bldr.AddColumn("X3", NumberType.Float, x3Array);
            bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            if (task == TaskType.Ranking)
            {
                bldr.AddColumn("GroupId", NumberType.U4, CreateGroupIds(yArray.Length));
            }
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .Append(ML.Transforms.Normalize("Features"));

            // Create a keytype for Ranking
            if (task == TaskType.Ranking)
            {
                return(pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
                       .Fit(srcDV).Transform(srcDV));
            }

            return(pipeline.Fit(srcDV).Transform(srcDV));
        }
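GetBinaryClassificationLabels and GetRankingLabels, called in Examples #2, #5 and #6, are not defined on this page. The sketches below are hypothetical, assuming both helpers rewrite yArray in place and that System.Linq is available; the real fixture may threshold or bucket differently.

        // Hypothetical: threshold the continuous target at its mean to get 0/1 labels.
        private static void GetBinaryClassificationLabels(float[] yArray)
        {
            float mean = yArray.Average();
            for (int i = 0; i < yArray.Length; i++)
                yArray[i] = yArray[i] > mean ? 1 : 0;
        }

        // Hypothetical: bucket the continuous target into five relevance levels 0..4.
        private static void GetRankingLabels(float[] yArray)
        {
            float min = yArray.Min(), max = yArray.Max();
            float bucket = (max - min) / 5;
            for (int i = 0; i < yArray.Length; i++)
                yArray[i] = Math.Min(4, (float)Math.Floor((yArray[i] - min) / bucket));
        }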
Example #6
        /// <summary>
        /// Features: x1, x2, x3, x4Rand; y = 10*x1 + 20*x2 + 5.5*x3 + e. x4Rand is random, so the label y does not depend on it.
        /// x4Rand has the least importance: evaluation metrics do not change much when x4Rand is permuted.
        /// x2 has the biggest importance.
        /// </summary>
        private IDataView GetDenseDataset(TaskType task = TaskType.Regression)
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray      = new float[numberOfInstances];
            float[] x1Array     = new float[numberOfInstances];
            float[] x2Array     = new float[numberOfInstances];
            float[] x3Array     = new float[numberOfInstances];
            float[] x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);

                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // If binary classification, modify the labels
            if (task == TaskType.BinaryClassification ||
                task == TaskType.MulticlassClassification)
            {
                GetBinaryClassificationLabels(yArray);
            }
            else if (task == TaskType.Ranking)
            {
                GetRankingLabels(yArray);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberDataViewType.Single, x1Array);
            bldr.AddColumn("X2Important", NumberDataViewType.Single, x2Array);
            bldr.AddColumn("X3", NumberDataViewType.Single, x3Array);
            bldr.AddColumn("X4Rand", NumberDataViewType.Single, x4RandArray);
            bldr.AddColumn("Label", NumberDataViewType.Single, yArray);
            if (task == TaskType.Ranking)
            {
                bldr.AddColumn("GroupId", NumberDataViewType.UInt32, CreateGroupIds(yArray.Length));
            }
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .Append(ML.Transforms.NormalizeMinMax("Features"));

            if (task == TaskType.BinaryClassification)
            {
                return(pipeline.Append(ML.Transforms.Conversion.ConvertType("Label", outputKind: DataKind.Boolean))
                       .Fit(srcDV).Transform(srcDV));
            }
            else if (task == TaskType.MulticlassClassification)
            {
                return(pipeline.Append(ML.Transforms.Conversion.MapValueToKey("Label"))
                       .Fit(srcDV).Transform(srcDV));
            }
            else if (task == TaskType.Ranking)
            {
                return(pipeline.Append(ML.Transforms.Conversion.MapValueToKey("GroupId"))
                       .Fit(srcDV).Transform(srcDV));
            }

            return(pipeline.Fit(srcDV).Transform(srcDV));
        }
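CreateGroupIds, used for the ranking task in Examples #2, #5 and #6, is likewise not shown. A minimal sketch, assuming consecutive rows are assigned to fixed-size groups so the ranking trainer sees multiple queries (the group size is an assumption):

        // Hypothetical sketch of CreateGroupIds.
        private static uint[] CreateGroupIds(int numRows, int rowsPerGroup = 10)
        {
            var groupIds = new uint[numRows];
            for (int i = 0; i < numRows; i++)
                groupIds[i] = (uint)(i / rowsPerGroup);
            return groupIds;
        }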
Example #7
        public void Train(List<FeatureSubsetModel<IPredictorProducing<TOutput>>> models, RoleMappedData data, IHostEnvironment env)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register(Stacking.LoadName);

            host.CheckValue(models, nameof(models));
            host.CheckValue(data, nameof(data));

            using (var ch = host.Start("Training stacked model"))
            {
                ch.Check(Meta == null, "Train called multiple times");
                ch.Check(BasePredictorType != null);

                var maps = new ValueMapper<VBuffer<Single>, TOutput>[models.Count];
                for (int i = 0; i < maps.Length; i++)
                {
                    Contracts.Assert(models[i].Predictor is IValueMapper);
                    var m = (IValueMapper)models[i].Predictor;
                    maps[i] = m.GetMapper<VBuffer<Single>, TOutput>();
                }

                // REVIEW: Should implement this better....
                var labels   = new Single[100];
                var features = new VBuffer<Single>[100];
                int count    = 0;
                // REVIEW: Should this include bad values or filter them?
                using (var cursor = new FloatLabelCursor(data, CursOpt.AllFeatures | CursOpt.AllLabels))
                {
                    TOutput[] predictions = new TOutput[maps.Length];
                    var       vBuffers    = new VBuffer<Single>[maps.Length];
                    while (cursor.MoveNext())
                    {
                        Parallel.For(0, maps.Length, i =>
                        {
                            var model = models[i];
                            if (model.SelectedFeatures != null)
                            {
                                EnsembleUtils.SelectFeatures(ref cursor.Features, model.SelectedFeatures, model.Cardinality, ref vBuffers[i]);
                                maps[i](ref vBuffers[i], ref predictions[i]);
                            }
                            else
                            {
                                maps[i](ref cursor.Features, ref predictions[i]);
                            }
                        });

                        Utils.EnsureSize(ref labels, count + 1);
                        Utils.EnsureSize(ref features, count + 1);
                        labels[count] = cursor.Label;
                        FillFeatureBuffer(predictions, ref features[count]);
                        count++;
                    }
                }

                ch.Info("The number of instances used for the stacking trainer is {0}", count);

                var bldr = new ArrayDataViewBuilder(host);
                Array.Resize(ref labels, count);
                Array.Resize(ref features, count);
                bldr.AddColumn(DefaultColumnNames.Label, NumberType.Float, labels);
                bldr.AddColumn(DefaultColumnNames.Features, NumberType.Float, features);

                var view = bldr.GetDataView();
                var rmd  = new RoleMappedData(view, DefaultColumnNames.Label, DefaultColumnNames.Features);

                var trainer = BasePredictorType.CreateInstance(host);
                if (trainer.Info.NeedNormalization)
                {
                    ch.Warning("The trainer specified for stacking wants normalization, but we do not currently allow this.");
                }
                Meta = trainer.Train(rmd);
                CheckMeta();

                ch.Done();
            }
        }
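FillFeatureBuffer packs the per-model predictions into the meta-learner's feature vector. For the common TOutput == Single case it could be as simple as the sketch below; this is an assumption for illustration (in the real ensemble code the method is supplied per output type), not the actual implementation.

        // Hypothetical sketch for TOutput == Single: one dense slot per base model.
        private static void FillFeatureBuffer(Single[] predictions, ref VBuffer<Single> features)
        {
            var values = new Single[predictions.Length];
            Array.Copy(predictions, values, predictions.Length);
            features = new VBuffer<Single>(predictions.Length, values);
        }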
Example #8
        /// <summary>
        /// Helper function that builds the IDataView given a list of keys and non-vector values
        /// </summary>
        internal static IDataView CreateDataView<TKey, TValue>(IHostEnvironment env,
                                                               IEnumerable<TKey> keys,
                                                               IEnumerable<TValue> values,
                                                               string keyColumnName,
                                                               string valueColumnName,
                                                               bool treatValuesAsKeyTypes)
        {
            var keyType   = GetPrimitiveType(typeof(TKey), out bool isKeyVectorType);
            var valueType = GetPrimitiveType(typeof(TValue), out bool isValueVectorType);

            var dataViewBuilder = new ArrayDataViewBuilder(env);

            AddColumnWrapper(dataViewBuilder, keyColumnName, keyType, keys.ToArray());
            if (treatValuesAsKeyTypes)
            {
                // When treating the values as KeyTypes, generate the unique
                // set of values. This is used for generating the metadata of
                // the column.
                HashSet<TValue> valueSet = new HashSet<TValue>();
                foreach (var v in values)
                {
                    // HashSet<T>.Add is a no-op for duplicates, so no Contains check is needed.
                    valueSet.Add(v);
                }

                var metaKeys = valueSet.ToArray();

                // Key values are treated in one of two ways:
                // If the values are of type uint or ulong, they are used directly as the key values and no new keys are created.
                // Otherwise, key values are generated as uints starting from 1, since 0 is reserved for the missing key.
                if (valueType.RawType == typeof(uint))
                {
                    uint[] indices = values.Select((x) => Convert.ToUInt32(x)).ToArray();
                    dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), (ulong)metaKeys.Length, indices);
                }
                else if (valueType.RawType == typeof(ulong))
                {
                    ulong[] indices = values.Select((x) => Convert.ToUInt64(x)).ToArray();
                    dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), (ulong)metaKeys.Length, indices);
                }
                else
                {
                    // When generating the indices, values that compare equal are assigned the same index.
                    // The dictionary maintains the value-to-index mapping; indices contains one entry per
                    // element of values (i.e. it has the same length as values).
                    Dictionary<TValue, uint> keyTypeValueMapping = new Dictionary<TValue, uint>();
                    uint[] indices = new uint[values.Count()];
                    // Start the index at 1
                    uint index = 1;
                    for (int i = 0; i < values.Count(); ++i)
                    {
                        TValue value = values.ElementAt(i);
                        if (!keyTypeValueMapping.ContainsKey(value))
                        {
                            keyTypeValueMapping.Add(value, index);
                            index++;
                        }

                        var keyValue = keyTypeValueMapping[value];
                        indices[i] = keyValue;
                    }

                    dataViewBuilder.AddColumn(valueColumnName, GetKeyValueGetter(metaKeys), (ulong)metaKeys.Count(), indices);
                }
            }
            else
            {
                AddColumnWrapper(dataViewBuilder, valueColumnName, valueType, values.ToArray());
            }

            return(dataViewBuilder.GetDataView());
        }
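GetKeyValueGetter supplies the key-value metadata (the display names of the keys) that AddColumn attaches to the key-typed column. A minimal sketch, assuming each distinct value's ToString() is used as its display name; the real helper may format names differently.

        // Hypothetical sketch: materialize the distinct values as the key names.
        private static ValueGetter<VBuffer<ReadOnlyMemory<char>>> GetKeyValueGetter<TValue>(TValue[] metaKeys)
        {
            return (ref VBuffer<ReadOnlyMemory<char>> dst) =>
            {
                var names = new ReadOnlyMemory<char>[metaKeys.Length];
                for (int i = 0; i < metaKeys.Length; i++)
                    names[i] = metaKeys[i].ToString().AsMemory();
                dst = new VBuffer<ReadOnlyMemory<char>>(metaKeys.Length, names);
            };
        }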
Example #9
        public void TestFeatureImportance()
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray      = new float[numberOfInstances];
            float[] x1Array     = new float[numberOfInstances];
            float[] x2Array     = new float[numberOfInstances];
            float[] x3Array     = new float[numberOfInstances];
            float[] x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);
                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2Important", NumberType.Float, x2Array);
            bldr.AddColumn("X3", NumberType.Float, x3Array);
            bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .AppendCacheCheckpoint(ML)
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = ML.Regression.Trainers.OrdinaryLeastSquares().Fit(data);
            var args  = new FeatureContributionCalculationTransform.Arguments()
            {
                Bottom = 10,
                Top    = 10
            };
            var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

            // Get prediction scores and contributions
            var enumerator           = output.AsEnumerable<ScoreAndContribution>(Env, true).GetEnumerator();
            ScoreAndContribution row = null;
            var expectedValues       = new List<float[]>();

            expectedValues.Add(new float[4] {
                0.06319684F, 1, 0.1386623F, 4.46209469E-06F
            });
            expectedValues.Add(new float[4] {
                0.03841561F, 1, 0.1633037F, 2.68303256E-06F
            });
            expectedValues.Add(new float[4] {
                0.12006103F, 1, 0.254072F, 1.18671605E-05F
            });
            expectedValues.Add(new float[4] {
                0.20861618F, 0.99999994F, 0.407312155F, 6.963478E-05F
            });
            expectedValues.Add(new float[4] {
                0.024050576F, 0.99999994F, 0.31106182F, 8.456762E-06F
            });
            int index = 0;

            while (enumerator.MoveNext() && index < expectedValues.Count)
            {
                row = enumerator.Current;
                // We compare at a precision of 6 digits because single-precision floats only carry about 7 significant decimal digits.
                Assert.Equal(expectedValues[index][0], row.FeatureContributions[0], 6);
                Assert.Equal(expectedValues[index][1], row.FeatureContributions[1], 6);
                Assert.Equal(expectedValues[index][2], row.FeatureContributions[2], 6);
                Assert.Equal(expectedValues[index++][3], row.FeatureContributions[3], 6);
            }

            Done();
        }
Example #10
        public void TestFeatureImportance()
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray      = new float[numberOfInstances];
            float[] x1Array     = new float[numberOfInstances];
            float[] x2Array     = new float[numberOfInstances];
            float[] x3Array     = new float[numberOfInstances];
            float[] x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);
                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2Important", NumberType.Float, x2Array);
            bldr.AddColumn("X3", NumberType.Float, x3Array);
            bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
            var args  = new FeatureContributionCalculationTransform.Arguments()
            {
                Bottom = 10,
                Top    = 10
            };
            var output = FeatureContributionCalculationTransform.Create(Env, args, data, model.Model, model.FeatureColumn);

            // Get prediction scores and contributions
            var enumerator           = output.AsEnumerable<ScoreAndContribution>(Env, true).GetEnumerator();
            ScoreAndContribution row = null;
            var expectedValues       = new List<float[]>();

            expectedValues.Add(new float[4] {
                0.15640761F, 1, 0.155862764F, 0.07276783F
            });
            expectedValues.Add(new float[4] {
                0.09507586F, 1, 0.1835608F, 0.0437548943F
            });
            expectedValues.Add(new float[4] {
                0.297142357F, 1, 0.2855884F, 0.193529665F
            });
            expectedValues.Add(new float[4] {
                0.45465675F, 0.8805887F, 0.4031663F, 1
            });
            expectedValues.Add(new float[4] {
                0.0595234372F, 0.99999994F, 0.349647522F, 0.137912869F
            });
            int index = 0;

            while (enumerator.MoveNext() && index < expectedValues.Count)
            {
                row = enumerator.Current;
                Assert.True(row.FeatureContributions[0] == expectedValues[index][0]);
                Assert.True(row.FeatureContributions[1] == expectedValues[index][1]);
                Assert.True(row.FeatureContributions[2] == expectedValues[index][2]);
                Assert.True(row.FeatureContributions[3] == expectedValues[index++][3]);
            }

            Done();
        }
Example #11
        public void TestDenseSGD()
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 1000;
            var       rand = new Random(10);

            float[] yArray      = new float[numberOfInstances];
            float[] x1Array     = new float[numberOfInstances];
            float[] x2Array     = new float[numberOfInstances];
            float[] x3Array     = new float[numberOfInstances];
            float[] x4RandArray = new float[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x2Important = rand.Next(10000);
                x2Array[i] = x2Important;
                var x3 = rand.Next(5000);
                x3Array[i] = x3;
                var x4Rand = rand.Next(1000);
                x4RandArray[i] = x4Rand;

                var noise = rand.Next(50);
                yArray[i] = (float)(10 * x1 + 20 * x2Important + 5.5 * x3 + noise);
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2Important", NumberType.Float, x2Array);
            bldr.AddColumn("X3", NumberType.Float, x3Array);
            bldr.AddColumn("X4Rand", NumberType.Float, x4RandArray);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
                           .Append(ML.Transforms.Normalize("Features"));
            var data  = pipeline.Fit(srcDV).Transform(srcDV);
            var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
            var pfi   = ML.Regression.PermutationFeatureImportance(model, data);

            // Pfi Indices:
            // X1: 0
            // X2Important: 1
            // X3: 2
            // X4Rand: 3

            // For the following metrics lower is better, so maximum delta means more important feature, and vice versa
            Assert.True(MinDeltaIndex(pfi, m => m.L1) == 3);
            Assert.True(MaxDeltaIndex(pfi, m => m.L1) == 1);

            Assert.True(MinDeltaIndex(pfi, m => m.L2) == 3);
            Assert.True(MaxDeltaIndex(pfi, m => m.L2) == 1);

            Assert.True(MinDeltaIndex(pfi, m => m.Rms) == 3);
            Assert.True(MaxDeltaIndex(pfi, m => m.Rms) == 1);

            // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
            Assert.True(MaxDeltaIndex(pfi, m => m.RSquared) == 3);
            Assert.True(MinDeltaIndex(pfi, m => m.RSquared) == 1);

            Done();
        }
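MinDeltaIndex and MaxDeltaIndex, used in the PFI assertions above and in the next example, are small helpers over the PFI result array. Plausible sketches follow, assuming the result is indexable and the selector extracts the per-feature metric delta; signatures are inferred from the call sites, not taken from the real test fixture.

        // Hypothetical sketches: index of the feature whose permutation moved the
        // selected metric the least / the most.
        private static int MinDeltaIndex<T>(IReadOnlyList<T> pfi, Func<T, double> metric)
        {
            int best = 0;
            for (int i = 1; i < pfi.Count; i++)
                if (metric(pfi[i]) < metric(pfi[best]))
                    best = i;
            return best;
        }

        private static int MaxDeltaIndex<T>(IReadOnlyList<T> pfi, Func<T, double> metric)
        {
            int best = 0;
            for (int i = 1; i < pfi.Count; i++)
                if (metric(pfi[i]) > metric(pfi[best]))
                    best = i;
            return best;
        }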
Example #12
        public void TestSparseSGD()
        {
            // Setup synthetic dataset.
            const int numberOfInstances = 10000;
            var       rand = new Random(10);

            float[] yArray  = new float[numberOfInstances];
            float[] x1Array = new float[numberOfInstances];
            float[] x3Array = new float[numberOfInstances];

            VBuffer<float>[] vbArray = new VBuffer<float>[numberOfInstances];

            for (var i = 0; i < numberOfInstances; i++)
            {
                var x1 = rand.Next(1000);
                x1Array[i] = x1;
                var x3Important = rand.Next(10000);
                x3Array[i] = x3Important;

                VBuffer<float> vb;

                if (i % 10 != 0)
                {
                    vb = new VBuffer<float>(4, 3, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 2, 3 });
                }
                else
                {
                    vb = new VBuffer<float>(4, 4, new float[] { rand.Next(1000), rand.Next(1000), rand.Next(1000), rand.Next(1000) }, new int[] { 0, 1, 2, 3 });
                }

                vbArray[i] = vb;

                float vbSum = 0;
                foreach (var vbValue in vb.DenseValues())
                {
                    vbSum += vbValue * 10;
                }

                var noise = rand.Next(50);
                yArray[i] = 10 * x1 + vbSum + 20 * x3Important + noise;
            }

            // Create data view.
            var bldr = new ArrayDataViewBuilder(Env);

            bldr.AddColumn("X1", NumberType.Float, x1Array);
            bldr.AddColumn("X2VBuffer", NumberType.Float, vbArray);
            bldr.AddColumn("X3Important", NumberType.Float, x3Array);
            bldr.AddColumn("Label", NumberType.Float, yArray);
            var srcDV = bldr.GetDataView();

            var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
                           .Append(ML.Transforms.Normalize("Features"));
            var data    = pipeline.Fit(srcDV).Transform(srcDV);
            var model   = ML.Regression.Trainers.OnlineGradientDescent().Fit(data);
            var results = ML.Regression.PermutationFeatureImportance(model, data);

            // Pfi Indices:
            // X1: 0
            // X2VBuffer-Slot-0: 1
            // X2VBuffer-Slot-1: 2
            // X2VBuffer-Slot-2: 3
            // X2VBuffer-Slot-3: 4
            // X3Important: 5

            // Permuting X2VBuffer-Slot-1 (PFI index 2) should have the minimum impact on the SGD metrics, and X3Important the maximum impact.
            // For the following metrics lower is better, so maximum delta means more important feature, and vice versa
            Assert.True(MinDeltaIndex(results, m => m.L1) == 2);
            Assert.True(MaxDeltaIndex(results, m => m.L1) == 5);

            Assert.True(MinDeltaIndex(results, m => m.L2) == 2);
            Assert.True(MaxDeltaIndex(results, m => m.L2) == 5);

            Assert.True(MinDeltaIndex(results, m => m.Rms) == 2);
            Assert.True(MaxDeltaIndex(results, m => m.Rms) == 5);

            // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
            Assert.True(MaxDeltaIndex(results, m => m.RSquared) == 2);
            Assert.True(MinDeltaIndex(results, m => m.RSquared) == 5);
        }