Example #1
0
        public void SimpleConstructorsAndExtensions()
        {
            string dataPath = GetDataPath("iris.txt");

            var loader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[] {
                    new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(1, 4) }),
                }
            });

            var data = loader.Read(dataPath);

            var est1 = new NormalizingEstimator(Env, "float4");
            var est2 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizerMode.MinMax, ("float4", "float4"));
            var est3 = new NormalizingEstimator(Env, new NormalizingEstimator.MinMaxColumn("float4"));
            var est4 = ML.Transforms.Normalize(NormalizingEstimator.NormalizerMode.MinMax, ("float4", "float4"));
            var est5 = ML.Transforms.Normalize("float4");

            var data1 = est1.Fit(data).Transform(data);
            var data2 = est2.Fit(data).Transform(data);
            var data3 = est3.Fit(data).Transform(data);
            var data4 = est4.Fit(data).Transform(data);
            var data5 = est5.Fit(data).Transform(data);

            CheckSameSchemas(data1.Schema, data2.Schema);
            CheckSameSchemas(data1.Schema, data3.Schema);
            CheckSameSchemas(data1.Schema, data4.Schema);
            CheckSameSchemas(data1.Schema, data5.Schema);
            CheckSameValues(data1, data2);
            CheckSameValues(data1, data3);
            CheckSameValues(data1, data4);
            CheckSameValues(data1, data5);

            Done();
        }
        public void NormalizerParameters()
        {
            string dataPath = GetDataPath("iris.txt");

            var loader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[] {
                    new TextLoader.Column("float1", DataKind.R4, 1),
                    new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(1, 4) }),
                    new TextLoader.Column("double1", DataKind.R8, 1),
                    new TextLoader.Column("double4", DataKind.R8, new[] { new TextLoader.Range(1, 4) }),
                    new TextLoader.Column("int1", DataKind.I4, 0),
                    new TextLoader.Column("float0", DataKind.R4, new[] { new TextLoader.Range {
                                                                             Min = 1, VariableEnd = true
                                                                         } })
                },
                HasHeader = true
            }, new MultiFileSource(dataPath));

            var est = new NormalizingEstimator(Env,
                                               new NormalizingEstimator.MinMaxColumn("float1"),
                                               new NormalizingEstimator.MinMaxColumn("float4"),
                                               new NormalizingEstimator.MinMaxColumn("double1"),
                                               new NormalizingEstimator.MinMaxColumn("double4"),
                                               new NormalizingEstimator.BinningColumn("float1", "float1bin"),
                                               new NormalizingEstimator.BinningColumn("float4", "float4bin"),
                                               new NormalizingEstimator.BinningColumn("double1", "double1bin"),
                                               new NormalizingEstimator.BinningColumn("double4", "double4bin"),
                                               new NormalizingEstimator.MeanVarColumn("float1", "float1mv"),
                                               new NormalizingEstimator.MeanVarColumn("float4", "float4mv"),
                                               new NormalizingEstimator.MeanVarColumn("double1", "double1mv"),
                                               new NormalizingEstimator.MeanVarColumn("double4", "double4mv"),
                                               new NormalizingEstimator.LogMeanVarColumn("float1", "float1lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("float4", "float4lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("double1", "double1lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("double4", "double4lmv"));

            var data = loader.Read(dataPath);

            var transformer = est.Fit(data);

            var floatAffineData = transformer.Columns[0].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters <float>;

            Assert.Equal(0.12658228f, floatAffineData.Scale);
            Assert.Equal(0, floatAffineData.Offset);

            var floatAffineDataVec = transformer.Columns[1].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters <ImmutableArray <float> >;

            Assert.Equal(4, floatAffineDataVec.Scale.Length);
            Assert.Empty(floatAffineDataVec.Offset);

            var doubleAffineData = transformer.Columns[2].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters <double>;

            Assert.Equal(0.12658227848101264, doubleAffineData.Scale);
            Assert.Equal(0, doubleAffineData.Offset);

            var doubleAffineDataVec = transformer.Columns[3].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters <ImmutableArray <double> >;

            Assert.Equal(4, doubleAffineDataVec.Scale.Length);
            Assert.Empty(doubleAffineDataVec.Offset);

            var floatBinData = transformer.Columns[4].ModelParameters as NormalizingTransformer.BinNormalizerModelParameters <float>;

            Assert.True(35 == floatBinData.UpperBounds.Length);
            Assert.True(34 == floatBinData.Density);
            Assert.True(0 == floatBinData.Offset);

            var floatBinDataVec = transformer.Columns[5].ModelParameters as NormalizingTransformer.BinNormalizerModelParameters <ImmutableArray <float> >;

            Assert.True(4 == floatBinDataVec.UpperBounds.Length);
            Assert.True(35 == floatBinDataVec.UpperBounds[0].Length);
            Assert.True(4 == floatBinDataVec.Density.Length);
            Assert.True(0 == floatBinDataVec.Offset.Length);

            var doubleBinData = transformer.Columns[6].ModelParameters as NormalizingTransformer.BinNormalizerModelParameters <double>;

            Assert.Equal(35, doubleBinData.UpperBounds.Length);
            Assert.Equal(34, doubleBinData.Density);
            Assert.Equal(0, doubleBinData.Offset);

            var doubleBinDataVec = transformer.Columns[7].ModelParameters as NormalizingTransformer.BinNormalizerModelParameters <ImmutableArray <double> >;

            Assert.Equal(35, doubleBinDataVec.UpperBounds[0].Length);
            Assert.Equal(4, doubleBinDataVec.Density.Length);
            Assert.Empty(doubleBinDataVec.Offset);

            var floatCdfMeanData = transformer.Columns[8].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters <float>;

            Assert.Equal(0.169309646f, floatCdfMeanData.Scale);
            Assert.Equal(0, floatCdfMeanData.Offset);

            var floatCdfMeanDataVec = transformer.Columns[9].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters <ImmutableArray <float> >;

            Assert.Equal(0.16930964589119f, floatCdfMeanDataVec.Scale[0]);
            Assert.Equal(4, floatCdfMeanDataVec.Scale.Length);
            Assert.Empty(floatCdfMeanDataVec.Offset);

            var doubleCdfMeanData = transformer.Columns[10].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters <double>;

            Assert.Equal(0.16930963784387665, doubleCdfMeanData.Scale);
            Assert.Equal(0, doubleCdfMeanData.Offset);

            var doubleCdfMeanDataVec = transformer.Columns[11].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters <ImmutableArray <double> >;

            Assert.Equal(4, doubleCdfMeanDataVec.Scale.Length);
            Assert.Empty(doubleCdfMeanDataVec.Offset);

            var floatCdfLogMeanData = transformer.Columns[12].ModelParameters as NormalizingTransformer.CdfNormalizerModelParameters <float>;

            Assert.Equal(1.75623953f, floatCdfLogMeanData.Mean);
            Assert.True(true == floatCdfLogMeanData.UseLog);
            Assert.Equal(0.140807763f, floatCdfLogMeanData.Stddev);

            var floatCdfLogMeanDataVec = transformer.Columns[13].ModelParameters as NormalizingTransformer.CdfNormalizerModelParameters <ImmutableArray <float> >;

            Assert.Equal(4, floatCdfLogMeanDataVec.Mean.Length);
            Assert.True(true == floatCdfLogMeanDataVec.UseLog);
            Assert.Equal(4, floatCdfLogMeanDataVec.Stddev.Length);

            var doubleCdfLogMeanData = transformer.Columns[14].ModelParameters as NormalizingTransformer.CdfNormalizerModelParameters <double>;

            Assert.Equal(1.7562395401953814, doubleCdfLogMeanData.Mean);
            Assert.True(doubleCdfLogMeanData.UseLog);
            Assert.Equal(0.14080776721611848, doubleCdfLogMeanData.Stddev);

            var doubleCdfLogMeanDataVec = transformer.Columns[15].ModelParameters as NormalizingTransformer.CdfNormalizerModelParameters <ImmutableArray <double> >;

            Assert.Equal(4, doubleCdfLogMeanDataVec.Mean.Length);
            Assert.True(doubleCdfLogMeanDataVec.UseLog);
            Assert.Equal(4, doubleCdfLogMeanDataVec.Stddev.Length);

            Done();
        }
        public void NormalizerWorkout()
        {
            string dataPath = GetDataPath(TestDatasets.iris.trainFilename);

            var loader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[] {
                    new TextLoader.Column("float1", DataKind.R4, 1),
                    new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(1, 4) }),
                    new TextLoader.Column("double1", DataKind.R8, 1),
                    new TextLoader.Column("double4", DataKind.R8, new[] { new TextLoader.Range(1, 4) }),
                    new TextLoader.Column("int1", DataKind.I4, 0),
                    new TextLoader.Column("float0", DataKind.R4, new[] { new TextLoader.Range {
                                                                             Min = 1, VariableEnd = true
                                                                         } }),
                },
                HasHeader = true
            }, new MultiFileSource(dataPath));

            var est = new NormalizingEstimator(Env,
                                               new NormalizingEstimator.MinMaxColumn("float1"),
                                               new NormalizingEstimator.MinMaxColumn("float4"),
                                               new NormalizingEstimator.MinMaxColumn("double1"),
                                               new NormalizingEstimator.MinMaxColumn("double4"),
                                               new NormalizingEstimator.BinningColumn("float1", "float1bin"),
                                               new NormalizingEstimator.BinningColumn("float4", "float4bin"),
                                               new NormalizingEstimator.BinningColumn("double1", "double1bin"),
                                               new NormalizingEstimator.BinningColumn("double4", "double4bin"),
                                               new NormalizingEstimator.SupervisedBinningColumn("float1", "float1supervisedbin", labelColumn: "int1"),
                                               new NormalizingEstimator.SupervisedBinningColumn("float4", "float4supervisedbin", labelColumn: "int1"),
                                               new NormalizingEstimator.SupervisedBinningColumn("double1", "double1supervisedbin", labelColumn: "int1"),
                                               new NormalizingEstimator.SupervisedBinningColumn("double4", "double4supervisedbin", labelColumn: "int1"),
                                               new NormalizingEstimator.MeanVarColumn("float1", "float1mv"),
                                               new NormalizingEstimator.MeanVarColumn("float4", "float4mv"),
                                               new NormalizingEstimator.MeanVarColumn("double1", "double1mv"),
                                               new NormalizingEstimator.MeanVarColumn("double4", "double4mv"),
                                               new NormalizingEstimator.LogMeanVarColumn("float1", "float1lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("float4", "float4lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("double1", "double1lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("double4", "double4lmv"));

            var data = loader.Read(dataPath);

            var badData1 = new ColumnCopyingTransformer(Env, ("int1", "float1")).Transform(data);
            var badData2 = new ColumnCopyingTransformer(Env, ("float0", "float4")).Transform(data);

            TestEstimatorCore(est, data, null, badData1);
            TestEstimatorCore(est, data, null, badData2);

            var outputPath = GetOutputPath("NormalizerEstimator", "normalized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                using (var fs = File.Create(outputPath))
                {
                    var dataView = ColumnSelectingTransformer.CreateDrop(Env, est.Fit(data).Transform(data), "float0");
                    DataSaverUtils.SaveDataView(ch, saver, dataView, fs, keepHidden: true);
                }
            }

            CheckEquality("NormalizerEstimator", "normalized.tsv");

            Done();
        }
Example #4
0
        internal static string CreateSplitColumn(IHostEnvironment env, ref IDataView data, string samplingKeyColumn, int?seed = null, bool fallbackInEnvSeed = false)
        {
            Contracts.CheckValue(env, nameof(env));
            Contracts.CheckValueOrNull(samplingKeyColumn);

            var splitColumnName = data.Schema.GetTempColumnName("SplitColumn");
            int?seedToUse;

            if (seed.HasValue)
            {
                seedToUse = seed.Value;
            }
            else if (fallbackInEnvSeed)
            {
                ISeededEnvironment seededEnv = (ISeededEnvironment)env;
                seedToUse = seededEnv.Seed;
            }
            else
            {
                seedToUse = null;
            }

            // We need to handle two cases: if samplingKeyColumn is not provided, we generate a random number.
            if (samplingKeyColumn == null)
            {
                data = new GenerateNumberTransform(env, data, splitColumnName, (uint?)seedToUse);
            }
            else
            {
                // If samplingKeyColumn was provided we will make a new column based on it, but using a temporary
                // name, as it might be dropped elsewhere in the code

                if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int samplingColIndex))
                {
                    throw env.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn);
                }

                var type = data.Schema[samplingColIndex].Type;
                if (!RangeFilter.IsValidRangeFilterColumnType(env, type))
                {
                    var hashInputColumnName = samplingKeyColumn;
                    // HashingEstimator currently handles all primitive types except for DateTime, DateTimeOffset and TimeSpan.
                    var itemType = type.GetItemType();
                    if (itemType is DateTimeDataViewType || itemType is DateTimeOffsetDataViewType || itemType is TimeSpanDataViewType)
                    {
                        data = new TypeConvertingTransformer(env, splitColumnName, DataKind.Int64, samplingKeyColumn).Transform(data);
                        hashInputColumnName = splitColumnName;
                    }

                    var columnOptions =
                        seedToUse.HasValue ?
                        new HashingEstimator.ColumnOptions(splitColumnName, hashInputColumnName, 30, (uint)seedToUse.Value, combine: true) :
                        new HashingEstimator.ColumnOptions(splitColumnName, hashInputColumnName, 30, combine: true);
                    data = new HashingEstimator(env, columnOptions).Fit(data).Transform(data);
                }
                else
                {
                    if (type != NumberDataViewType.Single && type != NumberDataViewType.Double)
                    {
                        data = new ColumnCopyingEstimator(env, (splitColumnName, samplingKeyColumn)).Fit(data).Transform(data);
                    }
                    else
                    {
                        data = new NormalizingEstimator(env, new NormalizingEstimator.MinMaxColumnOptions(splitColumnName, samplingKeyColumn, ensureZeroUntouched: false)).Fit(data).Transform(data);
                    }
                }
            }

            return(splitColumnName);
        }
Example #5
0
        public void SimpleConstructorsAndExtensions()
        {
            string dataPath = GetDataPath(TestDatasets.iris.trainFilename);

            var loader = new TextLoader(Env, new TextLoader.Options
            {
                Columns = new[] {
                    new TextLoader.Column("Label", DataKind.Single, 0),
                    new TextLoader.Column("float4", DataKind.Single, new[] { new TextLoader.Range(1, 4) }),
                }
            });

            var data = loader.Load(dataPath);

            var est1 = new NormalizingEstimator(Env, "float4");
            var est2 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.MinMax, ("float4", "float4"));
            var est3 = new NormalizingEstimator(Env, new NormalizingEstimator.MinMaxColumnOptions("float4"));
            var est4 = ML.Transforms.NormalizeMinMax("float4", "float4");
            var est5 = ML.Transforms.NormalizeMinMax("float4");

            var data1 = est1.Fit(data).Transform(data);
            var data2 = est2.Fit(data).Transform(data);
            var data3 = est3.Fit(data).Transform(data);
            var data4 = est4.Fit(data).Transform(data);
            var data5 = est5.Fit(data).Transform(data);

            CheckSameSchemas(data1.Schema, data2.Schema);
            CheckSameSchemas(data1.Schema, data3.Schema);
            CheckSameSchemas(data1.Schema, data4.Schema);
            CheckSameSchemas(data1.Schema, data5.Schema);
            CheckSameValues(data1, data2);
            CheckSameValues(data1, data3);
            CheckSameValues(data1, data4);
            CheckSameValues(data1, data5);

            // Tests for MeanVariance
            var est6 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.MeanVariance, ("float4", "float4"));
            var est7 = new NormalizingEstimator(Env, new NormalizingEstimator.MeanVarianceColumnOptions("float4"));
            var est8 = ML.Transforms.NormalizeMeanVariance("float4", "float4");

            var data6 = est6.Fit(data).Transform(data);
            var data7 = est7.Fit(data).Transform(data);
            var data8 = est8.Fit(data).Transform(data);

            CheckSameSchemas(data6.Schema, data7.Schema);
            CheckSameSchemas(data6.Schema, data8.Schema);
            CheckSameValues(data6, data7);
            CheckSameValues(data6, data8);

            // Tests for LogMeanVariance
            var est9  = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.LogMeanVariance, ("float4", "float4"));
            var est10 = new NormalizingEstimator(Env, new NormalizingEstimator.LogMeanVarianceColumnOptions("float4"));
            var est11 = ML.Transforms.NormalizeLogMeanVariance("float4", "float4");

            var data9  = est9.Fit(data).Transform(data);
            var data10 = est10.Fit(data).Transform(data);
            var data11 = est11.Fit(data).Transform(data);

            CheckSameSchemas(data9.Schema, data10.Schema);
            CheckSameSchemas(data9.Schema, data11.Schema);
            CheckSameValues(data9, data10);
            CheckSameValues(data9, data11);

            // Tests for Binning
            var est12 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.Binning, ("float4", "float4"));
            var est13 = new NormalizingEstimator(Env, new NormalizingEstimator.BinningColumnOptions("float4"));
            var est14 = ML.Transforms.NormalizeBinning("float4", "float4");

            var data12 = est12.Fit(data).Transform(data);
            var data13 = est13.Fit(data).Transform(data);
            var data14 = est14.Fit(data).Transform(data);

            CheckSameSchemas(data12.Schema, data13.Schema);
            CheckSameSchemas(data12.Schema, data14.Schema);
            CheckSameValues(data12, data13);
            CheckSameValues(data12, data14);

            // Tests for SupervisedBinning
            var est15 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.SupervisedBinning, ("float4", "float4"));
            var est16 = new NormalizingEstimator(Env, new NormalizingEstimator.SupervisedBinningColumOptions("float4"));
            var est17 = ML.Transforms.NormalizeSupervisedBinning("float4", "float4");

            var data15 = est15.Fit(data).Transform(data);
            var data16 = est16.Fit(data).Transform(data);
            var data17 = est17.Fit(data).Transform(data);

            CheckSameSchemas(data15.Schema, data16.Schema);
            CheckSameSchemas(data15.Schema, data17.Schema);
            CheckSameValues(data15, data16);
            CheckSameValues(data15, data17);

            Done();
        }