public static string CreateStratificationColumn(IHost host, ref IDataView data, string stratificationColumn = null)
        {
            host.CheckValue(data, nameof(data));
            host.CheckValueOrNull(stratificationColumn);

            // Pick a unique name for the stratificationColumn.
            const string stratColName = "StratificationKey";
            string       stratCol     = data.Schema.GetTempColumnName(stratColName);

            // Construct the stratification column. If user-provided stratification column exists, use HashJoin
            // of it to construct the strat column, otherwise generate a random number and use it.
            if (stratificationColumn == null)
            {
                data = new GenerateNumberTransform(host,
                                                   new GenerateNumberTransform.Options
                {
                    Columns = new[] { new GenerateNumberTransform.Column {
                                          Name = stratCol
                                      } }
                }, data);
            }
            else
            {
                var col = data.Schema.GetColumnOrNull(stratificationColumn);
                if (col == null)
                {
                    throw host.ExceptSchemaMismatch(nameof(stratificationColumn), "Stratification", stratificationColumn);
                }

                var type = col.Value.Type;
                if (!RangeFilter.IsValidRangeFilterColumnType(host, type))
                {
                    // HashingEstimator currently handles all primitive types except for DateTime, DateTimeOffset and TimeSpan.
                    var itemType = type.GetItemType();
                    if (itemType is DateTimeDataViewType || itemType is DateTimeOffsetDataViewType || itemType is TimeSpanDataViewType)
                    {
                        data = new TypeConvertingTransformer(host, stratificationColumn, DataKind.Int64, stratificationColumn).Transform(data);
                    }

                    var columnOptions = new HashingEstimator.ColumnOptions(stratCol, stratificationColumn, 30, combine: true);
                    data = new HashingEstimator(host, columnOptions).Fit(data).Transform(data);
                }
                else
                {
                    if (data.Schema[stratificationColumn].IsNormalized() || (type != NumberDataViewType.Single && type != NumberDataViewType.Double))
                    {
                        return(stratificationColumn);
                    }

                    data = new NormalizingEstimator(host,
                                                    new NormalizingEstimator.MinMaxColumnOptions(stratCol, stratificationColumn, ensureZeroUntouched: true))
                           .Fit(data).Transform(data);
                }
            }

            return(stratCol);
        }
예제 #2
0
 /// <summary>
 /// Describes how the transformer handles one column pair.
 /// </summary>
 /// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
 /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param>
 /// <param name="outputKind">Kind of output: bag, indicator vector etc.</param>
 /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
 /// <param name="seed">Hashing seed.</param>
 /// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
 /// <param name="maximumNumberOfInverts">During hashing we constuct mappings between original values and the produced hash values.
 /// Text representation of original values are stored in the slot names of the  metadata for the new column.Hashing, as such, can map many initial values to one.
 /// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
 /// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
 public ColumnOptions(string name, string inputColumnName           = null,
                      OneHotEncodingEstimator.OutputKind outputKind = Defaults.OutputKind,
                      int numberOfBits           = Defaults.NumberOfBits,
                      uint seed                  = Defaults.Seed,
                      bool useOrderedHashing     = Defaults.UseOrderedHashing,
                      int maximumNumberOfInverts = Defaults.MaximumNumberOfInverts)
 {
     HashingOptions = new HashingEstimator.ColumnOptions(name, inputColumnName ?? name, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts);
     OutputKind     = outputKind;
 }
예제 #3
0
 /// <summary>
 /// Describes how the transformer handles one column pair.
 /// </summary>
 /// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
 /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param>
 /// <param name="outputKind">Kind of output: bag, indicator vector etc.</param>
 /// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
 /// <param name="seed">Hashing seed.</param>
 /// <param name="ordered">Whether the position of each term should be included in the hash.</param>
 /// <param name="invertHash">During hashing we constuct mappings between original values and the produced hash values.
 /// Text representation of original values are stored in the slot names of the  metadata for the new column.Hashing, as such, can map many initial values to one.
 /// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
 /// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
 public ColumnOptions(string name, string inputColumnName             = null,
                      OneHotEncodingTransformer.OutputKind outputKind = Defaults.OutputKind,
                      int hashBits   = Defaults.HashBits,
                      uint seed      = Defaults.Seed,
                      bool ordered   = Defaults.Ordered,
                      int invertHash = Defaults.InvertHash)
 {
     HashInfo   = new HashingEstimator.ColumnOptions(name, inputColumnName ?? name, hashBits, seed, ordered, invertHash);
     OutputKind = outputKind;
 }
예제 #4
0
        private HashingEstimator.ColumnOptions[] InitializeHashingColumnOptions(CountTableEstimator.ColumnOptionsBase[] columns, int numberOfBits, bool combine, uint hashingSeed)
        {
            var cols = new HashingEstimator.ColumnOptions[columns.Length];

            for (int i = 0; i < cols.Length; i++)
            {
                var column = columns[i];
                cols[i] = new HashingEstimator.ColumnOptions(column.Name, column.InputColumnName,
                                                             numberOfBits, hashingSeed, combine: combine);
            }
            return(cols);
        }
예제 #5
0
        private HashingEstimator.ColumnOptions[] InitializeHashingColumnOptions(Options options)
        {
            var columns = options.Columns;
            var cols    = new HashingEstimator.ColumnOptions[columns.Length];

            for (int i = 0; i < cols.Length; i++)
            {
                var column = columns[i];
                cols[i] = new HashingEstimator.ColumnOptions(column.Name, column.Source,
                                                             options.NumberOfBits, options.HashingSeed, combine: column.Combine ?? options.Combine);
            }
            return(cols);
        }
        /// <summary>
        /// Ensures the provided <paramref name="samplingKeyColumn"/> is valid for <see cref="RangeFilter"/>, hashing it if necessary, or creates a new column <paramref name="samplingKeyColumn"/> is null.
        /// </summary>
        internal static void EnsureGroupPreservationColumn(IHostEnvironment env, ref IDataView data, ref string samplingKeyColumn, int?seed = null)
        {
            // We need to handle two cases: if samplingKeyColumn is provided, we use hashJoin to
            // build a single hash of it. If it is not, we generate a random number.

            if (samplingKeyColumn == null)
            {
                samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn");
                data = new GenerateNumberTransform(env, data, samplingKeyColumn, (uint?)seed);
            }
            else
            {
                if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int stratCol))
                {
                    throw env.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn);
                }

                var type = data.Schema[stratCol].Type;
                if (!RangeFilter.IsValidRangeFilterColumnType(env, type))
                {
                    // Hash the samplingKeyColumn.
                    // REVIEW: this could currently crash, since Hash only accepts a limited set
                    // of column types. It used to be HashJoin, but we should probably extend Hash
                    // instead of having two hash transformations.
                    var origStratCol = samplingKeyColumn;
                    samplingKeyColumn = data.Schema.GetTempColumnName(samplingKeyColumn);
                    HashingEstimator.ColumnOptions columnOptions;
                    if (seed.HasValue)
                    {
                        columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, (uint)seed.Value);
                    }
                    else
                    {
                        columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30);
                    }
                    data = new HashingEstimator(env, columnOptions).Fit(data).Transform(data);
                }
                else
                {
                    if (!data.Schema[samplingKeyColumn].IsNormalized() && (type == NumberDataViewType.Single || type == NumberDataViewType.Double))
                    {
                        var origStratCol = samplingKeyColumn;
                        samplingKeyColumn = data.Schema.GetTempColumnName(samplingKeyColumn);
                        data = new NormalizingEstimator(env, new NormalizingEstimator.MinMaxColumnOptions(samplingKeyColumn, origStratCol, ensureZeroUntouched: true)).Fit(data).Transform(data);
                    }
                }
            }
        }
예제 #7
0
        /// <summary>
        /// Ensures the provided <paramref name="samplingKeyColumn"/> is valid for <see cref="RangeFilter"/>, hashing it if necessary, or creates a new column <paramref name="samplingKeyColumn"/> is null.
        /// </summary>
        private void EnsureGroupPreservationColumn(ref IDataView data, ref string samplingKeyColumn, uint?seed = null)
        {
            // We need to handle two cases: if samplingKeyColumn is provided, we use hashJoin to
            // build a single hash of it. If it is not, we generate a random number.

            if (samplingKeyColumn == null)
            {
                samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn");
                data = new GenerateNumberTransform(Environment, data, samplingKeyColumn, seed);
            }
            else
            {
                if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int stratCol))
                {
                    throw Environment.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn);
                }

                var type = data.Schema[stratCol].Type;
                if (!RangeFilter.IsValidRangeFilterColumnType(Environment, type))
                {
                    // Hash the samplingKeyColumn.
                    // REVIEW: this could currently crash, since Hash only accepts a limited set
                    // of column types. It used to be HashJoin, but we should probably extend Hash
                    // instead of having two hash transformations.
                    var origStratCol = samplingKeyColumn;
                    int tmp;
                    int inc = 0;

                    // Generate a new column with the hashed samplingKeyColumn.
                    while (data.Schema.TryGetColumnIndex(samplingKeyColumn, out tmp))
                    {
                        samplingKeyColumn = string.Format("{0}_{1:000}", origStratCol, ++inc);
                    }
                    HashingEstimator.ColumnOptions columnOptions;
                    if (seed.HasValue)
                    {
                        columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, seed.Value);
                    }
                    else
                    {
                        columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30);
                    }
                    data = new HashingEstimator(Environment, columnOptions).Fit(data).Transform(data);
                }
            }
        }
예제 #8
0
        private void InitMap <T>(T val, DataViewType type, int numberOfBits = 20, ValueGetter <T> getter = null)
        {
            if (getter == null)
            {
                getter = (ref T dst) => dst = val;
            }
            _inRow = RowImpl.Create(type, getter);
            // One million features is a nice, typical number.
            var info   = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: numberOfBits);
            var xf     = new HashingTransformer(_env, new[] { info });
            var mapper = ((ITransformer)xf).GetRowToRowMapper(_inRow.Schema);
            var column = mapper.OutputSchema["Bar"];
            var outRow = mapper.GetRow(_inRow, column);

            if (type is VectorType)
            {
                _vecGetter = outRow.GetGetter <VBuffer <uint> >(column);
            }
            else
            {
                _getter = outRow.GetGetter <uint>(column);
            }
        }
예제 #9
0
        private void HashTestCore <T>(T val, PrimitiveDataViewType type, uint expected, uint expectedOrdered, uint expectedOrdered3, uint expectedCombined, uint expectedCombinedSparse)
        {
            const int bits = 10;

            var builder = new DataViewSchema.Annotations.Builder();

            builder.AddPrimitiveValue("Foo", type, val);
            var inRow = AnnotationUtils.AnnotationsAsRow(builder.ToAnnotations());

            //helper
            ValueGetter <TType> hashGetter <TType>(HashingEstimator.ColumnOptions colInfo)
            {
                var xf     = new HashingTransformer(Env, new[] { colInfo });
                var mapper = ((ITransformer)xf).GetRowToRowMapper(inRow.Schema);
                var col    = mapper.OutputSchema["Bar"];
                var outRow = mapper.GetRow(inRow, col);

                return(outRow.GetGetter <TType>(col));
            };

            // First do an unordered hash.
            var  info   = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits);
            var  getter = hashGetter <uint>(info);
            uint result = 0;

            getter(ref result);
            Assert.Equal(expected, result);

            // Next do an ordered hash.
            info   = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, useOrderedHashing: true);
            getter = hashGetter <uint>(info);
            getter(ref result);
            Assert.Equal(expectedOrdered, result);

            // Next build up a vector to make sure that hashing is consistent between scalar values
            // at least in the first position, and in the unordered case, the last position.
            const int vecLen   = 5;
            var       denseVec = new VBuffer <T>(vecLen, Utils.CreateArray(vecLen, val));

            builder = new DataViewSchema.Annotations.Builder();
            builder.Add("Foo", new VectorDataViewType(type, vecLen), (ref VBuffer <T> dst) => denseVec.CopyTo(ref dst));
            inRow = AnnotationUtils.AnnotationsAsRow(builder.ToAnnotations());

            info = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, useOrderedHashing: false);
            var            vecGetter = hashGetter <VBuffer <uint> >(info);
            VBuffer <uint> vecResult = default;

            vecGetter(ref vecResult);

            Assert.Equal(vecLen, vecResult.Length);
            // They all should equal this in this case.
            Assert.All(vecResult.DenseValues(), v => Assert.Equal(expected, v));

            // Now do ordered with the dense vector.
            info      = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, useOrderedHashing: true);
            vecGetter = hashGetter <VBuffer <uint> >(info);
            vecGetter(ref vecResult);

            Assert.Equal(vecLen, vecResult.Length);
            Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
            Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
            Assert.All(vecResult.DenseValues(), v => Assert.True((v == 0) == (expectedOrdered == 0)));

            // Now combine into one hash.
            info   = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, combine: true);
            getter = hashGetter <uint>(info);
            getter(ref result);
            Assert.Equal(expectedCombined, result);

            // Let's now do a sparse vector.
            var sparseVec = new VBuffer <T>(10, 3, Utils.CreateArray(3, val), new[] { 0, 3, 7 });

            builder = new DataViewSchema.Annotations.Builder();
            builder.Add("Foo", new VectorDataViewType(type, vecLen), (ref VBuffer <T> dst) => sparseVec.CopyTo(ref dst));
            inRow = AnnotationUtils.AnnotationsAsRow(builder.ToAnnotations());

            info      = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, useOrderedHashing: false);
            vecGetter = hashGetter <VBuffer <uint> >(info);
            vecGetter(ref vecResult);

            Assert.Equal(10, vecResult.Length);
            Assert.Equal(expected, vecResult.GetItemOrDefault(0));
            Assert.Equal(expected, vecResult.GetItemOrDefault(3));
            Assert.Equal(expected, vecResult.GetItemOrDefault(7));

            info      = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, useOrderedHashing: true);
            vecGetter = hashGetter <VBuffer <uint> >(info);
            vecGetter(ref vecResult);

            Assert.Equal(10, vecResult.Length);
            Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
            Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));

            info   = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, combine: true);
            getter = hashGetter <uint>(info);
            getter(ref result);
            Assert.Equal(expectedCombinedSparse, result);
        }