/// <summary>
 /// Describes how the transformer handles one column pair.
 /// </summary>
 /// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
 /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param>
 /// <param name="outputKind">Kind of output: bag, indicator vector etc.</param>
 /// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
 /// <param name="seed">Hashing seed.</param>
 /// <param name="ordered">Whether the position of each term should be included in the hash.</param>
 /// <param name="invertHash">During hashing we constuct mappings between original values and the produced hash values.
 /// Text representation of original values are stored in the slot names of the  metadata for the new column.Hashing, as such, can map many initial values to one.
 /// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
 /// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
 public ColumnInfo(string name, string inputColumnName             = null,
                   OneHotEncodingTransformer.OutputKind outputKind = Defaults.OutputKind,
                   int hashBits   = Defaults.HashBits,
                   uint seed      = Defaults.Seed,
                   bool ordered   = Defaults.Ordered,
                   int invertHash = Defaults.InvertHash)
 {
     HashInfo   = new HashingEstimator.ColumnInfo(name, inputColumnName ?? name, hashBits, seed, ordered, invertHash);
     OutputKind = outputKind;
 }
示例#2
0
        /// <summary>
        /// Ensures the provided <paramref name="samplingKeyColumn"/> is valid for <see cref="RangeFilter"/>, hashing it if necessary, or creates a new column <paramref name="samplingKeyColumn"/> is null.
        /// </summary>
        private void EnsureGroupPreservationColumn(ref IDataView data, ref string samplingKeyColumn, uint?seed = null)
        {
            // We need to handle two cases: if samplingKeyColumn is provided, we use hashJoin to
            // build a single hash of it. If it is not, we generate a random number.

            if (samplingKeyColumn == null)
            {
                samplingKeyColumn = data.Schema.GetTempColumnName("IdPreservationColumn");
                data = new GenerateNumberTransform(Environment, data, samplingKeyColumn, seed);
            }
            else
            {
                if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int stratCol))
                {
                    throw Environment.ExceptSchemaMismatch(nameof(samplingKeyColumn), "GroupPreservationColumn", samplingKeyColumn);
                }

                var type = data.Schema[stratCol].Type;
                if (!RangeFilter.IsValidRangeFilterColumnType(Environment, type))
                {
                    // Hash the samplingKeyColumn.
                    // REVIEW: this could currently crash, since Hash only accepts a limited set
                    // of column types. It used to be HashJoin, but we should probably extend Hash
                    // instead of having two hash transformations.
                    var origStratCol = samplingKeyColumn;
                    int tmp;
                    int inc = 0;

                    // Generate a new column with the hashed samplingKeyColumn.
                    while (data.Schema.TryGetColumnIndex(samplingKeyColumn, out tmp))
                    {
                        samplingKeyColumn = string.Format("{0}_{1:000}", origStratCol, ++inc);
                    }
                    HashingEstimator.ColumnInfo columnInfo;
                    if (seed.HasValue)
                    {
                        columnInfo = new HashingEstimator.ColumnInfo(samplingKeyColumn, origStratCol, 30, seed.Value);
                    }
                    else
                    {
                        columnInfo = new HashingEstimator.ColumnInfo(samplingKeyColumn, origStratCol, 30);
                    }
                    data = new HashingEstimator(Environment, columnInfo).Fit(data).Transform(data);
                }
            }
        }
        private void InitMap <T>(T val, ColumnType type, int hashBits = 20, ValueGetter <T> getter = null)
        {
            if (getter == null)
            {
                getter = (ref T dst) => dst = val;
            }
            _inRow = RowImpl.Create(type, getter);
            // One million features is a nice, typical number.
            var info   = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: hashBits);
            var xf     = new HashingTransformer(_env, new[] { info });
            var mapper = xf.GetRowToRowMapper(_inRow.Schema);
            var column = mapper.OutputSchema["Bar"];
            var outRow = mapper.GetRow(_inRow, c => c == column.Index);

            if (type is VectorType)
            {
                _vecGetter = outRow.GetGetter <VBuffer <uint> >(column.Index);
            }
            else
            {
                _getter = outRow.GetGetter <uint>(column.Index);
            }
        }
示例#4
0
        private void HashTestCore <T>(T val, PrimitiveType type, uint expected, uint expectedOrdered, uint expectedOrdered3)
        {
            const int bits = 10;

            var builder = new MetadataBuilder();

            builder.AddPrimitiveValue("Foo", type, val);
            var inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata());

            // First do an unordered hash.
            var info   = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits);
            var xf     = new HashingTransformer(Env, new[] { info });
            var mapper = xf.GetRowToRowMapper(inRow.Schema);

            mapper.OutputSchema.TryGetColumnIndex("Bar", out int outCol);
            var outRow = mapper.GetRow(inRow, c => c == outCol);

            var  getter = outRow.GetGetter <uint>(outCol);
            uint result = 0;

            getter(ref result);
            Assert.Equal(expected, result);

            // Next do an ordered hash.
            info   = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true);
            xf     = new HashingTransformer(Env, new[] { info });
            mapper = xf.GetRowToRowMapper(inRow.Schema);
            mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
            outRow = mapper.GetRow(inRow, c => c == outCol);

            getter = outRow.GetGetter <uint>(outCol);
            getter(ref result);
            Assert.Equal(expectedOrdered, result);

            // Next build up a vector to make sure that hashing is consistent between scalar values
            // at least in the first position, and in the unordered case, the last position.
            const int vecLen   = 5;
            var       denseVec = new VBuffer <T>(vecLen, Utils.CreateArray(vecLen, val));

            builder = new MetadataBuilder();
            builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer <T> dst) => denseVec.CopyTo(ref dst));
            inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata());

            info   = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false);
            xf     = new HashingTransformer(Env, new[] { info });
            mapper = xf.GetRowToRowMapper(inRow.Schema);
            mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
            outRow = mapper.GetRow(inRow, c => c == outCol);

            var            vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol);
            VBuffer <uint> vecResult = default;

            vecGetter(ref vecResult);

            Assert.Equal(vecLen, vecResult.Length);
            // They all should equal this in this case.
            Assert.All(vecResult.DenseValues(), v => Assert.Equal(expected, v));

            // Now do ordered with the dense vector.
            info   = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true);
            xf     = new HashingTransformer(Env, new[] { info });
            mapper = xf.GetRowToRowMapper(inRow.Schema);
            mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
            outRow    = mapper.GetRow(inRow, c => c == outCol);
            vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol);
            vecGetter(ref vecResult);

            Assert.Equal(vecLen, vecResult.Length);
            Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
            Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
            Assert.All(vecResult.DenseValues(), v => Assert.True((v == 0) == (expectedOrdered == 0)));

            // Let's now do a sparse vector.
            var sparseVec = new VBuffer <T>(10, 3, Utils.CreateArray(3, val), new[] { 0, 3, 7 });

            builder = new MetadataBuilder();
            builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer <T> dst) => sparseVec.CopyTo(ref dst));
            inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata());

            info   = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false);
            xf     = new HashingTransformer(Env, new[] { info });
            mapper = xf.GetRowToRowMapper(inRow.Schema);
            mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
            outRow    = mapper.GetRow(inRow, c => c == outCol);
            vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol);
            vecGetter(ref vecResult);

            Assert.Equal(10, vecResult.Length);
            Assert.Equal(expected, vecResult.GetItemOrDefault(0));
            Assert.Equal(expected, vecResult.GetItemOrDefault(3));
            Assert.Equal(expected, vecResult.GetItemOrDefault(7));

            info   = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true);
            xf     = new HashingTransformer(Env, new[] { info });
            mapper = xf.GetRowToRowMapper(inRow.Schema);
            mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol);
            outRow    = mapper.GetRow(inRow, c => c == outCol);
            vecGetter = outRow.GetGetter <VBuffer <uint> >(outCol);
            vecGetter(ref vecResult);

            Assert.Equal(10, vecResult.Length);
            Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
            Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
        }