/// <summary>
/// Describes how the transformer handles one column pair.
/// </summary>
/// <param name="input">Name of the input column.</param>
/// <param name="output">Name of the output column.</param>
/// <param name="outputKind">Kind of output to produce: bag, indicator vector etc.</param>
/// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
/// <param name="seed">Seed used for hashing.</param>
/// <param name="ordered">Whether the position of each term should be included in the hash.</param>
/// <param name="invertHash">Upper limit on the number of keys used to generate the slot name.
/// 0 means no invert hashing, -1 means no limit.</param>
public ColumnInfo(
    string input,
    string output,
    CategoricalTransform.OutputKind outputKind = Defaults.OutputKind,
    int hashBits = Defaults.HashBits,
    uint seed = Defaults.Seed,
    bool ordered = Defaults.Ordered,
    int invertHash = Defaults.InvertHash)
{
    // All hashing-related settings are forwarded to the hash transformer's own column description.
    var hashColumn = new HashTransformer.ColumnInfo(input, output, hashBits, seed, ordered, invertHash);

    HashInfo = hashColumn;
    OutputKind = outputKind;
}
/// <summary>
/// Builds a single-column input row named "Foo" around <paramref name="val"/>, runs it through a
/// hash transformer producing column "Bar", and stores the getter for that output column.
/// </summary>
/// <typeparam name="T">Type of the raw value held by the input column.</typeparam>
/// <param name="val">Value placed in the input column.</param>
/// <param name="type">Column type of the input. When it is a vector type the vector getter is
/// stored, otherwise the scalar getter is stored.</param>
/// <param name="hashBits">Number of bits to hash into.</param>
private void InitMap<T>(T val, ColumnType type, int hashBits = 20)
{
    var inputColumn = RowColumnUtils.GetColumn("Foo", type, ref val);
    _counted = new Counted();
    var sourceRow = RowColumnUtils.GetRow(_counted, inputColumn);

    // One million features is a nice, typical number (the default of 20 bits gives 2^20 values).
    var hashColumn = new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBits);
    var transformer = new HashTransformer(_env, new[] { hashColumn });
    var rowMapper = transformer.GetRowToRowMapper(sourceRow.Schema);

    rowMapper.Schema.TryGetColumnIndex("Bar", out int hashedIndex);
    // Only the hashed output column is requested as active.
    var hashedRow = rowMapper.GetRow(sourceRow, column => column == hashedIndex, out var _);

    if (!type.IsVector)
    {
        _getter = hashedRow.GetGetter<uint>(hashedIndex);
        return;
    }

    _vecGetter = hashedRow.GetGetter<VBuffer<uint>>(hashedIndex);
}
/// <summary>
/// Verifies the hash produced for <paramref name="val"/> when it is hashed as a scalar, as every
/// slot of a dense vector, and as the explicit entries of a sparse vector, in both unordered and
/// ordered mode.
/// </summary>
/// <typeparam name="T">Type of the raw value being hashed.</typeparam>
/// <param name="val">Value to hash.</param>
/// <param name="type">Item type used to declare the input column holding <paramref name="val"/>.</param>
/// <param name="expected">Expected hash in unordered mode.</param>
/// <param name="expectedOrdered">Expected hash in ordered mode for the scalar and for slot 0 of a vector.</param>
/// <param name="expectedOrdered3">Expected hash in ordered mode for slot 3 of a vector.</param>
private void HashTestCore<T>(T val, PrimitiveType type, uint expected, uint expectedOrdered, uint expectedOrdered3)
{
    const int bits = 10;
    const int vecLen = 5;
    const int sparseLen = 10;

    var col = RowColumnUtils.GetColumn("Foo", type, ref val);
    var inRow = RowColumnUtils.GetRow(new Counted(), col);

    // Runs the current inRow through a hash transformer configured by the given column info and
    // returns the value read from the "Bar" output column. The row is captured, so reassigning
    // inRow below changes what later calls hash.
    TOut HashInRow<TOut>(HashTransformer.ColumnInfo info)
    {
        var xf = new HashTransformer(Env, new[] { info });
        var mapper = xf.GetRowToRowMapper(inRow.Schema);
        // Fail loudly if the output column is missing instead of silently reading column 0.
        Assert.True(mapper.Schema.TryGetColumnIndex("Bar", out int outCol));
        var outRow = mapper.GetRow(inRow, c => c == outCol, out var _);
        var getter = outRow.GetGetter<TOut>(outCol);
        TOut value = default;
        getter(ref value);
        return value;
    }

    // First do an unordered hash.
    uint result = HashInRow<uint>(new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits));
    Assert.Equal(expected, result);

    // Next do an ordered hash.
    result = HashInRow<uint>(new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true));
    Assert.Equal(expectedOrdered, result);

    // Next build up a vector to make sure that hashing is consistent between scalar values
    // at least in the first position, and in the unordered case, the last position.
    var denseVec = new VBuffer<T>(vecLen, Utils.CreateArray(vecLen, val));
    col = RowColumnUtils.GetColumn("Foo", new VectorType(type, vecLen), ref denseVec);
    inRow = RowColumnUtils.GetRow(new Counted(), col);

    VBuffer<uint> vecResult = HashInRow<VBuffer<uint>>(
        new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: false));
    Assert.Equal(vecLen, vecResult.Length);
    // They all should equal this in this case.
    Assert.All(vecResult.DenseValues(), v => Assert.Equal(expected, v));

    // Now do ordered with the dense vector.
    vecResult = HashInRow<VBuffer<uint>>(
        new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true));
    Assert.Equal(vecLen, vecResult.Length);
    Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
    Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
    // Every slot must be zero exactly when the expected ordered hash is zero.
    Assert.All(vecResult.DenseValues(), v => Assert.True((v == 0) == (expectedOrdered == 0)));

    // Let's now do a sparse vector. The declared vector type uses the sparse buffer's own
    // length so that the schema agrees with the data being served.
    var sparseVec = new VBuffer<T>(sparseLen, 3, Utils.CreateArray(3, val), new[] { 0, 3, 7 });
    col = RowColumnUtils.GetColumn("Foo", new VectorType(type, sparseLen), ref sparseVec);
    inRow = RowColumnUtils.GetRow(new Counted(), col);

    vecResult = HashInRow<VBuffer<uint>>(
        new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: false));
    Assert.Equal(sparseLen, vecResult.Length);
    Assert.Equal(expected, vecResult.GetItemOrDefault(0));
    Assert.Equal(expected, vecResult.GetItemOrDefault(3));
    Assert.Equal(expected, vecResult.GetItemOrDefault(7));

    vecResult = HashInRow<VBuffer<uint>>(
        new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true));
    Assert.Equal(sparseLen, vecResult.Length);
    Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
    Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
}