/// <summary>
/// Applies a hash transform (5 hash bits, seed 42) configured for the column "F1V" to the
/// data view produced by <paramref name="builder"/> and checks every row of the output
/// against the expected hash values.
/// </summary>
/// <param name="builder">Builder that supplies the source data view.</param>
/// <param name="results">
/// Expected hashes: one inner array per row, one entry per slot of that row's vector.
/// </param>
private void TestHashTransformVectorHelper(ArrayDataViewBuilder builder, uint[][] results)
{
    var srcView = builder.GetDataView();

    var col = new HashTransformer.Column { Name = "F1V", HashBits = 5, Seed = 42 };
    var args = new HashTransformer.Arguments { Column = new[] { col } };
    var hashTransform = HashTransformer.Create(Env, args, srcView);

    using (var cursor = hashTransform.GetRowCursor(c => true))
    {
        // NOTE(review): index 1 is presumably the hashed output column that follows the
        // single source column - confirm against the builder used by the callers.
        var resultGetter = cursor.GetGetter<VBuffer<uint>>(1);
        VBuffer<uint> resultRow = new VBuffer<uint>();

        foreach (var expectedRow in results)
        {
            Assert.True(cursor.MoveNext());
            resultGetter(ref resultRow);

            // Assert.Equal reports both values on failure, unlike Assert.True(a == b).
            Assert.Equal(expectedRow.Length, resultRow.Length);
            for (int i = 0; i < expectedRow.Length; i++)
            {
                Assert.Equal(expectedRow[i], resultRow.GetItemOrDefault(i));
            }
        }
    }
}
/// <summary>
/// Builds a single-column data view named "F1" from <paramref name="data"/>, applies a hash
/// transform (5 hash bits, seed 42) to it and checks each output row against
/// <paramref name="results"/>.
/// </summary>
/// <typeparam name="T">Element type of the source column.</typeparam>
/// <param name="data">Values placed in the "F1" source column.</param>
/// <param name="results">Expected hash value for each row, in row order.</param>
/// <param name="type">Column type used when adding <paramref name="data"/> to the builder.</param>
private void TestHashTransformHelper<T>(T[] data, uint[] results, NumberType type)
{
    var builder = new ArrayDataViewBuilder(Env);
    builder.AddColumn("F1", type, data);
    var srcView = builder.GetDataView();

    var col = new HashTransformer.Column { Name = "F1", HashBits = 5, Seed = 42 };
    var args = new HashTransformer.Arguments { Column = new[] { col } };
    var hashTransform = HashTransformer.Create(Env, args, srcView);

    using (var cursor = hashTransform.GetRowCursor(c => true))
    {
        // NOTE(review): index 1 is presumably the hashed output column that follows the
        // single "F1" source column - confirm.
        var resultGetter = cursor.GetGetter<uint>(1);
        uint resultRow = 0;

        foreach (var expectedHash in results)
        {
            Assert.True(cursor.MoveNext());
            resultGetter(ref resultRow);

            // Assert.Equal reports both values on failure, unlike Assert.True(a == b).
            Assert.Equal(expectedHash, resultRow);
        }
    }
}
/// <summary>
/// Validates <paramref name="args"/>, translates them into <see cref="HashTransformer"/>
/// arguments, creates the hash transform over <paramref name="input"/> and hands it to
/// <c>CreateTransformCore</c> together with the requested output kinds.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="args">Transform arguments; at least one column must be specified.</param>
/// <param name="input">Data view the transform is applied to.</param>
/// <returns>The transform returned by <c>CreateTransformCore</c>.</returns>
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("CategoricalHash");
    using (var ch = host.Start("CategoricalHash"))
    {
        host.CheckValue(args, nameof(args));
        host.CheckValue(input, nameof(input));
        host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

        // The bit count must lie in [1, NumBitsLim - 1].
        if (args.HashBits < 1 || args.HashBits >= NumBitsLim)
        {
            throw host.ExceptUserArg(nameof(args.HashBits), "Number of bits must be between 1 and {0}", NumBitsLim - 1);
        }

        // Describe the hashing step: transform-wide settings first, per-column settings below.
        int columnCount = args.Column.Length;
        var hashColumns = new HashTransformer.Column[columnCount];
        var hashArgs = new HashTransformer.Arguments
        {
            HashBits = args.HashBits,
            Seed = args.Seed,
            Ordered = args.Ordered,
            InvertHash = args.InvertHash,
            Column = hashColumns
        };

        for (int index = 0; index < columnCount; index++)
        {
            var source = args.Column[index];
            if (!source.TrySanitize())
            {
                throw host.ExceptUserArg(nameof(Column.Name));
            }

            // After a successful TrySanitize both names are expected to be populated.
            host.Assert(!string.IsNullOrWhiteSpace(source.Name));
            host.Assert(!string.IsNullOrWhiteSpace(source.Source));

            hashColumns[index] = new HashTransformer.Column
            {
                HashBits = source.HashBits,
                Seed = source.Seed,
                Ordered = source.Ordered,
                Name = source.Name,
                Source = source.Source,
                InvertHash = source.InvertHash,
            };
        }

        // Evaluated in the same order as the original argument list.
        var outputKinds = args.Column.Select(col => col.OutputKind).ToList();
        var hashed = HashTransformer.Create(host, hashArgs, input);

        return CreateTransformCore(args.OutputKind, args.Column, outputKinds, hashed, host, args);
    }
}
/// <summary>
/// Prepares the hashing getter used by this class: wraps <paramref name="val"/> in a
/// single-column row named "Foo", maps it through a <see cref="HashTransformer"/> that
/// writes to "Bar", and stores the resulting getter in <c>_vecGetter</c> (vector types)
/// or <c>_getter</c> (everything else).
/// </summary>
/// <typeparam name="T">Type of the value being hashed.</typeparam>
/// <param name="val">Value exposed through the "Foo" input column.</param>
/// <param name="type">Column type of "Foo"; its <c>IsVector</c> flag selects which getter is set.</param>
/// <param name="hashBits">Number of hash bits passed to the transformer.</param>
private void InitMap<T>(T val, ColumnType type, int hashBits = 20)
{
    var fooColumn = RowColumnUtils.GetColumn("Foo", type, ref val);
    _counted = new Counted();
    var sourceRow = RowColumnUtils.GetRow(_counted, fooColumn);

    // The default of 20 hash bits corresponds to roughly one million (2^20) features,
    // which is a nice, typical number.
    var hashInfo = new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: hashBits);
    var transformer = new HashTransformer(_env, new[] { hashInfo });
    var rowMapper = transformer.GetRowToRowMapper(sourceRow.Schema);

    rowMapper.Schema.TryGetColumnIndex("Bar", out int barIndex);
    var hashedRow = rowMapper.GetRow(sourceRow, c => c == barIndex, out var _);

    if (!type.IsVector)
    {
        _getter = hashedRow.GetGetter<uint>(barIndex);
        return;
    }

    _vecGetter = hashedRow.GetGetter<VBuffer<uint>>(barIndex);
}
/// <summary>
/// Builds the n-gram hash extraction pipeline over <paramref name="input"/>. In order:
/// an optional term transform (followed by an optional drop of unknown values), a hash
/// transform writing to temporary columns, an n-gram hash transform producing the requested
/// output columns, and finally a transform that drops the temporary columns.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="args">Extractor arguments; at least one column must be specified.</param>
/// <param name="input">Data view the pipeline is applied to.</param>
/// <param name="termLoaderArgs">
/// Optional term loader settings. When null, no term transform is inserted and the source
/// columns are hashed directly.
/// </param>
/// <returns>The final transform in the chain (the one dropping the temporary columns).</returns>
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, TermLoaderArguments termLoaderArgs = null)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(LoaderSignature);
    host.CheckValue(args, nameof(args));
    host.CheckValue(input, nameof(input));
    host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

    // Every source column is hashed first (with a wide hash, to minimize collisions) and the
    // hashed columns are then fed to an NgramHashTransform.
    // NOTE(review): the per-column HashBits below is 30 while the shared hash arguments use 31;
    // confirm which value is actually intended to take effect.
    bool useTerms = termLoaderArgs != null;
    IDataView view = input;
    List<TermTransform.Column> termCols = useTerms ? new List<TermTransform.Column>() : null;
    var hashColumns = new List<HashTransformer.Column>();

    int colCount = args.Column.Length;
    var ngramHashColumns = new NgramHashTransform.Column[colCount];

    // Each output column may combine several sources (many-to-one). Temporary column names
    // are used for the intermediate results so that a destination name never overwrites
    // one of the source columns.
    var tmpColNames = new string[colCount][];
    for (int iinfo = 0; iinfo < colCount; iinfo++)
    {
        var spec = args.Column[iinfo];
        host.CheckUserArg(!string.IsNullOrWhiteSpace(spec.Name), nameof(spec.Name));
        host.CheckUserArg(
            Utils.Size(spec.Source) > 0 && spec.Source.All(src => !string.IsNullOrWhiteSpace(src)),
            nameof(spec.Source));

        int srcCount = spec.Source.Length;
        var tmpNames = new string[srcCount];
        tmpColNames[iinfo] = tmpNames;

        for (int isrc = 0; isrc < srcCount; isrc++)
        {
            string srcName = spec.Source[isrc];
            string tmpName = input.Schema.GetTempColumnName(srcName);
            tmpNames[isrc] = tmpName;

            if (useTerms)
            {
                termCols.Add(new TermTransform.Column { Name = tmpName, Source = srcName });
            }

            // With a term transform in front, the hash reads (and overwrites) the temporary
            // column; otherwise it reads the original source column.
            hashColumns.Add(
                new HashTransformer.Column
                {
                    Name = tmpName,
                    Source = useTerms ? tmpName : srcName,
                    HashBits = 30,
                    Seed = spec.Seed,
                    Ordered = false,
                    InvertHash = spec.InvertHash
                });
        }

        ngramHashColumns[iinfo] = new NgramHashTransform.Column
        {
            Name = spec.Name,
            Source = tmpNames,
            AllLengths = spec.AllLengths,
            HashBits = spec.HashBits,
            NgramLength = spec.NgramLength,
            RehashUnigrams = false,
            Seed = spec.Seed,
            SkipLength = spec.SkipLength,
            Ordered = spec.Ordered,
            InvertHash = spec.InvertHash,
            // REVIEW: internal workaround so that descriptions can show the *original*
            // source names instead of the generated temporary names. Doing this through
            // metadata would be more elaborate and is not obviously better.
            FriendlyNames = spec.FriendlyNames
        };
    }

    if (useTerms)
    {
        // One term column was queued per hash column.
        host.Assert(Utils.Size(termCols) == hashColumns.Count);

        var termArgs = new TermTransform.Arguments()
        {
            MaxNumTerms = int.MaxValue,
            Terms = termLoaderArgs.Terms,
            Term = termLoaderArgs.Term,
            DataFile = termLoaderArgs.DataFile,
            Loader = termLoaderArgs.Loader,
            TermsColumn = termLoaderArgs.TermsColumn,
            Sort = termLoaderArgs.Sort,
            Column = termCols.ToArray()
        };
        view = TermTransform.Create(host, termArgs, view);

        if (termLoaderArgs.DropUnknowns)
        {
            // Drop missing values in place: each temporary term column maps onto itself.
            var naDropArgs = new NADropTransform.Arguments
            {
                Column = termCols
                    .Select(tc => new NADropTransform.Column { Name = tc.Name, Source = tc.Name })
                    .ToArray()
            };
            view = new NADropTransform(host, naDropArgs, view);
        }
    }

    // Shared arguments for the hash step covering all temporary columns.
    var hashArgs = new HashTransformer.Arguments
    {
        HashBits = 31,
        Seed = args.Seed,
        Ordered = false,
        Column = hashColumns.ToArray(),
        InvertHash = args.InvertHash
    };
    view = HashTransformer.Create(host, hashArgs, view);

    // Shared arguments for the n-gram hash step.
    var ngramHashArgs = new NgramHashTransform.Arguments
    {
        AllLengths = args.AllLengths,
        HashBits = args.HashBits,
        NgramLength = args.NgramLength,
        SkipLength = args.SkipLength,
        RehashUnigrams = false,
        Ordered = args.Ordered,
        Seed = args.Seed,
        Column = ngramHashColumns,
        InvertHash = args.InvertHash
    };
    view = new NgramHashTransform(host, ngramHashArgs, view);

    // Remove every temporary column introduced above.
    var dropArgs = new DropColumnsTransform.Arguments()
    {
        Column = tmpColNames.SelectMany(cols => cols).ToArray()
    };
    return new DropColumnsTransform(host, dropArgs, view);
}
/// <summary>
/// Hashes <paramref name="val"/> with 10 hash bits as a scalar, as a dense vector and as a
/// sparse vector, in both unordered and ordered mode, and checks the results against the
/// supplied expectations.
/// </summary>
/// <typeparam name="T">Type of the value being hashed.</typeparam>
/// <param name="val">Value to hash.</param>
/// <param name="type">Primitive column type corresponding to <typeparamref name="T"/>.</param>
/// <param name="expected">Expected unordered hash (scalar and every filled vector slot).</param>
/// <param name="expectedOrdered">Expected ordered hash of the scalar and of vector slot 0.</param>
/// <param name="expectedOrdered3">Expected ordered hash of vector slot 3.</param>
private void HashTestCore<T>(T val, PrimitiveType type, uint expected, uint expectedOrdered, uint expectedOrdered3)
{
    const int bits = 10;

    var col = RowColumnUtils.GetColumn("Foo", type, ref val);
    var inRow = RowColumnUtils.GetRow(new Counted(), col);

    // First do an unordered hash.
    var info = new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits);
    var xf = new HashTransformer(Env, new[] { info });
    var mapper = xf.GetRowToRowMapper(inRow.Schema);
    // Fail loudly if the output column is missing instead of silently reading column 0.
    Assert.True(mapper.Schema.TryGetColumnIndex("Bar", out int outCol));
    var outRow = mapper.GetRow(inRow, c => c == outCol, out var _);

    var getter = outRow.GetGetter<uint>(outCol);
    uint result = 0;
    getter(ref result);
    Assert.Equal(expected, result);

    // Next do an ordered hash.
    info = new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true);
    xf = new HashTransformer(Env, new[] { info });
    mapper = xf.GetRowToRowMapper(inRow.Schema);
    Assert.True(mapper.Schema.TryGetColumnIndex("Bar", out outCol));
    outRow = mapper.GetRow(inRow, c => c == outCol, out var _);

    getter = outRow.GetGetter<uint>(outCol);
    getter(ref result);
    Assert.Equal(expectedOrdered, result);

    // Next build up a vector to make sure that hashing is consistent between scalar values
    // at least in the first position, and in the unordered case, the last position.
    const int vecLen = 5;
    var denseVec = new VBuffer<T>(vecLen, Utils.CreateArray(vecLen, val));
    col = RowColumnUtils.GetColumn("Foo", new VectorType(type, vecLen), ref denseVec);
    inRow = RowColumnUtils.GetRow(new Counted(), col);

    info = new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: false);
    xf = new HashTransformer(Env, new[] { info });
    mapper = xf.GetRowToRowMapper(inRow.Schema);
    Assert.True(mapper.Schema.TryGetColumnIndex("Bar", out outCol));
    outRow = mapper.GetRow(inRow, c => c == outCol, out var _);

    var vecGetter = outRow.GetGetter<VBuffer<uint>>(outCol);
    VBuffer<uint> vecResult = default;
    vecGetter(ref vecResult);

    Assert.Equal(vecLen, vecResult.Length);
    // They all should equal this in this case.
    Assert.All(vecResult.DenseValues(), v => Assert.Equal(expected, v));

    // Now do ordered with the dense vector.
    info = new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true);
    xf = new HashTransformer(Env, new[] { info });
    mapper = xf.GetRowToRowMapper(inRow.Schema);
    Assert.True(mapper.Schema.TryGetColumnIndex("Bar", out outCol));
    outRow = mapper.GetRow(inRow, c => c == outCol, out var _);
    vecGetter = outRow.GetGetter<VBuffer<uint>>(outCol);
    vecGetter(ref vecResult);

    Assert.Equal(vecLen, vecResult.Length);
    Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
    Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
    // Every slot must be zero exactly when the ordered scalar hash is zero.
    Assert.All(vecResult.DenseValues(), v => Assert.True((v == 0) == (expectedOrdered == 0)));

    // Let's now do a sparse vector. The declared vector type uses the same length as the
    // buffer itself so that the schema and the data agree.
    const int sparseLen = 10;
    var sparseVec = new VBuffer<T>(sparseLen, 3, Utils.CreateArray(3, val), new[] { 0, 3, 7 });
    col = RowColumnUtils.GetColumn("Foo", new VectorType(type, sparseLen), ref sparseVec);
    inRow = RowColumnUtils.GetRow(new Counted(), col);

    info = new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: false);
    xf = new HashTransformer(Env, new[] { info });
    mapper = xf.GetRowToRowMapper(inRow.Schema);
    Assert.True(mapper.Schema.TryGetColumnIndex("Bar", out outCol));
    outRow = mapper.GetRow(inRow, c => c == outCol, out var _);
    vecGetter = outRow.GetGetter<VBuffer<uint>>(outCol);
    vecGetter(ref vecResult);

    Assert.Equal(sparseLen, vecResult.Length);
    Assert.Equal(expected, vecResult.GetItemOrDefault(0));
    Assert.Equal(expected, vecResult.GetItemOrDefault(3));
    Assert.Equal(expected, vecResult.GetItemOrDefault(7));

    // And finally ordered with the sparse vector.
    info = new HashTransformer.ColumnInfo("Foo", "Bar", hashBits: bits, ordered: true);
    xf = new HashTransformer(Env, new[] { info });
    mapper = xf.GetRowToRowMapper(inRow.Schema);
    Assert.True(mapper.Schema.TryGetColumnIndex("Bar", out outCol));
    outRow = mapper.GetRow(inRow, c => c == outCol, out var _);
    vecGetter = outRow.GetGetter<VBuffer<uint>>(outCol);
    vecGetter(ref vecResult);

    Assert.Equal(sparseLen, vecResult.Length);
    Assert.Equal(expectedOrdered, vecResult.GetItemOrDefault(0));
    Assert.Equal(expectedOrdered3, vecResult.GetItemOrDefault(3));
}