/// <summary>
/// Makes sure <paramref name="data"/> exposes a column that can be used as a stratification key
/// and returns the name of that column.
/// </summary>
/// <param name="host">Host used for argument checks, for raising exceptions and for constructing the transforms.</param>
/// <param name="data">The input data; it is replaced by the transformed view whenever a transform is applied.</param>
/// <param name="stratificationColumn">Optional user-provided column. When <see langword="null"/>, a generated number column is used instead.</param>
/// <returns>
/// The name of a freshly created temp column, or <paramref name="stratificationColumn"/> itself when that column
/// is accepted by <see cref="RangeFilter"/> and is either already normalized or not of type Single/Double.
/// </returns>
public static string CreateStratificationColumn(IHost host, ref IDataView data, string stratificationColumn = null)
{
    host.CheckValue(data, nameof(data));
    host.CheckValueOrNull(stratificationColumn);

    // Reserve a name for the key column that does not clash with the existing schema.
    string keyColumnName = data.Schema.GetTempColumnName("StratificationKey");

    // No user column: fall back to a generated number column.
    if (stratificationColumn == null)
    {
        var generatedColumn = new GenerateNumberTransform.Column { Name = keyColumnName };
        var generatorOptions = new GenerateNumberTransform.Options { Columns = new[] { generatedColumn } };
        data = new GenerateNumberTransform(host, generatorOptions, data);
        return keyColumnName;
    }

    var userColumn = data.Schema.GetColumnOrNull(stratificationColumn);
    if (userColumn == null)
    {
        throw host.ExceptSchemaMismatch(nameof(stratificationColumn), "Stratification", stratificationColumn);
    }

    var userType = userColumn.Value.Type;
    if (RangeFilter.IsValidRangeFilterColumnType(host, userType))
    {
        // The column can be used directly unless it is an unnormalized Single/Double column.
        bool alreadyNormalized = data.Schema[stratificationColumn].IsNormalized();
        bool isFloatingPoint = userType == NumberDataViewType.Single || userType == NumberDataViewType.Double;
        if (alreadyNormalized || !isFloatingPoint)
        {
            return stratificationColumn;
        }

        var minMaxOptions = new NormalizingEstimator.MinMaxColumnOptions(keyColumnName, stratificationColumn, ensureZeroUntouched: true);
        var normalizer = new NormalizingEstimator(host, minMaxOptions);
        data = normalizer.Fit(data).Transform(data);
        return keyColumnName;
    }

    // HashingEstimator currently handles all primitive types except for DateTime, DateTimeOffset and TimeSpan,
    // so those are converted to Int64 first.
    // NOTE(review): the same name is passed for both column arguments, so the conversion appears to
    // replace the user's column in the returned view -- confirm this is intended.
    var itemType = userType.GetItemType();
    bool needsConversion = itemType is DateTimeDataViewType
        || itemType is DateTimeOffsetDataViewType
        || itemType is TimeSpanDataViewType;
    if (needsConversion)
    {
        var converter = new TypeConvertingTransformer(host, stratificationColumn, DataKind.Int64, stratificationColumn);
        data = converter.Transform(data);
    }

    // Hash the user column into the key column (30 bits, combine: true).
    var hashOptions = new HashingEstimator.ColumnOptions(keyColumnName, stratificationColumn, 30, combine: true);
    var hasher = new HashingEstimator(host, hashOptions);
    data = hasher.Fit(data).Transform(data);
    return keyColumnName;
}
/// <summary>
/// Describes how the transformer handles one column pair.
/// </summary>
/// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of <paramref name="name"/> is used as the source.</param>
/// <param name="outputKind">Kind of output: bag, indicator vector etc.</param>
/// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
/// <param name="seed">Hashing seed.</param>
/// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
/// <param name="maximumNumberOfInverts">During hashing we construct mappings between original values and the produced hash values.
/// Text representations of the original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
/// <paramref name="maximumNumberOfInverts"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
/// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
public ColumnOptions(
    string name,
    string inputColumnName = null,
    OneHotEncodingEstimator.OutputKind outputKind = Defaults.OutputKind,
    int numberOfBits = Defaults.NumberOfBits,
    uint seed = Defaults.Seed,
    bool useOrderedHashing = Defaults.UseOrderedHashing,
    int maximumNumberOfInverts = Defaults.MaximumNumberOfInverts)
{
    // Without an explicit source column, the output name doubles as the input name.
    var sourceColumnName = inputColumnName ?? name;

    HashingOptions = new HashingEstimator.ColumnOptions(
        name,
        sourceColumnName,
        numberOfBits,
        seed,
        useOrderedHashing,
        maximumNumberOfInverts);
    OutputKind = outputKind;
}
/// <summary>
/// Describes how the transformer handles one column pair.
/// </summary>
/// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of <paramref name="name"/> is used as the source.</param>
/// <param name="outputKind">Kind of output: bag, indicator vector etc.</param>
/// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
/// <param name="seed">Hashing seed.</param>
/// <param name="ordered">Whether the position of each term should be included in the hash.</param>
/// <param name="invertHash">During hashing we construct mappings between original values and the produced hash values.
/// Text representations of the original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
/// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
/// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
public ColumnOptions(
    string name,
    string inputColumnName = null,
    OneHotEncodingTransformer.OutputKind outputKind = Defaults.OutputKind,
    int hashBits = Defaults.HashBits,
    uint seed = Defaults.Seed,
    bool ordered = Defaults.Ordered,
    int invertHash = Defaults.InvertHash)
{
    // Without an explicit source column, the output name doubles as the input name.
    var sourceColumnName = inputColumnName ?? name;

    HashInfo = new HashingEstimator.ColumnOptions(
        name,
        sourceColumnName,
        hashBits,
        seed,
        ordered,
        invertHash);
    OutputKind = outputKind;
}
/// <summary>
/// Creates one <see cref="HashingEstimator.ColumnOptions"/> per entry of <paramref name="columns"/>,
/// all sharing the same bit count, seed and combine setting.
/// </summary>
/// <param name="columns">Columns whose <c>Name</c> and <c>InputColumnName</c> are forwarded to the hashing options.</param>
/// <param name="numberOfBits">Number of hash bits applied to every column.</param>
/// <param name="combine">Combine flag applied to every column.</param>
/// <param name="hashingSeed">Hashing seed applied to every column.</param>
/// <returns>An array with the same length and order as <paramref name="columns"/>.</returns>
private HashingEstimator.ColumnOptions[] InitializeHashingColumnOptions(CountTableEstimator.ColumnOptionsBase[] columns, int numberOfBits, bool combine, uint hashingSeed)
{
    var hashingColumns = new HashingEstimator.ColumnOptions[columns.Length];

    int slot = 0;
    foreach (var source in columns)
    {
        hashingColumns[slot] = new HashingEstimator.ColumnOptions(
            source.Name,
            source.InputColumnName,
            numberOfBits,
            hashingSeed,
            combine: combine);
        slot++;
    }

    return hashingColumns;
}
/// <summary>
/// Creates one <see cref="HashingEstimator.ColumnOptions"/> per entry of <c>options.Columns</c>.
/// </summary>
/// <param name="options">
/// Supplies the columns together with the shared <c>NumberOfBits</c> and <c>HashingSeed</c>.
/// A column's own <c>Combine</c> value wins over <c>options.Combine</c> when it is set.
/// </param>
/// <returns>An array with the same length and order as <c>options.Columns</c>.</returns>
private HashingEstimator.ColumnOptions[] InitializeHashingColumnOptions(Options options)
{
    var sources = options.Columns;
    var hashingColumns = new HashingEstimator.ColumnOptions[sources.Length];

    int slot = 0;
    foreach (var source in sources)
    {
        // Per-column override first, global setting as the fallback.
        var combine = source.Combine ?? options.Combine;

        hashingColumns[slot] = new HashingEstimator.ColumnOptions(
            source.Name,
            source.Source,
            options.NumberOfBits,
            options.HashingSeed,
            combine: combine);
        slot++;
    }

    return hashingColumns;
}
/// <summary>
/// Ensures the provided <paramref name="samplingKeyColumn"/> is valid for <see cref="RangeFilter"/>, hashing or
/// normalizing it if necessary, or creates a new column if <paramref name="samplingKeyColumn"/> is null.
/// </summary>
/// <param name="env">Environment used for raising exceptions and for constructing the transforms.</param>
/// <param name="data">The input data; it is replaced by the transformed view whenever a transform is applied.</param>
/// <param name="samplingKeyColumn">On input, the user-provided key column or null. On output, the name of the column to use.</param>
/// <param name="seed">Optional seed, forwarded (cast to unsigned) to the number generator or to the hash.</param>
internal static void EnsureGroupPreservationColumn(IHostEnvironment env, ref IDataView data, ref string samplingKeyColumn, int? seed = null)
{
    // Two cases: without a key column a generated number column is used,
    // otherwise the provided column is hashed or normalized as needed.
    if (samplingKeyColumn == null)
    {
        samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn");
        data = new GenerateNumberTransform(env, data, samplingKeyColumn, (uint?)seed);
        return;
    }

    if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int keyIndex))
    {
        throw env.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn);
    }

    var keyType = data.Schema[keyIndex].Type;
    if (RangeFilter.IsValidRangeFilterColumnType(env, keyType))
    {
        // Only an unnormalized Single/Double column needs any work here.
        bool isFloatingPoint = keyType == NumberDataViewType.Single || keyType == NumberDataViewType.Double;
        if (data.Schema[samplingKeyColumn].IsNormalized() || !isFloatingPoint)
        {
            return;
        }

        var unnormalizedName = samplingKeyColumn;
        samplingKeyColumn = data.Schema.GetTempColumnName(samplingKeyColumn);
        var minMaxOptions = new NormalizingEstimator.MinMaxColumnOptions(samplingKeyColumn, unnormalizedName, ensureZeroUntouched: true);
        data = new NormalizingEstimator(env, minMaxOptions).Fit(data).Transform(data);
        return;
    }

    // Hash the samplingKeyColumn into a new temp column.
    // REVIEW: this could currently crash, since Hash only accepts a limited set
    // of column types. It used to be HashJoin, but we should probably extend Hash
    // instead of having two hash transformations.
    var unhashedName = samplingKeyColumn;
    samplingKeyColumn = data.Schema.GetTempColumnName(samplingKeyColumn);

    // Only pass a seed when the caller supplied one, so the estimator's own default applies otherwise.
    HashingEstimator.ColumnOptions hashOptions = seed.HasValue
        ? new HashingEstimator.ColumnOptions(samplingKeyColumn, unhashedName, 30, (uint)seed.Value)
        : new HashingEstimator.ColumnOptions(samplingKeyColumn, unhashedName, 30);
    data = new HashingEstimator(env, hashOptions).Fit(data).Transform(data);
}
/// <summary>
/// Ensures the provided <paramref name="samplingKeyColumn"/> is valid for <see cref="RangeFilter"/>, hashing or
/// normalizing it if necessary, or creates a new column if <paramref name="samplingKeyColumn"/> is null.
/// </summary>
/// <param name="data">The input data; it is replaced by the transformed view whenever a transform is applied.</param>
/// <param name="samplingKeyColumn">On input, the user-provided key column or null. On output, the name of the column to use.</param>
/// <param name="seed">Optional seed, forwarded to the number generator or to the hash.</param>
private void EnsureGroupPreservationColumn(ref IDataView data, ref string samplingKeyColumn, uint? seed = null)
{
    // Two cases: without a key column a generated number column is used,
    // otherwise the provided column is hashed or normalized as needed.
    if (samplingKeyColumn == null)
    {
        samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn");
        data = new GenerateNumberTransform(Environment, data, samplingKeyColumn, seed);
        return;
    }

    if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int stratCol))
    {
        throw Environment.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn);
    }

    var type = data.Schema[stratCol].Type;
    if (!RangeFilter.IsValidRangeFilterColumnType(Environment, type))
    {
        // Hash the samplingKeyColumn into a new temp column.
        // REVIEW: this could currently crash, since Hash only accepts a limited set
        // of column types. It used to be HashJoin, but we should probably extend Hash
        // instead of having two hash transformations.
        var origStratCol = samplingKeyColumn;
        samplingKeyColumn = data.Schema.GetTempColumnName(samplingKeyColumn);

        // Only pass a seed when the caller supplied one, so the estimator's own default applies otherwise.
        HashingEstimator.ColumnOptions columnOptions = seed.HasValue
            ? new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, seed.Value)
            : new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30);
        data = new HashingEstimator(Environment, columnOptions).Fit(data).Transform(data);
    }
    else if (!data.Schema[samplingKeyColumn].IsNormalized()
        && (type == NumberDataViewType.Single || type == NumberDataViewType.Double))
    {
        // A Single/Double key column that is not marked as normalized is min-max normalized into
        // a new temp column before use, matching the static overload of this method.
        var origStratCol = samplingKeyColumn;
        samplingKeyColumn = data.Schema.GetTempColumnName(samplingKeyColumn);
        data = new NormalizingEstimator(Environment,
            new NormalizingEstimator.MinMaxColumnOptions(samplingKeyColumn, origStratCol, ensureZeroUntouched: true))
            .Fit(data).Transform(data);
    }
}
/// <summary>
/// Prepares the input row and the hashing getter used afterwards: builds a row with one value of
/// <paramref name="type"/>, maps its "Foo" column to a hashed "Bar" column, and stores the resulting
/// getter in <c>_vecGetter</c> (vector input) or <c>_getter</c> (any other input).
/// </summary>
/// <param name="val">Constant value served by the default getter.</param>
/// <param name="type">Type of the input value.</param>
/// <param name="numberOfBits">Number of hash bits; the default of 20 gives roughly one million hash slots.</param>
/// <param name="getter">Optional custom getter; when null, a getter that always yields <paramref name="val"/> is used.</param>
private void InitMap<T>(T val, DataViewType type, int numberOfBits = 20, ValueGetter<T> getter = null)
{
    // Fall back to a getter that simply hands out the supplied constant.
    getter = getter ?? ((ref T dst) => dst = val);
    _inRow = RowImpl.Create(type, getter);

    // One million features is a nice, typical number.
    var hashColumn = new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: numberOfBits);
    ITransformer transformer = new HashingTransformer(_env, new[] { hashColumn });
    var rowMapper = transformer.GetRowToRowMapper(_inRow.Schema);

    var hashedColumn = rowMapper.OutputSchema["Bar"];
    var hashedRow = rowMapper.GetRow(_inRow, hashedColumn);

    // Keep whichever getter matches the shape of the input.
    if (!(type is VectorType))
    {
        _getter = hashedRow.GetGetter<uint>(hashedColumn);
    }
    else
    {
        _vecGetter = hashedRow.GetGetter<VBuffer<uint>>(hashedColumn);
    }
}
/// <summary>
/// Hashes <paramref name="val"/> as a scalar, as a dense vector and as a sparse vector, in unordered,
/// ordered and combined mode (10 hash bits), and checks each result against the supplied expectations.
/// </summary>
/// <param name="val">Value to hash.</param>
/// <param name="type">Primitive type of <paramref name="val"/>.</param>
/// <param name="expected">Expected unordered hash of the value.</param>
/// <param name="expectedOrdered">Expected ordered hash of the scalar and of vector slot 0.</param>
/// <param name="expectedOrdered3">Expected ordered hash of vector slot 3.</param>
/// <param name="expectedCombined">Expected combined hash of the dense vector.</param>
/// <param name="expectedCombinedSparse">Expected combined hash of the sparse vector.</param>
private void HashTestCore<T>(T val, PrimitiveDataViewType type, uint expected, uint expectedOrdered, uint expectedOrdered3, uint expectedCombined, uint expectedCombinedSparse)
{
    const int bits = 10;
    const int vecLen = 5;

    // Start with a one-row input whose "Foo" column holds the scalar value.
    var scalarBuilder = new DataViewSchema.Annotations.Builder();
    scalarBuilder.AddPrimitiveValue("Foo", type, val);
    var inRow = AnnotationUtils.AnnotationsAsRow(scalarBuilder.ToAnnotations());

    // Result holders shared by all checks below.
    uint scalarHash = 0;
    VBuffer<uint> vectorHash = default;

    // Creates a getter for the hashed "Bar" column over whatever inRow currently refers to.
    ValueGetter<TType> CreateHashGetter<TType>(HashingEstimator.ColumnOptions columnOptions)
    {
        ITransformer transformer = new HashingTransformer(Env, new[] { columnOptions });
        var rowMapper = transformer.GetRowToRowMapper(inRow.Schema);
        var barColumn = rowMapper.OutputSchema["Bar"];
        return rowMapper.GetRow(inRow, barColumn).GetGetter<TType>(barColumn);
    }

    // Hashes the current input to a single value and compares it with the expectation.
    void CheckScalarHash(HashingEstimator.ColumnOptions columnOptions, uint wanted)
    {
        var scalarGetter = CreateHashGetter<uint>(columnOptions);
        scalarGetter(ref scalarHash);
        Assert.Equal(wanted, scalarHash);
    }

    // Hashes the current vector input into vectorHash.
    void ComputeVectorHash(HashingEstimator.ColumnOptions columnOptions)
    {
        var vectorGetter = CreateHashGetter<VBuffer<uint>>(columnOptions);
        vectorGetter(ref vectorHash);
    }

    // Replaces the input row with one whose "Foo" column serves the given vector.
    void UseVectorInput(VBuffer<T> source)
    {
        var vectorBuilder = new DataViewSchema.Annotations.Builder();
        vectorBuilder.Add("Foo", new VectorDataViewType(type, vecLen), (ref VBuffer<T> dst) => source.CopyTo(ref dst));
        inRow = AnnotationUtils.AnnotationsAsRow(vectorBuilder.ToAnnotations());
    }

    // Scalar input: unordered hash first, then ordered.
    CheckScalarHash(new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits), expected);
    CheckScalarHash(new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, useOrderedHashing: true), expectedOrdered);

    // Next build up a vector to make sure that hashing is consistent between scalar values
    // at least in the first position, and in the unordered case, the last position.
    UseVectorInput(new VBuffer<T>(vecLen, Utils.CreateArray(vecLen, val)));

    ComputeVectorHash(new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, useOrderedHashing: false));
    Assert.Equal(vecLen, vectorHash.Length);
    // Unordered hashing ignores position, so every slot must carry the scalar hash.
    Assert.All(vectorHash.DenseValues(), v => Assert.Equal(expected, v));

    // Ordered hashing over the dense vector.
    ComputeVectorHash(new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, useOrderedHashing: true));
    Assert.Equal(vecLen, vectorHash.Length);
    Assert.Equal(expectedOrdered, vectorHash.GetItemOrDefault(0));
    Assert.Equal(expectedOrdered3, vectorHash.GetItemOrDefault(3));
    Assert.All(vectorHash.DenseValues(), v => Assert.True((v == 0) == (expectedOrdered == 0)));

    // Combine the whole dense vector into one hash.
    CheckScalarHash(new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, combine: true), expectedCombined);

    // Now a sparse vector: length 10 with explicit values at slots 0, 3 and 7.
    // NOTE(review): the column type is still declared with size vecLen (5) although the buffer
    // has length 10 -- confirm this mismatch is intended.
    UseVectorInput(new VBuffer<T>(10, 3, Utils.CreateArray(3, val), new[] { 0, 3, 7 }));

    ComputeVectorHash(new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, useOrderedHashing: false));
    Assert.Equal(10, vectorHash.Length);
    Assert.Equal(expected, vectorHash.GetItemOrDefault(0));
    Assert.Equal(expected, vectorHash.GetItemOrDefault(3));
    Assert.Equal(expected, vectorHash.GetItemOrDefault(7));

    // Ordered hashing over the sparse vector.
    ComputeVectorHash(new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, useOrderedHashing: true));
    Assert.Equal(10, vectorHash.Length);
    Assert.Equal(expectedOrdered, vectorHash.GetItemOrDefault(0));
    Assert.Equal(expectedOrdered3, vectorHash.GetItemOrDefault(3));

    // Combine the whole sparse vector into one hash.
    CheckScalarHash(new HashingEstimator.ColumnOptions("Bar", "Foo", numberOfBits: bits, combine: true), expectedCombinedSparse);
}