/// <summary>
/// Creates a count target encoding estimator in which all columns use the single
/// <paramref name="countTableBuilder"/> supplied here.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="labelColumnName">The name of the label column.</param>
/// <param name="columnOptions">The per-column options; also forwarded unchanged to the chained constructor.</param>
/// <param name="countTableBuilder">The builder handed to the inner <see cref="CountTableEstimator"/>.</param>
/// <param name="numberOfBits">The number of bits to hash the input into; forwarded to the chained constructor.</param>
/// <param name="combine">Whether vector inputs are combined into a single hash; forwarded to the chained constructor.</param>
/// <param name="hashingSeed">The seed used for hashing; forwarded to the chained constructor.</param>
internal CountTargetEncodingEstimator(
    IHostEnvironment env,
    string labelColumnName,
    CountTableEstimator.SharedColumnOptions[] columnOptions,
    CountTableBuilderBase countTableBuilder,
    int numberOfBits = HashingEstimator.Defaults.NumberOfBits,
    bool combine = HashingEstimator.Defaults.Combine,
    uint hashingSeed = HashingEstimator.Defaults.Seed)
    : this(
        env,
        new CountTableEstimator(
            env,
            labelColumnName,
            countTableBuilder,
            // The inner count table estimator gets each column's Name as BOTH its output and its
            // input name. NOTE(review): presumably because an earlier hashing step has already
            // written that column - confirm against the chained constructor.
            columnOptions
                .Select(options => new CountTableEstimator.SharedColumnOptions(
                    options.Name,
                    options.Name,
                    options.PriorCoefficient,
                    options.LaplaceScale,
                    options.Seed))
                .ToArray()),
        columnOptions,
        numberOfBits,
        combine,
        hashingSeed)
{
}
/// <summary>
/// Transforms a categorical column into a set of features that includes the count of each label class,
/// the log-odds for each label class and the back-off indicator.
/// </summary>
/// <param name="catalog">The transforms catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
/// Must not be null or empty.</param>
/// <param name="inputColumnName">Name of the column to transform. If <see langword="null"/> or empty,
/// the value of <paramref name="outputColumnName"/> is used as the source.</param>
/// <param name="labelColumn">The name of the label column.</param>
/// <param name="builder">The builder that creates the count tables from the training data.
/// If <see langword="null"/>, a new <see cref="CMCountTableBuilder"/> is used.</param>
/// <param name="priorCoefficient">The coefficient with which to apply the prior smoothing to the features.</param>
/// <param name="laplaceScale">The Laplacian noise diversity/scale-parameter. Recommended values are between 0 and 1. Note that the noise
/// will only be applied if the estimator is part of an <see cref="EstimatorChain{TLastTransformer}"/>, when fitting the next estimator in the chain.</param>
/// <param name="numberOfBits">The number of bits to hash the input into. Must be between 1 and 31, inclusive.</param>
/// <param name="combine">In case the input is a vector column, indicates whether the values should be combined into a single hash to create a single
/// count table, or be left as a vector of hashes with multiple count tables.</param>
/// <param name="hashingSeed">The seed used for hashing the input columns.</param>
/// <returns>A <see cref="CountTargetEncodingEstimator"/> configured for the single requested column.</returns>
public static CountTargetEncodingEstimator CountTargetEncode(
    this TransformsCatalog catalog,
    string outputColumnName,
    string inputColumnName = null,
    string labelColumn = DefaultColumnNames.Label,
    CountTableBuilderBase builder = null,
    float priorCoefficient = CountTableTransformer.Defaults.PriorCoefficient,
    float laplaceScale = CountTableTransformer.Defaults.LaplaceScale,
    int numberOfBits = HashingEstimator.Defaults.NumberOfBits,
    bool combine = HashingEstimator.Defaults.Combine,
    uint hashingSeed = HashingEstimator.Defaults.Seed)
{
    var env = CatalogUtils.GetEnvironment(catalog);
    env.CheckNonEmpty(outputColumnName, nameof(outputColumnName));

    // Fall back to the output name when no distinct source column was given.
    string sourceColumnName = inputColumnName;
    if (string.IsNullOrEmpty(sourceColumnName))
    {
        sourceColumnName = outputColumnName;
    }

    CountTableBuilderBase tableBuilder = builder ?? new CMCountTableBuilder();

    var singleColumn = new CountTableEstimator.ColumnOptions(
        outputColumnName,
        sourceColumnName,
        tableBuilder,
        priorCoefficient,
        laplaceScale);

    return new CountTargetEncodingEstimator(env, labelColumn, new[] { singleColumn }, numberOfBits, combine, hashingSeed);
}
/// <summary>
/// Transforms categorical columns into a set of features that includes the count of each label class,
/// the log-odds for each label class and the back-off indicator.
/// </summary>
/// <param name="catalog">The transforms catalog.</param>
/// <param name="columns">The input and output columns. Must not be <see langword="null"/>.</param>
/// <param name="labelColumn">The name of the label column.</param>
/// <param name="builder">The builder that creates the count tables from the training data.
/// If <see langword="null"/>, a new <see cref="CMCountTableBuilder"/> is used.</param>
/// <param name="priorCoefficient">The coefficient with which to apply the prior smoothing to the features.</param>
/// <param name="laplaceScale">The Laplacian noise diversity/scale-parameter. Recommended values are between 0 and 1. Note that the noise
/// will only be applied if the estimator is part of an <see cref="EstimatorChain{TLastTransformer}"/>, when fitting the next estimator in the chain.</param>
/// <param name="sharedTable">Indicates whether to keep counts for all columns and slots in one shared count table. If true, the keys in the count table
/// will include a hash of the column and slot indices.</param>
/// <param name="numberOfBits">The number of bits to hash the input into. Must be between 1 and 31, inclusive.</param>
/// <param name="combine">In case the input is a vector column, indicates whether the values should be combined into a single hash to create a single
/// count table, or be left as a vector of hashes with multiple count tables.</param>
/// <param name="hashingSeed">The seed used for hashing the input columns.</param>
/// <returns>A <see cref="CountTargetEncodingEstimator"/> configured with one options entry per element of <paramref name="columns"/>.</returns>
public static CountTargetEncodingEstimator CountTargetEncode(
    this TransformsCatalog catalog,
    InputOutputColumnPair[] columns,
    string labelColumn = DefaultColumnNames.Label,
    CountTableBuilderBase builder = null,
    float priorCoefficient = CountTableTransformer.Defaults.PriorCoefficient,
    float laplaceScale = CountTableTransformer.Defaults.LaplaceScale,
    bool sharedTable = CountTableTransformer.Defaults.SharedTable,
    int numberOfBits = HashingEstimator.Defaults.NumberOfBits,
    bool combine = HashingEstimator.Defaults.Combine,
    uint hashingSeed = HashingEstimator.Defaults.Seed)
{
    var env = CatalogUtils.GetEnvironment(catalog);
    env.CheckValue(columns, nameof(columns));

    CountTableBuilderBase tableBuilder = builder ?? new CMCountTableBuilder();
    int columnCount = columns.Length;

    if (!sharedTable)
    {
        // Per-column tables: each column's options carry the builder themselves.
        var perColumnOptions = new CountTableEstimator.ColumnOptions[columnCount];
        for (int index = 0; index < columnCount; index++)
        {
            var pair = columns[index];
            perColumnOptions[index] = new CountTableEstimator.ColumnOptions(
                pair.OutputColumnName,
                pair.InputColumnName,
                tableBuilder,
                priorCoefficient,
                laplaceScale);
        }

        return new CountTargetEncodingEstimator(
            env,
            labelColumn,
            perColumnOptions,
            numberOfBits: numberOfBits,
            combine: combine,
            hashingSeed: hashingSeed);
    }

    // Shared table: the builder is passed once to the estimator rather than per column.
    var sharedOptions = new CountTableEstimator.SharedColumnOptions[columnCount];
    for (int index = 0; index < columnCount; index++)
    {
        var pair = columns[index];
        sharedOptions[index] = new CountTableEstimator.SharedColumnOptions(
            pair.OutputColumnName,
            pair.InputColumnName,
            priorCoefficient,
            laplaceScale);
    }

    return new CountTargetEncodingEstimator(env, labelColumn, sharedOptions, tableBuilder, numberOfBits, combine, hashingSeed);
}
/// <summary>
/// Creates a multi-column count table builder backed by a single internal builder obtained from
/// <paramref name="builder"/>, and records the value count of every input column.
/// </summary>
/// <param name="env">The host environment. Must not be <see langword="null"/>.</param>
/// <param name="inputColumns">The input columns. Must not be <see langword="null"/> or empty.</param>
/// <param name="builder">The builder from which the internal builder is obtained. Must not be <see langword="null"/>.</param>
/// <param name="labelCardinality">The label cardinality passed to <c>builder.GetInternalBuilder</c>.</param>
public BagMultiCountTableBuilder(IHostEnvironment env, DataViewSchema.Column[] inputColumns, CountTableBuilderBase builder, long labelCardinality)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckNonEmpty(inputColumns, nameof(inputColumns));
    // Validate the builder up front so a null argument is reported as an argument error
    // instead of surfacing as a NullReferenceException at the dereference below.
    env.CheckValue(builder, nameof(builder));
    _host = env.Register(LoaderSignature);

    // REVIEW: how to disallow non-zero garbage bin for bag dict count table? Or maybe just ignore?
    _builder = builder.GetInternalBuilder(labelCardinality);
    _colCount = inputColumns.Length;

    // One entry per input column, holding the value count reported by that column's type.
    _slotCount = new int[_colCount];
    for (int i = 0; i < _colCount; i++)
    {
        _slotCount[i] = inputColumns[i].Type.GetValueCount();
    }
}
/// <summary>
/// Creates a count table estimator whose columns use one shared count table builder.
/// Delegates to the (env, labelColumnName, columns) constructor and then stores the builder.
/// </summary>
/// <param name="env">The host environment.</param>
/// <param name="labelColumnName">The name of the label column.</param>
/// <param name="countTableBuilder">The builder stored as the shared builder for this estimator.</param>
/// <param name="columns">The options of the columns to process.</param>
internal CountTableEstimator(
    IHostEnvironment env,
    string labelColumnName,
    CountTableBuilderBase countTableBuilder,
    params SharedColumnOptions[] columns)
    : this(env, labelColumnName, columns)
{
    _sharedBuilder = countTableBuilder;
}
/// <summary>
/// Describes a column that is processed with its own count table builder.
/// </summary>
/// <param name="name">Name of the output column.</param>
/// <param name="inputColumnName">Name of the column to transform.</param>
/// <param name="countTableBuilder">The builder assigned to <see cref="CountTableBuilder"/>.</param>
/// <param name="priorCoefficient">The coefficient with which to apply the prior smoothing to the features.</param>
/// <param name="laplaceScale">The Laplacian noise diversity/scale-parameter.</param>
/// <param name="seed">The seed value forwarded to the base options.</param>
public ColumnOptions(
    string name,
    string inputColumnName,
    CountTableBuilderBase countTableBuilder,
    float priorCoefficient = 1,
    float laplaceScale = 0,
    int seed = 314489979)
    : base(name, inputColumnName, priorCoefficient, laplaceScale, seed)
{
    CountTableBuilder = countTableBuilder;
}