/// <summary>
 /// Creates an estimator that converts a text column into a hash-based one-hot encoded vector.
 /// </summary>
 /// <param name="catalog">The categorical transform catalog being extended.</param>
 /// <param name="outputColumnName">Name of the column produced by transforming <paramref name="inputColumnName"/>.</param>
 /// <param name="inputColumnName">Name of the column to transform. When <see langword="null"/>,
 /// <paramref name="outputColumnName"/> is used as the source column.</param>
 /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
 /// <param name="maximumNumberOfInverts">During hashing, mappings between the original values and the produced hash values are constructed.
 /// Text representations of the original values are stored in the slot names of the metadata for the new column.
 /// Hashing, as such, can map many initial values to one.
 /// <paramref name="maximumNumberOfInverts"/> specifies the upper bound on the number of distinct input values mapping to a hash that should be retained.
 /// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
 /// <param name="outputKind">The conversion mode.</param>
 /// <returns>A <see cref="OneHotHashEncodingEstimator"/> configured for the given column.</returns>
 public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog,
     string outputColumnName,
     string inputColumnName = null,
     int numberOfBits = OneHotHashEncodingEstimator.Defaults.NumberOfBits,
     int maximumNumberOfInverts = OneHotHashEncodingEstimator.Defaults.MaximumNumberOfInverts,
     OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.OutputKind.Indicator)
 {
     var environment = CatalogUtils.GetEnvironment(catalog);

     // Fall back to the output column name when no explicit source column is given.
     string sourceColumnName = inputColumnName ?? outputColumnName;

     return new OneHotHashEncodingEstimator(environment, outputColumnName, sourceColumnName, numberOfBits, maximumNumberOfInverts, outputKind);
 }
Esempio n. 2
0
 /// <summary>
 /// Creates an estimator that converts a text column into a one-hot encoded vector.
 /// </summary>
 /// <param name="catalog">The categorical transform catalog being extended.</param>
 /// <param name="outputColumnName">Name of the column produced by transforming <paramref name="inputColumnName"/>.</param>
 /// <param name="inputColumnName">Name of the column to transform. When <see langword="null"/>,
 /// <paramref name="outputColumnName"/> is used as the source column.</param>
 /// <param name="outputKind">Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector.</param>
 /// <param name="maximumNumberOfKeys">Maximum number of terms to keep per column when auto-training.</param>
 /// <param name="keyOrdinality">How items should be ordered when vectorized.
 /// With <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> they are kept in the order encountered.
 /// With <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/> items are sorted according to their default comparison;
 /// for example, text sorting is case sensitive ('A' then 'Z' then 'a').</param>
 /// <param name="keyData">Specifies an ordering for the encoding. If specified, this should be a single-column data view,
 /// and the key values are taken from that column. If unspecified, the ordering is determined from the input data upon fitting.</param>
 /// <returns>A <see cref="OneHotEncodingEstimator"/> configured for the given column.</returns>
 /// <example>
 /// <format type="text/markdown">
 /// <![CDATA[
 ///  [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs)]
 /// ]]></format>
 /// </example>
 public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
     string outputColumnName,
     string inputColumnName = null,
     OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.Defaults.OutKind,
     int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys,
     ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality,
     IDataView keyData = null)
 {
     var environment = CatalogUtils.GetEnvironment(catalog);

     // A single column is described by a one-element options array.
     var singleColumn = new OneHotEncodingEstimator.ColumnOptions(outputColumnName, inputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality);

     return new OneHotEncodingEstimator(environment, new[] { singleColumn }, keyData);
 }
Esempio n. 3
0
 /// <summary>
 /// Initializes a <see cref="OneHotHashEncodingEstimator"/> for a single column by delegating to the
 /// column-options constructor.
 /// </summary>
 /// <param name="env">Host environment.</param>
 /// <param name="outputColumnName">Name of the column produced by transforming <paramref name="inputColumnName"/>.</param>
 /// <param name="inputColumnName">Name of the column to transform.
 /// When <see langword="null"/>, <paramref name="outputColumnName"/> is used as the source column.</param>
 /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
 /// <param name="maximumNumberOfInverts">During hashing, mappings between the original values and the produced hash values are constructed.
 /// Text representations of the original values are stored in the slot names of the metadata for the new column.
 /// Hashing, as such, can map many initial values to one.
 /// <paramref name="maximumNumberOfInverts"/> specifies the upper bound on the number of distinct input values mapping to a hash that should be retained.
 /// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
 /// <param name="outputKind">The type of output expected.</param>
 internal OneHotHashEncodingEstimator(IHostEnvironment env,
     string outputColumnName,
     string inputColumnName = null,
     int numberOfBits = Defaults.NumberOfBits,
     int maximumNumberOfInverts = Defaults.MaximumNumberOfInverts,
     OneHotEncodingEstimator.OutputKind outputKind = Defaults.OutputKind)
     : this(env, new ColumnOptions(
         name: outputColumnName,
         inputColumnName: inputColumnName ?? outputColumnName,
         outputKind: outputKind,
         numberOfBits: numberOfBits,
         maximumNumberOfInverts: maximumNumberOfInverts))
 {
     // All work happens in the chained constructor; seed and ordered-hashing keep the ColumnOptions defaults.
 }
Esempio n. 4
0
 /// <summary>
 /// Creates an estimator that converts a text column into a hash-based one-hot encoded vector.
 /// </summary>
 /// <param name="catalog">The categorical transform catalog being extended.</param>
 /// <param name="outputColumnName">Name of the column produced by transforming <paramref name="inputColumnName"/>.</param>
 /// <param name="inputColumnName">Name of the column to transform. When <see langword="null"/>,
 /// <paramref name="outputColumnName"/> is used as the source column.</param>
 /// <param name="outputKind">The conversion mode.</param>
 /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
 /// <param name="seed">Hashing seed.</param>
 /// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
 /// <param name="maximumNumberOfInverts">During hashing, mappings between the original values and the produced hash values are constructed.
 /// Text representations of the original values are stored in the slot names of the metadata for the new column.
 /// Hashing, as such, can map many initial values to one.
 /// <paramref name="maximumNumberOfInverts"/> specifies the upper bound on the number of distinct input values mapping to a hash that should be retained.
 /// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
 /// <returns>A <see cref="OneHotHashEncodingEstimator"/> configured for the given column.</returns>
 /// <example>
 /// <format type="text/markdown">
 /// <![CDATA[
 ///  [!code-csharp[OneHotHashEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotHashEncoding.cs)]
 /// ]]></format>
 /// </example>
 public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog,
     string outputColumnName,
     string inputColumnName = null,
     OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.OutputKind.Indicator,
     int numberOfBits = OneHotHashEncodingEstimator.Defaults.NumberOfBits,
     uint seed = OneHotHashEncodingEstimator.Defaults.Seed,
     bool useOrderedHashing = OneHotHashEncodingEstimator.Defaults.UseOrderedHashing,
     int maximumNumberOfInverts = OneHotHashEncodingEstimator.Defaults.MaximumNumberOfInverts)
 {
     var environment = CatalogUtils.GetEnvironment(catalog);

     // A single column is described by a one-element options array.
     var singleColumn = new OneHotHashEncodingEstimator.ColumnOptions(
         outputColumnName, inputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts);

     return new OneHotHashEncodingEstimator(environment, new[] { singleColumn });
 }
Esempio n. 5
0
 /// <summary>
 /// Describes how the transformer handles one column pair: the hashing configuration plus the kind of output to produce.
 /// </summary>
 /// <param name="name">Name of the column produced by transforming <paramref name="inputColumnName"/>.</param>
 /// <param name="inputColumnName">Name of the column to transform. When <see langword="null"/>,
 /// <paramref name="name"/> is used as the source column.</param>
 /// <param name="outputKind">Kind of output: bag, indicator vector etc.</param>
 /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
 /// <param name="seed">Hashing seed.</param>
 /// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
 /// <param name="maximumNumberOfInverts">During hashing, mappings between the original values and the produced hash values are constructed.
 /// Text representations of the original values are stored in the slot names of the metadata for the new column.
 /// Hashing, as such, can map many initial values to one.
 /// <paramref name="maximumNumberOfInverts"/> specifies the upper bound on the number of distinct input values mapping to a hash that should be retained.
 /// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
 public ColumnOptions(string name,
     string inputColumnName = null,
     OneHotEncodingEstimator.OutputKind outputKind = Defaults.OutputKind,
     int numberOfBits = Defaults.NumberOfBits,
     uint seed = Defaults.Seed,
     bool useOrderedHashing = Defaults.UseOrderedHashing,
     int maximumNumberOfInverts = Defaults.MaximumNumberOfInverts)
 {
     // NOTE(review): this constructor documents an upper bound of 31 for numberOfBits, while the
     // OneHotHashEncoding catalog overloads document 30 - confirm which limit is actually enforced.

     // Fall back to the output column name when no explicit source column is given.
     string sourceColumnName = inputColumnName ?? name;

     OutputKind = outputKind;
     HashingOptions = new HashingEstimator.ColumnOptions(name, sourceColumnName, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts);
 }
Esempio n. 6
0
 /// <summary>
 /// Helper that builds a <see cref="OneHotHashEncodingEstimator"/> for one column, fits it on
 /// <paramref name="input"/>, and returns <paramref name="input"/> transformed by the fitted model.
 /// </summary>
 /// <param name="env">Host environment.</param>
 /// <param name="input">Input <see cref="IDataView"/>. This is the output from a previous transform or loader.</param>
 /// <param name="name">Name of the output column.</param>
 /// <param name="source">Name of the column to be transformed. If this is null, '<paramref name="name"/>' is used.</param>
 /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
 /// <param name="maximumNumberOfInverts">During hashing, mappings between the original values and the produced hash values are constructed.
 /// Text representations of the original values are stored in the slot names of the metadata for the new column.
 /// Hashing, as such, can map many initial values to one.
 /// <paramref name="maximumNumberOfInverts"/> specifies the upper bound on the number of distinct input values mapping to a hash that should be retained.
 /// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
 /// <param name="outputKind">The type of output expected.</param>
 /// <returns>The transformed data, viewed as an <see cref="IDataView"/>.</returns>
 private static IDataView Create(IHostEnvironment env,
     IDataView input,
     string name,
     string source = null,
     int numberOfBits = OneHotHashEncodingEstimator.Defaults.NumberOfBits,
     int maximumNumberOfInverts = OneHotHashEncodingEstimator.Defaults.MaximumNumberOfInverts,
     OneHotEncodingEstimator.OutputKind outputKind = OneHotHashEncodingEstimator.Defaults.OutputKind)
 {
     var estimator = new OneHotHashEncodingEstimator(env, name, source, numberOfBits, maximumNumberOfInverts, outputKind);

     // The same data view is used both to fit the estimator and as the data to transform.
     var fitted = estimator.Fit(input);
     return fitted.Transform(input) as IDataView;
 }
Esempio n. 7
0
        /// <summary>
        /// Creates an estimator that converts several text columns into one-hot encoded vectors,
        /// applying the same settings to every column pair.
        /// </summary>
        /// <param name="catalog">The categorical transform catalog being extended.</param>
        /// <param name="columns">The input/output column pairs on which to apply the transformation. Must not be null.</param>
        /// <param name="outputKind">Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector.</param>
        /// <param name="maximumNumberOfKeys">Maximum number of terms to keep per column when auto-training.</param>
        /// <param name="keyOrdinality">How items should be ordered when vectorized.
        /// With <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence"/> they are kept in the order encountered.
        /// With <see cref="ValueToKeyMappingEstimator.KeyOrdinality.ByValue"/> items are sorted according to their default comparison;
        /// for example, text sorting is case sensitive ('A' then 'Z' then 'a').</param>
        /// <param name="keyData">Specifies an ordering for the encoding. If specified, this should be a single-column data view,
        /// and the key values are taken from that column. If unspecified, the ordering is determined from the input data upon fitting.</param>
        /// <returns>A <see cref="OneHotEncodingEstimator"/> covering all requested columns.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        ///  [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncodingMultiColumn.cs)]
        /// ]]></format>
        /// </example>
        public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
            InputOutputColumnPair[] columns,
            OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.Defaults.OutKind,
            int maximumNumberOfKeys = ValueToKeyMappingEstimator.Defaults.MaximumNumberOfKeys,
            ValueToKeyMappingEstimator.KeyOrdinality keyOrdinality = ValueToKeyMappingEstimator.Defaults.Ordinality,
            IDataView keyData = null)
        {
            var environment = CatalogUtils.GetEnvironment(catalog);
            environment.CheckValue(columns, nameof(columns));

            // Build one options entry per column pair, all sharing the same encoding settings.
            var perColumn = new OneHotEncodingEstimator.ColumnOptions[columns.Length];
            for (int index = 0; index < columns.Length; index++)
            {
                var pair = columns[index];
                perColumn[index] = new OneHotEncodingEstimator.ColumnOptions(
                    pair.OutputColumnName, pair.InputColumnName, outputKind, maximumNumberOfKeys, keyOrdinality);
            }

            return new OneHotEncodingEstimator(environment, perColumn, keyData);
        }
        /// <summary>
        /// Builds a one-hot hash encoding estimator from a JSON component description.
        /// </summary>
        /// <remarks>
        /// Reads the keys <c>OutputColumnName</c>, <c>InputColumnName</c>, <c>OutputKind</c>, <c>NumberOfBits</c>,
        /// <c>Seed</c>, <c>UseOrderedHashing</c> and <c>MaximumNumberOfInverts</c> from
        /// <paramref name="componentObject"/> and forwards them to <c>Transforms.Categorical.OneHotHashEncoding</c>.
        /// </remarks>
        /// <param name="MLContext">The context whose categorical transform catalog is used to create the estimator.</param>
        /// <param name="componentObject">JSON token holding the settings listed above.</param>
        /// <returns>The configured estimator.</returns>
        public static IEstimator<ITransformer> _OneHotHashEncoding(this MLContext MLContext, JToken componentObject)
        {
            var outputName = componentObject.Value<string>("OutputColumnName");
            var inputName = componentObject.Value<string>("InputColumnName");

            // "OutputKind" is stored as text and must name a member of OneHotEncodingEstimator.OutputKind.
            var kindText = componentObject.Value<string>("OutputKind");
            var kind = Enum.Parse<OneHotEncodingEstimator.OutputKind>(kindText);

            var bitCount = componentObject.Value<int>("NumberOfBits");
            var hashSeed = componentObject.Value<uint>("Seed");
            var orderedHashing = componentObject.Value<bool>("UseOrderedHashing");
            var invertLimit = componentObject.Value<int>("MaximumNumberOfInverts");

            var categorical = MLContext.Transforms.Categorical;
            return categorical.OneHotHashEncoding(outputName, inputName, kind, bitCount, hashSeed, orderedHashing, invertLimit);
        }
Esempio n. 9
0
        /// <summary>
        /// Creates an estimator that converts several text columns into hash-based one-hot encoded vector columns,
        /// applying the same settings to every column pair.
        /// </summary>
        /// <param name="catalog">The categorical transform catalog being extended.</param>
        /// <param name="columns">The input/output column pairs on which to apply the transformation. Must not be null.</param>
        /// <param name="outputKind">The conversion mode.</param>
        /// <param name="numberOfBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
        /// <param name="seed">Hashing seed.</param>
        /// <param name="useOrderedHashing">Whether the position of each term should be included in the hash.</param>
        /// <param name="maximumNumberOfInverts">During hashing, mappings between the original values and the produced hash values are constructed.
        /// Text representations of the original values are stored in the slot names of the metadata for the new column.
        /// Hashing, as such, can map many initial values to one.
        /// <paramref name="maximumNumberOfInverts"/> specifies the upper bound on the number of distinct input values mapping to a hash that should be retained.
        /// <c>0</c> does not retain any input values. <c>-1</c> retains all input values mapping to each hash.</param>
        /// <returns>A <see cref="OneHotHashEncodingEstimator"/> covering all requested columns.</returns>
        /// <example>
        /// <format type="text/markdown">
        /// <![CDATA[
        ///  [!code-csharp[OneHotHashEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotHashEncodingMultiColumn.cs)]
        /// ]]></format>
        /// </example>
        public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog,
            InputOutputColumnPair[] columns,
            OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.OutputKind.Indicator,
            int numberOfBits = OneHotHashEncodingEstimator.Defaults.NumberOfBits,
            uint seed = OneHotHashEncodingEstimator.Defaults.Seed,
            bool useOrderedHashing = OneHotHashEncodingEstimator.Defaults.UseOrderedHashing,
            int maximumNumberOfInverts = OneHotHashEncodingEstimator.Defaults.MaximumNumberOfInverts)
        {
            var environment = CatalogUtils.GetEnvironment(catalog);
            environment.CheckValue(columns, nameof(columns));

            // Build one options entry per column pair, all sharing the same hashing and encoding settings.
            var perColumn = new OneHotHashEncodingEstimator.ColumnOptions[columns.Length];
            for (int index = 0; index < columns.Length; index++)
            {
                var pair = columns[index];
                perColumn[index] = new OneHotHashEncodingEstimator.ColumnOptions(
                    pair.OutputColumnName, pair.InputColumnName, outputKind, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts);
            }

            return new OneHotHashEncodingEstimator(environment, perColumn);
        }
Esempio n. 10
0
        /// <summary>
        /// Initializes a <see cref="OneHotHashEncodingEstimator"/> from per-column options.
        /// </summary>
        /// <remarks>
        /// Builds a hashing estimator over every column's <c>HashingOptions</c>, then prepares the follow-up
        /// estimator according to each column's output kind: binary columns go to a
        /// <see cref="KeyToBinaryVectorMappingEstimator"/>, indicator and bag columns go to a
        /// <see cref="KeyToVectorMappingEstimator"/>, and key columns get no follow-up step.
        /// When both follow-up estimators are needed, the vector mapping is chained before the binary mapping.
        /// </remarks>
        /// <param name="env">Host environment. Must not be null.</param>
        /// <param name="columns">Description of each column pair to hash and encode. Must not be null.</param>
        internal OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnOptions[] columns)
        {
            Contracts.CheckValue(env, nameof(env));
            // Validate before doing any work so a null array is reported against 'columns'
            // instead of surfacing from the LINQ projection below.
            env.CheckValue(columns, nameof(columns));

            // NOTE(review): the host is registered under ValueToKeyMappingEstimator's name rather than this
            // type's; it looks like a copy/paste leftover. Left unchanged because it alters the host name - confirm.
            _host = env.Register(nameof(ValueToKeyMappingEstimator));
            _hash = new HashingEstimator(_host, columns.Select(x => x.HashingOptions).ToArray());
            using (var ch = _host.Start(nameof(OneHotHashEncodingEstimator)))
            {
                var binaryCols = new List<(string outputColumnName, string inputColumnName)>();
                var cols = new List<(string outputColumnName, string inputColumnName, bool bag)>();
                foreach (var column in columns)
                {
                    // The follow-up mapping reads and writes the same column name.
                    // Presumably this is the column the hashing step produced - verify against HashingEstimator.
                    string hashedName = column.HashingOptions.Name;
                    switch (column.OutputKind)
                    {
                        case OneHotEncodingEstimator.OutputKind.Key:
                            // Key output needs no follow-up mapping.
                            continue;

                        case OneHotEncodingEstimator.OutputKind.Binary:
                            if (column.HashingOptions.MaximumNumberOfInverts != 0)
                            {
                                ch.Warning("Invert hashing is being used with binary encoding.");
                            }
                            binaryCols.Add((hashedName, hashedName));
                            break;

                        case OneHotEncodingEstimator.OutputKind.Indicator:
                            cols.Add((hashedName, hashedName, false));
                            break;

                        case OneHotEncodingEstimator.OutputKind.Bag:
                            cols.Add((hashedName, hashedName, true));
                            break;

                        default:
                            throw _host.ExceptUserArg(nameof(column.OutputKind));
                    }
                }

                IEstimator<ITransformer> toBinVector = null;
                IEstimator<ITransformer> toVector = null;
                if (binaryCols.Count > 0)
                {
                    toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryCols.ToArray());
                }
                if (cols.Count > 0)
                {
                    toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingEstimator.ColumnOptions(x.outputColumnName, x.inputColumnName, x.bag)).ToArray());
                }

                // Combine whichever follow-up estimators exist; this stays null when every column is Key.
                if (toBinVector != null && toVector != null)
                {
                    _toSomething = toVector.Append(toBinVector);
                }
                else if (toBinVector != null)
                {
                    _toSomething = toBinVector;
                }
                else
                {
                    _toSomething = toVector;
                }
            }
        }
Esempio n. 11
0
 /// <summary>
 /// Creates an estimator that converts a text column into a one-hot encoded vector.
 /// </summary>
 /// <param name="catalog">The categorical transform catalog being extended.</param>
 /// <param name="outputColumnName">Name of the column produced by transforming <paramref name="inputColumnName"/>.</param>
 /// <param name="inputColumnName">Name of the column to transform. When <see langword="null"/>,
 /// <paramref name="outputColumnName"/> is used as the source column.</param>
 /// <param name="outputKind">The conversion mode.</param>
 /// <returns>A <see cref="OneHotEncodingEstimator"/> configured for the given column.</returns>
 /// <example>
 /// <format type="text/markdown">
 /// <![CDATA[
 ///  [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs)]
 /// ]]></format>
 /// </example>
 public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
     string outputColumnName,
     string inputColumnName = null,
     OneHotEncodingEstimator.OutputKind outputKind = OneHotEncodingEstimator.OutputKind.Indicator)
 {
     var environment = CatalogUtils.GetEnvironment(catalog);
     return new OneHotEncodingEstimator(environment, outputColumnName, inputColumnName, outputKind);
 }