/// <summary>
/// Describes how the transformer handles one column pair.
/// </summary>
/// <param name="input">Name of input column.</param>
/// <param name="output">Name of output column.</param>
/// <param name="outputKind">Kind of output; stored in <c>OutputKind</c>.</param>
/// <param name="maxNumTerms">Forwarded unchanged to the base constructor (presumably the cap on the number of terms kept; confirm against the base type).</param>
/// <param name="sort">Forwarded unchanged to the base constructor (presumably how terms are ordered; confirm against the base type).</param>
/// <param name="term">Forwarded unchanged to the base constructor; may be <c>null</c> (presumably an explicit list of terms; confirm against the base type).</param>
public ColumnInfo(string input, string output, CategoricalTransform.OutputKind outputKind = Defaults.OutKind,
                   int maxNumTerms = TermEstimator.Defaults.MaxNumTerms, TermTransform.SortOrder sort = TermEstimator.Defaults.Sort,
                   string[] term   = null)
     // NOTE(review): the trailing literal 'true' is a base-constructor flag whose meaning is not visible here.
     : base(input, output, maxNumTerms, sort, term, true)
 {
     OutputKind = outputKind;
 }
 /// <summary>
 /// Describes how the transformer handles one column pair.
 /// The hashing settings are wrapped in a <see cref="HashTransformer.ColumnInfo"/> stored in <c>HashInfo</c>;
 /// the output kind is stored separately in <c>OutputKind</c>.
 /// </summary>
 /// <param name="input">Name of input column.</param>
 /// <param name="output">Name of output column.</param>
 /// <param name="outputKind">Kind of output: bag, indicator vector etc.</param>
 /// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.
 /// NOTE(review): the helper-method docs for this transform state an upper bound of 30; confirm which limit applies.</param>
 /// <param name="seed">Hashing seed.</param>
 /// <param name="ordered">Whether the position of each term should be included in the hash.</param>
 /// <param name="invertHash">Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.</param>
 public ColumnInfo(string input, string output,
                   CategoricalTransform.OutputKind outputKind = Defaults.OutputKind,
                   int hashBits   = Defaults.HashBits,
                   uint seed      = Defaults.Seed,
                   bool ordered   = Defaults.Ordered,
                   int invertHash = Defaults.InvertHash)
 {
     // All hashing-related arguments are handed to the hashing column description as-is.
     HashInfo   = new HashTransformer.ColumnInfo(input, output, hashBits, seed, ordered, invertHash);
     OutputKind = outputKind;
 }
 /// <summary>
 /// A helper method to create <see cref="CategoricalHashTransform"/>.
 /// Builds a <see cref="OneHotHashEncodingEstimator"/> for a single column, fits it on
 /// <paramref name="input"/> and returns the transformed view of that same data.
 /// </summary>
 /// <param name="env">Host Environment.</param>
 /// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
 /// <param name="name">Name of the output column.</param>
 /// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
 /// <param name="hashBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
 /// <param name="invertHash">Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.</param>
 /// <param name="outputKind">The type of output expected.</param>
 /// <returns>The result of fitting the estimator on <paramref name="input"/> and transforming <paramref name="input"/> with it.</returns>
 public static IDataView Create(IHostEnvironment env,
                                IDataView input,
                                string name,
                                string source  = null,
                                int hashBits   = OneHotHashEncodingEstimator.Defaults.HashBits,
                                int invertHash = OneHotHashEncodingEstimator.Defaults.InvertHash,
                                CategoricalTransform.OutputKind outputKind = OneHotHashEncodingEstimator.Defaults.OutputKind)
 {
     // Describe the column explicitly so that hashBits and invertHash are actually honoured
     // (they used to be dropped) and so that source/name map unambiguously onto input/output.
     var column = new OneHotHashEncodingEstimator.ColumnInfo(source ?? name, name, outputKind, hashBits, invertHash: invertHash);

     return new OneHotHashEncodingEstimator(env, column).Fit(input).Transform(input) as IDataView;
 }
        /// <summary>
        /// Creates the estimator from explicit per-column settings: a <see cref="TermEstimator"/> over all
        /// <paramref name="columns"/>, followed by a key-to-vector style estimator over every column whose
        /// output kind is not <see cref="CategoricalTransform.OutputKind.Key"/>.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="columns">Column descriptions. Columns of kind Key are left as produced by the term step.</param>
        public CategoricalEstimator(IHostEnvironment env, params ColumnInfo[] columns)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(TermEstimator));
            _term = new TermEstimator(_host, columns);

            var  cols           = new List<(string input, string output, bool bag)>();
            bool binaryEncoding = false;

            for (int i = 0; i < columns.Length; i++)
            {
                var  column = columns[i];
                bool bag;
                CategoricalTransform.OutputKind kind = column.OutputKind;
                switch (kind)
                {
                default:
                    throw _host.ExceptUserArg(nameof(column.OutputKind));

                case CategoricalTransform.OutputKind.Key:
                    // Key output needs no conversion step.
                    continue;

                case CategoricalTransform.OutputKind.Bin:
                    binaryEncoding = true;
                    bag            = false;
                    break;

                case CategoricalTransform.OutputKind.Ind:
                    bag = false;
                    break;

                case CategoricalTransform.OutputKind.Bag:
                    bag = true;
                    break;
                }

                // The conversion reads and writes the term step's output column in place.
                cols.Add((column.Output, column.Output, bag));
            }

            // Nothing to convert: leave _keyToSomething unassigned, exactly as before.
            if (cols.Count == 0)
            {
                return;
            }

            // Build the conversion estimator once, after all columns are known, instead of
            // re-creating (and discarding) it on every loop iteration.
            // Note: a single Bin column switches every collected column to binary encoding.
            if (binaryEncoding)
            {
                _keyToSomething = new KeyToBinaryVectorEstimator(_host, cols.Select(x => new KeyToBinaryVectorTransform.ColumnInfo(x.input, x.output)).ToArray());
            }
            else
            {
                _keyToSomething = new KeyToVectorEstimator(_host, cols.Select(x => new KeyToVectorTransform.ColumnInfo(x.input, x.output, x.bag)).ToArray());
            }
        }
Exemple #5
0
        /// <summary>
        /// A helper method to create <see cref="CategoricalHashTransform"/> for public facing API.
        /// Packs the single-column settings into an <see cref="Arguments"/> instance and delegates
        /// to the <see cref="Arguments"/>-based <c>Create</c> overload.
        /// </summary>
        /// <param name="env">Host Environment.</param>
        /// <param name="input">Input <see cref="IDataView"/>. This is the output from previous transform or loader.</param>
        /// <param name="name">Name of the output column.</param>
        /// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
        /// <param name="hashBits">Number of bits to hash into. Must be between 1 and 30, inclusive.</param>
        /// <param name="invertHash">Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.</param>
        /// <param name="outputKind">The type of output expected.</param>
        /// <returns>Whatever the <see cref="Arguments"/>-based overload returns for the assembled arguments.</returns>
        public static IDataTransform Create(IHostEnvironment env,
                                            IDataView input,
                                            string name,
                                            string source  = null,
                                            int hashBits   = Defaults.HashBits,
                                            int invertHash = Defaults.InvertHash,
                                            CategoricalTransform.OutputKind outputKind = Defaults.OutputKind)
        {
            // The only column: falls back to the output name when no source is given.
            var singleColumn = new Column
            {
                Source = source ?? name,
                Name   = name
            };

            var arguments = new Arguments
            {
                Column     = new[] { singleColumn },
                HashBits   = hashBits,
                InvertHash = invertHash,
                OutputKind = outputKind
            };

            return Create(env, arguments, input);
        }
        /// <summary>
        /// Creates the estimator from explicit per-column settings: a <see cref="HashingEstimator"/> over every
        /// column's <c>HashInfo</c>, followed by the conversion estimators needed for the requested output kinds.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="columns">Column descriptions. Columns of kind Key get no conversion step.</param>
        public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] columns)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(ValueToKeyMappingEstimator));
            _hash = new HashingEstimator(_host, columns.Select(x => x.HashInfo).ToArray());
            using (var ch = _host.Start(nameof(OneHotHashEncodingEstimator)))
            {
                // Columns are split by target encoding; each group gets its own estimator below.
                var binaryPairs = new List<(string src, string dst)>();
                var vectorPairs = new List<(string src, string dst, bool bag)>();

                foreach (var column in columns)
                {
                    CategoricalTransform.OutputKind kind = column.OutputKind;
                    switch (kind)
                    {
                    case CategoricalTransform.OutputKind.Key:
                        // The hashed key is already the desired output.
                        continue;

                    case CategoricalTransform.OutputKind.Bin:
                        if (column.HashInfo.InvertHash != 0)
                        {
                            ch.Warning("Invert hashing is being used with binary encoding.");
                        }
                        binaryPairs.Add((column.HashInfo.Output, column.HashInfo.Output));
                        break;

                    case CategoricalTransform.OutputKind.Ind:
                        vectorPairs.Add((column.HashInfo.Output, column.HashInfo.Output, false));
                        break;

                    case CategoricalTransform.OutputKind.Bag:
                        vectorPairs.Add((column.HashInfo.Output, column.HashInfo.Output, true));
                        break;

                    default:
                        throw _host.ExceptUserArg(nameof(column.OutputKind));
                    }
                }

                IEstimator<ITransformer> toBinVector = null;
                if (binaryPairs.Count > 0)
                {
                    var binaryInfos = binaryPairs.Select(p => new KeyToBinaryVectorTransform.ColumnInfo(p.src, p.dst)).ToArray();
                    toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryInfos);
                }

                IEstimator<ITransformer> toVector = null;
                if (vectorPairs.Count > 0)
                {
                    var vectorInfos = vectorPairs.Select(p => new KeyToVectorTransform.ColumnInfo(p.src, p.dst, p.bag)).ToArray();
                    toVector = new KeyToVectorMappingEstimator(_host, vectorInfos);
                }

                // Chain both when both exist; otherwise keep whichever exists (null when neither does).
                if (toVector == null)
                {
                    _toSomething = toBinVector;
                }
                else if (toBinVector == null)
                {
                    _toSomething = toVector;
                }
                else
                {
                    _toSomething = toVector.Append(toBinVector);
                }
            }
        }
 /// <summary>
 /// A helper constructor to create <see cref="OneHotHashEncodingEstimator"/> for public facing API.
 /// Wraps the single column in a <see cref="ColumnInfo"/> and delegates to the column-array constructor.
 /// </summary>
 /// <param name="env">Host Environment.</param>
 /// <param name="inputColumn">Name of the input column.</param>
 /// <param name="outputColumn">Name of the output column. If this is null '<paramref name="inputColumn"/>' will be used.</param>
 /// <param name="outputKind">The type of output expected.</param>
 public OneHotHashEncodingEstimator(IHostEnvironment env, string inputColumn,
                                    string outputColumn = null, CategoricalTransform.OutputKind outputKind = Defaults.OutputKind)
     : this(env, new ColumnInfo(inputColumn, outputColumn ?? inputColumn, outputKind))
 {
 }
        /// <summary>
        /// Creates the estimator from explicit per-column settings: a <see cref="TermEstimator"/> over all
        /// <paramref name="columns"/>, followed by the conversion estimators needed for the requested output kinds.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="columns">Column descriptions. Columns of kind Key get no conversion step.</param>
        public CategoricalEstimator(IHostEnvironment env, params ColumnInfo[] columns)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(TermEstimator));
            _term = new TermEstimator(_host, columns);

            // Columns are split by target encoding; each group gets its own estimator below.
            var binaryPairs = new List<(string src, string dst)>();
            var vectorPairs = new List<(string src, string dst, bool bag)>();

            foreach (var column in columns)
            {
                CategoricalTransform.OutputKind kind = column.OutputKind;
                switch (kind)
                {
                case CategoricalTransform.OutputKind.Key:
                    // The key produced by the term step is already the desired output.
                    continue;

                case CategoricalTransform.OutputKind.Bin:
                    binaryPairs.Add((column.Output, column.Output));
                    break;

                case CategoricalTransform.OutputKind.Ind:
                    vectorPairs.Add((column.Output, column.Output, false));
                    break;

                case CategoricalTransform.OutputKind.Bag:
                    vectorPairs.Add((column.Output, column.Output, true));
                    break;

                default:
                    throw _host.ExceptUserArg(nameof(column.OutputKind));
                }
            }

            IEstimator<ITransformer> toBinVector = null;
            if (binaryPairs.Count > 0)
            {
                var binaryInfos = binaryPairs.Select(p => new KeyToBinaryVectorTransform.ColumnInfo(p.src, p.dst)).ToArray();
                toBinVector = new KeyToBinaryVectorEstimator(_host, binaryInfos);
            }

            IEstimator<ITransformer> toVector = null;
            if (vectorPairs.Count > 0)
            {
                var vectorInfos = vectorPairs.Select(p => new KeyToVectorTransform.ColumnInfo(p.src, p.dst, p.bag)).ToArray();
                toVector = new KeyToVectorEstimator(_host, vectorInfos);
            }

            // Chain both when both exist; otherwise keep whichever exists (null when neither does).
            if (toVector == null)
            {
                _toSomething = toBinVector;
            }
            else if (toBinVector == null)
            {
                _toSomething = toVector;
            }
            else
            {
                _toSomething = toVector.Append(toBinVector);
            }
        }
 /// <summary>
 /// A helper constructor to create <see cref="CategoricalEstimator"/> for public facing API.
 /// Wraps the single column in a <see cref="ColumnInfo"/> (input = <paramref name="source"/> or,
 /// when that is null, <paramref name="name"/>; output = <paramref name="name"/>) and delegates
 /// to the column-array constructor.
 /// </summary>
 /// <param name="env">Host Environment.</param>
 /// <param name="name">Name of the output column.</param>
 /// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
 /// <param name="outputKind">The type of output expected.</param>
 public CategoricalEstimator(IHostEnvironment env, string name,
                             string source = null, CategoricalTransform.OutputKind outputKind = Defaults.OutKind)
     : this(env, new ColumnInfo(source ?? name, name, outputKind))
 {
 }
Exemple #10
0
 /// <summary>
 /// Convert a text column into hash-based one-hot encoded vector.
 /// </summary>
 /// <param name="catalog">The transform catalog</param>
 /// <param name="inputColumn">The input column</param>
 /// <param name="outputColumn">The output column. If <c>null</c>, <paramref name="inputColumn"/> is used.</param>
 /// <param name="outputKind">The conversion mode.</param>
 /// <returns>A new <see cref="OneHotHashEncodingEstimator"/> built with the catalog's environment and the given column names and output kind.</returns>
 public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog,
                                                              string inputColumn, string outputColumn = null, CategoricalTransform.OutputKind outputKind = CategoricalTransform.OutputKind.Ind)
 {
     // The estimator needs the host environment that backs the catalog.
     var environment = CatalogUtils.GetEnvironment(catalog);

     return new OneHotHashEncodingEstimator(environment, inputColumn, outputColumn, outputKind);
 }
Exemple #11
0
 /// <summary>
 /// A helper constructor to create <see cref="CategoricalEstimator"/> for public facing API.
 /// Wraps the single column in a one-element <see cref="ColumnInfo"/> array and delegates
 /// to the column-array constructor.
 /// </summary>
 /// <param name="env">Host Environment.</param>
 /// <param name="input">Name of the column to be transformed.</param>
 /// <param name="output">Name of the output column. If this is <c>null</c>, <paramref name="input"/> is used.</param>
 /// <param name="outputKind">The type of output expected.</param>
 public CategoricalEstimator(IHostEnvironment env, string input,
                             string output = null, CategoricalTransform.OutputKind outputKind = Defaults.OutKind)
     : this(env, new[] { new ColumnInfo(input, output ?? input, outputKind) })
 {
 }
        /// <summary>
        /// Wraps <paramref name="input"/> in the key-to-vector style transform required by the requested output kinds.
        /// Each column's effective kind is its entry in <paramref name="columnOutputKinds"/> or, when that entry is
        /// null, <paramref name="argsOutputKind"/>. Columns of kind Key are skipped; if no column remains,
        /// <paramref name="input"/> is returned unchanged.
        /// </summary>
        /// <param name="argsOutputKind">Default output kind used for columns without their own setting.</param>
        /// <param name="columns">Columns to convert; must not be null and must match <paramref name="columnOutputKinds"/> in length.</param>
        /// <param name="columnOutputKinds">Per-column output kind overrides; a null entry means "use <paramref name="argsOutputKind"/>".</param>
        /// <param name="input">The transform whose output is to be converted.</param>
        /// <param name="h">Host used to open the channel, raise argument errors and create the resulting transform.</param>
        /// <param name="catHashArgs">Optional arguments; only <c>InvertHash</c> is read here, to decide whether to emit a warning.</param>
        /// <returns><paramref name="input"/> itself when nothing needs converting, otherwise the newly created transform.</returns>
        private static IDataTransform CreateTransformCore(CategoricalTransform.OutputKind argsOutputKind, OneToOneColumn[] columns,
                                                          List <CategoricalTransform.OutputKind?> columnOutputKinds, IDataTransform input, IHost h, Arguments catHashArgs = null)
        {
            Contracts.CheckValue(columns, nameof(columns));
            Contracts.CheckValue(columnOutputKinds, nameof(columnOutputKinds));
            Contracts.CheckParam(columns.Length == columnOutputKinds.Count, nameof(columns));

            using (var ch = h.Start("Create Transform Core"))
            {
                // Create the KeyToVectorTransform, if needed.
                var  cols           = new List <KeyToVectorTransform.Column>();
                // One flag for the whole call: it starts from the default kind and is also set by any
                // column that resolves to Bin, after which every collected column is binary-encoded.
                bool binaryEncoding = argsOutputKind == CategoricalTransform.OutputKind.Bin;
                for (int i = 0; i < columns.Length; i++)
                {
                    var column = columns[i];
                    if (!column.TrySanitize())
                    {
                        throw h.ExceptUserArg(nameof(Column.Name));
                    }

                    bool?bag;
                    // A per-column setting wins over the argument-level default.
                    CategoricalTransform.OutputKind kind = columnOutputKinds[i] ?? argsOutputKind;
                    switch (kind)
                    {
                    default:
                        throw ch.ExceptUserArg(nameof(Column.OutputKind));

                    case CategoricalTransform.OutputKind.Key:
                        // Key output needs no conversion, so the column is not collected.
                        continue;

                    case CategoricalTransform.OutputKind.Bin:
                        binaryEncoding = true;
                        bag            = false;
                        break;

                    case CategoricalTransform.OutputKind.Ind:
                        bag = false;
                        break;

                    case CategoricalTransform.OutputKind.Bag:
                        bag = true;
                        break;
                    }
                    // Source and Name are both the column's name: the conversion overwrites the column in place.
                    var col = new KeyToVectorTransform.Column();
                    col.Name   = column.Name;
                    col.Source = column.Name;
                    col.Bag    = bag;
                    cols.Add(col);
                }

                // NOTE(review): this early return leaves the channel without calling ch.Done(), unlike the
                // path below; confirm that is intended.
                if (cols.Count == 0)
                {
                    return(input);
                }

                IDataTransform transform;
                if (binaryEncoding)
                {
                    if ((catHashArgs?.InvertHash ?? 0) != 0)
                    {
                        ch.Warning("Invert hashing is being used with binary encoding.");
                    }

                    var keyToBinaryVecCols = cols.Select(x => new KeyToBinaryVectorTransform.ColumnInfo(x.Source, x.Name)).ToArray();
                    transform = KeyToBinaryVectorTransform.Create(h, input, keyToBinaryVecCols);
                }
                else
                {
                    // Bag is always assigned for columns collected above, so the '??' fallback to the
                    // argument-level kind only matters if a Column with a null Bag ever reaches this point.
                    var keyToVecCols = cols.Select(x => new KeyToVectorTransform.ColumnInfo(x.Source, x.Name, x.Bag ?? argsOutputKind == CategoricalTransform.OutputKind.Bag)).ToArray();

                    transform = KeyToVectorTransform.Create(h, input, keyToVecCols);
                }

                ch.Done();
                return(transform);
            }
        }
Exemple #13
0
        /// <summary>
        /// Creates the estimator from explicit per-column settings: a <see cref="ValueToKeyMappingEstimator"/> over all
        /// <paramref name="columns"/>, followed by the conversion estimators needed for the requested output kinds.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="columns">Column descriptions. Columns of kind Key get no conversion step.</param>
        /// <param name="file">Forwarded unchanged to the <see cref="ValueToKeyMappingEstimator"/>.</param>
        /// <param name="termsColumn">Forwarded unchanged to the <see cref="ValueToKeyMappingEstimator"/>.</param>
        /// <param name="loaderFactory">Forwarded unchanged to the <see cref="ValueToKeyMappingEstimator"/>.</param>
        public OneHotEncodingEstimator(IHostEnvironment env, ColumnInfo[] columns,
                                       string file = null, string termsColumn = null,
                                       IComponentFactory <IMultiStreamSource, IDataLoader> loaderFactory = null)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(OneHotEncodingEstimator));
            _term = new ValueToKeyMappingEstimator(_host, columns, file, termsColumn, loaderFactory);

            // Columns are split by target encoding; each group gets its own estimator below.
            var binaryPairs = new List<(string src, string dst)>();
            var vectorPairs = new List<(string src, string dst, bool bag)>();

            foreach (var column in columns)
            {
                CategoricalTransform.OutputKind kind = column.OutputKind;

                if (kind == CategoricalTransform.OutputKind.Key)
                {
                    // The key produced by the mapping step is already the desired output.
                    continue;
                }

                if (kind == CategoricalTransform.OutputKind.Bin)
                {
                    binaryPairs.Add((column.Output, column.Output));
                }
                else if (kind == CategoricalTransform.OutputKind.Ind)
                {
                    vectorPairs.Add((column.Output, column.Output, false));
                }
                else if (kind == CategoricalTransform.OutputKind.Bag)
                {
                    vectorPairs.Add((column.Output, column.Output, true));
                }
                else
                {
                    throw _host.ExceptUserArg(nameof(column.OutputKind));
                }
            }

            IEstimator<ITransformer> toBinVector = null;
            if (binaryPairs.Count > 0)
            {
                var binaryInfos = binaryPairs.Select(p => new KeyToBinaryVectorTransform.ColumnInfo(p.src, p.dst)).ToArray();
                toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryInfos);
            }

            IEstimator<ITransformer> toVector = null;
            if (vectorPairs.Count > 0)
            {
                var vectorInfos = vectorPairs.Select(p => new KeyToVectorTransform.ColumnInfo(p.src, p.dst, p.bag)).ToArray();
                toVector = new KeyToVectorMappingEstimator(_host, vectorInfos);
            }

            // Chain both when both exist; otherwise keep whichever exists (null when neither does).
            if (toVector == null)
            {
                _toSomething = toBinVector;
            }
            else if (toBinVector == null)
            {
                _toSomething = toVector;
            }
            else
            {
                _toSomething = toVector.Append(toBinVector);
            }
        }