internal static ITransformer Create(IHostEnvironment env, Options options, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(LoaderSignature);

            h.CheckValue(options, nameof(options));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            var chain = new TransformerChain <ITransformer>();

            // To each input column to the NgramHashExtractorArguments, a HashTransform using 31
            // bits (to minimize collisions) is applied first, followed by an NgramHashTransform.

            var hashColumns      = new List <HashingEstimator.ColumnOptions>();
            var ngramHashColumns = new NgramHashingEstimator.ColumnOptions[options.Columns.Length];

            var colCount = options.Columns.Length;

            // The NGramHashExtractor has a ManyToOne column type. To avoid stepping over the source
            // column name when a 'name' destination column name was specified, we use temporary column names.
            string[][] tmpColNames = new string[colCount][];
            for (int iinfo = 0; iinfo < colCount; iinfo++)
            {
                var column = options.Columns[iinfo];
                h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
                h.CheckUserArg(Utils.Size(column.Source) > 0 &&
                               column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source));

                int srcCount = column.Source.Length;
                tmpColNames[iinfo] = new string[srcCount];
                for (int isrc = 0; isrc < srcCount; isrc++)
                {
                    var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]);
                    tmpColNames[iinfo][isrc] = tmpName;

                    hashColumns.Add(new HashingEstimator.ColumnOptions(tmpName, column.Source[isrc],
                                                                       30, column.Seed ?? options.Seed, false, column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts));
                }

                ngramHashColumns[iinfo] =
                    new NgramHashingEstimator.ColumnOptions(column.Name, tmpColNames[iinfo],
                                                            column.NgramLength ?? options.NgramLength,
                                                            column.SkipLength ?? options.SkipLength,
                                                            column.UseAllLengths ?? options.UseAllLengths,
                                                            column.NumberOfBits ?? options.NumberOfBits,
                                                            column.Seed ?? options.Seed,
                                                            column.Ordered ?? options.Ordered,
                                                            column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts);
                ngramHashColumns[iinfo].FriendlyNames = column.FriendlyNames;
            }

            var hashing = new HashingEstimator(h, hashColumns.ToArray()).Fit(input);

            return(chain.Append(hashing)
                   .Append(new NgramHashingEstimator(h, ngramHashColumns).Fit(hashing.Transform(input)))
                   .Append(new ColumnSelectingTransformer(h, null, tmpColNames.SelectMany(cols => cols).ToArray())));
        }
Example #2
0
        internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input,
                                              TermLoaderArguments termLoaderArgs = null)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(LoaderSignature);

            h.CheckValue(options, nameof(options));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            // To each input column to the NgramHashExtractorArguments, a HashTransform using 31
            // bits (to minimize collisions) is applied first, followed by an NgramHashTransform.
            IDataView view = input;

            List <ValueToKeyMappingTransformer.Column> termCols = null;

            if (termLoaderArgs != null)
            {
                termCols = new List <ValueToKeyMappingTransformer.Column>();
            }

            var hashColumns      = new List <HashingEstimator.ColumnOptions>();
            var ngramHashColumns = new NgramHashingEstimator.ColumnOptions[options.Columns.Length];

            var colCount = options.Columns.Length;

            // The NGramHashExtractor has a ManyToOne column type. To avoid stepping over the source
            // column name when a 'name' destination column name was specified, we use temporary column names.
            string[][] tmpColNames = new string[colCount][];
            for (int iinfo = 0; iinfo < colCount; iinfo++)
            {
                var column = options.Columns[iinfo];
                h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
                h.CheckUserArg(Utils.Size(column.Source) > 0 &&
                               column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source));

                int srcCount = column.Source.Length;
                tmpColNames[iinfo] = new string[srcCount];
                for (int isrc = 0; isrc < srcCount; isrc++)
                {
                    var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]);
                    tmpColNames[iinfo][isrc] = tmpName;
                    if (termLoaderArgs != null)
                    {
                        termCols.Add(
                            new ValueToKeyMappingTransformer.Column
                        {
                            Name   = tmpName,
                            Source = column.Source[isrc]
                        });
                    }

                    hashColumns.Add(new HashingEstimator.ColumnOptions(tmpName, termLoaderArgs == null ? column.Source[isrc] : tmpName,
                                                                       30, column.Seed ?? options.Seed, false, column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts));
                }

                ngramHashColumns[iinfo] =
                    new NgramHashingEstimator.ColumnOptions(column.Name, tmpColNames[iinfo],
                                                            column.NgramLength ?? options.NgramLength,
                                                            column.SkipLength ?? options.SkipLength,
                                                            column.UseAllLengths ?? options.UseAllLengths,
                                                            column.NumberOfBits ?? options.NumberOfBits,
                                                            column.Seed ?? options.Seed,
                                                            column.Ordered ?? options.Ordered,
                                                            column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts);
                ngramHashColumns[iinfo].FriendlyNames = column.FriendlyNames;
            }

            if (termLoaderArgs != null)
            {
                h.Assert(Utils.Size(termCols) == hashColumns.Count);
                var termArgs =
                    new ValueToKeyMappingTransformer.Options()
                {
                    MaxNumTerms = int.MaxValue,
                    Term        = termLoaderArgs.Term,
                    Terms       = termLoaderArgs.Terms,
                    DataFile    = termLoaderArgs.DataFile,
                    Loader      = termLoaderArgs.Loader,
                    TermsColumn = termLoaderArgs.TermsColumn,
                    Sort        = termLoaderArgs.Sort,
                    Columns     = termCols.ToArray()
                };
                view = ValueToKeyMappingTransformer.Create(h, termArgs, view);

                if (termLoaderArgs.DropUnknowns)
                {
                    var missingDropColumns = new (string outputColumnName, string inputColumnName)[termCols.Count];