Exemplo n.º 1
        /// <summary>
        /// Builds a three-stage transformer chain for the given columns: a word tokenizer fitted on
        /// <paramref name="input"/>, an n-gram hash extractor created over the tokenized view, and a
        /// column selector that is handed the temporary column names to drop.
        /// </summary>
        /// <param name="env">Host environment; checked for null and used to register the host.</param>
        /// <param name="options">Transform options; <c>options.Columns</c> must contain at least one column.</param>
        /// <param name="input">Data view the tokenizer is fitted on; its schema is used to generate unique temporary names.</param>
        /// <returns>A <c>TransformerChain</c> of the tokenizer, the n-gram hash extractor and the column selector.</returns>
        internal static ITransformer CreateTransformer(IHostEnvironment env, Options options, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(RegistrationName);

            h.CheckValue(options, nameof(options));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            // To each input column to the WordHashBagTransform, a tokenize transform is applied,
            // followed by applying WordHashVectorizeTransform.
            // Since WordHashBagTransform is a many-to-one column transform, for each
            // WordHashBagTransform.Column we may need to define multiple tokenize transform columns.
            // NgramHashExtractorTransform may need to define an identical number of HashTransform.Columns.
            // The intermediate columns are dropped at the end of using a DropColumnsTransform.
            IDataView view = input;

            var uniqueSourceNames = NgramExtractionUtils.GenerateUniqueSourceNames(h, options.Columns, view.Schema);
            Contracts.Assert(uniqueSourceNames.Length == options.Columns.Length);

            var colCount = options.Columns.Length;
            var tokenizeColumns = new List<WordTokenizingEstimator.ColumnOptions>();
            var extractorCols = new NgramHashExtractingTransformer.Column[colCount];
            var tmpColNames = new List<string>();

            for (int iinfo = 0; iinfo < colCount; iinfo++)
            {
                var column = options.Columns[iinfo];
                int srcCount = column.Source.Length;
                var curTmpNames = new string[srcCount];
                Contracts.Assert(uniqueSourceNames[iinfo].Length == srcCount);

                // One tokenizer column per source: each source is tokenized into its own temporary column.
                for (int isrc = 0; isrc < srcCount; isrc++)
                {
                    curTmpNames[isrc] = uniqueSourceNames[iinfo][isrc];
                    tokenizeColumns.Add(new WordTokenizingEstimator.ColumnOptions(curTmpNames[isrc], column.Source[isrc]));
                }

                // Record the temporary names so the final selector actually drops them.
                // Without this the drop list stays empty and the intermediate columns remain in the output.
                tmpColNames.AddRange(curTmpNames);

                extractorCols[iinfo] =
                    new NgramHashExtractingTransformer.Column
                    {
                        Name = column.Name,
                        Source = curTmpNames,
                        NumberOfBits = column.NumberOfBits,
                        NgramLength = column.NgramLength,
                        Seed = column.Seed,
                        SkipLength = column.SkipLength,
                        Ordered = column.Ordered,
                        MaximumNumberOfInverts = column.MaximumNumberOfInverts,
                        // The original source names are passed as friendly names, since Source holds the temporary names.
                        FriendlyNames = column.Source,
                        UseAllLengths = column.UseAllLengths
                    };
            }

            ITransformer t1 = new WordTokenizingEstimator(env, tokenizeColumns.ToArray()).Fit(view);

            var featurizeArgs =
                new NgramHashExtractingTransformer.Options
                {
                    UseAllLengths = options.UseAllLengths,
                    NumberOfBits = options.NumberOfBits,
                    NgramLength = options.NgramLength,
                    SkipLength = options.SkipLength,
                    Ordered = options.Ordered,
                    Seed = options.Seed,
                    Columns = extractorCols.ToArray(),
                    MaximumNumberOfInverts = options.MaximumNumberOfInverts
                };

            // The extractor is created against the tokenized view, so it sees the temporary columns.
            view = t1.Transform(view);
            ITransformer t2 = NgramHashExtractingTransformer.Create(h, featurizeArgs, view);

            // Since we added columns with new names, we need to explicitly drop them before we return the IDataTransform.
            ITransformer t3 = new ColumnSelectingTransformer(env, null, tmpColNames.ToArray());

            return new TransformerChain<ITransformer>(new[] { t1, t2, t3 });
        }
Exemplo n.º 2
        /// <summary>
        /// Composes the word-bag transform: sources are passed through
        /// <c>NgramExtractionUtils.ApplyConcatOnSources</c>, the result is word-tokenized, and an
        /// n-gram extractor is created over the tokenized view.
        /// </summary>
        /// <param name="env">Host environment; checked for null and used to register the host.</param>
        /// <param name="args">Transform arguments; <c>args.Columns</c> must be non-empty, and every column needs a non-blank name and non-blank sources.</param>
        /// <param name="input">The data view to transform.</param>
        /// <returns>The data transform returned by <c>NgramExtractorTransform.Create</c>.</returns>
        public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register(RegistrationName);

            host.CheckValue(args, nameof(args));
            host.CheckValue(input, nameof(input));
            host.CheckUserArg(Utils.Size(args.Columns) > 0, nameof(args.Columns), "Columns must be specified");

            // The word bag is a tokenizer followed by an n-gram extractor. It is a many-to-one
            // column transform, so a column with several sources is concatenated first.

            // REVIEW: To avoid n-grams that span vector slots, the tokenizer would need to
            // insert a special token between slots.

            // REVIEW: To output separate bags for different columns from one shared dictionary,
            // the concat step would need a way to remember the source boundaries.

            int columnCount = args.Columns.Length;
            var tokenizerInfos = new WordTokenizingTransformer.ColumnInfo[columnCount];
            var ngramColumns = new NgramExtractorTransform.Column[columnCount];

            var ngramArgs =
                new NgramExtractorTransform.Arguments()
                {
                    MaxNumTerms = args.MaxNumTerms,
                    NgramLength = args.NgramLength,
                    SkipLength = args.SkipLength,
                    AllLengths = args.AllLengths,
                    Weighting = args.Weighting,
                    Columns = ngramColumns
                };

            for (int i = 0; i < columnCount; i++)
            {
                var col = args.Columns[i];
                host.CheckUserArg(!string.IsNullOrWhiteSpace(col.Name), nameof(col.Name));
                host.CheckUserArg(Utils.Size(col.Source) > 0, nameof(col.Source));
                host.CheckUserArg(col.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(col.Source));

                // With several sources the tokenizer reads the column named after the output
                // (presumably produced by the concat step below); otherwise it reads the single source.
                string tokenizerInput = col.Source.Length > 1 ? col.Name : col.Source[0];
                tokenizerInfos[i] = new WordTokenizingTransformer.ColumnInfo(col.Name, tokenizerInput);

                // The extractor works in place on the tokenized column.
                ngramColumns[i] =
                    new NgramExtractorTransform.Column()
                    {
                        Name = col.Name,
                        Source = col.Name,
                        MaxNumTerms = col.MaxNumTerms,
                        NgramLength = col.NgramLength,
                        SkipLength = col.SkipLength,
                        Weighting = col.Weighting,
                        AllLengths = col.AllLengths
                    };
            }

            IDataView concatenated = NgramExtractionUtils.ApplyConcatOnSources(host, args.Columns, input);
            IDataView tokenized = new WordTokenizingEstimator(env, tokenizerInfos).Fit(concatenated).Transform(concatenated);
            return NgramExtractorTransform.Create(host, ngramArgs, tokenized);
        }
        /// <summary>
        /// Composes the word-hash-bag transform: every source of every column is word-tokenized into a
        /// uniquely named temporary column, an n-gram hash extractor is created over those temporary
        /// columns, and the temporary column names are then passed to <c>SelectColumnsTransform.CreateDrop</c>.
        /// </summary>
        /// <param name="env">Host environment; checked for null and used to register the host.</param>
        /// <param name="args">Transform arguments; <c>args.Column</c> must contain at least one column.</param>
        /// <param name="input">The data view to transform; its schema is used to generate unique temporary names.</param>
        /// <returns>The data transform returned by <c>SelectColumnsTransform.CreateDrop</c>.</returns>
        public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(RegistrationName);

            h.CheckValue(args, nameof(args));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

            // To each input column to the WordHashBagTransform, a tokenize transform is applied,
            // followed by applying WordHashVectorizeTransform.
            // Since WordHashBagTransform is a many-to-one column transform, for each
            // WordHashBagTransform.Column we may need to define multiple tokenize transform columns.
            // NgramHashExtractorTransform may need to define an identical number of HashTransform.Columns.
            // The intermediate columns are dropped at the end of using a DropColumnsTransform.
            IDataView view = input;

            var uniqueSourceNames = NgramExtractionUtils.GenerateUniqueSourceNames(h, args.Column, view.Schema);
            Contracts.Assert(uniqueSourceNames.Length == args.Column.Length);

            var colCount = args.Column.Length;
            // A list rather than a fixed-size array: there is one tokenizer entry per source, and a
            // column may have several sources. Indexing an array by column would keep only the last one.
            var tokenizeColumns = new List<WordTokenizeTransform.ColumnInfo>();
            var extractorCols = new NgramHashExtractorTransform.Column[colCount];
            var tmpColNames = new List<string>();

            for (int iinfo = 0; iinfo < colCount; iinfo++)
            {
                var column = args.Column[iinfo];
                int srcCount = column.Source.Length;
                var curTmpNames = new string[srcCount];
                Contracts.Assert(uniqueSourceNames[iinfo].Length == srcCount);

                for (int isrc = 0; isrc < srcCount; isrc++)
                {
                    curTmpNames[isrc] = uniqueSourceNames[iinfo][isrc];
                    tokenizeColumns.Add(new WordTokenizeTransform.ColumnInfo(column.Source[isrc], curTmpNames[isrc]));
                }

                // Record the temporary names so the final drop step actually removes them.
                // Without this the drop list stays empty and the intermediate columns remain in the output.
                tmpColNames.AddRange(curTmpNames);

                extractorCols[iinfo] =
                    new NgramHashExtractorTransform.Column
                    {
                        Name = column.Name,
                        Source = curTmpNames,
                        HashBits = column.HashBits,
                        NgramLength = column.NgramLength,
                        Seed = column.Seed,
                        SkipLength = column.SkipLength,
                        Ordered = column.Ordered,
                        InvertHash = column.InvertHash,
                        // The original source names are passed as friendly names, since Source holds the temporary names.
                        FriendlyNames = column.Source,
                        AllLengths = column.AllLengths
                    };
            }

            view = new WordTokenizingEstimator(env, tokenizeColumns.ToArray()).Fit(view).Transform(view);

            var featurizeArgs =
                new NgramHashExtractorTransform.Arguments
                {
                    AllLengths = args.AllLengths,
                    HashBits = args.HashBits,
                    NgramLength = args.NgramLength,
                    SkipLength = args.SkipLength,
                    Ordered = args.Ordered,
                    Seed = args.Seed,
                    Column = extractorCols.ToArray(),
                    InvertHash = args.InvertHash
                };

            view = NgramHashExtractorTransform.Create(h, featurizeArgs, view);

            // Since we added columns with new names, we need to explicitly drop them before we return the IDataTransform.
            return SelectColumnsTransform.CreateDrop(h, view, tmpColNames.ToArray());
        }