Exemplo n.º 1
0
        /// <summary>
        /// Entry point that builds an <see cref="NgramExtractingTransformer"/> over
        /// <c>input.Data</c> and packages it as a transform output.
        /// </summary>
        /// <param name="env">Host environment used to validate the arguments and create the host.</param>
        /// <param name="input">Transform arguments; <c>input.Data</c> is the data view the transform is applied to.</param>
        /// <returns>
        /// A <see cref="CommonOutputs.TransformOutput"/> whose <c>OutputData</c> is the created transform
        /// and whose <c>Model</c> wraps that transform together with the original input data.
        /// </returns>
        public static CommonOutputs.TransformOutput NGramTransform(IHostEnvironment env, NgramExtractingTransformer.Arguments input)
        {
            // Argument validation and host creation are delegated to the shared entry-point helper.
            var host = EntryPointUtils.CheckArgsAndCreateHost(env, "NGramTransform", input);
            var transform = NgramExtractingTransformer.Create(host, input, input.Data);

            var output = new CommonOutputs.TransformOutput();
            output.Model = new TransformModelImpl(host, transform, input.Data);
            output.OutputData = transform;
            return output;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds the composite transform for the requested columns by chaining three steps:
        /// <c>NgramExtractionUtils.ApplyConcatOnSources</c>, a word tokenizer, and an
        /// <see cref="NgramExtractingTransformer"/>.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="args">Transform arguments; must contain at least one column.</param>
        /// <param name="input">Data view the chain is applied to; must not be null.</param>
        /// <returns>The n-gram extracting transform created over the tokenized view.</returns>
        public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register(RegistrationName);

            host.CheckValue(args, nameof(args));
            host.CheckValue(input, nameof(input));
            host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

            // Compose the WordBagTransform from a tokenize transform,
            // followed by a NgramExtractionTransform.
            // Since WordBagTransform is a many-to-one column transform, for each
            // WordBagTransform.Column with multiple sources, we first apply a ConcatTransform.

            // REVIEW: In order to not get ngrams that cross between vector slots, we need to
            // enable tokenize transforms to insert a special token between slots.

            // REVIEW: In order to make it possible to output separate bags for different columns
            // using the same dictionary, we need to find a way to make ConcatTransform remember the boundaries.

            int columnCount = args.Column.Length;
            var tokenizerColumns = new WordTokenizingTransformer.ColumnInfo[columnCount];
            var ngramColumns = new NgramExtractingTransformer.Column[columnCount];

            for (int i = 0; i < columnCount; i++)
            {
                var current = args.Column[i];
                host.CheckUserArg(!string.IsNullOrWhiteSpace(current.Name), nameof(current.Name));
                host.CheckUserArg(Utils.Size(current.Source) > 0, nameof(current.Source));
                host.CheckUserArg(current.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(current.Source));

                // With several sources the tokenizer reads the column named after the output
                // (presumably produced by the concat step below); with a single source it
                // reads that source directly.
                string tokenizerInput = current.Source.Length > 1 ? current.Name : current.Source[0];
                tokenizerColumns[i] = new WordTokenizingTransformer.ColumnInfo(tokenizerInput, current.Name);

                // The extractor works in place on the tokenized column: source and name are the same.
                var ngramColumn = new NgramExtractingTransformer.Column();
                ngramColumn.Name = current.Name;
                ngramColumn.Source = current.Name;
                ngramColumn.MaxNumTerms = current.MaxNumTerms;
                ngramColumn.NgramLength = current.NgramLength;
                ngramColumn.SkipLength = current.SkipLength;
                ngramColumn.Weighting = current.Weighting;
                ngramColumn.AllLengths = current.AllLengths;
                ngramColumns[i] = ngramColumn;
            }

            // Transform-wide settings are copied over unchanged from the incoming arguments.
            var extractorArgs = new NgramExtractingTransformer.Arguments();
            extractorArgs.MaxNumTerms = args.MaxNumTerms;
            extractorArgs.NgramLength = args.NgramLength;
            extractorArgs.SkipLength = args.SkipLength;
            extractorArgs.AllLengths = args.AllLengths;
            extractorArgs.Weighting = args.Weighting;
            extractorArgs.Column = ngramColumns;

            IDataView concatenated = NgramExtractionUtils.ApplyConcatOnSources(host, args.Column, input);
            IDataView tokenized = new WordTokenizingEstimator(env, tokenizerColumns).Fit(concatenated).Transform(concatenated);
            return NgramExtractingTransformer.Create(host, extractorArgs, tokenized);
        }