コード例 #1
0
        /// <summary>
        /// Builds the estimator chain for the word-bag transform: the concat estimator returned by
        /// <c>NgramExtractionUtils.GetConcatEstimator</c>, followed by a <c>WordTokenizingEstimator</c>,
        /// followed by the estimator returned by <c>NgramExtractorTransform.CreateEstimator</c>.
        /// </summary>
        /// <param name="env">Host environment; must not be null. A host is registered on it under <c>RegistrationName</c>.</param>
        /// <param name="options">Transform options; must not be null and must contain at least one column.</param>
        /// <param name="inputSchema">
        /// Input schema shape. It is run through the concat + tokenize chain's <c>GetOutputSchema</c> and the
        /// result is handed to the n-gram extractor estimator factory.
        /// </param>
        /// <returns>The composed estimator chain.</returns>
        /// <remarks>
        /// Every column must be non-null, have a non-blank <c>Name</c> and at least one non-blank
        /// <c>Source</c> entry; violations are reported through the host's <c>CheckUserArg</c>.
        /// </remarks>
        internal static IEstimator <ITransformer> CreateEstimator(IHostEnvironment env, Options options, SchemaShape inputSchema)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(RegistrationName);

            h.CheckValue(options, nameof(options));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            // Compose the WordBagTransform from a tokenize transform,
            // followed by a NgramExtractionTransform.
            // Since WordBagTransform is a many-to-one column transform, for each
            // WordBagTransform.Column with multiple sources, we first apply a ConcatTransform.

            // REVIEW: In order to not get n-grams that cross between vector slots, we need to
            // enable tokenize transforms to insert a special token between slots.

            // REVIEW: In order to make it possible to output separate bags for different columns
            // using the same dictionary, we need to find a way to make ConcatTransform remember the boundaries.

            var columns = options.Columns;
            var tokenizeColumns = new WordTokenizingEstimator.ColumnOptions[columns.Length];

            // Transform-wide n-gram settings; the per-column entries are filled in by the loop below.
            var extractorArgs =
                new NgramExtractorTransform.Options()
            {
                MaxNumTerms   = options.MaxNumTerms,
                NgramLength   = options.NgramLength,
                SkipLength    = options.SkipLength,
                UseAllLengths = options.UseAllLengths,
                Weighting     = options.Weighting,
                Columns       = new NgramExtractorTransform.Column[columns.Length]
            };

            for (int iinfo = 0; iinfo < columns.Length; iinfo++)
            {
                var column = columns[iinfo];

                // Reject null entries up front so the caller gets a user-argument error
                // instead of a NullReferenceException from the member accesses below.
                h.CheckUserArg(column != null, nameof(options.Columns), "Column entries must not be null");
                h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
                h.CheckUserArg(Utils.Size(column.Source) > 0, nameof(column.Source));
                h.CheckUserArg(column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source));

                // With several sources the tokenizer reads the column named after the output
                // (presumably the one produced by the concat step); with a single source it
                // reads that source directly. Either way it writes to column.Name.
                var tokenizerInput = column.Source.Length > 1 ? column.Name : column.Source[0];
                tokenizeColumns[iinfo] = new WordTokenizingEstimator.ColumnOptions(column.Name, tokenizerInput);

                // The n-gram extractor works in place on the tokenized column.
                extractorArgs.Columns[iinfo] =
                    new NgramExtractorTransform.Column()
                {
                    Name          = column.Name,
                    Source        = column.Name,
                    MaxNumTerms   = column.MaxNumTerms,
                    NgramLength   = column.NgramLength,
                    SkipLength    = column.SkipLength,
                    Weighting     = column.Weighting,
                    UseAllLengths = column.UseAllLengths
                };
            }

            IEstimator<ITransformer> estimator = NgramExtractionUtils.GetConcatEstimator(h, columns);

            estimator = estimator.Append(new WordTokenizingEstimator(env, tokenizeColumns));

            // The extractor estimator is created against the schema produced by the concat + tokenize steps.
            estimator = estimator.Append(NgramExtractorTransform.CreateEstimator(h, extractorArgs, estimator.GetOutputSchema(inputSchema)));
            return estimator;
        }
コード例 #2
0
        /// <summary>
        /// Builds n-gram extractor options from the stored extractor arguments and the given columns,
        /// creates the corresponding estimator against the schema of <paramref name="input"/>,
        /// and fits it on <paramref name="input"/>.
        /// </summary>
        /// <param name="env">Host environment forwarded to the estimator factory.</param>
        /// <param name="input">Data view whose schema shapes the estimator and on which it is fitted.</param>
        /// <param name="cols">Columns forwarded to <c>NgramExtractorTransform.CreateNgramExtractorOptions</c>.</param>
        /// <returns>The transformer produced by fitting the estimator on <paramref name="input"/>.</returns>
        public ITransformer Create(IHostEnvironment env, IDataView input, ExtractorColumn[] cols)
        {
            var extractorOptions = NgramExtractorTransform.CreateNgramExtractorOptions(_extractorArgs, cols);
            var inputShape = SchemaShape.Create(input.Schema);
            var ngramEstimator = NgramExtractorTransform.CreateEstimator(env, extractorOptions, inputShape, _termLoaderArgs);

            return ngramEstimator.Fit(input);
        }