/// <summary>
/// Builds the word-bag pipeline as a chain of estimators: an optional column
/// concatenation (for multi-source columns), word tokenization, and finally
/// n-gram extraction.
/// </summary>
/// <param name="env">Host environment used for registration and checks.</param>
/// <param name="options">Transform options; must declare at least one column.</param>
/// <param name="inputSchema">Schema shape of the input, used to compute the
/// intermediate schema fed to the n-gram extractor.</param>
/// <returns>The composed estimator chain.</returns>
internal static IEstimator<ITransformer> CreateEstimator(IHostEnvironment env, Options options, SchemaShape inputSchema)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(RegistrationName);
    host.CheckValue(options, nameof(options));
    host.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

    // The WordBagTransform is composed of a tokenizing transform followed by an
    // NgramExtractionTransform. Because WordBagTransform maps many columns to one,
    // each column with multiple sources is first merged via a ConcatTransform.
    // REVIEW: To avoid n-grams that cross vector slots, tokenize transforms would
    // need to insert a special token between slots.
    // REVIEW: Producing separate bags for different columns with a shared dictionary
    // would require ConcatTransform to remember the column boundaries.
    int columnCount = options.Columns.Length;
    var tokenizeColumns = new WordTokenizingEstimator.ColumnOptions[columnCount];
    var extractorArgs = new NgramExtractorTransform.Options()
    {
        MaxNumTerms = options.MaxNumTerms,
        NgramLength = options.NgramLength,
        SkipLength = options.SkipLength,
        UseAllLengths = options.UseAllLengths,
        Weighting = options.Weighting,
        Columns = new NgramExtractorTransform.Column[columnCount]
    };

    for (int i = 0; i < columnCount; i++)
    {
        var column = options.Columns[i];
        host.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
        host.CheckUserArg(Utils.Size(column.Source) > 0, nameof(column.Source));
        host.CheckUserArg(column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source));

        // Multi-source columns tokenize the concatenated column (which takes the
        // output name); single-source columns tokenize the source directly.
        var tokenizeSource = column.Source.Length > 1 ? column.Name : column.Source[0];
        tokenizeColumns[i] = new WordTokenizingEstimator.ColumnOptions(column.Name, tokenizeSource);

        extractorArgs.Columns[i] = new NgramExtractorTransform.Column()
        {
            Name = column.Name,
            Source = column.Name,
            MaxNumTerms = column.MaxNumTerms,
            NgramLength = column.NgramLength,
            SkipLength = column.SkipLength,
            Weighting = column.Weighting,
            UseAllLengths = column.UseAllLengths
        };
    }

    // Chain: concat (if needed) -> tokenize -> n-gram extraction. The extractor is
    // created against the schema produced by the preceding stages.
    IEstimator<ITransformer> estimator = NgramExtractionUtils.GetConcatEstimator(host, options.Columns);
    estimator = estimator.Append(new WordTokenizingEstimator(env, tokenizeColumns));
    estimator = estimator.Append(NgramExtractorTransform.CreateEstimator(host, extractorArgs, estimator.GetOutputSchema(inputSchema)));
    return estimator;
}
/// <summary>
/// Creates the word-bag transform by applying, in order: column concatenation
/// (for multi-source columns), word tokenization, and n-gram extraction over
/// the given input view.
/// </summary>
/// <param name="env">Host environment used for registration and checks.</param>
/// <param name="args">Transform arguments; must declare at least one column.</param>
/// <param name="input">The input data view to transform.</param>
/// <returns>The resulting data transform.</returns>
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(RegistrationName);
    host.CheckValue(args, nameof(args));
    host.CheckValue(input, nameof(input));
    host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

    // The WordBagTransform is composed of a tokenizing transform followed by an
    // NgramExtractionTransform. Because WordBagTransform maps many columns to one,
    // each column with multiple sources is first merged via a ConcatTransform.
    // REVIEW: To avoid n-grams that cross vector slots, tokenize transforms would
    // need to insert a special token between slots.
    // REVIEW: Producing separate bags for different columns with a shared dictionary
    // would require ConcatTransform to remember the column boundaries.
    int columnCount = args.Column.Length;
    var tokenizeColumns = new WordTokenizingTransformer.ColumnInfo[columnCount];
    var extractorArgs = new NgramExtractorTransform.Arguments()
    {
        MaxNumTerms = args.MaxNumTerms,
        NgramLength = args.NgramLength,
        SkipLength = args.SkipLength,
        AllLengths = args.AllLengths,
        Weighting = args.Weighting,
        Column = new NgramExtractorTransform.Column[columnCount]
    };

    for (int i = 0; i < columnCount; i++)
    {
        var column = args.Column[i];
        host.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
        host.CheckUserArg(Utils.Size(column.Source) > 0, nameof(column.Source));
        host.CheckUserArg(column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source));

        // Multi-source columns tokenize the concatenated column (which takes the
        // output name); single-source columns tokenize the source directly.
        var tokenizeSource = column.Source.Length > 1 ? column.Name : column.Source[0];
        tokenizeColumns[i] = new WordTokenizingTransformer.ColumnInfo(tokenizeSource, column.Name);

        extractorArgs.Column[i] = new NgramExtractorTransform.Column()
        {
            Name = column.Name,
            Source = column.Name,
            MaxNumTerms = column.MaxNumTerms,
            NgramLength = column.NgramLength,
            SkipLength = column.SkipLength,
            Weighting = column.Weighting,
            AllLengths = column.AllLengths
        };
    }

    // Apply each stage eagerly to the view: concat -> tokenize -> n-gram extraction.
    IDataView view = input;
    view = NgramExtractionUtils.ApplyConcatOnSources(host, args.Column, view);
    view = new WordTokenizingEstimator(env, tokenizeColumns).Fit(view).Transform(view);
    return NgramExtractorTransform.Create(host, extractorArgs, view);
}
/// <summary>
/// Creates an n-gram extraction transform over <paramref name="input"/> for the
/// given columns, delegating to <see cref="NgramExtractorTransform.Create"/> with
/// the stored extractor and term-loader arguments.
/// </summary>
public IDataTransform Create(IHostEnvironment env, IDataView input, ExtractorColumn[] cols)
    => NgramExtractorTransform.Create(env, _extractorArgs, input, cols, _termLoaderArgs);
/// <summary>
/// Builds an n-gram extraction estimator from the stored extractor arguments and
/// the given columns, then fits it to <paramref name="input"/> to produce the
/// resulting transformer.
/// </summary>
public ITransformer Create(IHostEnvironment env, IDataView input, ExtractorColumn[] cols)
{
    var options = NgramExtractorTransform.CreateNgramExtractorOptions(_extractorArgs, cols);
    var estimator = NgramExtractorTransform.CreateEstimator(env, options, SchemaShape.Create(input.Schema), _termLoaderArgs);
    return estimator.Fit(input);
}