/// <summary>
/// Creates the word-hash-bag transform as an <see cref="IDataTransform"/>.
/// Each requested source column is first word-tokenized into a uniquely named temporary
/// column; the ngram-hash extractor then consumes those temporaries. Because this is a
/// many-to-one column transform, a single output column may be fed by several tokenized
/// sources. The temporary columns are dropped again before the transform is returned.
/// </summary>
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(RegistrationName);
    host.CheckValue(args, nameof(args));
    host.CheckValue(input, nameof(input));
    host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

    IDataView data = input;

    // One unique temporary name per (output column, source column) pair.
    var uniqueSourceNames = NgramExtractionUtils.GenerateUniqueSourceNames(host, args.Column, data.Schema);
    Contracts.Assert(uniqueSourceNames.Length == args.Column.Length);

    var columnCount = args.Column.Length;
    var tokenizeCols = new List<WordTokenizingTransformer.ColumnInfo>();
    var extractorCols = new NgramHashExtractingTransformer.Column[columnCount];
    var tempColumnNames = new List<string>();

    for (int i = 0; i < columnCount; i++)
    {
        var column = args.Column[i];
        var sources = column.Source;
        Contracts.Assert(uniqueSourceNames[i].Length == sources.Length);

        var generatedNames = new string[sources.Length];
        for (int j = 0; j < sources.Length; j++)
        {
            generatedNames[j] = uniqueSourceNames[i][j];
            tokenizeCols.Add(new WordTokenizingTransformer.ColumnInfo(sources[j], generatedNames[j]));
        }
        tempColumnNames.AddRange(generatedNames);

        extractorCols[i] = new NgramHashExtractingTransformer.Column
        {
            Name = column.Name,
            Source = generatedNames,
            HashBits = column.HashBits,
            NgramLength = column.NgramLength,
            Seed = column.Seed,
            SkipLength = column.SkipLength,
            Ordered = column.Ordered,
            InvertHash = column.InvertHash,
            // Friendly names map the temporaries back to the user-visible sources.
            FriendlyNames = sources,
            AllLengths = column.AllLengths
        };
    }

    // Tokenize into the temporary columns.
    data = new WordTokenizingEstimator(env, tokenizeCols.ToArray()).Fit(data).Transform(data);

    // Hash-featurize the tokenized temporaries.
    var featurizeArgs = new NgramHashExtractingTransformer.Arguments
    {
        AllLengths = args.AllLengths,
        HashBits = args.HashBits,
        NgramLength = args.NgramLength,
        SkipLength = args.SkipLength,
        Ordered = args.Ordered,
        Seed = args.Seed,
        Column = extractorCols.ToArray(),
        InvertHash = args.InvertHash
    };
    data = NgramHashExtractingTransformer.Create(host, featurizeArgs, data);

    // The temporary tokenized columns are implementation details; drop them explicitly
    // before handing the transform back.
    return ColumnSelectingTransformer.CreateDrop(host, data, tempColumnNames.ToArray());
}
/// <summary>
/// Creates the word-hash-bag transform as an <see cref="ITransformer"/> chain.
/// Mirrors the <see cref="IDataTransform"/> path: every source column is word-tokenized
/// into a uniquely named temporary column, the ngram-hash extractor consumes those
/// temporaries, and a final column-selecting transformer drops them again. The three
/// fitted transformers are returned together as a <see cref="TransformerChain{T}"/>.
/// </summary>
internal static ITransformer CreateTransformer(IHostEnvironment env, Options options, IDataView input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(RegistrationName);
    host.CheckValue(options, nameof(options));
    host.CheckValue(input, nameof(input));
    host.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

    IDataView data = input;

    // One unique temporary name per (output column, source column) pair.
    var uniqueSourceNames = NgramExtractionUtils.GenerateUniqueSourceNames(host, options.Columns, data.Schema);
    Contracts.Assert(uniqueSourceNames.Length == options.Columns.Length);

    var columnCount = options.Columns.Length;
    var tokenizeCols = new List<WordTokenizingEstimator.ColumnOptions>();
    var extractorCols = new NgramHashExtractingTransformer.Column[columnCount];
    var tempColumnNames = new List<string>();

    for (int i = 0; i < columnCount; i++)
    {
        var column = options.Columns[i];
        var sources = column.Source;
        Contracts.Assert(uniqueSourceNames[i].Length == sources.Length);

        var generatedNames = new string[sources.Length];
        for (int j = 0; j < sources.Length; j++)
        {
            generatedNames[j] = uniqueSourceNames[i][j];
            // ColumnOptions takes (output name, input name) here.
            tokenizeCols.Add(new WordTokenizingEstimator.ColumnOptions(generatedNames[j], sources[j]));
        }
        tempColumnNames.AddRange(generatedNames);

        extractorCols[i] = new NgramHashExtractingTransformer.Column
        {
            Name = column.Name,
            Source = generatedNames,
            NumberOfBits = column.NumberOfBits,
            NgramLength = column.NgramLength,
            Seed = column.Seed,
            SkipLength = column.SkipLength,
            Ordered = column.Ordered,
            MaximumNumberOfInverts = column.MaximumNumberOfInverts,
            // Friendly names map the temporaries back to the user-visible sources.
            FriendlyNames = sources,
            UseAllLengths = column.UseAllLengths
        };
    }

    // Stage 1: tokenize into the temporary columns.
    ITransformer tokenizer = new WordTokenizingEstimator(env, tokenizeCols.ToArray()).Fit(data);
    data = tokenizer.Transform(data);

    // Stage 2: hash-featurize the tokenized temporaries.
    var featurizeArgs = new NgramHashExtractingTransformer.Options
    {
        UseAllLengths = options.UseAllLengths,
        NumberOfBits = options.NumberOfBits,
        NgramLength = options.NgramLength,
        SkipLength = options.SkipLength,
        Ordered = options.Ordered,
        Seed = options.Seed,
        Columns = extractorCols.ToArray(),
        MaximumNumberOfInverts = options.MaximumNumberOfInverts
    };
    ITransformer extractor = NgramHashExtractingTransformer.Create(host, featurizeArgs, data);

    // Stage 3: the temporary columns are implementation details; drop them explicitly.
    ITransformer dropper = new ColumnSelectingTransformer(env, null, tempColumnNames.ToArray());

    return new TransformerChain<ITransformer>(new[] { tokenizer, extractor, dropper });
}
/// <summary>
/// Creates the ngram-hash extracting transform for the given columns, forwarding the
/// extractor and term-loader arguments captured by this instance.
/// </summary>
public IDataTransform Create(IHostEnvironment env, IDataView input, ExtractorColumn[] cols)
{
    return NgramHashExtractingTransformer.Create(_extractorArgs, env, input, cols, _termLoaderArgs);
}