/// <summary>
/// Builds a three-stage transformer chain for the requested n-gram hash columns:
/// a fitted <see cref="HashingEstimator"/> that writes each source column into a temporary
/// column, a fitted <see cref="NgramHashingEstimator"/> that combines the temporary columns
/// of each requested column into its destination column, and a
/// <see cref="ColumnSelectingTransformer"/> constructed with the list of temporary column names.
/// </summary>
/// <param name="env">Host environment; must not be null. A host is registered on it under <c>LoaderSignature</c>.</param>
/// <param name="options">Transform options; must not be null and must contain at least one column.</param>
/// <param name="input">Input data; must not be null. Both estimators are fitted against it.</param>
/// <returns>The chain of the three transformers described above.</returns>
internal static ITransformer Create(IHostEnvironment env, Options options, IDataView input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(LoaderSignature);
    host.CheckValue(options, nameof(options));
    host.CheckValue(input, nameof(input));
    host.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

    var pipeline = new TransformerChain<ITransformer>();

    var hashSpecs = new List<HashingEstimator.ColumnOptions>();
    var ngramSpecs = new NgramHashingEstimator.ColumnOptions[options.Columns.Length];
    int columnCount = options.Columns.Length;

    // Several source columns feed one destination column. Each source is hashed into its own
    // temporary column so that a destination name equal to a source name does not clobber the
    // source. Every temporary name is remembered, in creation order, for the final selection stage.
    var allTempNames = new List<string>();

    for (int c = 0; c < columnCount; c++)
    {
        var spec = options.Columns[c];
        host.CheckUserArg(!string.IsNullOrWhiteSpace(spec.Name), nameof(spec.Name));
        host.CheckUserArg(
            Utils.Size(spec.Source) > 0 && spec.Source.All(name => !string.IsNullOrWhiteSpace(name)),
            nameof(spec.Source));

        // Per-column settings fall back to the transform-wide defaults when not set.
        var seed = spec.Seed ?? options.Seed;
        var maxInverts = spec.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts;

        string[] sources = spec.Source;
        var tempNames = new string[sources.Length];
        for (int s = 0; s < sources.Length; s++)
        {
            string source = sources[s];
            string temp = input.Schema.GetTempColumnName(source);
            tempNames[s] = temp;
            allTempNames.Add(temp);

            // NOTE(review): the literal 30 is presumably the number of hash bits; the comment this
            // replaces said "31 bits (to minimize collisions)" - confirm which value is intended.
            hashSpecs.Add(new HashingEstimator.ColumnOptions(temp, source, 30, seed, false, maxInverts));
        }

        var ngramSpec = new NgramHashingEstimator.ColumnOptions(
            spec.Name,
            tempNames,
            spec.NgramLength ?? options.NgramLength,
            spec.SkipLength ?? options.SkipLength,
            spec.UseAllLengths ?? options.UseAllLengths,
            spec.NumberOfBits ?? options.NumberOfBits,
            seed,
            spec.Ordered ?? options.Ordered,
            maxInverts);
        ngramSpec.FriendlyNames = spec.FriendlyNames;
        ngramSpecs[c] = ngramSpec;
    }

    // Stage 1: hashing, fitted on the raw input.
    var hashing = new HashingEstimator(host, hashSpecs.ToArray()).Fit(input);
    var withHashing = pipeline.Append(hashing);

    // Stage 2: n-gram hashing, fitted on the output of stage 1.
    var ngramHashing = new NgramHashingEstimator(host, ngramSpecs).Fit(hashing.Transform(input));
    var withNgrams = withHashing.Append(ngramHashing);

    // Stage 3: column selection over the temporary names (presumably dropping them, given the
    // null second argument - confirm against the ColumnSelectingTransformer constructor).
    var selector = new ColumnSelectingTransformer(host, null, allTempNames.ToArray());
    return withNgrams.Append(selector);
}
internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input, TermLoaderArguments termLoaderArgs = null) { Contracts.CheckValue(env, nameof(env)); var h = env.Register(LoaderSignature); h.CheckValue(options, nameof(options)); h.CheckValue(input, nameof(input)); h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified"); // To each input column to the NgramHashExtractorArguments, a HashTransform using 31 // bits (to minimize collisions) is applied first, followed by an NgramHashTransform. IDataView view = input; List <ValueToKeyMappingTransformer.Column> termCols = null; if (termLoaderArgs != null) { termCols = new List <ValueToKeyMappingTransformer.Column>(); } var hashColumns = new List <HashingEstimator.ColumnOptions>(); var ngramHashColumns = new NgramHashingEstimator.ColumnOptions[options.Columns.Length]; var colCount = options.Columns.Length; // The NGramHashExtractor has a ManyToOne column type. To avoid stepping over the source // column name when a 'name' destination column name was specified, we use temporary column names. string[][] tmpColNames = new string[colCount][]; for (int iinfo = 0; iinfo < colCount; iinfo++) { var column = options.Columns[iinfo]; h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name)); h.CheckUserArg(Utils.Size(column.Source) > 0 && column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source)); int srcCount = column.Source.Length; tmpColNames[iinfo] = new string[srcCount]; for (int isrc = 0; isrc < srcCount; isrc++) { var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]); tmpColNames[iinfo][isrc] = tmpName; if (termLoaderArgs != null) { termCols.Add( new ValueToKeyMappingTransformer.Column { Name = tmpName, Source = column.Source[isrc] }); } hashColumns.Add(new HashingEstimator.ColumnOptions(tmpName, termLoaderArgs == null ? column.Source[isrc] : tmpName, 30, column.Seed ?? 
options.Seed, false, column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts)); } ngramHashColumns[iinfo] = new NgramHashingEstimator.ColumnOptions(column.Name, tmpColNames[iinfo], column.NgramLength ?? options.NgramLength, column.SkipLength ?? options.SkipLength, column.UseAllLengths ?? options.UseAllLengths, column.NumberOfBits ?? options.NumberOfBits, column.Seed ?? options.Seed, column.Ordered ?? options.Ordered, column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts); ngramHashColumns[iinfo].FriendlyNames = column.FriendlyNames; } if (termLoaderArgs != null) { h.Assert(Utils.Size(termCols) == hashColumns.Count); var termArgs = new ValueToKeyMappingTransformer.Options() { MaxNumTerms = int.MaxValue, Term = termLoaderArgs.Term, Terms = termLoaderArgs.Terms, DataFile = termLoaderArgs.DataFile, Loader = termLoaderArgs.Loader, TermsColumn = termLoaderArgs.TermsColumn, Sort = termLoaderArgs.Sort, Columns = termCols.ToArray() }; view = ValueToKeyMappingTransformer.Create(h, termArgs, view); if (termLoaderArgs.DropUnknowns) { var missingDropColumns = new (string outputColumnName, string inputColumnName)[termCols.Count];