/// <summary>
/// Builds and fits a three-stage transformer chain: a hashing transformer over every source
/// column, an n-gram hashing transformer over the hashed columns, and a column-selecting
/// transformer that is handed the temporary column names created along the way.
/// </summary>
/// <param name="env">Host environment; must not be null. A host is registered on it under <c>LoaderSignature</c>.</param>
/// <param name="options">Transform options; must not be null and must specify at least one column.</param>
/// <param name="input">Data the hashing and n-gram hashing estimators are fitted on; must not be null.</param>
/// <returns>The chain of the three fitted/constructed transformers, in the order listed above.</returns>
internal static ITransformer Create(IHostEnvironment env, Options options, IDataView input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(LoaderSignature);
    host.CheckValue(options, nameof(options));
    host.CheckValue(input, nameof(input));
    host.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

    int columnCount = options.Columns.Length;
    var hashOptions = new List<HashingEstimator.ColumnOptions>();
    var ngramOptions = new NgramHashingEstimator.ColumnOptions[columnCount];

    // Each output column may be fed by several source columns. Every source is hashed into
    // a temporary column so that a destination name equal to a source name does not step
    // over the source. All temporary names are remembered, in creation order, for the
    // final column-selecting stage.
    var allTempNames = new List<string>();

    for (int c = 0; c < columnCount; c++)
    {
        var spec = options.Columns[c];
        host.CheckUserArg(!string.IsNullOrWhiteSpace(spec.Name), nameof(spec.Name));
        bool sourcesValid = Utils.Size(spec.Source) > 0 && !spec.Source.Any(string.IsNullOrWhiteSpace);
        host.CheckUserArg(sourcesValid, nameof(spec.Source));

        // Per-column settings win; the transform-wide options are the fallback.
        var seed = spec.Seed ?? options.Seed;
        var maxInverts = spec.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts;

        var tempNamesForColumn = new string[spec.Source.Length];
        for (int s = 0; s < tempNamesForColumn.Length; s++)
        {
            string sourceName = spec.Source[s];
            string tempName = input.Schema.GetTempColumnName(sourceName);
            tempNamesForColumn[s] = tempName;
            allTempNames.Add(tempName);

            // NOTE(review): the literal 30 is presumably the number of hash bits; an earlier
            // comment on this code spoke of 31 bits "to minimize collisions" - confirm which
            // value is intended before touching it, since changing it changes hash outputs.
            hashOptions.Add(new HashingEstimator.ColumnOptions(tempName, sourceName, 30, seed, false, maxInverts));
        }

        ngramOptions[c] = new NgramHashingEstimator.ColumnOptions(
            spec.Name,
            tempNamesForColumn,
            spec.NgramLength ?? options.NgramLength,
            spec.SkipLength ?? options.SkipLength,
            spec.UseAllLengths ?? options.UseAllLengths,
            spec.NumberOfBits ?? options.NumberOfBits,
            seed,
            spec.Ordered ?? options.Ordered,
            maxInverts)
        {
            FriendlyNames = spec.FriendlyNames
        };
    }

    var hashingTransformer = new HashingEstimator(host, hashOptions.ToArray()).Fit(input);

    // The n-gram hashing estimator is fitted on the output of the hashing stage.
    var ngramHashingTransformer =
        new NgramHashingEstimator(host, ngramOptions).Fit(hashingTransformer.Transform(input));

    // Presumably this drops the temporary columns (null is passed as the second argument and
    // the temporary names as the third) - confirm against ColumnSelectingTransformer.
    var tempColumnSelector = new ColumnSelectingTransformer(host, null, allTempNames.ToArray());

    return new TransformerChain<ITransformer>()
        .Append(hashingTransformer)
        .Append(ngramHashingTransformer)
        .Append(tempColumnSelector);
}
/// <summary>
/// Builds and fits an n-gram extraction chain. Columns whose source item type is text are
/// first passed through a value-to-key mapping (optionally followed by a missing-value
/// dropping transformer); all other columns are handed to the n-gram estimator as they are,
/// which lets this transform accept both text and key inputs. Validation of any other input
/// type is left to the n-gram transform itself.
/// </summary>
/// <param name="env">Host environment; must not be null. A host is registered on it under <c>LoaderSignature</c>.</param>
/// <param name="options">Transform options; must not be null and must specify at least one column.</param>
/// <param name="input">Data used to fit the estimators; must not be null.</param>
/// <param name="termLoaderArgs">
/// Optional settings for the value-to-key mapping stage (key ordinality, explicit terms,
/// a terms data file, and whether unknowns are dropped). May be null.
/// </param>
/// <returns>The chain ending in the fitted n-gram extracting transformer.</returns>
internal static ITransformer Create(IHostEnvironment env, Options options, IDataView input, TermLoaderArguments termLoaderArgs = null)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(LoaderSignature);
    host.CheckValue(options, nameof(options));
    host.CheckValue(input, nameof(input));
    host.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

    int columnCount = options.Columns.Length;

    // Pass 1: validate names and find the columns whose source item type is text.
    var textColumns = new List<Column>();
    var isTextColumn = new bool[columnCount];
    for (int idx = 0; idx < columnCount; idx++)
    {
        var candidate = options.Columns[idx];
        host.CheckNonWhiteSpace(candidate.Name, nameof(candidate.Name));
        host.CheckNonWhiteSpace(candidate.Source, nameof(candidate.Source));

        if (!input.Schema.TryGetColumnIndex(candidate.Source, out int sourceIndex))
        {
            continue;
        }
        if (!(input.Schema[sourceIndex].Type.GetItemType() is TextDataViewType))
        {
            continue;
        }

        textColumns.Add(candidate);
        isTextColumn[idx] = true;
    }

    var chain = new TransformerChain<ITransformer>();

    // Pass 2: text columns are converted to keys, written under the destination name.
    if (textColumns.Count > 0)
    {
        bool dropUnknowns = termLoaderArgs != null && termLoaderArgs.DropUnknowns;
        var ordinality = termLoaderArgs?.Sort ?? ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence;
        var keyMappingOptions = new List<ValueToKeyMappingEstimator.ColumnOptionsBase>();

        foreach (var textColumn in textColumns)
        {
            // Key limit: the column's first MaxNumTerms entry, else the transform-wide first
            // entry, else the n-gram default when no term loader arguments were given,
            // else no limit (int.MaxValue).
            int keyLimit;
            if (Utils.Size(textColumn.MaxNumTerms) > 0)
            {
                keyLimit = textColumn.MaxNumTerms[0];
            }
            else if (Utils.Size(options.MaxNumTerms) > 0)
            {
                keyLimit = options.MaxNumTerms[0];
            }
            else if (termLoaderArgs == null)
            {
                keyLimit = NgramExtractingEstimator.Defaults.MaximumNgramsCount;
            }
            else
            {
                keyLimit = int.MaxValue;
            }

            var mapping = new ValueToKeyMappingEstimator.ColumnOptions(
                textColumn.Name,
                textColumn.Source,
                maximumNumberOfKeys: keyLimit,
                keyOrdinality: ordinality);
            if (termLoaderArgs != null)
            {
                mapping.Key = termLoaderArgs.Term;
                mapping.Keys = termLoaderArgs.Terms;
            }
            keyMappingOptions.Add(mapping);
        }

        IDataView keyData = null;
        if (termLoaderArgs != null && termLoaderArgs.DataFile != null)
        {
            // NOTE(review): the channel and the key-data loader use env rather than the
            // registered host - kept as-is; confirm whether that is deliberate.
            using (var channel = env.Start("Create key data view"))
            {
                keyData = ValueToKeyMappingTransformer.GetKeyDataViewOrNull(
                    env, channel, termLoaderArgs.DataFile, termLoaderArgs.TermsColumn, termLoaderArgs.Loader, out var unusedAutoConvert);
            }
        }

        var keyMapper = new ValueToKeyMappingEstimator(host, keyMappingOptions.ToArray(), keyData).Fit(input);
        chain = chain.Append<ITransformer>(keyMapper);

        if (dropUnknowns)
        {
            // The same name is supplied for both elements of each pair, one pair per
            // key-mapped column.
            var dropPairs = textColumns
                .Select(textColumn => textColumn.Name)
                .Select(name => (name, name))
                .ToArray();
            chain = chain.Append<ITransformer>(new MissingValueDroppingTransformer(host, dropPairs));
        }
    }

    // Pass 3: n-gram options for every column. Key-mapped text columns are read from the
    // destination name (where the keys were written); the others straight from their source.
    var ngramColumns = new NgramExtractingEstimator.ColumnOptions[columnCount];
    for (int idx = 0; idx < columnCount; idx++)
    {
        var current = options.Columns[idx];
        string ngramInputName = isTextColumn[idx] ? current.Name : current.Source;
        ngramColumns[idx] = new NgramExtractingEstimator.ColumnOptions(
            current.Name,
            current.NgramLength ?? options.NgramLength,
            current.SkipLength ?? options.SkipLength,
            current.UseAllLengths ?? options.UseAllLengths,
            current.Weighting ?? options.Weighting,
            current.MaxNumTerms ?? options.MaxNumTerms,
            ngramInputName);
    }

    // The n-gram estimator is fitted on the data as transformed by the chain built so far.
    var prepared = chain.Transform(input);
    var ngramExtractor = new NgramExtractingEstimator(env, ngramColumns).Fit(prepared);
    return chain.Append<ITransformer>(ngramExtractor);
}