/// <summary>
/// Converts the key columns listed in <paramref name="ktv"/> into vectors by chaining
/// KeyToValue, ValueToKey (seeded with the terms returned by <c>GetTerms</c>) and KeyToVector.
/// </summary>
/// <param name="ktv">Columns to convert. Asserted with AssertValueOrNull; when
/// <c>Utils.Size(ktv)</c> is not positive the input view is returned untouched.</param>
/// <param name="viewTrain">The data view the transforms are applied to.</param>
/// <param name="host">Host handed to every transform that gets created.</param>
/// <returns>The transformed view, or <paramref name="viewTrain"/> itself when there is nothing to convert.</returns>
private static IDataView ApplyKeyToVec(List<KeyToVectorMappingEstimator.ColumnOptions> ktv, IDataView viewTrain, IHost host)
{
    Contracts.AssertValueOrNull(ktv);
    Contracts.AssertValue(viewTrain);
    Contracts.AssertValue(host);

    if (Utils.Size(ktv) <= 0)
    {
        return viewTrain;
    }

    // A plain KeyToVector is deliberately avoided: training and test sets often carry slightly
    // different key values. Instead the keys are turned back into values, re-keyed with the
    // terms taken from the key metadata of the original key column, and only then vectorized.

    // Stage 1: key -> value, written to the output column name.
    var keyToValuePairs = ktv.Select(col => (col.Name, col.InputColumnName)).ToArray();
    IDataView valueView = new KeyToValueMappingTransformer(host, keyToValuePairs).Transform(viewTrain);

    // Stage 2: value -> key, in place on the output column. The terms are looked up on the
    // stage-1 view under the original input column name.
    var rekeyColumns = ktv
        .Select(col => new ValueToKeyMappingTransformer.Column()
        {
            Name = col.Name,
            Source = col.Name,
            Term = GetTerms(valueView, col.InputColumnName)
        })
        .ToArray();
    var rekeyOptions = new ValueToKeyMappingTransformer.Options()
    {
        Columns = rekeyColumns,
        TextKeyValues = true
    };
    IDataView rekeyedView = ValueToKeyMappingTransformer.Create(host, rekeyOptions, valueView);

    // Stage 3: key -> vector, again in place on the output column.
    var vectorColumns = ktv
        .Select(col => new KeyToVectorMappingEstimator.ColumnOptions(col.Name, col.Name))
        .ToArray();
    return new KeyToVectorMappingTransformer(host, vectorColumns).Transform(rekeyedView);
}
/// <summary>
/// Entry point that prepares the label column for classification. A label that is already a
/// <c>KeyType</c> or <c>BoolType</c> is passed through <c>NopTransform.CreateIfNeeded</c>;
/// any other label type is mapped to keys with a ValueToKey mapping configured with
/// <c>SortOrder.Value</c>.
/// </summary>
/// <param name="env">Host environment; checked for null.</param>
/// <param name="input">Entry-point arguments carrying the data, the label column name and the
/// <c>TextKeyValues</c> flag.</param>
/// <returns>A transform output holding the transform model and the resulting data.</returns>
public static CommonOutputs.TransformOutput PrepareClassificationLabel(IHostEnvironment env, ClassificationLabelInput input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("PrepareClassificationLabel");
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    var labelColumn = input.Data.Schema.GetColumnOrNull(input.LabelColumn);
    if (labelColumn == null)
    {
        throw host.ExceptSchemaMismatch(nameof(input), "Label", input.LabelColumn);
    }

    var labelType = labelColumn.Value.Type;
    bool needsKeyMapping = !(labelType is KeyType) && !(labelType is BoolType);

    if (needsKeyMapping)
    {
        // Map the label onto itself (same source and destination name).
        var labelMapping = new ValueToKeyMappingTransformer.Column()
        {
            Name = input.LabelColumn,
            Source = input.LabelColumn,
            TextKeyValues = input.TextKeyValues,
            Sort = ValueToKeyMappingTransformer.SortOrder.Value
        };
        var args = new ValueToKeyMappingTransformer.Arguments()
        {
            Column = new[] { labelMapping }
        };
        var keyed = ValueToKeyMappingTransformer.Create(host, args, input.Data);
        return new CommonOutputs.TransformOutput
        {
            Model = new TransformModelImpl(env, keyed, input.Data),
            OutputData = keyed
        };
    }

    // Label is already a key or a boolean: no conversion is applied.
    var passThrough = NopTransform.CreateIfNeeded(env, input.Data);
    return new CommonOutputs.TransformOutput
    {
        Model = new TransformModelImpl(env, passThrough, input.Data),
        OutputData = passThrough
    };
}
/// <summary>
/// Entry point that builds a ValueToKey mapping transform over <c>input.Data</c> and wraps it
/// in a transform output.
/// </summary>
/// <param name="env">Host environment, validated by <c>EntryPointUtils.CheckArgsAndCreateHost</c>.</param>
/// <param name="input">Arguments for the ValueToKey mapping, including the data to transform.</param>
/// <returns>A transform output holding the transform model and the transformed data.</returns>
public static CommonOutputs.TransformOutput TermTransform(IHostEnvironment env, ValueToKeyMappingTransformer.Arguments input)
{
    var host = EntryPointUtils.CheckArgsAndCreateHost(env, "TermTransform", input);
    var transform = ValueToKeyMappingTransformer.Create(host, input, input.Data);

    var output = new CommonOutputs.TransformOutput();
    output.Model = new TransformModelImpl(host, transform, input.Data);
    output.OutputData = transform;
    return output;
}
/// <summary>
/// Entry point that prepares the label column for classification. A label whose type reports
/// <c>IsKey</c> or <c>IsBool</c> is passed through <c>NopTransform.CreateIfNeeded</c>; any
/// other label type is mapped to keys with a ValueToKey mapping configured with
/// <c>SortOrder.Value</c>.
/// </summary>
/// <param name="env">Host environment; checked for null.</param>
/// <param name="input">Entry-point arguments carrying the data, the label column name and the
/// <c>TextKeyValues</c> flag.</param>
/// <returns>A transform output holding the transform model and the resulting data.</returns>
public static CommonOutputs.TransformOutput PrepareClassificationLabel(IHostEnvironment env, ClassificationLabelInput input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("PrepareClassificationLabel");
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    int labelIndex;
    bool labelFound = input.Data.Schema.TryGetColumnIndex(input.LabelColumn, out labelIndex);
    if (!labelFound)
    {
        throw host.Except($"Column '{input.LabelColumn}' not found.");
    }

    var labelType = input.Data.Schema[labelIndex].Type;
    bool needsKeyMapping = !labelType.IsKey && !labelType.IsBool;

    if (needsKeyMapping)
    {
        // Map the label onto itself (same source and destination name).
        var labelMapping = new ValueToKeyMappingTransformer.Column()
        {
            Name = input.LabelColumn,
            Source = input.LabelColumn,
            TextKeyValues = input.TextKeyValues,
            Sort = ValueToKeyMappingTransformer.SortOrder.Value
        };
        var args = new ValueToKeyMappingTransformer.Arguments()
        {
            Column = new[] { labelMapping }
        };
        var keyed = ValueToKeyMappingTransformer.Create(host, args, input.Data);
        return new CommonOutputs.TransformOutput
        {
            Model = new TransformModelImpl(env, keyed, input.Data),
            OutputData = keyed
        };
    }

    // Label is already a key or a boolean: no conversion is applied.
    var passThrough = NopTransform.CreateIfNeeded(env, input.Data);
    return new CommonOutputs.TransformOutput
    {
        Model = new TransformModelImpl(env, passThrough, input.Data),
        OutputData = passThrough
    };
}
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, TermLoaderArguments termLoaderArgs = null) { Contracts.CheckValue(env, nameof(env)); var h = env.Register(LoaderSignature); h.CheckValue(args, nameof(args)); h.CheckValue(input, nameof(input)); h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified"); // To each input column to the NgramHashExtractorArguments, a HashTransform using 31 // bits (to minimize collisions) is applied first, followed by an NgramHashTransform. IDataView view = input; List <ValueToKeyMappingTransformer.Column> termCols = null; if (termLoaderArgs != null) { termCols = new List <ValueToKeyMappingTransformer.Column>(); } var hashColumns = new List <HashingTransformer.Column>(); var ngramHashColumns = new NgramHashingTransformer.Column[args.Column.Length]; var colCount = args.Column.Length; // The NGramHashExtractor has a ManyToOne column type. To avoid stepping over the source // column name when a 'name' destination column name was specified, we use temporary column names. string[][] tmpColNames = new string[colCount][]; for (int iinfo = 0; iinfo < colCount; iinfo++) { var column = args.Column[iinfo]; h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name)); h.CheckUserArg(Utils.Size(column.Source) > 0 && column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source)); int srcCount = column.Source.Length; tmpColNames[iinfo] = new string[srcCount]; for (int isrc = 0; isrc < srcCount; isrc++) { var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]); tmpColNames[iinfo][isrc] = tmpName; if (termLoaderArgs != null) { termCols.Add( new ValueToKeyMappingTransformer.Column { Name = tmpName, Source = column.Source[isrc] }); } hashColumns.Add( new HashingTransformer.Column { Name = tmpName, Source = termLoaderArgs == null ? 
column.Source[isrc] : tmpName, HashBits = 30, Seed = column.Seed, Ordered = false, InvertHash = column.InvertHash }); } ngramHashColumns[iinfo] = new NgramHashingTransformer.Column { Name = column.Name, Source = tmpColNames[iinfo], AllLengths = column.AllLengths, HashBits = column.HashBits, NgramLength = column.NgramLength, RehashUnigrams = false, Seed = column.Seed, SkipLength = column.SkipLength, Ordered = column.Ordered, InvertHash = column.InvertHash, // REVIEW: This is an ugly internal hack to get around // the problem that we want the *original* source names surfacing // in the descriptions where appropriate, rather than _tmp000 and // what have you. The alternative is we do something elaborate // with metadata or something but I'm not sure that's better. FriendlyNames = column.FriendlyNames }; } if (termLoaderArgs != null) { h.Assert(Utils.Size(termCols) == hashColumns.Count); var termArgs = new ValueToKeyMappingTransformer.Arguments() { MaxNumTerms = int.MaxValue, Terms = termLoaderArgs.Terms, Term = termLoaderArgs.Term, DataFile = termLoaderArgs.DataFile, Loader = termLoaderArgs.Loader, TermsColumn = termLoaderArgs.TermsColumn, Sort = termLoaderArgs.Sort, Column = termCols.ToArray() }; view = ValueToKeyMappingTransformer.Create(h, termArgs, view); if (termLoaderArgs.DropUnknowns) { var missingDropColumns = new (string input, string output)[termCols.Count];
/// <summary>
/// Builds the n-gram extraction transform for the configured columns. Source columns whose
/// item type is text are first mapped to keys (optionally dropping missing values afterwards);
/// all other columns are fed to the n-gram stage directly.
/// </summary>
/// <param name="env">Host environment; checked for null.</param>
/// <param name="options">Extractor options; must contain at least one column.</param>
/// <param name="input">Input data; checked for null.</param>
/// <param name="termLoaderArgs">Optional term-loading settings. When supplied they configure the
/// key mapping and its <c>DropUnknowns</c> flag decides whether missing values are dropped.</param>
/// <returns>The fitted n-gram transform applied to the (possibly key-mapped) data, cast with
/// <c>as IDataTransform</c>.</returns>
internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input, TermLoaderArguments termLoaderArgs = null)
{
    Contracts.CheckValue(env, nameof(env));
    var h = env.Register(LoaderSignature);
    h.CheckValue(options, nameof(options));
    h.CheckValue(input, nameof(input));
    h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

    // Pass 1: validate names and remember which source columns are text.
    int columnCount = options.Columns.Length;
    var textColumns = new List<Column>();
    var isText = new bool[columnCount];
    for (int c = 0; c < columnCount; c++)
    {
        var spec = options.Columns[c];
        h.CheckNonWhiteSpace(spec.Name, nameof(spec.Name));
        h.CheckNonWhiteSpace(spec.Source, nameof(spec.Source));

        if (!input.Schema.TryGetColumnIndex(spec.Source, out int srcIndex))
        {
            continue;
        }

        if (input.Schema[srcIndex].Type.GetItemType() is TextType)
        {
            textColumns.Add(spec);
            isText[c] = true;
        }
    }

    // Text columns are converted to keys before n-gram extraction; anything else skips the
    // key mapping so that both text and key inputs are supported. Per the original author's
    // note, the n-gram transform itself validates types that are neither text nor key.
    IDataView view = input;
    if (textColumns.Count > 0)
    {
        bool hasTermLoader = termLoaderArgs != null;
        bool dropUnknowns = hasTermLoader && termLoaderArgs.DropUnknowns;

        var keyColumns = new ValueToKeyMappingTransformer.Column[textColumns.Count];
        for (int t = 0; t < keyColumns.Length; t++)
        {
            var textSpec = textColumns[t];

            // Only the first per-column limit is forwarded; none means "not set".
            int? perColumnCap = null;
            if (Utils.Size(textSpec.MaxNumTerms) > 0)
            {
                perColumnCap = textSpec.MaxNumTerms[0];
            }

            keyColumns[t] = new ValueToKeyMappingTransformer.Column()
            {
                Name = textSpec.Name,
                Source = textSpec.Source,
                MaxNumTerms = perColumnCap
            };
        }

        ValueToKeyMappingTransformer.Options termArgs;
        if (hasTermLoader)
        {
            termArgs = new ValueToKeyMappingTransformer.Options()
            {
                MaxNumTerms = int.MaxValue,
                Term = termLoaderArgs.Term,
                Terms = termLoaderArgs.Terms,
                DataFile = termLoaderArgs.DataFile,
                Loader = termLoaderArgs.Loader,
                TermsColumn = termLoaderArgs.TermsColumn,
                Sort = termLoaderArgs.Sort,
                Columns = keyColumns
            };
        }
        else
        {
            termArgs = new ValueToKeyMappingTransformer.Options()
            {
                MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0
                    ? options.MaxNumTerms[0]
                    : NgramExtractingEstimator.Defaults.MaxNumTerms,
                Columns = keyColumns
            };
        }

        view = ValueToKeyMappingTransformer.Create(h, termArgs, view);

        if (dropUnknowns)
        {
            // Drop missing values in place on every key-mapped output column.
            var dropPairs = textColumns.Select(tc => (tc.Name, tc.Name)).ToArray();
            view = new MissingValueDroppingTransformer(h, dropPairs).Transform(view);
        }
    }

    // Pass 2: one n-gram column per configured column. Key-mapped columns are read back from
    // their output name, the rest from their original source. Per-column settings win over
    // the shared ones.
    var ngramColumns = new NgramExtractingEstimator.ColumnInfo[columnCount];
    for (int c = 0; c < columnCount; c++)
    {
        var spec = options.Columns[c];
        var ngramSource = isText[c] ? spec.Name : spec.Source;
        ngramColumns[c] = new NgramExtractingEstimator.ColumnInfo(
            spec.Name,
            spec.NgramLength ?? options.NgramLength,
            spec.SkipLength ?? options.SkipLength,
            spec.AllLengths ?? options.AllLengths,
            spec.Weighting ?? options.Weighting,
            spec.MaxNumTerms ?? options.MaxNumTerms,
            ngramSource);
    }

    var estimator = new NgramExtractingEstimator(env, ngramColumns);
    return estimator.Fit(view).Transform(view) as IDataTransform;
}
internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input, TermLoaderArguments termLoaderArgs = null) { Contracts.CheckValue(env, nameof(env)); var h = env.Register(LoaderSignature); h.CheckValue(options, nameof(options)); h.CheckValue(input, nameof(input)); h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified"); // To each input column to the NgramHashExtractorArguments, a HashTransform using 31 // bits (to minimize collisions) is applied first, followed by an NgramHashTransform. IDataView view = input; List <ValueToKeyMappingTransformer.Column> termCols = null; if (termLoaderArgs != null) { termCols = new List <ValueToKeyMappingTransformer.Column>(); } var hashColumns = new List <HashingEstimator.ColumnOptions>(); var ngramHashColumns = new NgramHashingEstimator.ColumnOptions[options.Columns.Length]; var colCount = options.Columns.Length; // The NGramHashExtractor has a ManyToOne column type. To avoid stepping over the source // column name when a 'name' destination column name was specified, we use temporary column names. string[][] tmpColNames = new string[colCount][]; for (int iinfo = 0; iinfo < colCount; iinfo++) { var column = options.Columns[iinfo]; h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name)); h.CheckUserArg(Utils.Size(column.Source) > 0 && column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source)); int srcCount = column.Source.Length; tmpColNames[iinfo] = new string[srcCount]; for (int isrc = 0; isrc < srcCount; isrc++) { var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]); tmpColNames[iinfo][isrc] = tmpName; if (termLoaderArgs != null) { termCols.Add( new ValueToKeyMappingTransformer.Column { Name = tmpName, Source = column.Source[isrc] }); } hashColumns.Add(new HashingEstimator.ColumnOptions(tmpName, termLoaderArgs == null ? column.Source[isrc] : tmpName, 30, column.Seed ?? 
options.Seed, false, column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts)); } ngramHashColumns[iinfo] = new NgramHashingEstimator.ColumnOptions(column.Name, tmpColNames[iinfo], column.NgramLength ?? options.NgramLength, column.SkipLength ?? options.SkipLength, column.UseAllLengths ?? options.UseAllLengths, column.NumberOfBits ?? options.NumberOfBits, column.Seed ?? options.Seed, column.Ordered ?? options.Ordered, column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts); ngramHashColumns[iinfo].FriendlyNames = column.FriendlyNames; } if (termLoaderArgs != null) { h.Assert(Utils.Size(termCols) == hashColumns.Count); var termArgs = new ValueToKeyMappingTransformer.Options() { MaxNumTerms = int.MaxValue, Term = termLoaderArgs.Term, Terms = termLoaderArgs.Terms, DataFile = termLoaderArgs.DataFile, Loader = termLoaderArgs.Loader, TermsColumn = termLoaderArgs.TermsColumn, Sort = termLoaderArgs.Sort, Columns = termCols.ToArray() }; view = ValueToKeyMappingTransformer.Create(h, termArgs, view); if (termLoaderArgs.DropUnknowns) { var missingDropColumns = new (string outputColumnName, string inputColumnName)[termCols.Count];
/// <summary>
/// Builds the n-gram counting transform for the configured columns. Source columns whose item
/// type is text are first mapped to keys (optionally dropping missing values afterwards); all
/// other columns are fed to the n-gram stage directly.
/// </summary>
/// <param name="env">Host environment; checked for null.</param>
/// <param name="args">Extractor arguments; must contain at least one column.</param>
/// <param name="input">Input data; checked for null.</param>
/// <param name="termLoaderArgs">Optional term-loading settings. When supplied they configure the
/// key mapping and its <c>DropUnknowns</c> flag decides whether missing values are dropped.</param>
/// <returns>The n-gram counting transform over the (possibly key-mapped) data.</returns>
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, TermLoaderArguments termLoaderArgs = null)
{
    Contracts.CheckValue(env, nameof(env));
    var h = env.Register(LoaderSignature);
    h.CheckValue(args, nameof(args));
    h.CheckValue(input, nameof(input));
    h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

    // Pass 1: validate names and remember which source columns are text.
    int columnCount = args.Column.Length;
    var textColumns = new List<Column>();
    var isText = new bool[columnCount];
    for (int c = 0; c < columnCount; c++)
    {
        var spec = args.Column[c];
        h.CheckNonWhiteSpace(spec.Name, nameof(spec.Name));
        h.CheckNonWhiteSpace(spec.Source, nameof(spec.Source));

        int srcIndex;
        if (input.Schema.TryGetColumnIndex(spec.Source, out srcIndex))
        {
            if (input.Schema.GetColumnType(srcIndex).ItemType.IsText)
            {
                textColumns.Add(spec);
                isText[c] = true;
            }
        }
    }

    // Text columns are converted to keys before n-gram extraction; anything else skips the
    // key mapping so that both text and key inputs are supported. Per the original author's
    // note, the n-gram transform itself validates types that are neither text nor key.
    IDataView view = input;
    if (textColumns.Count > 0)
    {
        int textCount = textColumns.Count;
        bool hasTermLoader = termLoaderArgs != null;

        var keyColumns = new ValueToKeyMappingTransformer.Column[textCount];
        MissingValueDroppingTransformer.Column[] dropColumns = null;
        if (hasTermLoader && termLoaderArgs.DropUnknowns)
        {
            dropColumns = new MissingValueDroppingTransformer.Column[textCount];
        }

        for (int t = 0; t < textCount; t++)
        {
            var textSpec = textColumns[t];

            // Only the first per-column limit is forwarded; none means "not set".
            int? perColumnCap = null;
            if (Utils.Size(textSpec.MaxNumTerms) > 0)
            {
                perColumnCap = textSpec.MaxNumTerms[0];
            }

            keyColumns[t] = new ValueToKeyMappingTransformer.Column()
            {
                Name = textSpec.Name,
                Source = textSpec.Source,
                MaxNumTerms = perColumnCap
            };

            if (dropColumns != null)
            {
                // Missing values are dropped in place on the key-mapped output column.
                dropColumns[t] = new MissingValueDroppingTransformer.Column
                {
                    Name = textSpec.Name,
                    Source = textSpec.Name
                };
            }
        }

        ValueToKeyMappingTransformer.Arguments termArgs;
        if (hasTermLoader)
        {
            termArgs = new ValueToKeyMappingTransformer.Arguments()
            {
                MaxNumTerms = int.MaxValue,
                Terms = termLoaderArgs.Terms,
                Term = termLoaderArgs.Term,
                DataFile = termLoaderArgs.DataFile,
                Loader = termLoaderArgs.Loader,
                TermsColumn = termLoaderArgs.TermsColumn,
                Sort = termLoaderArgs.Sort,
                Column = keyColumns
            };
        }
        else
        {
            termArgs = new ValueToKeyMappingTransformer.Arguments()
            {
                MaxNumTerms = Utils.Size(args.MaxNumTerms) > 0
                    ? args.MaxNumTerms[0]
                    : NgramCountingTransformer.Arguments.DefaultMaxTerms,
                Column = keyColumns
            };
        }

        view = ValueToKeyMappingTransformer.Create(h, termArgs, view);

        if (dropColumns != null)
        {
            var naDropArgs = new MissingValueDroppingTransformer.Arguments { Column = dropColumns };
            view = new MissingValueDroppingTransformer(h, naDropArgs, view);
        }
    }

    // Pass 2: one n-gram column per configured column. Key-mapped columns are read back from
    // their output name, the rest from their original source.
    var ngramColumns = new NgramCountingTransformer.Column[columnCount];
    for (int c = 0; c < columnCount; c++)
    {
        var spec = args.Column[c];
        ngramColumns[c] = new NgramCountingTransformer.Column()
        {
            Name = spec.Name,
            Source = isText[c] ? spec.Name : spec.Source,
            AllLengths = spec.AllLengths,
            MaxNumTerms = spec.MaxNumTerms,
            NgramLength = spec.NgramLength,
            SkipLength = spec.SkipLength,
            Weighting = spec.Weighting
        };
    }

    var ngramArgs = new NgramCountingTransformer.Arguments()
    {
        MaxNumTerms = args.MaxNumTerms,
        NgramLength = args.NgramLength,
        SkipLength = args.SkipLength,
        AllLengths = args.AllLengths,
        Weighting = args.Weighting,
        Column = ngramColumns
    };

    return new NgramCountingTransformer(h, ngramArgs, view);
}
/// <summary>
/// Builds the estimator chain for n-gram extraction over the configured columns. Source columns
/// whose item type is text are first mapped to keys (optionally followed by dropping missing
/// values); the n-gram extracting estimator is always appended last.
/// </summary>
/// <param name="env">Host environment; checked for null.</param>
/// <param name="options">Extractor options; must contain at least one column.</param>
/// <param name="inputSchema">Schema shape used to decide which source columns are text.
/// Checked for null up front: it is dereferenced for every configured column, so a null value
/// could never have worked.</param>
/// <param name="termLoaderArgs">Optional term-loading settings. When supplied they provide the
/// key ordinality, the explicit terms, an optional key data file, and the <c>DropUnknowns</c>
/// flag.</param>
/// <returns>The estimator chain ending with the n-gram extracting estimator.</returns>
internal static IEstimator<ITransformer> CreateEstimator(IHostEnvironment env, Options options, SchemaShape inputSchema, TermLoaderArguments termLoaderArgs = null)
{
    Contracts.CheckValue(env, nameof(env));
    var h = env.Register(LoaderSignature);
    h.CheckValue(options, nameof(options));
    // Fail with a contract error instead of a NullReferenceException inside the loop below.
    h.CheckValue(inputSchema, nameof(inputSchema));
    h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

    var chain = new EstimatorChain<ITransformer>();
    var termCols = new List<Column>();
    var isTermCol = new bool[options.Columns.Length];

    for (int i = 0; i < options.Columns.Length; i++)
    {
        var col = options.Columns[i];
        h.CheckNonWhiteSpace(col.Name, nameof(col.Name));
        h.CheckNonWhiteSpace(col.Source, nameof(col.Source));

        if (inputSchema.TryFindColumn(col.Source, out var colShape) && colShape.ItemType is TextDataViewType)
        {
            termCols.Add(col);
            isTermCol[i] = true;
        }
    }

    // If the column types of args.column are text, apply term transform to convert them to keys.
    // Otherwise, skip term transform and apply n-gram transform directly.
    // This logic allows NgramExtractorTransform to handle both text and key input columns.
    // Note: n-gram transform handles the validation of the types natively (in case the types
    // of args.column are not text nor keys).
    if (termCols.Count > 0)
    {
        var columnOptions = new List<ValueToKeyMappingEstimator.ColumnOptionsBase>();
        string[] missingDropColumns = termLoaderArgs != null && termLoaderArgs.DropUnknowns
            ? new string[termCols.Count]
            : null;

        for (int iinfo = 0; iinfo < termCols.Count; iinfo++)
        {
            var column = termCols[iinfo];

            // Key limit precedence: per-column setting, then the shared setting, then a
            // default that depends on whether term-loading settings were supplied.
            int maxKeys;
            if (Utils.Size(column.MaxNumTerms) > 0)
            {
                maxKeys = column.MaxNumTerms[0];
            }
            else if (Utils.Size(options.MaxNumTerms) > 0)
            {
                maxKeys = options.MaxNumTerms[0];
            }
            else if (termLoaderArgs == null)
            {
                maxKeys = NgramExtractingEstimator.Defaults.MaximumNgramsCount;
            }
            else
            {
                maxKeys = int.MaxValue;
            }

            var colOptions = new ValueToKeyMappingEstimator.ColumnOptions(
                column.Name,
                column.Source,
                maximumNumberOfKeys: maxKeys,
                keyOrdinality: termLoaderArgs?.Sort ?? ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence);

            if (termLoaderArgs != null)
            {
                colOptions.Key = termLoaderArgs.Term;
                colOptions.Keys = termLoaderArgs.Terms;
            }

            columnOptions.Add(colOptions);

            if (missingDropColumns != null)
            {
                missingDropColumns[iinfo] = column.Name;
            }
        }

        IDataView keyData = null;
        if (termLoaderArgs?.DataFile != null)
        {
            using (var ch = env.Start("Create key data view"))
            {
                // The auto-convert flag reported by the helper is not needed here.
                keyData = ValueToKeyMappingTransformer.GetKeyDataViewOrNull(env, ch, termLoaderArgs.DataFile,
                    termLoaderArgs.TermsColumn, termLoaderArgs.Loader, out _);
            }
        }

        chain = chain.Append<ITransformer>(new ValueToKeyMappingEstimator(h, columnOptions.ToArray(), keyData));

        if (missingDropColumns != null)
        {
            // Drop missing values in place on every key-mapped output column.
            chain = chain.Append<ITransformer>(
                new MissingValueDroppingEstimator(h, missingDropColumns.Select(x => (x, x)).ToArray()));
        }
    }

    // One n-gram column per configured column. Key-mapped columns are read back from their
    // output name, the rest from their original source. Per-column settings win over the
    // shared ones.
    var ngramColumns = new NgramExtractingEstimator.ColumnOptions[options.Columns.Length];
    for (int iinfo = 0; iinfo < options.Columns.Length; iinfo++)
    {
        var column = options.Columns[iinfo];
        ngramColumns[iinfo] = new NgramExtractingEstimator.ColumnOptions(
            column.Name,
            column.NgramLength ?? options.NgramLength,
            column.SkipLength ?? options.SkipLength,
            column.UseAllLengths ?? options.UseAllLengths,
            column.Weighting ?? options.Weighting,
            column.MaxNumTerms ?? options.MaxNumTerms,
            isTermCol[iinfo] ? column.Name : column.Source);
    }

    return chain.Append<ITransformer>(new NgramExtractingEstimator(env, ngramColumns));
}