/// <summary>
/// Entry point that prepares the label column named by <c>input.LabelColumn</c> for classification.
/// A label whose type is already <see cref="KeyType"/> or <see cref="BoolType"/> is passed through
/// via <see cref="NopTransform.CreateIfNeeded"/>; any other label type is mapped to keys in place
/// (destination name equals source name) with <see cref="ValueToKeyMappingTransformer"/>.
/// </summary>
/// <param name="env">Host environment; checked for null before use.</param>
/// <param name="input">Entry-point arguments: the data view, the label column name and the TextKeyValues setting.</param>
/// <returns>The output data view together with a <see cref="TransformModelImpl"/> built from it and the input data.</returns>
/// <remarks>
/// When the label column is not present in the input schema, the exception produced by
/// <c>host.ExceptSchemaMismatch</c> is thrown.
/// </remarks>
public static CommonOutputs.TransformOutput PrepareClassificationLabel(IHostEnvironment env, ClassificationLabelInput input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("PrepareClassificationLabel");
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    // Resolve the label column; a missing column is reported as a schema mismatch.
    var foundLabel = input.Data.Schema.GetColumnOrNull(input.LabelColumn);
    if (!foundLabel.HasValue)
    {
        throw host.ExceptSchemaMismatch(nameof(input), "Label", input.LabelColumn);
    }

    var labelType = foundLabel.Value.Type;
    bool needsKeyMapping = !(labelType is KeyType) && !(labelType is BoolType);

    if (needsKeyMapping)
    {
        // Map the label values onto keys, overwriting the label column itself.
        var labelMapping = new ValueToKeyMappingTransformer.Column()
        {
            Name = input.LabelColumn,
            Source = input.LabelColumn,
            TextKeyValues = input.TextKeyValues,
            Sort = ValueToKeyMappingTransformer.SortOrder.Value
        };
        var mappingArgs = new ValueToKeyMappingTransformer.Arguments()
        {
            Column = new[] { labelMapping }
        };
        var keyed = ValueToKeyMappingTransformer.Create(host, mappingArgs, input.Data);

        return new CommonOutputs.TransformOutput
        {
            Model = new TransformModelImpl(env, keyed, input.Data),
            OutputData = keyed
        };
    }

    // Key and Boolean labels are usable as they are: no conversion is applied.
    var passThrough = NopTransform.CreateIfNeeded(env, input.Data);
    return new CommonOutputs.TransformOutput
    {
        Model = new TransformModelImpl(env, passThrough, input.Data),
        OutputData = passThrough
    };
}
/// <summary>
/// Entry point that prepares the label column named by <c>input.LabelColumn</c> for classification.
/// If the label type reports <c>IsKey</c> or <c>IsBool</c> the data is passed through via
/// <see cref="NopTransform.CreateIfNeeded"/>; otherwise the label column is mapped to keys in place
/// (destination name equals source name) with <see cref="ValueToKeyMappingTransformer"/>.
/// </summary>
/// <param name="env">Host environment; checked for null before use.</param>
/// <param name="input">Entry-point arguments: the data view, the label column name and the TextKeyValues setting.</param>
/// <returns>The output data view together with a <see cref="TransformModelImpl"/> built from it and the input data.</returns>
/// <remarks>
/// When the label column cannot be found in the input schema, the exception produced by
/// <c>host.Except</c> is thrown with a "Column '...' not found." message.
/// </remarks>
public static CommonOutputs.TransformOutput PrepareClassificationLabel(IHostEnvironment env, ClassificationLabelInput input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("PrepareClassificationLabel");
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    // Look the label column up by name; bail out if the schema does not contain it.
    bool labelFound = input.Data.Schema.TryGetColumnIndex(input.LabelColumn, out int labelIndex);
    if (!labelFound)
    {
        throw host.Except($"Column '{input.LabelColumn}' not found.");
    }

    var labelType = input.Data.Schema[labelIndex].Type;
    if (labelType.IsKey || labelType.IsBool)
    {
        // Key and Boolean labels are left untouched.
        var unchanged = NopTransform.CreateIfNeeded(env, input.Data);
        var unchangedModel = new TransformModelImpl(env, unchanged, input.Data);
        return new CommonOutputs.TransformOutput { Model = unchangedModel, OutputData = unchanged };
    }

    // Every other label type is converted to keys, replacing the label column.
    var labelMapping = new ValueToKeyMappingTransformer.Column()
    {
        Name = input.LabelColumn,
        Source = input.LabelColumn,
        TextKeyValues = input.TextKeyValues,
        Sort = ValueToKeyMappingTransformer.SortOrder.Value
    };
    var mappingArgs = new ValueToKeyMappingTransformer.Arguments()
    {
        Column = new[] { labelMapping }
    };

    var keyed = ValueToKeyMappingTransformer.Create(host, mappingArgs, input.Data);
    var keyedModel = new TransformModelImpl(env, keyed, input.Data);
    return new CommonOutputs.TransformOutput { Model = keyedModel, OutputData = keyed };
}
/// <summary>
/// Entry point that applies <see cref="ValueToKeyMappingTransformer"/> to <c>input.Data</c>
/// using the supplied arguments.
/// </summary>
/// <param name="env">Host environment, validated by <c>EntryPointUtils.CheckArgsAndCreateHost</c>.</param>
/// <param name="input">Transformer arguments; also carries the data view to transform.</param>
/// <returns>The transformed data view together with a <see cref="TransformModelImpl"/> built from it and the input data.</returns>
public static CommonOutputs.TransformOutput TermTransform(IHostEnvironment env, ValueToKeyMappingTransformer.Arguments input)
{
    // Validates env/input and registers a host named after this entry point.
    var host = EntryPointUtils.CheckArgsAndCreateHost(env, "TermTransform", input);

    var mapped = ValueToKeyMappingTransformer.Create(host, input, input.Data);
    var model = new TransformModelImpl(host, mapped, input.Data);

    return new CommonOutputs.TransformOutput()
    {
        Model = model,
        OutputData = mapped
    };
}
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, TermLoaderArguments termLoaderArgs = null) { Contracts.CheckValue(env, nameof(env)); var h = env.Register(LoaderSignature); h.CheckValue(args, nameof(args)); h.CheckValue(input, nameof(input)); h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified"); // To each input column to the NgramHashExtractorArguments, a HashTransform using 31 // bits (to minimize collisions) is applied first, followed by an NgramHashTransform. IDataView view = input; List <ValueToKeyMappingTransformer.Column> termCols = null; if (termLoaderArgs != null) { termCols = new List <ValueToKeyMappingTransformer.Column>(); } var hashColumns = new List <HashingTransformer.Column>(); var ngramHashColumns = new NgramHashingTransformer.Column[args.Column.Length]; var colCount = args.Column.Length; // The NGramHashExtractor has a ManyToOne column type. To avoid stepping over the source // column name when a 'name' destination column name was specified, we use temporary column names. string[][] tmpColNames = new string[colCount][]; for (int iinfo = 0; iinfo < colCount; iinfo++) { var column = args.Column[iinfo]; h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name)); h.CheckUserArg(Utils.Size(column.Source) > 0 && column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source)); int srcCount = column.Source.Length; tmpColNames[iinfo] = new string[srcCount]; for (int isrc = 0; isrc < srcCount; isrc++) { var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]); tmpColNames[iinfo][isrc] = tmpName; if (termLoaderArgs != null) { termCols.Add( new ValueToKeyMappingTransformer.Column { Name = tmpName, Source = column.Source[isrc] }); } hashColumns.Add( new HashingTransformer.Column { Name = tmpName, Source = termLoaderArgs == null ? 
column.Source[isrc] : tmpName, HashBits = 30, Seed = column.Seed, Ordered = false, InvertHash = column.InvertHash }); } ngramHashColumns[iinfo] = new NgramHashingTransformer.Column { Name = column.Name, Source = tmpColNames[iinfo], AllLengths = column.AllLengths, HashBits = column.HashBits, NgramLength = column.NgramLength, RehashUnigrams = false, Seed = column.Seed, SkipLength = column.SkipLength, Ordered = column.Ordered, InvertHash = column.InvertHash, // REVIEW: This is an ugly internal hack to get around // the problem that we want the *original* source names surfacing // in the descriptions where appropriate, rather than _tmp000 and // what have you. The alternative is we do something elaborate // with metadata or something but I'm not sure that's better. FriendlyNames = column.FriendlyNames }; } if (termLoaderArgs != null) { h.Assert(Utils.Size(termCols) == hashColumns.Count); var termArgs = new ValueToKeyMappingTransformer.Arguments() { MaxNumTerms = int.MaxValue, Terms = termLoaderArgs.Terms, Term = termLoaderArgs.Term, DataFile = termLoaderArgs.DataFile, Loader = termLoaderArgs.Loader, TermsColumn = termLoaderArgs.TermsColumn, Sort = termLoaderArgs.Sort, Column = termCols.ToArray() }; view = ValueToKeyMappingTransformer.Create(h, termArgs, view); if (termLoaderArgs.DropUnknowns) { var missingDropColumns = new (string input, string output)[termCols.Count];
/// <summary>
/// Builds an n-gram extracting transform over <paramref name="input"/>.
/// Columns whose source resolves in the input schema to a text item type are first converted
/// with <see cref="ValueToKeyMappingTransformer"/> (written to the destination column name);
/// all other columns are handed to the n-gram extractor directly from their source column.
/// </summary>
/// <param name="env">Host environment; checked for null before use.</param>
/// <param name="args">Transform arguments; must specify at least one column.</param>
/// <param name="input">Input data view.</param>
/// <param name="termLoaderArgs">
/// Optional term-loading settings. When supplied they configure the value-to-key mapping
/// (with <c>MaxNumTerms</c> set to <see cref="int.MaxValue"/>), and if <c>DropUnknowns</c> is set a
/// <see cref="MissingValueDroppingTransformer"/> is applied to the mapped columns.
/// </param>
/// <returns>
/// The result of fitting and applying an <see cref="NgramExtractingEstimator"/>, converted with
/// <c>as IDataTransform</c> (so null if the transformed view is not an <see cref="IDataTransform"/>).
/// </returns>
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, TermLoaderArguments termLoaderArgs = null)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(LoaderSignature);
    host.CheckValue(args, nameof(args));
    host.CheckValue(input, nameof(input));
    host.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

    int columnCount = args.Column.Length;
    var textColumns = new List<Column>();
    var needsTermMapping = new bool[columnCount];

    // Validate every column and remember which ones have a text source.
    for (int index = 0; index < columnCount; index++)
    {
        var candidate = args.Column[index];
        host.CheckNonWhiteSpace(candidate.Name, nameof(candidate.Name));
        host.CheckNonWhiteSpace(candidate.Source, nameof(candidate.Source));

        if (!input.Schema.TryGetColumnIndex(candidate.Source, out int sourceIndex))
        {
            continue;
        }
        if (!input.Schema.GetColumnType(sourceIndex).ItemType.IsText)
        {
            continue;
        }

        textColumns.Add(candidate);
        needsTermMapping[index] = true;
    }

    // Text columns are converted to keys by a term transform before n-gram extraction;
    // non-text columns skip that step and go straight to the n-gram transform. This lets
    // the extractor accept both text and key inputs. Type validation for columns that are
    // neither text nor key is left to the n-gram transform itself.
    IDataView view = input;
    if (textColumns.Count > 0)
    {
        ValueToKeyMappingTransformer.Arguments termArgs;
        string[] dropColumnNames = null;

        if (termLoaderArgs == null)
        {
            termArgs = new ValueToKeyMappingTransformer.Arguments()
            {
                MaxNumTerms = Utils.Size(args.MaxNumTerms) > 0 ? args.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaxNumTerms,
                Column = new ValueToKeyMappingTransformer.Column[textColumns.Count]
            };
        }
        else
        {
            termArgs = new ValueToKeyMappingTransformer.Arguments()
            {
                MaxNumTerms = int.MaxValue,
                Terms = termLoaderArgs.Terms,
                Term = termLoaderArgs.Term,
                DataFile = termLoaderArgs.DataFile,
                Loader = termLoaderArgs.Loader,
                TermsColumn = termLoaderArgs.TermsColumn,
                Sort = termLoaderArgs.Sort,
                Column = new ValueToKeyMappingTransformer.Column[textColumns.Count]
            };

            if (termLoaderArgs.DropUnknowns)
            {
                dropColumnNames = new string[textColumns.Count];
            }
        }

        for (int t = 0; t < textColumns.Count; t++)
        {
            var textColumn = textColumns[t];
            termArgs.Column[t] = new ValueToKeyMappingTransformer.Column()
            {
                Name = textColumn.Name,
                Source = textColumn.Source,
                MaxNumTerms = Utils.Size(textColumn.MaxNumTerms) > 0 ? textColumn.MaxNumTerms[0] : default(int?)
            };

            if (dropColumnNames != null)
            {
                dropColumnNames[t] = textColumn.Name;
            }
        }

        view = ValueToKeyMappingTransformer.Create(host, termArgs, view);

        if (dropColumnNames != null)
        {
            // Drop values in place: each mapped column is both input and output.
            var dropPairs = dropColumnNames.Select(name => (name, name)).ToArray();
            view = new MissingValueDroppingTransformer(host, dropPairs).Transform(view);
        }
    }

    // Per-column settings fall back to the transform-level defaults in args.
    var ngramColumns = new NgramExtractingTransformer.ColumnInfo[columnCount];
    for (int slot = 0; slot < columnCount; slot++)
    {
        var spec = args.Column[slot];

        // Term-mapped columns now live under the destination name; others are read from the source.
        var ngramInput = needsTermMapping[slot] ? spec.Name : spec.Source;

        ngramColumns[slot] = new NgramExtractingTransformer.ColumnInfo(
            ngramInput,
            spec.Name,
            spec.NgramLength ?? args.NgramLength,
            spec.SkipLength ?? args.SkipLength,
            spec.AllLengths ?? args.AllLengths,
            spec.Weighting ?? args.Weighting,
            spec.MaxNumTerms ?? args.MaxNumTerms);
    }

    var estimator = new NgramExtractingEstimator(env, ngramColumns);
    var fitted = estimator.Fit(view);
    return fitted.Transform(view) as IDataTransform;
}
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, TermLoaderArguments termLoaderArgs = null) { Contracts.CheckValue(env, nameof(env)); var h = env.Register(LoaderSignature); h.CheckValue(args, nameof(args)); h.CheckValue(input, nameof(input)); h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified"); // To each input column to the NgramHashExtractorArguments, a HashTransform using 31 // bits (to minimize collisions) is applied first, followed by an NgramHashTransform. IDataView view = input; List <ValueToKeyMappingTransformer.Column> termCols = null; if (termLoaderArgs != null) { termCols = new List <ValueToKeyMappingTransformer.Column>(); } var hashColumns = new List <HashingTransformer.ColumnInfo>(); var ngramHashColumns = new NgramHashingTransformer.ColumnInfo[args.Column.Length]; var colCount = args.Column.Length; // The NGramHashExtractor has a ManyToOne column type. To avoid stepping over the source // column name when a 'name' destination column name was specified, we use temporary column names. string[][] tmpColNames = new string[colCount][]; for (int iinfo = 0; iinfo < colCount; iinfo++) { var column = args.Column[iinfo]; h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name)); h.CheckUserArg(Utils.Size(column.Source) > 0 && column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source)); int srcCount = column.Source.Length; tmpColNames[iinfo] = new string[srcCount]; for (int isrc = 0; isrc < srcCount; isrc++) { var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]); tmpColNames[iinfo][isrc] = tmpName; if (termLoaderArgs != null) { termCols.Add( new ValueToKeyMappingTransformer.Column { Name = tmpName, Source = column.Source[isrc] }); } hashColumns.Add(new HashingTransformer.ColumnInfo(termLoaderArgs == null ? column.Source[isrc] : tmpName, tmpName, 30, column.Seed ?? args.Seed, false, column.InvertHash ?? 
args.InvertHash)); } ngramHashColumns[iinfo] = new NgramHashingTransformer.ColumnInfo(tmpColNames[iinfo], column.Name, column.NgramLength ?? args.NgramLength, column.SkipLength ?? args.SkipLength, column.AllLengths ?? args.AllLengths, column.HashBits ?? args.HashBits, column.Seed ?? args.Seed, column.Ordered ?? args.Ordered, column.InvertHash ?? args.InvertHash); ngramHashColumns[iinfo].FriendlyNames = column.FriendlyNames; } if (termLoaderArgs != null) { h.Assert(Utils.Size(termCols) == hashColumns.Count); var termArgs = new ValueToKeyMappingTransformer.Arguments() { MaxNumTerms = int.MaxValue, Terms = termLoaderArgs.Terms, Term = termLoaderArgs.Term, DataFile = termLoaderArgs.DataFile, Loader = termLoaderArgs.Loader, TermsColumn = termLoaderArgs.TermsColumn, Sort = termLoaderArgs.Sort, Column = termCols.ToArray() }; view = ValueToKeyMappingTransformer.Create(h, termArgs, view); if (termLoaderArgs.DropUnknowns) { var missingDropColumns = new (string input, string output)[termCols.Count];