/// <summary>
/// Entry point that runs <see cref="ValueToKeyMappingTransformer"/> over <c>input.Data</c>
/// using the supplied options.
/// </summary>
/// <param name="env">Host environment; validated and used to register the "Term" host.</param>
/// <param name="input">Value-to-key mapping options, including the data view to transform.</param>
/// <returns>
/// A <see cref="CommonOutputs.TransformOutput"/> whose <c>OutputData</c> is the created transform
/// and whose <c>Model</c> wraps that transform together with the original input data.
/// </returns>
public static CommonOutputs.TransformOutput TextToKey(IHostEnvironment env, ValueToKeyMappingTransformer.Options input)
{
    // Validate the environment before anything is registered on it.
    Contracts.CheckValue(env, nameof(env));

    var termHost = env.Register("Term");
    termHost.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(termHost, input);

    var keyTransform = ValueToKeyMappingTransformer.Create(termHost, input, input.Data);

    // Note: the model is built against the caller's environment, not the registered host.
    return new CommonOutputs.TransformOutput
    {
        Model = new TransformModelImpl(env, keyTransform, input.Data),
        OutputData = keyTransform
    };
}
/// <summary>
/// Entry point that makes sure the label column of <c>input.Data</c> is usable for classification.
/// A label that is already a key or a Boolean is left as is; any other label type is converted
/// to a key (ordered by value) in place, under the same column name.
/// </summary>
/// <param name="env">Host environment; validated and used to register the host.</param>
/// <param name="input">Arguments naming the label column and carrying the input data.</param>
/// <returns>The resulting data view and a transform model wrapping it.</returns>
/// <remarks>
/// Throws the host's schema-mismatch exception when the label column is not present in the schema.
/// </remarks>
public static CommonOutputs.TransformOutput PrepareClassificationLabel(IHostEnvironment env, ClassificationLabelInput input)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register("PrepareClassificationLabel");
    host.CheckValue(input, nameof(input));
    EntryPointUtils.CheckInputArgs(host, input);

    // Look the label column up by name; a missing column is a schema error.
    var labelColumn = input.Data.Schema.GetColumnOrNull(input.LabelColumn);
    if (!labelColumn.HasValue)
    {
        throw host.ExceptSchemaMismatch(nameof(input), "predicted label", input.LabelColumn);
    }

    var typeOfLabel = labelColumn.Value.Type;
    bool requiresKeyMapping = !(typeOfLabel is KeyType) && !(typeOfLabel is BooleanDataViewType);

    if (requiresKeyMapping)
    {
        // Map the label onto itself (same source and destination name), sorting keys by value.
        var keyOptions = new ValueToKeyMappingTransformer.Options()
        {
            Columns = new[]
            {
                new ValueToKeyMappingTransformer.Column()
                {
                    Name = input.LabelColumn,
                    Source = input.LabelColumn,
                    TextKeyValues = input.TextKeyValues,
                    Sort = ValueToKeyMappingEstimator.KeyOrdinality.ByValue
                }
            }
        };

        var keyTransform = ValueToKeyMappingTransformer.Create(host, keyOptions, input.Data);
        return new CommonOutputs.TransformOutput
        {
            Model = new TransformModelImpl(env, keyTransform, input.Data),
            OutputData = keyTransform
        };
    }

    // Key and Boolean labels need no conversion: hand back a no-op transform over the input.
    var passThrough = NopTransform.CreateIfNeeded(env, input.Data);
    return new CommonOutputs.TransformOutput
    {
        Model = new TransformModelImpl(env, passThrough, input.Data),
        OutputData = passThrough
    };
}
internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input, TermLoaderArguments termLoaderArgs = null) { Contracts.CheckValue(env, nameof(env)); var h = env.Register(LoaderSignature); h.CheckValue(options, nameof(options)); h.CheckValue(input, nameof(input)); h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified"); // To each input column to the NgramHashExtractorArguments, a HashTransform using 31 // bits (to minimize collisions) is applied first, followed by an NgramHashTransform. IDataView view = input; List <ValueToKeyMappingTransformer.Column> termCols = null; if (termLoaderArgs != null) { termCols = new List <ValueToKeyMappingTransformer.Column>(); } var hashColumns = new List <HashingEstimator.ColumnOptions>(); var ngramHashColumns = new NgramHashingEstimator.ColumnOptions[options.Columns.Length]; var colCount = options.Columns.Length; // The NGramHashExtractor has a ManyToOne column type. To avoid stepping over the source // column name when a 'name' destination column name was specified, we use temporary column names. string[][] tmpColNames = new string[colCount][]; for (int iinfo = 0; iinfo < colCount; iinfo++) { var column = options.Columns[iinfo]; h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name)); h.CheckUserArg(Utils.Size(column.Source) > 0 && column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source)); int srcCount = column.Source.Length; tmpColNames[iinfo] = new string[srcCount]; for (int isrc = 0; isrc < srcCount; isrc++) { var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]); tmpColNames[iinfo][isrc] = tmpName; if (termLoaderArgs != null) { termCols.Add( new ValueToKeyMappingTransformer.Column { Name = tmpName, Source = column.Source[isrc] }); } hashColumns.Add(new HashingEstimator.ColumnOptions(tmpName, termLoaderArgs == null ? column.Source[isrc] : tmpName, 30, column.Seed ?? 
options.Seed, false, column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts)); } ngramHashColumns[iinfo] = new NgramHashingEstimator.ColumnOptions(column.Name, tmpColNames[iinfo], column.NgramLength ?? options.NgramLength, column.SkipLength ?? options.SkipLength, column.UseAllLengths ?? options.UseAllLengths, column.NumberOfBits ?? options.NumberOfBits, column.Seed ?? options.Seed, column.Ordered ?? options.Ordered, column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts); ngramHashColumns[iinfo].FriendlyNames = column.FriendlyNames; } if (termLoaderArgs != null) { h.Assert(Utils.Size(termCols) == hashColumns.Count); var termArgs = new ValueToKeyMappingTransformer.Options() { MaxNumTerms = int.MaxValue, Term = termLoaderArgs.Term, Terms = termLoaderArgs.Terms, DataFile = termLoaderArgs.DataFile, Loader = termLoaderArgs.Loader, TermsColumn = termLoaderArgs.TermsColumn, Sort = termLoaderArgs.Sort, Columns = termCols.ToArray() }; view = ValueToKeyMappingTransformer.Create(h, termArgs, view); if (termLoaderArgs.DropUnknowns) { var missingDropColumns = new (string outputColumnName, string inputColumnName)[termCols.Count];
/// <summary>
/// Builds the n-gram extraction pipeline for the requested columns.
/// Source columns whose item type is text are first converted to keys with
/// <see cref="ValueToKeyMappingTransformer"/>; all other columns are handed to the n-gram
/// estimator unchanged, which lets this method accept both text and key inputs.
/// </summary>
/// <param name="env">Host environment; validated and used to register the host.</param>
/// <param name="options">Extraction options; at least one column must be specified.</param>
/// <param name="input">Data view to transform.</param>
/// <param name="termLoaderArgs">
/// Optional term-loading arguments. When supplied, the key mapping takes its terms from these
/// arguments (with no cap below <see cref="int.MaxValue"/>), and if <c>DropUnknowns</c> is set,
/// missing values are dropped from the key-mapped columns.
/// </param>
/// <returns>
/// The fitted n-gram transform applied to the (possibly key-mapped) view, cast with <c>as</c>
/// to <see cref="IDataTransform"/>.
/// </returns>
internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input, TermLoaderArguments termLoaderArgs = null)
{
    Contracts.CheckValue(env, nameof(env));
    var host = env.Register(LoaderSignature);
    host.CheckValue(options, nameof(options));
    host.CheckValue(input, nameof(input));
    host.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

    int columnCount = options.Columns.Length;
    var textColumns = new List<Column>();
    var isTextColumn = new bool[columnCount];

    // Validate every column and remember which ones have a text-typed source.
    for (int index = 0; index < columnCount; index++)
    {
        var current = options.Columns[index];
        host.CheckNonWhiteSpace(current.Name, nameof(current.Name));
        host.CheckNonWhiteSpace(current.Source, nameof(current.Source));

        if (!input.Schema.TryGetColumnIndex(current.Source, out int sourceIndex))
        {
            continue;
        }

        if (input.Schema[sourceIndex].Type.GetItemType() is TextType)
        {
            textColumns.Add(current);
            isTextColumn[index] = true;
        }
    }

    IDataView view = input;

    // Text columns go through the term transform so they become keys; everything else skips
    // it and goes straight to the n-gram transform. The n-gram transform itself validates
    // column types, so inputs that are neither text nor keys are rejected there.
    if (textColumns.Count > 0)
    {
        var keyColumns = new ValueToKeyMappingTransformer.Column[textColumns.Count];
        for (int k = 0; k < textColumns.Count; k++)
        {
            var textColumn = textColumns[k];
            keyColumns[k] = new ValueToKeyMappingTransformer.Column()
            {
                Name = textColumn.Name,
                Source = textColumn.Source,
                // Only the first per-column limit is used; no limit is set when none is given.
                MaxNumTerms = Utils.Size(textColumn.MaxNumTerms) > 0 ? textColumn.MaxNumTerms[0] : default(int?)
            };
        }

        ValueToKeyMappingTransformer.Options keyOptions;
        bool dropMissing = false;
        if (termLoaderArgs == null)
        {
            // No explicit term source: learn the terms from the data, capped by the first
            // configured limit or the estimator default.
            keyOptions = new ValueToKeyMappingTransformer.Options()
            {
                MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaxNumTerms,
                Columns = keyColumns
            };
        }
        else
        {
            keyOptions = new ValueToKeyMappingTransformer.Options()
            {
                MaxNumTerms = int.MaxValue,
                Term = termLoaderArgs.Term,
                Terms = termLoaderArgs.Terms,
                DataFile = termLoaderArgs.DataFile,
                Loader = termLoaderArgs.Loader,
                TermsColumn = termLoaderArgs.TermsColumn,
                Sort = termLoaderArgs.Sort,
                Columns = keyColumns
            };
            dropMissing = termLoaderArgs.DropUnknowns;
        }

        view = ValueToKeyMappingTransformer.Create(host, keyOptions, view);

        if (dropMissing)
        {
            // Drop missing values in place: each key-mapped column is both input and output.
            var dropPairs = textColumns.Select(c => c.Name).Select(x => (x, x)).ToArray();
            view = new MissingValueDroppingTransformer(host, dropPairs).Transform(view);
        }
    }

    var ngramColumns = new NgramExtractingEstimator.ColumnInfo[columnCount];
    for (int index = 0; index < columnCount; index++)
    {
        var current = options.Columns[index];

        // A key-mapped text column now lives under its output name; other columns are still
        // read from their original source.
        var inputName = isTextColumn[index] ? current.Name : current.Source;

        ngramColumns[index] = new NgramExtractingEstimator.ColumnInfo(
            current.Name,
            current.NgramLength ?? options.NgramLength,
            current.SkipLength ?? options.SkipLength,
            current.AllLengths ?? options.AllLengths,
            current.Weighting ?? options.Weighting,
            current.MaxNumTerms ?? options.MaxNumTerms,
            inputName);
    }

    var estimator = new NgramExtractingEstimator(env, ngramColumns);
    return estimator.Fit(view).Transform(view) as IDataTransform;
}
/// <summary>
/// Entry point that runs <see cref="ValueToKeyMappingTransformer"/> over <c>input.Data</c>.
/// </summary>
/// <param name="env">Host environment passed to <c>EntryPointUtils.CheckArgsAndCreateHost</c>.</param>
/// <param name="input">Value-to-key mapping options, including the data view to transform.</param>
/// <returns>
/// A <see cref="CommonOutputs.TransformOutput"/> whose <c>OutputData</c> is the created transform
/// and whose <c>Model</c> wraps that transform together with the original input data.
/// </returns>
public static CommonOutputs.TransformOutput TermTransform(IHostEnvironment env, ValueToKeyMappingTransformer.Options input)
{
    // Argument checking and host creation are delegated to the shared entry-point helper.
    var host = EntryPointUtils.CheckArgsAndCreateHost(env, "TermTransform", input);

    var keyTransform = ValueToKeyMappingTransformer.Create(host, input, input.Data);

    return new CommonOutputs.TransformOutput()
    {
        Model = new TransformModelImpl(host, keyTransform, input.Data),
        OutputData = keyTransform
    };
}