示例#1
0
        public static CommonOutputs.TransformOutput TextToKey(IHostEnvironment env, ValueToKeyMappingTransformer.Options input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("Term");

            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);

            var xf = ValueToKeyMappingTransformer.Create(host, input, input.Data);

            return(new CommonOutputs.TransformOutput {
                Model = new TransformModelImpl(env, xf, input.Data), OutputData = xf
            });
        }
        public static CommonOutputs.TransformOutput PrepareClassificationLabel(IHostEnvironment env, ClassificationLabelInput input)
        {
            Contracts.CheckValue(env, nameof(env));
            var host = env.Register("PrepareClassificationLabel");

            host.CheckValue(input, nameof(input));
            EntryPointUtils.CheckInputArgs(host, input);

            var labelCol = input.Data.Schema.GetColumnOrNull(input.LabelColumn);

            if (!labelCol.HasValue)
            {
                throw host.ExceptSchemaMismatch(nameof(input), "predicted label", input.LabelColumn);
            }

            var labelType = labelCol.Value.Type;

            if (labelType is KeyType || labelType is BooleanDataViewType)
            {
                var nop = NopTransform.CreateIfNeeded(env, input.Data);
                return(new CommonOutputs.TransformOutput {
                    Model = new TransformModelImpl(env, nop, input.Data), OutputData = nop
                });
            }

            var args = new ValueToKeyMappingTransformer.Options()
            {
                Columns = new[]
                {
                    new ValueToKeyMappingTransformer.Column()
                    {
                        Name          = input.LabelColumn,
                        Source        = input.LabelColumn,
                        TextKeyValues = input.TextKeyValues,
                        Sort          = ValueToKeyMappingEstimator.KeyOrdinality.ByValue
                    }
                }
            };
            var xf = ValueToKeyMappingTransformer.Create(host, args, input.Data);

            return(new CommonOutputs.TransformOutput {
                Model = new TransformModelImpl(env, xf, input.Data), OutputData = xf
            });
        }
示例#3
0
        internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input,
                                              TermLoaderArguments termLoaderArgs = null)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(LoaderSignature);

            h.CheckValue(options, nameof(options));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            // To each input column to the NgramHashExtractorArguments, a HashTransform using 31
            // bits (to minimize collisions) is applied first, followed by an NgramHashTransform.
            IDataView view = input;

            List <ValueToKeyMappingTransformer.Column> termCols = null;

            if (termLoaderArgs != null)
            {
                termCols = new List <ValueToKeyMappingTransformer.Column>();
            }

            var hashColumns      = new List <HashingEstimator.ColumnOptions>();
            var ngramHashColumns = new NgramHashingEstimator.ColumnOptions[options.Columns.Length];

            var colCount = options.Columns.Length;

            // The NGramHashExtractor has a ManyToOne column type. To avoid stepping over the source
            // column name when a 'name' destination column name was specified, we use temporary column names.
            string[][] tmpColNames = new string[colCount][];
            for (int iinfo = 0; iinfo < colCount; iinfo++)
            {
                var column = options.Columns[iinfo];
                h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
                h.CheckUserArg(Utils.Size(column.Source) > 0 &&
                               column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source));

                int srcCount = column.Source.Length;
                tmpColNames[iinfo] = new string[srcCount];
                for (int isrc = 0; isrc < srcCount; isrc++)
                {
                    var tmpName = input.Schema.GetTempColumnName(column.Source[isrc]);
                    tmpColNames[iinfo][isrc] = tmpName;
                    if (termLoaderArgs != null)
                    {
                        termCols.Add(
                            new ValueToKeyMappingTransformer.Column
                        {
                            Name   = tmpName,
                            Source = column.Source[isrc]
                        });
                    }

                    hashColumns.Add(new HashingEstimator.ColumnOptions(tmpName, termLoaderArgs == null ? column.Source[isrc] : tmpName,
                                                                       30, column.Seed ?? options.Seed, false, column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts));
                }

                ngramHashColumns[iinfo] =
                    new NgramHashingEstimator.ColumnOptions(column.Name, tmpColNames[iinfo],
                                                            column.NgramLength ?? options.NgramLength,
                                                            column.SkipLength ?? options.SkipLength,
                                                            column.UseAllLengths ?? options.UseAllLengths,
                                                            column.NumberOfBits ?? options.NumberOfBits,
                                                            column.Seed ?? options.Seed,
                                                            column.Ordered ?? options.Ordered,
                                                            column.MaximumNumberOfInverts ?? options.MaximumNumberOfInverts);
                ngramHashColumns[iinfo].FriendlyNames = column.FriendlyNames;
            }

            if (termLoaderArgs != null)
            {
                h.Assert(Utils.Size(termCols) == hashColumns.Count);
                var termArgs =
                    new ValueToKeyMappingTransformer.Options()
                {
                    MaxNumTerms = int.MaxValue,
                    Term        = termLoaderArgs.Term,
                    Terms       = termLoaderArgs.Terms,
                    DataFile    = termLoaderArgs.DataFile,
                    Loader      = termLoaderArgs.Loader,
                    TermsColumn = termLoaderArgs.TermsColumn,
                    Sort        = termLoaderArgs.Sort,
                    Columns     = termCols.ToArray()
                };
                view = ValueToKeyMappingTransformer.Create(h, termArgs, view);

                if (termLoaderArgs.DropUnknowns)
                {
                    var missingDropColumns = new (string outputColumnName, string inputColumnName)[termCols.Count];
示例#4
0
        internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input,
                                              TermLoaderArguments termLoaderArgs = null)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(LoaderSignature);

            h.CheckValue(options, nameof(options));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            IDataView view      = input;
            var       termCols  = new List <Column>();
            var       isTermCol = new bool[options.Columns.Length];

            for (int i = 0; i < options.Columns.Length; i++)
            {
                var col = options.Columns[i];

                h.CheckNonWhiteSpace(col.Name, nameof(col.Name));
                h.CheckNonWhiteSpace(col.Source, nameof(col.Source));
                int colId;
                if (input.Schema.TryGetColumnIndex(col.Source, out colId) &&
                    input.Schema[colId].Type.GetItemType() is TextType)
                {
                    termCols.Add(col);
                    isTermCol[i] = true;
                }
            }

            // If the column types of args.column are text, apply term transform to convert them to keys.
            // Otherwise, skip term transform and apply ngram transform directly.
            // This logic allows NgramExtractorTransform to handle both text and key input columns.
            // Note: ngram transform handles the validation of the types natively (in case the types
            // of args.column are not text nor keys).
            if (termCols.Count > 0)
            {
                ValueToKeyMappingTransformer.Options termArgs = null;
                string[] missingDropColumns = null;
                if (termLoaderArgs != null)
                {
                    termArgs =
                        new ValueToKeyMappingTransformer.Options()
                    {
                        MaxNumTerms = int.MaxValue,
                        Term        = termLoaderArgs.Term,
                        Terms       = termLoaderArgs.Terms,
                        DataFile    = termLoaderArgs.DataFile,
                        Loader      = termLoaderArgs.Loader,
                        TermsColumn = termLoaderArgs.TermsColumn,
                        Sort        = termLoaderArgs.Sort,
                        Columns     = new ValueToKeyMappingTransformer.Column[termCols.Count]
                    };
                    if (termLoaderArgs.DropUnknowns)
                    {
                        missingDropColumns = new string[termCols.Count];
                    }
                }
                else
                {
                    termArgs =
                        new ValueToKeyMappingTransformer.Options()
                    {
                        MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaxNumTerms,
                        Columns     = new ValueToKeyMappingTransformer.Column[termCols.Count]
                    };
                }

                for (int iinfo = 0; iinfo < termCols.Count; iinfo++)
                {
                    var column = termCols[iinfo];
                    termArgs.Columns[iinfo] =
                        new ValueToKeyMappingTransformer.Column()
                    {
                        Name        = column.Name,
                        Source      = column.Source,
                        MaxNumTerms = Utils.Size(column.MaxNumTerms) > 0 ? column.MaxNumTerms[0] : default(int?)
                    };

                    if (missingDropColumns != null)
                    {
                        missingDropColumns[iinfo] = column.Name;
                    }
                }

                view = ValueToKeyMappingTransformer.Create(h, termArgs, view);
                if (missingDropColumns != null)
                {
                    view = new MissingValueDroppingTransformer(h, missingDropColumns.Select(x => (x, x)).ToArray()).Transform(view);
                }
            }

            var ngramColumns = new NgramExtractingEstimator.ColumnInfo[options.Columns.Length];

            for (int iinfo = 0; iinfo < options.Columns.Length; iinfo++)
            {
                var column = options.Columns[iinfo];
                ngramColumns[iinfo] = new NgramExtractingEstimator.ColumnInfo(column.Name,
                                                                              column.NgramLength ?? options.NgramLength,
                                                                              column.SkipLength ?? options.SkipLength,
                                                                              column.AllLengths ?? options.AllLengths,
                                                                              column.Weighting ?? options.Weighting,
                                                                              column.MaxNumTerms ?? options.MaxNumTerms,
                                                                              isTermCol[iinfo] ? column.Name : column.Source
                                                                              );
            }

            return(new NgramExtractingEstimator(env, ngramColumns).Fit(view).Transform(view) as IDataTransform);
        }
示例#5
0
        public static CommonOutputs.TransformOutput TermTransform(IHostEnvironment env, ValueToKeyMappingTransformer.Options input)
        {
            var h  = EntryPointUtils.CheckArgsAndCreateHost(env, "TermTransform", input);
            var xf = ValueToKeyMappingTransformer.Create(h, input, input.Data);

            return(new CommonOutputs.TransformOutput()
            {
                Model = new TransformModelImpl(h, xf, input.Data),
                OutputData = xf
            });
        }