/// <summary>
/// Initializes the transform info from the given column options: copies the n-gram length,
/// skip length and weighting, and allocates one <c>NonEmptyLevels</c> flag per n-gram length
/// (every flag starts out <c>false</c>).
/// </summary>
/// <param name="info">Column options the settings are copied from.</param>
public TransformInfo(NgramExtractingEstimator.ColumnOptions info)
{
    // Plain copies of the per-column settings.
    Weighting   = info.Weighting;
    SkipLength  = info.SkipLength;
    NgramLength = info.NgramLength;

    // Sized from the n-gram length assigned just above.
    NonEmptyLevels = new bool[NgramLength];
}
Example #2
0
        /// <summary>
        /// Builds the n-gram extraction data transform over <paramref name="input"/>.
        /// Columns whose source item type is text are first mapped to keys with a
        /// <see cref="ValueToKeyMappingTransformer"/> (optionally followed by a
        /// <see cref="MissingValueDroppingTransformer"/>); every other column is passed to the
        /// n-gram extractor as-is.
        /// </summary>
        /// <param name="env">Host environment; checked for null.</param>
        /// <param name="options">Transform options; checked for null and must specify at least one column.</param>
        /// <param name="input">Input data view; checked for null.</param>
        /// <param name="termLoaderArgs">
        /// Optional term loading settings. When supplied, its Term, Terms, DataFile, Loader,
        /// TermsColumn and Sort values configure the key mapping, the overall term limit is
        /// <see cref="int.MaxValue"/>, and DropUnknowns requests dropping of missing values from the
        /// key-mapped columns. When null, the term limit is the first entry of
        /// <c>options.MaxNumTerms</c> if present, otherwise
        /// <c>NgramExtractingEstimator.Defaults.MaximumNgramsCount</c>.
        /// </param>
        /// <returns>
        /// The n-gram transformer, fitted on and applied to the (possibly key-mapped) input,
        /// as an <see cref="IDataTransform"/>.
        /// </returns>
        internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input,
                                              TermLoaderArguments termLoaderArgs = null)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(LoaderSignature);

            h.CheckValue(options, nameof(options));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            IDataView view = input;
            var termCols = new List<Column>();
            // isTermCol[i] is true when options.Columns[i] has a text source and is therefore key-mapped first.
            var isTermCol = new bool[options.Columns.Length];

            for (int i = 0; i < options.Columns.Length; i++)
            {
                var col = options.Columns[i];

                h.CheckNonWhiteSpace(col.Name, nameof(col.Name));
                h.CheckNonWhiteSpace(col.Source, nameof(col.Source));
                if (input.Schema.TryGetColumnIndex(col.Source, out int colId) &&
                    input.Schema[colId].Type.GetItemType() is TextDataViewType)
                {
                    termCols.Add(col);
                    isTermCol[i] = true;
                }
            }

            // If the column types of args.column are text, apply term transform to convert them to keys.
            // Otherwise, skip term transform and apply ngram transform directly.
            // This logic allows NgramExtractorTransform to handle both text and key input columns.
            // Note: ngram transform handles the validation of the types natively (in case the types
            // of args.column are not text nor keys).
            if (termCols.Count > 0)
            {
                ValueToKeyMappingTransformer.Options termArgs;
                // Stays null unless the caller asked for unknown terms to be dropped.
                string[] missingDropColumns = null;
                if (termLoaderArgs != null)
                {
                    termArgs = new ValueToKeyMappingTransformer.Options()
                    {
                        MaxNumTerms = int.MaxValue,
                        Term        = termLoaderArgs.Term,
                        Terms       = termLoaderArgs.Terms,
                        DataFile    = termLoaderArgs.DataFile,
                        Loader      = termLoaderArgs.Loader,
                        TermsColumn = termLoaderArgs.TermsColumn,
                        Sort        = termLoaderArgs.Sort,
                        Columns     = new ValueToKeyMappingTransformer.Column[termCols.Count]
                    };
                    if (termLoaderArgs.DropUnknowns)
                    {
                        missingDropColumns = new string[termCols.Count];
                    }
                }
                else
                {
                    termArgs = new ValueToKeyMappingTransformer.Options()
                    {
                        MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaximumNgramsCount,
                        Columns     = new ValueToKeyMappingTransformer.Column[termCols.Count]
                    };
                }

                for (int iinfo = 0; iinfo < termCols.Count; iinfo++)
                {
                    var column = termCols[iinfo];
                    termArgs.Columns[iinfo] = new ValueToKeyMappingTransformer.Column()
                    {
                        Name        = column.Name,
                        Source      = column.Source,
                        // A per-column limit, when given, is passed along; otherwise left unset (null).
                        MaxNumTerms = Utils.Size(column.MaxNumTerms) > 0 ? column.MaxNumTerms[0] : default(int?)
                    };

                    if (missingDropColumns != null)
                    {
                        missingDropColumns[iinfo] = column.Name;
                    }
                }

                view = ValueToKeyMappingTransformer.Create(h, termArgs, view);
                if (missingDropColumns != null)
                {
                    // Missing values are dropped in place: each column is both input and output.
                    view = new MissingValueDroppingTransformer(h, missingDropColumns.Select(x => (x, x)).ToArray()).Transform(view);
                }
            }

            var ngramColumns = new NgramExtractingEstimator.ColumnOptions[options.Columns.Length];

            for (int iinfo = 0; iinfo < options.Columns.Length; iinfo++)
            {
                var column = options.Columns[iinfo];
                // Per-column settings win over the transform-wide ones. Key-mapped columns were written
                // to column.Name above, so the n-gram extractor reads from there instead of column.Source.
                ngramColumns[iinfo] = new NgramExtractingEstimator.ColumnOptions(column.Name,
                                                                                 column.NgramLength ?? options.NgramLength,
                                                                                 column.SkipLength ?? options.SkipLength,
                                                                                 column.UseAllLengths ?? options.UseAllLengths,
                                                                                 column.Weighting ?? options.Weighting,
                                                                                 column.MaxNumTerms ?? options.MaxNumTerms,
                                                                                 isTermCol[iinfo] ? column.Name : column.Source
                                                                                 );
            }

            var ngramTransformer = new NgramExtractingEstimator(env, ngramColumns).Fit(view);

            // Direct cast instead of `as`: should the transformed view ever not implement IDataTransform,
            // this fails right here with an InvalidCastException rather than returning null to the caller.
            return (IDataTransform)ngramTransformer.Transform(view);
        }
        /// <summary>
        /// Builds the estimator chain for n-gram extraction against <paramref name="inputSchema"/>.
        /// Columns whose source item type is text get a <see cref="ValueToKeyMappingEstimator"/>
        /// (optionally followed by a <see cref="MissingValueDroppingEstimator"/>) ahead of the
        /// <see cref="NgramExtractingEstimator"/>; every other column goes to the n-gram estimator as-is.
        /// </summary>
        /// <param name="env">Host environment; checked for null.</param>
        /// <param name="options">Transform options; checked for null and must specify at least one column.</param>
        /// <param name="inputSchema">Schema shape used to find out which source columns are text; checked for null.</param>
        /// <param name="termLoaderArgs">
        /// Optional term loading settings. When supplied, its Term/Terms are set on every key-mapping
        /// column, Sort selects the key ordinality, DataFile (if not null) is loaded as key data, and
        /// DropUnknowns requests dropping of missing values from the key-mapped columns.
        /// </param>
        /// <returns>The estimator chain ending with the n-gram extracting estimator.</returns>
        internal static IEstimator<ITransformer> CreateEstimator(IHostEnvironment env, Options options, SchemaShape inputSchema, TermLoaderArguments termLoaderArgs = null)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(LoaderSignature);

            h.CheckValue(options, nameof(options));
            // Validated the same way Create validates its input, so a null schema is reported up front
            // instead of surfacing as a NullReferenceException in the column loop below.
            h.CheckValue(inputSchema, nameof(inputSchema));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            var chain = new EstimatorChain<ITransformer>();

            var termCols = new List<Column>();
            // isTermCol[i] is true when options.Columns[i] has a text source and is therefore key-mapped first.
            var isTermCol = new bool[options.Columns.Length];

            for (int i = 0; i < options.Columns.Length; i++)
            {
                var col = options.Columns[i];

                h.CheckNonWhiteSpace(col.Name, nameof(col.Name));
                h.CheckNonWhiteSpace(col.Source, nameof(col.Source));
                if (inputSchema.TryFindColumn(col.Source, out var colShape) &&
                    colShape.ItemType is TextDataViewType)
                {
                    termCols.Add(col);
                    isTermCol[i] = true;
                }
            }

            // If the column types of args.column are text, apply term transform to convert them to keys.
            // Otherwise, skip term transform and apply n-gram transform directly.
            // This logic allows NgramExtractorTransform to handle both text and key input columns.
            // Note: n-gram transform handles the validation of the types natively (in case the types
            // of args.column are not text nor keys).
            if (termCols.Count > 0)
            {
                var columnOptions = new List<ValueToKeyMappingEstimator.ColumnOptionsBase>();
                // Stays null unless the caller asked for unknown terms to be dropped.
                string[] missingDropColumns = termLoaderArgs != null && termLoaderArgs.DropUnknowns ? new string[termCols.Count] : null;

                // Key limit used for columns without their own MaxNumTerms: the transform-wide value if
                // given, otherwise the n-gram default (no term loader) or no limit (term loader supplied).
                // It does not depend on the column, so it is computed once here.
                int fallbackMaxKeys =
                    Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] :
                    termLoaderArgs == null ? NgramExtractingEstimator.Defaults.MaximumNgramsCount : int.MaxValue;

                for (int iinfo = 0; iinfo < termCols.Count; iinfo++)
                {
                    var column = termCols[iinfo];
                    var colOptions = new ValueToKeyMappingEstimator.ColumnOptions(
                        column.Name,
                        column.Source,
                        maximumNumberOfKeys: Utils.Size(column.MaxNumTerms) > 0 ? column.MaxNumTerms[0] : fallbackMaxKeys,
                        keyOrdinality: termLoaderArgs?.Sort ?? ValueToKeyMappingEstimator.KeyOrdinality.ByOccurrence);
                    if (termLoaderArgs != null)
                    {
                        colOptions.Key  = termLoaderArgs.Term;
                        colOptions.Keys = termLoaderArgs.Terms;
                    }
                    columnOptions.Add(colOptions);

                    if (missingDropColumns != null)
                    {
                        missingDropColumns[iinfo] = column.Name;
                    }
                }

                IDataView keyData = null;
                if (termLoaderArgs?.DataFile != null)
                {
                    using (var ch = env.Start("Create key data view"))
                    {
                        // The trailing out value is not needed here, hence the discard.
                        keyData = ValueToKeyMappingTransformer.GetKeyDataViewOrNull(env, ch, termLoaderArgs.DataFile, termLoaderArgs.TermsColumn, termLoaderArgs.Loader, out _);
                    }
                }
                chain = chain.Append<ITransformer>(new ValueToKeyMappingEstimator(h, columnOptions.ToArray(), keyData));
                if (missingDropColumns != null)
                {
                    // Missing values are dropped in place: each column is both input and output.
                    chain = chain.Append<ITransformer>(new MissingValueDroppingEstimator(h, missingDropColumns.Select(x => (x, x)).ToArray()));
                }
            }

            var ngramColumns = new NgramExtractingEstimator.ColumnOptions[options.Columns.Length];

            for (int iinfo = 0; iinfo < options.Columns.Length; iinfo++)
            {
                var column = options.Columns[iinfo];
                // Per-column settings win over the transform-wide ones. Key-mapped columns are written
                // to column.Name by the estimator above, so the n-gram estimator reads from there
                // instead of column.Source.
                ngramColumns[iinfo] = new NgramExtractingEstimator.ColumnOptions(column.Name,
                                                                                 column.NgramLength ?? options.NgramLength,
                                                                                 column.SkipLength ?? options.SkipLength,
                                                                                 column.UseAllLengths ?? options.UseAllLengths,
                                                                                 column.Weighting ?? options.Weighting,
                                                                                 column.MaxNumTerms ?? options.MaxNumTerms,
                                                                                 isTermCol[iinfo] ? column.Name : column.Source
                                                                                 );
            }

            return chain.Append<ITransformer>(new NgramExtractingEstimator(env, ngramColumns));
        }