Ejemplo n.º 1
0
        /// <summary>
        /// Loads <paramref name="path"/> with the columns produced from <paramref name="typeInference"/>,
        /// infers a purpose for each column, and returns the resulting text loader options together
        /// with the column information built from those purposes.
        /// </summary>
        /// <param name="context">ML context used to create the text loader and run the inference steps.</param>
        /// <param name="path">Path of the file to load.</param>
        /// <param name="columnInfo">User-specified column information; its columns are validated against the loaded data.</param>
        /// <param name="hasHeader">Forwarded to the loader options as <c>HasHeader</c>.</param>
        /// <param name="splitInference">Supplies separator, sparse, quoting and multiline settings for the loader.</param>
        /// <param name="typeInference">Supplies the inferred columns used to generate loader columns.</param>
        /// <param name="trimWhitespace">Forwarded to the loader options as <c>TrimWhitespace</c>.</param>
        /// <param name="groupColumns">When true, columns are grouped and named via <c>ColumnGroupingInference</c>.</param>
        /// <returns>The final loader options and the column information built from the inferred purposes.</returns>
        public static ColumnInferenceResults InferColumns(MLContext context, string path, ColumnInformation columnInfo, bool hasHeader,
                                                          TextFileContents.ColumnSplitResult splitInference, ColumnTypeInference.InferenceResult typeInference,
                                                          bool trimWhitespace, bool groupColumns)
        {
            // First pass: load the file with the ungrouped, type-inferred columns so that
            // purposes can be inferred from an actual data view.
            var inferredColumns = ColumnTypeInference.GenerateLoaderColumns(typeInference.Columns);

            var probeOptions = new TextLoader.Options();
            probeOptions.Columns        = inferredColumns;
            probeOptions.Separators     = new[] { splitInference.Separator.Value };
            probeOptions.AllowSparse    = splitInference.AllowSparse;
            probeOptions.AllowQuoting   = splitInference.AllowQuote;
            probeOptions.ReadMultilines = splitInference.ReadMultilines;
            probeOptions.HasHeader      = hasHeader;
            probeOptions.TrimWhitespace = trimWhitespace;

            var probeView = context.Data.CreateTextLoader(probeOptions).Load(path);

            // Every column named in columnInfo must exist in the inferred data view.
            ColumnInferenceValidationUtil.ValidateSpecifiedColumnsExist(columnInfo, probeView);

            var inferredPurposes = PurposeInference.InferPurposes(context, probeView, columnInfo);

            // Decide which loader columns and (name, purpose) pairs make up the result.
            IEnumerable<TextLoader.Column> finalColumns;
            IEnumerable<(string, ColumnPurpose)> namedPurposes;

            if (!groupColumns)
            {
                // No grouping: keep the raw loader columns and take names from the loaded schema.
                finalColumns  = inferredColumns;
                namedPurposes = inferredPurposes.Select(p => (probeView.Schema[p.ColumnIndex].Name, p.Purpose));
            }
            else
            {
                // Grouping: let the grouping inference produce both the columns and their suggested names.
                var groups = ColumnGroupingInference.InferGroupingAndNames(context, hasHeader,
                                                                           typeInference.Columns, inferredPurposes);

                finalColumns  = groups.Select(g => g.GenerateTextLoaderColumn());
                namedPurposes = groups.Select(g => (g.SuggestedName, g.Purpose));
            }

            // Second options object: same file-format settings, but with the final column set.
            var resultOptions = new TextLoader.Options();
            resultOptions.Columns        = finalColumns.ToArray();
            resultOptions.AllowQuoting   = splitInference.AllowQuote;
            resultOptions.AllowSparse    = splitInference.AllowSparse;
            resultOptions.Separators     = new char[] { splitInference.Separator.Value };
            resultOptions.ReadMultilines = splitInference.ReadMultilines;
            resultOptions.HasHeader      = hasHeader;
            resultOptions.TrimWhitespace = trimWhitespace;

            var inferenceResults = new ColumnInferenceResults();
            inferenceResults.TextLoaderOptions = resultOptions;
            inferenceResults.ColumnInformation = ColumnInformationUtil.BuildColumnInfo(namedPurposes);
            return inferenceResults;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds one <see cref="DatasetColumnInfo"/> per schema column of <paramref name="data"/>,
        /// combining the column's name and type with its inferred purpose and calculated dimensions.
        /// </summary>
        /// <param name="context">ML context passed to the purpose and dimension calculations.</param>
        /// <param name="data">Data view whose schema columns are described.</param>
        /// <param name="columnInfo">Column information passed to purpose inference.</param>
        /// <returns>An array with one entry per schema column, in schema order.</returns>
        public static DatasetColumnInfo[] GetDatasetColumnInfo(MLContext context, IDataView data, ColumnInformation columnInfo)
        {
            var inferredPurposes = PurposeInference.InferPurposes(context, data, columnInfo);
            var dimensions       = DatasetDimensionsApi.CalcColumnDimensions(context, data, inferredPurposes);

            var columnCount = data.Schema.Count;
            var result      = new DatasetColumnInfo[columnCount];

            // NOTE(review): purposes and dimensions are indexed by schema position, which assumes
            // both collections contain one entry per schema column in schema order.
            for (var index = 0; index < columnCount; ++index)
            {
                var column = data.Schema[index];
                result[index] = new DatasetColumnInfo(column.Name, column.Type, inferredPurposes[index].Purpose, dimensions[index]);
            }

            return result;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Builds a single featurization pipeline driven by <paramref name="columnInformation"/>. Column purposes are
        /// inferred from <paramref name="data"/>; numeric columns go through <see cref="NumericFeaturizer(string[], string[])"/>,
        /// categorical columns through <see cref="CatalogFeaturizer(string[], string[])"/>, and each text column through
        /// <see cref="TextFeaturizer(string, string)"/>. The featurized columns are then concatenated into one output column.
        /// </summary>
        /// <param name="data">Input data whose schema supplies the column names.</param>
        /// <param name="columnInformation">Column information used for purpose inference.</param>
        /// <param name="outputColumnName">Name of the concatenated feature column.</param>
        /// <returns>A <see cref="MultiModelPipeline"/> for featurization.</returns>
        public MultiModelPipeline Featurizer(IDataView data, ColumnInformation columnInformation, string outputColumnName = "Features")
        {
            Contracts.CheckValue(data, nameof(data));
            Contracts.CheckValue(columnInformation, nameof(columnInformation));

            var purposes = PurposeInference.InferPurposes(this._context, data, columnInformation);

            // Resolve the schema names of the columns for each purpose of interest.
            var textNames = purposes
                .Where(p => p.Purpose == ColumnPurpose.TextFeature)
                .Select(p => data.Schema[p.ColumnIndex].Name)
                .ToArray();
            var numericNames = purposes
                .Where(p => p.Purpose == ColumnPurpose.NumericFeature)
                .Select(p => data.Schema[p.ColumnIndex].Name)
                .ToArray();
            var catalogNames = purposes
                .Where(p => p.Purpose == ColumnPurpose.CategoricalFeature)
                .Select(p => data.Schema[p.ColumnIndex].Name)
                .ToArray();

            var result = new MultiModelPipeline();

            // Numeric and categorical columns are featurized as a batch, each written back to its own name.
            if (numericNames.Length != 0)
            {
                result = result.Append(this.NumericFeaturizer(numericNames, numericNames));
            }

            if (catalogNames.Length != 0)
            {
                result = result.Append(this.CatalogFeaturizer(catalogNames, catalogNames));
            }

            // Text columns are featurized one at a time, each written back to its own name.
            for (var i = 0; i < textNames.Length; i++)
            {
                var name = textNames[i];
                result = result.Append(this.TextFeaturizer(name, name));
            }

            // Concatenation order: text, then numeric, then categorical.
            var option = new ConcatOption();
            option.InputColumnNames = textNames.Concat(numericNames).Concat(catalogNames).ToArray();
            option.OutputColumnName = outputColumnName;

            // Skip the concatenate step entirely when no feature columns were found.
            if (option.InputColumnNames.Length != 0)
            {
                result = result.Append(SweepableEstimatorFactory.CreateConcatenate(option));
            }

            return result;
        }