Example #1
0
        /// <summary>
        /// Loads <paramref name="path"/> using the already-inferred split and type settings, infers a
        /// purpose for every column, and returns the final text loader options together with the
        /// column information built from those purposes.
        /// </summary>
        /// <param name="context">ML context used to create the text loader; also passed to the inference helpers.</param>
        /// <param name="path">Path handed to the text loader.</param>
        /// <param name="columnInfo">Caller-specified column information; checked against the loaded data view and passed to purpose inference.</param>
        /// <param name="hasHeader">Forwarded to the loader options and to column grouping.</param>
        /// <param name="splitInference">Supplies the separator, sparse, quoting and multiline settings for the loader options.</param>
        /// <param name="typeInference">Supplies the inferred columns used to generate the loader columns and the grouping.</param>
        /// <param name="trimWhitespace">Forwarded to the loader options.</param>
        /// <param name="groupColumns">
        /// When true, columns and names come from <c>ColumnGroupingInference.InferGroupingAndNames</c>;
        /// otherwise the generated loader columns are used as-is and names are read from the data view schema.
        /// </param>
        /// <returns>The resulting loader options and column information.</returns>
        public static ColumnInferenceResults InferColumns(MLContext context, string path, ColumnInformation columnInfo, bool hasHeader,
                                                          TextFileContents.ColumnSplitResult splitInference, ColumnTypeInference.InferenceResult typeInference,
                                                          bool trimWhitespace, bool groupColumns)
        {
            // The intermediate loader and the returned options differ only in their column list,
            // so both are produced by this single local factory.
            TextLoader.Options CreateLoaderOptions(TextLoader.Column[] columns)
            {
                return new TextLoader.Options
                {
                    Columns        = columns,
                    Separators     = new char[] { splitInference.Separator.Value },
                    AllowSparse    = splitInference.AllowSparse,
                    AllowQuoting   = splitInference.AllowQuote,
                    ReadMultilines = splitInference.ReadMultilines,
                    HasHeader      = hasHeader,
                    TrimWhitespace = trimWhitespace
                };
            }

            // Load the data with the per-column types that were inferred earlier.
            var inferredColumns = ColumnTypeInference.GenerateLoaderColumns(typeInference.Columns);
            var initialOptions  = CreateLoaderOptions(inferredColumns);
            var loader          = context.Data.CreateTextLoader(initialOptions);
            var loadedData      = loader.Load(path);

            // Every column named in the column info must exist in the inferred data view.
            ColumnInferenceValidationUtil.ValidateSpecifiedColumnsExist(columnInfo, loadedData);

            var inferredPurposes = PurposeInference.InferPurposes(context, loadedData, columnInfo);

            IEnumerable<TextLoader.Column> finalColumns;
            IEnumerable<(string, ColumnPurpose)> namedPurposes;

            if (!groupColumns)
            {
                // No grouping requested: keep the generated columns and take names from the schema.
                finalColumns  = inferredColumns;
                namedPurposes = inferredPurposes.Select(purpose => (loadedData.Schema[purpose.ColumnIndex].Name, purpose.Purpose));
            }
            else
            {
                // Infer column grouping and generate column names.
                var grouped = ColumnGroupingInference.InferGroupingAndNames(context, hasHeader,
                                                                            typeInference.Columns, inferredPurposes);

                finalColumns  = grouped.Select(group => group.GenerateTextLoaderColumn());
                namedPurposes = grouped.Select(group => (group.SuggestedName, group.Purpose));
            }

            var finalOptions = CreateLoaderOptions(finalColumns.ToArray());

            return new ColumnInferenceResults
            {
                TextLoaderOptions = finalOptions,
                ColumnInformation = ColumnInformationUtil.BuildColumnInfo(namedPurposes)
            };
        }
        /// <summary>
        /// Runs column type inference over <paramref name="sample"/> using the settings from
        /// <paramref name="splitInference"/> and returns the result when inference succeeds.
        /// </summary>
        /// <param name="context">ML context passed through to the type inference routine.</param>
        /// <param name="sample">Text file sample whose column types are inferred.</param>
        /// <param name="splitInference">Supplies the column count, separator, sparse and quoting settings.</param>
        /// <param name="hasHeader">Forwarded to the type inference arguments.</param>
        /// <param name="labelColumnIndex">Forwarded to the type inference arguments.</param>
        /// <param name="label">Forwarded to the type inference arguments.</param>
        /// <returns>The successful type inference result.</returns>
        /// <exception cref="InferenceException">Thrown when the inference result reports failure.</exception>
        private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext context, TextFileSample sample,
                                                                            TextFileContents.ColumnSplitResult splitInference, bool hasHeader, uint? labelColumnIndex, string label)
        {
            // Collect the inference settings from the split result and the caller's arguments.
            var inferenceArgs = new ColumnTypeInference.Arguments
            {
                ColumnCount      = splitInference.ColumnCount,
                Separator        = splitInference.Separator.Value,
                AllowSparse      = splitInference.AllowSparse,
                AllowQuote       = splitInference.AllowQuote,
                HasHeader        = hasHeader,
                LabelColumnIndex = labelColumnIndex,
                Label            = label
            };

            var inferred = ColumnTypeInference.InferTextFileColumnTypes(context, sample, inferenceArgs);

            if (inferred.IsSuccess)
            {
                return inferred;
            }

            throw new InferenceException(InferenceExceptionType.ColumnDataType, "Unable to infer column types of the file provided.");
        }