/// <summary>
/// Loads the file at <paramref name="path"/> with the previously inferred split and type settings,
/// infers a purpose for every column, optionally groups columns, and packages the resulting
/// loader options together with the derived column information.
/// </summary>
/// <param name="context">ML context used to create the text loader and to run purpose/grouping inference.</param>
/// <param name="path">Path handed to the text loader.</param>
/// <param name="columnInfo">User-specified column information; validated against the loaded data view.</param>
/// <param name="hasHeader">Forwarded to the loader options and to grouping inference.</param>
/// <param name="splitInference">Supplies the separator and the sparse/quoting/multiline loader settings.</param>
/// <param name="typeInference">Its <c>Columns</c> are turned into loader columns and fed to grouping inference.</param>
/// <param name="trimWhitespace">Forwarded to the loader options.</param>
/// <param name="groupColumns">When true, columns and names come from grouping inference; otherwise the ungrouped loader columns and schema names are used.</param>
/// <returns>The final text loader options and the column information built from the inferred purposes.</returns>
public static ColumnInferenceResults InferColumns(
    MLContext context,
    string path,
    ColumnInformation columnInfo,
    bool hasHeader,
    TextFileContents.ColumnSplitResult splitInference,
    ColumnTypeInference.InferenceResult typeInference,
    bool trimWhitespace,
    bool groupColumns)
{
    // Both the intermediate loader and the returned options share every setting except the column list.
    // NOTE(review): Separator.Value is read unconditionally; presumably the caller guarantees a separator was detected - confirm.
    TextLoader.Options BuildLoaderOptions(TextLoader.Column[] columns)
    {
        return new TextLoader.Options
        {
            Columns = columns,
            Separators = new[] { splitInference.Separator.Value },
            AllowSparse = splitInference.AllowSparse,
            AllowQuoting = splitInference.AllowQuote,
            ReadMultilines = splitInference.ReadMultilines,
            HasHeader = hasHeader,
            TrimWhitespace = trimWhitespace
        };
    }

    var ungroupedColumns = ColumnTypeInference.GenerateLoaderColumns(typeInference.Columns);
    var loadedData = context.Data.CreateTextLoader(BuildLoaderOptions(ungroupedColumns)).Load(path);

    // Every column named in columnInfo must exist in the inferred data view.
    ColumnInferenceValidationUtil.ValidateSpecifiedColumnsExist(columnInfo, loadedData);

    var inferredPurposes = PurposeInference.InferPurposes(context, loadedData, columnInfo);

    IEnumerable<TextLoader.Column> finalColumns;
    IEnumerable<(string, ColumnPurpose)> finalPurposes;

    if (!groupColumns)
    {
        // No grouping: keep the loader columns as-is and name each purpose after its schema column.
        finalColumns = ungroupedColumns;
        finalPurposes = inferredPurposes.Select(purpose => (loadedData.Schema[purpose.ColumnIndex].Name, purpose.Purpose));
    }
    else
    {
        // Grouping: columns and suggested names both come from the grouping inference.
        var grouped = ColumnGroupingInference.InferGroupingAndNames(context, hasHeader, typeInference.Columns, inferredPurposes);
        finalColumns = grouped.Select(group => group.GenerateTextLoaderColumn());
        finalPurposes = grouped.Select(group => (group.SuggestedName, group.Purpose));
    }

    return new ColumnInferenceResults
    {
        TextLoaderOptions = BuildLoaderOptions(finalColumns.ToArray()),
        ColumnInformation = ColumnInformationUtil.BuildColumnInfo(finalPurposes)
    };
}
/// <summary>
/// Builds one <see cref="DatasetColumnInfo"/> per column of <paramref name="data"/>'s schema,
/// combining the schema name and type with the inferred purpose and the computed column dimensions.
/// </summary>
/// <param name="context">ML context passed to purpose inference and the dimension calculation.</param>
/// <param name="data">Data view whose schema columns are described.</param>
/// <param name="columnInfo">Column information passed to purpose inference.</param>
/// <returns>An array with one entry per schema column, in schema order.</returns>
public static DatasetColumnInfo[] GetDatasetColumnInfo(MLContext context, IDataView data, ColumnInformation columnInfo)
{
    var inferredPurposes = PurposeInference.InferPurposes(context, data, columnInfo);
    var dimensions = DatasetDimensionsApi.CalcColumnDimensions(context, data, inferredPurposes);

    var columnInfos = new DatasetColumnInfo[data.Schema.Count];

    // NOTE(review): purposes and dimensions are indexed by schema position, so both are
    // assumed to hold one entry per schema column in schema order - confirm against the callees.
    for (var index = 0; index < columnInfos.Length; index++)
    {
        var column = data.Schema[index];
        columnInfos[index] = new DatasetColumnInfo(
            column.Name,
            column.Type,
            inferredPurposes[index].Purpose,
            dimensions[index]);
    }

    return columnInfos;
}
/// <summary>
/// Creates a single featurization pipeline driven by <paramref name="columnInformation"/>. Column purposes are
/// inferred for <paramref name="data"/>; numeric, categorical and text feature columns are then featurized with
/// <see cref="NumericFeaturizer(string[], string[])"/>, <see cref="CatalogFeaturizer(string[], string[])"/> and
/// <see cref="TextFeaturizer(string, string)"/> respectively, and the featurized columns are concatenated into
/// <paramref name="outputColumnName"/>.
/// </summary>
/// <param name="data">Input data; must not be null.</param>
/// <param name="columnInformation">Column information used for purpose inference; must not be null.</param>
/// <param name="outputColumnName">Name of the concatenated output feature column.</param>
/// <returns>A <see cref="MultiModelPipeline"/> for featurization; it has no stages appended when no feature columns are found.</returns>
public MultiModelPipeline Featurizer(IDataView data, ColumnInformation columnInformation, string outputColumnName = "Features")
{
    Contracts.CheckValue(data, nameof(data));
    Contracts.CheckValue(columnInformation, nameof(columnInformation));

    var inferredPurposes = PurposeInference.InferPurposes(this._context, data, columnInformation);

    // Schema names of every column that was assigned the given purpose.
    string[] ColumnNamesFor(ColumnPurpose purpose)
    {
        return inferredPurposes
            .Where(column => column.Purpose == purpose)
            .Select(column => data.Schema[column.ColumnIndex].Name)
            .ToArray();
    }

    var textColumns = ColumnNamesFor(ColumnPurpose.TextFeature);
    var numericColumns = ColumnNamesFor(ColumnPurpose.NumericFeature);
    var catalogColumns = ColumnNamesFor(ColumnPurpose.CategoricalFeature);

    // Each featurizer writes back to the column(s) it reads, so the concat below can reuse the same names.
    var result = new MultiModelPipeline();

    if (numericColumns.Length > 0)
    {
        result = result.Append(this.NumericFeaturizer(numericColumns, numericColumns));
    }

    if (catalogColumns.Length > 0)
    {
        result = result.Append(this.CatalogFeaturizer(catalogColumns, catalogColumns));
    }

    // Text columns are featurized one at a time.
    foreach (var textColumn in textColumns)
    {
        result = result.Append(this.TextFeaturizer(textColumn, textColumn));
    }

    var concatOption = new ConcatOption
    {
        InputColumnNames = textColumns.Concat(numericColumns).Concat(catalogColumns).ToArray(),
        OutputColumnName = outputColumnName,
    };

    // Only append the concatenation stage when there is something to concatenate.
    if (concatOption.InputColumnNames.Length > 0)
    {
        result = result.Append(SweepableEstimatorFactory.CreateConcatenate(concatOption));
    }

    return result;
}