public static (IDataView[] trainDatasets, IDataView[] validationDatasets) CrossValSplit(MLContext context,
    IDataView trainData, uint numFolds, string samplingKeyColumn)
{
    var originalColumnNames = trainData.Schema.Select(c => c.Name);
    var splits = context.Data.CrossValidationSplit(trainData, (int)numFolds, samplingKeyColumnName: samplingKeyColumn);
    var trainDatasets = new List<IDataView>();
    var validationDatasets = new List<IDataView>();

    foreach (var split in splits)
    {
        if (DatasetDimensionsUtil.IsDataViewEmpty(split.TrainSet) ||
            DatasetDimensionsUtil.IsDataViewEmpty(split.TestSet))
        {
            continue;
        }

        var trainDataset = DropAllColumnsExcept(context, split.TrainSet, originalColumnNames);
        var validationDataset = DropAllColumnsExcept(context, split.TestSet, originalColumnNames);
        trainDatasets.Add(trainDataset);
        validationDatasets.Add(validationDataset);
    }

    if (!trainDatasets.Any())
    {
        throw new InvalidOperationException("All cross validation folds have empty train or test data. " +
            "Try increasing the number of rows provided in training data, or lowering the specified number of " +
            "cross validation folds.");
    }

    return (trainDatasets.ToArray(), validationDatasets.ToArray());
}
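
// Hedged usage sketch, not part of the original SplitUtil class: shows how the fold pairs returned by
// CrossValSplit above could be fit and evaluated with an arbitrary pipeline. The pipeline, the label
// column name "Label", and this helper's name are illustrative assumptions, not existing APIs.
private static void EvaluateFoldsSketch(MLContext context, IDataView[] trainFolds, IDataView[] validationFolds,
    IEstimator<ITransformer> pipeline)
{
    for (var i = 0; i < trainFolds.Length; i++)
    {
        // Fit on the training fold, then score and evaluate on the matching validation fold.
        var model = pipeline.Fit(trainFolds[i]);
        var scored = model.Transform(validationFolds[i]);
        var metrics = context.BinaryClassification.Evaluate(scored, labelColumnName: "Label");
        Console.WriteLine($"Fold {i}: AUC = {metrics.AreaUnderRocCurve}");
    }
}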
private static void ValidateTrainData(IDataView trainData, ColumnInformation columnInformation)
{
    if (trainData == null)
    {
        throw new ArgumentNullException(nameof(trainData), "Training data cannot be null");
    }

    if (DatasetDimensionsUtil.IsDataViewEmpty(trainData))
    {
        throw new ArgumentException("Training data has 0 rows", nameof(trainData));
    }

    foreach (var column in trainData.Schema)
    {
        if (column.Name == DefaultColumnNames.Features && column.Type.GetItemType() != NumberDataViewType.Single)
        {
            throw new ArgumentException($"{DefaultColumnNames.Features} column must be of data type {NumberDataViewType.Single}", nameof(trainData));
        }

        if ((column.Name != columnInformation.LabelColumnName &&
                column.Name != columnInformation.UserIdColumnName &&
                column.Name != columnInformation.ItemIdColumnName &&
                column.Name != columnInformation.GroupIdColumnName) &&
            column.Type.GetItemType() != BooleanDataViewType.Instance &&
            column.Type.GetItemType() != NumberDataViewType.Single &&
            column.Type.GetItemType() != TextDataViewType.Instance)
        {
            throw new ArgumentException($"The only supported feature column types are " +
                $"{BooleanDataViewType.Instance}, {NumberDataViewType.Single}, and {TextDataViewType.Instance}. " +
                $"Please change the feature column {column.Name} of type {column.Type} to one of " +
                $"the supported types.", nameof(trainData));
        }
    }
}
private static void ValidateValidationData(IDataView trainData, IDataView validationData)
{
    if (validationData == null)
    {
        return;
    }

    if (DatasetDimensionsUtil.IsDataViewEmpty(validationData))
    {
        throw new ArgumentException("Validation data has 0 rows", nameof(validationData));
    }

    const string schemaMismatchError = "Training data and validation data schemas do not match.";

    if (trainData.Schema.Count != validationData.Schema.Count)
    {
        throw new ArgumentException($"{schemaMismatchError} Train data has '{trainData.Schema.Count}' columns, " +
            $"and validation data has '{validationData.Schema.Count}' columns.", nameof(validationData));
    }

    foreach (var trainCol in trainData.Schema)
    {
        var validCol = validationData.Schema.GetColumnOrNull(trainCol.Name);
        if (validCol == null)
        {
            throw new ArgumentException($"{schemaMismatchError} Column '{trainCol.Name}' exists in train data, but not in validation data.", nameof(validationData));
        }

        if (trainCol.Type != validCol.Value.Type)
        {
            throw new ArgumentException($"{schemaMismatchError} Column '{trainCol.Name}' is of type {trainCol.Type} in train data, and type " +
                $"{validCol.Value.Type} in validation data.", nameof(validationData));
        }
    }
}
public override ExperimentResult<BinaryClassificationMetrics> Execute(IDataView trainData, ColumnInformation columnInformation,
    IEstimator<ITransformer> preFeaturizer = null, IProgress<RunDetail<BinaryClassificationMetrics>> progressHandler = null)
{
    var label = columnInformation.LabelColumnName;
    _experiment.SetEvaluateMetric(Settings.OptimizingMetric, label);
    _experiment.SetTrainingTimeInSeconds(Settings.MaxExperimentTimeInSeconds);

    // Cross val threshold for # of dataset rows --
    // If dataset has < threshold # of rows, use cross val.
    // Else, run experiment using train-validate split.
    const int crossValRowCountThreshold = 15000;
    var rowCount = DatasetDimensionsUtil.CountRows(trainData, crossValRowCountThreshold);

    // TODO: split cross validation result according to sample key as well.
    if (rowCount < crossValRowCountThreshold)
    {
        const int numCrossValFolds = 10;
        _experiment.SetDataset(trainData, numCrossValFolds);
    }
    else
    {
        var splitData = Context.Data.TrainTestSplit(trainData);
        _experiment.SetDataset(splitData.TrainSet, splitData.TestSet);
    }

    MultiModelPipeline pipeline = new MultiModelPipeline();
    if (preFeaturizer != null)
    {
        pipeline = pipeline.Append(preFeaturizer);
    }

    pipeline = pipeline.Append(Context.Auto().Featurizer(trainData, columnInformation, Features))
                       .Append(Context.Auto().BinaryClassification(label, Features));
    _experiment.SetPipeline(pipeline);

    var monitor = new BinaryClassificationTrialResultMonitor();
    monitor.OnTrialCompleted += (o, e) =>
    {
        var detail = ToRunDetail(e);
        progressHandler?.Report(detail);
    };
    _experiment.SetMonitor(monitor);
    _experiment.Run();

    var runDetails = monitor.RunDetails.Select(e => ToRunDetail(e));
    var bestRun = ToRunDetail(monitor.BestRun);
    var result = new ExperimentResult<BinaryClassificationMetrics>(runDetails, bestRun);
    return result;
}
private static void ValidateValidationData(IDataView trainData, IDataView validationData)
{
    if (validationData == null)
    {
        return;
    }

    if (DatasetDimensionsUtil.IsDataViewEmpty(validationData))
    {
        throw new ArgumentException("Validation data has 0 rows", nameof(validationData));
    }

    const string schemaMismatchError = "Training data and validation data schemas do not match.";

    if (trainData.Schema.Count(c => !c.IsHidden) != validationData.Schema.Count(c => !c.IsHidden))
    {
        throw new ArgumentException($"{schemaMismatchError} Train data has '{trainData.Schema.Count}' columns, " +
            $"and validation data has '{validationData.Schema.Count}' columns.", nameof(validationData));
    }

    // Validate that every active column in the train data corresponds to an active column in the validation data.
    // (Indirectly, since we asserted above that the train and validation data have the same number of active columns,
    // this also ensures the reverse -- that every active column in the validation data corresponds to an active
    // column in the train data.)
    foreach (var trainCol in trainData.Schema)
    {
        if (trainCol.IsHidden)
        {
            continue;
        }

        var validCol = validationData.Schema.GetColumnOrNull(trainCol.Name);
        if (validCol == null)
        {
            throw new ArgumentException($"{schemaMismatchError} Column '{trainCol.Name}' exists in train data, but not in validation data.", nameof(validationData));
        }

        if (trainCol.Type != validCol.Value.Type)
        {
            throw new ArgumentException($"{schemaMismatchError} Column '{trainCol.Name}' is of type {trainCol.Type} in train data, and type " +
                $"{validCol.Value.Type} in validation data.", nameof(validationData));
        }
    }
}
/// <summary>
/// Executes an AutoML experiment.
/// </summary>
/// <param name="trainData">The training data to be used by the AutoML experiment.</param>
/// <param name="columnInformation">Column information for the dataset.</param>
/// <param name="preFeaturizer">Pre-featurizer that AutoML will apply to the data during an
/// experiment. (The pre-featurizer will be fit only on the training data split to produce a
/// trained transform. Then, the trained transform will be applied to both the training
/// data split and corresponding validation data split.)</param>
/// <param name="progressHandler">A user-defined object that implements
/// the <see cref="IProgress{T}"/> interface. AutoML will invoke the method
/// <see cref="IProgress{T}.Report(T)"/> after each model it produces during the
/// course of the experiment.
/// </param>
/// <returns>The experiment result.</returns>
/// <remarks>
/// Depending on the size of your data, the AutoML experiment could take a long time to execute.
/// </remarks>
public ExperimentResult<TMetrics> Execute(IDataView trainData, ColumnInformation columnInformation,
    IEstimator<ITransformer> preFeaturizer = null, IProgress<RunDetail<TMetrics>> progressHandler = null)
{
    // Cross val threshold for # of dataset rows --
    // If dataset has < threshold # of rows, use cross val.
    // Else, run experiment using train-validate split.
    const int crossValRowCountThreshold = 15000;
    var rowCount = DatasetDimensionsUtil.CountRows(trainData, crossValRowCountThreshold);

    if (rowCount < crossValRowCountThreshold)
    {
        const int numCrossValFolds = 10;
        var splitResult = SplitUtil.CrossValSplit(Context, trainData, numCrossValFolds, columnInformation?.SamplingKeyColumnName);
        return ExecuteCrossValSummary(splitResult.trainDatasets, columnInformation, splitResult.validationDatasets, preFeaturizer, progressHandler);
    }
    else
    {
        var splitResult = SplitUtil.TrainValidateSplit(Context, trainData, columnInformation?.SamplingKeyColumnName);
        return ExecuteTrainValidate(splitResult.trainData, columnInformation, splitResult.validationData, preFeaturizer, progressHandler);
    }
}
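
// Hedged usage sketch, not part of the original class: shows how a caller might invoke the Execute
// overload above through the public AutoML API. Assumes "trainData" has already been loaded as an
// IDataView and that the label column is named "Label"; both names are illustrative.
private static void ExecuteUsageSketch(IDataView trainData)
{
    var mlContext = new MLContext();

    // Give the experiment a 60-second budget and tell AutoML which column is the label.
    var experiment = mlContext.Auto().CreateBinaryClassificationExperiment(60);
    var columnInformation = new ColumnInformation { LabelColumnName = "Label" };

    // Execute trains candidate pipelines and returns every run plus the best one.
    ExperimentResult<BinaryClassificationMetrics> result = experiment.Execute(trainData, columnInformation);
    Console.WriteLine($"Best run accuracy: {result.BestRun.ValidationMetrics.Accuracy}");
}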
public static ColumnDimensions[] CalcColumnDimensions(MLContext context, IDataView data, PurposeInference.Column[] purposes)
{
    data = context.Data.TakeRows(data, MaxRowsToRead);

    var colDimensions = new ColumnDimensions[data.Schema.Count];

    for (var i = 0; i < data.Schema.Count; i++)
    {
        var column = data.Schema[i];
        var purpose = purposes[i];

        // default column dimensions
        int? cardinality = null;
        bool? hasMissing = null;

        var itemType = column.Type.GetItemType();

        // If categorical text feature, calculate cardinality
        if (itemType.IsText() && purpose.Purpose == ColumnPurpose.CategoricalFeature)
        {
            cardinality = DatasetDimensionsUtil.GetTextColumnCardinality(data, column);
        }

        // If numeric feature, discover missing values
        if (itemType == NumberDataViewType.Single)
        {
            hasMissing = column.Type.IsVector() ?
                DatasetDimensionsUtil.HasMissingNumericVector(data, column) :
                DatasetDimensionsUtil.HasMissingNumericSingleValue(data, column);
        }

        colDimensions[i] = new ColumnDimensions(cardinality, hasMissing);
    }

    return colDimensions;
}