Example #1
0
        /// <summary>
        /// Splits <paramref name="trainData"/> into <paramref name="numFolds"/> cross-validation folds,
        /// skipping any fold whose train or test partition is empty and restoring the original column set
        /// on each surviving partition.
        /// </summary>
        /// <param name="context">The <see cref="MLContext"/> used to perform the split.</param>
        /// <param name="trainData">The data to split.</param>
        /// <param name="numFolds">The number of cross-validation folds to create.</param>
        /// <param name="samplingKeyColumn">Optional sampling key column; rows sharing a key stay in the same fold.</param>
        /// <returns>Parallel arrays of train and validation datasets, one entry per non-empty fold.</returns>
        /// <exception cref="InvalidOperationException">Thrown when every fold has an empty train or test partition.</exception>
        public static (IDataView[] trainDatasets, IDataView[] validationDatasets) CrossValSplit(MLContext context,
                                                                                                IDataView trainData, uint numFolds, string samplingKeyColumn)
        {
            // Materialize the column names once up front; the deferred LINQ query would otherwise be
            // re-enumerated by every DropAllColumnsExcept call in the loop below.
            var originalColumnNames = trainData.Schema.Select(c => c.Name).ToArray();
            var splits             = context.Data.CrossValidationSplit(trainData, (int)numFolds, samplingKeyColumnName: samplingKeyColumn);
            var trainDatasets      = new List <IDataView>();
            var validationDatasets = new List <IDataView>();

            foreach (var split in splits)
            {
                // Skip degenerate folds: both partitions must contain rows to be usable.
                if (DatasetDimensionsUtil.IsDataViewEmpty(split.TrainSet) ||
                    DatasetDimensionsUtil.IsDataViewEmpty(split.TestSet))
                {
                    continue;
                }

                // Drop any columns the split transform added, restoring the caller's original schema.
                var trainDataset      = DropAllColumnsExcept(context, split.TrainSet, originalColumnNames);
                var validationDataset = DropAllColumnsExcept(context, split.TestSet, originalColumnNames);

                trainDatasets.Add(trainDataset);
                validationDatasets.Add(validationDataset);
            }

            if (!trainDatasets.Any())
            {
                throw new InvalidOperationException("All cross validation folds have empty train or test data. " +
                                                    "Try increasing the number of rows provided in training data, or lowering specified number of " +
                                                    "cross validation folds.");
            }

            return(trainDatasets.ToArray(), validationDatasets.ToArray());
        }
        /// <summary>
        /// Validates that the training data is non-null, non-empty, and that every feature column
        /// has a type AutoML can consume (Boolean, Single, or Text).
        /// </summary>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="trainData"/> is null.</exception>
        /// <exception cref="ArgumentException">Thrown when the data is empty or a column type is unsupported.</exception>
        private static void ValidateTrainData(IDataView trainData, ColumnInformation columnInformation)
        {
            if (trainData == null)
            {
                throw new ArgumentNullException(nameof(trainData), "Training data cannot be null");
            }

            if (DatasetDimensionsUtil.IsDataViewEmpty(trainData))
            {
                throw new ArgumentException("Training data has 0 rows", nameof(trainData));
            }

            foreach (var col in trainData.Schema)
            {
                var itemType = col.Type.GetItemType();

                // A pre-built Features column must already be numeric (Single).
                if (col.Name == DefaultColumnNames.Features && itemType != NumberDataViewType.Single)
                {
                    throw new ArgumentException($"{DefaultColumnNames.Features} column must be of data type {NumberDataViewType.Single}", nameof(trainData));
                }

                // Columns playing a special role (label / user / item / group) are exempt from
                // the feature-type restriction below.
                var isReservedColumn = col.Name == columnInformation.LabelColumnName ||
                                       col.Name == columnInformation.UserIdColumnName ||
                                       col.Name == columnInformation.ItemIdColumnName ||
                                       col.Name == columnInformation.GroupIdColumnName;

                var isSupportedFeatureType = itemType == BooleanDataViewType.Instance ||
                                             itemType == NumberDataViewType.Single ||
                                             itemType == TextDataViewType.Instance;

                if (!isReservedColumn && !isSupportedFeatureType)
                {
                    throw new ArgumentException($"Only supported feature column types are " +
                                                $"{BooleanDataViewType.Instance}, {NumberDataViewType.Single}, and {TextDataViewType.Instance}. " +
                                                $"Please change the feature column {col.Name} of type {col.Type} to one of " +
                                                $"the supported types.", nameof(trainData));
                }
            }
        }
        /// <summary>
        /// Validates that <paramref name="validationData"/> is usable alongside <paramref name="trainData"/>:
        /// it must be non-empty and its columns must match the training schema by name and type.
        /// A null <paramref name="validationData"/> is allowed (validation is simply skipped).
        /// </summary>
        /// <exception cref="ArgumentException">Thrown when the validation data is empty or its schema
        /// does not match the training data schema.</exception>
        private static void ValidateValidationData(IDataView trainData, IDataView validationData)
        {
            if (validationData == null)
            {
                return;
            }

            if (DatasetDimensionsUtil.IsDataViewEmpty(validationData))
            {
                throw new ArgumentException("Validation data has 0 rows", nameof(validationData));
            }

            const string schemaMismatchError = "Training data and validation data schemas do not match.";

            if (trainData.Schema.Count != validationData.Schema.Count)
            {
                throw new ArgumentException($"{schemaMismatchError} Train data has '{trainData.Schema.Count}' columns," +
                                            $"and validation data has '{validationData.Schema.Count}' columns.", nameof(validationData));
            }

            // Every train column must exist in the validation data with an identical type.
            // (Since the column counts already match, this also implies the reverse.)
            foreach (var trainCol in trainData.Schema)
            {
                var validCol = validationData.Schema.GetColumnOrNull(trainCol.Name);
                if (validCol == null)
                {
                    // Fixed typo in user-facing message: "exsits" -> "exists".
                    throw new ArgumentException($"{schemaMismatchError} Column '{trainCol.Name}' exists in train data, but not in validation data.", nameof(validationData));
                }

                if (trainCol.Type != validCol.Value.Type)
                {
                    throw new ArgumentException($"{schemaMismatchError} Column '{trainCol.Name}' is of type {trainCol.Type} in train data, and type " +
                                                $"{validCol.Value.Type} in validation data.", nameof(validationData));
                }
            }
        }
        /// <summary>
        /// Runs a binary-classification AutoML experiment over <paramref name="trainData"/>,
        /// reporting each completed trial through <paramref name="progressHandler"/> and
        /// returning the full run history plus the best run.
        /// </summary>
        public override ExperimentResult <BinaryClassificationMetrics> Execute(IDataView trainData, ColumnInformation columnInformation, IEstimator <ITransformer> preFeaturizer = null, IProgress <RunDetail <BinaryClassificationMetrics> > progressHandler = null)
        {
            var labelColumn = columnInformation.LabelColumnName;

            _experiment.SetEvaluateMetric(Settings.OptimizingMetric, labelColumn);
            _experiment.SetTrainingTimeInSeconds(Settings.MaxExperimentTimeInSeconds);

            // Datasets below this row count are evaluated with cross validation;
            // anything larger uses a single train/validate split.
            const int crossValRowCountThreshold = 15000;
            var datasetRowCount = DatasetDimensionsUtil.CountRows(trainData, crossValRowCountThreshold);

            // TODO: split the cross validation result according to the sampling key as well.
            if (datasetRowCount < crossValRowCountThreshold)
            {
                const int numCrossValFolds = 10;
                _experiment.SetDataset(trainData, numCrossValFolds);
            }
            else
            {
                var splitData = Context.Data.TrainTestSplit(trainData);
                _experiment.SetDataset(splitData.TrainSet, splitData.TestSet);
            }

            // Build the search pipeline: optional pre-featurizer, then AutoML featurization,
            // then the binary-classification sweepable stage.
            var pipeline = new MultiModelPipeline();
            if (preFeaturizer != null)
            {
                pipeline = pipeline.Append(preFeaturizer);
            }

            pipeline = pipeline
                       .Append(Context.Auto().Featurizer(trainData, columnInformation, Features))
                       .Append(Context.Auto().BinaryClassification(labelColumn, Features));
            _experiment.SetPipeline(pipeline);

            // Forward each completed trial to the caller's progress handler, if any.
            var monitor = new BinaryClassificationTrialResultMonitor();
            monitor.OnTrialCompleted += (sender, completedTrial) =>
            {
                progressHandler?.Report(ToRunDetail(completedTrial));
            };

            _experiment.SetMonitor(monitor);
            _experiment.Run();

            var allRuns = monitor.RunDetails.Select(r => ToRunDetail(r));
            var bestRun = ToRunDetail(monitor.BestRun);
            return new ExperimentResult <BinaryClassificationMetrics>(allRuns, bestRun);
        }
        /// <summary>
        /// Validates that <paramref name="validationData"/> is usable alongside <paramref name="trainData"/>:
        /// it must be non-empty, and its non-hidden columns must match the train schema by name and type.
        /// A null <paramref name="validationData"/> is allowed (validation is simply skipped).
        /// </summary>
        /// <exception cref="ArgumentException">Thrown when the validation data is empty or its schema
        /// does not match the training data schema.</exception>
        private static void ValidateValidationData(IDataView trainData, IDataView validationData)
        {
            if (validationData == null)
            {
                return;
            }

            if (DatasetDimensionsUtil.IsDataViewEmpty(validationData))
            {
                throw new ArgumentException("Validation data has 0 rows", nameof(validationData));
            }

            const string schemaMismatchError = "Training data and validation data schemas do not match.";

            // Only non-hidden columns participate in validation; hidden columns are artifacts of
            // earlier transforms. Compute the counts once so the error message below reports the
            // same numbers that were actually compared (the original reported Schema.Count, which
            // includes hidden columns and could contradict this check).
            var trainColumnCount      = trainData.Schema.Count(c => !c.IsHidden);
            var validationColumnCount = validationData.Schema.Count(c => !c.IsHidden);

            if (trainColumnCount != validationColumnCount)
            {
                throw new ArgumentException($"{schemaMismatchError} Train data has '{trainColumnCount}' columns," +
                                            $"and validation data has '{validationColumnCount}' columns.", nameof(validationData));
            }

            // Validate that every active column in the train data corresponds to an active column in the validation data.
            // (Indirectly, since we asserted above that the train and validation data have the same number of active columns, this also
            // ensures the reverse -- that every active column in the validation data corresponds to an active column in the train data.)
            foreach (var trainCol in trainData.Schema)
            {
                if (trainCol.IsHidden)
                {
                    continue;
                }

                var validCol = validationData.Schema.GetColumnOrNull(trainCol.Name);
                if (validCol == null)
                {
                    throw new ArgumentException($"{schemaMismatchError} Column '{trainCol.Name}' exists in train data, but not in validation data.", nameof(validationData));
                }

                if (trainCol.Type != validCol.Value.Type)
                {
                    throw new ArgumentException($"{schemaMismatchError} Column '{trainCol.Name}' is of type {trainCol.Type} in train data, and type " +
                                                $"{validCol.Value.Type} in validation data.", nameof(validationData));
                }
            }
        }
Example #6
0
        /// <summary>
        /// Executes an AutoML experiment over the supplied training data.
        /// </summary>
        /// <param name="trainData">The training data to be used by the AutoML experiment.</param>
        /// <param name="columnInformation">Column information for the dataset.</param>
        /// <param name="preFeaturizer">Pre-featurizer that AutoML will apply to the data during an
        /// experiment. It is fit only on the training split, and the resulting trained transform is
        /// then applied to both the training split and the matching validation split.</param>
        /// <param name="progressHandler">A user-defined object implementing <see cref="IProgress{T}"/>.
        /// AutoML invokes <see cref="IProgress{T}.Report(T)"/> once for each model produced during
        /// the experiment.</param>
        /// <returns>The experiment result.</returns>
        /// <remarks>
        /// Depending on the size of your data, the AutoML experiment could take a long time to execute.
        /// </remarks>
        public ExperimentResult <TMetrics> Execute(IDataView trainData, ColumnInformation columnInformation,
                                                   IEstimator <ITransformer> preFeaturizer = null, IProgress <RunDetail <TMetrics> > progressHandler = null)
        {
            // Datasets below this row count are evaluated with cross validation;
            // anything at or above it uses a single train/validate split.
            const int crossValRowCountThreshold = 15000;

            var rowCount = DatasetDimensionsUtil.CountRows(trainData, crossValRowCountThreshold);

            if (rowCount >= crossValRowCountThreshold)
            {
                var split = SplitUtil.TrainValidateSplit(Context, trainData, columnInformation?.SamplingKeyColumnName);
                return ExecuteTrainValidate(split.trainData, columnInformation, split.validationData, preFeaturizer, progressHandler);
            }

            const int numCrossValFolds = 10;
            var folds = SplitUtil.CrossValSplit(Context, trainData, numCrossValFolds, columnInformation?.SamplingKeyColumnName);
            return ExecuteCrossValSummary(folds.trainDatasets, columnInformation, folds.validationDatasets, preFeaturizer, progressHandler);
        }
        /// <summary>
        /// Computes per-column dimension statistics over a capped sample of rows:
        /// cardinality for categorical text columns, and missing-value presence for
        /// single-precision numeric columns. Other columns get null for both values.
        /// </summary>
        public static ColumnDimensions[] CalcColumnDimensions(MLContext context, IDataView data, PurposeInference.Column[] purposes)
        {
            // Cap the number of rows scanned so dimension inference stays cheap on large datasets.
            data = context.Data.TakeRows(data, MaxRowsToRead);

            var result = new ColumnDimensions[data.Schema.Count];

            for (var colIndex = 0; colIndex < data.Schema.Count; colIndex++)
            {
                var column   = data.Schema[colIndex];
                var itemType = column.Type.GetItemType();

                // Defaults: unknown cardinality, unknown missing-value status.
                int?  cardinality = null;
                bool? hasMissing  = null;

                // Categorical text columns: count distinct values.
                if (purposes[colIndex].Purpose == ColumnPurpose.CategoricalFeature && itemType.IsText())
                {
                    cardinality = DatasetDimensionsUtil.GetTextColumnCardinality(data, column);
                }

                // Numeric (Single) columns: probe for missing values, vector or scalar as appropriate.
                if (itemType == NumberDataViewType.Single)
                {
                    if (column.Type.IsVector())
                    {
                        hasMissing = DatasetDimensionsUtil.HasMissingNumericVector(data, column);
                    }
                    else
                    {
                        hasMissing = DatasetDimensionsUtil.HasMissingNumericSingleValue(data, column);
                    }
                }

                result[colIndex] = new ColumnDimensions(cardinality, hasMissing);
            }

            return result;
        }