예제 #1
0
        /// <summary>
        /// Fits <paramref name="estimator"/> once per cross-validation fold, one fold after another,
        /// and applies each fitted model to that fold's test set.
        /// </summary>
        /// <param name="data">The data to split into folds.</param>
        /// <param name="estimator">The estimator that is fitted on each fold's training set.</param>
        /// <param name="numFolds">Number of folds; must be greater than 1.</param>
        /// <param name="samplingKeyColumn">Column name handed to the split helpers; may be null.</param>
        /// <param name="seed">Optional seed forwarded to <c>EnsureGroupPreservationColumn</c>.</param>
        /// <returns>
        /// An array of length <paramref name="numFolds"/>; each filled entry carries the fitted model,
        /// the transformed test set and the zero-based fold index.
        /// </returns>
        private protected CrossValidationResult[] CrossValidateTrain(
            IDataView data,
            IEstimator<ITransformer> estimator,
            int numFolds,
            string samplingKeyColumn,
            int? seed = null)
        {
            // Argument validation.
            Environment.CheckValue(data, nameof(data));
            Environment.CheckValue(estimator, nameof(estimator));
            Environment.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1");
            Environment.CheckValueOrNull(samplingKeyColumn);

            // Both 'data' and 'samplingKeyColumn' are passed by ref, so the helper may replace them
            // before the split below is computed.
            DataOperationsCatalog.EnsureGroupPreservationColumn(Environment, ref data, ref samplingKeyColumn, seed);

            var perFold = new CrossValidationResult[numFolds];
            var splits = DataOperationsCatalog.CrossValidationSplit(Environment, data, numFolds, samplingKeyColumn);

            // Folds are trained sequentially.
            // REVIEW: a parallel implementation would be possible here, but it would need
            // a separate host spawned for every fold.
            int foldIndex = 0;
            foreach (var trainTest in splits)
            {
                var fitted = estimator.Fit(trainTest.TrainSet);
                var scored = fitted.Transform(trainTest.TestSet);

                perFold[foldIndex] = new CrossValidationResult(fitted, scored, foldIndex);
                foldIndex++;
            }

            return perFold;
        }
예제 #2
0
        /// <summary>
        /// Fits <paramref name="estimator"/> once per cross-validation fold, one fold after another.
        /// For fold <c>i</c> the bounds <c>i / numFolds</c> and <c>(i + 1) / numFolds</c> on
        /// <paramref name="samplingKeyColumn"/> are given to two <c>RangeFilter</c> instances: one with
        /// <c>Complement = false</c> (used as the test set) and one with <c>Complement = true</c>
        /// (used as the training set).
        /// </summary>
        /// <param name="data">The data to split into folds.</param>
        /// <param name="estimator">The estimator that is fitted on each fold's training filter.</param>
        /// <param name="numFolds">Number of folds; must be greater than 1.</param>
        /// <param name="samplingKeyColumn">Column name used by the range filters; may be null on entry.</param>
        /// <param name="seed">Optional seed forwarded to <c>EnsureGroupPreservationColumn</c>.</param>
        /// <returns>
        /// An array of length <paramref name="numFolds"/>; entry <c>i</c> carries the fitted model,
        /// the transformed test set and the fold index <c>i</c>.
        /// </returns>
        private protected CrossValidationResult[] CrossValidateTrain(
            IDataView data,
            IEstimator<ITransformer> estimator,
            int numFolds,
            string samplingKeyColumn,
            int? seed = null)
        {
            // Argument validation.
            Environment.CheckValue(data, nameof(data));
            Environment.CheckValue(estimator, nameof(estimator));
            Environment.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1");
            Environment.CheckValueOrNull(samplingKeyColumn);

            // Both 'data' and 'samplingKeyColumn' are passed by ref, so the helper may replace them
            // before any filter is built.
            DataOperationsCatalog.EnsureGroupPreservationColumn(Environment, ref data, ref samplingKeyColumn, seed);

            // Folds are trained sequentially.
            // REVIEW: a parallel implementation would be possible here, but it would need
            // a separate host spawned for every fold.
            var perFold = new CrossValidationResult[numFolds];
            int current = 0;
            while (current < numFolds)
            {
                perFold[current] = RunFold(current);
                current++;
            }

            return perFold;

            // Builds the train/test filters for a single fold, fits the estimator and scores the test part.
            CrossValidationResult RunFold(int foldIndex)
            {
                // Bounds of this fold's slice of the sampling key; shared by both filters.
                double lower = (double)foldIndex / numFolds;
                double upper = (double)(foldIndex + 1) / numFolds;

                // Training filter: same bounds with Complement = true.
                var trainOptions = new RangeFilter.Options
                {
                    Column = samplingKeyColumn,
                    Min = lower,
                    Max = upper,
                    Complement = true
                };
                var trainView = new RangeFilter(Environment, trainOptions, data);

                // Test filter: same bounds with Complement = false.
                var testOptions = new RangeFilter.Options
                {
                    Column = samplingKeyColumn,
                    Min = lower,
                    Max = upper,
                    Complement = false
                };
                var testView = new RangeFilter(Environment, testOptions, data);

                var fitted = estimator.Fit(trainView);
                var scored = fitted.Transform(testView);
                return new CrossValidationResult(fitted, scored, foldIndex);
            }
        }