Beispiel #1
0
        /// <summary>
        /// Runs a binary classification experiment created with a budget argument of 15 and,
        /// when the last run ended with an exception while at least one other run succeeded,
        /// verifies that the failure is a cancellation and that the best model is still usable.
        /// </summary>
        public void AutoFitMaxExperimentTimeTest()
        {
            // A single binary classification experiment takes less than 5 seconds.
            // System.OperationCanceledException is thrown when ongoing experiment
            // is canceled and at least one model has been generated.
            // BinaryClassificationExperiment includes LightGBM, which is not 32-bit
            // compatible.
            var context         = new MLContext(1);
            var dataPath        = DatasetUtil.GetUciAdultDataset();
            var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel);
            var textLoader      = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
            var trainData       = textLoader.Load(dataPath);
            var experiment      = context.Auto()
                                  .CreateBinaryClassificationExperiment(15)
                                  .Execute(trainData, new ColumnInformation()
            {
                LabelColumnName = DatasetUtil.UciAdultLabel
            });

            // Materialize the run list once so it is not re-enumerated by every query below.
            var runs = experiment.RunDetails.ToList();

            // Ensure the (last) model that was training when maximum experiment time was reached has been stopped,
            // and that its MLContext has been canceled. Sometimes during CI unit testing, the host machines can run
            // slower than normal, which can increase the run time of unit tests, and may not produce multiple runs.
            // The check only applies when at least one run finished without an exception (so a best model exists)
            // and the last run failed. Note: the previous Select(...).Count() projected instead of filtering and
            // therefore counted every run, successful or not.
            if (runs.Any(r => r.Exception == null) && runs.Last().Exception != null)
            {
                Assert.True(runs.Last().Exception.Message.Contains("Operation was canceled"),
                            "Training process was not successfully canceled after maximum experiment time was reached.");

                // Ensure that the best found model can still run after maximum experiment time was reached.
                IDataView predictions = experiment.BestRun.Model.Transform(trainData);
                Assert.NotNull(predictions);
            }
        }
Beispiel #2
0
        /// <summary>
        /// Verifies that inferring columns with the label column name "Junk" throws an
        /// <see cref="ArgumentException"/>.
        /// </summary>
        public void IncorrectLabelColumnThrows()
        {
            var adultDatasetPath = DatasetUtil.GetUciAdultDataset();
            var mlContext        = new MLContext(1);

            // Wrapped in an explicit Action so the delegate type passed to Assert.Throws is unambiguous.
            System.Action inferWithJunkLabel =
                () => mlContext.Auto().InferColumns(adultDatasetPath, "Junk", groupColumns: false);

            Assert.Throws<ArgumentException>(inferWithJunkLabel);
        }
Beispiel #3
0
        /// <summary>
        /// Infers columns by passing 14 as the label column index with <c>hasHeader: true</c>,
        /// then checks that the column sourced from index 14 is named "hours-per-week"
        /// and is reported as the label column.
        /// </summary>
        public void IdentifyLabelColumnThroughIndexWithHeader()
        {
            const string expectedLabelName = "hours-per-week";

            var mlContext = new MLContext(1);
            var inference = mlContext.Auto().InferColumns(DatasetUtil.GetUciAdultDataset(), 14, hasHeader: true);
            var options   = inference.TextLoaderOptions;

            Assert.True(options.HasHeader);

            // Locate the column whose first source range covers exactly index 14.
            var labelColumn = options.Columns.First(column =>
            {
                var firstRange = column.Source[0];
                return firstRange.Min == 14 && firstRange.Max == 14;
            });

            Assert.Equal(expectedLabelName, labelColumn.Name);
            Assert.Equal(expectedLabelName, inference.ColumnInformation.LabelColumnName);
        }
Beispiel #4
0
        /// <summary>
        /// Runs a binary classification experiment created with a budget argument of 15 and,
        /// when the last run ended with an exception while at least one other run succeeded,
        /// verifies that the failure (or every failure inside an <see cref="AggregateException"/>)
        /// is a cancellation and that the best model is still usable.
        /// </summary>
        public void AutoFitMaxExperimentTimeTest()
        {
            // A single binary classification experiment takes less than 5 seconds.
            // System.OperationCanceledException is thrown when ongoing experiment
            // is canceled and at least one model has been generated.
            // BinaryClassificationExperiment includes LightGBM, which is not 32-bit
            // compatible.
            var context         = new MLContext(1);
            var dataPath        = DatasetUtil.GetUciAdultDataset();
            var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel);
            var textLoader      = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
            var trainData       = textLoader.Load(dataPath);
            var experiment      = context.Auto()
                                  .CreateBinaryClassificationExperiment(15)
                                  .Execute(trainData, new ColumnInformation()
            {
                LabelColumnName = DatasetUtil.UciAdultLabel
            });

            // Materialize the run list once so it is not re-enumerated by every query below.
            var runs = experiment.RunDetails.ToList();

            // Ensure the (last) model that was training when maximum experiment time was reached has been stopped,
            // and that its MLContext has been canceled. Sometimes during CI unit testing, the host machines can run
            // slower than normal, which can increase the run time of unit tests, and may not produce multiple runs.
            // The check only applies when at least one run finished without an exception (so a best model exists)
            // and the last run failed. Note: the previous Select(...).Count() projected instead of filtering and
            // therefore counted every run, successful or not.
            if (runs.Any(r => r.Exception == null) && runs.Last().Exception != null)
            {
                var expectedExceptionMessage = "Operation was canceled";
                var lastException            = runs.Last().Exception;

                // Sometimes multiple threads might throw the same "Operation was canceled"
                // exception and all of them are grouped inside an AggregateException.
                // In that case every flattened inner exception must be the expected one;
                // otherwise the message of the single exception is checked directly.
                var containsMessage = lastException is AggregateException lastAggregateException
                    ? lastAggregateException.Flatten().InnerExceptions.All(ex => ex.Message.Contains(expectedExceptionMessage))
                    : lastException.Message.Contains(expectedExceptionMessage);

                Assert.True(containsMessage,
                            $"Did not obtain '{expectedExceptionMessage}' error. " +
                            $"Obtained unexpected error of type {lastException.GetType()} with message: {lastException.Message}");

                // Ensure that the best found model can still run after maximum experiment time was reached.
                IDataView predictions = experiment.BestRun.Model.Transform(trainData);
                Assert.NotNull(predictions);
            }
        }
Beispiel #5
0
        /// <summary>
        /// Executes a binary classification experiment on the UCI Adult data, passing 10 as the
        /// second <c>Execute</c> argument (presumably the cross-validation fold count), and checks
        /// that the lowest accuracy among the best run's results exceeds 0.70.
        /// </summary>
        public void AutoFit_UCI_Adult_CrossValidation_10_Test()
        {
            var mlContext = new MLContext(1);
            var adultPath = DatasetUtil.GetUciAdultDataset();
            var inferred  = mlContext.Auto().InferColumns(adultPath, DatasetUtil.UciAdultLabel);
            var data      = mlContext.Data.CreateTextLoader(inferred.TextLoaderOptions).Load(adultPath);

            var binaryExperiment = mlContext.Auto().CreateBinaryClassificationExperiment(1);
            var outcome          = binaryExperiment.Execute(data, 10, DatasetUtil.UciAdultLabel);

            // Take the minimum accuracy over all results of the best run.
            var lowestAccuracy = outcome.BestRun.Results.Min(r => r.ValidationMetrics.Accuracy);

            Assert.True(lowestAccuracy > 0.70);
            Assert.NotNull(outcome.BestRun.Estimator);
            Assert.NotNull(outcome.BestRun.TrainerName);
        }
Beispiel #6
0
        /// <summary>
        /// Checks that column inference with <c>groupColumns: false</c> yields only columns backed by a
        /// single source range whose Min equals its Max, and that it yields more columns than inference
        /// with <c>groupColumns: true</c>.
        /// </summary>
        public void UnGroupReturnsMoreColumnsThanGroup()
        {
            var adultPath = DatasetUtil.GetUciAdultDataset();
            var mlContext = new MLContext(1);

            var ungrouped = mlContext.Auto().InferColumns(adultPath, DatasetUtil.UciAdultLabel, groupColumns: false);
            foreach (var column in ungrouped.TextLoaderOptions.Columns)
            {
                // A column is "grouped" if it has several source ranges or its first range spans more than one index.
                var isGrouped = column.Source.Length > 1 || column.Source[0].Min != column.Source[0].Max;
                Assert.False(isGrouped);
            }

            var grouped = mlContext.Auto().InferColumns(adultPath, DatasetUtil.UciAdultLabel, groupColumns: true);

            var groupedColumnCount   = grouped.TextLoaderOptions.Columns.Count();
            var ungroupedColumnCount = ungrouped.TextLoaderOptions.Columns.Count();
            Assert.True(groupedColumnCount < ungroupedColumnCount);
        }
Beispiel #7
0
        /// <summary>
        /// Splits the UCI Adult data with <c>TrainTestSplit</c>, executes a binary classification
        /// experiment on the train and test sets, and checks that the best run has a validation
        /// accuracy above 0.70 together with a non-null estimator, model and trainer name.
        /// </summary>
        public void AutoFit_UCI_Adult_Train_Test_Split_Test()
        {
            var mlContext = new MLContext(1);
            var adultPath = DatasetUtil.GetUciAdultDataset();
            var inferred  = mlContext.Auto().InferColumns(adultPath, DatasetUtil.UciAdultLabel);
            var allData   = mlContext.Data.CreateTextLoader(inferred.TextLoaderOptions).Load(adultPath);
            var split     = mlContext.Data.TrainTestSplit(allData);

            var binaryExperiment = mlContext.Auto().CreateBinaryClassificationExperiment(1);
            var outcome          = binaryExperiment.Execute(split.TrainSet, split.TestSet, DatasetUtil.UciAdultLabel);

            Assert.True(outcome.BestRun.ValidationMetrics.Accuracy > 0.70);
            Assert.NotNull(outcome.BestRun.Estimator);
            Assert.NotNull(outcome.BestRun.Model);
            Assert.NotNull(outcome.BestRun.TrainerName);
        }
 /// <summary>
 /// Passes the UCI Adult dataset path to <c>UserInputValidationUtil.ValidateInferColumnsArgs</c>.
 /// There are no assertions; the method completes normally only if that call does not throw.
 /// </summary>
 public void ValidateInferColsPath()
 {
     var adultDatasetPath = DatasetUtil.GetUciAdultDataset();
     UserInputValidationUtil.ValidateInferColumnsArgs(adultDatasetPath);
 }
Beispiel #9
0
        /// <summary>
        /// Trains one model on the full data, one on the train part of a train/test split and one on the
        /// train part of the first cross-validation split, then checks that each model produces a
        /// 30-column schema with identical column names on all three of those data views.
        /// </summary>
        public void AutoFitWithPresplittedData()
        {
            // Models created in AutoML should work over the same data,
            // regardless of how that data was split before being passed to the experiment
            // or to the model for prediction.
            const int expectedColumnCount = 30;

            var mlContext      = new MLContext(1);
            var adultPath      = DatasetUtil.GetUciAdultDataset();
            var inferred       = mlContext.Auto().InferColumns(adultPath, DatasetUtil.UciAdultLabel);
            var loader         = mlContext.Data.CreateTextLoader(inferred.TextLoaderOptions);
            var fullData       = loader.Load(adultPath);
            var trainTestSplit = mlContext.Data.TrainTestSplit(fullData);
            var cvSplits       = mlContext.Data.CrossValidationSplit(fullData, numberOfFolds: 2);

            // The three data views used both for training and for prediction, in a fixed order:
            // full data, train/test training part, first cross-validation training part.
            var trainTestSet = trainTestSplit.TrainSet;
            var firstCvSet   = cvSplits.First().TrainSet;
            var trainingSets = new[] { fullData, trainTestSet, firstCvSet };

            // Train one model per data view; ToArray forces the experiments to run here, in order.
            var models = trainingSets
                         .Select(trainingSet => mlContext.Auto()
                                                .CreateBinaryClassificationExperiment(0)
                                                .Execute(trainingSet,
                                                         new ColumnInformation()
                                                         {
                                                             LabelColumnName = DatasetUtil.UciAdultLabel
                                                         })
                                                .BestRun
                                                .Model)
                         .ToArray();

            foreach (var model in models)
            {
                var fullSchema      = model.Transform(fullData).Schema;
                var trainTestSchema = model.Transform(trainTestSet).Schema;
                var cvSchema        = model.Transform(firstCvSet).Schema;

                Assert.Equal(expectedColumnCount, fullSchema.Count);
                Assert.Equal(expectedColumnCount, trainTestSchema.Count);
                Assert.Equal(expectedColumnCount, cvSchema.Count);

                // Column names must line up position by position across the three outputs.
                foreach (var column in fullSchema)
                {
                    Assert.Equal(column.Name, trainTestSchema[column.Index].Name);
                    Assert.Equal(column.Name, cvSchema[column.Index].Name);
                }
            }
        }
Beispiel #10
0
 /// <summary>
 /// Verifies that inferring columns with a label column index of 100 throws an
 /// <see cref="ArgumentOutOfRangeException"/>.
 /// </summary>
 public void LabelIndexOutOfBoundsThrows()
 {
     // Everything stays inside the lambda so that any part of the call chain may raise the expected exception.
     Assert.Throws<ArgumentOutOfRangeException>(
         () => new MLContext(1)
               .Auto()
               .InferColumns(DatasetUtil.GetUciAdultDataset(), 100));
 }