예제 #1
0
        /// <summary>
        /// Verifies that column inference over the UCI Adult dataset throws an
        /// <see cref="ArgumentException"/> when asked for the label column "Junk".
        /// </summary>
        public void IncorrectLabelColumnThrows()
        {
            var adultDataPath = DatasetUtil.GetUciAdultDataset();
            var mlContext = new MLContext(1);

            // An explicitly typed Action keeps the call bound to the Action-based Assert.Throws overload.
            System.Action inferWithUnknownLabel =
                () => mlContext.Auto().InferColumns(adultDataPath, "Junk", groupColumns: false);

            Assert.Throws<ArgumentException>(inferWithUnknownLabel);
        }
예제 #2
0
        /// <summary>
        /// Runs a multiclass AutoML experiment on the flowers dataset using an explicit
        /// train/test split, then checks the best run's micro-accuracy and the type of the
        /// predicted-label column produced by the best model.
        /// </summary>
        public void AutoFitImageClassificationTrainTest()
        {
            var mlContext = new MLContext(seed: 1);
            var flowersPath = DatasetUtil.GetFlowersDataset();
            var inferred = mlContext.Auto().InferColumns(flowersPath, "Label");
            var loader = mlContext.Data.CreateTextLoader(inferred.TextLoaderOptions);
            var shuffledRows = mlContext.Data.ShuffleRows(loader.Load(flowersPath), seed: 1);
            var columnsToKeep = shuffledRows.Schema.Select(column => column.Name);

            // Split with a 20% test fraction, then keep only the columns that existed before the split.
            TrainTestData split = mlContext.Data.TrainTestSplit(shuffledRows, testFraction: 0.2, seed: 1);
            IDataView trainSet = SplitUtil.DropAllColumnsExcept(mlContext, split.TrainSet, columnsToKeep);
            IDataView testSet = SplitUtil.DropAllColumnsExcept(mlContext, split.TestSet, columnsToKeep);

            var experiment = mlContext.Auto().CreateMulticlassClassificationExperiment(0);
            var result = experiment.Execute(trainSet, testSet, inferred.ColumnInformation);

            // Known issue: outside Windows and macOS (e.g. on Ubuntu) the accuracy is degraded.
            bool isWindowsOrMac = RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
                                  || RuntimeInformation.IsOSPlatform(OSPlatform.OSX);
            double expectedMicroAccuracy = isWindowsOrMac ? 1 : 0.778;
            Assert.Equal(expectedMicroAccuracy, result.BestRun.ValidationMetrics.MicroAccuracy, 3);

            // The best model must still be able to score the full (shuffled) data.
            var scored = result.BestRun.Model.Transform(shuffledRows);
            var predictedLabelType = scored.Schema[DefaultColumnNames.PredictedLabel].Type;

            Assert.Equal(TextDataViewType.Instance, predictedLabelType);
        }
        /// <summary>
        /// Requests a pipeline suggestion for binary classification on the UCI Adult data,
        /// round-trips it through JSON, and checks that the deserialized pipeline can still be
        /// turned into an estimator, fitted, applied and evaluated.
        /// </summary>
        public void GetNextPipeline()
        {
            var mlContext = new MLContext(1);
            var adultData = DatasetUtil.GetUciAdultDataView();
            var labelInfo = new ColumnInformation() { LabelColumnName = DatasetUtil.UciAdultLabel };
            var columnInfos = DatasetColumnInfoUtil.GetDatasetColumnInfo(mlContext, adultData, labelInfo);

            // Ask the suggester for a pipeline, starting from an empty run history.
            var emptyHistory = new List<PipelineScore>();
            var suggested = PipelineSuggester.GetNextPipeline(mlContext, emptyHistory, columnInfos, TaskKind.BinaryClassification);

            // Round-trip the suggestion through JSON (the serialized form is echoed to the console).
            var json = JsonConvert.SerializeObject(suggested);
            Console.WriteLine(json);
            var roundTripped = JsonConvert.DeserializeObject<Pipeline>(json);

            // Fit and evaluate the deserialized pipeline on the same data.
            var model = roundTripped.ToEstimator(mlContext).Fit(adultData);
            var scored = model.Transform(adultData);
            var accuracy = mlContext.BinaryClassification.EvaluateNonCalibrated(scored).Accuracy;
            var pipelineScore = new PipelineScore(roundTripped, accuracy, true);

            Assert.NotNull(pipelineScore);
        }
예제 #4
0
        /// <summary>
        /// Runs a time-limited binary classification experiment on the UCI Adult dataset and, when
        /// more than one run succeeded and the last run ended with an exception, verifies that the
        /// exception reports a cancellation and that the best model can still transform data.
        /// </summary>
        public void AutoFitMaxExperimentTimeTest()
        {
            // A single binary classification experiment takes less than 5 seconds.
            // System.OperationCanceledException is thrown when ongoing experiment
            // is canceled and at least one model has been generated.
            // BinaryClassificationExperiment includes LightGBM, which is not 32-bit
            // compatible.
            var context = new MLContext(1);
            var dataPath = DatasetUtil.GetUciAdultDataset();
            var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel);
            var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
            var trainData = textLoader.Load(dataPath);
            var experiment = context.Auto()
                .CreateBinaryClassificationExperiment(15)
                .Execute(trainData, new ColumnInformation()
                {
                    LabelColumnName = DatasetUtil.UciAdultLabel
                });

            // Count only the runs that finished without an exception. (Projecting to bool with
            // Select and then calling Count() would count every run, successful or not.)
            var successfulRunCount = experiment.RunDetails.Count(r => r.Exception == null);
            var lastException = experiment.RunDetails.LastOrDefault()?.Exception;

            // Ensure the (last) model that was training when maximum experiment time was reached has been stopped,
            // and that its MLContext has been canceled. Sometimes during CI unit testing, the host machines can run slower than normal, which
            // can increase the run time of unit tests, and may not produce multiple runs.
            if (successfulRunCount > 1 && lastException != null)
            {
                Assert.True(lastException.Message.Contains("Operation was canceled"),
                            "Training process was not successfully canceled after maximum experiment time was reached.");

                // Ensure that the best found model can still run after maximum experiment time was reached.
                IDataView predictions = experiment.BestRun.Model.Transform(trainData);
            }
        }
예제 #5
0
        /// <summary>
        /// Builds an auto-featurizer pipeline for the Iris data view, excluding the "Label" column,
        /// and approval-tests its JSON serialization.
        /// </summary>
        public void AutoFeaturizer_iris_test()
        {
            var mlContext = new MLContext(1);
            var irisData = DatasetUtil.GetIrisDataView();
            var excluded = new[] { "Label" };
            var featurizer = mlContext.Auto().Featurizer(irisData, excludeColumns: excluded);

            var json = JsonSerializer.Serialize(featurizer, _jsonSerializerOptions);
            Approvals.Verify(json);
        }
예제 #6
0
        /// <summary>
        /// Builds an auto-featurizer pipeline for the UCI Adult data view that writes to the
        /// "OutputFeature" column and excludes "Label", then approval-tests its JSON serialization.
        /// </summary>
        public void AutoFeaturizer_uci_adult_test()
        {
            var mlContext = new MLContext(1);
            var adultData = DatasetUtil.GetUciAdultDataView();
            var excluded = new[] { "Label" };
            var featurizer = mlContext.Auto().Featurizer(adultData, outputColumnName: "OutputFeature", excludeColumns: excluded);

            var json = JsonSerializer.Serialize(featurizer, _jsonSerializerOptions);
            Approvals.Verify(json);
        }
예제 #7
0
        /// <summary>
        /// Infers columns for the UCI Adult dataset with the label given by column index 14 and a
        /// header row, and checks that the label column is named "hours-per-week".
        /// </summary>
        public void IdentifyLabelColumnThroughIndexWithHeader()
        {
            const int labelColumnIndex = 14;
            const string expectedLabelName = "hours-per-week";

            var mlContext = new MLContext();
            var inference = mlContext.Auto().InferColumns(DatasetUtil.DownloadUciAdultDataset(), labelColumnIndex, hasHeader: true);

            Assert.True(inference.TextLoaderOptions.HasHeader);

            // Locate the loader column whose first source range covers exactly the label index.
            var labelColumn = inference.TextLoaderOptions.Columns.First(column =>
            {
                var firstRange = column.Source[0];
                return firstRange.Min == labelColumnIndex && firstRange.Max == labelColumnIndex;
            });

            Assert.Equal(expectedLabelName, labelColumn.Name);
            Assert.Equal(expectedLabelName, inference.ColumnInformation.LabelColumnName);
        }
예제 #8
0
        /// <summary>
        /// Runs a ranking AutoML experiment through four different <c>Execute</c> overloads and, for
        /// each result, checks that runs exist, that the last NDCG/DCG values of the best run exceed
        /// fixed thresholds, and that the best model's output schema has the expected column names.
        /// </summary>
        public void AutoFitRankingTest()
        {
            var labelColumn = "Label";
            var scoreColumn = "Score";
            var groupIdColumn = "GroupId";
            var featureVectorA = "FeatureVectorA";
            var featureVectorB = "FeatureVectorB";
            var mlContext = new MLContext(1);

            // STEP 1: Load the data; the first 500 rows become the test set, the rest the training set.
            var loaderOptions = GetLoaderArgsRank(labelColumn, groupIdColumn, featureVectorA, featureVectorB);
            var loader = new TextLoader(mlContext, loaderOptions);
            var allRows = loader.Load(new MultiFileSource(DatasetUtil.GetMLSRDataset()));
            var testRows = mlContext.Data.TakeRows(allRows, 500);
            var trainRows = mlContext.Data.SkipRows(allRows, 500);

            // STEP 2: Run the AutoML experiment once per Execute overload under test.
            var rankingExperiment = mlContext.Auto().CreateRankingExperiment(5);

            var labelAndGroup = new ColumnInformation()
            {
                LabelColumnName = labelColumn,
                GroupIdColumnName = groupIdColumn,
            };
            var labelGroupAndSamplingKey = new ColumnInformation()
            {
                LabelColumnName = labelColumn,
                GroupIdColumnName = groupIdColumn,
                SamplingKeyColumnName = groupIdColumn
            };

            ExperimentResult<RankingMetrics>[] allResults =
            {
                rankingExperiment.Execute(trainRows, labelColumn, groupIdColumn),
                rankingExperiment.Execute(trainRows, testRows),
                rankingExperiment.Execute(trainRows, testRows, labelAndGroup),
                rankingExperiment.Execute(trainRows, testRows, labelGroupAndSamplingKey)
            };

            // Expected output column name at each schema index (the group id name appears twice).
            var expectedNamesByIndex = new string[]
            {
                labelColumn, groupIdColumn, groupIdColumn, featureVectorA, featureVectorB,
                "Features", scoreColumn
            };

            foreach (var experimentResult in allResults)
            {
                RunDetail<RankingMetrics> best = experimentResult.BestRun;
                Assert.True(experimentResult.RunDetails.Any());
                Assert.NotNull(best.ValidationMetrics);
                Assert.True(best.ValidationMetrics.NormalizedDiscountedCumulativeGains.Last() > 0.4);
                Assert.True(best.ValidationMetrics.DiscountedCumulativeGains.Last() > 20);

                var schema = best.Model.GetOutputSchema(trainRows.Schema);
                foreach (var column in schema)
                {
                    Assert.True(column.Name == expectedNamesByIndex[column.Index]);
                }
            }
        }
예제 #9
0
        /// <summary>
        /// Infers columns for the Iris dataset with the label given by column index and no header,
        /// and checks that the label column receives the default label name.
        /// </summary>
        public void IdentifyLabelColumnThroughIndexWithoutHeader()
        {
            var mlContext = new MLContext();
            var inference = mlContext.Auto().InferColumns(DatasetUtil.DownloadIrisDataset(), DatasetUtil.IrisDatasetLabelColIndex);
            var loaderOptions = inference.TextLoaderOptions;

            Assert.False(loaderOptions.HasHeader);

            // Locate the loader column whose first source range covers exactly the label index.
            var labelColumn = loaderOptions.Columns.First(column =>
            {
                var firstRange = column.Source[0];
                return firstRange.Min == DatasetUtil.IrisDatasetLabelColIndex
                       && firstRange.Max == DatasetUtil.IrisDatasetLabelColIndex;
            });

            Assert.Equal(DefaultColumnNames.Label, labelColumn.Name);
            Assert.Equal(DefaultColumnNames.Label, inference.ColumnInformation.LabelColumnName);
        }
예제 #10
0
        /// <summary>
        /// Runs a time-limited binary classification experiment on the UCI Adult dataset and, when
        /// more than one run succeeded and the last run ended with an exception, verifies that the
        /// exception (or every exception inside an <see cref="AggregateException"/>) reports a
        /// cancellation and that the best model can still transform data.
        /// </summary>
        public void AutoFitMaxExperimentTimeTest()
        {
            // A single binary classification experiment takes less than 5 seconds.
            // System.OperationCanceledException is thrown when ongoing experiment
            // is canceled and at least one model has been generated.
            // BinaryClassificationExperiment includes LightGBM, which is not 32-bit
            // compatible.
            var context = new MLContext(1);
            var dataPath = DatasetUtil.GetUciAdultDataset();
            var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel);
            var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
            var trainData = textLoader.Load(dataPath);
            var experiment = context.Auto()
                .CreateBinaryClassificationExperiment(15)
                .Execute(trainData, new ColumnInformation()
                {
                    LabelColumnName = DatasetUtil.UciAdultLabel
                });

            // Count only the runs that finished without an exception. (Projecting to bool with
            // Select and then calling Count() would count every run, successful or not.)
            var successfulRunCount = experiment.RunDetails.Count(r => r.Exception == null);
            var lastException = experiment.RunDetails.LastOrDefault()?.Exception;

            // Ensure the (last) model that was training when maximum experiment time was reached has been stopped,
            // and that its MLContext has been canceled. Sometimes during CI unit testing, the host machines can run slower than normal, which
            // can increase the run time of unit tests, and may not produce multiple runs.
            if (successfulRunCount > 1 && lastException != null)
            {
                var expectedExceptionMessage = "Operation was canceled";

                // Sometimes multiple threads might throw the same "Operation was canceled"
                // exception and all of them are grouped inside an AggregateException.
                // In that case every inner exception must be the expected one.
                var messagesToCheck = lastException is AggregateException lastAggregateException
                    ? lastAggregateException.Flatten().InnerExceptions.Select(ex => ex.Message)
                    : new[] { lastException.Message };
                var containsMessage = messagesToCheck.All(message => message.Contains(expectedExceptionMessage));

                Assert.True(containsMessage,
                            $"Did not obtain '{expectedExceptionMessage}' error. " +
                            $"Obtained unexpected error of type {lastException.GetType()} with message: {lastException.Message}");

                // Ensure that the best found model can still run after maximum experiment time was reached.
                IDataView predictions = experiment.BestRun.Model.Transform(trainData);
            }
        }
예제 #11
0
        /// <summary>
        /// Runs a regression AutoML experiment under the given culture and checks that at least one
        /// run reaches an R-squared above 0.9. For cultures other than "en-US" it also requires at
        /// least 75 runs so the culture-sensitive code paths are exercised.
        /// </summary>
        /// <param name="culture">Culture name applied to the current thread for the duration of the test.</param>
        public void AutoFitRegressionTest(string culture)
        {
            var originalCulture = Thread.CurrentThread.CurrentCulture;

            try
            {
                Thread.CurrentThread.CurrentCulture = new CultureInfo(culture);

                // If users run AutoML with a different locale, sometimes
                // the sweeper encounters problems when parsing some strings.
                // So testing in another culture is necessary.
                // Furthermore, these issues might only occur after ~70
                // iterations, so more experiment time is needed for this to
                // occur.
                uint experimentTime = (uint)(culture == "en-US" ? 0 : 180);

                var experimentSettings = new RegressionExperimentSettings
                {
                    MaxExperimentTimeInSeconds = experimentTime
                };
                if (!Environment.Is64BitProcess)
                {
                    // LightGBM isn't available on x86 machines
                    experimentSettings.Trainers.Remove(RegressionTrainer.LightGbm);
                }

                var context = new MLContext(1);
                var dataPath = DatasetUtil.GetMlNetGeneratedRegressionDataset();
                var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.MlNetGeneratedRegressionLabel);
                var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
                var trainData = textLoader.Load(dataPath);

                // The first 20 rows are held out for validation; the remainder is used for training.
                var validationData = context.Data.TakeRows(trainData, 20);
                trainData = context.Data.SkipRows(trainData, 20);
                var result = context.Auto()
                    .CreateRegressionExperiment(experimentSettings)
                    .Execute(trainData, validationData,
                             new ColumnInformation()
                             {
                                 LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel
                             });

                // At least one run must exceed the threshold. Runs without validation metrics
                // are skipped rather than dereferenced.
                Assert.True(result.RunDetails.Any(run => run?.ValidationMetrics?.RSquared > 0.9),
                            "No run reached an RSquared above 0.9.");

                // Ensure experimentTime allows enough iterations to fully test the internationalization code
                // If the below assertion fails, increase the experiment time so the number of iterations is met
                Assert.True(culture == "en-US" || result.RunDetails.Count() >= 75, $"RunDetails.Count() = {result.RunDetails.Count()}, below 75");
            }
            finally
            {
                // Always restore the thread culture so later tests are unaffected.
                Thread.CurrentThread.CurrentCulture = originalCulture;
            }
        }
예제 #12
0
        /// <summary>
        /// Runs a regression AutoML experiment under the given culture and checks that the best
        /// R-squared across runs exceeds 0.99. For cultures other than "en-US" the experiment is
        /// configured for 75 models and the test requires exactly 75 runs.
        /// </summary>
        /// <param name="culture">Culture name applied to the current thread for the duration of the test.</param>
        public void AutoFitRegressionTest(string culture)
        {
            var savedCulture = Thread.CurrentThread.CurrentCulture;

            try
            {
                Thread.CurrentThread.CurrentCulture = new CultureInfo(culture);

                // Under a non-default locale the sweeper can hit problems parsing some strings,
                // so the test is repeated in another culture. Those problems may only surface
                // after ~70 iterations, hence the internal maxModels setting below.
                bool isEnUs = culture == "en-US";
                int modelBudget = isEnUs ? 1 : 75;

                var settings = new RegressionExperimentSettings
                {
                    MaxModels = modelBudget
                };

                // LightGBM isn't available on x86 machines
                if (!Environment.Is64BitProcess)
                {
                    settings.Trainers.Remove(RegressionTrainer.LightGbm);
                }

                var mlContext = new MLContext(1);
                var regressionDataPath = DatasetUtil.GetMlNetGeneratedRegressionDataset();
                var inferred = mlContext.Auto().InferColumns(regressionDataPath, DatasetUtil.MlNetGeneratedRegressionLabel);
                var loader = mlContext.Data.CreateTextLoader(inferred.TextLoaderOptions);
                var allRows = loader.Load(regressionDataPath);

                // The first 20 rows are held out for validation; the remainder is used for training.
                var validationRows = mlContext.Data.TakeRows(allRows, 20);
                var trainRows = mlContext.Data.SkipRows(allRows, 20);

                var labelInfo = new ColumnInformation()
                {
                    LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel
                };
                var experiment = mlContext.Auto().CreateRegressionExperiment(settings);
                var result = experiment.Execute(trainRows, validationRows, labelInfo);

                var bestRSquared = result.RunDetails.Max(run => run?.ValidationMetrics?.RSquared);
                Assert.True(bestRSquared > 0.99);

                // Test the internal maxModels parameter
                var runCount = result.RunDetails.Count();
                Assert.True(isEnUs || runCount == 75, $"RunDetails.Count() = {runCount}, is not 75");
            }
            finally
            {
                Thread.CurrentThread.CurrentCulture = savedCulture;
            }
        }
예제 #13
0
        /// <summary>
        /// Runs a binary classification experiment on the UCI Adult dataset with 10
        /// cross-validation folds and checks that the lowest fold accuracy of the best run
        /// exceeds 0.70 and that the estimator and trainer name are populated.
        /// </summary>
        public void AutoFit_UCI_Adult_CrossValidation_10_Test()
        {
            var mlContext = new MLContext(1);
            var adultDataPath = DatasetUtil.GetUciAdultDataset();
            var inferred = mlContext.Auto().InferColumns(adultDataPath, DatasetUtil.UciAdultLabel);
            var loader = mlContext.Data.CreateTextLoader(inferred.TextLoaderOptions);
            var adultRows = loader.Load(adultDataPath);

            var experiment = mlContext.Auto().CreateBinaryClassificationExperiment(1);
            var cvResult = experiment.Execute(adultRows, 10, DatasetUtil.UciAdultLabel);
            var bestRun = cvResult.BestRun;

            // Even the weakest fold of the best run must clear the accuracy bar.
            var lowestFoldAccuracy = bestRun.Results.Min(fold => fold.ValidationMetrics.Accuracy);
            Assert.True(lowestFoldAccuracy > 0.70);
            Assert.NotNull(bestRun.Estimator);
            Assert.NotNull(bestRun.TrainerName);
        }
예제 #14
0
        /// <summary>
        /// Checks that column inference without grouping yields only single-index columns, and
        /// that inference with grouping yields fewer loader columns than inference without it.
        /// </summary>
        public void UnGroupReturnsMoreColumnsThanGroup()
        {
            var adultDataPath = DatasetUtil.DownloadUciAdultDataset();
            var mlContext = new MLContext();

            var ungrouped = mlContext.Auto().InferColumns(adultDataPath, DatasetUtil.UciAdultLabel, groupColumns: false);
            var ungroupedColumns = ungrouped.TextLoaderOptions.Columns;

            // Without grouping, no column may have several source ranges or a range wider than one index.
            foreach (var column in ungroupedColumns)
            {
                var sources = column.Source;
                Assert.False(sources.Length > 1 || sources[0].Min != sources[0].Max);
            }

            var grouped = mlContext.Auto().InferColumns(adultDataPath, DatasetUtil.UciAdultLabel, groupColumns: true);
            var groupedColumnCount = grouped.TextLoaderOptions.Columns.Count();

            Assert.True(groupedColumnCount < ungroupedColumns.Count());
        }
예제 #15
0
        /// <summary>
        /// Infers columns for the generated regression dataset from a <c>ColumnInformation</c> that
        /// only names the label, and checks the label's data kind plus the resulting column
        /// information (single numeric column named after the default features column, no weight column).
        /// </summary>
        public void InferColumnsColumnInfoParam()
        {
            var labelOnlyInfo = new ColumnInformation()
            {
                LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel
            };

            var mlContext = new MLContext();
            var inference = mlContext.Auto().InferColumns(DatasetUtil.DownloadMlNetGeneratedRegressionDataset(), labelOnlyInfo);
            var inferredInfo = inference.ColumnInformation;
            var labelColumn = inference.TextLoaderOptions.Columns
                .First(column => column.Name == DatasetUtil.MlNetGeneratedRegressionLabel);

            Assert.Equal(DataKind.Single, labelColumn.DataKind);
            Assert.Equal(DatasetUtil.MlNetGeneratedRegressionLabel, inferredInfo.LabelColumnName);
            Assert.Single(inferredInfo.NumericColumnNames);
            Assert.Equal(DefaultColumnNames.Features, inferredInfo.NumericColumnNames.First());
            Assert.Null(inferredInfo.ExampleWeightColumnName);
        }
예제 #16
0
        /// <summary>
        /// Runs a binary classification experiment on a train/test split of the UCI Adult dataset
        /// and checks that the best run exceeds 0.70 accuracy and exposes an estimator, a model
        /// and a trainer name.
        /// </summary>
        public void AutoFit_UCI_Adult_Train_Test_Split_Test()
        {
            var mlContext = new MLContext(1);
            var adultDataPath = DatasetUtil.GetUciAdultDataset();
            var inferred = mlContext.Auto().InferColumns(adultDataPath, DatasetUtil.UciAdultLabel);
            var loader = mlContext.Data.CreateTextLoader(inferred.TextLoaderOptions);
            var adultRows = loader.Load(adultDataPath);
            var split = mlContext.Data.TrainTestSplit(adultRows);

            var experiment = mlContext.Auto().CreateBinaryClassificationExperiment(1);
            var result = experiment.Execute(split.TrainSet, split.TestSet, DatasetUtil.UciAdultLabel);
            var bestRun = result.BestRun;

            Assert.True(bestRun.ValidationMetrics.Accuracy > 0.70);
            Assert.NotNull(bestRun.Estimator);
            Assert.NotNull(bestRun.Model);
            Assert.NotNull(bestRun.TrainerName);
        }
        /// <summary>
        /// Drives the pipeline suggester for 60 iterations with random scores and checks that the
        /// runs after the initial one-per-trainer stage use exactly three distinct trainers, all of
        /// which are among the three best-scoring trainers of that initial stage.
        /// </summary>
        public void GetNextPipelineMock()
        {
            var context = new MLContext(1);
            var uciAdult = DatasetUtil.GetUciAdultDataView();
            var columns = DatasetColumnInfoUtil.GetDatasetColumnInfo(context, uciAdult, new ColumnInformation()
            {
                LabelColumnName = DatasetUtil.UciAdultLabel
            });

            // Get next pipeline loop
            var history = new List<PipelineScore>();
            var task = TaskKind.BinaryClassification;
            var maxIterations = 60;

            for (var i = 0; i < maxIterations; i++)
            {
                // A channel is started per suggestion and disposed as soon as the suggestion
                // has been produced, so channels are not left open for the rest of the test.
                using (var channel = ((IChannelProvider)context).Start("AutoMLTest"))
                {
                    // Get next pipeline
                    var pipeline = PipelineSuggester.GetNextPipeline(context, history, columns, task, channel);
                    if (pipeline == null)
                    {
                        break;
                    }

                    // The score is random: only the suggester's staging behavior is under test.
                    var result = new PipelineScore(pipeline, AutoMlUtils.Random.Value.NextDouble(), true);
                    history.Add(result);
                }
            }

            Assert.Equal(maxIterations, history.Count);

            // Get all 'Stage 1' and 'Stage 2' runs from Pipeline Suggester
            var allAvailableTrainers = RecipeInference.AllowedTrainers(context, task, new ColumnInformation(), null);
            var trainerCount = allAvailableTrainers.Count();
            var stage1Runs = history.Take(trainerCount);
            var stage2Runs = history.Skip(trainerCount);

            // Get the trainer names from top 3 Stage 1 runs
            var topStage1Runs = stage1Runs.OrderByDescending(r => r.Score).Take(3);
            var topStage1TrainerNames = topStage1Runs.Select(r => r.Pipeline.Nodes.Last().Name);

            // Get unique trainer names from Stage 2 runs (materialized once; used by both assertions)
            var stage2TrainerNames = stage2Runs.Select(r => r.Pipeline.Nodes.Last().Name).Distinct().ToList();

            // Assert that are only 3 unique trainers used in stage 2
            Assert.Equal(3, stage2TrainerNames.Count);
            // Assert that all trainers in stage 2 were the top trainers from stage 1
            Assert.False(topStage1TrainerNames.Except(stage2TrainerNames).Any());
        }
예제 #18
0
        /// <summary>
        /// Runs a ranking AutoML experiment with a custom group id column name and checks that runs
        /// exist, that at least one run exceeds the NDCG and DCG thresholds, and that the best
        /// model's output schema has the expected column names.
        /// </summary>
        public void AutoFitRankingTest()
        {
            string labelColumnName = "Label";
            string scoreColumnName = "Score";
            string groupIdColumnName = "CustomGroupId";
            string featuresColumnVectorNameA = "FeatureVectorA";
            string featuresColumnVectorNameB = "FeatureVectorB";
            var mlContext = new MLContext(1);

            // STEP 1: Load data; the first 500 rows become the test set, the rest the training set.
            var reader = new TextLoader(mlContext, GetLoaderArgsRank(labelColumnName, groupIdColumnName, featuresColumnVectorNameA, featuresColumnVectorNameB));
            var trainDataView = reader.Load(new MultiFileSource(DatasetUtil.GetMLSRDataset()));
            var testDataView = mlContext.Data.TakeRows(trainDataView, 500);

            trainDataView = mlContext.Data.SkipRows(trainDataView, 500);

            // STEP 2: Run AutoML experiment
            // The settings reuse groupIdColumnName so the custom name is defined in one place only.
            var experimentSettings = new RankingExperimentSettings()
            {
                GroupIdColumnName = groupIdColumnName,
                MaxExperimentTimeInSeconds = 5
            };
            ExperimentResult<RankingMetrics> experimentResult = mlContext.Auto()
                .CreateRankingExperiment(experimentSettings)
                .Execute(trainDataView, testDataView,
                         new ColumnInformation()
                         {
                             LabelColumnName = labelColumnName,
                             GroupIdColumnName = groupIdColumnName
                         });

            RunDetail<RankingMetrics> bestRun = experimentResult.BestRun;

            Assert.NotEmpty(experimentResult.RunDetails);
            Assert.NotNull(bestRun.ValidationMetrics);

            // At least one run must exceed each threshold. Runs without validation metrics
            // are skipped rather than dereferenced.
            Assert.True(experimentResult.RunDetails.Any(
                            run => run.ValidationMetrics?.NormalizedDiscountedCumulativeGains.Max() > .5),
                        "No run reached an NDCG above 0.5.");
            Assert.True(experimentResult.RunDetails.Any(
                            run => run.ValidationMetrics?.DiscountedCumulativeGains.Max() > 34),
                        "No run reached a DCG above 34.");

            var outputSchema = bestRun.Model.GetOutputSchema(trainDataView.Schema);

            // Expected output column name at each schema index (the group id name appears twice).
            var expectedOutputNames = new string[]
            {
                labelColumnName, groupIdColumnName, groupIdColumnName, featuresColumnVectorNameA, featuresColumnVectorNameB,
                "Features", scoreColumnName
            };

            foreach (var col in outputSchema)
            {
                // Assert.Equal reports both names on failure, unlike Assert.True on a comparison.
                Assert.Equal(expectedOutputNames[col.Index], col.Name);
            }
        }
예제 #19
0
        /// <summary>
        /// Runs a binary classification experiment on the downloaded UCI Adult dataset and checks
        /// that the best run exceeds 0.70 accuracy and exposes an estimator, a model and a trainer name.
        /// </summary>
        public void AutoFitBinaryTest()
        {
            var mlContext = new MLContext();
            var adultDataPath = DatasetUtil.DownloadUciAdultDataset();
            var inferred = mlContext.Auto().InferColumns(adultDataPath, DatasetUtil.UciAdultLabel);
            var loader = mlContext.Data.CreateTextLoader(inferred.TextLoaderOptions);
            var adultRows = loader.Load(adultDataPath);

            var experiment = mlContext.Auto().CreateBinaryClassificationExperiment(0);
            var labelInfo = new ColumnInformation()
            {
                LabelColumnName = DatasetUtil.UciAdultLabel
            };
            var result = experiment.Execute(adultRows, labelInfo);
            var bestRun = result.BestRun;

            Assert.True(bestRun.ValidationMetrics.Accuracy > 0.70);
            Assert.NotNull(bestRun.Estimator);
            Assert.NotNull(bestRun.Model);
            Assert.NotNull(bestRun.TrainerName);
        }
예제 #20
0
        /// <summary>
        /// Runs a 3-fold cross-validated ranking experiment through two <c>Execute</c> overloads and
        /// checks, for every fold of each best run, that the maximum NDCG and DCG values exceed
        /// fixed thresholds.
        /// </summary>
        public void AutoFitRankingCVTest()
        {
            string labelColumnName = "Label";
            string groupIdColumnName = "GroupIdCustom";
            string featuresColumnVectorNameA = "FeatureVectorA";
            string featuresColumnVectorNameB = "FeatureVectorB";
            uint numFolds = 3;

            var mlContext = new MLContext(1);
            var reader = new TextLoader(mlContext, GetLoaderArgsRank(labelColumnName, groupIdColumnName,
                                                                     featuresColumnVectorNameA, featuresColumnVectorNameB));
            var trainDataView = reader.Load(DatasetUtil.GetMLSRDataset());

            // Take less than 1500 rows of data to satisfy CrossValSummaryRunner's
            // limit.
            trainDataView = mlContext.Data.TakeRows(trainDataView, 1499);

            var experiment = mlContext.Auto()
                .CreateRankingExperiment(5);

            CrossValidationExperimentResult<RankingMetrics>[] experimentResults =
            {
                experiment.Execute(trainDataView, numFolds,
                                   new ColumnInformation()
                                   {
                                       LabelColumnName = labelColumnName,
                                       GroupIdColumnName = groupIdColumnName
                                   }),
                experiment.Execute(trainDataView, numFolds, labelColumnName, groupIdColumnName)
            };

            foreach (var experimentResult in experimentResults)
            {
                CrossValidationRunDetail<RankingMetrics> bestRun = experimentResult.BestRun;
                Assert.NotEmpty(experimentResult.RunDetails);

                // foreach disposes the enumerator, which a manual GetEnumerator/MoveNext loop does not.
                foreach (var foldResult in bestRun.Results)
                {
                    Assert.True(foldResult.ValidationMetrics.NormalizedDiscountedCumulativeGains.Max() > 0.31);
                    Assert.True(foldResult.ValidationMetrics.DiscountedCumulativeGains.Max() > 15);
                }
            }
        }
예제 #21
0
        /// <summary>
        /// Confirms that logs produced from contexts made during AutoML experiment runs are
        /// relayed to the main experiment <c>MLContext</c>: the <c>MlContextLog</c> handler
        /// subscribed here is expected to set <c>_markerAutoFitContextLogTest</c> during the run.
        /// </summary>
        public void AutoFitContextLogTest()
        {
            // Reset the marker before subscribing so a value left by an earlier test cannot pass this one.
            _markerAutoFitContextLogTest = false;
            var context = new MLContext(1);

            context.Log += MlContextLog;
            var datasetPath = DatasetUtil.GetFlowersDataset();
            var columnInference = context.Auto().InferColumns(datasetPath, "Label");
            var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
            var trainData = textLoader.Load(datasetPath);
            var result = context.Auto()
                .CreateMulticlassClassificationExperiment(15)
                .Execute(trainData, columnInference.ColumnInformation);

            // The two message halves are joined with a space so the text reads "sub contexts were".
            Assert.True(_markerAutoFitContextLogTest, "Image classification trainer logs from Experiment's sub contexts " +
                        "were not relayed to the main MLContext.");
        }
예제 #22
0
        /// <summary>
        /// Runs a regression AutoML experiment on the downloaded generated regression dataset and
        /// checks that at least one run reaches an R-squared above 0.9 on the validation rows.
        /// </summary>
        public void AutoFitRegressionTest()
        {
            var context = new MLContext();
            var dataPath = DatasetUtil.DownloadMlNetGeneratedRegressionDataset();
            var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.MlNetGeneratedRegressionLabel);
            var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
            var trainData = textLoader.Load(dataPath);

            // The first 20 rows are held out for validation; the remainder is used for training.
            var validationData = context.Data.TakeRows(trainData, 20);
            trainData = context.Data.SkipRows(trainData, 20);

            var result = context.Auto()
                .CreateRegressionExperiment(0)
                .Execute(trainData, validationData,
                         new ColumnInformation()
                         {
                             LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel
                         });

            // At least one run must exceed the threshold. Runs without validation metrics
            // are skipped rather than dereferenced.
            Assert.True(result.RunDetails.Any(run => run?.ValidationMetrics?.RSquared > 0.9),
                        "No run reached an RSquared above 0.9.");
        }
예제 #23
0
        public void AutoFitImageClassification()
        {
            // Exercises the code path that model builder code takes to obtain a model through
            // the image classification API.

            var mlContext = new MLContext(1);
            mlContext.Log += Context_Log;

            var flowersPath = DatasetUtil.GetFlowersDataset();
            var inferred    = mlContext.Auto().InferColumns(flowersPath, "Label");
            var images      = mlContext.Data.CreateTextLoader(inferred.TextLoaderOptions).Load(flowersPath);

            var experiment       = mlContext.Auto().CreateMulticlassClassificationExperiment(0);
            var experimentResult = experiment.Execute(images, inferred.ColumnInformation);

            // The best run's micro-accuracy must land inside the [0.80, 0.9] window.
            Assert.InRange(experimentResult.BestRun.ValidationMetrics.MicroAccuracy, 0.80, 0.9);

            // Scoring the training data must yield a text-typed predicted label column.
            var scored               = experimentResult.BestRun.Model.Transform(images);
            var predictedLabelColumn = scored.Schema[DefaultColumnNames.PredictedLabel];

            Assert.Equal(TextDataViewType.Instance, predictedLabelColumn.Type);
        }
예제 #24
0
        public void AutoFitRegressionTest(string culture)
        {
            // Runs a regression AutoML experiment while the current thread uses the given culture,
            // then restores the original culture regardless of the outcome.
            var originalCulture = Thread.CurrentThread.CurrentCulture;

            try
            {
                Thread.CurrentThread.CurrentCulture = new CultureInfo(culture);

                // If users run AutoML with a different locale, sometimes
                // the sweeper encounters problems when parsing some strings.
                // So testing in another culture is necessary.
                // Furthermore, these issues might only occur after ~70
                // iterations, so more experiment time is needed for this to
                // occur.
                uint experimentTime = (uint)(culture == "en-US" ? 0 : 180);

                var experimentSettings = new RegressionExperimentSettings {
                    MaxExperimentTimeInSeconds = experimentTime
                };
                if (!Environment.Is64BitProcess)
                {
                    // LightGBM isn't available on x86 machines
                    experimentSettings.Trainers.Remove(RegressionTrainer.LightGbm);
                }

                var context         = new MLContext(1);
                var dataPath        = DatasetUtil.GetMlNetGeneratedRegressionDataset();
                var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.MlNetGeneratedRegressionLabel);
                var textLoader      = context.Data.CreateTextLoader(columnInference.TextLoaderOptions);
                var trainData       = textLoader.Load(dataPath);

                // The first 20 rows become the validation set; the remaining rows are used for training.
                var validationData  = context.Data.TakeRows(trainData, 20);
                trainData = context.Data.SkipRows(trainData, 20);
                var result = context.Auto()
                             .CreateRegressionExperiment(experimentSettings)
                             .Execute(trainData, validationData,
                                      new ColumnInformation()
                {
                    LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel
                });

                Assert.True(result.RunDetails.Max(i => i?.ValidationMetrics?.RSquared) > 0.9);

                // Ensure experimentTime allows enough iterations to fully test the internationalization code
                // If the below assertion fails, increase the experiment time so the number of iterations is met
                var runCount = result.RunDetails.Count();
                Assert.True(culture == "en-US" || runCount >= 75, $"RunDetails.Count() = {runCount}, below 75");
            }
            catch (AggregateException ae)
            {
                // During CI unit testing, the host machines can run slower than normal, which
                // can increase the run time of unit tests and throw OperationCanceledExceptions
                // from multiple threads in the form of a single AggregateException.
                // Cancellations are tolerated; every other inner exception is collected and
                // rethrown together so that no unexpected failure is dropped from the report.
                var unexpectedExceptions = ae.Flatten().InnerExceptions
                                             .Where(ex => !(ex is OperationCanceledException))
                                             .ToList();
                if (unexpectedExceptions.Count > 0)
                {
                    throw new AggregateException(unexpectedExceptions);
                }
            }
            finally
            {
                Thread.CurrentThread.CurrentCulture = originalCulture;
            }
        }
예제 #25
0
        public void AutoFitWithPresplittedData()
        {
            // A model produced by AutoML must work over the same data regardless of how that
            // data was split before being handed to the experiment or to the model for scoring.

            var mlContext = new MLContext(1);
            var adultPath = DatasetUtil.GetUciAdultDataset();
            var inferred  = mlContext.Auto().InferColumns(adultPath, DatasetUtil.UciAdultLabel);
            var loader    = mlContext.Data.CreateTextLoader(inferred.TextLoaderOptions);
            var fullData  = loader.Load(adultPath);
            var trainTest = mlContext.Data.TrainTestSplit(fullData);
            var folds     = mlContext.Data.CrossValidationSplit(fullData, numberOfFolds: 2);

            // Model trained on the complete dataset.
            var fullExperiment = mlContext.Auto().CreateBinaryClassificationExperiment(0);
            var fullLabelInfo  = new ColumnInformation { LabelColumnName = DatasetUtil.UciAdultLabel };
            var modelFromFull  = fullExperiment.Execute(fullData, fullLabelInfo).BestRun.Model;

            // Model trained on the training part of a train/test split.
            var splitExperiment = mlContext.Auto().CreateBinaryClassificationExperiment(0);
            var splitLabelInfo  = new ColumnInformation { LabelColumnName = DatasetUtil.UciAdultLabel };
            var modelFromSplit  = splitExperiment.Execute(trainTest.TrainSet, splitLabelInfo).BestRun.Model;

            // Model trained on the training part of the first cross-validation fold.
            var foldExperiment = mlContext.Auto().CreateBinaryClassificationExperiment(0);
            var foldLabelInfo  = new ColumnInformation { LabelColumnName = DatasetUtil.UciAdultLabel };
            var modelFromFold  = foldExperiment.Execute(folds.First().TrainSet, foldLabelInfo).BestRun.Model;

            foreach (var trainedModel in new[] { modelFromFull, modelFromSplit, modelFromFold })
            {
                var scoredFull  = trainedModel.Transform(fullData);
                var scoredSplit = trainedModel.Transform(trainTest.TrainSet);
                var scoredFold  = trainedModel.Transform(folds.First().TrainSet);

                // Every scored view is expected to expose 30 columns.
                Assert.Equal(30, scoredFull.Schema.Count);
                Assert.Equal(30, scoredSplit.Schema.Count);
                Assert.Equal(30, scoredFold.Schema.Count);

                // Column names must line up position by position across the three views.
                foreach (var column in scoredFull.Schema)
                {
                    var expectedName = column.Name;

                    Assert.Equal(expectedName, scoredSplit.Schema[column.Index].Name);
                    Assert.Equal(expectedName, scoredFold.Schema[column.Index].Name);
                }
            }
        }
 public void ValidateInferColsPath()
 {
     // Feeds the UCI Adult dataset path to the InferColumns argument validator; the test
     // has no assertions, so it only fails if one of these calls throws.
     var adultDatasetPath = DatasetUtil.GetUciAdultDataset();

     UserInputValidationUtil.ValidateInferColumnsArgs(adultDatasetPath);
 }
예제 #27
0
 public void LabelIndexOutOfBoundsThrows()
 {
     // Inferring columns with label column index 100 on the UCI Adult dataset must raise
     // ArgumentOutOfRangeException. All work stays inside the delegate so that anything
     // thrown along the way is observed by Assert.Throws.
     Assert.Throws <ArgumentOutOfRangeException>(() =>
     {
         var autoCatalog = new MLContext().Auto();
         var adultPath   = DatasetUtil.DownloadUciAdultDataset();

         autoCatalog.InferColumns(adultPath, 100);
     });
 }