Example 1
        static void Main(string[] args)
        {
            var context = new MLContext();

            var data = context.Data.LoadFromTextFile<RankingData>("./ranking.tsv", separatorChar: '\t');

            var trainTestSplit = context.Data.TrainTestSplit(data, testFraction: 0.2);

            var settings = new RankingExperimentSettings
            {
                MaxExperimentTimeInSeconds = 300,
                OptimizingMetric           = RankingMetric.Ndcg,
            };

            var experiment = context.Auto().CreateRankingExperiment(settings);

            var progressHandler = new Progress<RunDetail<RankingMetrics>>(ph =>
            {
                if (ph.ValidationMetrics != null)
                {
                    Console.WriteLine($"Current trainer - {ph.TrainerName} with nDCG {ph.ValidationMetrics.NormalizedDiscountedCumulativeGains.Average()}");
                }
            });

            var results = experiment.Execute(trainTestSplit.TrainSet, validationData: trainTestSplit.TestSet,
                                             progressHandler: progressHandler);

            var bestRun = results.BestRun;

            var metrics = bestRun.ValidationMetrics.NormalizedDiscountedCumulativeGains;

            Console.WriteLine(Environment.NewLine);
            Console.WriteLine($"Best model {bestRun.TrainerName} - with nDCG {metrics.Average()}");
        }
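The snippet above loads data through a RankingData class that it never defines. A minimal sketch of what that class might look like follows; the column order, LoadColumn indices, and feature-vector width are assumptions, since the layout of ranking.tsv is not shown:

        using Microsoft.ML.Data;

        // Hypothetical schema for ranking.tsv; the indices and the vector
        // width are assumptions and must be adjusted to the actual file.
        public class RankingData
        {
            [LoadColumn(0)]
            public float Label { get; set; }

            [LoadColumn(1)]
            public string GroupId { get; set; }

            [LoadColumn(2, 9)]
            [VectorType(8)]
            public float[] Features { get; set; }
        }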
Example 2
        static void Main(string[] args)
        {
            var context = new MLContext();

            var data = context.Data.LoadFromTextFile<RankingData>("./ranking.tsv", separatorChar: '\t');

            var settings = new RankingExperimentSettings
            {
                MaxExperimentTimeInSeconds = 300,
                OptimizingMetric           = RankingMetric.Ndcg
            };

            var experiment = context.Auto().CreateRankingExperiment(settings);

            var results = experiment.Execute(data);

            var bestModel = results.BestRun.Model;
        }
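Example 2 stops once the best model has been retrieved. Continuing from the same variables, the model could be persisted and reloaded for scoring; this is a minimal sketch, and the file name is arbitrary:

            // Persist the best model along with the input schema.
            context.Model.Save(bestModel, data.Schema, "rankingModel.zip");

            // Later: reload the model and use it to score a data view.
            var loadedModel = context.Model.Load("rankingModel.zip", out DataViewSchema inputSchema);
            IDataView scoredData = loadedModel.Transform(data);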
Example 3
        public void AutoFitRankingTest()
        {
            string labelColumnName           = "Label";
            string scoreColumnName           = "Score";
            string groupIdColumnName         = "GroupId";
            string featuresColumnVectorNameA = "FeatureVectorA";
            string featuresColumnVectorNameB = "FeatureVectorB";
            var    mlContext = new MLContext(1);

            // STEP 1: Load data
            var reader        = new TextLoader(mlContext, GetLoaderArgsRank(labelColumnName, groupIdColumnName, featuresColumnVectorNameA, featuresColumnVectorNameB));
            var trainDataView = reader.Load(new MultiFileSource(DatasetUtil.GetMLSRDataset()));
            var testDataView  = mlContext.Data.TakeRows(trainDataView, 500);

            trainDataView = mlContext.Data.SkipRows(trainDataView, 500);

            // STEP 2: Run AutoML experiment
            var settings = new RankingExperimentSettings()
            {
                MaxExperimentTimeInSeconds        = 5,
                OptimizationMetricTruncationLevel = 3
            };
            var experiment = mlContext.Auto()
                             .CreateRankingExperiment(settings);

            ExperimentResult<RankingMetrics>[] experimentResults =
            {
                experiment.Execute(trainDataView, labelColumnName, groupIdColumnName),
                experiment.Execute(trainDataView, testDataView),
                experiment.Execute(trainDataView, testDataView,
                                   new ColumnInformation()
                {
                    LabelColumnName   = labelColumnName,
                    GroupIdColumnName = groupIdColumnName,
                }),
                experiment.Execute(trainDataView, testDataView,
                                   new ColumnInformation()
                {
                    LabelColumnName       = labelColumnName,
                    GroupIdColumnName     = groupIdColumnName,
                    SamplingKeyColumnName = groupIdColumnName
                })
            };

            for (int i = 0; i < experimentResults.Length; i++)
            {
                RunDetail<RankingMetrics> bestRun = experimentResults[i].BestRun;
                // The user requested truncation level 3, but the evaluator always reports 10 NDCG/DCG values.
                Assert.Equal(10, bestRun.ValidationMetrics.DiscountedCumulativeGains.Count);
                Assert.Equal(10, bestRun.ValidationMetrics.NormalizedDiscountedCumulativeGains.Count);
                Assert.True(experimentResults[i].RunDetails.Count() > 0);
                Assert.NotNull(bestRun.ValidationMetrics);
                Assert.True(bestRun.ValidationMetrics.NormalizedDiscountedCumulativeGains.Last() > 0.4);
                Assert.True(bestRun.ValidationMetrics.DiscountedCumulativeGains.Last() > 19);
                var outputSchema        = bestRun.Model.GetOutputSchema(trainDataView.Schema);
                var expectedOutputNames = new string[] { labelColumnName, groupIdColumnName, groupIdColumnName, featuresColumnVectorNameA, featuresColumnVectorNameB,
                                                         "Features", scoreColumnName };
                foreach (var col in outputSchema)
                {
                    Assert.True(col.Name == expectedOutputNames[col.Index]);
                }
            }
        }
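The test above relies on a GetLoaderArgsRank helper and a DatasetUtil class that are not shown (they live in the ML.NET test utilities). A plausible reconstruction of the helper is sketched below; the column indices and feature ranges are assumptions about the MSLR dataset layout, not the actual implementation:

        // Hypothetical reconstruction of the loader-options helper used above;
        // the real helper ships with the ML.NET test suite.
        private static TextLoader.Options GetLoaderArgsRank(string labelColumnName, string groupIdColumnName,
                                                            string featuresColumnVectorNameA, string featuresColumnVectorNameB)
        {
            return new TextLoader.Options
            {
                Separators = new[] { '\t' },
                HasHeader  = false,
                Columns    = new[]
                {
                    new TextLoader.Column(labelColumnName, DataKind.Single, 0),
                    new TextLoader.Column(groupIdColumnName, DataKind.UInt32, 1),
                    new TextLoader.Column(featuresColumnVectorNameA, DataKind.Single, 2, 9),
                    new TextLoader.Column(featuresColumnVectorNameB, DataKind.Single, 10, 136)
                }
            };
        }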
        private static (IDataView, IDataView) BuildTrainEvaluateAndSaveModel(MLContext mlContext)
        {
            // STEP 1: Download and load the data
            GetData(InputPath, OutputPath, TrainDatasetPath, TrainDatasetUrl, TestDatasetUrl, TestDatasetPath,
                    ValidationDatasetUrl, ValidationDatasetPath);

            //ColumnInferenceResults columnInference = mlContext.Auto().InferColumns(TrainDatasetPath, labelColumnIndex: 0,
            //    separatorChar: '\t', hasHeader: true, groupColumns: false, allowSparse: true);

            var textLoaderOptions = new TextLoader.Options
            {
                Separators = new[] { '\t' },
                HasHeader  = true,
                Columns    = new[]
                {
                    new TextLoader.Column("Label", DataKind.Single, 0),
                    new TextLoader.Column("GroupId", DataKind.Int32, 1),
                    new TextLoader.Column("Features", DataKind.Single, 2, 133),
                }
            };

            TextLoader textLoader         = mlContext.Data.CreateTextLoader(textLoaderOptions);
            IDataView  trainDataView      = textLoader.Load(TrainDatasetPath);
            IDataView  validationDataView = textLoader.Load(ValidationDatasetPath);
            IDataView  testDataView       = textLoader.Load(TestDatasetPath);

            // STEP 2: Display first few rows of training data
            ConsoleHelper.ShowDataViewInConsole(mlContext, trainDataView);

            // STEP 3: Initialize our user-defined progress handler that AutoML will
            // invoke after each model it produces and evaluates.
            var progressHandler = new RankingExperimentProgressHandler();

            // STEP 4: Run AutoML ranking experiment
            ConsoleHelper.ConsoleWriteHeader("=============== Running AutoML experiment ===============");
            Console.WriteLine($"Running AutoML ranking experiment for {ExperimentTime} seconds...");

            var experimentSettings = new RankingExperimentSettings
            {
                MaxExperimentTimeInSeconds        = ExperimentTime,
                OptimizingMetric                  = RankingMetric.Ndcg,
                OptimizationMetricTruncationLevel = 10
            };

            ExperimentResult<RankingMetrics> experimentResult = mlContext.Auto()
                                                                 .CreateRankingExperiment(experimentSettings)
                                                                 .Execute(
                trainData: trainDataView,
                validationData: validationDataView,
                progressHandler: progressHandler);

            // Print top models found by AutoML
            Console.WriteLine("\n===== Evaluating model's NDCG (on validation data) =====");
            PrintTopModels(experimentResult, experimentSettings.OptimizationMetricTruncationLevel);

            var rankingEvaluatorOptions = new RankingEvaluatorOptions
            {
                DcgTruncationLevel = Math.Min(10, (int)experimentSettings.OptimizationMetricTruncationLevel * 2)
            };

            Console.WriteLine("\n===== Evaluating model's NDCG (on test data) =====");
            IDataView predictions = experimentResult.BestRun.Model.Transform(testDataView);
            var       metrics     = mlContext.Ranking.Evaluate(predictions, rankingEvaluatorOptions);

            ConsoleHelper.PrintRankingMetrics(experimentResult.BestRun.TrainerName, metrics, experimentSettings.OptimizationMetricTruncationLevel);

            // STEP 5: Refit the model and get final metrics
            // Re-fit best pipeline on train and validation data, to produce
            // a model that is trained on as much data as is available while
            // still having test data for the final estimate of how well the
            // model will do in production.
            Console.WriteLine("\n===== Refitting on train+valid and evaluating model's NDCG (on test data) =====");
            var       trainPlusValidationDataView = textLoader.Load(new MultiFileSource(TrainDatasetPath, ValidationDatasetPath));
            var       refitModel = experimentResult.BestRun.Estimator.Fit(trainPlusValidationDataView);
            IDataView predictionsRefitOnTrainPlusValidation = refitModel.Transform(testDataView);
            var       metricsRefitOnTrainPlusValidation     = mlContext.Ranking.Evaluate(predictionsRefitOnTrainPlusValidation, rankingEvaluatorOptions);

            ConsoleHelper.PrintRankingMetrics(experimentResult.BestRun.TrainerName, metricsRefitOnTrainPlusValidation, experimentSettings.OptimizationMetricTruncationLevel);

            // STEP 6: Refit the model with all available data
            // Re-fit best pipeline again on train, validation, and test data, to
            // produce a model that is trained on as much data as is available.
            // This is the final model that can be deployed to production.
            // No metrics are printed since we no longer have an independent
            // scoring dataset.
            Console.WriteLine("\n===== Refitting on train+valid+test to get the final model to launch to production =====");
            var trainPlusValidationPlusTestDataView = textLoader.Load(new MultiFileSource(TrainDatasetPath, ValidationDatasetPath, TestDatasetPath));
            var refitModelOnTrainValidTest          = experimentResult.BestRun.Estimator.Fit(trainPlusValidationPlusTestDataView);

            // STEP 7: Save/persist the trained model to a .ZIP file
            mlContext.Model.Save(refitModelOnTrainValidTest, trainDataView.Schema, ModelPath);

            Console.WriteLine("The model is saved to {0}", ModelPath);

            return (predictionsRefitOnTrainPlusValidation, testDataView);
        }
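BuildTrainEvaluateAndSaveModel references a RankingExperimentProgressHandler (STEP 3) whose definition sits elsewhere in the sample's helper files. A minimal sketch of such a handler, assuming it only logs each completed AutoML iteration, could look like this:

        using System;
        using System.Linq;
        using Microsoft.ML.AutoML;
        using Microsoft.ML.Data;

        // Hypothetical progress handler: AutoML invokes Report once per trained
        // and evaluated model, so each call logs the trainer and its validation NDCG.
        public class RankingExperimentProgressHandler : IProgress<RunDetail<RankingMetrics>>
        {
            private int _iterationIndex;

            public void Report(RunDetail<RankingMetrics> iterationResult)
            {
                _iterationIndex++;
                if (iterationResult.Exception != null)
                {
                    Console.WriteLine($"Iteration {_iterationIndex} failed: {iterationResult.Exception.Message}");
                }
                else if (iterationResult.ValidationMetrics != null)
                {
                    Console.WriteLine($"Iteration {_iterationIndex}: {iterationResult.TrainerName}, " +
                                      $"average NDCG {iterationResult.ValidationMetrics.NormalizedDiscountedCumulativeGains.Average():F4}");
                }
            }
        }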