static void Main(string[] args)
{
    var context = new MLContext();

    // Load the tab-separated ranking data and hold out 20% for validation.
    var data = context.Data.LoadFromTextFile<RankingData>("./ranking.tsv", separatorChar: '\t');
    var trainTestSplit = context.Data.TrainTestSplit(data, testFraction: 0.2);

    // Configure a 5-minute AutoML ranking experiment that optimizes nDCG.
    var settings = new RankingExperimentSettings
    {
        MaxExperimentTimeInSeconds = 300,
        OptimizingMetric = RankingMetric.Ndcg
    };
    var experiment = context.Auto().CreateRankingExperiment(settings);

    // Report each candidate trainer and its validation nDCG as AutoML evaluates it.
    var progressHandler = new Progress<RunDetail<RankingMetrics>>(ph =>
    {
        if (ph.ValidationMetrics != null)
        {
            Console.WriteLine($"Current trainer - {ph.TrainerName} with nDCG {ph.ValidationMetrics.NormalizedDiscountedCumulativeGains.Average()}");
        }
    });

    var results = experiment.Execute(trainTestSplit.TrainSet,
        validationData: trainTestSplit.TestSet,
        progressHandler: progressHandler);

    // Report the best run found by the experiment.
    var bestRun = results.BestRun;
    var metrics = bestRun.ValidationMetrics.NormalizedDiscountedCumulativeGains;
    Console.WriteLine(Environment.NewLine);
    Console.WriteLine($"Best model {bestRun.TrainerName} - with nDCG {metrics.Average()}");
}
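The listing above (and the shorter variant that follows) loads its input through a RankingData class that is not shown. A minimal sketch of what such a class could look like is given below; the column positions and the size of the feature vector are assumptions for illustration and should be matched to the actual ranking.tsv file.

using Microsoft.ML.Data;

// Hypothetical input schema for ranking.tsv (column indices are assumed).
public class RankingData
{
    [LoadColumn(0)]
    public float Label { get; set; }

    // Query/group identifier; AutoML converts this to a key column internally.
    [LoadColumn(1)]
    public uint GroupId { get; set; }

    // Assumes the remaining columns (2 through 133) form a single feature vector.
    [LoadColumn(2, 133), VectorType(132)]
    public float[] Features { get; set; }
}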
static void Main(string[] args)
{
    var context = new MLContext();

    // Load the ranking data and run a 5-minute AutoML ranking experiment
    // optimizing nDCG, letting AutoML handle the train/validation split.
    var data = context.Data.LoadFromTextFile<RankingData>("./ranking.tsv", separatorChar: '\t');
    var settings = new RankingExperimentSettings
    {
        MaxExperimentTimeInSeconds = 300,
        OptimizingMetric = RankingMetric.Ndcg
    };
    var experiment = context.Auto().CreateRankingExperiment(settings);
    var results = experiment.Execute(data);

    // Keep the best model produced by the experiment.
    var bestModel = results.BestRun.Model;
}
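Once the experiment completes, the best model can be applied to new data. The snippet below is a minimal sketch of scoring with it; the RankingPrediction class is an assumed output type (the trained ranking model adds a Score column), not part of the original listing.

// Hypothetical prediction class; the ranking model emits a Score column.
public class RankingPrediction
{
    public float Score { get; set; }
}

// Apply the best model to a data view and read back the predicted scores.
IDataView scored = bestModel.Transform(data);
var scores = context.Data.CreateEnumerable<RankingPrediction>(scored, reuseRowObject: false);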
public void AutoFitRankingTest()
{
    string labelColumnName = "Label";
    string scoreColumnName = "Score";
    string groupIdColumnName = "GroupId";
    string featuresColumnVectorNameA = "FeatureVectorA";
    string featuresColumnVectorNameB = "FeatureVectorB";
    var mlContext = new MLContext(1);

    // STEP 1: Load data
    var reader = new TextLoader(mlContext, GetLoaderArgsRank(labelColumnName, groupIdColumnName,
        featuresColumnVectorNameA, featuresColumnVectorNameB));
    var trainDataView = reader.Load(new MultiFileSource(DatasetUtil.GetMLSRDataset()));
    var testDataView = mlContext.Data.TakeRows(trainDataView, 500);
    trainDataView = mlContext.Data.SkipRows(trainDataView, 500);

    // STEP 2: Run AutoML experiment
    var settings = new RankingExperimentSettings()
    {
        MaxExperimentTimeInSeconds = 5,
        OptimizationMetricTruncationLevel = 3
    };
    var experiment = mlContext.Auto()
        .CreateRankingExperiment(settings);
    ExperimentResult<RankingMetrics>[] experimentResults =
    {
        experiment.Execute(trainDataView, labelColumnName, groupIdColumnName),
        experiment.Execute(trainDataView, testDataView),
        experiment.Execute(trainDataView, testDataView, new ColumnInformation()
        {
            LabelColumnName = labelColumnName,
            GroupIdColumnName = groupIdColumnName,
        }),
        experiment.Execute(trainDataView, testDataView, new ColumnInformation()
        {
            LabelColumnName = labelColumnName,
            GroupIdColumnName = groupIdColumnName,
            SamplingKeyColumnName = groupIdColumnName
        })
    };

    for (int i = 0; i < experimentResults.Length; i++)
    {
        RunDetail<RankingMetrics> bestRun = experimentResults[i].BestRun;

        // The user requested 3, but we always return at least 10.
        Assert.Equal(10, bestRun.ValidationMetrics.DiscountedCumulativeGains.Count);
        Assert.Equal(10, bestRun.ValidationMetrics.NormalizedDiscountedCumulativeGains.Count);
        Assert.True(experimentResults[i].RunDetails.Count() > 0);
        Assert.NotNull(bestRun.ValidationMetrics);
        Assert.True(bestRun.ValidationMetrics.NormalizedDiscountedCumulativeGains.Last() > 0.4);
        Assert.True(bestRun.ValidationMetrics.DiscountedCumulativeGains.Last() > 19);

        var outputSchema = bestRun.Model.GetOutputSchema(trainDataView.Schema);
        var expectedOutputNames = new string[]
        {
            labelColumnName, groupIdColumnName, groupIdColumnName,
            featuresColumnVectorNameA, featuresColumnVectorNameB,
            "Features", scoreColumnName
        };
        foreach (var col in outputSchema)
        {
            Assert.True(col.Name == expectedOutputNames[col.Index]);
        }
    }
}
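The test depends on a GetLoaderArgsRank helper (and the DatasetUtil test utilities) that are not included here. The sketch below shows roughly what such a helper might return; the separator, header flag, and column index ranges are assumptions for illustration rather than the actual test implementation.

// Rough sketch of a loader-options helper like the one used above;
// the column ranges are placeholders and must match the real dataset.
private TextLoader.Options GetLoaderArgsRank(string labelColumnName, string groupIdColumnName,
    string featuresColumnVectorNameA, string featuresColumnVectorNameB)
{
    return new TextLoader.Options
    {
        Separators = new[] { '\t' },
        HasHeader = true,
        Columns = new[]
        {
            new TextLoader.Column(labelColumnName, DataKind.Single, 0),
            new TextLoader.Column(groupIdColumnName, DataKind.UInt32, 1),
            new TextLoader.Column(featuresColumnVectorNameA, DataKind.Single, 2, 9),
            new TextLoader.Column(featuresColumnVectorNameB, DataKind.Single, 10, 137)
        }
    };
}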
private static (IDataView, IDataView) BuildTrainEvaluateAndSaveModel(MLContext mlContext)
{
    // STEP 1: Download and load the data
    GetData(InputPath, OutputPath, TrainDatasetPath, TrainDatasetUrl, TestDatasetUrl,
        TestDatasetPath, ValidationDatasetUrl, ValidationDatasetPath);

    //ColumnInferenceResults columnInference = mlContext.Auto().InferColumns(TrainDatasetPath, labelColumnIndex: 0,
    //    separatorChar: '\t', hasHeader: true, groupColumns: false, allowSparse: true);

    var textLoaderOptions = new TextLoader.Options
    {
        Separators = new[] { '\t' },
        HasHeader = true,
        Columns = new[]
        {
            new TextLoader.Column("Label", DataKind.Single, 0),
            new TextLoader.Column("GroupId", DataKind.Int32, 1),
            new TextLoader.Column("Features", DataKind.Single, 2, 133),
        }
    };

    TextLoader textLoader = mlContext.Data.CreateTextLoader(textLoaderOptions);
    IDataView trainDataView = textLoader.Load(TrainDatasetPath);
    IDataView validationDataView = textLoader.Load(ValidationDatasetPath);
    IDataView testDataView = textLoader.Load(TestDatasetPath);

    // STEP 2: Display first few rows of training data
    ConsoleHelper.ShowDataViewInConsole(mlContext, trainDataView);

    // STEP 3: Initialize our user-defined progress handler that AutoML will
    // invoke after each model it produces and evaluates.
    var progressHandler = new RankingExperimentProgressHandler();

    // STEP 4: Run AutoML ranking experiment
    ConsoleHelper.ConsoleWriteHeader("=============== Running AutoML experiment ===============");
    Console.WriteLine($"Running AutoML ranking experiment for {ExperimentTime} seconds...");

    var experimentSettings = new RankingExperimentSettings
    {
        MaxExperimentTimeInSeconds = ExperimentTime,
        OptimizingMetric = RankingMetric.Ndcg,
        OptimizationMetricTruncationLevel = 10
    };

    ExperimentResult<RankingMetrics> experimentResult = mlContext.Auto()
        .CreateRankingExperiment(experimentSettings)
        .Execute(
            trainData: trainDataView,
            validationData: validationDataView,
            progressHandler: progressHandler);

    // Print top models found by AutoML
    Console.WriteLine("\n===== Evaluating model's NDCG (on validation data) =====");
    PrintTopModels(experimentResult, experimentSettings.OptimizationMetricTruncationLevel);

    var rankingEvaluatorOptions = new RankingEvaluatorOptions
    {
        DcgTruncationLevel = Math.Min(10, (int)experimentSettings.OptimizationMetricTruncationLevel * 2)
    };

    Console.WriteLine("\n===== Evaluating model's NDCG (on test data) =====");
    IDataView predictions = experimentResult.BestRun.Model.Transform(testDataView);
    var metrics = mlContext.Ranking.Evaluate(predictions, rankingEvaluatorOptions);
    ConsoleHelper.PrintRankingMetrics(experimentResult.BestRun.TrainerName, metrics,
        experimentSettings.OptimizationMetricTruncationLevel);

    // STEP 5: Refit the model and get final metrics
    // Re-fit best pipeline on train and validation data, to produce
    // a model that is trained on as much data as is available while
    // still having test data for the final estimate of how well the
    // model will do in production.
Console.WriteLine("\n===== Refitting on train+valid and evaluating model's NDCG (on test data) ====="); var trainPlusValidationDataView = textLoader.Load(new MultiFileSource(TrainDatasetPath, ValidationDatasetPath)); var refitModel = experimentResult.BestRun.Estimator.Fit(trainPlusValidationDataView); IDataView predictionsRefitOnTrainPlusValidation = refitModel.Transform(testDataView); var metricsRefitOnTrainPlusValidation = mlContext.Ranking.Evaluate(predictionsRefitOnTrainPlusValidation, rankingEvaluatorOptions); ConsoleHelper.PrintRankingMetrics(experimentResult.BestRun.TrainerName, metricsRefitOnTrainPlusValidation, experimentSettings.OptimizationMetricTruncationLevel); // STEP 6: Refit the model with all available data // Re-fit best pipeline again on train, validation, and test data, to // produce a model that is trained on as much data as is available. // This is the final model that can be deployed to production. // No metrics are printed since we no longer have an independent // scoring dataset. Console.WriteLine("\n===== Refitting on train+valid+test to get the final model to launch to production ====="); var trainPlusValidationPlusTestDataView = textLoader.Load(new MultiFileSource(TrainDatasetPath, ValidationDatasetPath, TestDatasetPath)); var refitModelOnTrainValidTest = experimentResult.BestRun.Estimator.Fit(trainPlusValidationPlusTestDataView); // STEP 7: Save/persist the trained model to a .ZIP file mlContext.Model.Save(refitModelOnTrainValidTest, trainDataView.Schema, ModelPath); Console.WriteLine("The model is saved to {0}", ModelPath); return(predictionsRefitOnTrainPlusValidation, testDataView); }