/// <summary>
/// Trains a multiclass classifier on the tab-separated <c>ChatIssue</c> data set and writes
/// the fitted model to <paramref name="ModelPath"/>.
/// </summary>
/// <param name="DataSetLocation">Path of the tab-separated training file (first row is a header).</param>
/// <param name="ModelPath">Path of the file the trained model is written to; an existing file is overwritten.</param>
/// <param name="selectedStrategy">Training algorithm to use.</param>
/// <exception cref="System.ArgumentOutOfRangeException">
/// <paramref name="selectedStrategy"/> is not one of the strategies handled by this method.
/// </exception>
public static void TrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy)
{
    // A fixed seed is passed so repeated trainings use the same random sequence.
    Microsoft.ML.MLContext mlContext = new MLContext(seed: 1);

    var trainingDataView = mlContext.Data.LoadFromTextFile<ChatIssue>(DataSetLocation, hasHeader: true, separatorChar: '\t', allowSparse: false);

    // Label comes from Answer; Features is the concatenation of the featurized Question and More columns.
    // The cache checkpoint keeps the transformed data in memory, which lowers training time for
    // small/medium data sets. Remove it when handling very large data sets.
    var dataProcessPipeline = mlContext.Transforms.Conversion.MapValueToKey(outputColumnName: DefaultColumnNames.Label, inputColumnName: nameof(ChatIssue.Answer))
        .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "TitleFeaturized", inputColumnName: nameof(ChatIssue.Question)))
        .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "DescriptionFeaturized", inputColumnName: nameof(ChatIssue.More)))
        .Append(mlContext.Transforms.Concatenate(outputColumnName: DefaultColumnNames.Features, "TitleFeaturized", "DescriptionFeaturized"))
        .AppendCacheCheckpoint(mlContext);

    IEstimator<ITransformer> trainer;
    switch (selectedStrategy)
    {
        case MyTrainerStrategy.SdcaMultiClassTrainer:
            trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label, DefaultColumnNames.Features);
            break;

        case MyTrainerStrategy.OVAAveragedPerceptronTrainer:
        {
            // One-Versus-All: one binary classifier is trained per class, and the class whose
            // classifier reports the highest confidence score wins.
            var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron(DefaultColumnNames.Label, DefaultColumnNames.Features, numIterations: 10);
            trainer = mlContext.MulticlassClassification.Trainers.OneVersusAll(averagedPerceptronBinaryTrainer);
            break;
        }

        default:
            // Fail here, naming the real culprit, instead of letting a null trainer reach Append().
            throw new System.ArgumentOutOfRangeException(nameof(selectedStrategy), selectedStrategy, "Unsupported trainer strategy.");
    }

    // MapKeyToValue turns the predicted key back into the original label value.
    var trainingPipeline = dataProcessPipeline
        .Append(trainer)
        .Append(mlContext.Transforms.Conversion.MapKeyToValue(DefaultColumnNames.PredictedLabel));

    var trainedModel = trainingPipeline.Fit(trainingDataView);

    using (var fs = new FileStream(ModelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
    {
        mlContext.Model.Save(trainedModel, fs);
    }
}
/// <summary>
/// Trains a multiclass classifier on the tab-separated <c>QuestionsIssue</c> data set,
/// evaluates it on a held-out 20% split, and saves the fitted model to <paramref name="ModelPath"/>.
/// </summary>
/// <param name="DataSetLocation">Path of the tab-separated training file (first row is a header).</param>
/// <param name="ModelPath">Path the trained model is saved to.</param>
/// <param name="selectedStrategy">Training algorithm to use; only <c>SdcaMultiClassTrainer</c> is handled here.</param>
/// <exception cref="System.ArgumentOutOfRangeException">
/// <paramref name="selectedStrategy"/> is not handled by this method.
/// </exception>
public static void TrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy)
{
    // A fixed seed is passed so repeated trainings use the same random sequence.
    Microsoft.ML.MLContext mlContext = new MLContext(seed: 1);

    var fullDataView = mlContext.Data.LoadFromTextFile<QuestionsIssue>(DataSetLocation, hasHeader: true, separatorChar: '\t', allowSparse: false);

    // Hold back 20% of the rows so the evaluation below runs on data the model has not seen.
    TrainTestData trainTestSplit = mlContext.Data.TrainTestSplit(fullDataView, testFraction: 0.2);
    IDataView trainingData = trainTestSplit.TrainSet;
    IDataView testData = trainTestSplit.TestSet;

    // Label comes from Answer; Features is the featurized Question text.
    var dataProcessPipeline = mlContext.Transforms.Conversion.MapValueToKey(outputColumnName: "Label", inputColumnName: nameof(QuestionsIssue.Answer))
        .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "DescriptionFeaturized", inputColumnName: nameof(QuestionsIssue.Question)))
        .Append(mlContext.Transforms.Concatenate(outputColumnName: "Features", "DescriptionFeaturized"))
        .AppendCacheCheckpoint(mlContext);

    IEstimator<ITransformer> trainer;
    switch (selectedStrategy)
    {
        case MyTrainerStrategy.SdcaMultiClassTrainer:
            trainer = mlContext.MulticlassClassification.Trainers.SdcaNonCalibrated("Label", "Features");
            break;

        default:
            // Fail here, naming the real culprit, instead of letting a null trainer reach Append().
            throw new System.ArgumentOutOfRangeException(nameof(selectedStrategy), selectedStrategy, "Unsupported trainer strategy.");
    }

    // MapKeyToValue turns the predicted key back into the original label value.
    var trainingPipeline = dataProcessPipeline
        .Append(trainer)
        .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

    // Fit on the training split only. Fitting on the full data view (as this method used to)
    // lets the model see the test rows, which invalidates the evaluation below.
    var trainedModel = trainingPipeline.Fit(trainingData);

    // STEP 5: Evaluate the model on the held-out split.
    // NOTE(review): the metrics are computed but not reported anywhere yet.
    var predictions = trainedModel.Transform(testData);
    var metrics = mlContext.MulticlassClassification.Evaluate(data: predictions, labelColumnName: "Label", scoreColumnName: "Score");

    // STEP 6: Save/persist the trained model together with its input schema.
    mlContext.Model.Save(trainedModel, trainingData.Schema, ModelPath);
}
/// <summary>
/// Prepares the data set and trains a new model, unless a model file already exists
/// and no rebuild was requested. Any exception raised along the way is stored in
/// <c>Errors</c> instead of being propagated to the caller.
/// </summary>
/// <param name="sortedDirectories">Directories handed to <c>BuildDataSet</c> to produce the learning data.</param>
/// <param name="unsortedFiles">Files handed to <c>GetInputFiles</c>.</param>
/// <param name="trainerStrategy">Training algorithm forwarded to <c>BuildAndTrainModel</c>.</param>
/// <param name="rebuildModel">When <c>true</c>, the model is trained even if the model file already exists.</param>
public static void BuildModel(IEnumerable<string> sortedDirectories, IEnumerable<string> unsortedFiles, MyTrainerStrategy trainerStrategy = MyTrainerStrategy.OVAAveragedPerceptronTrainer, bool rebuildModel = false)
{
    try
    {
        SetupConfiguration();

        bool modelAlreadyBuilt = File.Exists(ModelPath);
        if (rebuildModel || !modelAlreadyBuilt)
        {
            GetInputFiles(unsortedFiles);

            // Prepare the data set used for learning.
            BuildDataSet(sortedDirectories);

            BuildAndTrainModel(DataSetPath, ModelPath, trainerStrategy);
        }
    }
    catch (Exception failure)
    {
        // Failures are recorded rather than rethrown; callers inspect Errors.
        Errors = failure;
    }
}
/// <summary>
/// Builds the GitHub-issue labelling pipeline, cross-validates it, trains it on the full
/// data set, runs one sample prediction and saves the trained model to <paramref name="ModelPath"/>.
/// Progress and timings are written to the console.
/// </summary>
/// <param name="DataSetLocation">Path of the tab-separated training file (first row is a header).</param>
/// <param name="ModelPath">Path of the file the trained model is written to; an existing file is overwritten.</param>
/// <param name="selectedStrategy">Training algorithm to use.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="selectedStrategy"/> is not one of the strategies handled by this method.
/// </exception>
public static void BuildAndTrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy)
{
    // Create MLContext to be shared across the model creation workflow objects.
    // A fixed seed is passed for repeatable results across multiple trainings.
    var mlContext = new MLContext(seed: 0);

    // STEP 1: Common data loading configuration
    var trainingDataView = mlContext.Data.ReadFromTextFile<GitHubIssue>(DataSetLocation, hasHeader: true, separatorChar: '\t');

    // STEP 2: Common data process configuration with pipeline data transformations.
    // Label comes from Area; Features is the concatenation of the featurized Title and Description.
    // The cache checkpoint lets estimators that iterate over the data several times (OVA here)
    // read from memory instead of re-reading the file.
    var dataProcessPipeline = mlContext.Transforms.Conversion.MapValueToKey(outputColumnName: DefaultColumnNames.Label, inputColumnName: nameof(GitHubIssue.Area))
        .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "TitleFeaturized", inputColumnName: nameof(GitHubIssue.Title)))
        .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "DescriptionFeaturized", inputColumnName: nameof(GitHubIssue.Description)))
        .Append(mlContext.Transforms.Concatenate(outputColumnName: DefaultColumnNames.Features, "TitleFeaturized", "DescriptionFeaturized"))
        .AppendCacheCheckpoint(mlContext);

    // (OPTIONAL) Peek data (such as 2 records) in training DataView after applying the ProcessPipeline's transformations into "Features"
    Common.ConsoleHelper.PeekDataViewInConsole<GitHubIssue>(mlContext, trainingDataView, dataProcessPipeline, 2);

    // STEP 3: Create the selected training algorithm/trainer
    IEstimator<ITransformer> trainer;
    switch (selectedStrategy)
    {
        case MyTrainerStrategy.SdcaMultiClassTrainer:
            trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label, DefaultColumnNames.Features);
            break;

        case MyTrainerStrategy.OVAAveragedPerceptronTrainer:
        {
            // One-Versus-All: one binary classifier is trained per class, and the class whose
            // classifier reports the highest confidence score wins.
            var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron(DefaultColumnNames.Label, DefaultColumnNames.Features, numIterations: 10);
            trainer = mlContext.MulticlassClassification.Trainers.OneVersusAll(averagedPerceptronBinaryTrainer);
            break;
        }

        default:
            // Fail here, naming the real culprit, instead of letting a null trainer reach Append().
            throw new ArgumentOutOfRangeException(nameof(selectedStrategy), selectedStrategy, "Unsupported trainer strategy.");
    }

    // Set the trainer/algorithm and map the predicted key back to its original readable value.
    var trainingPipeline = dataProcessPipeline
        .Append(trainer)
        .Append(mlContext.Transforms.Conversion.MapKeyToValue(DefaultColumnNames.PredictedLabel));

    // STEP 4: Cross-validate with the single data set (there is no separate evaluation set)
    // in order to get the model's accuracy metrics.
    Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ===============");

    var crossValidationWatch = System.Diagnostics.Stopwatch.StartNew();
    var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(data: trainingDataView, estimator: trainingPipeline, numFolds: 6, labelColumn: DefaultColumnNames.Label);
    crossValidationWatch.Stop();

    long crossValidationMs = crossValidationWatch.ElapsedMilliseconds;
    Console.WriteLine($"Time Cross-Validating: {crossValidationMs} miliSecs");

    ConsoleHelper.PrintMulticlassClassificationFoldsAverageMetrics(trainer.ToString(), crossValidationResults);

    // STEP 5: Train the model fitting to the DataSet
    Console.WriteLine("=============== Training the model ===============");

    var trainingWatch = System.Diagnostics.Stopwatch.StartNew();
    var trainedModel = trainingPipeline.Fit(trainingDataView);
    trainingWatch.Stop();

    long trainingMs = trainingWatch.ElapsedMilliseconds;
    Console.WriteLine($"Time Training the model: {trainingMs} miliSecs");

    // (OPTIONAL) Try/test a single prediction with the just-trained model (before saving it)
    GitHubIssue issue = new GitHubIssue()
    {
        ID = "Any-ID",
        Title = "WebSockets communication is slow in my machine",
        Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
    };

    var predEngine = trainedModel.CreatePredictionEngine<GitHubIssue, GitHubIssuePrediction>(mlContext);
    var prediction = predEngine.Predict(issue);
    Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ===============");

    // STEP 6: Save/persist the trained model to a .ZIP file
    Console.WriteLine("=============== Saving the model to a file ===============");
    using (var fs = new FileStream(ModelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
    {
        mlContext.Model.Save(trainedModel, fs);
    }

    Common.ConsoleHelper.ConsoleWriteHeader("Training process finalized");
}
/// <summary>
/// Trains the GitHub-issue labelling model from the TSV file at
/// <paramref name="inputDataSetPath"/> and saves it to <paramref name="outputModelPath"/>,
/// writing progress and the total elapsed time to the console.
/// </summary>
/// <param name="inputDataSetPath">Path of the tab-separated training file (first row is a header).</param>
/// <param name="outputModelPath">Path the trained model is saved to.</param>
/// <param name="selectedStrategy">Training algorithm to use.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="selectedStrategy"/> is not one of the strategies handled by this method.
/// </exception>
public static void BuildAndTrainModel(string inputDataSetPath, string outputModelPath, MyTrainerStrategy selectedStrategy)
{
    var stopWatch = Stopwatch.StartNew();

    Console.WriteLine($"Reading input TSV {inputDataSetPath}...");

    // Create MLContext to be shared across the model creation workflow objects.
    // A fixed seed is passed for repeatable results across multiple trainings.
    var mlContext = new MLContext(seed: 0);

    // STEP 1: Common data loading configuration
    var trainingDataView = mlContext.Data.LoadFromTextFile<GitHubIssue>(inputDataSetPath, hasHeader: true, separatorChar: '\t', allowSparse: false);

    // STEP 2: Common data process configuration with pipeline data transformations.
    // Label comes from Area; Features is the concatenation of the featurized Title and Description.
    var dataProcessPipeline = mlContext.Transforms.Conversion.MapValueToKey(outputColumnName: "Label", inputColumnName: nameof(GitHubIssue.Area))
        .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "TitleFeaturized", inputColumnName: nameof(GitHubIssue.Title)))
        .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "DescriptionFeaturized", inputColumnName: nameof(GitHubIssue.Description)))
        .Append(mlContext.Transforms.Concatenate(outputColumnName: "Features", "TitleFeaturized", "DescriptionFeaturized"))
        .AppendCacheCheckpoint(mlContext);

    // STEP 3: Create the selected training algorithm/trainer
    IEstimator<ITransformer> trainer;
    switch (selectedStrategy)
    {
        case MyTrainerStrategy.SdcaMultiClassTrainer:
            trainer = mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features");
            break;

        case MyTrainerStrategy.OVAAveragedPerceptronTrainer:
        {
            // One-Versus-All: one binary classifier is trained per class, and the class whose
            // classifier reports the highest confidence score wins.
            var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron("Label", "Features", numberOfIterations: 10);
            trainer = mlContext.MulticlassClassification.Trainers.OneVersusAll(averagedPerceptronBinaryTrainer);
            break;
        }

        default:
            // Fail here, naming the real culprit, instead of letting a null trainer reach Append().
            throw new ArgumentOutOfRangeException(nameof(selectedStrategy), selectedStrategy, "Unsupported trainer strategy.");
    }

    // Set the trainer/algorithm and map the predicted key back to its original readable value.
    var trainingPipeline = dataProcessPipeline
        .Append(trainer)
        .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

    // STEP 5: Train the model fitting to the DataSet
    Console.WriteLine("Training the model...");
    var trainedModel = trainingPipeline.Fit(trainingDataView);

    // STEP 6: Save/persist the trained model to a .ZIP file
    Console.WriteLine($"Saving the model to {outputModelPath}...");
    mlContext.Model.Save(trainedModel, trainingDataView.Schema, outputModelPath);

    stopWatch.Stop();
    Console.WriteLine($"Done creating model in {stopWatch.ElapsedMilliseconds}ms");
}
/// <summary>
/// Builds the GitHub-issue labelling pipeline with an explicitly configured text loader,
/// cross-validates it, trains it on the full data set, runs one sample prediction and
/// saves the trained model to <paramref name="ModelPath"/>. Progress is written to the console.
/// </summary>
/// <param name="DataSetLocation">Path of the tab-separated training file (first row is a header).</param>
/// <param name="ModelPath">Path of the file the trained model is written to; an existing file is overwritten.</param>
/// <param name="selectedStrategy">Training algorithm to use.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="selectedStrategy"/> is not one of the strategies handled by this method.
/// </exception>
public static void BuildAndTrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy)
{
    // Create MLContext to be shared across the model creation workflow objects.
    // A fixed seed is passed for repeatable results across multiple trainings.
    var mlContext = new MLContext(seed: 0);

    // STEP 1: Common data loading configuration (four text columns: ID, Area, Title, Description)
    TextLoader textLoader = mlContext.Data.TextReader(new TextLoader.Arguments()
    {
        Separator = "tab",
        HasHeader = true,
        Column = new[]
        {
            new TextLoader.Column("ID", DataKind.Text, 0),
            new TextLoader.Column("Area", DataKind.Text, 1),
            new TextLoader.Column("Title", DataKind.Text, 2),
            new TextLoader.Column("Description", DataKind.Text, 3),
        }
    });

    var trainingDataView = textLoader.Read(DataSetLocation);

    // STEP 2: Common data process configuration with pipeline data transformations
    var dataProcessPipeline = mlContext.Transforms.Categorical.MapValueToKey("Area", "Label")
        .Append(mlContext.Transforms.Text.FeaturizeText("Title", "TitleFeaturized"))
        .Append(mlContext.Transforms.Text.FeaturizeText("Description", "DescriptionFeaturized"))
        .Append(mlContext.Transforms.Concatenate("Features", "TitleFeaturized", "DescriptionFeaturized"));

    // (OPTIONAL) Peek data (such as 2 records) in training DataView after applying the ProcessPipeline's transformations into "Features"
    Common.ConsoleHelper.PeekDataViewInConsole<GitHubIssue>(mlContext, trainingDataView, dataProcessPipeline, 2);

    // STEP 3: Create the selected training algorithm/trainer
    IEstimator<ITransformer> trainer;
    switch (selectedStrategy)
    {
        case MyTrainerStrategy.SdcaMultiClassTrainer:
            trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label, DefaultColumnNames.Features);
            break;

        case MyTrainerStrategy.OVAAveragedPerceptronTrainer:
        {
            // One-Versus-All: one binary classifier is trained per class, and the class whose
            // classifier reports the highest confidence score wins.
            var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron(DefaultColumnNames.Label, DefaultColumnNames.Features, numIterations: 10);
            trainer = new Ova(mlContext, averagedPerceptronBinaryTrainer);
            break;
        }

        default:
            // Fail here, naming the real culprit, instead of letting a null trainer reach Append().
            throw new ArgumentOutOfRangeException(nameof(selectedStrategy), selectedStrategy, "Unsupported trainer strategy.");
    }

    // Set the trainer/algorithm and map the predicted key back to its original readable value.
    var trainingPipeline = dataProcessPipeline
        .Append(trainer)
        .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

    // STEP 4: Cross-validate with the single data set (there is no separate evaluation set)
    // in order to get the model's accuracy metrics.
    Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ===============");
    var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(trainingDataView, trainingPipeline, numFolds: 6, labelColumn: "Label");
    ConsoleHelper.PrintMulticlassClassificationFoldsAverageMetrics(trainer.ToString(), crossValidationResults);

    // STEP 5: Train the model fitting to the DataSet
    Console.WriteLine("=============== Training the model ===============");
    var trainedModel = trainingPipeline.Fit(trainingDataView);

    // (OPTIONAL) Try/test a single prediction with the just-trained model (before saving it)
    GitHubIssue issue = new GitHubIssue()
    {
        ID = "Any-ID",
        Title = "WebSockets communication is slow in my machine",
        Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
    };

    var predFunction = trainedModel.MakePredictionFunction<GitHubIssue, GitHubIssuePrediction>(mlContext);
    var prediction = predFunction.Predict(issue);
    Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ===============");

    // STEP 6: Save/persist the trained model to a .ZIP file
    Console.WriteLine("=============== Saving the model to a file ===============");
    using (var fs = new FileStream(ModelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
    {
        mlContext.Model.Save(trainedModel, fs);
    }

    Common.ConsoleHelper.ConsoleWriteHeader("Training process finalized");
}
/// <summary>
/// Builds the GitHub-issue labelling pipeline, cross-validates it, trains it on the full
/// data set, runs one sample prediction and saves the trained model to <paramref name="ModelPath"/>.
/// Progress is written to the console.
/// </summary>
/// <param name="DataSetLocation">Path of the tab-separated training file (first row is a header).</param>
/// <param name="ModelPath">Path the trained model is saved to.</param>
/// <param name="selectedStrategy">Training algorithm to use.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="selectedStrategy"/> is not one of the strategies handled by this method.
/// </exception>
public static void BuildAndTrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy)
{
    // Create MLContext to be shared across the model creation workflow objects.
    // A fixed seed is passed for repeatable results across multiple trainings.
    var mlContext = new MLContext(seed: 1);

    // STEP 1: Common data loading configuration
    var trainingDataView = mlContext.Data.LoadFromTextFile<GitHubIssue>(DataSetLocation, hasHeader: true, separatorChar: '\t', allowSparse: false);

    // STEP 2: Common data process configuration with pipeline data transformations.
    // Label comes from Area; Features is the concatenation of the featurized Title and Description.
    // The in-memory cache lowers training time for small/medium data sets. Remove
    // .AppendCacheCheckpoint() when handling very large data sets.
    var dataProcessPipeline = mlContext.Transforms.Conversion.MapValueToKey(outputColumnName: "Label", inputColumnName: nameof(GitHubIssue.Area))
        .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "TitleFeaturized", inputColumnName: nameof(GitHubIssue.Title)))
        .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "DescriptionFeaturized", inputColumnName: nameof(GitHubIssue.Description)))
        .Append(mlContext.Transforms.Concatenate(outputColumnName: "Features", "TitleFeaturized", "DescriptionFeaturized"))
        .AppendCacheCheckpoint(mlContext);

    // (OPTIONAL) Peek data (such as 2 records) in training DataView after applying the ProcessPipeline's transformations into "Features"
    Common.ConsoleHelper.PeekDataViewInConsole(mlContext, trainingDataView, dataProcessPipeline, 2);

    // STEP 3: Create the selected training algorithm/trainer
    IEstimator<ITransformer> trainer;
    switch (selectedStrategy)
    {
        case MyTrainerStrategy.SdcaMultiClassTrainer:
            trainer = mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features");
            break;

        case MyTrainerStrategy.OVAAveragedPerceptronTrainer:
        {
            // One-Versus-All: one binary classifier is trained per class, and the class whose
            // classifier reports the highest confidence score wins.
            var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron("Label", "Features", numberOfIterations: 10);
            trainer = mlContext.MulticlassClassification.Trainers.OneVersusAll(averagedPerceptronBinaryTrainer);
            break;
        }

        default:
            // Fail here, naming the real culprit, instead of letting a null trainer reach Append().
            throw new ArgumentOutOfRangeException(nameof(selectedStrategy), selectedStrategy, "Unsupported trainer strategy.");
    }

    // Set the trainer/algorithm and map the predicted key back to its original readable value.
    var trainingPipeline = dataProcessPipeline
        .Append(trainer)
        .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

    // STEP 4: Cross-validate with the single data set (there is no separate evaluation set)
    // in order to get the model's accuracy metrics.
    Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ===============");
    var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(data: trainingDataView, estimator: trainingPipeline, numberOfFolds: 6, labelColumnName: "Label");
    ConsoleHelper.PrintMulticlassClassificationFoldsAverageMetrics(trainer.ToString(), crossValidationResults);

    // STEP 5: Train the model fitting to the DataSet
    Console.WriteLine("=============== Training the model ===============");
    var trainedModel = trainingPipeline.Fit(trainingDataView);

    // (OPTIONAL) Try/test a single prediction with the just-trained model (before saving it)
    GitHubIssue issue = new GitHubIssue()
    {
        ID = "Any-ID",
        Title = "WebSockets communication is slow in my machine",
        Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
    };

    var predEngine = mlContext.Model.CreatePredictionEngine<GitHubIssue, GitHubIssuePrediction>(trainedModel);
    var prediction = predEngine.Predict(issue);
    Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ===============");

    // STEP 6: Save/persist the trained model to a .ZIP file
    Console.WriteLine("=============== Saving the model to a file ===============");
    mlContext.Model.Save(trainedModel, trainingDataView.Schema, ModelPath);

    Common.ConsoleHelper.ConsoleWriteHeader("Training process finalized");
}
/// <summary>
/// Builds the GitHub-issue labelling model through the <c>Common.ModelBuilder</c> helper:
/// cross-validates it, trains it on the full data set, runs one sample prediction and
/// saves the trained model to <paramref name="ModelPath"/>. Progress is written to the console.
/// </summary>
/// <param name="DataSetLocation">Path of the training file read by the text loader.</param>
/// <param name="ModelPath">Path the trained model is saved to.</param>
/// <param name="selectedStrategy">Training algorithm to use.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="selectedStrategy"/> is not one of the strategies handled by this method.
/// </exception>
public static void BuildAndTrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy)
{
    // Create MLContext to be shared across the model creation workflow objects.
    // A fixed seed is passed for repeatable results across multiple trainings.
    var mlContext = new MLContext(seed: 0);

    // STEP 1: Common data loading configuration
    var textLoader = GitHubLabelerTextLoaderFactory.CreateTextLoader(mlContext);
    var trainingDataView = textLoader.Read(DataSetLocation);

    // STEP 2: Common data process configuration with pipeline data transformations
    var dataProcessPipeline = GitHubLabelerDataProcessPipelineFactory.CreateDataProcessPipeline(mlContext);

    // (OPTIONAL) Peek data (such as 2 records) in training DataView after applying the ProcessPipeline's transformations into "Features"
    Common.ConsoleHelper.PeekDataViewInConsole<GitHubIssue>(mlContext, trainingDataView, dataProcessPipeline, 2);

    // STEP 3: Create the selected training algorithm/trainer
    IEstimator<ITransformer> trainer;
    switch (selectedStrategy)
    {
        case MyTrainerStrategy.SdcaMultiClassTrainer:
            trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label, DefaultColumnNames.Features);
            break;

        case MyTrainerStrategy.OVAAveragedPerceptronTrainer:
        {
            // One-Versus-All: one binary classifier is trained per class, and the class whose
            // classifier reports the highest confidence score wins.
            var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron(DefaultColumnNames.Label, DefaultColumnNames.Features, numIterations: 10);
            trainer = new Ova(mlContext, averagedPerceptronBinaryTrainer);
            break;
        }

        default:
            // Fail here, naming the real culprit, instead of handing a null trainer to the model builder.
            throw new ArgumentOutOfRangeException(nameof(selectedStrategy), selectedStrategy, "Unsupported trainer strategy.");
    }

    // Set the trainer/algorithm, then map the predicted key back to its original readable value.
    var modelBuilder = new Common.ModelBuilder<GitHubIssue, GitHubIssuePrediction>(mlContext, dataProcessPipeline);
    modelBuilder.AddTrainer(trainer);
    modelBuilder.AddEstimator(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

    // STEP 4: Cross-validate with the single data set (there is no separate evaluation set)
    // in order to get the model's accuracy metrics.
    Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ===============");
    var crossValResults = modelBuilder.CrossValidateAndEvaluateMulticlassClassificationModel(trainingDataView, 6, "Label");
    ConsoleHelper.PrintMulticlassClassificationFoldsAverageMetrics(trainer.ToString(), crossValResults);

    // STEP 5: Train the model fitting to the DataSet
    Console.WriteLine("=============== Training the model ===============");
    modelBuilder.Train(trainingDataView);

    // (OPTIONAL) Try/test a single prediction with the just-trained model (before saving it)
    GitHubIssue issue = new GitHubIssue()
    {
        ID = "Any-ID",
        Title = "WebSockets communication is slow in my machine",
        Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
    };

    var modelScorer = new ModelScorer<GitHubIssue, GitHubIssuePrediction>(mlContext, modelBuilder.TrainedModel);
    var prediction = modelScorer.PredictSingle(issue);
    Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ===============");

    // STEP 6: Save/persist the trained model to a .ZIP file
    Console.WriteLine("=============== Saving the model to a file ===============");
    modelBuilder.SaveModelAsFile(ModelPath);

    Common.ConsoleHelper.ConsoleWriteHeader("Training process finalized");
}
/// <summary>
/// Trains a multiclass classifier that predicts a label from a file name, using the
/// semicolon-separated <c>SortedFile</c> data set, and saves the fitted model to
/// <paramref name="ModelPath"/>.
/// </summary>
/// <param name="DataSetLocation">Path of the semicolon-separated training file (first row is a header).</param>
/// <param name="ModelPath">Path the trained model is saved to.</param>
/// <param name="selectedStrategy">Training algorithm to use.</param>
/// <exception cref="System.ArgumentOutOfRangeException">
/// <paramref name="selectedStrategy"/> is not one of the strategies handled by this method.
/// </exception>
private static void BuildAndTrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy)
{
    // Create MLContext to be shared across the model creation workflow objects.
    // A fixed seed is passed for repeatable results across multiple trainings.
    var mlContext = new MLContext(seed: 1);

    // STEP 1: Common data loading configuration
    var trainingDataView = mlContext.Data.LoadFromTextFile<SortedFile>(DataSetLocation, hasHeader: true, separatorChar: ';', allowSparse: false);

    // STEP 2: Common data process configuration with pipeline data transformations.
    // Label comes from SortedFile.Label; Features is the featurized file name.
    // The in-memory cache lowers training time for small/medium data sets. Remove
    // .AppendCacheCheckpoint() when handling very large data sets.
    var dataProcessPipeline = mlContext.Transforms.Conversion.MapValueToKey(outputColumnName: "Label", inputColumnName: nameof(SortedFile.Label))
        .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "FileNameFeaturized", inputColumnName: nameof(SortedFile.FileName)))
        .Append(mlContext.Transforms.Concatenate(outputColumnName: "Features", "FileNameFeaturized"))
        .AppendCacheCheckpoint(mlContext);

    // STEP 3: Create the selected training algorithm/trainer
    IEstimator<ITransformer> trainer;
    switch (selectedStrategy)
    {
        case MyTrainerStrategy.SdcaMultiClassTrainer:
            trainer = mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features");
            break;

        case MyTrainerStrategy.OVAAveragedPerceptronTrainer:
        {
            // One-Versus-All: one binary classifier is trained per class, and the class whose
            // classifier reports the highest confidence score wins.
            var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron("Label", "Features", numberOfIterations: 10);
            trainer = mlContext.MulticlassClassification.Trainers.OneVersusAll(averagedPerceptronBinaryTrainer);
            break;
        }

        default:
            // Fail here, naming the real culprit, instead of letting a null trainer reach Append().
            throw new System.ArgumentOutOfRangeException(nameof(selectedStrategy), selectedStrategy, "Unsupported trainer strategy.");
    }

    // Set the trainer/algorithm and map the predicted key back to its original readable value.
    var trainingPipeline = dataProcessPipeline
        .Append(trainer)
        .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

    // STEP 4: Cross-validate with the single data set (there is no separate evaluation set).
    // NOTE(review): the results are not consumed anywhere in this method. The call is kept
    // so the sequence of operations on mlContext is unchanged — confirm whether it can be
    // dropped or its metrics should be reported.
    var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(data: trainingDataView, estimator: trainingPipeline, numberOfFolds: 6, labelColumnName: "Label");

    // STEP 5: Train the model fitting to the DataSet
    var trainedModel = trainingPipeline.Fit(trainingDataView);

    // STEP 6: Save/persist the trained model to a .ZIP file
    mlContext.Model.Save(trainedModel, trainingDataView.Schema, ModelPath);
}