/// <summary>
/// Initialize (or reinitialize) the stop-words remover. Also called by the constructor.
/// </summary>
/// <param name="CustomStopWords">
/// Optional custom stop-word list. When null, the default English stop words are removed.
/// </param>
public void InitializeStopWordsRemover(string[] CustomStopWords = null)
{
    _mlContext = new MLContext();
    _emptySamplesList = new List<TextData>();
    // An empty data view is enough to fit these transforms — they are data-independent.
    _emptyDataView = _mlContext.Data.LoadFromEnumerable(_emptySamplesList);

    // Default pipeline: tokenize "Text" into "Words", then strip built-in English stop words.
    _stopWordsTextPipeline = _mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text")
        .Append(_mlContext.Transforms.Text.RemoveDefaultStopWords(
            "WordsWithoutStopWords", "Words",
            language: StopWordsRemovingEstimator.Language.English));

    if (CustomStopWords == null)
    {
        _textPipeline = _stopWordsTextPipeline.Fit(_emptyDataView);
        _mode = removeStopWordswMode.Default;
    }
    else
    {
        // BUG FIX: previously the custom pipeline was assigned to an unused local and
        // never fitted, so _textPipeline stayed null/stale in Custom mode and
        // CreatePredictionEngine below would fail or use the wrong transformer.
        var customPipeline = _mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text")
            .Append(_mlContext.Transforms.Text.RemoveStopWords(
                "WordsWithoutStopWords", "Words",
                stopwords: CustomStopWords));
        _textPipeline = customPipeline.Fit(_emptyDataView);
        _mode = removeStopWordswMode.Custom;
    }

    _predictionEngine = _mlContext.Model.CreatePredictionEngine
        <TextData, TransformedTextData>(_textPipeline);
}
/// <summary>
/// Trains a binary-classification (sentiment) model from the given tab-separated
/// training file, saves it to <c>ModelPath</c>, and prints evaluation metrics.
/// </summary>
/// <param name="trainingFileName">Path to the training data file.</param>
public void Train(string trainingFileName)
{
    // 1. First, we check to make sure that the training data filename exists
    if (!File.Exists(trainingFileName))
    {
        // BUG FIX: the message was missing its closing parenthesis.
        Console.WriteLine($"Failed to find training data file ({trainingFileName})");
        return;
    }

    // 2. Use the LoadFromTextFile helper method that ML.NET provides to assist with
    // the loading of text files into an IDataView object
    IDataView trainingDataView = MlContext.Data.LoadFromTextFile<RestaurantFeedback>(trainingFileName);

    // 3. Use the TrainTestSplit method that ML.NET provides to create a test set
    // from the main training data (20% held out for evaluation)
    DataOperationsCatalog.TrainTestData dataSplit =
        MlContext.Data.TrainTestSplit(trainingDataView, testFraction: 0.2);

    // 4. Firstly, we create the pipeline: featurize the raw review text into "Features"
    Microsoft.ML.Transforms.Text.TextFeaturizingEstimator dataProcessPipeline =
        MlContext.Transforms.Text.FeaturizeText(
            outputColumnName: "Features",
            inputColumnName: nameof(RestaurantFeedback.Text));

    // 5. Next, we instantiate our trainer
    Microsoft.ML.Trainers.SdcaLogisticRegressionBinaryTrainer sdcaRegressionTrainer =
        MlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
            labelColumnName: nameof(RestaurantFeedback.Label),
            featureColumnName: "Features");

    // 6. Then, we complete the pipeline by appending the trainer we instantiated previously
    Microsoft.ML.Data.EstimatorChain<Microsoft.ML.Data.BinaryPredictionTransformer<
        Microsoft.ML.Calibrators.CalibratedModelParametersBase<
            Microsoft.ML.Trainers.LinearBinaryModelParameters,
            Microsoft.ML.Calibrators.PlattCalibrator>>> trainingPipeline =
        dataProcessPipeline.Append(sdcaRegressionTrainer);

    // 7. Next, we train the model with the training split
    ITransformer trainedModel = trainingPipeline.Fit(dataSplit.TrainSet);

    // 8. We save our newly created model to the filename specified, matching the
    // training set's schema
    MlContext.Model.Save(trainedModel, dataSplit.TrainSet.Schema, ModelPath);

    // 9. Now, we run the trained model over the test set we created earlier
    IDataView testSetTransform = trainedModel.Transform(dataSplit.TestSet);

    // 10. Finally, we pass the transformed test set into the BinaryClassification
    // class's Evaluate method to generate model metrics, then print the main ones.
    Microsoft.ML.Data.CalibratedBinaryClassificationMetrics modelMetrics =
        MlContext.BinaryClassification.Evaluate(
            data: testSetTransform,
            labelColumnName: nameof(RestaurantFeedback.Label),
            scoreColumnName: nameof(RestaurantPrediction.Score));

    Console.WriteLine($"Area Under Curve: {modelMetrics.AreaUnderRocCurve:P2}{Environment.NewLine}" +
        $"Area Under Precision Recall Curve: {modelMetrics.AreaUnderPrecisionRecallCurve:P2}{Environment.NewLine}" +
        $"Accuracy: {modelMetrics.Accuracy:P2}{Environment.NewLine}" +
        $"F1Score: {modelMetrics.F1Score:P2}{Environment.NewLine}" +
        $"Positive Recall: {modelMetrics.PositiveRecall:#.##}{Environment.NewLine}" +
        $"Negative Recall: {modelMetrics.NegativeRecall:#.##}{Environment.NewLine}");
}
/// <summary>
/// Train/create the ML model by fitting the supplied pipeline to the training data,
/// reporting how long the fit took.
/// </summary>
/// <param name="trainDataView">The training data.</param>
/// <param name="pipeline">The estimator chain to fit.</param>
/// <returns>Trained Model</returns>
private static ITransformer GetTrainedModel(IDataView trainDataView,
    Microsoft.ML.Data.EstimatorChain<KeyToValueMappingTransformer> pipeline)
{
    Console.WriteLine("*** Training the image classification model with DNN Transfer Learning on top of the selected pre-trained model/architecture ***");

    // Time the fit — transfer-learning training can take a while.
    var stopwatch = Stopwatch.StartNew();
    ITransformer model = pipeline.Fit(trainDataView);
    stopwatch.Stop();

    // Integer division: whole seconds, same as the original report.
    Console.WriteLine($"Training with transfer learning took: {stopwatch.ElapsedMilliseconds / 1000} seconds");
    return model;
}
/// <summary>
/// Trains a regression model (predicting employment duration in months) from the given
/// comma-separated training file, saves it to <c>ModelPath</c>, and prints regression metrics.
/// </summary>
/// <param name="trainingFileName">Path to the comma-separated training data file.</param>
public void Train(string trainingFileName)
{
    if (!File.Exists(trainingFileName))
    {
        // BUG FIX: the message was missing its closing parenthesis.
        Console.WriteLine($"Failed to find training data file ({trainingFileName})");
        return;
    }

    // The first change is the use of a comma to separate the data
    IDataView trainingDataView =
        MlContext.Data.LoadFromTextFile<EmploymentHistory>(trainingFileName, ',');

    DataOperationsCatalog.TrainTestData dataSplit =
        MlContext.Data.TrainTestSplit(trainingDataView, testFraction: 0.4);

    // BUG FIX: the original chain dropped a closing parenthesis after the
    // YearsExperience Append, accidentally nesting every following Append inside it.
    // The chain is now flat and each call is balanced. `var` keeps the unwieldy
    // EstimatorChain<...> type out of the way.
    var dataProcessPipeline =
        MlContext.Transforms.CopyColumns("Label", nameof(EmploymentHistory.DurationInMonths))
            .Append(MlContext.Transforms.NormalizeMeanVariance(nameof(EmploymentHistory.IsMarried)))
            .Append(MlContext.Transforms.NormalizeMeanVariance(nameof(EmploymentHistory.BSDegree)))
            .Append(MlContext.Transforms.NormalizeMeanVariance(nameof(EmploymentHistory.MSDegree)))
            .Append(MlContext.Transforms.NormalizeMeanVariance(nameof(EmploymentHistory.YearsExperience)))
            .Append(MlContext.Transforms.NormalizeMeanVariance(nameof(EmploymentHistory.AgeAtHire)))
            .Append(MlContext.Transforms.NormalizeMeanVariance(nameof(EmploymentHistory.HasKids)))
            .Append(MlContext.Transforms.NormalizeMeanVariance(nameof(EmploymentHistory.WithinMonthOfVesting)))
            .Append(MlContext.Transforms.NormalizeMeanVariance(nameof(EmploymentHistory.DeskDecorations)))
            .Append(MlContext.Transforms.NormalizeMeanVariance(nameof(EmploymentHistory.LongCommute)))
            .Append(MlContext.Transforms.Concatenate("Features",
                typeof(EmploymentHistory).ToPropertyList<EmploymentHistory>(
                    nameof(EmploymentHistory.DurationInMonths))));

    // We can then create the Sdca trainer using the default parameters
    Microsoft.ML.Trainers.SdcaRegressionTrainer trainer =
        MlContext.Regression.Trainers.Sdca(labelColumnName: "Label", featureColumnName: "Features");

    var trainingPipeline = dataProcessPipeline.Append(trainer);

    ITransformer trainedModel = trainingPipeline.Fit(dataSplit.TrainSet);
    MlContext.Model.Save(trainedModel, dataSplit.TrainSet.Schema, ModelPath);

    IDataView testSetTransform = trainedModel.Transform(dataSplit.TestSet);

    // Lastly, we call the Regression.Evaluate method to provide regression specific metrics
    Microsoft.ML.Data.RegressionMetrics modelMetrics =
        MlContext.Regression.Evaluate(testSetTransform);

    Console.WriteLine($"Loss Function: {modelMetrics.LossFunction:0.##}{Environment.NewLine}" +
        $"Mean Absolute Error: {modelMetrics.MeanAbsoluteError:#.##}{Environment.NewLine}" +
        $"Mean Squared Error: {modelMetrics.MeanSquaredError:#.##}{Environment.NewLine}" +
        $"RSquared: {modelMetrics.RSquared:0.##}{Environment.NewLine}" +
        $"Root Mean Squared Error: {modelMetrics.RootMeanSquaredError:#.##}");
}