public static ITransformer TrainModel(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline) { ITransformer model = trainingPipeline.Fit(trainingDataView); return(model); }
public static IEstimator <ITransformer> BuildAndTrainModel(IDataView trainingDataView, IEstimator <ITransformer> pipeline) { var trainingPipeline = pipeline.Append(_mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features")) .Append(_mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel")); _trainedModel = trainingPipeline.Fit(trainingDataView); _predEngine = _mlContext.Model.CreatePredictionEngine <GitHubIssue, IssuePrediction>(_trainedModel); GitHubIssue issue = new GitHubIssue() { Title = "WebSockets communication is slow in my machine", Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.." }; var prediction = _predEngine.Predict(issue); Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ==============="); return(trainingPipeline); }
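For reference, a minimal sketch of the input and prediction classes the snippet above assumes (GitHubIssue and IssuePrediction). The property names mirror the columns used in the pipeline (Area as the label, Title and Description as features); the LoadColumn indices are illustrative rather than taken from the original project, and the attributes require using Microsoft.ML.Data.
// Assumed shape of the classes used by CreatePredictionEngine above (requires using Microsoft.ML.Data;).
public class GitHubIssue
{
    [LoadColumn(0)] public string ID { get; set; }
    [LoadColumn(1)] public string Area { get; set; }        // Label: mapped to the "Label" key column during training.
    [LoadColumn(2)] public string Title { get; set; }       // Featurized and concatenated into "Features".
    [LoadColumn(3)] public string Description { get; set; }
}

public class IssuePrediction
{
    // Bound to the "PredictedLabel" column produced by MapKeyToValue in the pipeline above.
    [ColumnName("PredictedLabel")]
    public string Area { get; set; }
}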
private static void Evaluate(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline) { // Cross-Validate with a single dataset (since we don't have two datasets, one for training and one for evaluation) // in order to evaluate and get the model's accuracy metrics Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ==============="); var crossValidationResults = mlContext.BinaryClassification.CrossValidateNonCalibrated(trainingDataView, trainingPipeline, numberOfFolds: 5, labelColumnName: "Category"); PrintBinaryClassificationFoldsAverageMetrics(crossValidationResults); }
public CategoricalTransform(TermEstimator term, IEstimator <ITransformer> keyToVector, IDataView input) { var chain = term.Append(keyToVector); _transformer = chain.Fit(input); }
public static List <float[]> PeekVectorColumnDataInConsole(MLContext mlContext, string columnName, IDataView dataView, IEstimator <ITransformer> pipeline, int numberOfRows = 4) { string msg = string.Format("Peek data in DataView: Show {0} rows with just the '{1}' column", numberOfRows, columnName); ConsoleWriteHeader(msg); var transformer = pipeline.Fit(dataView); var transformedData = transformer.Transform(dataView); // Extract the requested column (for example, 'Features'). var someColumnData = transformedData.GetColumn <float[]>(columnName) .Take(numberOfRows).ToList(); // Print the peeked rows to the console someColumnData.ForEach(row => { String concatColumn = String.Empty; foreach (float f in row) { concatColumn += f.ToString(); } Console.WriteLine(concatColumn); }); return(someColumnData); }
public static IEstimator <ITransformer> BuildAndTrainModel(IDataView trainingDataView, IEstimator <ITransformer> pipeline) { var trainingPipeline = pipeline.Append(_mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features")) .Append(_mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel")); _trainedModel = trainingPipeline.Fit(trainingDataView); _predEngine = _mlContext.Model.CreatePredictionEngine <Supervised, PurposePrediction>(_trainedModel); return(trainingPipeline); }
// </SnippetDeclareGlobalVariables> static void Main(string[] args) { // Create MLContext to be shared across the model creation workflow objects // <SnippetCreateMLContext> MLContext mlContext = new MLContext(); // </SnippetCreateMLContext> // Dictionary to encode words as integers. // <SnippetCreateLookupMap> var lookupMap = mlContext.Data.LoadFromTextFile(Path.Combine(_modelPath, "imdb_word_index.csv"), columns: new[] { new TextLoader.Column("Words", DataKind.String, 0), new TextLoader.Column("Ids", DataKind.Int32, 1), }, separatorChar: ',' ); // </SnippetCreateLookupMap> // The model expects the input feature vector to be a fixed length vector. // This action resizes the integer vector to a fixed length vector. If there // are fewer than 600 words in the sentence, the remaining indices will be filled // with zeros. If there are more than 600 words in the sentence, then the // array is truncated at 600. // <SnippetResizeFeatures> Action <MovieReview, FixedLengthFeatures> ResizeFeaturesAction = (s, f) => { var features = s.VariableLengthFeatures; Array.Resize(ref features, FeatureLength); f.Features = features; }; // </SnippetResizeFeatures> // Load the TensorFlow model. // <SnippetLoadTensorFlowModel> TensorFlowModel tensorFlowModel = mlContext.Model.LoadTensorFlowModel(_modelPath); // </SnippetLoadTensorFlowModel> // <SnippetGetModelSchema> DataViewSchema schema = tensorFlowModel.GetModelSchema(); Console.WriteLine(" =============== TensorFlow Model Schema =============== "); var featuresType = (VectorDataViewType)schema["Features"].Type; Console.WriteLine($"Name: Features, Type: {featuresType.ItemType.RawType}, Size: ({featuresType.Dimensions[0]})"); var predictionType = (VectorDataViewType)schema["Prediction/Softmax"].Type; Console.WriteLine($"Name: Prediction/Softmax, Type: {predictionType.ItemType.RawType}, Size: ({predictionType.Dimensions[0]})"); // </SnippetGetModelSchema> // <SnippetTokenizeIntoWords> IEstimator <ITransformer> pipeline = // Split the text into individual words mlContext.Transforms.Text.TokenizeIntoWords("TokenizedWords", "ReviewText") // </SnippetTokenizeIntoWords> // <SnippetMapValue> // Map each word to an integer value. The array of integers makes up the input features. .Append(mlContext.Transforms.Conversion.MapValue("VariableLengthFeatures", lookupMap, lookupMap.Schema["Words"], lookupMap.Schema["Ids"], "TokenizedWords")) // </SnippetMapValue> // <SnippetCustomMapping> // Resize variable length vector to fixed length vector. .Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize")) // </SnippetCustomMapping> // <SnippetScoreTensorFlowModel> // Passes the data to TensorFlow for scoring .Append(tensorFlowModel.ScoreTensorFlowModel("Prediction/Softmax", "Features")) // </SnippetScoreTensorFlowModel> // <SnippetCopyColumns> // Retrieves the 'Prediction' from TensorFlow and copies it to a column .Append(mlContext.Transforms.CopyColumns("Prediction", "Prediction/Softmax")); // </SnippetCopyColumns> // <SnippetCreateModel> // Create an executable model from the estimator pipeline IDataView dataView = mlContext.Data.LoadFromEnumerable(new List <MovieReview>()); ITransformer model = pipeline.Fit(dataView); // </SnippetCreateModel> // <SnippetCallPredictSentiment> PredictSentiment(mlContext, model); // </SnippetCallPredictSentiment> }
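The Main method above references several types and a PredictSentiment helper that are not shown. Below is a minimal sketch of what they might look like, inferred from the column names in the pipeline (ReviewText, VariableLengthFeatures, Features, Prediction) and the 600-element fixed length mentioned in the comments; treat the names, attribute sizes, and the sample review as assumptions rather than the original source.
// Assumed members of the same class as Main (requires using Microsoft.ML.Data;).
const int FeatureLength = 600; // Fixed vector length referenced by ResizeFeaturesAction above.

public class MovieReview
{
    public string ReviewText { get; set; }

    // Populated by the MapValue transform; read by ResizeFeaturesAction.
    [VectorType]
    public int[] VariableLengthFeatures { get; set; }
}

public class FixedLengthFeatures
{
    // Fixed-length integer vector expected as the TensorFlow model's "Features" input.
    [VectorType(600)]
    public int[] Features { get; set; }
}

public class MovieReviewSentimentPrediction
{
    // Copy of the TensorFlow "Prediction/Softmax" output: [negative, positive] probabilities.
    [VectorType(2)]
    public float[] Prediction { get; set; }
}

static void PredictSentiment(MLContext mlContext, ITransformer model)
{
    // Create a single-input prediction engine over the fitted pipeline.
    var engine = mlContext.Model.CreatePredictionEngine<MovieReview, MovieReviewSentimentPrediction>(model);
    var result = engine.Predict(new MovieReview { ReviewText = "This film is really good." });
    Console.WriteLine($"Negative: {result.Prediction[0]:0.###}  Positive: {result.Prediction[1]:0.###}");
}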
public GreedySolver(IEstimator estimator) { this.estimator = estimator; }
public static IEstimator <ITransformer> BuildAndTrainModel(IDataView trainingDataView, IEstimator <ITransformer> pipeline) { var trainingPipeline = pipeline .Append(_mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label, DefaultColumnNames.Features)) .Append(_mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel")); _trainedModel = trainingPipeline.Fit(trainingDataView); _predEngine = _trainedModel.CreatePredictionEngine <SentimentData, SentimentPrediction>(_mlContext); SentimentData issue = new SentimentData() { Title = "WebSockets communication is slow in my machine", Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.." }; var prediction = _predEngine.Predict(issue); Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ==============="); return(trainingPipeline); }
private static EvaluationResults Evaluate(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline) { // Cross-Validate with a single dataset (since we don't have two datasets, one for training and one for evaluation) // in order to evaluate and get the model's accuracy metrics Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ==============="); var crossValidationResults = mlContext.Regression.CrossValidate(trainingDataView, trainingPipeline, 10, "Value"); var results = PrintRegressionFoldsAverageMetrics(crossValidationResults); return(results); }
public static void BuildAndTrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy) { // Create MLContext to be shared across the model creation workflow objects // Set a random seed for repeatable/deterministic results across multiple trainings. var mlContext = new MLContext(seed: 0); // STEP 1: Common data loading configuration var textLoader = GitHubLabelerTextLoaderFactory.CreateTextLoader(mlContext); var trainingDataView = textLoader.Read(DataSetLocation); // STEP 2: Common data process configuration with pipeline data transformations var dataProcessPipeline = GitHubLabelerDataProcessPipelineFactory.CreateDataProcessPipeline(mlContext); // (OPTIONAL) Peek data (such as 2 records) in training DataView after applying the ProcessPipeline's transformations into "Features" Common.ConsoleHelper.PeekDataViewInConsole <GitHubIssue>(mlContext, trainingDataView, dataProcessPipeline, 2); //Common.ConsoleHelper.PeekVectorColumnDataInConsole(mlContext, "Features", trainingDataView, dataProcessPipeline, 2); // STEP 3: Create the selected training algorithm/trainer IEstimator <ITransformer> trainer = null; switch (selectedStrategy) { case MyTrainerStrategy.SdcaMultiClassTrainer: trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label, DefaultColumnNames.Features); break; case MyTrainerStrategy.OVAAveragedPerceptronTrainer: { // Create a binary classification trainer. var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron(DefaultColumnNames.Label, DefaultColumnNames.Features, numIterations: 10); // Compose an OVA (One-Versus-All) trainer with the BinaryTrainer. // In this strategy, a binary classification algorithm is used to train one classifier for each class, // which distinguishes that class from all other classes. Prediction is then performed by running these binary classifiers, // and choosing the prediction with the highest confidence score. trainer = new Ova(mlContext, averagedPerceptronBinaryTrainer); break; } default: break; } //Set the trainer/algorithm var modelBuilder = new Common.ModelBuilder <GitHubIssue, GitHubIssuePrediction>(mlContext, dataProcessPipeline); modelBuilder.AddTrainer(trainer); modelBuilder.AddEstimator(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel")); // STEP 4: Cross-Validate with a single dataset (since we don't have two datasets, one for training and one for evaluation) // in order to evaluate and get the model's accuracy metrics Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ==============="); var crossValResults = modelBuilder.CrossValidateAndEvaluateMulticlassClassificationModel(trainingDataView, 6, "Label"); ConsoleHelper.PrintMulticlassClassificationFoldsAverageMetrics(trainer.ToString(), crossValResults); // STEP 5: Train the model fitting to the DataSet Console.WriteLine("=============== Training the model ==============="); modelBuilder.Train(trainingDataView); // (OPTIONAL) Try/test a single prediction with the "just-trained model" (Before saving the model) GitHubIssue issue = new GitHubIssue() { ID = "Any-ID", Title = "WebSockets communication is slow in my machine", Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
}; var modelScorer = new ModelScorer <GitHubIssue, GitHubIssuePrediction>(mlContext, modelBuilder.TrainedModel); var prediction = modelScorer.PredictSingle(issue); Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ==============="); // // STEP 6: Save/persist the trained model to a .ZIP file Console.WriteLine("=============== Saving the model to a file ==============="); modelBuilder.SaveModelAsFile(ModelPath); Common.ConsoleHelper.ConsoleWriteHeader("Training process finalized"); }
public ITransformer TrainFeaturizeText() { var textColumns = new List <string>(); for (int i = 0; i < 20; i++) // Only load first 20 columns { textColumns.Add($"Column{i}"); } var featurizers = new List <TextFeaturizingEstimator>(); foreach (var textColumn in textColumns) { var featurizer = _mlContext.Transforms.Text.FeaturizeText(textColumn, new TextFeaturizingEstimator.Options() { CharFeatureExtractor = null, WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, MaximumNgramsCount = new int[] { 200000 } } }); featurizers.Add(featurizer); } IEstimator <ITransformer> pipeline = featurizers.First(); foreach (var featurizer in featurizers.Skip(1)) { pipeline = pipeline.Append(featurizer); } var model = pipeline.Fit(_dataset); // BENCHMARK OUTPUT // * Summary * //BenchmarkDotNet = v0.11.3, OS = Windows 10.0.18363 //Intel Xeon W - 2133 CPU 3.60GHz, 1 CPU, 12 logical and 6 physical cores //.NET Core SDK = 3.0.100 //[Host] : .NET Core 2.1.13(CoreCLR 4.6.28008.01, CoreFX 4.6.28008.01), 64bit RyuJIT //Job - KDKCUJ : .NET Core 2.1.13(CoreCLR 4.6.28008.01, CoreFX 4.6.28008.01), 64bit RyuJIT //Arguments =/ p:Configuration = Release Toolchain = netcoreapp2.1 IterationCount = 1 //LaunchCount = 3 MaxIterationCount = 20 RunStrategy = ColdStart //UnrollFactor = 1 WarmupCount = 1 // Method | Mean | Error | StdDev | Extra Metric | Gen 0 / 1k Op | Gen 1 / 1k Op | Gen 2 / 1k Op | Allocated Memory / Op | //------------------- | --------:| --------:| ---------:| -------------:| -------------:| ------------: | ------------: | --------------------: | // TrainFeaturizeText | 17.00 s | 6.337 s | 0.3474 s | - | 1949000.0000 | 721000.0000 | 36000.0000 | 315.48 MB | //// * Legends * // Mean : Arithmetic mean of all measurements // Error : Half of 99.9 % confidence interval // StdDev : Standard deviation of all measurements // Extra Metric: Value of the provided extra metric // Gen 0 / 1k Op : GC Generation 0 collects per 1k Operations // Gen 1 / 1k Op : GC Generation 1 collects per 1k Operations // Gen 2 / 1k Op : GC Generation 2 collects per 1k Operations // Allocated Memory/ Op : Allocated memory per single operation(managed only, inclusive, 1KB = 1024B) // 1 s: 1 Second(1 sec) //// * Diagnostic Output - MemoryDiagnoser * //// ***** BenchmarkRunner: End ***** // Run time: 00:01:52(112.92 sec), executed benchmarks: 1 //// * Artifacts cleanup * // Global total time: 00:01:59(119.89 sec), executed benchmarks: 1 return(model); }
private void MixMatch(string dataPath) { // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); // Read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset. SepalLength: ctx.LoadFloat(0), SepalWidth: ctx.LoadFloat(1), PetalLength: ctx.LoadFloat(2), PetalWidth: ctx.LoadFloat(3), // Label: kind of iris. Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. separator: ','); // Read the data. var data = reader.Read(dataPath); // Build the pre-processing pipeline. var learningPipeline = reader.MakeNewEstimator() .Append(r => ( // Convert string label to a key. Label: r.Label.ToKey(), // Concatenate all the features together into one column 'Features'. Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth))); // Now, at the time of writing, there is no static pipeline for OVA (one-versus-all). So, let's // append the OVA learner to the dynamic pipeline. IEstimator <ITransformer> dynamicPipe = learningPipeline.AsDynamic; // Create a binary classification trainer. var binaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron("Label", "Features"); // Append the OVA learner to the pipeline. dynamicPipe = dynamicPipe.Append(new Ova(mlContext, binaryTrainer)); // At this point, we have a choice. We could continue working with the dynamically-typed pipeline, and // ultimately call dynamicPipe.Fit(data.AsDynamic) to get the model, or we could go back into the static world. // Here's how we go back to the static pipeline: var staticFinalPipe = dynamicPipe.AssertStatic(mlContext, // Declare the shape of the input. As you can see, it's identical to the shape of the reader: // four float features and a string label. c => ( SepalLength: c.R4.Scalar, SepalWidth: c.R4.Scalar, PetalLength: c.R4.Scalar, PetalWidth: c.R4.Scalar, Label: c.Text.Scalar), // Declare the shape of the output (or a relevant subset of it). // In our case, we care only about the predicted label column (a key type), and scores (vector of floats). c => ( Score: c.R4.Vector, // Predicted label is a key backed by uint, with text values (since original labels are text). PredictedLabel: c.KeyU4.TextValues.Scalar)) // Convert the predicted label from key back to the original string value. .Append(r => r.PredictedLabel.ToValue()); // Train the model in a statically typed way. var model = staticFinalPipe.Fit(data); // And here is how we could've stayed in the dynamic pipeline and train that way. dynamicPipe = dynamicPipe.Append(new KeyToValueMappingEstimator(mlContext, "PredictedLabel")); var dynamicModel = dynamicPipe.Fit(data.AsDynamic); // Now 'dynamicModel', and 'model.AsDynamic' are equivalent. }
private static void Evaluate(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline) { var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(trainingDataView, trainingPipeline, numberOfFolds: 5, labelColumnName: "0"); PrintMulticlassClassificationFoldsAverageMetrics(crossValidationResults); }
public BaseTaskCreator(IEnumerable <IData> data, IEstimator estimator) { _data = data; _estimator = estimator; PrepareTasks(); }
public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] columns) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(ValueToKeyMappingEstimator)); _hash = new HashingEstimator(_host, columns.Select(x => x.HashInfo).ToArray()); using (var ch = _host.Start(nameof(OneHotHashEncodingEstimator))) { var binaryCols = new List <(string input, string output)>(); var cols = new List <(string input, string output, bool bag)>(); for (int i = 0; i < columns.Length; i++) { var column = columns[i]; OneHotEncodingTransformer.OutputKind kind = columns[i].OutputKind; switch (kind) { default: throw _host.ExceptUserArg(nameof(column.OutputKind)); case OneHotEncodingTransformer.OutputKind.Key: continue; case OneHotEncodingTransformer.OutputKind.Bin: if ((column.HashInfo.InvertHash) != 0) { ch.Warning("Invert hashing is being used with binary encoding."); } binaryCols.Add((column.HashInfo.Output, column.HashInfo.Output)); break; case OneHotEncodingTransformer.OutputKind.Ind: cols.Add((column.HashInfo.Output, column.HashInfo.Output, false)); break; case OneHotEncodingTransformer.OutputKind.Bag: cols.Add((column.HashInfo.Output, column.HashInfo.Output, true)); break; } } IEstimator <ITransformer> toBinVector = null; IEstimator <ITransformer> toVector = null; if (binaryCols.Count > 0) { toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryCols.Select(x => new KeyToBinaryVectorMappingTransformer.ColumnInfo(x.input, x.output)).ToArray()); } if (cols.Count > 0) { toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingTransformer.ColumnInfo(x.input, x.output, x.bag)).ToArray()); } if (toBinVector != null && toVector != null) { _toSomething = toVector.Append(toBinVector); } else { if (toBinVector != null) { _toSomething = toBinVector; } else { _toSomething = toVector; } } } }
private static ITransformer Train(IDataView trainingDataView, IEstimator <ITransformer> pipeLine) { // Train your model based on the data set return(pipeLine.Fit(trainingDataView)); }
/// <summary> /// A base template for regression trainers which applies pre-processing steps such as OHE and PCA before the chosen algorithm. /// </summary> /// <typeparam name="TType">Type of training data.</typeparam> /// <typeparam name="TTrainer">Type of trainer algorithm.</typeparam> /// <param name="context">Microsoft.ML context.</param> /// <param name="trainDataset">Training dataset.</param> /// <param name="estimator">Algorithm estimator.</param> /// <returns>Trained model of the training data type from the given estimator.</returns> private static TransformerChain <TTrainer> RegressionTrainerTemplate <TType, TTrainer>(this MLContext context, IEnumerable <TType> trainDataset, IEstimator <TTrainer> estimator) where TType : class, new() where TTrainer : class, ITransformer { var type = typeof(TType); var labelColumnName = Preprocessing.LabelColumn(type.GetProperties()).Name; var properties = Preprocessing.ExcludeColumns(type.GetProperties()); var preprocessor = context.OneHotEncoding(properties); var trainDataframe = context.Data.LoadFromEnumerable(trainDataset); var pipeline = context.Transforms.CopyColumns(outputColumnName: "Label", inputColumnName: labelColumnName) .Append(preprocessor.OneHotEncodingEstimator) .Append(context.Transforms.Concatenate("Features", preprocessor.CombinedFeatures.ToArray())) .Append(context.Transforms.ProjectToPrincipalComponents(outputColumnName: "PCAFeatures", inputColumnName: "Features", rank: 2)) .Append(estimator); var model = pipeline.Fit(trainDataframe); return(model); }
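A hypothetical call of the template above from within the same class, using the SDCA regression trainer. HouseData, houseRecords, and LoadHouseRecords are illustrative placeholders, and the label/feature columns are assumed to be resolved by the Preprocessing helpers the template relies on.
// Illustrative only: HouseData is a user POCO whose label property Preprocessing.LabelColumn is expected to find.
IEnumerable<HouseData> houseRecords = LoadHouseRecords(); // placeholder data source
var model = context.RegressionTrainerTemplate(
    houseRecords,
    context.Regression.Trainers.Sdca(labelColumnName: "Label", featureColumnName: "Features"));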
public CategoricalEstimator(IHostEnvironment env, ColumnInfo[] columns, string file = null, string termsColumn = null, IComponentFactory <IMultiStreamSource, IDataLoader> loaderFactory = null) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(TermEstimator)); _term = new TermEstimator(_host, columns, file, termsColumn, loaderFactory); var binaryCols = new List <(string input, string output)>(); var cols = new List <(string input, string output, bool bag)>(); for (int i = 0; i < columns.Length; i++) { var column = columns[i]; CategoricalTransform.OutputKind kind = columns[i].OutputKind; switch (kind) { default: throw _host.ExceptUserArg(nameof(column.OutputKind)); case CategoricalTransform.OutputKind.Key: continue; case CategoricalTransform.OutputKind.Bin: binaryCols.Add((column.Output, column.Output)); break; case CategoricalTransform.OutputKind.Ind: cols.Add((column.Output, column.Output, false)); break; case CategoricalTransform.OutputKind.Bag: cols.Add((column.Output, column.Output, true)); break; } } IEstimator <ITransformer> toBinVector = null; IEstimator <ITransformer> toVector = null; if (binaryCols.Count > 0) { toBinVector = new KeyToBinaryVectorEstimator(_host, binaryCols.Select(x => new KeyToBinaryVectorTransform.ColumnInfo(x.input, x.output)).ToArray()); } if (cols.Count > 0) { toVector = new KeyToVectorEstimator(_host, cols.Select(x => new KeyToVectorTransform.ColumnInfo(x.input, x.output, x.bag)).ToArray()); } if (toBinVector != null && toVector != null) { _toSomething = toVector.Append(toBinVector); } else { if (toBinVector != null) { _toSomething = toBinVector; } else { _toSomething = toVector; } } }
public EstimatorChain <TNewTrans> Append <TNewTrans>(IEstimator <TNewTrans> estimator, TransformerScope scope = TransformerScope.Everything) where TNewTrans : class, ITransformer { Contracts.CheckValue(estimator, nameof(estimator)); return(new EstimatorChain <TNewTrans>(_host, _estimators.AppendElement(estimator), _scopes.AppendElement(scope), _needCacheAfter.AppendElement(false))); }
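A small usage sketch of Append and TransformerScope, not taken from the original source: links added with a non-default scope are tagged so that, for example, transforms that depend on the label can be limited to train/test time. mlContext and trainingData are assumed to exist in the caller.
// Illustrative chain: the label-to-key mapping is tagged TrainTest because the label column
// is not available at scoring time; the concatenation keeps the default Everything scope.
var chain = new EstimatorChain<ITransformer>()
    .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"), TransformerScope.TrainTest)
    .Append(mlContext.Transforms.Concatenate("Features", "F1", "F2"));
ITransformer fitted = chain.Fit(trainingData);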
public static void BuildAndTrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy) { // Create MLContext to be shared across the model creation workflow objects // Set a random seed for repeatable/deterministic results across multiple trainings. var mlContext = new MLContext(seed: 1); // STEP 1: Common data loading configuration var trainingDataView = mlContext.Data.ReadFromTextFile <GitHubIssue>(DataSetLocation, hasHeader: true, separatorChar: '\t', supportSparse: false); // STEP 2: Common data process configuration with pipeline data transformations var dataProcessPipeline = mlContext.Transforms.Conversion.MapValueToKey(outputColumnName: DefaultColumnNames.Label, inputColumnName: nameof(GitHubIssue.Area)) .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "TitleFeaturized", inputColumnName: nameof(GitHubIssue.Title))) .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "DescriptionFeaturized", inputColumnName: nameof(GitHubIssue.Description))) .Append(mlContext.Transforms.Concatenate(outputColumnName: DefaultColumnNames.Features, "TitleFeaturized", "DescriptionFeaturized")) .AppendCacheCheckpoint(mlContext); // Use in-memory cache for small/medium datasets to lower training time. // Do NOT use it (remove .AppendCacheCheckpoint()) when handling very large datasets. // (OPTIONAL) Peek data (such as 2 records) in training DataView after applying the ProcessPipeline's transformations into "Features" Common.ConsoleHelper.PeekDataViewInConsole(mlContext, trainingDataView, dataProcessPipeline, 2); // STEP 3: Create the selected training algorithm/trainer IEstimator <ITransformer> trainer = null; switch (selectedStrategy) { case MyTrainerStrategy.SdcaMultiClassTrainer: trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label, DefaultColumnNames.Features); break; case MyTrainerStrategy.OVAAveragedPerceptronTrainer: { // Create a binary classification trainer. var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron(DefaultColumnNames.Label, DefaultColumnNames.Features, numIterations: 10); // Compose an OVA (One-Versus-All) trainer with the BinaryTrainer. // In this strategy, a binary classification algorithm is used to train one classifier for each class, // which distinguishes that class from all other classes. Prediction is then performed by running these binary classifiers, // and choosing the prediction with the highest confidence score. 
trainer = mlContext.MulticlassClassification.Trainers.OneVersusAll(averagedPerceptronBinaryTrainer); break; } default: break; } //Set the trainer/algorithm and map label to value (original readable state) var trainingPipeline = dataProcessPipeline.Append(trainer) .Append(mlContext.Transforms.Conversion.MapKeyToValue(DefaultColumnNames.PredictedLabel)); // STEP 4: Cross-Validate with a single dataset (since we don't have two datasets, one for training and one for evaluation) // in order to evaluate and get the model's accuracy metrics Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ==============="); //Measure cross-validation time var watchCrossValTime = System.Diagnostics.Stopwatch.StartNew(); var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(data: trainingDataView, estimator: trainingPipeline, numFolds: 6, labelColumn: DefaultColumnNames.Label); //Stop measuring time watchCrossValTime.Stop(); long elapsedCrossValMs = watchCrossValTime.ElapsedMilliseconds; Console.WriteLine($"Time Cross-Validating: {elapsedCrossValMs} milliseconds"); ConsoleHelper.PrintMulticlassClassificationFoldsAverageMetrics(trainer.ToString(), crossValidationResults); // STEP 5: Train the model fitting to the DataSet Console.WriteLine("=============== Training the model ==============="); //Measure training time var watch = System.Diagnostics.Stopwatch.StartNew(); var trainedModel = trainingPipeline.Fit(trainingDataView); //Stop measuring time watch.Stop(); long elapsedTrainingMs = watch.ElapsedMilliseconds; Console.WriteLine($"Time Training the model: {elapsedTrainingMs} milliseconds"); // (OPTIONAL) Try/test a single prediction with the "just-trained model" (Before saving the model) GitHubIssue issue = new GitHubIssue() { ID = "Any-ID", Title = "WebSockets communication is slow in my machine", Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.." }; // Create prediction engine related to the loaded trained model var predEngine = trainedModel.CreatePredictionEngine <GitHubIssue, GitHubIssuePrediction>(mlContext); //Score var prediction = predEngine.Predict(issue); Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ==============="); // // STEP 6: Save/persist the trained model to a .ZIP file Console.WriteLine("=============== Saving the model to a file ==============="); using (var fs = new FileStream(ModelPath, FileMode.Create, FileAccess.Write, FileShare.Write)) mlContext.Model.Save(trainedModel, fs); Common.ConsoleHelper.ConsoleWriteHeader("Training process finalized"); }
public SuggestedTransform(PipelineNode pipelineNode, IEstimator <ITransformer> estimator) { PipelineNode = pipelineNode; Estimator = estimator; }
public static void PeekDataViewInConsole(MLContext mlContext, IDataView dataView, IEstimator <ITransformer> pipeline, int numberOfRows = 4) { string msg = string.Format("Peek data in DataView: Showing {0} rows with all the columns", numberOfRows.ToString()); ConsoleWriteHeader(msg); //https://github.com/dotnet/machinelearning/blob/master/docs/code/MlNetCookBook.md#how-do-i-look-at-the-intermediate-data var transformer = pipeline.Fit(dataView); var transformedData = transformer.Transform(dataView); // 'transformedData' is a 'promise' of data; it is lazily evaluated. Call Preview() // and iterate through the returned collection. var preViewTransformedData = transformedData.Preview(maxRows: numberOfRows); foreach (var row in preViewTransformedData.RowView) { var ColumnCollection = row.Values; string lineToPrint = "Row--> "; foreach (KeyValuePair <string, object> column in ColumnCollection) { lineToPrint += $"| {column.Key}:{column.Value}"; } Console.WriteLine(lineToPrint + "\n"); } }
private static void Evaluate(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline) { Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ==============="); var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(trainingDataView, trainingPipeline, numberOfFolds: 5, labelColumnName: "Area"); PrintMulticlassClassificationFoldsAverageMetrics(crossValidationResults); }
private static void Evaluate(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline) { // Cross-Validate with a single dataset (since we don't have two datasets, one for training and one for evaluation) // in order to evaluate and get the model's accuracy metrics Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ==============="); }
private static IEstimator <ITransformer> BuildTrainingPipeline(MLContext mlContext, IEstimator <ITransformer> dataProcessingPipeline) { return(dataProcessingPipeline.Append(mlContext.BinaryClassification.Trainers.FastTree(new FastTreeBinaryTrainer.Options() { NumberOfLeaves = 10, NumberOfTrees = 50, LabelColumnName = "isFraud", FeatureColumnName = "Features" }))); }
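A hedged sketch of how the pipeline returned above might be used end to end; the split fraction, the transactionsDataView and dataProcessingPipeline names, and the metric choices are illustrative, and it assumes the data view carries the Boolean "isFraud" label the trainer is configured with.
// Illustrative usage: split, fit, and evaluate the fraud-detection pipeline.
var split = mlContext.Data.TrainTestSplit(transactionsDataView, testFraction: 0.2);
var trainingPipeline = BuildTrainingPipeline(mlContext, dataProcessingPipeline);
ITransformer model = trainingPipeline.Fit(split.TrainSet);
var metrics = mlContext.BinaryClassification.Evaluate(model.Transform(split.TestSet), labelColumnName: "isFraud");
Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve:0.###}  F1: {metrics.F1Score:0.###}");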
public static ITransformer TrainModel(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline) { Console.WriteLine("=============== Training model ==============="); ITransformer model = trainingPipeline.Fit(trainingDataView); Console.WriteLine("=============== End of training process ==============="); return(model); }
/// <summary> /// Produces the estimator. Note that this is made out of <see cref="ReconcileCore(IHostEnvironment, string[])"/>'s /// return value, plus whatever usages of <see cref="CopyColumnsEstimator"/> are necessary to avoid collisions with /// the output names fed to the constructor. This class provides the implementation, and subclasses should instead /// override <see cref="ReconcileCore(IHostEnvironment, string[])"/>. /// </summary> public sealed override IEstimator <ITransformer> Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, IReadOnlyDictionary <PipelineColumn, string> inputNames, IReadOnlyDictionary <PipelineColumn, string> outputNames, IReadOnlyCollection <string> usedNames) { Contracts.AssertValue(env); env.AssertValue(toOutput); env.AssertValue(inputNames); env.AssertValue(outputNames); env.AssertValue(usedNames); // The reconciler should have been called with all the input columns having names. env.Assert(inputNames.Keys.All(_inputs.Contains) && _inputs.All(inputNames.Keys.Contains)); // The output name map should contain only outputs as their keys. Yet, it is possible not all // outputs will be required in which case these will both be subsets of those outputs indicated // at construction. env.Assert(outputNames.Keys.All(Outputs.Contains)); env.Assert(toOutput.All(Outputs.Contains)); env.Assert(Outputs.Count() == _outputNames.Length); IEstimator <ITransformer> result = null; // In the case where we have names used that conflict with the fixed output names, we must have some // renaming logic. var collisions = new HashSet <string>(_outputNames); collisions.IntersectWith(usedNames); var old2New = new Dictionary <string, string>(); if (collisions.Count > 0) { // First get the old names to some temporary names. int tempNum = 0; foreach (var c in collisions) { old2New[c] = $"#TrainTemp{tempNum++}"; } // In the case where the input names have anything that is used, we must reconstitute the input mapping. if (inputNames.Values.Any(old2New.ContainsKey)) { var newInputNames = new Dictionary <PipelineColumn, string>(); foreach (var p in inputNames) { newInputNames[p.Key] = old2New.ContainsKey(p.Value) ? old2New[p.Value] : p.Value; } inputNames = newInputNames; } result = new CopyColumnsEstimator(env, old2New.Select(p => (p.Key, p.Value)).ToArray()); } // Map the inputs to the names. string[] mappedInputNames = _inputs.Select(c => inputNames[c]).ToArray(); // Finally produce the trainer. var trainerEst = ReconcileCore(env, mappedInputNames); if (result == null) { result = trainerEst; } else { result = result.Append(trainerEst); } // OK. Now handle the final renamings from the fixed names, to the desired names, in the case // where the output was desired, and a renaming is even necessary. var toRename = new List <(string source, string name)>(); foreach ((PipelineColumn outCol, string fixedName) in Outputs.Zip(_outputNames, (c, n) => (c, n))) { if (outputNames.TryGetValue(outCol, out string desiredName)) { toRename.Add((fixedName, desiredName)); } else { env.Assert(!toOutput.Contains(outCol)); } } // Finally if applicable handle the renaming back from the temp names to the original names. foreach (var p in old2New) { toRename.Add((p.Value, p.Key)); } if (toRename.Count > 0) { result = result.Append(new CopyColumnsEstimator(env, toRename.ToArray())); } return(result); }
GeneralFunctionAnalyzer <TIn, TDelegateInput, TOutShape>( IHostEnvironment env, IChannel ch, TDelegateInput input, ReaderReconciler <TIn> baseReconciler, Func <TDelegateInput, TOutShape> mapper, out IEstimator <ITransformer> estimator, Func <PipelineColumn, string> inputNameFunction) { Contracts.CheckValue(mapper, nameof(mapper)); var method = mapper.Method; var output = mapper(input); KeyValuePair <string, PipelineColumn>[] outPairs = StaticPipeInternalUtils.GetNamesValues(output, method.ReturnParameter); // Map where the key depends on the set of things in the value. The value contains the yet unresolved dependencies. var keyDependsOn = new Dictionary <PipelineColumn, HashSet <PipelineColumn> >(); // Map where the set of things in the value depend on the key. var dependsOnKey = new Dictionary <PipelineColumn, HashSet <PipelineColumn> >(); // The set of columns detected with zero dependencies. var zeroDependencies = new List <PipelineColumn>(); // First we build up the two structures above, using a queue and visiting from the outputs up. var toVisit = new Queue <PipelineColumn>(outPairs.Select(p => p.Value)); while (toVisit.Count > 0) { var col = toVisit.Dequeue(); ch.CheckParam(col != null, nameof(mapper), "The delegate seems to have null columns returned somewhere in the pipe."); if (keyDependsOn.ContainsKey(col)) { continue; // Already visited. } var dependsOn = new HashSet <PipelineColumn>(); foreach (var dep in col.Dependencies ?? Enumerable.Empty <PipelineColumn>()) { dependsOn.Add(dep); if (!dependsOnKey.TryGetValue(dep, out var dependsOnDep)) { dependsOnKey[dep] = dependsOnDep = new HashSet <PipelineColumn>(); toVisit.Enqueue(dep); } dependsOnDep.Add(col); } keyDependsOn[col] = dependsOn; if (dependsOn.Count == 0) { zeroDependencies.Add(col); } } // Get the base input columns. var baseInputs = keyDependsOn.Select(p => p.Key).Where(col => col.ReconcilerObj == baseReconciler).ToArray(); // The columns that utilize the base reconciler should have no dependencies. This could only happen if // the caller of this function has introduced a situation whereby they are claiming they can reconcile // to a data-reader object but still have input data dependencies, which does not make sense and // indicates that there is a bug in that component code. Unfortunately we can only detect that condition, // not determine exactly how it arose, but we can still do so to indicate to the user that there is a // problem somewhere in the stack. ch.CheckParam(baseInputs.All(col => keyDependsOn[col].Count == 0), nameof(input), "Bug detected where column producing object was yielding columns with dependencies."); // This holds the mappings of columns to names and back. Note that while the same column could be used on // the *output*, e.g., you could hypothetically have `(a: r.Foo, b: r.Foo)`, we treat that as the last thing // that is done. var nameMap = new BidirectionalDictionary <string, PipelineColumn>(); // Check to see if we have any set of initial names. This is important in the case where we are mapping // in an input data view. foreach (var col in baseInputs) { string inputName = inputNameFunction(col); if (inputName != null) { ch.Assert(!nameMap.ContainsKey(col)); ch.Assert(!nameMap.ContainsKey(inputName)); nameMap[col] = inputName; ch.Trace($"Using input with name {inputName}."); } } estimator = null; var toCopy = new List <(string src, string dst)>(); int tempNum = 0; // For all outputs, get potential name collisions with used inputs. Resolve by assigning the input a temporary name. 
foreach (var p in outPairs) { // If the name for the output is already used by one of the inputs, and this output column does not // happen to have the same name, then we need to rename that input to keep it available. if (nameMap.TryGetValue(p.Key, out var inputCol) && p.Value != inputCol) { ch.Assert(baseInputs.Contains(inputCol)); string tempName = $"#Temp_{tempNum++}"; ch.Trace($"Input/output name collision: Renaming '{p.Key}' to '{tempName}'."); toCopy.Add((p.Key, tempName)); nameMap[tempName] = nameMap[p.Key]; ch.Assert(!nameMap.ContainsKey(p.Key)); } // If we already have a name for this output column, maybe it is used elsewhere. (This can happen when // the only thing done with an input is we rename it, or output it twice, or something like this.) In // this case it is most appropriate to delay renaming till after all other processing has been done in // that case. But otherwise we may as well just take the name. if (!nameMap.ContainsKey(p.Value)) { nameMap[p.Key] = p.Value; } } // If any renamings were necessary, create the CopyColumns estimator. if (toCopy.Count > 0) { estimator = new CopyColumnsEstimator(env, toCopy.ToArray()); } // First clear the inputs from zero-dependencies yet to be resolved. foreach (var col in baseInputs) { ch.Assert(zeroDependencies.Contains(col)); ch.Assert(col.ReconcilerObj == baseReconciler); zeroDependencies.Remove(col); // Make more efficient... if (!dependsOnKey.TryGetValue(col, out var depends)) { continue; } // If any of these base inputs do not have names because, for example, they do not directly appear // in the outputs and otherwise do not have names, assign them a name. if (!nameMap.ContainsKey(col)) { nameMap[col] = $"Temp_{tempNum++}"; } foreach (var depender in depends) { var dependencies = keyDependsOn[depender]; ch.Assert(dependencies.Contains(col)); dependencies.Remove(col); if (dependencies.Count == 0) { zeroDependencies.Add(depender); } } dependsOnKey.Remove(col); } // Call the reconciler to get the base reader estimator. var readerEstimator = baseReconciler.Reconcile(env, baseInputs, nameMap.AsOther(baseInputs)); ch.AssertValueOrNull(readerEstimator); // Next we iteratively find those columns with zero dependencies, "create" them, and if anything depends on // these add them to the collection of zero dependencies, etc. etc. while (zeroDependencies.Count > 0) { // All columns with the same reconciler can be transformed together. // Note that the following policy of just taking the first group is not optimal. So for example, we // could have three columns, (a, b, c). If we had the output (a.X(), b.X() c.Y().X()), then maybe we'd // reconcile a.X() and b.X() together, then reconcile c.Y(), then reconcile c.Y().X() alone. Whereas, we // could have reconciled c.Y() first, then reconciled a.X(), b.X(), and c.Y().X() together. var group = zeroDependencies.GroupBy(p => p.ReconcilerObj).First(); // Beyond that first group that *might* be a data reader reconciler, all subsequent operations will // be on where the data is already loaded and so accept data as an input, that is, they should produce // an estimator. If this is not the case something seriously wonky is going on, most probably that the // user tried to use a column from another source. If this is detected we can produce a sensible error // message to tell them not to do this. if (!(group.Key is EstimatorReconciler rec)) { throw ch.Except("Columns from multiple sources were detected. 
" + "Did the caller use a " + nameof(PipelineColumn) + " from another delegate?"); } PipelineColumn[] cols = group.ToArray(); // All dependencies should, by this time, have names. ch.Assert(cols.SelectMany(c => c.Dependencies).All(dep => nameMap.ContainsKey(dep))); foreach (var newCol in cols) { if (!nameMap.ContainsKey(newCol)) { nameMap[newCol] = $"#Temp_{tempNum++}"; } } var localInputNames = nameMap.AsOther(cols.SelectMany(c => c.Dependencies ?? Enumerable.Empty <PipelineColumn>())); var localOutputNames = nameMap.AsOther(cols); var usedNames = new HashSet <string>(nameMap.Keys1.Except(localOutputNames.Values)); var localEstimator = rec.Reconcile(env, cols, localInputNames, localOutputNames, usedNames); readerEstimator = readerEstimator?.Append(localEstimator); estimator = estimator?.Append(localEstimator) ?? localEstimator; foreach (var newCol in cols) { zeroDependencies.Remove(newCol); // Make more efficient!! // Finally, we find all columns that depend on this one. If this happened to be the last pending // dependency, then we add it to the list. if (dependsOnKey.TryGetValue(newCol, out var depends)) { foreach (var depender in depends) { var dependencies = keyDependsOn[depender]; Contracts.Assert(dependencies.Contains(newCol)); dependencies.Remove(newCol); if (dependencies.Count == 0) { zeroDependencies.Add(depender); } } dependsOnKey.Remove(newCol); } } } if (keyDependsOn.Any(p => p.Value.Count > 0)) { // This might happen if the user does something incredibly strange, like, say, take some prior // lambda, assign a column to a local variable, then re-use it downstream in a different lambdas. // The user would have to go to some extraorindary effort to do that, but nonetheless we want to // fail with a semi-sensible error message. throw ch.Except("There were some leftover columns with unresolved dependencies. " + "Did the caller use a " + nameof(PipelineColumn) + " from another delegate?"); } // Now do the final renaming, if any is necessary. toCopy.Clear(); foreach (var p in outPairs) { // TODO: Right now we just write stuff out. Once the copy-columns estimator is in place // we ought to do this for real. Contracts.Assert(nameMap.ContainsKey(p.Value)); string currentName = nameMap[p.Value]; if (currentName != p.Key) { ch.Trace($"Will copy '{currentName}' to '{p.Key}'"); toCopy.Add((currentName, p.Key)); } } // If any final renamings were necessary, insert the appropriate CopyColumns transform. if (toCopy.Count > 0) { var copyEstimator = new CopyColumnsEstimator(env, toCopy.ToArray()); if (estimator == null) { estimator = copyEstimator; } else { estimator = estimator.Append(copyEstimator); } } ch.Trace($"Exiting {nameof(ReaderEstimatorAnalyzerHelper)}"); return(readerEstimator); }
private static void TrainModel(string dataFile, string modelFile) { // Create MLContext to be shared across the model creation workflow objects var mlContext = new MLContext(seed: 0); // STEP 1: Loading the data Console.WriteLine($"Step 1: Loading the data ({dataFile})"); var textLoader = mlContext.Data.TextReader( new TextLoader.Arguments { Separator = ",", HasHeader = true, AllowQuoting = true, AllowSparse = true, Column = new[] { new TextLoader.Column("Id", DataKind.Text, 0), new TextLoader.Column("Category", DataKind.Text, 1), new TextLoader.Column("Content", DataKind.Text, 2), } }); var trainingDataView = textLoader.Read(dataFile); // STEP 2: Common data process configuration with pipeline data transformations Console.WriteLine("Step 2: Map raw input data columns to ML.NET data"); var dataProcessPipeline = mlContext.Transforms.Categorical.MapValueToKey("Category", DefaultColumnNames.Label) .Append(mlContext.Transforms.Text.FeaturizeText("Content", DefaultColumnNames.Features)); // (OPTIONAL) Peek data (few records) in training DataView after applying the ProcessPipeline's transformations into "Features" // DataViewToConsole<JokeModel>(mlContext, trainingDataView, dataProcessPipeline, 2); // STEP 3: Create the selected training algorithm/trainer Console.WriteLine("Step 3: Create and configure the selected training algorithm (trainer)"); IEstimator <ITransformer> trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(); // Alternative training //// var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron( //// DefaultColumnNames.Label, //// DefaultColumnNames.Features, //// numIterations: 10); //// trainer = mlContext.MulticlassClassification.Trainers.OneVersusAll(averagedPerceptronBinaryTrainer); // Set the trainer/algorithm and map label to value (original readable state) var trainingPipeline = dataProcessPipeline.Append(trainer).Append( mlContext.Transforms.Conversion.MapKeyToValue(DefaultColumnNames.PredictedLabel)); // STEP 4: Cross-Validate with a single dataset (since we don't have two datasets, one for training and one for evaluation) // in order to evaluate and get the model's accuracy metrics Console.WriteLine("Step 4: Cross-Validate with single dataset (alternatively we can divide it 80-20)"); var crossValidationResults = mlContext.MulticlassClassification.CrossValidate( trainingDataView, trainingPipeline, numFolds: 10, labelColumn: "Label"); PrintMulticlassClassificationFoldsAverageMetrics(trainer.ToString(), crossValidationResults); // STEP 5: Train the model fitting to the DataSet Console.WriteLine("Step 5: Train the model fitting to the DataSet"); var trainedModel = trainingPipeline.Fit(trainingDataView); // STEP 6: Save/persist the trained model to a .ZIP file Console.WriteLine($"Step 6: Save the model to a file ({modelFile})"); using (var fs = new FileStream(modelFile, FileMode.Create, FileAccess.Write, FileShare.Write)) { mlContext.Model.Save(trainedModel, fs); } }
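To complement the save step above, a hypothetical consumption sketch in the same pre-1.0 API style as the snippet: it reloads the .zip and scores a single input. JokeModel and JokePrediction are placeholder classes matching the Id/Category/Content columns loaded above, and the sample joke text is illustrative.
// Placeholder POCOs matching the loaded columns (requires using Microsoft.ML.Data;).
public class JokeModel { public string Id; public string Category; public string Content; }
public class JokePrediction { [ColumnName("PredictedLabel")] public string Category; }

// Illustrative only: reload the persisted model and run a single prediction.
ITransformer loadedModel;
using (var stream = new FileStream(modelFile, FileMode.Open, FileAccess.Read, FileShare.Read))
{
    loadedModel = mlContext.Model.Load(stream);
}
var predEngine = loadedModel.CreatePredictionEngine<JokeModel, JokePrediction>(mlContext);
var prediction = predEngine.Predict(new JokeModel { Content = "Why did the developer go broke? He used up all his cache." });
Console.WriteLine($"Predicted category: {prediction.Category}");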