/// <summary>
/// Adds <paramref name="estimator"/> as the next stage of the pipeline held in <c>_pipeline</c>.
/// </summary>
/// <param name="estimator">
/// Stage to add. It is appended to the current pipeline, or stored as the pipeline itself
/// when <c>_pipeline</c> is still null.
/// </param>
private void AddPipelineStage(IEstimator<ITransformer> estimator)
{
    // Null when there is no pipeline yet (the append is skipped in that case).
    var extended = _pipeline?.Append(estimator);

    if (extended != null)
    {
        _pipeline = extended;
    }
    else
    {
        _pipeline = estimator;
    }
}
/// <summary>
/// Demonstrates mixing the statically-typed and dynamically-typed pipeline APIs:
/// the data is read and featurized with the static API, a one-versus-all learner is
/// appended through the dynamic API, and the result is trained both ways.
/// </summary>
/// <param name="dataPath">Path of the comma-separated text file passed to the reader.</param>
private void MixMatch(string dataPath)
{
    // A fresh context for this run; every catalog call below goes through it.
    var context = new MLContext();

    // Describe the text file: four float feature columns (indices 0-3) followed by
    // the text label (index 4). The separator is overridden to a comma.
    var irisReader = context.Data.CreateTextReader(
        c => (
            SepalLength: c.LoadFloat(0),
            SepalWidth: c.LoadFloat(1),
            PetalLength: c.LoadFloat(2),
            PetalWidth: c.LoadFloat(3),
            Label: c.LoadText(4)),
        separator: ',');

    // Read the data as described by the reader.
    var irisData = irisReader.Read(dataPath);

    // Static pre-processing: turn the text label into a key and concatenate the
    // four measurements into a single 'Features' column.
    var featurizer = irisReader.MakeNewEstimator()
        .Append(row => (
            Label: row.Label.ToKey(),
            Features: row.SepalLength.ConcatWith(row.SepalWidth, row.PetalLength, row.PetalWidth)));

    // Original author's note: at the time of writing there was no static pipeline for
    // OVA (one-versus-all), so the learner is appended to the dynamic view instead.
    IEstimator<ITransformer> dynamicFeaturizer = featurizer.AsDynamic;

    // One-versus-all wrapper around an averaged-perceptron binary trainer.
    IEstimator<ITransformer> dynamicPipe = dynamicFeaturizer.Append(
        context.MulticlassClassification.Trainers.OneVersusAll(
            context.BinaryClassification.Trainers.AveragedPerceptron("Label", "Features")));

    // From here we could stay dynamic (see the end of the method) or go back to the
    // static world by asserting the input and output shapes of the dynamic pipeline.
    var assertedPipe = dynamicPipe.AssertStatic(
        context,
        // Input shape: same as the reader's, four float scalars and a text label.
        c => (
            SepalLength: c.R4.Scalar,
            SepalWidth: c.R4.Scalar,
            PetalLength: c.R4.Scalar,
            PetalWidth: c.R4.Scalar,
            Label: c.Text.Scalar),
        // Output shape (only the columns we care about): a float vector of scores and
        // the predicted label, a uint-backed key with text values.
        c => (
            Score: c.R4.Vector,
            PredictedLabel: c.KeyU4.TextValues.Scalar));

    // Map the predicted key back to its original text value.
    var staticFinalPipe = assertedPipe.Append(row => row.PredictedLabel.ToValue());

    // Train through the statically-typed pipeline.
    var model = staticFinalPipe.Fit(irisData);

    // Alternative: stay dynamic, append the key-to-value mapping there, and train.
    IEstimator<ITransformer> dynamicFinalPipe =
        dynamicPipe.Append(new KeyToValueMappingEstimator(context, "PredictedLabel"));
    var dynamicModel = dynamicFinalPipe.Fit(irisData.AsDynamic);

    // Per the original sample, 'dynamicModel' and 'model.AsDynamic' are now equivalent.
}
/// <summary>
/// Trains a binary classifier for <c>Employee.Attrition</c> from "./data/attrition.csv",
/// prints its evaluation metrics on a held-out test split, and then prints the
/// permutation feature importance of every feature slot, ordered by the absolute
/// mean change in area under the ROC curve.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    const string ohePrefix = "OHE-";

    var mlContext = new MLContext();

    IDataView attritionData = mlContext.Data.LoadFromTextFile<Employee>(
        path: "./data/attrition.csv",
        hasHeader: true,
        separatorChar: ',');

    // Hold out 20% of the rows for evaluation.
    var split = mlContext.Data.TrainTestSplit(attritionData, testFraction: 0.2);
    var trainData = split.TrainSet;
    var testData = split.TestSet;

    // Numeric features: every column whose type prints as "Single", except the label.
    string[] numFeatures = attritionData.Schema
        .Where(column => column.Name != nameof(Employee.Attrition)
                         && column.Type.ToString() == "Single")
        .Select(column => column.Name)
        .ToArray();

    // Single source of truth for the categorical columns. Both the one-hot column
    // pairs and the feature-name list are derived from it so they cannot drift apart.
    string[] categoricalColumns =
    {
        nameof(Employee.BusinessTravel),
        nameof(Employee.Department),
        nameof(Employee.EducationField),
        nameof(Employee.MaritalStatus),
        nameof(Employee.JobLevel),
        nameof(Employee.JobRole),
        nameof(Employee.OverTime)
    };

    InputOutputColumnPair[] oheColumnPairs = categoricalColumns
        .Select(name => new InputOutputColumnPair(ohePrefix + name, name))
        .ToArray();

    // Order matters: one-hot columns first, then numeric columns. The slot-to-column
    // map built further down walks this array in the same order as the concatenation.
    string[] allFeatureNames = categoricalColumns
        .Select(name => ohePrefix + name)
        .Concat(numFeatures)
        .ToArray();

    IEstimator<ITransformer> featurizePipeline = mlContext.Transforms.Categorical
        .OneHotEncoding(oheColumnPairs, OneHotEncodingEstimator.OutputKind.Indicator);

    featurizePipeline = featurizePipeline
        .Append(mlContext.Transforms.Concatenate("Features", allFeatureNames))
        .Append(mlContext.Transforms.NormalizeMinMax("Features", "Features"))
        .AppendCacheCheckpoint(mlContext);

    ConsoleHelper.ConsoleWriteHeader("=============== Begin to train the model ===============");

    // Other trainers that were tried with the same label/feature columns and compared:
    // LightGbm, FastTree, LbfgsLogisticRegression, SgdCalibrated.
    var trainer = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
        labelColumnName: nameof(Employee.Attrition),
        featureColumnName: "Features");

    var trainPipeline = featurizePipeline.Append(trainer);
    var trainedModel = trainPipeline.Fit(trainData);

    Console.WriteLine("===== Evaluating Model's accuracy with Test data =====");
    var testDataPredictions = trainedModel.Transform(testData);
    var evaluateMetrics = mlContext.BinaryClassification.Evaluate(
        data: testDataPredictions,
        labelColumnName: nameof(Employee.Attrition),
        scoreColumnName: "Score");
    ConsoleHelper.PrintBinaryClassificationMetrics(trainedModel.ToString(), evaluateMetrics);

    Console.WriteLine("===== Permutation Test =====");
    var transformedData = trainedModel.Transform(trainData);
    var permutationMetrics = mlContext.BinaryClassification.PermutationFeatureImportance(
        predictionTransformer: trainedModel.LastTransformer,
        data: transformedData,
        labelColumnName: nameof(Employee.Attrition),
        permutationCount: 50);

    // Build a slot-index -> source-column-name map. A column that carries slot names
    // contributes one entry per slot; any other column contributes a single entry.
    var mapFields = new List<string>();
    foreach (string featureName in allFeatureNames)
    {
        var column = transformedData.Schema[featureName];
        int slotCount = 1;
        if (column.HasSlotNames())
        {
            var slotNames = new VBuffer<ReadOnlyMemory<char>>();
            column.GetSlotNames(ref slotNames);
            slotCount = slotNames.Length;
        }

        mapFields.AddRange(Enumerable.Repeat(featureName, slotCount));
    }

    // Rank the feature slots by their impact on AUC (absolute mean change).
    var sortedIndices = permutationMetrics
        .Select((metrics, index) => new { index, metrics.AreaUnderRocCurve })
        .OrderByDescending(feature => Math.Abs(feature.AreaUnderRocCurve.Mean));

    foreach (var feature in sortedIndices)
    {
        // Fall back to a visible placeholder instead of throwing if the slot map and
        // the importance results ever disagree in length.
        string fieldName = feature.index < mapFields.Count
            ? mapFields[feature.index]
            : $"slot {feature.index}";

        Console.WriteLine($"{fieldName,-20}|\t{Math.Abs(feature.AreaUnderRocCurve.Mean):F6}");
    }
}
/// <summary>
/// Trains a FastTree binary classifier on the rows of <paramref name="trainingFileName"/>,
/// saves the trained model to <c>ModelPath</c>, evaluates it on the rows of
/// <paramref name="testFileName"/>, and writes the metrics to the debug output.
/// </summary>
/// <param name="trainingFileName">Comma-separated, header-less training data file.</param>
/// <param name="testFileName">Comma-separated, header-less test data file.</param>
/// <remarks>
/// If either file does not exist, a message is written to the debug output and the
/// method returns without training.
/// </remarks>
public void Train(string trainingFileName, string testFileName)
{
    System.Diagnostics.Debug.WriteLine("Reached Train Method");

    // Check that the training file exists.
    if (!File.Exists(trainingFileName))
    {
        System.Diagnostics.Debug.WriteLine($"Failed to find training data file ({trainingFileName})");
        return;
    }

    // Check that the test file exists.
    if (!File.Exists(testFileName))
    {
        System.Diagnostics.Debug.WriteLine($"Failed to find test data file ({testFileName})");
        return;
    }

    // Load the training file as an IDataView.
    var trainingDataView = MlContext.Data.LoadFromTextFile<CarInventory>(trainingFileName, ',', hasHeader: false);

    // Concatenate the property columns (the Label name is passed to ToPropertyList,
    // presumably to exclude it; confirm against that helper) into "Features", then
    // mean-variance normalize them into "FeaturesNormalizedByMeanVar".
    IEstimator<ITransformer> dataProcessPipeline = MlContext.Transforms
        .Concatenate(
            "Features",
            typeof(CarInventory).ToPropertyList<CarInventory>(nameof(CarInventory.Label)))
        .Append(MlContext.Transforms.NormalizeMeanVariance(
            inputColumnName: "Features",
            outputColumnName: "FeaturesNormalizedByMeanVar"));

    // FastTree trainer over the normalized features.
    var trainer = MlContext.BinaryClassification.Trainers.FastTree(
        labelColumnName: nameof(CarInventory.Label),
        featureColumnName: "FeaturesNormalizedByMeanVar",
        numberOfLeaves: 2,
        numberOfTrees: 800,
        minimumExampleCountPerLeaf: 1,
        learningRate: 0.2);

    // Append the trainer to the pipeline.
    var trainingPipeline = dataProcessPipeline.Append(trainer);

    // Train the model, then save it.
    var trainedModel = trainingPipeline.Fit(trainingDataView);
    MlContext.Model.Save(trainedModel, trainingDataView.Schema, ModelPath);

    // Extend the trained model with feature-contribution calculation. The estimator is
    // fitted on the trained model's own output, which already contains the normalized
    // feature column, so the data-processing pipeline is not fitted a second time.
    var featureContribution = MlContext.Transforms
        .CalculateFeatureContribution(trainedModel.LastTransformer)
        .Fit(trainedModel.Transform(trainingDataView));
    var evaluationPipeline = trainedModel.Append(featureContribution);

    // Evaluate on the test file.
    var testDataView = MlContext.Data.LoadFromTextFile<CarInventory>(testFileName, ',', hasHeader: false);
    var testSetTransform = evaluationPipeline.Transform(testDataView);

    var modelMetrics = MlContext.BinaryClassification.Evaluate(
        data: testSetTransform,
        labelColumnName: nameof(CarInventory.Label),
        scoreColumnName: "Score");

    System.Diagnostics.Debug.WriteLine($"Accuracy: {modelMetrics.Accuracy:P2}");
    System.Diagnostics.Debug.WriteLine($"Area Under Curve: {modelMetrics.AreaUnderRocCurve:P2}");
    System.Diagnostics.Debug.WriteLine($"Area under Precision recall Curve: {modelMetrics.AreaUnderPrecisionRecallCurve:P2}");
    System.Diagnostics.Debug.WriteLine($"F1Score: {modelMetrics.F1Score:P2}");
    System.Diagnostics.Debug.WriteLine($"LogLoss: {modelMetrics.LogLoss:#.##}");
    System.Diagnostics.Debug.WriteLine($"LogLossReduction: {modelMetrics.LogLossReduction:#.##}");
    System.Diagnostics.Debug.WriteLine($"PositivePrecision: {modelMetrics.PositivePrecision:#.##}");
    System.Diagnostics.Debug.WriteLine($"PositiveRecall: {modelMetrics.PositiveRecall:#.##}");
    System.Diagnostics.Debug.WriteLine($"NegativePrecision: {modelMetrics.NegativePrecision:#.##}");
    System.Diagnostics.Debug.WriteLine($"NegativeRecall: {modelMetrics.NegativeRecall:P2}");
}
/// <summary>
/// Clusters the country observations with K-Means: loads the data, saves the raw and
/// transformed datasets, trains and evaluates the model, saves it, and exports a
/// scatter plot (.svg) plus a country-to-cluster list (.csv).
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // ---------------------------------------------------------------
    // Initialization
    // ---------------------------------------------------------------
    // A fixed seed is used for reproducibility.
    MLContext mlContext = new MLContext(seed: 1);

    // Load the data; rows are described by the CountryObservation class.
    IDataView originalFullData = mlContext.Data
        .LoadFromTextFile<CountryObservation>(
            _DataPath,
            separatorChar: ',',
            hasHeader: true);

    // ---------------------------------------------------------------
    // Build the dataset
    // ---------------------------------------------------------------
    IDataView trainingDataView = originalFullData;

    // Save the training dataset.
    using (var fileStream = File.Create(_salida_trainDataPath))
    {
        mlContext.Data.SaveAsText(trainingDataView, fileStream, separatorChar: ';', headerRow: true, schema: true);
    }

    // ---------------------------------------------------------------
    // Feature selection
    // ---------------------------------------------------------------
    // Every schema column except "country" (original note: it carries no information)
    // is used as a feature.
    string[] featureColumnNames = trainingDataView.Schema
        .Select(column => column.Name)
        .Where(name => name != "country")
        .ToArray();

    // ---------------------------------------------------------------
    // Data transformation pipeline
    // ---------------------------------------------------------------
    // Concatenate the features, then min-max normalize them.
    IEstimator<ITransformer> pipeline = mlContext.Transforms.Concatenate("Features", featureColumnNames)
        .Append(mlContext.Transforms.NormalizeMinMax(inputColumnName: "Features", outputColumnName: "FeaturesNormalized"));

    // Save the transformed dataset.
    IDataView transformedData = pipeline.Fit(trainingDataView).Transform(trainingDataView);
    using (var fileStream = File.Create(_salida_transformationData))
    {
        mlContext.Data.SaveAsText(transformedData, fileStream, separatorChar: ';', headerRow: true, schema: true);
    }

    // ---------------------------------------------------------------
    // Training algorithm: K-Means
    // ---------------------------------------------------------------
    // Number of clusters.
    int k = 4;

    var options = new KMeansTrainer.Options
    {
        FeatureColumnName = "FeaturesNormalized",
        NumberOfClusters = k,
        MaximumNumberOfIterations = 5800,
        OptimizationTolerance = 1e-6f
    };

    var trainer_km = mlContext.Clustering.Trainers.KMeans(options);

    // Append the algorithm to the data transformation pipeline.
    IEstimator<ITransformer> trainingPipeline_km = pipeline.Append(trainer_km);

    // ---------------------------------------------------------------
    // Model training
    // ---------------------------------------------------------------
    Console.WriteLine($"\n**************************************************************");
    Console.WriteLine($"* Entrenamiento del Modelo calculado con el Algoritmo K-Means   ");
    Console.WriteLine($"*-------------------------------------------------------------");

    var watch_km = System.Diagnostics.Stopwatch.StartNew();
    var model_km = trainingPipeline_km.Fit(trainingDataView);
    watch_km.Stop();

    var elapseds_km = watch_km.ElapsedMilliseconds * 0.001;
    Console.WriteLine($"El entrenamiento K-Means ha tardado: {elapseds_km:#.##} s\n");

    // ---------------------------------------------------------------
    // Model evaluation
    // ---------------------------------------------------------------
    // Apply the model to the training data. This view is reused below for the plot
    // and the CSV export instead of transforming the same data a second time.
    var predictions_km = model_km.Transform(trainingDataView);

    var metrics_km = mlContext.Clustering.Evaluate(
        predictions_km,
        scoreColumnName: "Score",
        featureColumnName: "FeaturesNormalized");

    // Show the K-Means metrics.
    Console.WriteLine($"\n**************************************************************");
    Console.WriteLine($"* Métricas para el Modelo calculado con el Algoritmo K-Means      ");
    Console.WriteLine($"*-------------------------------------------------------------");
    Console.WriteLine($"* K-Means Average Distance:  {metrics_km.AverageDistance:#.##}");
    Console.WriteLine($"* K-Means Davies Bouldin Index:  {metrics_km.DaviesBouldinIndex:#.##}");
    Console.WriteLine($"* K-Means Normalized Mutual Information:  {metrics_km.NormalizedMutualInformation:#.##}");

    // ---------------------------------------------------------------
    // Model persistence
    // ---------------------------------------------------------------
    // Save the model for later consumption.
    mlContext.Model.Save(model_km, trainingDataView.Schema, _salida_modelPath);

    // ---------------------------------------------------------------
    // Model visualization
    // ---------------------------------------------------------------
    var plot = new PlotModel { Title = "Clúster Paises", IsLegendVisible = true };

    // Materialize the predictions as CountryPrediction objects.
    var predictions = mlContext.Data
        .CreateEnumerable<CountryPrediction>(predictions_km, false)
        .ToArray();

    // Distinct cluster labels, in ascending order.
    var clusters = predictions
        .Select(p => p.PredictedLabel)
        .Distinct()
        .OrderBy(x => x);

    // One scatter series per cluster.
    foreach (var cluster in clusters)
    {
        var scatter = new ScatterSeries
        {
            MarkerType = MarkerType.Circle,
            MarkerStrokeThickness = 2,
            Title = $"Cluster: {cluster}",
            RenderInLegend = true
        };

        // 2-D points: Location[2] is plotted on X and Location[0] on Y (the original
        // note says these are 2 of the 5 feature coordinates).
        var series = predictions
            .Where(p => p.PredictedLabel == cluster)
            .Select(p => new ScatterPoint(p.Location[2], p.Location[0]))
            .ToArray();

        scatter.Points.AddRange(series);
        plot.Series.Add(scatter);
    }

    // Give each cluster its own color.
    plot.DefaultColors = OxyPalettes.HueDistinct(plot.Series.Count).Colors;

    // Save the chart as an .svg file.
    var exporter = new SvgExporter { Width = 1000, Height = 800 };
    using (var fs = new System.IO.FileStream(_salida_plotDataPath, System.IO.FileMode.Create))
    {
        exporter.Export(plot, fs);
    }

    // Save a .csv file with the resulting cluster of each country. The writer is
    // flushed when the using block disposes it, so no per-line Flush is needed.
    using (var w = new System.IO.StreamWriter(_salida_ResultDataPath))
    {
        w.WriteLine($"Country;Cluster");
        foreach (var prediction in predictions)
        {
            w.WriteLine($"{prediction.country};{prediction.PredictedLabel}");
        }
    }
}