Example #1
0
 /// <summary>
 /// Adds <paramref name="estimator"/> as the next stage of the pipeline stored in <c>_pipeline</c>.
 /// </summary>
 /// <param name="estimator">The estimator to place at the end of the pipeline.</param>
 private void AddPipelineStage(IEstimator<ITransformer> estimator)
 {
     // No pipeline yet: the first stage simply becomes the pipeline.
     if (_pipeline == null)
     {
         _pipeline = estimator;
         return;
     }

     // Otherwise chain the new stage onto the existing pipeline; if Append
     // were to yield null, fall back to the bare estimator.
     _pipeline = _pipeline.Append(estimator) ?? estimator;
 }
Example #2
0
        /// <summary>
        /// Demonstrates switching between the statically-typed and the dynamically-typed
        /// pipeline APIs: a static pre-processing pipeline is converted to a dynamic one so a
        /// one-versus-all learner can be appended, then converted back to a static pipeline.
        /// Both variants are fitted on the same data.
        /// </summary>
        /// <param name="dataPath">Path of the comma-separated Iris data file to read.</param>
        private void MixMatch(string dataPath)
        {
            // The context is the entry point for ML.NET operations: exception tracking and
            // logging, the catalog of available operations, and the source of randomness.
            var context = new MLContext();

            // Describe the columns of the text file. The dataset is comma-separated,
            // whereas the reader's default separator is tab.
            var irisReader = context.Data.CreateTextReader(
                columns => (
                    // The four features of the Iris dataset.
                    SepalLength: columns.LoadFloat(0),
                    SepalWidth: columns.LoadFloat(1),
                    PetalLength: columns.LoadFloat(2),
                    PetalWidth: columns.LoadFloat(3),
                    // Label: kind of iris.
                    Label: columns.LoadText(4)),
                separator: ',');

            // Read the data.
            var irisData = irisReader.Read(dataPath);

            // Pre-processing: turn the string label into a key and gather the four
            // features into a single 'Features' column.
            var preprocessing = irisReader.MakeNewEstimator()
                .Append(row => (
                    Label: row.Label.ToKey(),
                    Features: row.SepalLength.ConcatWith(row.SepalWidth, row.PetalLength, row.PetalWidth)));

            // There is no static pipeline for OVA (one-versus-all), so the learner is
            // appended to the dynamic view of the pipeline instead.
            IEstimator<ITransformer> dynamicPipe = preprocessing.AsDynamic;

            // OVA wraps a binary classification trainer.
            var perceptron = context.BinaryClassification.Trainers.AveragedPerceptron("Label", "Features");
            var oneVersusAll = context.MulticlassClassification.Trainers.OneVersusAll(perceptron);
            dynamicPipe = dynamicPipe.Append(oneVersusAll);

            // From here we could stay dynamic and call dynamicPipe.Fit(data.AsDynamic),
            // or go back to the static world by asserting the input and output shapes.
            var staticFinalPipe = dynamicPipe
                .AssertStatic(
                    context,
                    // Input shape: identical to the reader's shape, i.e. four float
                    // features and a string label.
                    input => (
                        SepalLength: input.R4.Scalar,
                        SepalWidth: input.R4.Scalar,
                        PetalLength: input.R4.Scalar,
                        PetalWidth: input.R4.Scalar,
                        Label: input.Text.Scalar),
                    // Output shape (only the relevant subset): the scores as a vector of
                    // floats and the predicted label as a uint-backed key with text values.
                    output => (
                        Score: output.R4.Vector,
                        PredictedLabel: output.KeyU4.TextValues.Scalar))
                // Convert the predicted label from key back to the original string value.
                .Append(result => result.PredictedLabel.ToValue());

            // Train the model in a statically typed way.
            var staticModel = staticFinalPipe.Fit(irisData);

            // The dynamic alternative: append the key-to-value mapping and fit on the
            // dynamic view of the data.
            dynamicPipe = dynamicPipe.Append(new KeyToValueMappingEstimator(context, "PredictedLabel"));
            var dynamicModel = dynamicPipe.Fit(irisData.AsDynamic);

            // Now 'dynamicModel' and 'staticModel.AsDynamic' are equivalent.
        }
Example #3
0
        /// <summary>
        /// Trains a binary classifier for employee attrition from "./data/attrition.csv",
        /// prints its evaluation metrics on a 20% test split, and then prints the
        /// permutation feature importance (by absolute mean change in AUC) of every
        /// feature slot, labelled with the column the slot originates from.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var mlContext = new MLContext();

            IDataView attritionData = mlContext.Data.LoadFromTextFile<Employee>(path: "./data/attrition.csv", hasHeader: true, separatorChar: ',');

            var split = mlContext.Data.TrainTestSplit(attritionData, testFraction: 0.2);
            var trainData = split.TrainSet;
            var testData = split.TestSet;

            // Numeric features: every column whose type prints as "Single", except the label.
            string[] numFeatures = attritionData.Schema.AsEnumerable()
                .Where(column => (column.Name != nameof(Employee.Attrition)) && (column.Type.ToString() == "Single"))
                .Select(column => column.Name)
                .ToArray();

            // Single source of truth for the categorical columns. Both the one-hot
            // encoding column pairs and the feature-name list are derived from it, so
            // the two can never get out of sync.
            const string ohePrefix = "OHE-";
            string[] categoricalColumns =
            {
                nameof(Employee.BusinessTravel),
                nameof(Employee.Department),
                nameof(Employee.EducationField),
                nameof(Employee.MaritalStatus),
                nameof(Employee.JobLevel),
                nameof(Employee.JobRole),
                nameof(Employee.OverTime)
            };

            InputOutputColumnPair[] oheColumnPairs = categoricalColumns
                .Select(name => new InputOutputColumnPair(ohePrefix + name, name))
                .ToArray();

            // Feature order: one-hot encoded columns first, then the numeric columns.
            string[] allFeatureNames = categoricalColumns
                .Select(name => ohePrefix + name)
                .Concat(numFeatures)
                .ToArray();

            IEstimator<ITransformer> featurizePipeline = mlContext.Transforms.Categorical.OneHotEncoding(
                oheColumnPairs, OneHotEncodingEstimator.OutputKind.Indicator);

            featurizePipeline = featurizePipeline
                .Append(mlContext.Transforms.Concatenate("Features", allFeatureNames))
                .Append(mlContext.Transforms.NormalizeMinMax("Features", "Features"))
                .AppendCacheCheckpoint(mlContext);

            ConsoleHelper.ConsoleWriteHeader("=============== Begin to train the model ===============");

            // Other binary trainers that were tried with the same label/feature columns
            // for comparison: LightGbm, FastTree, LbfgsLogisticRegression, SgdCalibrated.
            var trainer = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
                labelColumnName: nameof(Employee.Attrition),
                featureColumnName: "Features");

            var trainPipeline = featurizePipeline.Append(trainer);
            var trainedModel = trainPipeline.Fit(trainData);

            Console.WriteLine("===== Evaluating Model's accuracy with Test data =====");

            var testDataPredictions = trainedModel.Transform(testData);
            var evaluateMetrics = mlContext.BinaryClassification.Evaluate(
                data: testDataPredictions,
                labelColumnName: nameof(Employee.Attrition),
                scoreColumnName: "Score");

            ConsoleHelper.PrintBinaryClassificationMetrics(trainedModel.ToString(), evaluateMetrics);

            Console.WriteLine("===== Permutation Test =====");

            // Permutation feature importance is computed on the transformed training data.
            var transformedData = trainedModel.Transform(trainData);
            var permutationMetrics = mlContext.BinaryClassification.PermutationFeatureImportance(
                predictionTransformer: trainedModel.LastTransformer,
                data: transformedData,
                labelColumnName: nameof(Employee.Attrition),
                permutationCount: 50);

            // Build a slot-index -> source-column-name map. A column that carries slot
            // names contributes one entry per slot; any other column contributes one entry.
            var mapFields = new List<string>();
            foreach (string featureName in allFeatureNames)
            {
                var column = transformedData.Schema[featureName];
                if (column.HasSlotNames())
                {
                    var slotNames = new VBuffer<ReadOnlyMemory<char>>();
                    column.GetSlotNames(ref slotNames);
                    mapFields.AddRange(Enumerable.Repeat(featureName, slotNames.Length));
                }
                else
                {
                    mapFields.Add(featureName);
                }
            }

            // Now let's look at which features are most important to the model
            // overall. Get the feature indices sorted by their impact on AUC.
            var sortedIndices = permutationMetrics
                .Select((metrics, index) => new { index, metrics.AreaUnderRocCurve })
                .OrderByDescending(feature => Math.Abs(feature.AreaUnderRocCurve.Mean));

            foreach (var feature in sortedIndices)
            {
                // Guard against a mismatch between the slot map and the number of
                // metrics so the report degrades to a generic label instead of throwing.
                string featureLabel = feature.index < mapFields.Count
                    ? mapFields[feature.index]
                    : $"Slot {feature.index}";

                Console.WriteLine($"{featureLabel,-20}|\t{Math.Abs(feature.AreaUnderRocCurve.Mean):F6}");
            }
        }
Example #4
0
        /// <summary>
        /// Trains a FastTree binary classifier on the given training file, saves the
        /// trained model to <c>ModelPath</c>, and writes evaluation metrics computed on
        /// the test file to the debug output.
        /// </summary>
        /// <param name="trainingFileName">Path of the comma-separated, header-less training data file.</param>
        /// <param name="testFileName">Path of the comma-separated, header-less test data file.</param>
        /// <remarks>
        /// If either file does not exist, a message is written to the debug output and
        /// the method returns without training.
        /// </remarks>
        public void Train(string trainingFileName, string testFileName)
        {
            System.Diagnostics.Debug.WriteLine("Reached Train Method");

            // Check if training file exists
            if (!File.Exists(trainingFileName))
            {
                System.Diagnostics.Debug.WriteLine($"Failed to find training data file ({trainingFileName})");

                return;
            }

            // Check if test file exists
            if (!File.Exists(testFileName))
            {
                System.Diagnostics.Debug.WriteLine($"Failed to find test data file ({testFileName})");

                return;
            }

            // Convert training file into IDataView object (ready for processing)
            var trainingDataView = MlContext.Data.LoadFromTextFile<CarInventory>(trainingFileName, ',', hasHeader: false);

            // Concatenate the property columns (the Label name is passed to ToPropertyList)
            // into "Features", then apply mean-variance normalisation.
            IEstimator<ITransformer> dataProcessPipeline = MlContext.Transforms
                .Concatenate("Features", typeof(CarInventory)
                             .ToPropertyList<CarInventory>(nameof(CarInventory.Label)))
                .Append(MlContext.Transforms.NormalizeMeanVariance(inputColumnName: "Features", outputColumnName: "FeaturesNormalizedByMeanVar"));

            // Create a trainer object with the label from the car inventory class + normalised mean variance
            var trainer = MlContext.BinaryClassification.Trainers.FastTree(
                labelColumnName: nameof(CarInventory.Label),
                featureColumnName: "FeaturesNormalizedByMeanVar",
                numberOfLeaves: 2,
                numberOfTrees: 800,
                minimumExampleCountPerLeaf: 1,
                learningRate: 0.2);

            // Append the trainer to the pipeline
            var trainingPipeline = dataProcessPipeline.Append(trainer);

            // Train the model
            var trainedModel = trainingPipeline.Fit(trainingDataView);

            // Save the model
            MlContext.Model.Save(trainedModel, trainingDataView.Schema, ModelPath);

            // Build the evaluation pipeline: the trained model followed by a feature
            // contribution calculator. The calculator is fitted on the output of the
            // already-trained model, which avoids fitting the data-processing pipeline
            // a second time.
            var contributionTransformer = MlContext.Transforms
                .CalculateFeatureContribution(trainedModel.LastTransformer)
                .Fit(trainedModel.Transform(trainingDataView));

            var evaluationPipeline = trainedModel.Append(contributionTransformer);

            var testDataView = MlContext.Data.LoadFromTextFile<CarInventory>(testFileName, ',', hasHeader: false);

            var testSetTransform = evaluationPipeline.Transform(testDataView);

            var modelMetrics = MlContext.BinaryClassification.Evaluate(
                data: testSetTransform,
                labelColumnName: nameof(CarInventory.Label),
                scoreColumnName: "Score");

            System.Diagnostics.Debug.WriteLine($"Accuracy: {modelMetrics.Accuracy:P2}");
            System.Diagnostics.Debug.WriteLine($"Area Under Curve: {modelMetrics.AreaUnderRocCurve:P2}");
            System.Diagnostics.Debug.WriteLine($"Area under Precision recall Curve: {modelMetrics.AreaUnderPrecisionRecallCurve:P2}");
            System.Diagnostics.Debug.WriteLine($"F1Score: {modelMetrics.F1Score:P2}");
            System.Diagnostics.Debug.WriteLine($"LogLoss: {modelMetrics.LogLoss:#.##}");
            System.Diagnostics.Debug.WriteLine($"LogLossReduction: {modelMetrics.LogLossReduction:#.##}");
            System.Diagnostics.Debug.WriteLine($"PositivePrecision: {modelMetrics.PositivePrecision:#.##}");
            System.Diagnostics.Debug.WriteLine($"PositiveRecall: {modelMetrics.PositiveRecall:#.##}");
            System.Diagnostics.Debug.WriteLine($"NegativePrecision: {modelMetrics.NegativePrecision:#.##}");
            System.Diagnostics.Debug.WriteLine($"NegativeRecall: {modelMetrics.NegativeRecall:P2}");
        }
Example #5
0
        /// <summary>
        /// Clusters country observations with K-Means (4 clusters) on min-max
        /// normalized features. Along the way it saves the raw and transformed datasets
        /// as text, prints the training time and clustering metrics, saves the trained
        /// model, exports a scatter plot of the clusters as SVG, and writes a
        /// "Country;Cluster" CSV with the cluster assigned to each country.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            //###############################################################
            // PROCESS INITIALIZATION
            //###############################################################

            // Create the MLContext; a fixed seed is used for reproducibility.
            MLContext mlContext = new MLContext(seed: 1);

            // Input data class: CountryObservation.

            // Load the data.
            IDataView originalFullData = mlContext.Data
                .LoadFromTextFile<CountryObservation>(
                    _DataPath,
                    separatorChar: ',',
                    hasHeader: true);


            //###############################################################
            // BUILD THE DATASET
            //###############################################################

            IDataView trainingDataView = originalFullData;

            // Save the training dataset.
            using (var fileStream = File.Create(_salida_trainDataPath))
            {
                mlContext.Data.SaveAsText(trainingDataView, fileStream, separatorChar: ';', headerRow: true,
                                          schema: true);
            }


            //###############################################################
            // FEATURE SELECTION
            //###############################################################

            // Every schema column except "country" (it carries no information) is used
            // as a feature. The schema is an in-memory sequence, so plain LINQ-to-objects
            // is enough here.
            string[] featureColumnNames = trainingDataView.Schema
                .Select(column => column.Name)
                .Where(name => name != "country")
                .ToArray();

            //###############################################################
            // DATA TRANSFORMATION --> pipeline
            //###############################################################

            // Concatenate the feature columns, then min-max normalize them.
            IEstimator<ITransformer> pipeline = mlContext.Transforms
                .Concatenate("Features", featureColumnNames)
                .Append(mlContext.Transforms.NormalizeMinMax(inputColumnName: "Features",
                                                             outputColumnName: "FeaturesNormalized"));

            // Save the transformed dataset.
            IDataView transformedData =
                pipeline.Fit(trainingDataView).Transform(trainingDataView);

            using (var fileStream = File.Create(_salida_transformationData))
            {
                mlContext.Data.SaveAsText(transformedData, fileStream, separatorChar: ';', headerRow: true,
                                          schema: true);
            }


            //###############################################################
            // TRAINING ALGORITHM SELECTION --> trainingPipeline
            //###############################################################

            //***************************************************************
            // 1. K-Means
            //***************************************************************

            // Number of clusters.
            int k = 4;

            // K-Means options.
            var options = new KMeansTrainer.Options
            {
                FeatureColumnName         = "FeaturesNormalized",
                NumberOfClusters          = k,
                MaximumNumberOfIterations = 5800,
                OptimizationTolerance     = 1e-6f
            };

            var kMeansTrainer = mlContext.Clustering.Trainers.KMeans(options);

            // Append the algorithm to the data transformation pipeline.
            IEstimator<ITransformer> kMeansPipeline = pipeline.Append(kMeansTrainer);


            //###############################################################
            // MODEL TRAINING
            //###############################################################

            Console.WriteLine("\n**************************************************************");
            Console.WriteLine("* Entrenamiento del Modelo calculado con el Algoritmo K-Means   ");
            Console.WriteLine("*-------------------------------------------------------------");
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            var kMeansModel = kMeansPipeline.Fit(trainingDataView);

            stopwatch.Stop();
            var elapsedSeconds = stopwatch.ElapsedMilliseconds * 0.001;

            Console.WriteLine($"El entrenamiento K-Means ha tardado: {elapsedSeconds:#.##} s\n");


            //###############################################################
            // MODEL EVALUATION
            //###############################################################

            // Apply the model to the training data; the result is reused below for
            // both the metrics and the visualization.
            var predictionsData = kMeansModel.Transform(trainingDataView);

            // Compute the clustering metrics.
            var metrics = mlContext.Clustering.Evaluate(predictionsData,
                                                        scoreColumnName: "Score",
                                                        featureColumnName: "FeaturesNormalized");

            // Print the K-Means metrics.
            Console.WriteLine("\n**************************************************************");
            Console.WriteLine("* Métricas para el Modelo calculado con el Algoritmo K-Means      ");
            Console.WriteLine("*-------------------------------------------------------------");
            Console.WriteLine($"*       K-Means Average Distance:  {metrics.AverageDistance:#.##}");
            Console.WriteLine($"*       K-Means Davies Bouldin Index:  {metrics.DaviesBouldinIndex:#.##}");
            Console.WriteLine($"*       K-Means Normalized Mutual Information:  {metrics.NormalizedMutualInformation:#.##}");


            //###############################################################
            // MODEL SELECTION
            //###############################################################

            // Save the model for later consumption.
            mlContext.Model.Save(kMeansModel, trainingDataView.Schema, _salida_modelPath);


            //###############################################################
            // MODEL VISUALIZATION
            //###############################################################

            // Prediction class: CountryPrediction.

            // Create the plot model.
            var plot = new PlotModel
            {
                Title = "Clúster Paises",
                IsLegendVisible = true
            };

            // Materialize the predictions as an array of CountryPrediction.
            var predictions = mlContext.Data
                .CreateEnumerable<CountryPrediction>(predictionsData, false)
                .ToArray();

            // Distinct cluster labels, in ascending order.
            var clusters = predictions
                .Select(p => p.PredictedLabel)
                .Distinct()
                .OrderBy(label => label);

            // Build one scatter series per cluster.
            foreach (var cluster in clusters)
            {
                var scatter = new ScatterSeries
                {
                    MarkerType            = MarkerType.Circle,
                    MarkerStrokeThickness = 2,
                    Title                 = $"Cluster: {cluster}",
                    RenderInLegend        = true
                };

                // 2-D points: per the original author's note, 2 of the 5 feature
                // coordinates are plotted (Location[2] as X, Location[0] as Y).
                var points = predictions
                    .Where(p => p.PredictedLabel == cluster)
                    .Select(p => new ScatterPoint(p.Location[2], p.Location[0]))
                    .ToArray();
                scatter.Points.AddRange(points);

                plot.Series.Add(scatter);
            }

            // Give each cluster its own color.
            plot.DefaultColors = OxyPalettes.HueDistinct(plot.Series.Count).Colors;

            // Save the chart as an .svg file.
            var exporter = new SvgExporter
            {
                Width = 1000,
                Height = 800
            };

            using (var plotStream = new System.IO.FileStream(_salida_plotDataPath, System.IO.FileMode.Create))
            {
                exporter.Export(plot, plotStream);
            }

            // Save a .csv file with the resulting cluster for each country. The writer
            // is flushed when the using block disposes it, so no per-line flush is needed.
            using (var writer = new System.IO.StreamWriter(_salida_ResultDataPath))
            {
                writer.WriteLine("Country;Cluster");
                foreach (var prediction in predictions)
                {
                    writer.WriteLine($"{prediction.country};{prediction.PredictedLabel}");
                }
            }
        }