示例#1
0
        /// <summary>
        /// Checks that <c>TrainTestSplit</c> on the iris training file yields two partitions of
        /// comparable size, and that using the label as the stratification column leaves the
        /// two partitions with no label value in common.
        /// </summary>
        public void TrainTestSplit()
        {
            var env = new ConsoleEnvironment(seed: 0);
            var dataPath = GetDataPath(TestDatasets.iris.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var ctx = new BinaryClassificationContext(env);

            var reader = TextLoader.CreateReader(
                env,
                c => (label: c.LoadFloat(0), features: c.LoadFloat(1, 4)));
            var data = reader.Read(dataSource);

            var (train, test) = ctx.TrainTestSplit(data, 0.5);

            // Just make sure that the train set is about the same size as the test set.
            var trainCount = train.GetColumn(r => r.label).Count();
            var testCount = test.GetColumn(r => r.label).Count();

            Assert.InRange((double)trainCount / testCount, 0.8, 1.2);

            // Now stratify by label. Silly thing to do, but it makes the outcome easy to check:
            // the test asserts that no label value shows up on both sides of the split.
            (train, test) = ctx.TrainTestSplit(data, 0.5, stratificationColumn: r => r.label);

            // Materialize the distinct labels once. The LINQ queries are deferred, so asserting on
            // them directly would re-read the column for every Count()/Intersect() call.
            var trainLabels = train.GetColumn(r => r.label).Distinct().ToArray();
            var testLabels = test.GetColumn(r => r.label).Distinct().ToArray();

            Assert.NotEmpty(trainLabels);
            Assert.NotEmpty(testLabels);
            Assert.Empty(trainLabels.Intersect(testLabels));
        }
        /// <summary>
        /// Runs AutoFit for a binary classification task and packages every evaluated pipeline,
        /// together with the best one, into a <see cref="BinaryClassificationResult"/>.
        /// </summary>
        /// <remarks>
        /// The search itself is delegated to <c>AutoFitApi.Fit</c> with
        /// <c>TaskKind.BinaryClassification</c> and <c>OptimizingMetric.Accuracy</c>.
        /// <paramref name="context"/>, <paramref name="cancellationToken"/> and
        /// <paramref name="iterationCallback"/> are accepted but are not read by this method body.
        /// </remarks>
        /// <param name="context">The binary classification context this extension hangs off.</param>
        /// <param name="trainData">Training data forwarded to <c>AutoFitApi.Fit</c>.</param>
        /// <param name="label">Label forwarded to <c>AutoFitApi.Fit</c>.</param>
        /// <param name="validationData">Optional validation data forwarded to <c>AutoFitApi.Fit</c>.</param>
        /// <param name="inferredColumns">Optional inferred columns forwarded to <c>AutoFitApi.Fit</c>.</param>
        /// <param name="settings">Optional settings forwarded to <c>AutoFitApi.Fit</c>.</param>
        /// <param name="cancellationToken">Currently unused.</param>
        /// <param name="iterationCallback">Currently unused.</param>
        /// <param name="debugLogger">Optional logger forwarded to <c>AutoFitApi.Fit</c>.</param>
        /// <returns>The best iteration result plus the results of all iterations.</returns>
        internal static BinaryClassificationResult AutoFit(this BinaryClassificationContext context,
                                                           IDataView trainData,
                                                           string label,
                                                           IDataView validationData            = null,
                                                           InferredColumn[] inferredColumns    = null,
                                                           AutoFitSettings settings            = null,
                                                           CancellationToken cancellationToken = default,
                                                           IProgress <BinaryClassificationItertionResult> iterationCallback = null,
                                                           IDebugLogger debugLogger = null)
        {
            // The search hands back every pipeline it tried, plus the one it picked as best.
            var (pipelineRuns, winner) = AutoFitApi.Fit(
                trainData, validationData, label, inferredColumns, settings,
                TaskKind.BinaryClassification, OptimizingMetric.Accuracy, debugLogger);

            // Re-wrap each run as a binary-classification-specific iteration result.
            var iterationResults = new BinaryClassificationItertionResult[pipelineRuns.Length];
            for (var index = 0; index < iterationResults.Length; index++)
            {
                var run = pipelineRuns[index];
                iterationResults[index] = new BinaryClassificationItertionResult(
                    run.Model,
                    (BinaryClassificationMetrics)run.EvaluatedMetrics,
                    run.ScoredValidationData);
            }

            var best = new BinaryClassificationItertionResult(
                winner.Model,
                (BinaryClassificationMetrics)winner.EvaluatedMetrics,
                winner.ScoredValidationData);

            return new BinaryClassificationResult(best, iterationResults);
        }
示例#3
0
        /// <summary>
        /// Trains a field-aware factorization machine on the breast-cancer data using two feature
        /// groups, checks that the <c>onFit</c> callback fires only during <c>Fit</c>, and
        /// sanity-checks the evaluation metrics.
        /// </summary>
        public void FfmBinaryClassification()
        {
            var env = new MLContext(seed: 0);
            var source = new MultiFileSource(GetDataPath(TestDatasets.breastCancer.trainFilename));
            var ctx = new BinaryClassificationContext(env);

            // Columns 1-4 and 5-9 are loaded as two separate feature vectors (one per field).
            var reader = TextLoader.CreateReader(
                env,
                c => (label: c.LoadBool(0), features1: c.LoadFloat(1, 4), features2: c.LoadFloat(5, 9)));

            // Captured from the onFit callback once training has run.
            FieldAwareFactorizationMachinePredictor trainedPredictor = null;

            var estimator = reader.MakeNewEstimator().Append(
                row => (row.label,
                        preds: ctx.Trainers.FieldAwareFactorizationMachine(
                            row.label,
                            new[] { row.features1, row.features2 },
                            onFit: p => trainedPredictor = p)));

            var pipeline = reader.Append(estimator);

            // Building the pipeline must not trigger training.
            Assert.Null(trainedPredictor);

            var model = pipeline.Fit(source);
            Assert.NotNull(trainedPredictor);

            var scored = model.Read(source);
            var metrics = ctx.Evaluate(scored, row => row.label, row => row.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
示例#4
0
        /// <summary>
        /// Walks through several ways of featurizing categorical columns with the statically-typed
        /// API, then trains a FastTree binary classifier on the combined features.
        /// </summary>
        /// <param name="dataPath">Path of the text file (with a header row) to read.</param>
        private void CategoricalFeaturizationOn(string dataPath)
        {
            // The environment is handed to the loader and to the classification context below.
            var env = new LocalEnvironment();

            // Describe the file layout: which columns to load and what to call them.
            var reader = TextLoader.CreateReader(
                env,
                c => (
                    Label: c.LoadBool(0),
                    // Columns 1-8: all categorical features as a single vector column.
                    CategoricalFeatures: c.LoadText(1, 8),
                    // Columns 9-14: all numerical features as a single vector column.
                    NumericalFeatures: c.LoadFloat(9, 14),
                    // Column 1 once more, on its own, as the 'Workclass' column.
                    Workclass: c.LoadText(1)
                ),
                hasHeader: true);

            var data = reader.Read(new MultiFileSource(dataPath));

            // Peek at the first ten rows of the categorical vector to check that it was read correctly.
            var categoricalPreview = data.GetColumn(row => row.CategoricalFeatures).Take(10).ToArray();

            // Several alternative featurizations, built side by side.
            var learningPipeline = reader.MakeNewEstimator().Append(
                row => (
                    row.Label,
                    row.NumericalFeatures,
                    // One-hot encode each categorical slot independently.
                    CategoricalOneHot: row.CategoricalFeatures.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Ind),
                    // Map every categorical slot to an index and combine them into a 'word bag'.
                    CategoricalBag: row.CategoricalFeatures.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bag),
                    // One-hot encode the workclass, then drop categories seen fewer than 10 times in the train set.
                    WorkclassOneHotTrimmed: row.Workclass.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10)
                ));

            // Fit the featurization and apply it to the very same data.
            var featurized = learningPipeline.Fit(data).Transform(data);

            // Peek at a couple of the produced columns.
            var bagPreview = featurized.GetColumn(row => row.CategoricalBag).Take(10).ToArray();
            var workclassPreview = featurized.GetColumn(row => row.WorkclassOneHotTrimmed).Take(10).ToArray();

            // To actually train a model, all features have to end up in one float vector,
            // which is then fed to the trainer.
            var classification = new BinaryClassificationContext(env);
            var fullLearningPipeline = learningPipeline
                .Append(row => (
                    row.Label,
                    // Numeric features plus two of the three categorical encodings.
                    Features: row.NumericalFeatures.ConcatWith(row.CategoricalBag, row.WorkclassOneHotTrimmed)))
                // FastTree is the trainer chosen for this classification task.
                .Append(row => classification.Trainers.FastTree(row.Label, row.Features, numTrees: 50));

            // Train the model.
            var model = fullLearningPipeline.Fit(data);
        }
示例#5
0
        /// <summary>
        /// Trains the same churn classifier twice on in-memory data, once with the dynamic API and
        /// once with the statically-typed API, then evaluates the dynamic model on the training data.
        /// </summary>
        public void TrainOnAutoGeneratedData()
        {
            // The environment is shared by the data view, the estimators and the classification context.
            var env = new LocalEnvironment();

            // Step one: obtain the training records. 'GetChurnInfo()' supplies them as a plain sequence.
            IEnumerable <CustomerChurnInfo> churnData = GetChurnInfo();

            // Wrap the sequence as a data view. The streaming variant is used here because
            // 'churnData' is only declared as an IEnumerable.
            var trainData = env.CreateStreamingDataView(churnData);

            // 'trainData' is just an IDataView at this point, so there are two ways to proceed:
            // stay dynamic, or declare a static shape. Both are demonstrated below.

            // A binary classification context provides the trainers and the evaluation procedure.
            var classification = new BinaryClassificationContext(env);

            // Dynamic pipeline: categorical-encode the demographic category, concatenate it with
            // the visits, and train FastTree against the 'HasChurned' label.
            var dynamicLearningPipeline =
                new CategoricalEstimator(env, "DemographicCategory")
                    .Append(new ConcatEstimator(env, "Features", "DemographicCategory", "LastVisits"))
                    .Append(new FastTreeBinaryClassificationTrainer(env, "HasChurned", "Features", numTrees: 20));

            var dynamicModel = dynamicLearningPipeline.Fit(trainData);

            // Static pipeline: first assert the shape of the data view...
            var staticData = trainData.AssertStatic(
                env,
                c => (
                    HasChurned: c.Bool.Scalar,
                    DemographicCategory: c.Text.Scalar,
                    LastVisits: c.R4.Vector));

            // ...then build the same steps as above, this time type-checked.
            var staticLearningPipeline = staticData.MakeNewEstimator()
                .Append(row => (
                    row.HasChurned,
                    Features: row.DemographicCategory.OneHotEncoding().ConcatWith(row.LastVisits)))
                .Append(row => classification.Trainers.FastTree(row.HasChurned, row.Features, numTrees: 20));

            var staticModel = staticLearningPipeline.Fit(staticData);

            // dynamicModel is expected to match staticModel.AsDynamic, give or take random
            // variance from the training procedure.

            var qualityMetrics = classification.Evaluate(dynamicModel.Transform(trainData), "HasChurned");
        }
 /// <summary>
 /// Public AutoFit entry point for binary classification. Forwards every argument to the
 /// overload that additionally takes a debug logger, passing <c>null</c> for that logger.
 /// </summary>
 /// <returns>Whatever the forwarded-to overload returns.</returns>
 public static BinaryClassificationResult AutoFit(this BinaryClassificationContext context,
                                                  IDataView trainData,
                                                  string label,
                                                  IDataView validationData            = null,
                                                  InferredColumn[] inferredColumns    = null,
                                                  AutoFitSettings settings            = null,
                                                  CancellationToken cancellationToken = default,
                                                  IProgress <BinaryClassificationItertionResult> iterationCallback = null)
     => AutoFit(
         context,
         trainData,
         label,
         validationData,
         inferredColumns,
         settings,
         cancellationToken,
         iterationCallback,
         null);
示例#7
0
        /// <summary>
        /// Sample entry point: trains an SDCA binary classifier on featurized text with the
        /// statically-typed API, evaluates it on the test file and makes one ad-hoc prediction.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            // Everything starts from an environment; it is passed to the loaders, the
            // classification context and the prediction function below.
            var env = new LocalEnvironment();

            // Echo every message the environment raises to the console.
            env.AddListener((src, message) => Console.WriteLine(message));

            // A text loader is described by a delegate that maps the loader context to a named
            // tuple of columns; extra options such as hasHeader can be passed alongside.
            // The first reader declares the mapping inline, the second one reuses 'myfunc'.
            var inlineReader = TextLoader.CreateReader(env, ctx => (label: ctx.LoadBool(0), text: ctx.LoadText(1)), hasHeader: true);
            var delegateReader = TextLoader.CreateReader(env, myfunc, hasHeader: true);

            BinaryClassificationContext classification = new BinaryClassificationContext(env);

            // Point the reader at the training file. Per the original author's note, the API is
            // lazy up to here: the actual reading happens when the data is accessed later.
            var trainingSet = inlineReader.Read(new MultiFileSource(TrainDataPath));

            // Same reader, this time pointed at the test file.
            var testSet = inlineReader.Read(new MultiFileSource(TestDataPath));

            // Similar in spirit to an sklearn pipeline: estimators take in data and output transforms.
            var estimator = trainingSet.MakeNewEstimator()
                .Append(row => (
                    label: row.label,
                    prediction: classification.Trainers.Sdca(row.label, row.text.FeaturizeText())))
                .Append(row => (
                    label: row.label,
                    prediction: row.prediction,
                    predictedLabel: row.prediction.predictedLabel));

            // Fit learns the SDCA parameters from the training data and returns a transformer (the model).
            var model = estimator.Fit(trainingSet);

            // The model is a transform: applied to the test data it yields label, prediction and predictedLabel.
            var scoredTestSet = model.Transform(testSet);

            var metrics = classification.Evaluate(scoredTestSet, row => row.label, row => row.prediction);

            // Open question from the original author: why go through AsDynamic and a prediction function here?
            var predictor = model.AsDynamic.MakePredictionFunction <SentimentIssue, SentimentPrediction>(env);

            var sample = new SentimentIssue
            {
                text = "foo"
            };
            var predicted = predictor.Predict(sample);
        }
示例#8
0
        /// <summary>
        /// Create the ML context.
        /// </summary>
        /// <remarks>
        /// Builds one <c>LocalEnvironment</c> and hands that same instance to every context and
        /// catalog exposed by this object.
        /// </remarks>
        /// <param name="seed">Random seed. Set to <c>null</c> for a non-deterministic environment.</param>
        /// <param name="conc">Concurrency level. Set to 1 to run single-threaded. Set to 0 to pick automatically.</param>
        public MLContext(int?seed = null, int conc = 0)
        {
            // The environment must exist before anything below, since every catalog is constructed from it.
            _env = new LocalEnvironment(seed, conc, MakeCompositionContainer);
            // Route messages raised through the environment to ProcessMessage.
            _env.AddListener(ProcessMessage);

            // Task-specific contexts and general-purpose catalogs, all sharing _env.
            BinaryClassification     = new BinaryClassificationContext(_env);
            MulticlassClassification = new MulticlassClassificationContext(_env);
            Regression = new RegressionContext(_env);
            Clustering = new ClusteringContext(_env);
            Ranking    = new RankingContext(_env);
            Transforms = new TransformsCatalog(_env);
            Model      = new ModelOperationsCatalog(_env);
            Data       = new DataOperations(_env);
        }
示例#9
0
        /// <summary>
        /// Trains SDCA on the breast-cancer data with an explicit hinge loss, then checks the
        /// captured linear predictor, the range of a few metrics, and prints the output schema.
        /// </summary>
        public void SdcaBinaryClassificationNoCalibration()
        {
            var env = new ConsoleEnvironment(seed: 0);
            var source = new MultiFileSource(GetDataPath(TestDatasets.breastCancer.trainFilename));
            var ctx = new BinaryClassificationContext(env);

            var reader = TextLoader.CreateReader(
                env,
                c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            // Captured from the onFit callback once training has run.
            LinearBinaryPredictor trainedPredictor = null;

            var lossArguments = new HingeLoss.Arguments()
            {
                Margin = 1
            };
            var hingeLoss = new HingeLoss(lossArguments);

            // With a custom loss function we no longer get calibrated predictions.
            var estimator = reader.MakeNewEstimator().Append(
                row => (row.label,
                        preds: ctx.Trainers.Sdca(
                            row.label,
                            row.features,
                            maxIterations: 2,
                            loss: hingeLoss,
                            onFit: p => trainedPredictor = p)));

            var pipeline = reader.Append(estimator);

            // Building the pipeline must not trigger training.
            Assert.Null(trainedPredictor);

            var model = pipeline.Fit(source);
            Assert.NotNull(trainedPredictor);

            // 9 input features, so we ought to have 9 weights.
            Assert.Equal(9, trainedPredictor.Weights2.Count);

            var scored = model.Read(source);
            var metrics = ctx.Evaluate(scored, row => row.label, row => row.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);

            // Just output some data on the schema for fun.
            var schema = scored.AsDynamic.Schema;
            for (int column = 0; column < schema.ColumnCount; ++column)
            {
                Console.WriteLine($"{schema.GetColumnName(column)}, {schema.GetColumnType(column)}");
            }
        }
示例#10
0
        /// <summary>
        /// Trains SDCA on the breast-cancer data with default loss, checks that both the linear
        /// model parameters and the calibrated predictor are delivered through <c>onFit</c>,
        /// sanity-checks the metrics, and prints the output schema.
        /// </summary>
        public void SdcaBinaryClassification()
        {
            var env = new MLContext(seed: 0);
            var source = new MultiFileSource(GetDataPath(TestDatasets.breastCancer.trainFilename));
            var ctx = new BinaryClassificationContext(env);

            var reader = TextLoader.CreateReader(
                env,
                c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            // Both are captured from the onFit callback once training has run.
            LinearBinaryModelParameters linearModel = null;
            ParameterMixingCalibratedPredictor calibrated = null;

            var estimator = reader.MakeNewEstimator().Append(
                row => (row.label,
                        preds: ctx.Trainers.Sdca(
                            row.label,
                            row.features,
                            maxIterations: 2,
                            onFit: (p, c) => { linearModel = p; calibrated = c; },
                            advancedSettings: s => s.NumThreads = 1)));

            var pipeline = reader.Append(estimator);

            // Building the pipeline must not trigger training.
            Assert.Null(linearModel);
            Assert.Null(calibrated);

            var model = pipeline.Fit(source);

            Assert.NotNull(linearModel);
            Assert.NotNull(calibrated);

            // 9 input features, so we ought to have 9 weights.
            Assert.Equal(9, linearModel.Weights.Count);

            var scored = model.Read(source);
            var metrics = ctx.Evaluate(scored, row => row.label, row => row.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
            Assert.InRange(metrics.LogLoss, 0, double.PositiveInfinity);
            Assert.InRange(metrics.Entropy, 0, double.PositiveInfinity);

            // Just output some data on the schema for fun.
            var schema = scored.AsDynamic.Schema;
            for (int column = 0; column < schema.Count; ++column)
            {
                Console.WriteLine($"{schema[column].Name}, {schema[column].Type}");
            }
        }
示例#11
0
 PermutationFeatureImportance(
     this BinaryClassificationContext ctx,
     IPredictionTransformer <IPredictor> model,
     IDataView data,
     string label                = DefaultColumnNames.Label,
     string features             = DefaultColumnNames.Features,
     bool useFeatureWeightFilter = false,
     int?topExamples             = null)
 {
     // Binary-classification entry point: forwards to the shared GetImportanceMetricsMatrix
     // helper, specialised to BinaryClassifierEvaluator.Result.
     // The environment is taken from the context; each data view the helper passes to the
     // lambda is evaluated with ctx.Evaluate against the label column.
     // NOTE(review): BinaryClassifierDelta is presumably the function that compares two
     // evaluation results - confirm against its definition.
     return(PermutationFeatureImportance <BinaryClassifierEvaluator.Result> .GetImportanceMetricsMatrix(
                CatalogUtils.GetEnvironment(ctx),
                model,
                data,
                idv => ctx.Evaluate(idv, label),
                BinaryClassifierDelta,
                features,
                useFeatureWeightFilter,
                topExamples));
 }
示例#12
0
        /// <summary>
        /// Trains LightGBM on the breast-cancer data, checks that the predictor captured through
        /// <c>onFit</c> exposes one weight per input feature, and sanity-checks the metrics.
        /// </summary>
        [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // LightGBM is 64-bit only
        public void LightGbmBinaryClassification()
        {
            var env = new MLContext(seed: 0);
            var source = new MultiFileSource(GetDataPath(TestDatasets.breastCancer.trainFilename));
            var ctx = new BinaryClassificationContext(env);

            var reader = TextLoader.CreateReader(
                env,
                c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            // Captured from the onFit callback once training has run.
            IPredictorWithFeatureWeights <float> trainedPredictor = null;

            var estimator = reader.MakeNewEstimator().Append(
                row => (row.label,
                        preds: ctx.Trainers.LightGbm(
                            row.label,
                            row.features,
                            numBoostRound: 10,
                            numLeaves: 5,
                            learningRate: 0.01,
                            onFit: (p) => { trainedPredictor = p; })));

            var pipeline = reader.Append(estimator);

            // Building the pipeline must not trigger training.
            Assert.Null(trainedPredictor);

            var model = pipeline.Fit(source);
            Assert.NotNull(trainedPredictor);

            // 9 input features, so we ought to have 9 weights.
            var featureWeights = new VBuffer <float>();
            trainedPredictor.GetFeatureWeights(ref featureWeights);
            Assert.Equal(9, featureWeights.Length);

            var scored = model.Read(source);
            var metrics = ctx.Evaluate(scored, row => row.label, row => row.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
示例#13
0
        /// <summary>
        /// Trains the stochastic gradient descent classification trainer on the breast-cancer data
        /// with a single thread, checks that the predictor captured through <c>onFit</c> exposes
        /// one weight per input feature, and sanity-checks the metrics.
        /// </summary>
        public void HogwildSGDBinaryClassification()
        {
            var env = new MLContext(seed: 0);
            var source = new MultiFileSource(GetDataPath(TestDatasets.breastCancer.trainFilename));
            var ctx = new BinaryClassificationContext(env);

            var reader = TextLoaderStatic.CreateReader(
                env,
                c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            // Captured from the onFit callback once training has run.
            IPredictorWithFeatureWeights <float> trainedPredictor = null;

            var estimator = reader.MakeNewEstimator().Append(
                row => (row.label,
                        preds: ctx.Trainers.StochasticGradientDescentClassificationTrainer(
                            row.label,
                            row.features,
                            l2Weight: 0,
                            onFit: (p) => { trainedPredictor = p; },
                            advancedSettings: s => s.NumThreads = 1)));

            var pipeline = reader.Append(estimator);

            // Building the pipeline must not trigger training.
            Assert.Null(trainedPredictor);

            var model = pipeline.Fit(source);
            Assert.NotNull(trainedPredictor);

            // 9 input features, so we ought to have 9 weights.
            var featureWeights = new VBuffer <float>();
            trainedPredictor.GetFeatureWeights(ref featureWeights);
            Assert.Equal(9, featureWeights.Length);

            var scored = model.Read(source);
            var metrics = ctx.Evaluate(scored, row => row.label, row => row.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
示例#14
0
        /// <summary>
        /// Trains an averaged perceptron on the breast-cancer data with an explicit hinge loss,
        /// then checks the captured linear predictor and the range of a few metrics.
        /// </summary>
        public void AveragePerceptronNoCalibration()
        {
            var env = new ConsoleEnvironment(seed: 0);
            var source = new MultiFileSource(GetDataPath(TestDatasets.breastCancer.trainFilename));
            var ctx = new BinaryClassificationContext(env);

            var reader = TextLoader.CreateReader(
                env,
                c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            // Captured from the onFit callback once training has run.
            LinearBinaryPredictor trainedPredictor = null;

            var lossArguments = new HingeLoss.Arguments()
            {
                Margin = 1
            };
            var hingeLoss = new HingeLoss(lossArguments);

            var estimator = reader.MakeNewEstimator().Append(
                row => (row.label,
                        preds: ctx.Trainers.AveragedPerceptron(
                            row.label,
                            row.features,
                            lossFunction: hingeLoss,
                            numIterations: 2,
                            onFit: p => trainedPredictor = p)));

            var pipeline = reader.Append(estimator);

            // Building the pipeline must not trigger training.
            Assert.Null(trainedPredictor);

            var model = pipeline.Fit(source);
            Assert.NotNull(trainedPredictor);

            // 9 input features, so we ought to have 9 weights.
            Assert.Equal(9, trainedPredictor.Weights2.Count);

            var scored = model.Read(source);
            var metrics = ctx.Evaluate(scored, row => row.label, row => row.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
示例#15
0
        /// <summary>
        /// Reads the training file, fits the (currently empty) estimator chain built from the
        /// reader, and saves the resulting model to the file named by <c>_modelName</c>.
        /// </summary>
        /// <param name="trainingDataPath">Path of the text file (with a header row) to train on.</param>
        /// <returns>Always <c>true</c>; failures surface as exceptions.</returns>
        public bool TrainModel(string trainingDataPath)
        {
            var env = new LocalEnvironment();

            // Column 0 is loaded as the label, column 1 as a boolean flag.
            var reader = TextLoader.CreateReader(
                env,
                ctx => (
                    Label: ctx.LoadDouble(0),
                    Is64: ctx.LoadBool(1)),
                hasHeader: true);

            var data = reader.Read(new MultiFileSource(trainingDataPath));

            var learningPipeline = reader.MakeNewEstimator();
            var model = learningPipeline.Fit(data);

            // Open the output file only once there is a model to write. File.Create truncates an
            // existing file, so opening it up front would leave an empty model file behind whenever
            // reading or fitting throws.
            using (var stream = File.Create(_modelName))
            {
                model.AsDynamic.SaveTo(env, stream);
            }

            return true;
        }
 /// <summary>
 /// Evaluates scored binary classification data.
 /// </summary>
 /// <typeparam name="T">The shape type for the input data.</typeparam>
 /// <param name="ctx">The binary classification context.</param>
 /// <param name="data">The data to evaluate.</param>
 /// <param name="label">The index delegate for the label column.</param>
 /// <param name="pred">The index delegate for columns from calibrated prediction of a binary classifier.
 /// Under typical scenarios, this will just be the same tuple of results returned from the trainer.</param>
 /// <returns>The evaluation results for these calibrated outputs.</returns>
 public static CalibratedBinaryClassificationMetrics Evaluate <T>(
     this BinaryClassificationContext ctx,
     DataView <T> data,
     Func <T, Scalar <bool> > label,
     Func <T, (Scalar <float> score, Scalar <float> probability, Scalar <bool> predictedLabel)> pred)
        public static void Run(
            [QueueTrigger("training-jobs", Connection = "AzureWebJobsStorage")] TrainingJobTriggerData myQueueItem,
            [Blob("models/customer-churn.zip", FileAccess.Write, Connection = "AzureWebJobsStorage")] Stream modelStream,
            [Blob("data/train.csv", FileAccess.Read, Connection = "AzureWebJobsStorage")] Stream trainingData,
            [Blob("data/validate.csv", FileAccess.Read, Connection = "AzureWebJobsStorage")] Stream validationData,
            ILogger log)
        {
            if (typeof(Microsoft.ML.Runtime.Data.LoadTransform) == null ||
                typeof(Microsoft.ML.Runtime.Learners.LinearClassificationTrainer) == null ||
                typeof(Microsoft.ML.Runtime.Internal.CpuMath.SseUtils) == null ||
                typeof(Microsoft.ML.Runtime.FastTree.FastTree) == null)
            {
                log.LogError("Error loading ML.NET");
            }

            log.LogInformation("Training customer churn model.");

            var env = new LocalEnvironment();
            var classificationContext = new BinaryClassificationContext(env);

            var loader = TextLoader.CreateReader(env, ctx => (
                                                     SeniorCitizen: ctx.LoadText(FieldNames.IndexOf("SeniorCitizen")),
                                                     Partner: ctx.LoadText(FieldNames.IndexOf("Partner")),
                                                     Dependents: ctx.LoadText(FieldNames.IndexOf("Dependents")),
                                                     InternetService: ctx.LoadText(FieldNames.IndexOf("InternetService")),
                                                     OnlineSecurity: ctx.LoadText(FieldNames.IndexOf("OnlineSecurity")),
                                                     OnlineBackup: ctx.LoadText(FieldNames.IndexOf("OnlineBackup")),
                                                     DeviceProtection: ctx.LoadText(FieldNames.IndexOf("DeviceProtection")),
                                                     TechSupport: ctx.LoadText(FieldNames.IndexOf("TechSupport")),
                                                     Contract: ctx.LoadText(FieldNames.IndexOf("Contract")),
                                                     PaperlessBilling: ctx.LoadText(FieldNames.IndexOf("PaperlessBilling")),
                                                     PaymentMethod: ctx.LoadText(FieldNames.IndexOf("PaymentMethod")),
                                                     Tenure: ctx.LoadFloat(FieldNames.IndexOf("Tenure")),
                                                     MonthlyCharges: ctx.LoadFloat(FieldNames.IndexOf("MonthlyCharges")),
                                                     Churn: ctx.LoadBool(FieldNames.IndexOf("Churn"))
                                                     ), new StreamDataSource(trainingData), hasHeader: true, separator: ',');

            var estimator = loader.MakeNewEstimator()
                            .Append(row => (
                                        SeniorCitizen: row.SeniorCitizen.OneHotEncoding(),
                                        Partner: row.Partner.OneHotEncoding(),
                                        Dependents: row.Dependents.OneHotEncoding(),
                                        InternetService: row.InternetService.OneHotEncoding(),
                                        OnlineSecurity: row.OnlineSecurity.OneHotEncoding(),
                                        OnlineBackup: row.OnlineBackup.OneHotEncoding(),
                                        DeviceProtection: row.DeviceProtection.OneHotEncoding(),
                                        TechSupport: row.TechSupport.OneHotEncoding(),
                                        Contract: row.Contract.OneHotEncoding(),
                                        PaperlessBilling: row.PaperlessBilling.OneHotEncoding(),
                                        PaymentMethod: row.PaymentMethod.OneHotEncoding(),
                                        Tenure: row.Tenure,
                                        MonthlyCharges: row.MonthlyCharges,
                                        Churn: row.Churn
                                        ))
                            .Append(row => (
                                        Churn: row.Churn,
                                        Features: row.SeniorCitizen.ConcatWith(
                                            row.Partner, row.Dependents, row.InternetService,
                                            row.OnlineSecurity, row.OnlineBackup, row.DeviceProtection,
                                            row.TechSupport, row.Contract, row.PaperlessBilling,
                                            row.Tenure, row.MonthlyCharges
                                            )))
                            .Append(row =>
            {
                var prediction = classificationContext.Trainers.Sdca(row.Churn, row.Features);
                return(
                    PredictedLabel: prediction.predictedLabel,
                    Score: prediction.score,
                    Label: row.Churn,
                    Probability: prediction.probability);
            });

            var trainingSet   = loader.Read(new StreamDataSource(trainingData));
            var validationSet = loader.Read(new StreamDataSource(validationData));

            var trainingSampleCount = trainingSet
                                      .AsDynamic
                                      .AsEnumerable <CustomerChurnPredictionData>(env, false)
                                      .Count();

            var validationSampleCount = validationSet
                                        .AsDynamic
                                        .AsEnumerable <CustomerChurnPredictionData>(env, false)
                                        .Count();

            log.LogInformation("Training on {Rows} samples", trainingSampleCount);

            var model = estimator.Fit(trainingSet);

            log.LogInformation("Validating on {Rows} samples", trainingSampleCount);

            var predictions = model.Transform(validationSet).AsDynamic;
            var score       = classificationContext.Evaluate(predictions, "Label");

            log.LogInformation("Model accuracy: {Accuracy}, F1 score: {F1Score}", score.Accuracy, score.F1Score);

            model.AsDynamic.SaveTo(env, modelStream);
        }
示例#18
0
        /// <summary>
        /// Trains a FastTree binary classifier on Assets/Data/train.tsv, evaluates it on
        /// Assets/Data/test.tsv, scores one sample comment and prints the evaluation
        /// metrics, then blocks until a line is read from the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            var trainFile = Path.Combine("Assets", "Data", "train.tsv");
            var testFile  = Path.Combine("Assets", "Data", "test.tsv");

            using (var env = new ConsoleEnvironment())
            {
                // Gives access to the binary-classification trainers and the evaluator.
                var classification = new BinaryClassificationContext(env);

                // Reader schema: column 1 is loaded as text, column 2 as a boolean.
                var loader = TextLoader.CreateReader(
                    env,
                    ctx => (
                        Comment: ctx.LoadText(1),
                        Attack: ctx.LoadBool(2)));

                var trainSet = loader.Read(new MultiFileSource(trainFile));
                var testSet  = loader.Read(new MultiFileSource(testFile));

                // Stage 1: rename the boolean column to Label and featurize the comment text.
                var featurized = loader.MakeNewEstimator()
                    .Append(r => (
                        Label: r.Attack,
                        Text: r.Comment.FeaturizeText()));

                // Stage 2: attach the FastTree trainer to the label and the featurized text.
                var trained = featurized
                    .Append(r => (
                        Label: r.Label,
                        Attack: classification.Trainers.FastTree(r.Label, r.Text, numLeaves: 50, numTrees: 50, minDatapointsInLeafs: 20)));

                // Stage 3: expose the trainer output under the column names used below.
                var pipeline = trained
                    .Append(r => (
                        Label: r.Label,
                        Prediction: r.Attack,
                        PredictedLabel: r.Attack.predictedLabel));

                var fitted = pipeline.Fit(trainSet);

                var scored  = fitted.Transform(testSet);
                var metrics = classification.Evaluate(scored, r => r.Label, r => r.Prediction);

                // Score a single hand-written example through the dynamic prediction API.
                var predictor = fitted.AsDynamic.MakePredictionFunction <SentimentData, SentimentPrediction>(env);
                var sample    = new SentimentData()
                {
                    Comment = "Insert comment"
                };
                var prediction = predictor.Predict(sample);

                // Every metric is printed as "<name>: <value as percentage, two decimals>".
                void PrintMetric(string name, double value) => Console.WriteLine($"{name}: {value:P2}");

                Console.WriteLine($"Predicted sentiment is: {prediction.PredictedLabel}");
                Console.WriteLine();
                Console.WriteLine("PredictionModel quality metrics evaluation");
                Console.WriteLine("------------------------------------------");
                PrintMetric("Accuracy", metrics.Accuracy);
                PrintMetric("AUC", metrics.Auc);
                PrintMetric("Positive Precision", metrics.PositivePrecision);
                PrintMetric("Negative Precision", metrics.NegativePrecision);
                PrintMetric("Positive Recall", metrics.PositiveRecall);
                PrintMetric("Negative Recall", metrics.NegativeRecall);
                PrintMetric("F1Score", metrics.F1Score);

                Console.ReadLine();
            }
        }
示例#19
0
        /// <summary>
        /// Trains a FastTree classifier on the Titanic training file, predicts the outcome
        /// for one hand-written passenger and prints evaluation metrics for the model.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Create the MLContext shared across the model creation workflow objects.
            // The fixed seed gives repeatable/deterministic results across multiple trainings.
            var mlContext = new MLContext(seed: 0);

            // STEP 1: Common data loading configuration
            // PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
            var textLoader = mlContext.Data.TextReader(new TextLoader.Arguments()
            {
                Separator = ",",
                HasHeader = true,
                Column    = new[]
                {
                    new TextLoader.Column("PassengerId", DataKind.Num, 0),
                    // The "Survived" column (index 1) is loaded as the boolean label.
                    new TextLoader.Column("Label", DataKind.Bool, 1),
                    new TextLoader.Column("Pclass", DataKind.Num, 2),
                    new TextLoader.Column("Name", DataKind.Text, 3),
                    new TextLoader.Column("Sex", DataKind.Text, 4),
                    new TextLoader.Column("Age", DataKind.Num, 5),
                    new TextLoader.Column("SibSp", DataKind.Num, 6),
                    new TextLoader.Column("Parch", DataKind.Num, 7),
                    new TextLoader.Column("Ticket", DataKind.Text, 8),
                    new TextLoader.Column("Fare", DataKind.Text, 9),
                    new TextLoader.Column("Cabin", DataKind.Text, 10),
                    new TextLoader.Column("Embarked", DataKind.Text, 11)
                }
            });

            // Load the training data and the data used for evaluation.
            string trainingDataPath = @".\data\titanic.training.csv";
            string dataPath         = @".\data\titanic.csv";
            var    trainingData     = textLoader.Read(trainingDataPath);
            var    testData         = textLoader.Read(dataPath);

            // Every loaded column except the label is fed to the trainer as a feature.
            var features = trainingData.Schema.GetColumns()
                           .Where(c => c.column.Name != "Label")
                           .Select(c => c.column.Name)
                           .ToArray();

            // Build the pipeline: one-hot encode the text columns, concatenate all
            // feature columns into "Features", then train FastTree on them.
            var pipeline =
                mlContext.Transforms.Categorical.OneHotEncoding("Sex")
                .Append(mlContext.Transforms.Categorical.OneHotEncoding("Name"))
                .Append(mlContext.Transforms.Categorical.OneHotEncoding("Ticket"))
                .Append(mlContext.Transforms.Categorical.OneHotEncoding("Fare"))
                .Append(mlContext.Transforms.Categorical.OneHotEncoding("Cabin"))
                .Append(mlContext.Transforms.Categorical.OneHotEncoding("Embarked"))
                .Append(mlContext.Transforms.Concatenate("Features", features))
                .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumn: "Label", featureColumn: "Features"));

            // Train the model.
            var model = pipeline.Fit(trainingData);

            // Create a PredictionFunction from our model and score one passenger.
            var predictor  = model.MakePredictionFunction <TitanicData, TitanicPrediction>(mlContext);
            var prediction = predictor.Predict(new TitanicData()
            {
                Pclass   = 3f,
                Name     = "Braund, Mr. Owen Harris",
                Sex      = "male",
                Age      = 31,
                SibSp    = 0,
                Parch    = 0,
                Ticket   = "335097",
                Fare     = "7.75",
                Cabin    = "",
                Embarked = "Q"
            });

            Console.WriteLine($"Did this passenger survive? {(prediction.Survived ? "Yes" : "No")}");

            // Evaluate with the context's own binary-classification catalog (the same one
            // that supplied the trainer) instead of constructing a second context.
            var metrics = mlContext.BinaryClassification.Evaluate(model.Transform(testData), "Label");

            Console.WriteLine("Accuracy: " + metrics.Accuracy);
            Console.WriteLine($"Auc: {metrics.Auc:P2}");
            Console.WriteLine($"F1Score: {metrics.F1Score:P2}");
        }
        /// <summary>
        /// Builds the single-step training pipeline: the input text is featurized and an
        /// SDCA trainer is attached to the label and the featurized text.
        /// </summary>
        /// <param name="reader">Reader whose schema the estimator is created from.</param>
        /// <param name="bctx">Context that provides the SDCA trainer.</param>
        /// <returns>The estimator mapping the reader's input shape to <c>SentimentPred</c>.</returns>
        private static Estimator <SentimentIssueInput, SentimentPred, ITransformer> CreateEstimator(DataReader <IMultiStreamSource, SentimentIssueInput> reader, BinaryClassificationContext bctx)
        {
            return reader.MakeNewEstimator().Append(input =>
            {
                // Convert the text to numeric vectors.
                var textFeatures = input.Text.FeaturizeText();

                // The trainer output carries predictedLabel, score and probability.
                var sdcaOutput = bctx.Trainers.Sdca(input.Label, textFeatures);

                // Return the label together with the three prediction columns.
                return new SentimentPred()
                {
                    label          = input.Label,
                    score          = sdcaOutput.score,
                    probablity     = sdcaOutput.probability,
                    predictedlabel = sdcaOutput.predictedLabel
                };
            });
        }
        /// <summary>
        /// End-to-end sentiment sample: loads the training data, trains a linear classifier
        /// on featurized text, evaluates it on the test data, scores one sample sentence,
        /// saves the model and predicts again from the saved file.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // 1. The environment lives for the whole workflow and is disposed at the end.
            using (var env = new LocalEnvironment())
            {
                // 2. Reader schema: boolean label in column 0, text in column 1.
                var loaderArgs = new TextLoader.Arguments()
                {
                    Separator = "tab",
                    HasHeader = true,
                    Column    = new[]
                    {
                        new TextLoader.Column("Label", DataKind.Bool, 0),
                        new TextLoader.Column("Text", DataKind.Text, 1)
                    }
                };
                var loader = new TextLoader(env, loaderArgs);

                IDataView trainView = loader.Read(new MultiFileSource(TrainDataPath));

                // 3. Pipeline: text -> "Features" vectors -> linear classification trainer.
                //    (The original author notes that ML.NET v0.7 offers a simpler
                //    LinearClassificationTrainer(env, "Features", "Label") overload.)
                var featurizer = new TextTransform(env, "Text", "Features");
                var trainer    = new LinearClassificationTrainer(
                    env,
                    new LinearClassificationTrainer.Arguments(),
                    "Features",
                    "Label");
                var pipeline = featurizer.Append(trainer);

                // 4. Train.
                Console.WriteLine("=============== Create and Train the Model ===============");
                var model = pipeline.Fit(trainView);
                Console.WriteLine("=============== End of training ===============");
                Console.WriteLine();

                // 5. Evaluate on the test file and print the accuracy statistics.
                IDataView testView = loader.Read(new MultiFileSource(TestDataPath));

                Console.WriteLine("=============== Evaluating Model's accuracy with Test data===============");
                var scored = model.Transform(testView);

                var evaluationContext = new BinaryClassificationContext(env);
                var metrics           = evaluationContext.Evaluate(scored, "Label");

                Console.WriteLine();
                Console.WriteLine("Model quality metrics evaluation");
                Console.WriteLine("------------------------------------------");
                Console.WriteLine($"Accuracy: {metrics.Accuracy:P2}");
                Console.WriteLine($"Auc: {metrics.Auc:P2}");
                Console.WriteLine($"F1Score: {metrics.F1Score:P2}");
                Console.WriteLine("=============== End of Model's evaluation ===============");
                Console.WriteLine();

                // 6. Score one sample sentence with the in-memory model.
                var predictor = model.MakePredictionFunction <SentimentIssue, SentimentPrediction>(env);

                var sampleStatement = new SentimentIssue
                {
                    Text = "This is a very rude movie"
                };

                var outcome = predictor.Predict(sampleStatement);

                Console.WriteLine();
                Console.WriteLine("=============== Test of model with a sample ===============");

                var verdict = Convert.ToBoolean(outcome.Prediction) ? "Toxic" : "Nice";
                Console.WriteLine($"Text: {sampleStatement.Text} | Prediction: {verdict} sentiment | Probability: {outcome.Probability} ");

                // Persist the model to a .ZIP file, then predict again from that file.
                SaveModelAsFile(env, model);
                PredictWithModelLoadedFromFile(sampleStatement);

                Console.WriteLine("=============== End of process, hit any key to finish ===============");
                Console.ReadKey();
            }
        }
示例#22
0
        PrepareData(MLContext mlContext)
        {
            // Loads the transactions data set and returns the classification context, the
            // reader and the train/test views. On the first run the source file is split
            // 80:20 and both splits are written as CSV under _outputPath; later runs reload
            // those CSV files instead of splitting again.
            IDataView trainData = null;
            IDataView testData  = null;

            TextLoader.Arguments txtLoaderArgs = new TextLoader.Arguments
            {
                // Source layout: V1..V28 and Amount in columns 1-29, boolean label in column 30.
                Column = BuildTransactionColumns(labelIndex: 30, includeStratificationColumn: false),
                // First line of the file is a header, not a data row.
                HasHeader = true,
                Separator = ","
            };

            // Step one: read the data as an IDataView.
            // Create the reader: define the data columns
            // and where to find them in the text file.
            var reader = new TextLoader(mlContext, txtLoaderArgs);

            // We know that this is a Binary Classification task,
            // so we create a Binary Classification context:
            // it will give us the algorithms we need,
            // as well as the evaluation procedure.
            var classification = new BinaryClassificationContext(mlContext);

            // The cached splits are written and read as CSV, so the cache check must look
            // for the same file names (the previous check looked for *.idv files that are
            // never created, which made the cached branch unreachable).
            string trainSplitPath = Path.Combine(_outputPath, "trainData.csv");
            string testSplitPath  = Path.Combine(_outputPath, "testData.csv");

            // Re-create the split when either file is missing, so the load branch below
            // never runs with only half of the cache present.
            if (!File.Exists(trainSplitPath) || !File.Exists(testSplitPath))
            {
                // Split the data 80:20 into train and test sets.
                IDataView data = reader.Read(new MultiFileSource(_dataSetFile));
                ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (source)");
                ConsoleHelpers.InspectData(mlContext, data, 4);

                // Can't do stratification when column type is a boolean, is this an issue?
                //(trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2, stratificationColumn: "Label");
                (trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2);

                // save test split
                using (var fileStream = File.Create(testSplitPath))
                {
                    mlContext.Data.SaveAsText(testData, fileStream, separator: ',', headerRow: true, schema: true);
                }

                // save train split (previously the test split was written here by mistake)
                using (var fileStream = File.Create(trainSplitPath))
                {
                    mlContext.Data.SaveAsText(trainData, fileStream, separator: ',', headerRow: true, schema: true);
                }
            }
            else
            {
                // Add the "StratificationColumn" that was added by classification.TrainTestSplit()
                // And Label is moved to column 0
                TextLoader.Column[] columnsPlus = BuildTransactionColumns(labelIndex: 0, includeStratificationColumn: true);

                // Load the previously saved splits with the same header/separator settings.
                trainData = mlContext.Data.ReadFromTextFile(columnsPlus, trainSplitPath,
                                                            advancedSettings: s => {
                    s.HasHeader = txtLoaderArgs.HasHeader;
                    s.Separator = txtLoaderArgs.Separator;
                }
                                                            );
                testData = mlContext.Data.ReadFromTextFile(columnsPlus, testSplitPath,
                                                           advancedSettings: s => {
                    s.HasHeader = txtLoaderArgs.HasHeader;
                    s.Separator = txtLoaderArgs.Separator;
                }
                                                           );
            }

            ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (traindata)");
            ConsoleHelpers.InspectData(mlContext, trainData, 4);

            ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (testData)");
            ConsoleHelpers.InspectData(mlContext, testData, 4);

            return(classification, reader, trainData, testData);
        }

        // Builds the column schema shared by the source file and the cached splits:
        // "Label" at labelIndex, V1..V28 at columns 1-28, "Amount" at column 29 and,
        // optionally, "StratificationColumn" at column 30.
        private static TextLoader.Column[] BuildTransactionColumns(int labelIndex, bool includeStratificationColumn)
        {
            const int featureCount = 28;

            var columns = new TextLoader.Column[includeStratificationColumn ? featureCount + 3 : featureCount + 2];

            // A boolean column depicting the 'label'.
            columns[0] = new TextLoader.Column("Label", DataKind.BL, labelIndex);

            // Features V1..V28: the array slot, the column name suffix and the file
            // column index all coincide.
            for (int i = 1; i <= featureCount; i++)
            {
                columns[i] = new TextLoader.Column("V" + i, DataKind.R4, i);
            }

            columns[featureCount + 1] = new TextLoader.Column("Amount", DataKind.R4, featureCount + 1);

            if (includeStratificationColumn)
            {
                columns[featureCount + 2] = new TextLoader.Column("StratificationColumn", DataKind.R4, featureCount + 2);
            }

            return columns;
        }
示例#23
0
        /// <summary>
        /// Reads the traffic observations from <c>_datapath</c>, trains a FastTree binary
        /// classifier that predicts the "NextHourAlert" column, prints evaluation metrics
        /// computed on a 20% hold-out split and returns a prediction function for the model.
        /// </summary>
        /// <returns>A prediction function built from the trained model.</returns>
        public static PredictionFunction <TrafficObservation, AlertPrediction> ModelAndTrain()
        {
            Console.WriteLine("Starting Machine Learning Binary Classification");
            MLContext mlContext = new MLContext(seed: 1);

            // Step one: read the data as an IDataView.
            // Create the reader: define the data columns
            // and where to find them in the text file.
            var reader = new TextLoader(mlContext, new TextLoader.Arguments
            {
                Column = new[] {
                    // A boolean column depicting the 'label'.
                    new TextLoader.Column("NextHourAlert", DataKind.BL, 20),
                    // 18 Features
                    new TextLoader.Column("AvgTotalBytes", DataKind.R4, 2),
                    new TextLoader.Column("AvgTotalPackets", DataKind.R4, 3),
                    new TextLoader.Column("AvgAveragebps", DataKind.R4, 4),
                    new TextLoader.Column("AvgOutPercentUtil", DataKind.R4, 5),
                    new TextLoader.Column("AvgInPercentUtil", DataKind.R4, 6),
                    new TextLoader.Column("AvgPercentUtil", DataKind.R4, 7),
                    new TextLoader.Column("MinTotalBytes", DataKind.R4, 8),
                    new TextLoader.Column("MinTotalPackets", DataKind.R4, 9),
                    new TextLoader.Column("MinAveragebps", DataKind.R4, 10),
                    new TextLoader.Column("MinOutPercentUtil", DataKind.R4, 11),
                    new TextLoader.Column("MinInPercentUtil", DataKind.R4, 12),
                    new TextLoader.Column("MinPercentUtil", DataKind.R4, 13),
                    new TextLoader.Column("MaxTotalBytes", DataKind.R4, 14),
                    new TextLoader.Column("MaxTotalPackets", DataKind.R4, 15),
                    new TextLoader.Column("MaxAveragebps", DataKind.R4, 16),
                    new TextLoader.Column("MaxOutPercentUtil", DataKind.R4, 17),
                    new TextLoader.Column("MaxInPercentUtil", DataKind.R4, 18),
                    new TextLoader.Column("MaxPercentUtil", DataKind.R4, 19)
                },
                // First line of the file is a header, not a data row.
                HasHeader = true,
                Separator = ","
            });

            // We know that this is a Binary Classification task,
            // so we create a Binary Classification context:
            // it will give us the algorithms we need,
            // as well as the evaluation procedure.
            var classification = new BinaryClassificationContext(mlContext);

            IDataView data = reader.Read(new MultiFileSource(_datapath));

            // Hold out 20% of the rows for evaluation.
            var (trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2);

            // Create a flexible pipeline (composed of a chain of estimators) for building/training the model.
            var featureColumns = new[] { "AvgTotalBytes", "AvgTotalPackets", "AvgAveragebps", "AvgOutPercentUtil", "AvgInPercentUtil", "AvgPercentUtil",
                                         "MinTotalBytes", "MinTotalPackets", "MinAveragebps", "MinOutPercentUtil", "MinInPercentUtil", "MinPercentUtil",
                                         "MaxTotalBytes", "MaxTotalPackets", "MaxAveragebps", "MaxOutPercentUtil", "MaxInPercentUtil", "MaxPercentUtil" };

            // NOTE(review): the normalizer writes "FeaturesNormalizedByMeanVar", but the
            // trainer below reads the un-normalized "Features" column, so the normalized
            // output is never consumed. Confirm whether that is intended.
            var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns)
                           .Append(mlContext.Transforms.Normalize(inputName: "Features", outputName: "FeaturesNormalizedByMeanVar", mode: NormalizerMode.MeanVariance))
                           .Append(mlContext.BinaryClassification.Trainers.FastTree(label: "NextHourAlert",
                                                                                    features: "Features",
                                                                                    numLeaves: 20,
                                                                                    numTrees: 100,
                                                                                    minDatapointsInLeafs: 10,
                                                                                    learningRate: 0.2));
            var model = pipeline.Fit(trainData);

            var metrics = classification.Evaluate(model.Transform(testData), "NextHourAlert");

            Console.WriteLine("Accuracy: " + metrics.Accuracy);
            Console.WriteLine($"Area under ROC curve: {metrics.Auc}");
            Console.WriteLine($"Area under the precision/recall curve: {metrics.Auprc}");
            Console.WriteLine($"Entropy: {metrics.Entropy}");
            Console.WriteLine($"F1 Score: {metrics.F1Score}");
            Console.WriteLine($"Log loss: {metrics.LogLoss}");
            Console.WriteLine($"Log loss reduction: {metrics.LogLossReduction}");
            Console.WriteLine($"Negative precision: {metrics.NegativePrecision}");
            Console.WriteLine($"Positive precision: {metrics.PositivePrecision}");
            Console.WriteLine($"Positive recall: {metrics.PositiveRecall}");

            return model.MakePredictionFunction <TrafficObservation, AlertPrediction>(mlContext);
        }
        static void Main(string[] args)
        {
            // Step 1: create the ML.NET environment shared by the reader and the
            // classification context below.
            var environment = new ConsoleEnvironment();

            // Step 2: create a reader whose schema maps the file's columns onto
            // SentimentIssueInput: column 0 is loaded as the boolean label and
            // column 1 as the text.
            //
            // Alternative tuple-shaped schema, kept for reference:
            //   TextLoader.CreateReader(env, ctx => (label: ctx.LoadBool(0), text: ctx.LoadText(1)));
            var textReader = TextLoader.CreateReader(
                environment,
                ctx => new SentimentIssueInput()
                {
                    Label = ctx.LoadBool(0),
                    Text  = ctx.LoadText(1)
                });

            // Step 3: create the estimator that will later be used to create/train the model.
            //
            // Alternative inline form, kept for reference (tuple-shaped reader required):
            //   reader.MakeNewEstimator().Append(row =>
            //   {
            //       var featurizedText = row.text.FeaturizeText();                   // text -> numeric vector
            //       var prediction = bctx.Trainers.Sdca(row.label, featurizedText);  // SDCA trainer
            //       return (row.label, prediction);
            //   });
            var classificationContext = new BinaryClassificationContext(environment);
            Estimator<SentimentIssueInput, SentimentPred, ITransformer> estimator =
                CreateEstimator(textReader, classificationContext);

            // The string result is intentionally discarded; the call only touches the estimator.
            estimator.ToString();

            // Steps 4-6 are currently disabled in this sample. Outline of what they did:
            //   4. Train:    traindata = reader.Read(new MultiFileSource(TrainDataPath));
            //                model = est.Fit(traindata);
            //   5. Evaluate: testdata = reader.Read(new MultiFileSource(TestDataPath));
            //                predictions = model.Transform(testdata);
            //                metrics = bctx.Evaluate(predictions, row => row.label, row => row.prediction);
            //                then print metrics.Accuracy, metrics.Auc and metrics.F1Score.
            //   6. Predict:  predictionFunct = model.AsDynamic.MakePredictionFunction<SentimentIssue, SentimentPrediction>(env);
            //                resultprediction = predictionFunct.Predict(new SentimentIssue { text = "This is a very rude movie" });
            //                then print the predicted label and probability.
        }
示例#25
0
        PrepareData(MLContext mlContext)
        {
            // Purpose: produce the train/test splits of the transactions data set.
            // If both cached split files exist under _outputPath they are loaded;
            // otherwise the source file is read, split 80:20, and both splits are
            // written to disk. Returns the classification context, the text reader
            // and the two splits.
            IDataView trainData;
            IDataView testData;

            // Step one: read the data as an IDataView.
            // Create the reader: define the data columns
            // and where to find them in the text file.
            // Layout: 28 features V1..V28 in columns 1..28, Amount in column 29,
            // and the boolean 'Label' in column 30.
            const int featureCount = 28;
            var columns = new TextLoader.Column[featureCount + 2];
            // A boolean column depicting the 'label'.
            columns[0] = new TextLoader.Column("Label", DataKind.BL, 30);
            for (int i = 1; i <= featureCount; i++)
            {
                // Feature "V<i>" lives in file column <i>.
                columns[i] = new TextLoader.Column("V" + i, DataKind.R4, i);
            }
            columns[featureCount + 1] = new TextLoader.Column("Amount", DataKind.R4, 29);

            var reader = new TextLoader(mlContext, new TextLoader.Arguments
            {
                Column = columns,
                // First line of the file is a header, not a data row.
                HasHeader = true,
                Separator = ","
            });

            // We know that this is a Binary Classification task,
            // so we create a Binary Classification context:
            // it will give us the algorithms we need,
            // as well as the evaluation procedure.
            var classification = new BinaryClassificationContext(mlContext);

            string trainDataPath = Path.Combine(_outputPath, "trainData.idv");
            string testDataPath  = Path.Combine(_outputPath, "testData.idv");

            // Writes one split to 'path' in binary IDataView format.
            void SaveSplit(IDataView split, string path)
            {
                IHostEnvironment env = (IHostEnvironment)mlContext;
                using (var ch = env.Start("SaveData"))
                using (var file = env.CreateOutputFile(path))
                {
                    var saver = new BinarySaver(mlContext, new BinarySaver.Arguments());
                    DataSaverUtils.SaveDataView(ch, saver, split, file);
                }
            }

            // Regenerate BOTH splits if EITHER cached file is missing. Loading when
            // only one file exists would fail on the missing one, and reusing one
            // half of an old split alongside a fresh one would be inconsistent.
            if (!File.Exists(testDataPath) || !File.Exists(trainDataPath))
            {
                // Split the data 80:20 into train and test sets, train and evaluate.
                IDataView data = reader.Read(new MultiFileSource(_dataSetFile));
                ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (source)");
                ConsoleHelpers.InspectData(mlContext, data, 4);

                // Can't do stratification when column type is a boolean, is this an issue?
                //(trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2, stratificationColumn: "Label");
                (trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2);

                // Save test split first, then train split (same order as before).
                SaveSplit(testData, testDataPath);
                SaveSplit(trainData, trainDataPath);
            }
            else
            {
                // Load the previously saved splits.
                var binTrainData = new BinaryLoader(mlContext, new BinaryLoader.Arguments(), new MultiFileSource(trainDataPath));
                var trainRoles   = new RoleMappedData(binTrainData, roles: TransactionObservation.Roles());
                trainData = trainRoles.Data;

                var binTestData = new BinaryLoader(mlContext, new BinaryLoader.Arguments(), new MultiFileSource(testDataPath));
                var testRoles   = new RoleMappedData(binTestData, roles: TransactionObservation.Roles());
                testData = testRoles.Data;
            }

            ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (traindata)");
            ConsoleHelpers.InspectData(mlContext, trainData, 4);

            ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (testData)");
            ConsoleHelpers.InspectData(mlContext, testData, 4);

            return (classification, reader, trainData, testData);
        }
 /// <summary>
 /// Forwards to <see cref="PipelineSuggesterApi"/> to obtain a pipeline for a
 /// <see cref="TaskKind.BinaryClassification"/> task.
 /// </summary>
 /// <param name="context">Extension receiver; it is not read by this method.</param>
 /// <param name="dataView">Data handed to the pipeline suggester.</param>
 /// <param name="label">Label value handed to the pipeline suggester (presumably the label column name; confirm against the suggester API).</param>
 /// <returns>The pipeline returned by the suggester.</returns>
 public static Pipeline GetPipeline(this BinaryClassificationContext context, IDataView dataView, string label)
     => PipelineSuggesterApi.GetPipeline(TaskKind.BinaryClassification, dataView, label);