Beispiel #1
0
        /// <summary>
        /// Trains a randomized-PCA anomaly detection model on the packets queried from
        /// <paramref name="storage"/>, saves it to <paramref name="modelFileName"/> and
        /// evaluates it on the test set returned by <c>GetDataView</c>.
        /// </summary>
        /// <param name="storage">Data access layer queried for clean and non-clean packets.</param>
        /// <param name="modelFileName">Path the trained model is saved to. The file must already exist.</param>
        /// <returns>
        /// The evaluation metrics, the row counts reported by <c>GetDataView</c> and the time
        /// elapsed between the end of argument validation and the end of evaluation.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="storage"/> is null, or <paramref name="modelFileName"/> is null or empty.
        /// </exception>
        /// <exception cref="FileNotFoundException"><paramref name="modelFileName"/> does not exist.</exception>
        public async Task <ModelMetrics> GenerateModel(BaseDAL storage, string modelFileName)
        {
            if (storage == null)
            {
                Log.Error("Trainer::GenerateModel - BaseDAL is null");

                throw new ArgumentNullException(nameof(storage));
            }

            if (string.IsNullOrEmpty(modelFileName))
            {
                Log.Error("Trainer::GenerateModel - modelFileName is null");

                throw new ArgumentNullException(nameof(modelFileName));
            }

            // NOTE(review): the model is written to this path below, yet the file is required to
            // exist beforehand - presumably the caller pre-creates it; confirm this is intended.
            if (!File.Exists(modelFileName))
            {
                Log.Error($"Trainer::GenerateModel - {modelFileName} does not exist");

                // Use the (message, fileName) overload so FileName is populated and the message is descriptive.
                throw new FileNotFoundException($"Model file '{modelFileName}' does not exist", modelFileName);
            }

            // Stopwatch is monotonic; subtracting DateTime.Now values can be skewed by DST or clock adjustments.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

            var options = new RandomizedPcaTrainer.Options
            {
                FeatureColumnName       = FEATURES,
                ExampleWeightColumnName = null,
                Rank           = 4,
                Oversampling   = 20,
                EnsureZeroMean = true,
                Seed           = Constants.ML_SEED
            };

            // The queries are awaited one after the other, in the same order as before.
            var cleanPackets     = await storage.QueryPacketsAsync(a => a.IsClean);
            var maliciousPackets = await storage.QueryPacketsAsync(a => !a.IsClean);

            var(data, cleanRowCount, maliciousRowCount) = GetDataView(cleanPackets, maliciousPackets);

            // Concatenate the PayloadItem properties (the Label name is passed to ToPropertyList) into the feature column.
            IEstimator <ITransformer> dataProcessPipeline = _mlContext.Transforms.Concatenate(
                FEATURES,
                typeof(PayloadItem).ToPropertyList <PayloadItem>(nameof(PayloadItem.Label)));

            IEstimator <ITransformer> trainer = _mlContext.AnomalyDetection.Trainers.RandomizedPca(options: options);

            EstimatorChain <ITransformer> trainingPipeline = dataProcessPipeline.Append(trainer);

            TransformerChain <ITransformer> trainedModel = trainingPipeline.Fit(data.TrainSet);

            _mlContext.Model.Save(trainedModel, data.TrainSet.Schema, modelFileName);

            var testSetTransform = trainedModel.Transform(data.TestSet);

            return(new ModelMetrics
            {
                Metrics = _mlContext.AnomalyDetection.Evaluate(testSetTransform),
                NumCleanRows = cleanRowCount,
                NumMaliciousRows = maliciousRowCount,
                Duration = stopwatch.Elapsed
            });
        }
Beispiel #2
0
        /// <summary>
        /// Builds a feature-concatenation / Lp-normalization pipeline followed by a randomized-PCA
        /// anomaly detector, and fits it on the rows of <paramref name="trainDataView"/> that pass
        /// the label filter.
        /// </summary>
        /// <param name="mlContext">ML.NET context used to create the transforms and the trainer.</param>
        /// <param name="trainDataView">Training data; its schema determines the feature columns.</param>
        /// <returns>The fitted transformer chain.</returns>
        public static ITransformer TrainModel(MLContext mlContext, IDataView trainDataView)
        {
            const string featuresColumn           = "Features";
            const string normalizedFeaturesColumn = "NormalizedFeatures";

            // Get all the feature column names: every schema column except the Label,
            // the IdPreservationColumn/StratificationColumn and the Time column.
            // The schema is an in-memory sequence, so plain LINQ-to-objects is sufficient.
            string[] featureColumnNames = trainDataView.Schema
                                          .Select(column => column.Name)
                                          .Where(name => name != nameof(TransactionObservation.Label) &&
                                                 name != "IdPreservationColumn" &&
                                                 name != nameof(TransactionObservation.Time))
                                          .ToArray();

            // Create the data process pipeline
            IEstimator <ITransformer> dataProcessPipeline = mlContext.Transforms.Concatenate(featuresColumn, featureColumnNames)
                                                            .Append(mlContext.Transforms.DropColumns(new string[] { nameof(TransactionObservation.Time) }))
                                                            .Append(mlContext.Transforms.NormalizeLpNorm(outputColumnName: normalizedFeaturesColumn, inputColumnName: featuresColumn));

            // In Anomaly Detection, the learner assumes all training examples have label 0, as it only learns from normal examples.
            // If any of the training examples has label 1, it is recommended to use a Filter transform to filter them out before training.
            // FilterRowsByColumn treats lowerBound as inclusive and upperBound as exclusive, so only rows with Label in [0, 1) are kept.
            IDataView normalTrainDataView = mlContext.Data.FilterRowsByColumn(trainDataView, columnName: nameof(TransactionObservation.Label), lowerBound: 0, upperBound: 1);

            // (OPTIONAL) Peek data (such as 2 records) in training DataView after applying the ProcessPipeline's transformations into "Features"
            ConsoleHelper.PeekDataViewInConsole(mlContext, normalTrainDataView, dataProcessPipeline, 2);
            ConsoleHelper.PeekVectorColumnDataInConsole(mlContext, normalizedFeaturesColumn, normalTrainDataView, dataProcessPipeline, 2);

            var options = new RandomizedPcaTrainer.Options
            {
                FeatureColumnName       = normalizedFeaturesColumn, // The name of the feature column. The column data must be a known-sized vector of Single.
                ExampleWeightColumnName = null,                     // The name of the example weight column (optional). To use the weight column, the column data must be of type Single.
                Rank           = 28,                                // The number of components in the PCA.
                Oversampling   = 20,                                // Oversampling parameter for randomized PCA training.
                EnsureZeroMean = true,                              // If enabled, data is centered to be zero mean.
                Seed           = 1                                  // The seed for random number generation.
            };

            // Create an anomaly detector. Its underlying algorithm is randomized PCA.
            IEstimator <ITransformer> trainer = mlContext.AnomalyDetection.Trainers.RandomizedPca(options: options);

            EstimatorChain <ITransformer> trainingPipeline = dataProcessPipeline.Append(trainer);

            ConsoleHelper.ConsoleWriteHeader("=============== Training model ===============");

            TransformerChain <ITransformer> model = trainingPipeline.Fit(normalTrainDataView);

            ConsoleHelper.ConsoleWriteHeader("=============== End of training process ===============");

            return(model);
        }
Beispiel #3
0
        /// <summary>
        /// Trains a randomized-PCA anomaly detection model from <paramref name="trainingFileName"/>,
        /// saves it to <c>ModelPath</c>, evaluates it against <paramref name="testingFileName"/> and
        /// writes the resulting metrics to the console.
        /// </summary>
        /// <param name="trainingFileName">Path of the training data file.</param>
        /// <param name="testingFileName">Path of the test data file.</param>
        /// <remarks>
        /// If either file does not exist, a message is written to the console and the method
        /// returns without training.
        /// </remarks>
        public void Train(string trainingFileName, string testingFileName)
        {
            if (!File.Exists(trainingFileName))
            {
                // The closing parenthesis was missing from this message.
                Console.WriteLine($"Failed to find training data file ({trainingFileName})");

                return;
            }

            if (!File.Exists(testingFileName))
            {
                // The closing parenthesis was missing from this message.
                Console.WriteLine($"Failed to find test data file ({testingFileName})");

                return;
            }

            var trainingDataView = GetDataView(trainingFileName);

            var options = new RandomizedPcaTrainer.Options
            {
                FeatureColumnName       = FEATURES,
                ExampleWeightColumnName = null,
                Rank           = 5,
                Oversampling   = 20,
                EnsureZeroMean = true,
                Seed           = 1
            };

            IEstimator <ITransformer> trainer = MlContext.AnomalyDetection.Trainers.RandomizedPca(options: options);

            // Append the trainer to the pipeline returned alongside the training data.
            EstimatorChain <ITransformer> trainingPipeline = trainingDataView.Transformer.Append(trainer);

            TransformerChain <ITransformer> trainedModel = trainingPipeline.Fit(trainingDataView.DataView);

            MlContext.Model.Save(trainedModel, trainingDataView.DataView.Schema, ModelPath);

            // The model is evaluated on the separately loaded test file.
            var testingDataView = GetDataView(testingFileName, true);

            var testSetTransform = trainedModel.Transform(testingDataView.DataView);

            var modelMetrics = MlContext.AnomalyDetection.Evaluate(testSetTransform);

            Console.WriteLine($"Area Under Curve: {modelMetrics.AreaUnderRocCurve:P2}{Environment.NewLine}" +
                              $"Detection at FP Count: {modelMetrics.DetectionRateAtFalsePositiveCount}");
        }