示例#1
0
        /// <summary>
        /// Runs feature extraction over the configured folder, fits a FastTree binary classifier on the
        /// training split (80% of the rows), logs evaluation metrics for the held-out 20%, and saves the model.
        /// </summary>
        /// <param name="options">Trainer options; forwarded to feature extraction, logging and model saving.</param>
        /// <returns><c>true</c> whenever the method completes without throwing.</returns>
        public override bool TrainModel(TrainerCommandLineOptions options)
        {
            const string labelColumn    = "Label";
            const string featuresColumn = "Features";

            var extractedFeaturesPath = FeatureExtractFolder(options);

            // Timing starts after feature extraction, so it covers loading, splitting and fitting only.
            var startDate = DateTime.Now;

            var allRows = MlContext.Data.LoadFromTextFile<ClassificationData>(extractedFeaturesPath, hasHeader: false);
            var split   = MlContext.Data.TrainTestSplit(allRows, testFraction: 0.2);

            // Individual pipeline stages, named so the chain below reads top to bottom.
            var textFeaturizer = MlContext.Transforms.Text.FeaturizeText(nameof(ClassificationData.NGramText));

            var groupTypeNormalizer = MlContext.Transforms.NormalizeMeanVariance(nameof(ClassificationData.FileGroupType));

            var featureAssembler = MlContext.Transforms.Concatenate(
                featuresColumn,
                nameof(ClassificationData.NGramText),
                nameof(ClassificationData.FileGroupType));

            var fastTree = MlContext.BinaryClassification.Trainers.FastTree(
                labelColumnName: labelColumn,
                featureColumnName: featuresColumn);

            var fittedModel = textFeaturizer
                              .Append(groupTypeNormalizer)
                              .Append(featureAssembler)
                              .Append(fastTree)
                              .Fit(split.TrainSet);

            Logger<TrainerCommandLineOptions>.Debug($"Model trained in {DateTime.Now.Subtract(startDate).TotalSeconds} seconds", options);

            // Score the held-out rows and evaluate against the label column.
            var scoredTestRows = fittedModel.Transform(split.TestSet);
            var metrics        = MlContext.BinaryClassification.Evaluate(scoredTestRows, labelColumn);

            Logger<TrainerCommandLineOptions>.Debug($"Accuracy: {metrics.Accuracy} | F1: {metrics.F1Score} | Auc: {metrics.AreaUnderRocCurve}", options);

            SaveModel(fittedModel, split.TrainSet.Schema, options);

            return true;
        }
示例#2
0
        /// <summary>
        /// Program entry point: builds the trainer options and trains the requested model type.
        /// </summary>
        /// <remarks>
        /// RELEASE builds obtain the options from <c>TrainerCommandLineParser.Parse</c>. All other builds read
        /// the data folder from <c>args[0]</c> and the <c>ModelType</c> name from <c>args[1]</c>.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        static void Main(string[] args)
        {
#if RELEASE
            var options = TrainerCommandLineParser.Parse(args);

            if (options == null)
            {
                return;
            }
#else
            // Validate up front: indexing args or parsing the enum used to throw before the try block
            // below was entered, crashing the process with an unhandled exception.
            if (args == null || args.Length < 2)
            {
                Console.WriteLine("Expected arguments: <folderOfData> <modelType>");

                return;
            }

            if (!Enum.TryParse(args[1], out ModelType modelType))
            {
                Console.WriteLine($"Unknown model type: {args[1]}");

                return;
            }

            var options = new TrainerCommandLineOptions()
            {
                FolderOfData = args[0], LogLevel = LogLevels.DEBUG, ModelType = modelType
            };
#endif

            try
            {
                switch (options.ModelType)
                {
                case ModelType.CLASSIFICATION:
                    new ClassificationEngine().TrainModel(options);
                    break;

                case ModelType.CLUSTERING:
                    new ClusteringEngine().TrainModel(options);
                    break;

                default:
                    // Make an unhandled enum value visible instead of exiting silently.
                    Console.WriteLine($"Unsupported model type: {options.ModelType}");
                    break;
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }
        }
示例#3
0
        /// <summary>
        /// Serializes the trained model together with its input schema to <c>OutputModelPath</c>
        /// and logs the destination.
        /// </summary>
        /// <param name="trainedModel">The fitted transformer chain to persist.</param>
        /// <param name="schema">Schema of the data the model was trained on.</param>
        /// <param name="options">Trainer options, passed through to the logger.</param>
        protected void SaveModel(ITransformer trainedModel, DataViewSchema schema, TrainerCommandLineOptions options)
        {
            // FileMode.Create replaces any existing file at the target path.
            using (var modelStream = File.Open(OutputModelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
            {
                MlContext.Model.Save(trainedModel, schema, modelStream);
            }

            Logger<TrainerCommandLineOptions>.Debug($"Model saved to {OutputModelPath}", options);
        }
示例#4
0
        /// <summary>
        /// Runs feature extraction over the configured folder, fits a 5-cluster KMeans model on the training
        /// split (80% of the rows), logs clustering metrics for the held-out 20%, and saves the model.
        /// </summary>
        /// <param name="options">Trainer options; forwarded to feature extraction, logging and model saving.</param>
        /// <returns><c>true</c> whenever the method completes without throwing.</returns>
        public override bool TrainModel(TrainerCommandLineOptions options)
        {
            var fileName = FeatureExtractFolder(options);

            var startDate = DateTime.Now;

            // The extracted file is comma separated without a header: label, start string, end string.
            var fullData = MlContext.Data.LoadFromTextFile(path: fileName,
                                                           new[]
            {
                new TextLoader.Column("Label", DataKind.Single, 0),
                new TextLoader.Column(nameof(ClusterData.StartStringData), DataKind.String, 1),
                new TextLoader.Column(nameof(ClusterData.EndStringData), DataKind.String, 2)
            },
                                                           hasHeader: false,
                                                           separatorChar: ',');

            var trainTestData    = MlContext.Data.TrainTestSplit(fullData, testFraction: 0.2);
            var trainingDataView = trainTestData.TrainSet;
            var testingDataView  = trainTestData.TestSet;

            var featuresColumnName = "Features";

            // Both string columns go through the same in-place text pipeline
            // (normalize -> tokenize -> drop stop words -> key-map -> n-grams -> Lp-normalize)
            // before being concatenated into the single feature vector.
            var pipeline = MlContext.Transforms.Text.NormalizeText(nameof(ClusterData.StartStringData))
                           .Append(MlContext.Transforms.Text.TokenizeIntoWords(nameof(ClusterData.StartStringData)))
                           .Append(MlContext.Transforms.Text.RemoveDefaultStopWords(nameof(ClusterData.StartStringData)))
                           .Append(MlContext.Transforms.Conversion.MapValueToKey(nameof(ClusterData.StartStringData)))
                           .Append(MlContext.Transforms.Text.ProduceNgrams(nameof(ClusterData.StartStringData)))
                           .Append(MlContext.Transforms.NormalizeLpNorm(nameof(ClusterData.StartStringData)))
                           .Append(MlContext.Transforms.Text.NormalizeText(nameof(ClusterData.EndStringData))
                                   .Append(MlContext.Transforms.Text.TokenizeIntoWords(nameof(ClusterData.EndStringData)))
                                   .Append(MlContext.Transforms.Text.RemoveDefaultStopWords(nameof(ClusterData.EndStringData)))
                                   .Append(MlContext.Transforms.Conversion.MapValueToKey(nameof(ClusterData.EndStringData)))
                                   .Append(MlContext.Transforms.Text.ProduceNgrams(nameof(ClusterData.EndStringData)))
                                   .Append(MlContext.Transforms.NormalizeLpNorm(nameof(ClusterData.EndStringData)))
                                   .Append(MlContext.Transforms.Concatenate(featuresColumnName, nameof(ClusterData.StartStringData), nameof(ClusterData.EndStringData))));

            var trainer = MlContext.Clustering.Trainers.KMeans(featureColumnName: featuresColumnName, numberOfClusters: 5);

            var trainingPipeline = pipeline.Append(trainer);
            var trainedModel     = trainingPipeline.Fit(trainingDataView);

            Logger<TrainerCommandLineOptions>.Debug($"Model trained in {DateTime.Now.Subtract(startDate).TotalSeconds} seconds", options);

            var predictions = trainedModel.Transform(testingDataView);

            // The feature column is passed so the Davies-Bouldin index can be evaluated as well.
            var metrics = MlContext.Clustering.Evaluate(predictions, scoreColumnName: "Score", featureColumnName: featuresColumnName);

            // Metric name spelling fixed in the log output (was "Davides").
            Logger<TrainerCommandLineOptions>.Debug($"Average Distance: {metrics.AverageDistance} | Davies Bouldin Index: {metrics.DaviesBouldinIndex}", options);

            SaveModel(trainedModel, trainingDataView.Schema, options);

            return true;
        }
示例#5
0
        /// <summary>
        /// Serializes the trained model together with its input schema to <c>OutputModelPath</c>.
        /// </summary>
        /// <param name="trainedModel">The fitted transformer chain to persist.</param>
        /// <param name="schema">Schema of the data the model was trained on.</param>
        /// <param name="options">Trainer options (not read by this method).</param>
        /// <returns>
        /// <c>OutputModelPath</c> when the model was written, or <c>null</c> when saving failed;
        /// the failure is logged rather than rethrown.
        /// </returns>
        protected string SaveModel(ITransformer trainedModel, DataViewSchema schema, TrainerCommandLineOptions options)
        {
            try
            {
                // FileMode.Create replaces any existing file at the target path.
                using (var modelStream = File.Open(OutputModelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
                {
                    MlContext.Model.Save(trainedModel, schema, modelStream);
                }

                return OutputModelPath;
            }
            catch (Exception ex)
            {
                // Best effort by design: callers detect the failure through the null return value.
                Log.Error($"Failure in saving model to {OutputModelPath} due to {ex}");

                return null;
            }
        }
示例#6
0
        /// <summary>
        /// Extracts features from every file in <c>options.FolderOfData</c> in parallel and writes them,
        /// one entry per line, to a new tick-stamped text file under the application base directory.
        /// Also logs a per-classification breakdown and the elapsed extraction time.
        /// </summary>
        /// <param name="options">Trainer options; supplies the data folder and is passed to the logger.</param>
        /// <returns>Full path of the feature file that was written.</returns>
        protected string FeatureExtractFolder(TrainerCommandLineOptions options)
        {
            var fileName = Path.Combine(AppContext.BaseDirectory, $"{DateTime.Now.Ticks}.txt");

            var files = Directory.GetFiles(options.FolderOfData);

            Logger<TrainerCommandLineOptions>.Debug($"{files.Length} Files found for training...", options);

            // Monotonic timer: unlike DateTime.Now differences it is unaffected by wall-clock adjustments.
            var stopWatch = System.Diagnostics.Stopwatch.StartNew();

            // Concurrent queues because the extraction loop below runs on multiple threads.
            var extractions = new ConcurrentQueue<string>();

            var classifications = new ConcurrentQueue<FileGroupType>();

            Parallel.ForEach(files, file =>
            {
                var response = new ClassifierResponseItem(File.ReadAllBytes(file), file, true);

                // Only the second element of the extraction result is used here.
                var (_, output) = FeatureExtraction(response);

                classifications.Enqueue(response.FileGroup);

                extractions.Enqueue(output);
            });

            File.WriteAllText(fileName, string.Join(System.Environment.NewLine, extractions));

            // Count each classification once via its group instead of rescanning the queue per distinct value.
            // files.Length is non-zero whenever at least one group exists, so the division is safe.
            var featureBreakdown = classifications
                                   .GroupBy(classification => classification)
                                   .Select(grouping =>
                                   {
                                       var count      = grouping.Count();
                                       var percentage = Math.Round((double)count / files.Length * 100.0, 0);

                                       return $"{grouping.Key}: {count} ({percentage}%)";
                                   })
                                   .ToList();

            Logger<TrainerCommandLineOptions>.Debug(string.Join("|", featureBreakdown), options);

            Logger<TrainerCommandLineOptions>.Debug($"Feature Extraction took {stopWatch.Elapsed.TotalSeconds} seconds", options);

            return fileName;
        }
示例#7
0
 /// <summary>
 /// Trains a model using the supplied trainer options.
 /// </summary>
 /// <param name="options">Options describing the training run.</param>
 /// <returns>
 /// A success flag. NOTE(review): the overrides shown alongside this declaration always return
 /// <c>true</c>; confirm whether <c>false</c> is ever expected by callers.
 /// </returns>
 public abstract bool TrainModel(TrainerCommandLineOptions options);
示例#8
0
        /// <summary>
        /// Performs one polling cycle: fetches a pending job for this host, marks it started, trains the
        /// requested model and reports the outcome back to the server.
        /// </summary>
        /// <param name="host">Host whose name is used to request work.</param>
        /// <param name="serverURL">URL of the server that hands out and receives work.</param>
        /// <returns>
        /// <c>false</c> when no work was available, the start update failed, or the training data path does
        /// not exist; otherwise the result of the final update call.
        /// </returns>
        public async Task<bool> Run(Hosts host, string serverURL)
        {
            _host = host;

            _serverURL = serverURL;

            CheckPendingSubmissions();

            var workerHandler = new WorkerHandler(_serverURL);

            var work = await workerHandler.GetWorkAsync(_host.Name);

            if (work == null)
            {
                // Wait without blocking the thread; Thread.Sleep would stall it inside an async method.
                await Task.Delay(Constants.LOOP_INTERVAL_MS);

                return false;
            }

            work.Started   = true;
            work.StartTime = DateTime.Now;

            var result = await workerHandler.UpdateWorkAsync(work);

            if (!result)
            {
                await Task.Delay(Constants.LOOP_INTERVAL_MS);

                return false;
            }

            if (!Directory.Exists(work.TrainingDataPath))
            {
                // Nothing to train on: close the job out with a diagnostic message.
                work.Completed     = true;
                work.Debug         = $"Path ({work.TrainingDataPath}) does not exist";
                work.CompletedTime = DateTime.Now;

                result = await workerHandler.UpdateWorkAsync(work);

                if (!result)
                {
                    // Queue the update locally when it could not be delivered.
                    AddToPending(work);
                }

                return false;
            }

            var options = new TrainerCommandLineOptions
            {
                FolderOfData = work.TrainingDataPath,
                LogLevel     = LogLevels.DEBUG
            };

            var (outputFile, metrics) = (string.Empty, string.Empty);

            switch (Enum.Parse<ModelType>(work.ModelType, true))
            {
            case ModelType.CLASSIFICATION:
                (outputFile, metrics) = new ClassificationEngine().TrainModel(options);
                break;

            case ModelType.CLUSTERING:
                (outputFile, metrics) = new ClusteringEngine().TrainModel(options);
                break;
            }

            // The model bytes are attached only when training actually produced a file.
            if (File.Exists(outputFile))
            {
                work.Model = File.ReadAllBytes(outputFile);
            }

            work.ModelEvaluationMetrics = metrics;
            work.Completed     = true;
            work.CompletedTime = DateTime.Now;

            result = await workerHandler.UpdateWorkAsync(work);

            if (result)
            {
                Console.WriteLine($"Successfully trained model and saved to {outputFile}");
            }
            else
            {
                AddToPending(work);
            }

            return result;
        }
        /// <summary>
        /// Parses the trainer command line (<c>--folderofdata</c>, <c>--verbose</c>, <c>--modeltype</c>)
        /// into a <see cref="TrainerCommandLineOptions"/> instance.
        /// </summary>
        /// <param name="args">Raw command-line arguments.</param>
        /// <returns>
        /// The populated options, or <c>null</c> when the handler was not invoked or no folder was supplied.
        /// </returns>
        public static TrainerCommandLineOptions Parse(string[] args)
        {
            TrainerCommandLineOptions options = null;

            Option oVerbose = new Option(
                "--verbose",
                "Enable verbose output",
                new Argument<bool>(defaultValue: false));

            Option oFolder = new Option(
                "--folderofdata",
                "Folder containing data to be parsed to build the model",
                new Argument<string>());

            Option oModelType = new Option(
                "--modeltype",
                "Model Type",
                new Argument<ModelType>());

            var rootCommand = new RootCommand
            {
                Description = "File Trainer builds a model"
            };

            rootCommand.AddOption(oFolder);
            rootCommand.AddOption(oVerbose);
            rootCommand.AddOption(oModelType);

            rootCommand.TreatUnmatchedTokensAsErrors = true;

            rootCommand.Argument.AddValidator(symbolResult =>
            {
                if (symbolResult.Children["--folderofdata"] is null)
                {
                    return "Folder Path is required";
                }

                return null;
            });

            // Handler parameters are bound to options by name, so the first parameter must match
            // "--folderofdata"; the previous name ("folderPath") matched no option and was never populated.
            // NOTE(review): "verbose" is accepted but not applied - LogLevel is always DEBUG; confirm intent.
            rootCommand.Handler = CommandHandler.Create<string, bool, ModelType>((folderOfData, verbose, modelType) =>
            {
                if (string.IsNullOrEmpty(folderOfData))
                {
                    return;
                }

                options = new TrainerCommandLineOptions
                {
                    FolderOfData = folderOfData,
                    LogLevel     = LogLevels.DEBUG,
                    ModelType    = modelType
                };
            });

            // The handler above captures its result into the local before this call returns.
            rootCommand.InvokeAsync(args).Wait();

            return options;
        }
 /// <summary>
 /// Trains a model using the supplied trainer options.
 /// </summary>
 /// <param name="options">Options describing the training run.</param>
 /// <returns>
 /// A tuple of the output file and the metrics text. Callers shown alongside this declaration read
 /// <c>OutputFile</c> as a file path (existence check, then read as bytes) and store <c>Metrics</c> as
 /// the job's evaluation metrics. NOTE(review): the exact metrics format is defined by the overrides.
 /// </returns>
 public abstract (string OutputFile, string Metrics) TrainModel(TrainerCommandLineOptions options);
示例#11
0
        /// <summary>
        /// Performs one polling cycle: fetches a pending job for this worker, marks it started, loads the
        /// feature extractor assembly shipped with the job, trains the model and reports the outcome.
        /// </summary>
        /// <param name="worker">Worker whose name is used to request work.</param>
        /// <param name="config">Configuration supplying the web service URL and registration key.</param>
        /// <returns>
        /// <c>false</c> when no work was available, the start update failed, the training data path does not
        /// exist, or the extractor could not be resolved; otherwise the result of the final update call.
        /// </returns>
        public async Task<bool> Run(Workers worker, Config config)
        {
            _worker = worker;

            _config = config;

            var workerHandler = new WorkerHandler(_config.WebServiceURL, _config.RegistrationKey);

            var work = await workerHandler.GetWorkAsync(_worker.Name);

            if (work == null)
            {
                Log.Debug($"No work or connection issues to {_config.WebServiceURL}, waiting until next interval");

                // Wait without blocking the thread; Thread.Sleep would stall it inside an async method.
                await Task.Delay(Constants.LOOP_INTERVAL_MS);

                return false;
            }

            work.Started   = true;
            work.StartTime = DateTime.Now;

            var result = await workerHandler.UpdateWorkAsync(work);

            if (!result)
            {
                await Task.Delay(Constants.LOOP_INTERVAL_MS);

                return false;
            }

            // Closes the job out with a diagnostic message and reports it (or queues it when the report
            // cannot be delivered). Shared by every pre-training failure so that the message reaches the
            // server; previously the extractor failures set Debug and returned without sending anything.
            async Task<bool> FailAsync(string reason)
            {
                work.Completed     = true;
                work.Debug         = reason;
                work.CompletedTime = DateTime.Now;

                if (!await workerHandler.UpdateWorkAsync(work))
                {
                    AddToPending(work);
                }

                return false;
            }

            if (!Directory.Exists(work.TrainingDataPath))
            {
                return await FailAsync($"Path ({work.TrainingDataPath}) does not exist");
            }

            var options = new TrainerCommandLineOptions
            {
                FolderOfData = work.TrainingDataPath,
                LogLevel     = LogLevels.DEBUG
            };

            // Assembly.Load throws on missing bytes instead of returning null, so the payload has to be
            // checked before the call rather than the result after it.
            if (work.FeatureExtractorBytes == null || work.FeatureExtractorBytes.Length == 0)
            {
                return await FailAsync("Feature Extractor Assembly was not piped to the worker");
            }

            var featureExtractor = Assembly.Load(work.FeatureExtractorBytes);

            // Instantiate each concrete direct subclass of BasePrediction and pick the one whose
            // MODEL_NAME matches the requested model type.
            var extractor = featureExtractor.GetTypes()
                            .Where(a => a.BaseType == typeof(BasePrediction) && !a.IsAbstract)
                            .Select(a => ((BasePrediction)Activator.CreateInstance(a)))
                            .FirstOrDefault(a => a.MODEL_NAME == work.ModelType);

            if (extractor == null)
            {
                return await FailAsync($"Failed to load {work.ModelType} from piped in assembly");
            }

            var (outputFile, metrics) = extractor.TrainModel(options);

            // The model bytes are attached only when training actually produced a file.
            if (File.Exists(outputFile))
            {
                work.Model = File.ReadAllBytes(outputFile);
            }

            work.ModelEvaluationMetrics = metrics;
            work.Completed     = true;
            work.CompletedTime = DateTime.Now;

            result = await workerHandler.UpdateWorkAsync(work);

            if (result)
            {
                Log.Debug($"{work.ID}.{work.Name} - was successfully trained and saved to {outputFile}");

                Console.WriteLine($"Successfully trained model and saved to {outputFile}");
            }
            else
            {
                AddToPending(work);
            }

            return result;
        }