public static ITransformer TrainModel(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline)
        {
            ITransformer model = trainingPipeline.Fit(trainingDataView);

            return(model);
        }
示例#2
0
        public static IEstimator <ITransformer> BuildAndTrainModel(IDataView trainingDataView, IEstimator <ITransformer> pipeline)
        {
            var trainingPipeline = pipeline.Append(_mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features"))
                                   .Append(_mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

            _trainedModel = trainingPipeline.Fit(trainingDataView);
            _predEngine   = _mlContext.Model.CreatePredictionEngine <GitHubIssue, IssuePrediction>(_trainedModel);

            GitHubIssue issue = new GitHubIssue()
            {
                Title       = "WebSockets communication is slow in my machine",
                Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
            };

            var prediction = _predEngine.Predict(issue);

            Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ===============");
            return(trainingPipeline);
        }
示例#3
0
        private static void Evaluate(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline)
        {
            // Cross-Validate with single dataset (since we don't have two datasets, one for training and for evaluate)
            // in order to evaluate and get the model's accuracy metrics
            Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ===============");
            var crossValidationResults = mlContext.BinaryClassification.CrossValidateNonCalibrated(trainingDataView, trainingPipeline, numberOfFolds: 5, labelColumnName: "Category");

            PrintBinaryClassificationFoldsAverageMetrics(crossValidationResults);
        }
示例#4
0
        public CategoricalTransform(TermEstimator term, IEstimator <ITransformer> keyToVector, IDataView input)
        {
            var chain = term.Append(keyToVector);

            _transformer = chain.Fit(input);
        }
示例#5
0
        public static List <float[]> PeekVectorColumnDataInConsole(MLContext mlContext, string columnName, IDataView dataView, IEstimator <ITransformer> pipeline, int numberOfRows = 4)
        {
            string msg = string.Format("Peek data in DataView: : Show {0} rows with just the '{1}' column", numberOfRows, columnName);

            ConsoleWriteHeader(msg);

            var transformer     = pipeline.Fit(dataView);
            var transformedData = transformer.Transform(dataView);

            // Extract the 'Features' column.
            var someColumnData = transformedData.GetColumn <float[]>(columnName)
                                 .Take(numberOfRows).ToList();

            // print to console the peeked rows
            someColumnData.ForEach(row => {
                String concatColumn = String.Empty;
                foreach (float f in row)
                {
                    concatColumn += f.ToString();
                }
                Console.WriteLine(concatColumn);
            });

            return(someColumnData);
        }
示例#6
0
        public static IEstimator <ITransformer> BuildAndTrainModel(IDataView trainingDataView, IEstimator <ITransformer> pipeline)
        {
            var trainingPipeline = pipeline.Append(_mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features"))
                                   .Append(_mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

            _trainedModel = trainingPipeline.Fit(trainingDataView);

            _predEngine = _mlContext.Model.CreatePredictionEngine <Supervised, PurposePrediction>(_trainedModel);

            return(trainingPipeline);
        }
示例#7
0
        // </SnippetDeclareGlobalVariables>

        static void Main(string[] args)
        {
            // Create MLContext to be shared across the model creation workflow objects
            // <SnippetCreateMLContext>
            MLContext mlContext = new MLContext();
            // </SnippetCreateMLContext>

            // Dictionary to encode words as integers.
            // <SnippetCreateLookupMap>
            var lookupMap = mlContext.Data.LoadFromTextFile(Path.Combine(_modelPath, "imdb_word_index.csv"),
                                                            columns: new[]
            {
                new TextLoader.Column("Words", DataKind.String, 0),
                new TextLoader.Column("Ids", DataKind.Int32, 1),
            },
                                                            separatorChar: ','
                                                            );
            // </SnippetCreateLookupMap>

            // The model expects the input feature vector to be a fixed length vector.
            // This action resizes the integer vector to a fixed length vector. If there
            // are less than 600 words in the sentence, the remaining indices will be filled
            // with zeros. If there are more than 600 words in the sentence, then the
            // array is truncated at 600.
            // <SnippetResizeFeatures>
            Action <MovieReview, FixedLengthFeatures> ResizeFeaturesAction = (s, f) =>
            {
                var features = s.VariableLengthFeatures;
                Array.Resize(ref features, FeatureLength);
                f.Features = features;
            };
            // </SnippetResizeFeatures>

            // Load the TensorFlow model.
            // <SnippetLoadTensorFlowModel>
            TensorFlowModel tensorFlowModel = mlContext.Model.LoadTensorFlowModel(_modelPath);
            // </SnippetLoadTensorFlowModel>

            // <SnippetGetModelSchema>
            DataViewSchema schema = tensorFlowModel.GetModelSchema();

            Console.WriteLine(" =============== TensorFlow Model Schema =============== ");
            var featuresType = (VectorDataViewType)schema["Features"].Type;

            Console.WriteLine($"Name: Features, Type: {featuresType.ItemType.RawType}, Size: ({featuresType.Dimensions[0]})");
            var predictionType = (VectorDataViewType)schema["Prediction/Softmax"].Type;

            Console.WriteLine($"Name: Prediction/Softmax, Type: {predictionType.ItemType.RawType}, Size: ({predictionType.Dimensions[0]})");

            // </SnippetGetModelSchema>

            // <SnippetTokenizeIntoWords>
            IEstimator <ITransformer> pipeline =
                // Split the text into individual words
                mlContext.Transforms.Text.TokenizeIntoWords("TokenizedWords", "ReviewText")
                // </SnippetTokenizeIntoWords>

                // <SnippetMapValue>
                // Map each word to an integer value. The array of integer makes up the input features.
                .Append(mlContext.Transforms.Conversion.MapValue("VariableLengthFeatures", lookupMap,
                                                                 lookupMap.Schema["Words"], lookupMap.Schema["Ids"], "TokenizedWords"))
                // </SnippetMapValue>

                // <SnippetCustomMapping>
                // Resize variable length vector to fixed length vector.
                .Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize"))
                // </SnippetCustomMapping>

                // <SnippetScoreTensorFlowModel>
                // Passes the data to TensorFlow for scoring
                .Append(tensorFlowModel.ScoreTensorFlowModel("Prediction/Softmax", "Features"))
                // </SnippetScoreTensorFlowModel>

                // <SnippetCopyColumns>
                // Retrieves the 'Prediction' from TensorFlow and and copies to a column
                .Append(mlContext.Transforms.CopyColumns("Prediction", "Prediction/Softmax"));
            // </SnippetCopyColumns>

            // <SnippetCreateModel>
            // Create an executable model from the estimator pipeline
            IDataView    dataView = mlContext.Data.LoadFromEnumerable(new List <MovieReview>());
            ITransformer model    = pipeline.Fit(dataView);

            // </SnippetCreateModel>

            // <SnippetCallPredictSentiment>
            PredictSentiment(mlContext, model);
            // </SnippetCallPredictSentiment>
        }
示例#8
0
 public GreedySolver(IEstimator estimator)
 {
     this.estimator = estimator;
 }
示例#9
0
        public static IEstimator <ITransformer> BuildAndTrainModel(IDataView trainingDataView, IEstimator <ITransformer> pipeline)
        {
            var trainingPipeline = pipeline
                                   .Append(_mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label, DefaultColumnNames.Features))
                                   .Append(_mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

            _trainedModel = trainingPipeline.Fit(trainingDataView);
            _predEngine   = _trainedModel.CreatePredictionEngine <SentimentData, SentimentPrediction>(_mlContext);

            SentimentData issue = new SentimentData()
            {
                Title       = "WebSockets communication is slow in my machine",
                Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
            };

            var prediction = _predEngine.Predict(issue);

            Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ===============");
            return(trainingPipeline);
        }
示例#10
0
        private static EvaluationResults Evaluate(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline)
        {
            // Cross-Validate with single dataset (since we don't have two datasets, one for training and for evaluate)
            // in order to evaluate and get the model's accuracy metrics
            Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ===============");
            var crossValidationResults = mlContext.Regression.CrossValidate(trainingDataView, trainingPipeline, 10, "Value");

            var results = PrintRegressionFoldsAverageMetrics(crossValidationResults);

            return(results);
        }
示例#11
0
        public static void BuildAndTrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy)
        {
            // Create MLContext to be shared across the model creation workflow objects
            // Set a random seed for repeatable/deterministic results across multiple trainings.
            var mlContext = new MLContext(seed: 0);

            // STEP 1: Common data loading configuration
            var textLoader       = GitHubLabelerTextLoaderFactory.CreateTextLoader(mlContext);
            var trainingDataView = textLoader.Read(DataSetLocation);

            // STEP 2: Common data process configuration with pipeline data transformations
            var dataProcessPipeline = GitHubLabelerDataProcessPipelineFactory.CreateDataProcessPipeline(mlContext);

            // (OPTIONAL) Peek data (such as 2 records) in training DataView after applying the ProcessPipeline's transformations into "Features"
            Common.ConsoleHelper.PeekDataViewInConsole <GitHubIssue>(mlContext, trainingDataView, dataProcessPipeline, 2);
            //Common.ConsoleHelper.PeekVectorColumnDataInConsole(mlContext, "Features", trainingDataView, dataProcessPipeline, 2);

            // STEP 3: Create the selected training algorithm/trainer
            IEstimator <ITransformer> trainer = null;

            switch (selectedStrategy)
            {
            case MyTrainerStrategy.SdcaMultiClassTrainer:
                trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label,
                                                                                                     DefaultColumnNames.Features);
                break;

            case MyTrainerStrategy.OVAAveragedPerceptronTrainer:
            {
                // Create a binary classification trainer.
                var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron(DefaultColumnNames.Label,
                                                                                                                 DefaultColumnNames.Features,
                                                                                                                 numIterations: 10);
                // Compose an OVA (One-Versus-All) trainer with the BinaryTrainer.
                // In this strategy, a binary classification algorithm is used to train one classifier for each class, "
                // which distinguishes that class from all other classes. Prediction is then performed by running these binary classifiers, "
                // and choosing the prediction with the highest confidence score.
                trainer = new Ova(mlContext, averagedPerceptronBinaryTrainer);
                break;
            }

            default:
                break;
            }

            //Set the trainer/algorithm
            var modelBuilder = new Common.ModelBuilder <GitHubIssue, GitHubIssuePrediction>(mlContext, dataProcessPipeline);

            modelBuilder.AddTrainer(trainer);
            modelBuilder.AddEstimator(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

            // STEP 4: Cross-Validate with single dataset (since we don't have two datasets, one for training and for evaluate)
            // in order to evaluate and get the model's accuracy metrics
            Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ===============");
            var crossValResults = modelBuilder.CrossValidateAndEvaluateMulticlassClassificationModel(trainingDataView, 6, "Label");

            ConsoleHelper.PrintMulticlassClassificationFoldsAverageMetrics(trainer.ToString(), crossValResults);

            // STEP 5: Train the model fitting to the DataSet
            Console.WriteLine("=============== Training the model ===============");
            modelBuilder.Train(trainingDataView);

            // (OPTIONAL) Try/test a single prediction with the "just-trained model" (Before saving the model)
            GitHubIssue issue = new GitHubIssue()
            {
                ID = "Any-ID", Title = "WebSockets communication is slow in my machine", Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
            };
            var modelScorer = new ModelScorer <GitHubIssue, GitHubIssuePrediction>(mlContext, modelBuilder.TrainedModel);
            var prediction  = modelScorer.PredictSingle(issue);

            Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ===============");
            //

            // STEP 6: Save/persist the trained model to a .ZIP file
            Console.WriteLine("=============== Saving the model to a file ===============");
            modelBuilder.SaveModelAsFile(ModelPath);

            Common.ConsoleHelper.ConsoleWriteHeader("Training process finalized");
        }
        public ITransformer TrainFeaturizeText()
        {
            var textColumns = new List <string>();

            for (int i = 0; i < 20; i++) // Only load first 20 columns
            {
                textColumns.Add($"Column{i}");
            }

            var featurizers = new List <TextFeaturizingEstimator>();

            foreach (var textColumn in textColumns)
            {
                var featurizer = _mlContext.Transforms.Text.FeaturizeText(textColumn, new TextFeaturizingEstimator.Options()
                {
                    CharFeatureExtractor = null,
                    WordFeatureExtractor = new WordBagEstimator.Options()
                    {
                        NgramLength        = 2,
                        MaximumNgramsCount = new int[] { 200000 }
                    }
                });
                featurizers.Add(featurizer);
            }

            IEstimator <ITransformer> pipeline = featurizers.First();

            foreach (var featurizer in featurizers.Skip(1))
            {
                pipeline = pipeline.Append(featurizer);
            }

            var model = pipeline.Fit(_dataset);

            // BENCHMARK OUTPUT
            // * Summary *

            //BenchmarkDotNet = v0.11.3, OS = Windows 10.0.18363
            //Intel Xeon W - 2133 CPU 3.60GHz, 1 CPU, 12 logical and 6 physical cores
            //.NET Core SDK = 3.0.100
            //[Host]     : .NET Core 2.1.13(CoreCLR 4.6.28008.01, CoreFX 4.6.28008.01), 64bit RyuJIT
            //Job - KDKCUJ : .NET Core 2.1.13(CoreCLR 4.6.28008.01, CoreFX 4.6.28008.01), 64bit RyuJIT

            //Arguments =/ p:Configuration = Release  Toolchain = netcoreapp2.1  IterationCount = 1
            //LaunchCount = 3  MaxIterationCount = 20  RunStrategy = ColdStart
            //UnrollFactor = 1  WarmupCount = 1

            //             Method | Mean     | Error    | StdDev    | Extra Metric  | Gen 0 / 1k Op | Gen 1 / 1k Op | Gen 2 / 1k Op | Allocated Memory / Op |
            //------------------- | --------:| --------:| ---------:| -------------:| -------------:| ------------: | ------------: | --------------------: |
            // TrainFeaturizeText | 17.00 s  | 6.337 s  | 0.3474 s  | -             | 1949000.0000  | 721000.0000   | 36000.0000    | 315.48 MB             |

            //// * Legends *
            //  Mean                : Arithmetic mean of all measurements
            //  Error               : Half of 99.9 % confidence interval
            //  StdDev              : Standard deviation of all measurements
            //  Extra Metric: Value of the provided extra metric
            //  Gen 0 / 1k Op         : GC Generation 0 collects per 1k Operations
            //  Gen 1 / 1k Op         : GC Generation 1 collects per 1k Operations
            //  Gen 2 / 1k Op         : GC Generation 2 collects per 1k Operations
            //  Allocated Memory/ Op : Allocated memory per single operation(managed only, inclusive, 1KB = 1024B)
            //  1 s: 1 Second(1 sec)

            //// * Diagnostic Output - MemoryDiagnoser *
            //// ***** BenchmarkRunner: End *****
            //  Run time: 00:01:52(112.92 sec), executed benchmarks: 1

            //// * Artifacts cleanup *
            //  Global total time: 00:01:59(119.89 sec), executed benchmarks: 1

            return(model);
        }
示例#13
0
        private void MixMatch(string dataPath)
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            var mlContext = new MLContext();

            // Read the data as an IDataView.
            // First, we define the reader: specify the data columns and where to find them in the text file.
            var reader = mlContext.Data.CreateTextReader(ctx => (
                                                             // The four features of the Iris dataset.
                                                             SepalLength: ctx.LoadFloat(0),
                                                             SepalWidth: ctx.LoadFloat(1),
                                                             PetalLength: ctx.LoadFloat(2),
                                                             PetalWidth: ctx.LoadFloat(3),
                                                             // Label: kind of iris.
                                                             Label: ctx.LoadText(4)
                                                             ),
                                                         // Default separator is tab, but the dataset has comma.
                                                         separator: ',');

            // Read the data.
            var data = reader.Read(dataPath);

            // Build the pre-processing pipeline.
            var learningPipeline = reader.MakeNewEstimator()
                                   .Append(r => (
                                               // Convert string label to a key.
                                               Label: r.Label.ToKey(),
                                               // Concatenate all the features together into one column 'Features'.
                                               Features: r.SepalLength.ConcatWith(r.SepalWidth, r.PetalLength, r.PetalWidth)));

            // Now, at the time of writing, there is no static pipeline for OVA (one-versus-all). So, let's
            // append the OVA learner to the dynamic pipeline.
            IEstimator <ITransformer> dynamicPipe = learningPipeline.AsDynamic;

            // Create a binary classification trainer.
            var binaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron("Label", "Features");

            // Append the OVA learner to the pipeline.
            dynamicPipe = dynamicPipe.Append(new Ova(mlContext, binaryTrainer));

            // At this point, we have a choice. We could continue working with the dynamically-typed pipeline, and
            // ultimately call dynamicPipe.Fit(data.AsDynamic) to get the model, or we could go back into the static world.
            // Here's how we go back to the static pipeline:
            var staticFinalPipe = dynamicPipe.AssertStatic(mlContext,
                                                           // Declare the shape of the input. As you can see, it's identical to the shape of the reader:
                                                           // four float features and a string label.
                                                           c => (
                                                               SepalLength: c.R4.Scalar,
                                                               SepalWidth: c.R4.Scalar,
                                                               PetalLength: c.R4.Scalar,
                                                               PetalWidth: c.R4.Scalar,
                                                               Label: c.Text.Scalar),
                                                           // Declare the shape of the output (or a relevant subset of it).
                                                           // In our case, we care only about the predicted label column (a key type), and scores (vector of floats).
                                                           c => (
                                                               Score: c.R4.Vector,
                                                               // Predicted label is a key backed by uint, with text values (since original labels are text).
                                                               PredictedLabel: c.KeyU4.TextValues.Scalar))
                                  // Convert the predicted label from key back to the original string value.
                                  .Append(r => r.PredictedLabel.ToValue());

            // Train the model in a statically typed way.
            var model = staticFinalPipe.Fit(data);

            // And here is how we could've stayed in the dynamic pipeline and train that way.
            dynamicPipe = dynamicPipe.Append(new KeyToValueMappingEstimator(mlContext, "PredictedLabel"));
            var dynamicModel = dynamicPipe.Fit(data.AsDynamic);

            // Now 'dynamicModel', and 'model.AsDynamic' are equivalent.
        }
        private static void Evaluate(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline)
        {
            var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(trainingDataView, trainingPipeline, numberOfFolds: 5, labelColumnName: "0");

            PrintMulticlassClassificationFoldsAverageMetrics(crossValidationResults);
        }
示例#15
0
 public BaseTaskCreator(IEnumerable <IData> data, IEstimator estimator)
 {
     _data      = data;
     _estimator = estimator;
     PrepareTasks();
 }
示例#16
0
        public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] columns)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(ValueToKeyMappingEstimator));
            _hash = new HashingEstimator(_host, columns.Select(x => x.HashInfo).ToArray());
            using (var ch = _host.Start(nameof(OneHotHashEncodingEstimator)))
            {
                var binaryCols = new List <(string input, string output)>();
                var cols       = new List <(string input, string output, bool bag)>();
                for (int i = 0; i < columns.Length; i++)
                {
                    var column = columns[i];
                    OneHotEncodingTransformer.OutputKind kind = columns[i].OutputKind;
                    switch (kind)
                    {
                    default:
                        throw _host.ExceptUserArg(nameof(column.OutputKind));

                    case OneHotEncodingTransformer.OutputKind.Key:
                        continue;

                    case OneHotEncodingTransformer.OutputKind.Bin:
                        if ((column.HashInfo.InvertHash) != 0)
                        {
                            ch.Warning("Invert hashing is being used with binary encoding.");
                        }
                        binaryCols.Add((column.HashInfo.Output, column.HashInfo.Output));
                        break;

                    case OneHotEncodingTransformer.OutputKind.Ind:
                        cols.Add((column.HashInfo.Output, column.HashInfo.Output, false));
                        break;

                    case OneHotEncodingTransformer.OutputKind.Bag:
                        cols.Add((column.HashInfo.Output, column.HashInfo.Output, true));
                        break;
                    }
                }
                IEstimator <ITransformer> toBinVector = null;
                IEstimator <ITransformer> toVector    = null;
                if (binaryCols.Count > 0)
                {
                    toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryCols.Select(x => new KeyToBinaryVectorMappingTransformer.ColumnInfo(x.input, x.output)).ToArray());
                }
                if (cols.Count > 0)
                {
                    toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingTransformer.ColumnInfo(x.input, x.output, x.bag)).ToArray());
                }

                if (toBinVector != null && toVector != null)
                {
                    _toSomething = toVector.Append(toBinVector);
                }
                else
                {
                    if (toBinVector != null)
                    {
                        _toSomething = toBinVector;
                    }
                    else
                    {
                        _toSomething = toVector;
                    }
                }
            }
        }
示例#17
0
 private static ITransformer Train(IDataView trainingDataView, IEstimator <ITransformer> pipeLine)
 {
     // Train your model based on the data set
     return(pipeLine.Fit(trainingDataView));
 }
示例#18
0
        /// <summary>
        /// A base template of regression trainer which contains pre-processing likes OHE,PCA with any choosing algorithm.
        /// </summary>
        /// <typeparam name="TType">Type of training data.</typeparam>
        /// <typeparam name="TTrainer">Type of trainer algorithm.</typeparam>
        /// <param name="context">Microsoft.ML context.</param>
        /// <param name="trainDataset">Training dataset.</param>
        /// <param name="estimator">Algorithm estimator.</param>
        /// <returns>Model of training datatype from given estimator.</returns>
        private static TransformerChain <TTrainer> RegressionTrainerTemplate <TType, TTrainer>(this MLContext context, IEnumerable <TType> trainDataset, IEstimator <TTrainer> estimator)
            where TType : class, new()
            where TTrainer : class, ITransformer
        {
            var type            = typeof(TType);
            var labelColumnName = Preprocessing.LabelColumn(type.GetProperties()).Name;
            var properties      = Preprocessing.ExcludeColumns(type.GetProperties());

            var preprocessor   = context.OneHotEncoding(properties);
            var trainDataframe = context.Data.LoadFromEnumerable(trainDataset);

            var pipeline = context.Transforms.CopyColumns(outputColumnName: "Label", inputColumnName: labelColumnName)
                           .Append(preprocessor.OneHotEncodingEstimator)
                           .Append(context.Transforms.Concatenate("Features", preprocessor.CombinedFeatures.ToArray()))
                           .Append(context.Transforms.ProjectToPrincipalComponents(outputColumnName: "PCAFeatures", inputColumnName: "Features", rank: 2))
                           .Append(estimator);
            var model = pipeline.Fit(trainDataframe);

            return(model);
        }
示例#19
0
        public CategoricalEstimator(IHostEnvironment env, ColumnInfo[] columns,
                                    string file = null, string termsColumn = null,
                                    IComponentFactory <IMultiStreamSource, IDataLoader> loaderFactory = null)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(nameof(TermEstimator));
            _term = new TermEstimator(_host, columns, file, termsColumn, loaderFactory);
            var binaryCols = new List <(string input, string output)>();
            var cols       = new List <(string input, string output, bool bag)>();

            for (int i = 0; i < columns.Length; i++)
            {
                var column = columns[i];
                CategoricalTransform.OutputKind kind = columns[i].OutputKind;
                switch (kind)
                {
                default:
                    throw _host.ExceptUserArg(nameof(column.OutputKind));

                case CategoricalTransform.OutputKind.Key:
                    continue;

                case CategoricalTransform.OutputKind.Bin:
                    binaryCols.Add((column.Output, column.Output));
                    break;

                case CategoricalTransform.OutputKind.Ind:
                    cols.Add((column.Output, column.Output, false));
                    break;

                case CategoricalTransform.OutputKind.Bag:
                    cols.Add((column.Output, column.Output, true));
                    break;
                }
            }
            IEstimator <ITransformer> toBinVector = null;
            IEstimator <ITransformer> toVector    = null;

            if (binaryCols.Count > 0)
            {
                toBinVector = new KeyToBinaryVectorEstimator(_host, binaryCols.Select(x => new KeyToBinaryVectorTransform.ColumnInfo(x.input, x.output)).ToArray());
            }
            if (cols.Count > 0)
            {
                toVector = new KeyToVectorEstimator(_host, cols.Select(x => new KeyToVectorTransform.ColumnInfo(x.input, x.output, x.bag)).ToArray());
            }

            if (toBinVector != null && toVector != null)
            {
                _toSomething = toVector.Append(toBinVector);
            }
            else
            {
                if (toBinVector != null)
                {
                    _toSomething = toBinVector;
                }
                else
                {
                    _toSomething = toVector;
                }
            }
        }
示例#20
0
 public EstimatorChain <TNewTrans> Append <TNewTrans>(IEstimator <TNewTrans> estimator, TransformerScope scope = TransformerScope.Everything)
     where TNewTrans : class, ITransformer
 {
     Contracts.CheckValue(estimator, nameof(estimator));
     return(new EstimatorChain <TNewTrans>(_host, _estimators.AppendElement(estimator), _scopes.AppendElement(scope), _needCacheAfter.AppendElement(false)));
 }
        public static void BuildAndTrainModel(string DataSetLocation, string ModelPath, MyTrainerStrategy selectedStrategy)
        {
            // Create MLContext to be shared across the model creation workflow objects
            // Set a random seed for repeatable/deterministic results across multiple trainings.
            var mlContext = new MLContext(seed: 1);

            // STEP 1: Common data loading configuration
            var trainingDataView = mlContext.Data.ReadFromTextFile <GitHubIssue>(DataSetLocation, hasHeader: true, separatorChar: '\t', supportSparse: false);

            // STEP 2: Common data process configuration with pipeline data transformations
            var dataProcessPipeline = mlContext.Transforms.Conversion.MapValueToKey(outputColumnName: DefaultColumnNames.Label, inputColumnName: nameof(GitHubIssue.Area))
                                      .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "TitleFeaturized", inputColumnName: nameof(GitHubIssue.Title)))
                                      .Append(mlContext.Transforms.Text.FeaturizeText(outputColumnName: "DescriptionFeaturized", inputColumnName: nameof(GitHubIssue.Description)))
                                      .Append(mlContext.Transforms.Concatenate(outputColumnName: DefaultColumnNames.Features, "TitleFeaturized", "DescriptionFeaturized"))
                                      .AppendCacheCheckpoint(mlContext);

            // Use in-memory cache for small/medium datasets to lower training time.
            // Do NOT use it (remove .AppendCacheCheckpoint()) when handling very large datasets.

            // (OPTIONAL) Peek data (such as 2 records) in training DataView after applying the ProcessPipeline's transformations into "Features"
            Common.ConsoleHelper.PeekDataViewInConsole(mlContext, trainingDataView, dataProcessPipeline, 2);

            // STEP 3: Create the selected training algorithm/trainer
            IEstimator <ITransformer> trainer = null;

            switch (selectedStrategy)
            {
            case MyTrainerStrategy.SdcaMultiClassTrainer:
                trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(DefaultColumnNames.Label,
                                                                                                     DefaultColumnNames.Features);
                break;

            case MyTrainerStrategy.OVAAveragedPerceptronTrainer:
            {
                // Create a binary classification trainer.
                var averagedPerceptronBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron(DefaultColumnNames.Label,
                                                                                                                 DefaultColumnNames.Features,
                                                                                                                 numIterations: 10);
                // Compose an OVA (One-Versus-All) trainer with the BinaryTrainer.
                // In this strategy, a binary classification algorithm is used to train one classifier for each class, "
                // which distinguishes that class from all other classes. Prediction is then performed by running these binary classifiers, "
                // and choosing the prediction with the highest confidence score.
                trainer = mlContext.MulticlassClassification.Trainers.OneVersusAll(averagedPerceptronBinaryTrainer);

                break;
            }

            default:
                break;
            }

            //Set the trainer/algorithm and map label to value (original readable state)
            var trainingPipeline = dataProcessPipeline.Append(trainer)
                                   .Append(mlContext.Transforms.Conversion.MapKeyToValue(DefaultColumnNames.PredictedLabel));

            // STEP 4: Cross-Validate with single dataset (since we don't have two datasets, one for training and for evaluate)
            // in order to evaluate and get the model's accuracy metrics

            Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ===============");

            //Measure cross-validation time
            var watchCrossValTime = System.Diagnostics.Stopwatch.StartNew();

            var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(data: trainingDataView, estimator: trainingPipeline, numFolds: 6, labelColumn: DefaultColumnNames.Label);

            //Stop measuring time
            watchCrossValTime.Stop();
            long elapsedMs = watchCrossValTime.ElapsedMilliseconds;

            Console.WriteLine($"Time Cross-Validating: {elapsedMs} miliSecs");

            ConsoleHelper.PrintMulticlassClassificationFoldsAverageMetrics(trainer.ToString(), crossValidationResults);

            // STEP 5: Train the model fitting to the DataSet
            Console.WriteLine("=============== Training the model ===============");

            //Measure training time
            var watch = System.Diagnostics.Stopwatch.StartNew();

            var trainedModel = trainingPipeline.Fit(trainingDataView);

            //Stop measuring time
            watch.Stop();
            long elapsedCrossValMs = watch.ElapsedMilliseconds;

            Console.WriteLine($"Time Training the model: {elapsedCrossValMs} miliSecs");

            // (OPTIONAL) Try/test a single prediction with the "just-trained model" (Before saving the model)
            GitHubIssue issue = new GitHubIssue()
            {
                ID = "Any-ID", Title = "WebSockets communication is slow in my machine", Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
            };
            // Create prediction engine related to the loaded trained model
            var predEngine = trainedModel.CreatePredictionEngine <GitHubIssue, GitHubIssuePrediction>(mlContext);
            //Score
            var prediction = predEngine.Predict(issue);

            Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {prediction.Area} ===============");
            //

            // STEP 6: Save/persist the trained model to a .ZIP file
            Console.WriteLine("=============== Saving the model to a file ===============");
            using (var fs = new FileStream(ModelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
                mlContext.Model.Save(trainedModel, fs);

            Common.ConsoleHelper.ConsoleWriteHeader("Training process finalized");
        }
 public SuggestedTransform(PipelineNode pipelineNode, IEstimator <ITransformer> estimator)
 {
     PipelineNode = pipelineNode;
     Estimator    = estimator;
 }
示例#23
0
        public static void PeekDataViewInConsole(MLContext mlContext, IDataView dataView, IEstimator <ITransformer> pipeline, int numberOfRows = 4)
        {
            string msg = string.Format("Peek data in DataView: Showing {0} rows with the columns", numberOfRows.ToString());

            ConsoleWriteHeader(msg);

            //https://github.com/dotnet/machinelearning/blob/master/docs/code/MlNetCookBook.md#how-do-i-look-at-the-intermediate-data
            var transformer     = pipeline.Fit(dataView);
            var transformedData = transformer.Transform(dataView);

            // 'transformedData' is a 'promise' of data, lazy-loading. call Preview
            //and iterate through the returned collection from preview.

            var preViewTransformedData = transformedData.Preview(maxRows: numberOfRows);

            foreach (var row in preViewTransformedData.RowView)
            {
                var    ColumnCollection = row.Values;
                string lineToPrint      = "Row--> ";
                foreach (KeyValuePair <string, object> column in ColumnCollection)
                {
                    lineToPrint += $"| {column.Key}:{column.Value}";
                }
                Console.WriteLine(lineToPrint + "\n");
            }
        }
示例#24
0
        private static void Evaluate(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline)
        {
            Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ===============");
            var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(trainingDataView, trainingPipeline, numberOfFolds: 5, labelColumnName: "Area");

            PrintMulticlassClassificationFoldsAverageMetrics(crossValidationResults);
        }
 private static void Evaluate(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline)
 {
     // Cross-Validate with single dataset (since we don't have two datasets, one for training and for evaluate)
     // in order to evaluate and get the model's accuracy metrics
     Console.WriteLine("=============== Cross-validating to get model's accuracy metrics ===============");
 }
示例#26
0
 private static IEstimator <ITransformer> BuildTrainingPipeline(MLContext mlContext, IEstimator <ITransformer> dataProcessingPipeline)
 {
     return(dataProcessingPipeline.Append(mlContext.BinaryClassification.Trainers.FastTree(new FastTreeBinaryTrainer.Options()
     {
         NumberOfLeaves = 10, NumberOfTrees = 50, LabelColumnName = "isFraud", FeatureColumnName = "Features"
     })));
 }
示例#27
0
        public static ITransformer TrainModel(MLContext mlContext, IDataView trainingDataView, IEstimator <ITransformer> trainingPipeline)
        {
            Console.WriteLine("=============== Training  model ===============");

            ITransformer model = trainingPipeline.Fit(trainingDataView);

            Console.WriteLine("=============== End of training process ===============");
            return(model);
        }
        /// <summary>
        /// Produces the estimator. Note that this is made out of <see cref="ReconcileCore(IHostEnvironment, string[])"/>'s
        /// return value, plus whatever usages of <see cref="CopyColumnsEstimator"/> are necessary to avoid collisions with
        /// the output names fed to the constructor. This class provides the implementation, and subclasses should instead
        /// override <see cref="ReconcileCore(IHostEnvironment, string[])"/>.
        /// </summary>
        public sealed override IEstimator <ITransformer> Reconcile(IHostEnvironment env,
                                                                   PipelineColumn[] toOutput,
                                                                   IReadOnlyDictionary <PipelineColumn, string> inputNames,
                                                                   IReadOnlyDictionary <PipelineColumn, string> outputNames,
                                                                   IReadOnlyCollection <string> usedNames)
        {
            Contracts.AssertValue(env);
            env.AssertValue(toOutput);
            env.AssertValue(inputNames);
            env.AssertValue(outputNames);
            env.AssertValue(usedNames);

            // The reconciler should have been called with all the input columns having names.
            env.Assert(inputNames.Keys.All(_inputs.Contains) && _inputs.All(inputNames.Keys.Contains));
            // The output name map should contain only outputs as their keys. Yet, it is possible not all
            // outputs will be required in which case these will both be subsets of those outputs indicated
            // at construction.
            env.Assert(outputNames.Keys.All(Outputs.Contains));
            env.Assert(toOutput.All(Outputs.Contains));
            env.Assert(Outputs.Count() == _outputNames.Length);

            IEstimator <ITransformer> result = null;

            // In the case where we have names used that conflict with the fixed output names, we must have some
            // renaming logic.
            var collisions = new HashSet <string>(_outputNames);

            collisions.IntersectWith(usedNames);
            var old2New = new Dictionary <string, string>();

            if (collisions.Count > 0)
            {
                // First get the old names to some temporary names.
                int tempNum = 0;
                foreach (var c in collisions)
                {
                    old2New[c] = $"#TrainTemp{tempNum++}";
                }
                // In the case where the input names have anything that is used, we must reconstitute the input mapping.
                if (inputNames.Values.Any(old2New.ContainsKey))
                {
                    var newInputNames = new Dictionary <PipelineColumn, string>();
                    foreach (var p in inputNames)
                    {
                        newInputNames[p.Key] = old2New.ContainsKey(p.Value) ? old2New[p.Value] : p.Value;
                    }
                    inputNames = newInputNames;
                }
                result = new CopyColumnsEstimator(env, old2New.Select(p => (p.Key, p.Value)).ToArray());
            }

            // Map the inputs to the names.
            string[] mappedInputNames = _inputs.Select(c => inputNames[c]).ToArray();
            // Finally produce the trainer.
            var trainerEst = ReconcileCore(env, mappedInputNames);

            if (result == null)
            {
                result = trainerEst;
            }
            else
            {
                result = result.Append(trainerEst);
            }

            // OK. Now handle the final renamings from the fixed names, to the desired names, in the case
            // where the output was desired, and a renaming is even necessary.
            var toRename = new List <(string source, string name)>();

            foreach ((PipelineColumn outCol, string fixedName) in Outputs.Zip(_outputNames, (c, n) => (c, n)))
            {
                if (outputNames.TryGetValue(outCol, out string desiredName))
                {
                    toRename.Add((fixedName, desiredName));
                }
                else
                {
                    env.Assert(!toOutput.Contains(outCol));
                }
            }
            // Finally if applicable handle the renaming back from the temp names to the original names.
            foreach (var p in old2New)
            {
                toRename.Add((p.Value, p.Key));
            }
            if (toRename.Count > 0)
            {
                result = result.Append(new CopyColumnsEstimator(env, toRename.ToArray()));
            }

            return(result);
        }
示例#29
0
        GeneralFunctionAnalyzer <TIn, TDelegateInput, TOutShape>(
            IHostEnvironment env,
            IChannel ch,
            TDelegateInput input,
            ReaderReconciler <TIn> baseReconciler,
            Func <TDelegateInput, TOutShape> mapper,
            out IEstimator <ITransformer> estimator,
            Func <PipelineColumn, string> inputNameFunction)
        {
            Contracts.CheckValue(mapper, nameof(mapper));

            var method = mapper.Method;
            var output = mapper(input);

            KeyValuePair <string, PipelineColumn>[] outPairs = StaticPipeInternalUtils.GetNamesValues(output, method.ReturnParameter);

            // Map where the key depends on the set of things in the value. The value contains the yet unresolved dependencies.
            var keyDependsOn = new Dictionary <PipelineColumn, HashSet <PipelineColumn> >();
            // Map where the set of things in the value depend on the key.
            var dependsOnKey = new Dictionary <PipelineColumn, HashSet <PipelineColumn> >();
            // The set of columns detected with zero dependencies.
            var zeroDependencies = new List <PipelineColumn>();

            // First we build up the two structures above, using a queue and visiting from the outputs up.
            var toVisit = new Queue <PipelineColumn>(outPairs.Select(p => p.Value));

            while (toVisit.Count > 0)
            {
                var col = toVisit.Dequeue();
                ch.CheckParam(col != null, nameof(mapper), "The delegate seems to have null columns returned somewhere in the pipe.");
                if (keyDependsOn.ContainsKey(col))
                {
                    continue; // Already visited.
                }
                var dependsOn = new HashSet <PipelineColumn>();
                foreach (var dep in col.Dependencies ?? Enumerable.Empty <PipelineColumn>())
                {
                    dependsOn.Add(dep);
                    if (!dependsOnKey.TryGetValue(dep, out var dependsOnDep))
                    {
                        dependsOnKey[dep] = dependsOnDep = new HashSet <PipelineColumn>();
                        toVisit.Enqueue(dep);
                    }
                    dependsOnDep.Add(col);
                }
                keyDependsOn[col] = dependsOn;
                if (dependsOn.Count == 0)
                {
                    zeroDependencies.Add(col);
                }
            }

            // Get the base input columns.
            var baseInputs = keyDependsOn.Select(p => p.Key).Where(col => col.ReconcilerObj == baseReconciler).ToArray();

            // The columns that utilize the base reconciler should have no dependencies. This could only happen if
            // the caller of this function has introduced a situation whereby they are claiming they can reconcile
            // to a data-reader object but still have input data dependencies, which does not make sense and
            // indicates that there is a bug in that component code. Unfortunately we can only detect that condition,
            // not determine exactly how it arose, but we can still do so to indicate to the user that there is a
            // problem somewhere in the stack.
            ch.CheckParam(baseInputs.All(col => keyDependsOn[col].Count == 0),
                          nameof(input), "Bug detected where column producing object was yielding columns with dependencies.");

            // This holds the mappings of columns to names and back. Note that while the same column could be used on
            // the *output*, e.g., you could hypothetically have `(a: r.Foo, b: r.Foo)`, we treat that as the last thing
            // that is done.
            var nameMap = new BidirectionalDictionary <string, PipelineColumn>();

            // Check to see if we have any set of initial names. This is important in the case where we are mapping
            // in an input data view.
            foreach (var col in baseInputs)
            {
                string inputName = inputNameFunction(col);
                if (inputName != null)
                {
                    ch.Assert(!nameMap.ContainsKey(col));
                    ch.Assert(!nameMap.ContainsKey(inputName));
                    nameMap[col] = inputName;

                    ch.Trace($"Using input with name {inputName}.");
                }
            }

            estimator = null;
            var toCopy = new List <(string src, string dst)>();

            int tempNum = 0;

            // For all outputs, get potential name collisions with used inputs. Resolve by assigning the input a temporary name.
            foreach (var p in outPairs)
            {
                // If the name for the output is already used by one of the inputs, and this output column does not
                // happen to have the same name, then we need to rename that input to keep it available.
                if (nameMap.TryGetValue(p.Key, out var inputCol) && p.Value != inputCol)
                {
                    ch.Assert(baseInputs.Contains(inputCol));
                    string tempName = $"#Temp_{tempNum++}";
                    ch.Trace($"Input/output name collision: Renaming '{p.Key}' to '{tempName}'.");
                    toCopy.Add((p.Key, tempName));
                    nameMap[tempName] = nameMap[p.Key];
                    ch.Assert(!nameMap.ContainsKey(p.Key));
                }
                // If we already have a name for this output column, maybe it is used elsewhere. (This can happen when
                // the only thing done with an input is we rename it, or output it twice, or something like this.) In
                // this case it is most appropriate to delay renaming till after all other processing has been done in
                // that case. But otherwise we may as well just take the name.
                if (!nameMap.ContainsKey(p.Value))
                {
                    nameMap[p.Key] = p.Value;
                }
            }

            // If any renamings were necessary, create the CopyColumns estimator.
            if (toCopy.Count > 0)
            {
                estimator = new CopyColumnsEstimator(env, toCopy.ToArray());
            }

            // First clear the inputs from zero-dependencies yet to be resolved.
            foreach (var col in baseInputs)
            {
                ch.Assert(zeroDependencies.Contains(col));
                ch.Assert(col.ReconcilerObj == baseReconciler);

                zeroDependencies.Remove(col); // Make more efficient...
                if (!dependsOnKey.TryGetValue(col, out var depends))
                {
                    continue;
                }
                // If any of these base inputs do not have names because, for example, they do not directly appear
                // in the outputs and otherwise do not have names, assign them a name.
                if (!nameMap.ContainsKey(col))
                {
                    nameMap[col] = $"Temp_{tempNum++}";
                }

                foreach (var depender in depends)
                {
                    var dependencies = keyDependsOn[depender];
                    ch.Assert(dependencies.Contains(col));
                    dependencies.Remove(col);
                    if (dependencies.Count == 0)
                    {
                        zeroDependencies.Add(depender);
                    }
                }
                dependsOnKey.Remove(col);
            }

            // Call the reconciler to get the base reader estimator.
            var readerEstimator = baseReconciler.Reconcile(env, baseInputs, nameMap.AsOther(baseInputs));

            ch.AssertValueOrNull(readerEstimator);

            // Next we iteratively find those columns with zero dependencies, "create" them, and if anything depends on
            // these add them to the collection of zero dependencies, etc. etc.
            while (zeroDependencies.Count > 0)
            {
                // All columns with the same reconciler can be transformed together.

                // Note that the following policy of just taking the first group is not optimal. So for example, we
                // could have three columns, (a, b, c). If we had the output (a.X(), b.X() c.Y().X()), then maybe we'd
                // reconcile a.X() and b.X() together, then reconcile c.Y(), then reconcile c.Y().X() alone. Whereas, we
                // could have reconciled c.Y() first, then reconciled a.X(), b.X(), and c.Y().X() together.
                var group = zeroDependencies.GroupBy(p => p.ReconcilerObj).First();
                // Beyond that first group that *might* be a data reader reconciler, all subsequent operations will
                // be on where the data is already loaded and so accept data as an input, that is, they should produce
                // an estimator. If this is not the case something seriously wonky is going on, most probably that the
                // user tried to use a column from another source. If this is detected we can produce a sensible error
                // message to tell them not to do this.
                if (!(group.Key is EstimatorReconciler rec))
                {
                    throw ch.Except("Columns from multiple sources were detected. " +
                                    "Did the caller use a " + nameof(PipelineColumn) + " from another delegate?");
                }
                PipelineColumn[] cols = group.ToArray();
                // All dependencies should, by this time, have names.
                ch.Assert(cols.SelectMany(c => c.Dependencies).All(dep => nameMap.ContainsKey(dep)));
                foreach (var newCol in cols)
                {
                    if (!nameMap.ContainsKey(newCol))
                    {
                        nameMap[newCol] = $"#Temp_{tempNum++}";
                    }
                }

                var localInputNames  = nameMap.AsOther(cols.SelectMany(c => c.Dependencies ?? Enumerable.Empty <PipelineColumn>()));
                var localOutputNames = nameMap.AsOther(cols);
                var usedNames        = new HashSet <string>(nameMap.Keys1.Except(localOutputNames.Values));

                var localEstimator = rec.Reconcile(env, cols, localInputNames, localOutputNames, usedNames);
                readerEstimator = readerEstimator?.Append(localEstimator);
                estimator       = estimator?.Append(localEstimator) ?? localEstimator;

                foreach (var newCol in cols)
                {
                    zeroDependencies.Remove(newCol); // Make more efficient!!

                    // Finally, we find all columns that depend on this one. If this happened to be the last pending
                    // dependency, then we add it to the list.
                    if (dependsOnKey.TryGetValue(newCol, out var depends))
                    {
                        foreach (var depender in depends)
                        {
                            var dependencies = keyDependsOn[depender];
                            Contracts.Assert(dependencies.Contains(newCol));
                            dependencies.Remove(newCol);
                            if (dependencies.Count == 0)
                            {
                                zeroDependencies.Add(depender);
                            }
                        }
                        dependsOnKey.Remove(newCol);
                    }
                }
            }

            if (keyDependsOn.Any(p => p.Value.Count > 0))
            {
                // This might happen if the user does something incredibly strange, like, say, take some prior
                // lambda, assign a column to a local variable, then re-use it downstream in a different lambdas.
                // The user would have to go to some extraorindary effort to do that, but nonetheless we want to
                // fail with a semi-sensible error message.
                throw ch.Except("There were some leftover columns with unresolved dependencies. " +
                                "Did the caller use a " + nameof(PipelineColumn) + " from another delegate?");
            }

            // Now do the final renaming, if any is necessary.
            toCopy.Clear();
            foreach (var p in outPairs)
            {
                // TODO: Right now we just write stuff out. Once the copy-columns estimator is in place
                // we ought to do this for real.
                Contracts.Assert(nameMap.ContainsKey(p.Value));
                string currentName = nameMap[p.Value];
                if (currentName != p.Key)
                {
                    ch.Trace($"Will copy '{currentName}' to '{p.Key}'");
                    toCopy.Add((currentName, p.Key));
                }
            }

            // If any final renamings were necessary, insert the appropriate CopyColumns transform.
            if (toCopy.Count > 0)
            {
                var copyEstimator = new CopyColumnsEstimator(env, toCopy.ToArray());
                if (estimator == null)
                {
                    estimator = copyEstimator;
                }
                else
                {
                    estimator = estimator.Append(copyEstimator);
                }
            }

            ch.Trace($"Exiting {nameof(ReaderEstimatorAnalyzerHelper)}");

            return(readerEstimator);
        }
示例#30
0
        private static void TrainModel(string dataFile, string modelFile)
        {
            // Create MLContext to be shared across the model creation workflow objects
            var mlContext = new MLContext(seed: 0);

            // STEP 1: Loading the data
            Console.WriteLine($"Step 1: Loading the data ({dataFile})");
            var textLoader = mlContext.Data.TextReader(
                new TextLoader.Arguments
            {
                Separator    = ",",
                HasHeader    = true,
                AllowQuoting = true,
                AllowSparse  = true,
                Column       = new[]
                {
                    new TextLoader.Column("Id", DataKind.Text, 0),
                    new TextLoader.Column("Category", DataKind.Text, 1),
                    new TextLoader.Column("Content", DataKind.Text, 2),
                }
            });
            var trainingDataView = textLoader.Read(dataFile);

            // STEP 2: Common data process configuration with pipeline data transformations
            Console.WriteLine("Step 2: Map raw input data columns to ML.NET data");
            var dataProcessPipeline = mlContext.Transforms.Categorical.MapValueToKey("Category", DefaultColumnNames.Label)
                                      .Append(mlContext.Transforms.Text.FeaturizeText("Content", DefaultColumnNames.Features));

            // (OPTIONAL) Peek data (few records) in training DataView after applying the ProcessPipeline's transformations into "Features"
            // DataViewToConsole<JokeModel>(mlContext, trainingDataView, dataProcessPipeline, 2);

            // STEP 3: Create the selected training algorithm/trainer
            Console.WriteLine("Step 3: Create and configure the selected training algorithm (trainer)");
            IEstimator <ITransformer> trainer = mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent();

            // Alternative training
            //// var averagedPerceptionBinaryTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron(
            ////     DefaultColumnNames.Label,
            ////     DefaultColumnNames.Features,
            ////     numIterations: 10);
            //// trainer = mlContext.MulticlassClassification.Trainers.OneVersusAll(averagedPerceptronBinaryTrainer);

            // Set the trainer/algorithm and map label to value (original readable state)
            var trainingPipeline = dataProcessPipeline.Append(trainer).Append(
                mlContext.Transforms.Conversion.MapKeyToValue(DefaultColumnNames.PredictedLabel));

            // STEP 4: Cross-Validate with single dataset (since we don't have two datasets, one for training and for evaluate)
            // in order to evaluate and get the model's accuracy metrics
            Console.WriteLine("Step 4: Cross-Validate with single dataset (alternatively we can divide it 80-20)");
            var crossValidationResults = mlContext.MulticlassClassification.CrossValidate(
                trainingDataView,
                trainingPipeline,
                numFolds: 10,
                labelColumn: "Label");

            PrintMulticlassClassificationFoldsAverageMetrics(trainer.ToString(), crossValidationResults);

            // STEP 5: Train the model fitting to the DataSet
            Console.WriteLine("Step 5: Train the model fitting to the DataSet");
            var trainedModel = trainingPipeline.Fit(trainingDataView);

            // STEP 6: Save/persist the trained model to a .ZIP file
            Console.WriteLine($"Step 6: Save the model to a file ({modelFile})");
            using (var fs = new FileStream(modelFile, FileMode.Create, FileAccess.Write, FileShare.Write))
            {
                mlContext.Model.Save(trainedModel, fs);
            }
        }