예제 #1
0
        public void TestSaveAndLoadTreeFeaturizer()
        {
            int dataPointCount = 200;
            var data           = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList();
            var dataView       = ML.Data.LoadFromEnumerable(data);

            dataView = ML.Data.Cache(dataView);

            var trainerOptions = new FastForestRegressionTrainer.Options
            {
                NumberOfThreads            = 1,
                NumberOfTrees              = 10,
                NumberOfLeaves             = 4,
                MinimumExampleCountPerLeaf = 10,
                FeatureColumnName          = "Features",
                LabelColumnName            = "Label"
            };

            var options = new FastForestRegressionFeaturizationEstimator.Options()
            {
                InputColumnName  = "Features",
                TreesColumnName  = "Trees",
                LeavesColumnName = "Leaves",
                PathsColumnName  = "Paths",
                TrainerOptions   = trainerOptions
            };

            var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options)
                           .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"))
                           .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));
            var model      = pipeline.Fit(dataView);
            var prediction = model.Transform(dataView);
            var metrics    = ML.Regression.Evaluate(prediction);

            Assert.True(metrics.MeanAbsoluteError < 0.25);
            Assert.True(metrics.MeanSquaredError < 0.1);

            // Save the trained model into file.
            ITransformer loadedModel = null;
            var          tempPath    = Path.GetTempFileName();

            using (var file = new SimpleFileHandle(Env, tempPath, true, true))
            {
                using (var fs = file.CreateWriteStream())
                    ML.Model.Save(model, null, fs);

                using (var fs = file.OpenReadStream())
                    loadedModel = ML.Model.Load(fs, out var schema);
            }
            var loadedPrediction = loadedModel.Transform(dataView);
            var loadedMetrics    = ML.Regression.Evaluate(loadedPrediction);

            Assert.Equal(metrics.MeanAbsoluteError, loadedMetrics.MeanAbsoluteError);
            Assert.Equal(metrics.MeanSquaredError, loadedMetrics.MeanSquaredError);
        }
예제 #2
0
        public void TestEstimatorSaveLoad()
        {
            IHostEnvironment env = new MLContext(1);
            var dataFile         = GetDataPath("images/images.tsv");
            var imageFolder      = Path.GetDirectoryName(dataFile);
            var data             = TextLoader.Create(env, new TextLoader.Options()
            {
                Columns = new[]
                {
                    new TextLoader.Column("ImagePath", DataKind.String, 0),
                    new TextLoader.Column("Name", DataKind.String, 1),
                }
            }, new MultiFileSource(dataFile));

            var pipe = new ImageLoadingEstimator(env, imageFolder, ("ImageReal", "ImagePath"))
                       .Append(new ImageResizingEstimator(env, "ImageReal", 100, 100, "ImageReal"))
                       .Append(new ImagePixelExtractingEstimator(env, "ImagePixels", "ImageReal"))
                       .Append(new ImageGrayscalingEstimator(env, ("ImageGray", "ImageReal")));

            pipe.GetOutputSchema(SchemaShape.Create(data.Schema));
            var model = pipe.Fit(data);

            var tempPath = Path.GetTempFileName();

            using (var file = new SimpleFileHandle(env, tempPath, true, true))
            {
                using (var fs = file.CreateWriteStream())
                    ML.Model.Save(model, null, fs);
                ITransformer model2;
                using (var fs = file.OpenReadStream())
                    model2 = ML.Model.Load(fs, out var schema);

                var transformerChain = model2 as TransformerChain <ITransformer>;
                Assert.NotNull(transformerChain);

                var newCols = ((ImageLoadingTransformer)transformerChain.First()).Columns;
                var oldCols = ((ImageLoadingTransformer)model.First()).Columns;
                Assert.True(newCols
                            .Zip(oldCols, (x, y) => x == y)
                            .All(x => x));
            }
            Done();
        }
예제 #3
0
        public void TestEstimatorSaveLoad()
        {
            IHostEnvironment env = new MLContext();
            var dataFile         = GetDataPath("images/images.tsv");
            var imageFolder      = Path.GetDirectoryName(dataFile);
            var data             = TextLoader.Create(env, new TextLoader.Arguments()
            {
                Column = new[]
                {
                    new TextLoader.Column("ImagePath", DataKind.TX, 0),
                    new TextLoader.Column("Name", DataKind.TX, 1),
                }
            }, new MultiFileSource(dataFile));

            var pipe = new ImageLoadingEstimator(env, imageFolder, ("ImagePath", "ImageReal"))
                       .Append(new ImageResizingEstimator(env, "ImageReal", "ImageReal", 100, 100))
                       .Append(new ImagePixelExtractingEstimator(env, "ImageReal", "ImagePixels"))
                       .Append(new ImageGrayscalingEstimator(env, ("ImageReal", "ImageGray")));

            pipe.GetOutputSchema(Core.Data.SchemaShape.Create(data.Schema));
            var model = pipe.Fit(data);

            var tempPath = Path.GetTempFileName();

            using (var file = new SimpleFileHandle(env, tempPath, true, true))
            {
                using (var fs = file.CreateWriteStream())
                    model.SaveTo(env, fs);
                var model2 = TransformerChain.LoadFrom(env, file.OpenReadStream());

                var newCols = ((ImageLoaderTransform)model2.First()).Columns;
                var oldCols = ((ImageLoaderTransform)model.First()).Columns;
                Assert.True(newCols
                            .Zip(oldCols, (x, y) => x == y)
                            .All(x => x));
            }
            Done();
        }
예제 #4
0
        public void TestOnnxTransformSaveAndLoadWithCustomShapes()
        {
            // The loaded model has input shape [-1, 3] and output shape [-1].
            var modelFile = Path.Combine(Directory.GetCurrentDirectory(), "unknowndimensions", "test_unknowndimensions_float.onnx");

            var dataPoints = new InputWithCustomShape[]
            {
                // It's a flattened 3-by-3 tensor.
                // [1.1, 1.3, 1.2]
                // |1.9, 1.3, 1.2|
                // [1.1, 1.3, 1.8]
                new InputWithCustomShape()
                {
                    input = new float[] { 1.1f, 1.3f, 1.2f, 1.9f, 1.3f, 1.2f, 1.1f, 1.3f, 1.8f }
                },
                // It's a flattened 3-by-3 tensor.
                // [0, 0, 1]
                // |1, 0, 0|
                // [1, 0, 0]
                new InputWithCustomShape()
                {
                    input = new float[] { 0f, 0f, 1f, 1f, 0f, 0f, 1f, 0f, 0f }
                }
            };

            var shapeDictionary = new Dictionary <string, int[]>()
            {
                { nameof(InputWithCustomShape.input), new int[] { 3, 3 } }
            };

            var dataView = ML.Data.LoadFromEnumerable(dataPoints);

            var pipeline = ML.Transforms.ApplyOnnxModel(nameof(PredictionWithCustomShape.argmax),
                                                        nameof(InputWithCustomShape.input), modelFile, shapeDictionary);

            var model = pipeline.Fit(dataView);

            // Save the trained ONNX transformer into file and then load it back.
            ITransformer loadedModel = null;
            var          tempPath    = Path.GetTempFileName();

            using (var file = new SimpleFileHandle(Env, tempPath, true, true))
            {
                // Save.
                using (var fs = file.CreateWriteStream())
                    ML.Model.Save(model, null, fs);

                // Load.
                using (var fs = file.OpenReadStream())
                    loadedModel = ML.Model.Load(fs, out var schema);
            }

            var transformedDataView = loadedModel.Transform(dataView);

            // Conduct the same check for all the 3 called public APIs.
            var transformedDataPoints = ML.Data.CreateEnumerable <PredictionWithCustomShape>(transformedDataView, false).ToList();

            // One data point generates one transformed data point.
            Assert.Equal(dataPoints.Count(), transformedDataPoints.Count);

            // Check result numbers. They are results of applying ONNX argmax along the second axis; for example
            // [1.1, 1.3, 1.2] ---> [1] because 1.3 (indexed by 1) is the largest element.
            // |1.9, 1.3, 1.2| ---> |0|         1.9             0
            // [1.1, 1.3, 1.8] ---> [2]         1.8             2
            var expectedResults = new long[][]
            {
                new long[] { 1, 0, 2 },
                new long[] { 2, 0, 0 }
            };

            for (int i = 0; i < transformedDataPoints.Count; ++i)
            {
                Assert.Equal(transformedDataPoints[i].argmax, expectedResults[i]);
            }

            (model as IDisposable)?.Dispose();
            (loadedModel as IDisposable)?.Dispose();
        }
예제 #5
0
        public void TestSaveAndLoadDoubleTreeFeaturizer()
        {
            int dataPointCount = 200;
            var data           = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList();
            var dataView       = ML.Data.LoadFromEnumerable(data);

            dataView = ML.Data.Cache(dataView);

            var trainerOptions = new FastForestRegressionTrainer.Options
            {
                NumberOfThreads            = 1,
                NumberOfTrees              = 10,
                NumberOfLeaves             = 4,
                MinimumExampleCountPerLeaf = 10,
                FeatureColumnName          = "Features",
                LabelColumnName            = "Label"
            };

            // Trains tree featurization on "Features" and applies on "CopiedFeatures".
            var options = new FastForestRegressionFeaturizationEstimator.Options()
            {
                InputColumnName  = "CopiedFeatures",
                TrainerOptions   = trainerOptions,
                TreesColumnName  = "OhMyTrees",
                LeavesColumnName = "OhMyLeaves",
                PathsColumnName  = "OhMyPaths"
            };

            var pipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features")
                           .Append(ML.Transforms.FeaturizeByFastForestRegression(options))
                           .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths"))
                           .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));
            var model      = pipeline.Fit(dataView);
            var prediction = model.Transform(dataView);
            var metrics    = ML.Regression.Evaluate(prediction);

            Assert.True(metrics.MeanAbsoluteError < 0.25);
            Assert.True(metrics.MeanSquaredError < 0.1);

            // Save the trained model into file and then load it back.
            ITransformer loadedModel = null;
            var          tempPath    = Path.GetTempFileName();

            using (var file = new SimpleFileHandle(Env, tempPath, true, true))
            {
                using (var fs = file.CreateWriteStream())
                    ML.Model.Save(model, null, fs);

                using (var fs = file.OpenReadStream())
                    loadedModel = ML.Model.Load(fs, out var schema);
            }

            // Compute prediction using the loaded model.
            var loadedPrediction = loadedModel.Transform(dataView);
            var loadedMetrics    = ML.Regression.Evaluate(loadedPrediction);

            // Check if the loaded model produces the same result as the trained model.
            Assert.Equal(metrics.MeanAbsoluteError, loadedMetrics.MeanAbsoluteError);
            Assert.Equal(metrics.MeanSquaredError, loadedMetrics.MeanSquaredError);

            var secondPipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features")
                                 .Append(ML.Transforms.NormalizeBinning("CopiedFeatures"))
                                 .Append(ML.Transforms.FeaturizeByFastForestRegression(options))
                                 .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths"))
                                 .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));
            var secondModel      = secondPipeline.Fit(dataView);
            var secondPrediction = secondModel.Transform(dataView);
            var secondMetrics    = ML.Regression.Evaluate(secondPrediction);

            // The second pipeline trains a tree featurizer on a bin-based normalized feature, so the second pipeline
            // is different from the first pipeline.
            Assert.NotEqual(metrics.MeanAbsoluteError, secondMetrics.MeanAbsoluteError);
            Assert.NotEqual(metrics.MeanSquaredError, secondMetrics.MeanSquaredError);
        }
        private void RunCore(IChannel ch, string cmd)
        {
            Host.AssertValue(ch);
            Host.AssertNonEmpty(cmd);

            ch.Trace("Constructing trainer");
            ITrainer trainer = Args.Trainer.CreateComponent(Host);

            IPredictor inputPredictor = null;

            if (Args.ContinueTrain && !TrainUtils.TryLoadPredictor(ch, Host, Args.InputModelFile, out inputPredictor))
            {
                ch.Warning("No input model file specified or model file did not contain a predictor. The model state cannot be initialized.");
            }

            ch.Trace("Constructing the training pipeline");
            IDataView trainPipe = CreateLoader();

            ISchema schema = trainPipe.Schema;
            string  label  = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(Arguments.LabelColumn),
                                                                 Args.LabelColumn, DefaultColumnNames.Label);
            string features = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(Arguments.FeatureColumn),
                                                                  Args.FeatureColumn, DefaultColumnNames.Features);
            string group = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(Arguments.GroupColumn),
                                                               Args.GroupColumn, DefaultColumnNames.GroupId);
            string weight = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(Arguments.WeightColumn),
                                                                Args.WeightColumn, DefaultColumnNames.Weight);
            string name = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(Arguments.NameColumn),
                                                              Args.NameColumn, DefaultColumnNames.Name);

            TrainUtils.AddNormalizerIfNeeded(Host, ch, trainer, ref trainPipe, features, Args.NormalizeFeatures);

            ch.Trace("Binding columns");
            var customCols = TrainUtils.CheckAndGenerateCustomColumns(ch, Args.CustomColumn);
            var data       = new RoleMappedData(trainPipe, label, features, group, weight, name, customCols);

            RoleMappedData validData = null;

            if (!string.IsNullOrWhiteSpace(Args.ValidationFile))
            {
                if (!trainer.Info.SupportsValidation)
                {
                    ch.Warning("Ignoring validationFile: Trainer does not accept validation dataset.");
                }
                else
                {
                    ch.Trace("Constructing the validation pipeline");
                    IDataView validPipe = CreateRawLoader(dataFile: Args.ValidationFile);
                    validPipe = ApplyTransformUtils.ApplyAllTransformsToData(Host, trainPipe, validPipe);
                    validData = new RoleMappedData(validPipe, data.Schema.GetColumnRoleNames());
                }
            }

            // In addition to the training set, some trainers can accept two data sets, validation set and test set,
            // in training phase. The major difference between validation set and test set is that training process may
            // indirectly use validation set to improve the model but the learned model should totally independent of test set.
            // Similar to validation set, the trainer can report the scores computed using test set.
            RoleMappedData testDataUsedInTrainer = null;

            if (!string.IsNullOrWhiteSpace(Args.TestFile))
            {
                // In contrast to the if-else block for validation above, we do not throw a warning if test file is provided
                // because this is TrainTest command.
                if (trainer.Info.SupportsTest)
                {
                    ch.Trace("Constructing the test pipeline");
                    IDataView testPipeUsedInTrainer = CreateRawLoader(dataFile: Args.TestFile);
                    testPipeUsedInTrainer = ApplyTransformUtils.ApplyAllTransformsToData(Host, trainPipe, testPipeUsedInTrainer);
                    testDataUsedInTrainer = new RoleMappedData(testPipeUsedInTrainer, data.Schema.GetColumnRoleNames());
                }
            }

            var predictor = TrainUtils.Train(Host, ch, data, trainer, validData,
                                             Args.Calibrator, Args.MaxCalibrationExamples, Args.CacheData, inputPredictor, testDataUsedInTrainer);

            IDataLoader testPipe;
            bool        hasOutfile   = !string.IsNullOrEmpty(Args.OutputModelFile);
            var         tempFilePath = hasOutfile ? null : Path.GetTempFileName();

            using (var file = new SimpleFileHandle(ch, hasOutfile ? Args.OutputModelFile : tempFilePath, true, !hasOutfile))
            {
                TrainUtils.SaveModel(Host, ch, file, predictor, data, cmd);
                ch.Trace("Constructing the testing pipeline");
                using (var stream = file.OpenReadStream())
                    using (var rep = RepositoryReader.Open(stream, ch))
                        testPipe = LoadLoader(rep, Args.TestFile, true);
            }

            // Score.
            ch.Trace("Scoring and evaluating");
            ch.Assert(Args.Scorer == null || Args.Scorer is ICommandLineComponentFactory, "TrainTestCommand should only be used from the command line.");
            IDataScorerTransform scorePipe = ScoreUtils.GetScorer(Args.Scorer, predictor, testPipe, features, group, customCols, Host, data.Schema);

            // Evaluate.
            var evaluator = Args.Evaluator?.CreateComponent(Host) ??
                            EvaluateUtils.GetEvaluator(Host, scorePipe.Schema);
            var dataEval = new RoleMappedData(scorePipe, label, features,
                                              group, weight, name, customCols, opt: true);
            var metrics = evaluator.Evaluate(dataEval);

            MetricWriter.PrintWarnings(ch, metrics);
            evaluator.PrintFoldResults(ch, metrics);
            if (!metrics.TryGetValue(MetricKinds.OverallMetrics, out var overall))
            {
                throw ch.Except("No overall metrics found");
            }
            overall = evaluator.GetOverallResults(overall);
            MetricWriter.PrintOverallMetrics(Host, ch, Args.SummaryFilename, overall, 1);
            evaluator.PrintAdditionalMetrics(ch, metrics);
            Dictionary <string, IDataView>[] metricValues = { metrics };
            SendTelemetryMetric(metricValues);
            if (!string.IsNullOrWhiteSpace(Args.OutputDataFile))
            {
                var perInst     = evaluator.GetPerInstanceMetrics(dataEval);
                var perInstData = new RoleMappedData(perInst, label, null, group, weight, name, customCols);
                var idv         = evaluator.GetPerInstanceDataViewToSave(perInstData);
                MetricWriter.SavePerInstance(Host, ch, Args.OutputDataFile, idv);
            }
        }