/// <summary>
/// Fits a pipeline made of a fast-forest regression tree featurizer, a column concatenation and an SDCA
/// regressor, then saves the fitted model, loads it back and checks that the loaded model produces the
/// same regression metrics as the freshly trained one.
/// </summary>
public void TestSaveAndLoadTreeFeaturizer()
{
    // Generate the samples and wrap them in a cached data view.
    int sampleCount = 200;
    var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(sampleCount).ToList();
    var dataView = ML.Data.LoadFromEnumerable(samples);
    dataView = ML.Data.Cache(dataView);

    // The featurizer reads "Features" and writes the "Trees", "Leaves" and "Paths" columns.
    var featurizerOptions = new FastForestRegressionFeaturizationEstimator.Options()
    {
        InputColumnName = "Features",
        TreesColumnName = "Trees",
        LeavesColumnName = "Leaves",
        PathsColumnName = "Paths",
        TrainerOptions = new FastForestRegressionTrainer.Options
        {
            NumberOfThreads = 1,
            NumberOfTrees = 10,
            NumberOfLeaves = 4,
            MinimumExampleCountPerLeaf = 10,
            FeatureColumnName = "Features",
            LabelColumnName = "Label"
        }
    };

    // Assemble the pipeline one stage at a time.
    var treeFeaturizer = ML.Transforms.FeaturizeByFastForestRegression(featurizerOptions);
    var withCombinedFeatures = treeFeaturizer.Append(
        ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths"));
    var pipeline = withCombinedFeatures.Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));

    var trainedModel = pipeline.Fit(dataView);
    var trainedMetrics = ML.Regression.Evaluate(trainedModel.Transform(dataView));

    Assert.True(trainedMetrics.MeanAbsoluteError < 0.25);
    Assert.True(trainedMetrics.MeanSquaredError < 0.1);

    // Round-trip the trained model through a temporary file.
    ITransformer reloadedModel = null;
    var modelPath = Path.GetTempFileName();
    using (var modelFile = new SimpleFileHandle(Env, modelPath, true, true))
    {
        using (var writeStream = modelFile.CreateWriteStream())
        {
            ML.Model.Save(trainedModel, null, writeStream);
        }

        using (var readStream = modelFile.OpenReadStream())
        {
            reloadedModel = ML.Model.Load(readStream, out _);
        }
    }

    // The reloaded model must reproduce the metrics of the trained model exactly.
    var reloadedMetrics = ML.Regression.Evaluate(reloadedModel.Transform(dataView));

    Assert.Equal(trainedMetrics.MeanAbsoluteError, reloadedMetrics.MeanAbsoluteError);
    Assert.Equal(trainedMetrics.MeanSquaredError, reloadedMetrics.MeanSquaredError);
}
/// <summary>
/// Fits an image pipeline (load, resize, pixel extraction, grayscale), saves the fitted chain to a
/// temporary file, loads it back and checks that the first transformer of the loaded chain exposes the
/// same columns as the first transformer of the original chain.
/// </summary>
public void TestEstimatorSaveLoad()
{
    IHostEnvironment env = new MLContext(1);

    // Locate the image list; its folder is handed to the image loading estimator.
    var imageListPath = GetDataPath("images/images.tsv");
    var imageDirectory = Path.GetDirectoryName(imageListPath);

    var loaderOptions = new TextLoader.Options()
    {
        Columns = new[]
        {
            new TextLoader.Column("ImagePath", DataKind.String, 0),
            new TextLoader.Column("Name", DataKind.String, 1),
        }
    };
    var data = TextLoader.Create(env, loaderOptions, new MultiFileSource(imageListPath));

    // Build the estimator chain stage by stage.
    var loadImages = new ImageLoadingEstimator(env, imageDirectory, ("ImageReal", "ImagePath"));
    var resized = loadImages.Append(new ImageResizingEstimator(env, "ImageReal", 100, 100, "ImageReal"));
    var withPixels = resized.Append(new ImagePixelExtractingEstimator(env, "ImagePixels", "ImageReal"));
    var pipe = withPixels.Append(new ImageGrayscalingEstimator(env, ("ImageGray", "ImageReal")));

    pipe.GetOutputSchema(SchemaShape.Create(data.Schema));
    var fittedChain = pipe.Fit(data);

    var modelPath = Path.GetTempFileName();
    using (var modelFile = new SimpleFileHandle(env, modelPath, true, true))
    {
        using (var writeStream = modelFile.CreateWriteStream())
        {
            ML.Model.Save(fittedChain, null, writeStream);
        }

        ITransformer reloaded;
        using (var readStream = modelFile.OpenReadStream())
        {
            reloaded = ML.Model.Load(readStream, out _);
        }

        // The loaded model is expected to be a transformer chain.
        var reloadedChain = reloaded as TransformerChain<ITransformer>;
        Assert.NotNull(reloadedChain);

        // Compare the image loader columns pairwise.
        var reloadedColumns = ((ImageLoadingTransformer)reloadedChain.First()).Columns;
        var originalColumns = ((ImageLoadingTransformer)fittedChain.First()).Columns;
        var pairwiseMatches = reloadedColumns.Zip(originalColumns, (loaded, original) => loaded == original);
        Assert.True(pairwiseMatches.All(isMatch => isMatch));
    }

    Done();
}
/// <summary>
/// Fits an image pipeline (load, resize, pixel extraction, grayscale), saves the fitted chain to a
/// temporary file, loads it back and checks that the image loader of the loaded chain exposes the same
/// columns as the image loader of the original chain.
/// </summary>
public void TestEstimatorSaveLoad()
{
    IHostEnvironment env = new MLContext();
    var dataFile = GetDataPath("images/images.tsv");
    var imageFolder = Path.GetDirectoryName(dataFile);
    var data = TextLoader.Create(env, new TextLoader.Arguments()
    {
        Column = new[]
        {
            new TextLoader.Column("ImagePath", DataKind.TX, 0),
            new TextLoader.Column("Name", DataKind.TX, 1),
        }
    }, new MultiFileSource(dataFile));

    var pipe = new ImageLoadingEstimator(env, imageFolder, ("ImagePath", "ImageReal"))
        .Append(new ImageResizingEstimator(env, "ImageReal", "ImageReal", 100, 100))
        .Append(new ImagePixelExtractingEstimator(env, "ImageReal", "ImagePixels"))
        .Append(new ImageGrayscalingEstimator(env, ("ImageReal", "ImageGray")));

    pipe.GetOutputSchema(Core.Data.SchemaShape.Create(data.Schema));
    var model = pipe.Fit(data);

    var tempPath = Path.GetTempFileName();
    using (var file = new SimpleFileHandle(env, tempPath, true, true))
    {
        using (var fs = file.CreateWriteStream())
        {
            model.SaveTo(env, fs);
        }

        // Dispose the read stream deterministically, mirroring the write stream above, instead of
        // leaving the stream returned by OpenReadStream() open after the model has been loaded.
        using (var readStream = file.OpenReadStream())
        {
            var model2 = TransformerChain.LoadFrom(env, readStream);

            // Compare the image loader columns of the loaded and original chains pairwise.
            var newCols = ((ImageLoaderTransform)model2.First()).Columns;
            var oldCols = ((ImageLoaderTransform)model.First()).Columns;
            Assert.True(newCols
                .Zip(oldCols, (x, y) => x == y)
                .All(x => x));
        }
    }

    Done();
}
/// <summary>
/// Fits an ONNX transformer with a user-supplied input shape, saves it to a temporary file, loads it
/// back and checks that the loaded transformer produces the expected argmax output for each input.
/// </summary>
public void TestOnnxTransformSaveAndLoadWithCustomShapes()
{
    // The loaded model has input shape [-1, 3] and output shape [-1].
    var modelFile = Path.Combine(Directory.GetCurrentDirectory(), "unknowndimensions", "test_unknowndimensions_float.onnx");

    var dataPoints = new InputWithCustomShape[]
    {
        // It's a flattened 3-by-3 tensor.
        // [1.1, 1.3, 1.2]
        // |1.9, 1.3, 1.2|
        // [1.1, 1.3, 1.8]
        new InputWithCustomShape() { input = new float[] { 1.1f, 1.3f, 1.2f, 1.9f, 1.3f, 1.2f, 1.1f, 1.3f, 1.8f } },
        // It's a flattened 3-by-3 tensor.
        // [0, 0, 1]
        // |1, 0, 0|
        // [1, 0, 0]
        new InputWithCustomShape() { input = new float[] { 0f, 0f, 1f, 1f, 0f, 0f, 1f, 0f, 0f } }
    };

    // Tell the ONNX transform to treat the flat 9-element input as a 3-by-3 tensor.
    var shapeDictionary = new Dictionary<string, int[]>() { { nameof(InputWithCustomShape.input), new int[] { 3, 3 } } };

    var dataView = ML.Data.LoadFromEnumerable(dataPoints);

    var pipeline = ML.Transforms.ApplyOnnxModel(nameof(PredictionWithCustomShape.argmax), nameof(InputWithCustomShape.input), modelFile, shapeDictionary);

    var model = pipeline.Fit(dataView);
    ITransformer loadedModel = null;

    // Dispose both transformers even when an assertion below fails, so a failing run does not skip
    // the disposal calls.
    try
    {
        // Save the trained ONNX transformer into file and then load it back.
        var tempPath = Path.GetTempFileName();
        using (var file = new SimpleFileHandle(Env, tempPath, true, true))
        {
            // Save.
            using (var fs = file.CreateWriteStream())
            {
                ML.Model.Save(model, null, fs);
            }

            // Load.
            using (var fs = file.OpenReadStream())
            {
                loadedModel = ML.Model.Load(fs, out _);
            }
        }

        var transformedDataView = loadedModel.Transform(dataView);
        var transformedDataPoints = ML.Data.CreateEnumerable<PredictionWithCustomShape>(transformedDataView, false).ToList();

        // One data point generates one transformed data point.
        Assert.Equal(dataPoints.Length, transformedDataPoints.Count);

        // Check result numbers. They are results of applying ONNX argmax along the second axis; for example
        // [1.1, 1.3, 1.2] ---> [1] because 1.3 (indexed by 1) is the largest element.
        // |1.9, 1.3, 1.2| ---> |0|         1.9             0
        // [1.1, 1.3, 1.8] ---> [2]         1.8             2
        var expectedResults = new long[][]
        {
            new long[] { 1, 0, 2 },
            new long[] { 2, 0, 0 }
        };

        for (int i = 0; i < transformedDataPoints.Count; ++i)
        {
            // xUnit expects (expected, actual); this order keeps failure messages correctly labelled.
            Assert.Equal(expectedResults[i], transformedDataPoints[i].argmax);
        }
    }
    finally
    {
        (model as IDisposable)?.Dispose();
        (loadedModel as IDisposable)?.Dispose();
    }
}
/// <summary>
/// Fits a pipeline whose tree featurizer is configured with "Features" as the trainer's feature column
/// and "CopiedFeatures" as its input column, verifies that a save/load round trip preserves the
/// regression metrics, and then checks that a second pipeline with an extra bin normalization step
/// yields different metrics.
/// </summary>
public void TestSaveAndLoadDoubleTreeFeaturizer()
{
    // Generate the samples and wrap them in a cached data view.
    int sampleCount = 200;
    var samples = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(sampleCount).ToList();
    var dataView = ML.Data.LoadFromEnumerable(samples);
    dataView = ML.Data.Cache(dataView);

    var forestOptions = new FastForestRegressionTrainer.Options
    {
        NumberOfThreads = 1,
        NumberOfTrees = 10,
        NumberOfLeaves = 4,
        MinimumExampleCountPerLeaf = 10,
        FeatureColumnName = "Features",
        LabelColumnName = "Label"
    };

    // Trains tree featurization on "Features" and applies on "CopiedFeatures".
    var options = new FastForestRegressionFeaturizationEstimator.Options()
    {
        InputColumnName = "CopiedFeatures",
        TrainerOptions = forestOptions,
        TreesColumnName = "OhMyTrees",
        LeavesColumnName = "OhMyLeaves",
        PathsColumnName = "OhMyPaths"
    };

    // First pipeline: copy -> tree featurizer -> concatenate -> SDCA.
    var firstCopy = ML.Transforms.CopyColumns("CopiedFeatures", "Features");
    var firstFeaturized = firstCopy.Append(ML.Transforms.FeaturizeByFastForestRegression(options));
    var firstCombined = firstFeaturized.Append(
        ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths"));
    var pipeline = firstCombined.Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));

    var trainedModel = pipeline.Fit(dataView);
    var metrics = ML.Regression.Evaluate(trainedModel.Transform(dataView));

    Assert.True(metrics.MeanAbsoluteError < 0.25);
    Assert.True(metrics.MeanSquaredError < 0.1);

    // Round-trip the trained model through a temporary file.
    ITransformer reloadedModel = null;
    var modelPath = Path.GetTempFileName();
    using (var modelFile = new SimpleFileHandle(Env, modelPath, true, true))
    {
        using (var writeStream = modelFile.CreateWriteStream())
        {
            ML.Model.Save(trainedModel, null, writeStream);
        }

        using (var readStream = modelFile.OpenReadStream())
        {
            reloadedModel = ML.Model.Load(readStream, out _);
        }
    }

    // The reloaded model must reproduce the metrics of the trained model exactly.
    var reloadedMetrics = ML.Regression.Evaluate(reloadedModel.Transform(dataView));

    Assert.Equal(metrics.MeanAbsoluteError, reloadedMetrics.MeanAbsoluteError);
    Assert.Equal(metrics.MeanSquaredError, reloadedMetrics.MeanSquaredError);

    // Second pipeline: same stages, with a bin-based normalization of "CopiedFeatures" inserted
    // before the tree featurizer.
    var secondCopy = ML.Transforms.CopyColumns("CopiedFeatures", "Features");
    var secondNormalized = secondCopy.Append(ML.Transforms.NormalizeBinning("CopiedFeatures"));
    var secondFeaturized = secondNormalized.Append(ML.Transforms.FeaturizeByFastForestRegression(options));
    var secondCombined = secondFeaturized.Append(
        ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths"));
    var secondPipeline = secondCombined.Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures"));

    var secondModel = secondPipeline.Fit(dataView);
    var secondMetrics = ML.Regression.Evaluate(secondModel.Transform(dataView));

    // The featurizer in the second pipeline sees bin-normalized input, so its metrics are expected
    // to differ from those of the first pipeline.
    Assert.NotEqual(metrics.MeanAbsoluteError, secondMetrics.MeanAbsoluteError);
    Assert.NotEqual(metrics.MeanSquaredError, secondMetrics.MeanSquaredError);
}
/// <summary>
/// Runs the train-test flow: builds the trainer and the training pipeline, optionally attaches
/// validation and test sets for the trainer, trains and saves the model, then scores the data loaded
/// for the test file, evaluates it and reports the metrics.
/// </summary>
/// <param name="ch">Channel used for tracing, warnings and exceptions.</param>
/// <param name="cmd">Command string that is passed along when the model is saved.</param>
private void RunCore(IChannel ch, string cmd)
{
    Host.AssertValue(ch);
    Host.AssertNonEmpty(cmd);

    ch.Trace("Constructing trainer");
    ITrainer trainer = Args.Trainer.CreateComponent(Host);

    // When continuing training, try to obtain the initial predictor from the input model file.
    IPredictor inputPredictor = null;
    if (Args.ContinueTrain)
    {
        bool predictorLoaded = TrainUtils.TryLoadPredictor(ch, Host, Args.InputModelFile, out inputPredictor);
        if (!predictorLoaded)
        {
            ch.Warning("No input model file specified or model file did not contain a predictor. The model state cannot be initialized.");
        }
    }

    ch.Trace("Constructing the training pipeline");
    IDataView trainPipe = CreateLoader();

    // Resolve the role column names against the schema of the loader.
    ISchema trainSchema = trainPipe.Schema;
    string labelCol = TrainUtils.MatchNameOrDefaultOrNull(ch, trainSchema, nameof(Arguments.LabelColumn),
        Args.LabelColumn, DefaultColumnNames.Label);
    string featureCol = TrainUtils.MatchNameOrDefaultOrNull(ch, trainSchema, nameof(Arguments.FeatureColumn),
        Args.FeatureColumn, DefaultColumnNames.Features);
    string groupCol = TrainUtils.MatchNameOrDefaultOrNull(ch, trainSchema, nameof(Arguments.GroupColumn),
        Args.GroupColumn, DefaultColumnNames.GroupId);
    string weightCol = TrainUtils.MatchNameOrDefaultOrNull(ch, trainSchema, nameof(Arguments.WeightColumn),
        Args.WeightColumn, DefaultColumnNames.Weight);
    string nameCol = TrainUtils.MatchNameOrDefaultOrNull(ch, trainSchema, nameof(Arguments.NameColumn),
        Args.NameColumn, DefaultColumnNames.Name);

    // trainPipe is passed by reference here, so it must be bound to roles only after this call.
    TrainUtils.AddNormalizerIfNeeded(Host, ch, trainer, ref trainPipe, featureCol, Args.NormalizeFeatures);

    ch.Trace("Binding columns");
    var customCols = TrainUtils.CheckAndGenerateCustomColumns(ch, Args.CustomColumn);
    var data = new RoleMappedData(trainPipe, labelCol, featureCol, groupCol, weightCol, nameCol, customCols);

    // Optional validation set: only built when a file is given and the trainer supports validation.
    RoleMappedData validData = null;
    bool hasValidationFile = !string.IsNullOrWhiteSpace(Args.ValidationFile);
    if (hasValidationFile && trainer.Info.SupportsValidation)
    {
        ch.Trace("Constructing the validation pipeline");
        IDataView validPipe = CreateRawLoader(dataFile: Args.ValidationFile);
        validPipe = ApplyTransformUtils.ApplyAllTransformsToData(Host, trainPipe, validPipe);
        validData = new RoleMappedData(validPipe, data.Schema.GetColumnRoleNames());
    }
    else if (hasValidationFile)
    {
        ch.Warning("Ignoring validationFile: Trainer does not accept validation dataset.");
    }

    // Besides the training set, some trainers accept a validation set and a test set during training.
    // The training process may indirectly use the validation set to improve the model, whereas the
    // learned model should be totally independent of the test set. As with the validation set, the
    // trainer can report scores computed on the test set.
    // Unlike the validation case, no warning is issued when the trainer does not support a test set,
    // because this is the TrainTest command.
    RoleMappedData testDataUsedInTrainer = null;
    if (!string.IsNullOrWhiteSpace(Args.TestFile) && trainer.Info.SupportsTest)
    {
        ch.Trace("Constructing the test pipeline");
        IDataView trainerTestPipe = CreateRawLoader(dataFile: Args.TestFile);
        trainerTestPipe = ApplyTransformUtils.ApplyAllTransformsToData(Host, trainPipe, trainerTestPipe);
        testDataUsedInTrainer = new RoleMappedData(trainerTestPipe, data.Schema.GetColumnRoleNames());
    }

    var predictor = TrainUtils.Train(Host, ch, data, trainer, validData,
        Args.Calibrator, Args.MaxCalibrationExamples, Args.CacheData, inputPredictor, testDataUsedInTrainer);

    // Save the model to the requested output file, or to a temporary file when none was requested;
    // the last SimpleFileHandle argument is true only in the temporary-file case.
    IDataLoader testPipe;
    bool hasOutfile = !string.IsNullOrEmpty(Args.OutputModelFile);
    string modelPath = hasOutfile ? Args.OutputModelFile : Path.GetTempFileName();
    using (var modelFile = new SimpleFileHandle(ch, modelPath, true, !hasOutfile))
    {
        TrainUtils.SaveModel(Host, ch, modelFile, predictor, data, cmd);

        ch.Trace("Constructing the testing pipeline");
        using (var stream = modelFile.OpenReadStream())
        using (var rep = RepositoryReader.Open(stream, ch))
        {
            testPipe = LoadLoader(rep, Args.TestFile, true);
        }
    }

    // Score.
    ch.Trace("Scoring and evaluating");
    ch.Assert(Args.Scorer == null || Args.Scorer is ICommandLineComponentFactory, "TrainTestCommand should only be used from the command line.");
    IDataScorerTransform scorePipe = ScoreUtils.GetScorer(Args.Scorer, predictor, testPipe, featureCol, groupCol, customCols, Host, data.Schema);

    // Evaluate, using the configured evaluator when there is one.
    var evaluator = Args.Evaluator?.CreateComponent(Host) ??
        EvaluateUtils.GetEvaluator(Host, scorePipe.Schema);
    var dataEval = new RoleMappedData(scorePipe, labelCol, featureCol, groupCol, weightCol, nameCol, customCols, opt: true);
    var metrics = evaluator.Evaluate(dataEval);
    MetricWriter.PrintWarnings(ch, metrics);
    evaluator.PrintFoldResults(ch, metrics);

    if (!metrics.TryGetValue(MetricKinds.OverallMetrics, out var overall))
    {
        throw ch.Except("No overall metrics found");
    }

    overall = evaluator.GetOverallResults(overall);
    MetricWriter.PrintOverallMetrics(Host, ch, Args.SummaryFilename, overall, 1);
    evaluator.PrintAdditionalMetrics(ch, metrics);
    SendTelemetryMetric(new Dictionary<string, IDataView>[] { metrics });

    // Per-instance results are written only when an output data file was requested.
    if (string.IsNullOrWhiteSpace(Args.OutputDataFile))
    {
        return;
    }

    var perInstance = evaluator.GetPerInstanceMetrics(dataEval);
    var perInstanceData = new RoleMappedData(perInstance, labelCol, null, groupCol, weightCol, nameCol, customCols);
    var viewToSave = evaluator.GetPerInstanceDataViewToSave(perInstanceData);
    MetricWriter.SavePerInstance(Host, ch, Args.OutputDataFile, viewToSave);
}