public void KeyToValue()
{
    string dataPath = GetDataPath("breast-cancer.txt");
    var data = ML.Data.LoadFromTextFile(dataPath, new[]
    {
        new TextLoader.Column("ScalarString", DataKind.String, 0),
        new TextLoader.Column("VectorString", DataKind.String, 1, 4),
    });

    var transformedData = new ValueToKeyMappingEstimator(Env, new[]
    {
        new ValueToKeyMappingEstimator.ColumnOptions("A", "ScalarString"),
        new ValueToKeyMappingEstimator.ColumnOptions("B", "VectorString")
    }).Fit(data).Transform(data);

    var est = ML.Transforms.Conversion.MapKeyToValue("ScalarString", "A")
        .Append(ML.Transforms.Conversion.MapKeyToValue("VectorString", "B"));
    TestEstimatorCore(est, transformedData, invalidInput: data);

    var data2Transformed = est.Fit(transformedData).Transform(transformedData);

    // Check that term and ToValue are round-trippable.
    var dataLeft = ML.Transforms.SelectColumns(new[] { "ScalarString", "VectorString" }).Fit(data).Transform(data);
    var dataRight = ML.Transforms.SelectColumns(new[] { "ScalarString", "VectorString" }).Fit(data2Transformed).Transform(data2Transformed);

    TestCommon.CheckSameSchemas(dataLeft.Schema, dataRight.Schema);
    CheckSameValues(dataLeft, dataRight);
    Done();
}
public void SaveLoaderAndTransformerAndLoad()
{
    var mlContext = new MLContext();
    var file = new MultiFileSource(TestCommon.GetDataPath(DataDir, TestDatasets.adult.trainFilename));
    var loader = mlContext.Data.CreateTextLoader<InputData>(hasHeader: true, dataSample: file);
    var estimator = mlContext.Transforms.NormalizeMinMax("Features");
    var data = loader.Load(file);
    var model = estimator.Fit(data);

    // First get the input schema.
    var expectedInputSchema = loader.GetOutputSchema();
    Assert.Equal(2, expectedInputSchema.Count);
    Assert.NotNull(expectedInputSchema.GetColumnOrNull("Label"));
    Assert.NotNull(expectedInputSchema.GetColumnOrNull("Features"));
    Assert.True(expectedInputSchema["Features"].HasSlotNames());

    string modelPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-model.zip");
    mlContext.Model.Save(model, loader, modelPath);

    // Reload the loader and schema.
    Load(mlContext, modelPath, out var loadedWithSchema, out var loadedInputSchema,
        out var loadedWithLoader, out var loadedLoaderWithTransformer);
    Assert.IsType<NormalizingTransformer>(loadedWithSchema);
    Assert.IsType<NormalizingTransformer>(loadedWithLoader);
    Assert.IsType<TextLoader>(loadedLoaderWithTransformer);

    TestCommon.CheckSameSchemas(expectedInputSchema, loadedInputSchema);
    var reloadedLoaderInputSchema = loadedLoaderWithTransformer.GetOutputSchema();
    TestCommon.CheckSameSchemas(expectedInputSchema, reloadedLoaderInputSchema);
}
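// Several of these tests bind the text loader to an InputData class that is not included in this
// excerpt. A minimal sketch of such a class follows; the exact column indices for the adult
// dataset are assumptions, but the shape (a "Label" column plus a fixed-size "Features" vector,
// which is what gives the loader slot names from the header) matches what the tests assert.
public class InputData
{
    [LoadColumn(0)]
    public bool Label { get; set; }

    // Hypothetical column range; the real test class may read different columns.
    [LoadColumn(1, 6), VectorType(6)]
    public float[] Features { get; set; }
}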
public void FitPipelineSaveModelAndPredict()
{
    var mlContext = new MLContext(seed: 1);

    // Get the dataset.
    var data = mlContext.Data.LoadFromTextFile<HousingRegression>(
        TestCommon.GetDataPath(DataDir, TestDatasets.housing.trainFilename), hasHeader: true);

    // Create a pipeline to train on the housing data.
    var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
        .Append(mlContext.Regression.Trainers.FastTree(
            new FastTreeRegressionTrainer.Options { NumberOfThreads = 1, NumberOfTrees = 10 }));

    // Fit the pipeline.
    var model = pipeline.Fit(data);

    var modelPath = TestCommon.DeleteOutputPath(OutDir, "fitPipelineSaveModelAndPredict.zip");

    // Save model to a file.
    mlContext.Model.Save(model, data.Schema, modelPath);

    // Load model from a file.
    ITransformer serializedModel;
    using (var file = File.OpenRead(modelPath))
    {
        serializedModel = mlContext.Model.Load(file, out var serializedSchema);
        TestCommon.CheckSameSchemas(data.Schema, serializedSchema);
    }

    // Create prediction engines and test predictions.
    var originalPredictionEngine = mlContext.Model.CreatePredictionEngine<HousingRegression, ScoreColumn>(model);
    var serializedPredictionEngine = mlContext.Model.CreatePredictionEngine<HousingRegression, ScoreColumn>(serializedModel);

    // Take a handful of examples out of the dataset and compute predictions.
    var dataEnumerator = mlContext.Data.CreateEnumerable<HousingRegression>(mlContext.Data.TakeRows(data, 5), false);
    foreach (var row in dataEnumerator)
    {
        var originalPrediction = originalPredictionEngine.Predict(row);
        var serializedPrediction = serializedPredictionEngine.Predict(row);

        // Check that the predictions are identical.
        Assert.Equal(originalPrediction.Score, serializedPrediction.Score);
    }
}
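// The prediction engines above map to a ScoreColumn output class that is not part of this
// excerpt. A minimal sketch is shown below; it assumes the class simply surfaces the regression
// trainer's "Score" output column, which is all the test reads back.
public class ScoreColumn
{
    public float Score { get; set; }
}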
public void SaveCompositeLoaderAndLoad()
{
    var mlContext = new MLContext();
    var file = new MultiFileSource(TestCommon.GetDataPath(DataDir, TestDatasets.adult.trainFilename));
    var loader = mlContext.Data.CreateTextLoader<InputData>(hasHeader: true, dataSample: file);
    var composite = loader.Append(mlContext.Transforms.NormalizeMinMax("Features"));
    var loaderWithEmbeddedModel = composite.Fit(file);

    string modelPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-model.zip");
    mlContext.Model.Save(null, loaderWithEmbeddedModel, modelPath);

    Load(mlContext, modelPath, out var loadedWithSchema, out var loadedSchema,
        out var loadedWithLoader, out var loadedLoaderWithTransformer);
    // Because we saved the transform model as part of the composite loader, with no separate
    // transforms, the loaded transformer should be an empty transformer chain: the "model,"
    // such as it is, has been folded into the loader.
    Assert.Empty(Assert.IsType<TransformerChain<ITransformer>>(loadedWithSchema));
    Assert.Empty(Assert.IsType<TransformerChain<ITransformer>>(loadedWithLoader));

    var expectedSchema = loaderWithEmbeddedModel.GetOutputSchema();
    Assert.True(expectedSchema.Count == 3);
    Assert.NotNull(expectedSchema.GetColumnOrNull("Label"));
    Assert.NotNull(expectedSchema.GetColumnOrNull("Features"));
    Assert.True(expectedSchema["Features"].HasSlotNames());

    TestCommon.CheckSameSchemas(loaderWithEmbeddedModel.GetOutputSchema(), loadedSchema);
    var schemaFromLoadedLoader = loadedLoaderWithTransformer.GetOutputSchema();
    TestCommon.CheckSameSchemas(loaderWithEmbeddedModel.GetOutputSchema(), schemaFromLoadedLoader);

    // The loader itself should be a composite data loader, and its single transformer
    // should be the normalizing transformer.
    var compositeLoader = Assert.IsType<CompositeDataLoader<IMultiStreamSource, ITransformer>>(loadedLoaderWithTransformer);
    var chainFromLoader = compositeLoader.Transformer;
    Assert.IsType<NormalizingTransformer>(Assert.Single(compositeLoader.Transformer));
}
public void LoadModelAndExtractPredictor()
{
    var mlContext = new MLContext(1);
    var file = new MultiFileSource(TestCommon.GetDataPath(DataDir, TestDatasets.adult.trainFilename));
    var loader = mlContext.Data.CreateTextLoader<InputData>(hasHeader: true, dataSample: file);
    var data = loader.Load(file);

    // Pipeline.
    var pipeline = mlContext.BinaryClassification.Trainers.Gam();
    // Define the same pipeline starting with the loader.
    var pipeline1 = loader.Append(mlContext.BinaryClassification.Trainers.Gam());

    // Train.
    var transformerModel = pipeline.Fit(data);
    var compositeLoaderModel = pipeline1.Fit(file);

    // Save and reload the "same" model with some differences in structure.
    // In this case we are saving the transformer model, but *not* the loader, just the schema from that loader.
    string modelAndSchemaPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-model-schema.zip");
    mlContext.Model.Save(transformerModel, data.Schema, modelAndSchemaPath);

    // In this case we have combined the loader with the transformer model to form a "composite" loader,
    // and are just saving that one loader to this file.
    string compositeLoaderModelPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-composite-model.zip");
    mlContext.Model.Save(null, compositeLoaderModel, compositeLoaderModelPath);

    // In this case we are saving the transformer model, as well as the associated data loader.
    string loaderAndTransformerModelPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-loader-transformer.zip");
    mlContext.Model.Save(transformerModel, loader, loaderAndTransformerModelPath);

    ITransformer loadedTransformerModel;
    IDataLoader<IMultiStreamSource> loadedCompositeLoader;
    ITransformer loadedTransformerModel1;
    using (var fs = File.OpenRead(modelAndSchemaPath))
        loadedTransformerModel = mlContext.Model.Load(fs, out var loadedSchema);
    using (var fs = File.OpenRead(compositeLoaderModelPath))
    {
        // This model can be loaded either as a composite data loader,
        // a transformer model + an input schema, or a transformer model + a data loader.
        var t = mlContext.Model.LoadWithDataLoader(fs, out loadedCompositeLoader);
        // This is a bit strange, as it seems to test that we can reload twice from the same
        // stream, opened only once, which as far as I know is not really a requirement
        // of the design or API, but we are nonetheless testing it. If this winds up failing,
        // I'm not sure we should really insist on this as a design requirement.
        var t1 = mlContext.Model.Load(fs, out var s);

        TestCommon.CheckSameSchemas(loadedCompositeLoader.GetOutputSchema(), s);
        // We combined the GAM with the loader, so the remaining chain should just be empty.
        Assert.Empty(Assert.IsType<TransformerChain<ITransformer>>(t));
        Assert.Empty(Assert.IsType<TransformerChain<ITransformer>>(t1));
    }
    using (var fs = File.OpenRead(loaderAndTransformerModelPath))
    {
        // This model can be loaded either as a composite data loader,
        // a transformer model + an input schema, or a transformer model + a data loader.
        var t = mlContext.Model.Load(fs, out var s);
        TestCommon.CheckSameSchemas(loader.GetOutputSchema(), s);

        loadedTransformerModel1 = mlContext.Model.LoadWithDataLoader(fs, out var l);
    }
}
public void TestSvmLightLoaderAndSaverWithTermMapping()
{
    // Test with a term mapping, instead of the actual SVM^light format that
    // requires positive integers. Also check that qid works here.
    var path = CreateDataset("-data.txt", new string[]
    {
        "1 qid:1 aurora:3.14159 beachwood:123",
        "-1 qid:5 beachwood:345 chagrin:-21",
    });
    var model = ML.Data.CreateSvmLightLoaderWithFeatureNames(dataSample: new MultiFileSource(path));
    var data = model.Load(path);
    Assert.True(data.Schema["Features"].Type.GetValueCount() == 3);

    var schemaDef = SchemaDefinition.Create(typeof(SvmLightOutput));
    schemaDef["Features"].ColumnType = new VectorDataViewType(NumberDataViewType.Single, 3);
    schemaDef["Features"].AddAnnotation(
        AnnotationUtils.Kinds.SlotNames,
        new VBuffer<ReadOnlyMemory<char>>(3, new[] { "aurora".AsMemory(), "beachwood".AsMemory(), "chagrin".AsMemory() }),
        new VectorDataViewType(TextDataViewType.Instance, 3));
    var expectedData = ML.Data.LoadFromEnumerable(new SvmLightOutput[]
    {
        new SvmLightOutput() { Label = 1, Weight = 1, GroupId = 1, Features = new VBuffer<float>(3, 2, new[] { 3.14159f, 123f }, new[] { 0, 1 }) },
        new SvmLightOutput() { Label = -1, Weight = 1, GroupId = 5, Features = new VBuffer<float>(3, 2, new[] { 345f, -21f }, new[] { 1, 2 }) },
    }, schemaDef);
    CheckSameValues(data, expectedData, checkId: false);
    TestCommon.CheckSameSchemas(data.Schema, expectedData.Schema);

    // Save, reload and compare dataviews again.
    var outputPath = DeleteOutputPath(TestName + "-saved-data.txt");
    using (var stream = File.Create(outputPath))
        ML.Data.SaveInSvmLightFormat(expectedData, stream, zeroBasedIndexing: true, rowGroupColumnName: "GroupId");
    data = ML.Data.LoadFromSvmLightFile(outputPath, zeroBased: true);
    CheckSameValues(data, expectedData, checkId: false);

    // We reload the model, but on a new set of data. The "euclid" key should be
    // ignored as it would not have been detected by the term transform.
    path = CreateDataset("-data2.txt", new string[]
    {
        "-1 aurora:1 chagrin:2",
        "1 chagrin:3 euclid:4"
    });
    data = model.Load(path);
    Assert.True(data.Schema["Features"].Type.GetValueCount() == 3);
    expectedData = ML.Data.LoadFromEnumerable(new SvmLightOutput[]
    {
        new SvmLightOutput() { Label = -1, Weight = 1, Features = new VBuffer<float>(3, 2, new[] { 1f, 2f }, new[] { 0, 2 }) },
        new SvmLightOutput() { Label = 1, Weight = 1, Features = new VBuffer<float>(3, 1, new[] { 3f }, new[] { 2 }) },
    }, schemaDef);
    CheckSameValues(data, expectedData, checkId: false);

    // Save, reload and compare dataviews again.
    outputPath = DeleteOutputPath(TestName + "-saved-data2.txt");
    using (var stream = File.Create(outputPath))
        ML.Data.SaveInSvmLightFormat(expectedData, stream);
    data = ML.Data.LoadFromSvmLightFile(outputPath);
    CheckSameValues(data, expectedData, checkId: false);
}
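// The comparisons above build the expected rows through an SvmLightOutput class that is not part
// of this excerpt. A minimal sketch is given below; the field types mirror how the test uses
// them, but the exact key-type annotation on GroupId (needed to line up with the loader's key
// column) is an assumption.
private sealed class SvmLightOutput
{
    public float Label;
    public float Weight;
    // Assumed annotation: marks GroupId as a key column for the group/qid values.
    [KeyType(ulong.MaxValue - 1)]
    public ulong GroupId;
    public VBuffer<float> Features;
}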
public void SaveAndLoadModelWithLoader()
{
    var mlContext = new MLContext();
    var file = new MultiFileSource(TestCommon.GetDataPath(DataDir, TestDatasets.adult.trainFilename));
    var loader = mlContext.Data.CreateTextLoader<InputData>(hasHeader: true, dataSample: file);
    var data = loader.Load(file);

    // Pipeline.
    var pipeline = mlContext.BinaryClassification.Trainers.Gam();

    // Train.
    var model = pipeline.Fit(data);

    // Save and reload.
    string modelPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-model.zip");
    mlContext.Model.Save(model, loader, modelPath);

    IDataLoader<IMultiStreamSource> loadedLoader;
    ITransformer loadedModelWithoutLoader;
    ITransformer loadedModelWithLoader;
    DataViewSchema loadedSchema;
    using (var fs = File.OpenRead(modelPath))
    {
        loadedModelWithLoader = mlContext.Model.LoadWithDataLoader(fs, out loadedLoader);
        Assert.IsAssignableFrom<ISingleFeaturePredictionTransformer<object>>(loadedModelWithLoader);
        loadedModelWithoutLoader = mlContext.Model.Load(fs, out loadedSchema);
        Assert.IsAssignableFrom<ISingleFeaturePredictionTransformer<object>>(loadedModelWithoutLoader);

        TestCommon.CheckSameSchemas(loadedLoader.GetOutputSchema(), loadedSchema);
    }

    // When using a novel data source other than one derived from the loader, we will not have
    // the slot names.
    data = mlContext.Data.LoadFromEnumerable(new[] { new InputData() });
    data = loadedModelWithoutLoader.Transform(data);
    Assert.False(data.Schema["Features"].HasSlotNames());
    // When we plumb the loaded schema through the transformer, though, we should have slot names.
    var noLoaderTransformedSchema = loadedModelWithoutLoader.GetOutputSchema(loadedSchema);
    Assert.True(noLoaderTransformedSchema["Features"].HasSlotNames());

    data = loadedLoader.Load(file);
    Assert.True(data.Schema["Features"].HasSlotNames());
    VBuffer<ReadOnlyMemory<char>> slotNames = default;
    data.Schema["Features"].GetSlotNames(ref slotNames);
    var ageIndex = FindIndex(slotNames.GetValues(), "age");
    var singleFeaturePredictionTransformer = loadedModelWithLoader as ISingleFeaturePredictionTransformer<object>;
    Assert.NotNull(singleFeaturePredictionTransformer);
    var calibratedModelParameters = singleFeaturePredictionTransformer.Model as CalibratedModelParametersBase;
    Assert.NotNull(calibratedModelParameters);
    var gamModel = calibratedModelParameters.SubModel as GamBinaryModelParameters;
    Assert.NotNull(gamModel);
    var ageBinUpperBounds = gamModel.GetBinUpperBounds(ageIndex);
    var ageBinEffects = gamModel.GetBinEffects(ageIndex);
}
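// FindIndex is a small test helper that is not shown in this excerpt. A minimal sketch of what it
// might look like follows: it scans the slot names for an exact match and returns the slot's
// index, or -1 if the name is not present. The exact signature in the test suite may differ.
private static int FindIndex(ReadOnlySpan<ReadOnlyMemory<char>> values, string value)
{
    for (int i = 0; i < values.Length; ++i)
    {
        // Compare the slot name's character span against the requested name.
        if (values[i].Span.SequenceEqual(value.AsSpan()))
            return i;
    }
    return -1;
}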