public void FeatureSelectionWorkout()
{
    // Source: the small wikipedia detox sample shipped with the tests.
    string dataFile = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var input = TextLoaderStatic.CreateReader(ML, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(dataFile);
    // Same file read with a deliberately wrong schema (text column as float).
    // NOTE(review): this reader is never consumed below — presumably intended
    // for a TestEstimatorCore invalid-input check; confirm.
    var badInput = TextLoaderStatic.CreateReader(ML, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(dataFile);

    // Bag-of-words followed by count-based and mutual-information feature selection.
    var pipeline = new WordBagEstimator(ML, "text", "bag_of_words")
        .AppendCacheCheckpoint(ML)
        .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words", "bag_of_words_count", 10)
        .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words", "bag_of_words_mi", labelColumn: "label")));

    // Baseline the first four rows of the two selected-feature columns.
    var resultPath = GetOutputPath("FeatureSelection", "featureselection.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
        IDataView preview = TakeFilter.Create(ML, pipeline.Fit(input.AsDynamic).Transform(input.AsDynamic), 4);
        preview = ColumnSelectingTransformer.CreateKeep(ML, preview, new[] { "bag_of_words_count", "bag_of_words_mi" });
        using (var stream = File.Create(resultPath))
            DataSaverUtils.SaveDataView(channel, textSaver, preview, stream, keepHidden: true);
    }

    CheckEquality("FeatureSelection", "featureselection.tsv");
    Done();
}
public void TokenizeWithSeparators()
{
    // Load the detox sample as (label, text).
    string dataFile = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var input = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(dataFile).AsDynamic;

    // Tokenize on an explicit separator set rather than the default.
    var tokenizer = new WordTokenizingEstimator(Env, "words", "text", separators: new[] { ' ', '?', '!', '.', ',' });

    // Keep only the first four rows and the tokenized column for the baseline file.
    var firstRows = ML.Data.TakeRows(tokenizer.Fit(input).Transform(input), 4);
    var projected = ML.Transforms.SelectColumns("words").Fit(firstRows).Transform(firstRows);

    var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
    var resultPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");
    using (var channel = Env.Start("save"))
    {
        using (var stream = File.Create(resultPath))
            DataSaverUtils.SaveDataView(channel, textSaver, projected, stream, keepHidden: true);
    }
    CheckEquality("Text", "tokenizedWithSeparators.tsv");
    Done();
}
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string dataFile = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var input = TextLoaderStatic.CreateReader(ML, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(dataFile);
    // Same file with a wrong schema, used to check the estimator rejects bad input.
    var badInput = TextLoaderStatic.CreateReader(ML, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(dataFile);

    // Normalize -> tokenize -> default stopword removal -> custom stopword removal.
    var pipeline = ML.Transforms.Text.NormalizeText("text")
        .Append(ML.Transforms.Text.TokenizeWords("words", "text"))
        .Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words"))
        .Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", "THAT", "bY"));
    TestEstimatorCore(pipeline, input.AsDynamic, invalidInput: badInput.AsDynamic);

    // Baseline the first four rows of the relevant columns.
    var resultPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    var preview = ML.Data.TakeRows(pipeline.Fit(input.AsDynamic).Transform(input.AsDynamic), 4);
    preview = ML.Transforms.SelectColumns("text", "NoDefaultStopwords", "NoStopWords").Fit(preview).Transform(preview);
    using (var stream = File.Create(resultPath))
        ML.Data.SaveAsText(preview, stream, headerRow: true, keepHidden: true);
    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}
public void SimpleImageSmokeTest()
{
    var env = new MLContext(0);
    // One column: a path loaded as text, materialized as an image, then
    // grayscaled, resized to 10x8, and flattened to pixel values.
    var reader = TextLoaderStatic.CreateReader(env,
        ctx => ctx.LoadText(0).LoadAsImage().AsGrayscale().Resize(10, 8).ExtractPixels());

    var schema = reader.AsDynamic.GetOutputSchema();
    Assert.True(schema.TryGetColumnIndex("Data", out int col), "Could not find 'Data' column");
    var type = schema[col].Type;
    var vecType = type as VectorType;
    Assert.True(vecType?.Size > 0, $"Type was supposed to be known size vector but was instead '{type}'");
    Assert.Equal(NumberType.R4, vecType.ItemType);
    // For the 10x8 resize we expect a 3-D vector shaped (3, 8, 10).
    Assert.Equal(3, vecType.Dimensions.Length);
    Assert.Equal(3, vecType.Dimensions[0]);
    Assert.Equal(8, vecType.Dimensions[1]);
    Assert.Equal(10, vecType.Dimensions[2]);

    // Same chain again, but with the image transforms applied as a separate
    // estimator appended after the reader.
    var readAsImage = TextLoaderStatic.CreateReader(env, ctx => ctx.LoadText(0).LoadAsImage());
    var est = readAsImage.MakeNewEstimator().Append(r => r.AsGrayscale().Resize(10, 8).ExtractPixels());
    var pipe = readAsImage.Append(est);
}
public void WhiteningWorkout()
{
    string dataFile = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
    var input = TextLoaderStatic.CreateReader(ML,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true)
        .Read(dataFile);
    // Wrong schema (features as text) to verify the estimator rejects bad input.
    var badInput = TextLoaderStatic.CreateReader(ML,
        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
        separator: ';', hasHeader: true)
        .Read(dataFile);

    // Whitening with default settings, plus PCA whitening keeping 5 components.
    var pipeline = new VectorWhiteningEstimator(ML, "whitened1", "features")
        .Append(new VectorWhiteningEstimator(ML, "whitened2", "features", kind: WhiteningKind.Pca, pcaNum: 5));
    TestEstimatorCore(pipeline, input.AsDynamic, invalidInput: badInput.AsDynamic);

    var resultPath = GetOutputPath("NormalizerEstimator", "whitened.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        var preview = ML.Data.TakeRows(pipeline.Fit(input.AsDynamic).Transform(input.AsDynamic), 4);
        preview = ML.Transforms.SelectColumns("whitened1", "whitened2").Fit(preview).Transform(preview);
        using (var stream = File.Create(resultPath))
            DataSaverUtils.SaveDataView(channel, textSaver, preview, stream, keepHidden: true);
    }

    CheckEquality("NormalizerEstimator", "whitened.tsv", digitsOfPrecision: 4);
    Done();
}
public void GcnWorkout()
{
    string dataFile = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
    var input = TextLoaderStatic.CreateReader(ML,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true)
        .Read(dataFile);
    // Wrong schema (features as text) to verify the estimator rejects bad input.
    var badInput = TextLoaderStatic.CreateReader(ML,
        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
        separator: ';', hasHeader: true)
        .Read(dataFile);

    // Global contrast normalization: once with defaults, once with mean
    // subtraction off, std-dev scaling on, and a scale factor of 3.
    var pipeline = ML.Transforms.Projection.GlobalContrastNormalize("gcnNorm1", "features")
        .Append(ML.Transforms.Projection.GlobalContrastNormalize("gcnNorm2", "features", substractMean: false, useStdDev: true, scale: 3));
    TestEstimatorCore(pipeline, input.AsDynamic, invalidInput: badInput.AsDynamic);

    var resultPath = GetOutputPath("NormalizerEstimator", "gcnNorm.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        var preview = ML.Data.TakeRows(pipeline.Fit(input.AsDynamic).Transform(input.AsDynamic), 4);
        preview = ML.Transforms.SelectColumns("gcnNorm1", "gcnNorm2").Fit(preview).Transform(preview);
        using (var stream = File.Create(resultPath))
            DataSaverUtils.SaveDataView(channel, textSaver, preview, stream, keepHidden: true);
    }

    CheckEquality("NormalizerEstimator", "gcnNorm.tsv", digitsOfPrecision: 4);
    Done();
}
public void WordBagWorkout()
{
    string dataFile = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var input = TextLoaderStatic.CreateReader(ML, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(dataFile);
    var badInput = TextLoaderStatic.CreateReader(ML, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(dataFile);

    // Plain bag-of-words plus a hashed bag with invert hashing enabled.
    var pipeline = new WordBagEstimator(ML, "bag_of_words", "text")
        .Append(new WordHashBagEstimator(ML, "bag_of_wordshash", "text", invertHash: -1));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(pipeline, input.AsDynamic, invalidInput: badInput.AsDynamic);

    var resultPath = GetOutputPath("Text", "bag_of_words.tsv");
    var preview = ML.Data.TakeRows(pipeline.Fit(input.AsDynamic).Transform(input.AsDynamic), 4);
    preview = ML.Transforms.SelectColumns("text", "bag_of_words", "bag_of_wordshash").Fit(preview).Transform(preview);
    using (var stream = File.Create(resultPath))
        ML.Data.SaveAsText(preview, stream, headerRow: true, keepHidden: true);

    CheckEquality("Text", "bag_of_words.tsv");
    Done();
}
public void SdcaWorkout()
{
    var dataFile = GetDataPath("breast-cancer.txt");
    // Cache the loaded data; SDCA makes multiple passes over it.
    var input = TextLoaderStatic.CreateReader(Env, ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10)))
        .Read(dataFile).Cache();

    // Exercise the binary, regression, and multiclass SDCA trainers,
    // each with a loosened convergence tolerance.
    var binaryTrainer = ML.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
        new SdcaBinaryTrainer.Options { ConvergenceTolerance = 1e-2f });
    TestEstimatorCore(binaryTrainer, input.AsDynamic);

    var regressionTrainer = ML.Regression.Trainers.StochasticDualCoordinateAscent(
        new SdcaRegressionTrainer.Options { ConvergenceTolerance = 1e-2f });
    TestEstimatorCore(regressionTrainer, input.AsDynamic);

    var mcTrainer = ML.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(
        new SdcaMultiClassTrainer.Options { ConvergenceTolerance = 1e-2f });
    TestEstimatorCore(mcTrainer, input.AsDynamic);

    Done();
}
public void LpNormWorkout()
{
    string dataFile = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
    var input = TextLoaderStatic.CreateReader(ML,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true)
        .Read(dataFile);
    // Wrong schema (features as text) to verify the estimator rejects bad input.
    var badInput = TextLoaderStatic.CreateReader(ML,
        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
        separator: ';', hasHeader: true)
        .Read(dataFile);

    // Lp normalization with defaults, and L1-norm with mean subtraction.
    var pipeline = new LpNormalizingEstimator(ML, "lpNorm1", "features")
        .Append(new LpNormalizingEstimator(ML, "lpNorm2", "features",
            normKind: LpNormalizingEstimatorBase.NormalizerKind.L1Norm, substractMean: true));
    TestEstimatorCore(pipeline, input.AsDynamic, invalidInput: badInput.AsDynamic);

    var resultPath = GetOutputPath("NormalizerEstimator", "lpNorm.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });
        IDataView preview = TakeFilter.Create(ML, pipeline.Fit(input.AsDynamic).Transform(input.AsDynamic), 4);
        preview = ColumnSelectingTransformer.CreateKeep(Env, preview, new[] { "lpNorm1", "lpNorm2" });
        using (var stream = File.Create(resultPath))
            DataSaverUtils.SaveDataView(channel, textSaver, preview, stream, keepHidden: true);
    }

    CheckEquality("NormalizerEstimator", "lpNorm.tsv");
    Done();
}
public void TestWordEmbeddings()
{
    var dataFile = GetDataPath(TestDatasets.Sentiment.trainFilename);
    // NOTE(review): the test-set path is computed but never used below — confirm intent.
    var testDataFile = GetDataPath(TestDatasets.Sentiment.testFilename);
    var input = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        SentimentText: ctx.LoadText(1)), hasHeader: true)
        .Read(dataFile);

    // Featurize the text, keeping the tokenized words (OutputTokens = true) so
    // the embedding step below has a token column to consume.
    var featurized = new TextFeaturizingEstimator(Env, "SentimentText_Features", "SentimentText", args =>
    {
        args.OutputTokens = true;
        args.KeepPunctuations = false;
        args.UseStopRemover = true;
        args.VectorNormalizer = TextFeaturizingEstimator.TextNormKind.None;
        args.UseCharExtractor = false;
        args.UseWordExtractor = false;
    }).Fit(input.AsDynamic).Transform(input.AsDynamic);

    // Re-enter the statically-typed world over the dynamic output.
    var typed = featurized.AssertStatic(Env, ctx => (
        SentimentText_Features_TransformedText: ctx.Text.VarVector,
        SentimentText: ctx.Text.Scalar,
        label: ctx.Bool.Scalar));

    var pipeline = typed.MakeNewEstimator()
        .Append(row => row.SentimentText_Features_TransformedText.WordEmbeddings());

    TestEstimatorCore(pipeline.AsDynamic, typed.AsDynamic, invalidInput: input.AsDynamic);
    Done();
}
public void RffStatic()
{
    string dataFile = GetDataPath("breast-cancer.txt");
    var reader = TextLoaderStatic.CreateReader(Env, ctx => (
        VectorFloat: ctx.LoadFloat(1, 8),
        Label: ctx.LoadFloat(0)
    ));
    var input = reader.Read(dataFile);

    // Reduce the feature vector to 3 dimensions via a random Fourier transform.
    var pipeline = input.MakeNewEstimator()
        .Append(row => (
            RffVectorFloat: row.VectorFloat.LowerVectorSizeWithRandomFourierTransformation(3, true),
            row.Label));
    TestEstimatorCore(pipeline.AsDynamic, input.AsDynamic);

    // Baseline the first four transformed rows.
    var resultPath = GetOutputPath("Rff", "featurized.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView preview = TakeFilter.Create(Env, pipeline.Fit(input).Transform(input).AsDynamic, 4);
        using (var stream = File.Create(resultPath))
            DataSaverUtils.SaveDataView(channel, textSaver, preview, stream, keepHidden: true);
    }
    CheckEquality("Rff", "featurized.tsv");
    Done();
}
public void NgramWorkout()
{
    string dataFile = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var input = TextLoaderStatic.CreateReader(ML, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(dataFile);
    var badInput = TextLoaderStatic.CreateReader(ML, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(dataFile);

    // Tokenize, map tokens to keys, then extract both plain and hashed n-grams.
    var pipeline = new WordTokenizingEstimator(ML, "text", "text")
        .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
        .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
        .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"));
    TestEstimatorCore(pipeline, input.AsDynamic, invalidInput: badInput.AsDynamic);

    var resultPath = GetOutputPath("Text", "ngrams.tsv");
    var preview = ML.Data.TakeRows(pipeline.Fit(input.AsDynamic).Transform(input.AsDynamic), 4);
    preview = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash").Fit(preview).Transform(preview);
    using (var stream = File.Create(resultPath))
        ML.Data.SaveAsText(preview, stream, headerRow: true, keepHidden: true);
    CheckEquality("Text", "ngrams.tsv");
    Done();
}
// Fixture for the static-pipeline analyzer: each statement below is annotated
// with whether the analyzer is expected to accept or reject it. The shapes are
// intentionally varied (valid tuples, invalid literals, anonymous types), so
// the exact constructs must be preserved verbatim.
public static void Bar()
{
    IHostEnvironment env = null;
    var text = TextLoaderStatic.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1),
        numericFeatures: ctx.LoadFloat(2, 5)));
    var est = text.MakeNewEstimator();
    // This should work.
    est.Append(r => r.text);
    // These should not.
    est.Append(r => 5);
    est.Append(r => new { r.text, bad = 2 });
    // This should work.
    est.Append(r => Tuple.Create(r.text, r.numericFeatures));
    // This should work.
    est.Append(r => (a: r.text, b: r.label, c: (d: r.text, r.label)));
    // This should not, and it should indicate a path to the problematic item.
    est.Append(r => (a: r.text, b: r.label, c: (d: r.text, 5.2f)));
    // Check a different entrance into static land now, with one of the asserts.
    var view = text.Read(null).AsDynamic;
    // Despite the fact that the names are all wrong, this should still work
    // from the point of view of this analyzer.
    view.AssertStatic(env, c => (
        stay: c.KeyU4.TextValues.Scalar,
        awhile: c.KeyU1.I4Values.Vector));
    // However, this should not.
    view.AssertStatic(env, c => (
        and: c.KeyU4.TextValues.Scalar,
        listen: 1l));
}
public void KeyToVectorStatic()
{
    string dataFile = GetDataPath("breast-cancer.txt");
    var reader = TextLoaderStatic.CreateReader(Env, ctx => (
        ScalarString: ctx.LoadText(1),
        VectorString: ctx.LoadText(1, 4)
    ));
    var input = reader.Read(dataFile);

    // Build the key columns with the dynamic (non-pigsty) Term transform, then
    // re-assert static types over the result.
    var keyed = new ValueToKeyMappingEstimator(Env, new[] {
        new ValueToKeyMappingTransformer.ColumnInfo("ScalarString", "A"),
        new ValueToKeyMappingTransformer.ColumnInfo("VectorString", "B") })
        .Fit(input.AsDynamic).Transform(input.AsDynamic);
    var typed = keyed.AssertStatic(Env, ctx => (
        A: ctx.KeyU4.TextValues.Scalar,
        B: ctx.KeyU4.TextValues.Vector));

    // Expand the keys back into vector form, plus a bagged variant.
    var pipeline = typed.MakeNewEstimator()
        .Append(row => (
            ScalarString: row.A.ToVector(),
            VectorString: row.B.ToVector(),
            VectorBaggedString: row.B.ToBaggedVector()
        ));

    TestEstimatorCore(pipeline.AsDynamic, typed.AsDynamic, invalidInput: input.AsDynamic);
    Done();
}
public static void SdcaRegression()
{
    // Download the housing regression dataset; this creates housing.txt in the
    // working directory (open it to inspect the raw data).
    string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

    // MLContext is the entry point for ML.NET operations.
    var mlContext = new MLContext();

    // Describe the text format: label in column 0, six feature columns after it.
    var reader = TextLoaderStatic.CreateReader(mlContext, c => (
        label: c.LoadFloat(0),
        features: c.LoadFloat(1, 6)
    ), separator: '\t', hasHeader: true);

    // Read the data and hold out 10% for testing.
    var data = reader.Read(dataFile);
    var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1);

    // The trained linear model parameters are captured via the onFit callback.
    LinearRegressionModelParameters pred = null;

    var learningPipeline = reader.MakeNewEstimator()
        .Append(r => (r.label, score: mlContext.Regression.Trainers.Sdca(
            r.label,
            r.features,
            l1Threshold: 0f,
            maxIterations: 100,
            onFit: p => pred = p)));

    // Fit the pipeline on the training split.
    var model = learningPipeline.Fit(trainData);

    // Inspect a couple of the learned weights.
    VBuffer<float> weights = default;
    pred.GetFeatureWeights(ref weights);
    var weightsValues = weights.GetValues();
    Console.WriteLine($"weight 0 - {weightsValues[0]}");
    Console.WriteLine($"weight 1 - {weightsValues[1]}");

    // Score the held-out data and report regression metrics.
    var dataWithPredictions = model.Transform(testData);
    var metrics = mlContext.Regression.Evaluate(dataWithPredictions, r => r.label, r => r.score);
    Console.WriteLine($"L1 - {metrics.L1}");                // 3.7226085
    Console.WriteLine($"L2 - {metrics.L2}");                // 24.250636
    Console.WriteLine($"LossFunction - {metrics.LossFn}");  // 24.25063
    Console.WriteLine($"RMS - {metrics.Rms}");              // 4.924493
    Console.WriteLine($"RSquared - {metrics.RSquared}");    // 0.565467
}
public void SdcaMulticlass()
{
    var env = new MLContext(seed: 0);
    var dataFile = GetDataPath(TestDatasets.iris.trainFilename);
    var dataSource = new MultiFileSource(dataFile);
    var ctx = new MulticlassClassificationContext(env);
    var reader = TextLoaderStatic.CreateReader(env,
        c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));
    MulticlassLogisticRegressionModelParameters pred = null;

    // With a custom loss function we no longer get calibrated predictions.
    var loss = new HingeLoss(1);

    var pipeline = reader.MakeNewEstimator()
        .Append(r => (label: r.label.ToKey(), r.features))
        .Append(r => (r.label, preds: ctx.Trainers.Sdca(
            r.label,
            r.features,
            maxIterations: 2,
            loss: loss,
            onFit: p => pred = p)));
    var pipe = reader.Append(pipeline);

    // onFit must not fire until Fit is actually called.
    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);

    // Iris: 3 classes, 4 features, so 3 weight vectors of length 4 and 3 biases.
    VBuffer<float>[] weights = default;
    pred.GetWeights(ref weights, out int n);
    Assert.True(n == 3 && n == weights.Length);
    foreach (var w in weights)
        Assert.True(w.Length == 4);
    var biases = pred.GetBiases();
    Assert.True(biases.Count() == 3);

    var scored = model.Read(dataSource);
    // Just output some data on the schema for fun.
    var schema = scored.AsDynamic.Schema;
    for (int c = 0; c < schema.Count; ++c)
        Console.WriteLine($"{schema[c].Name}, {schema[c].Type}");

    var metrics = ctx.Evaluate(scored, r => r.label, r => r.preds, 2);
    Assert.True(metrics.LogLoss > 0);
    Assert.True(metrics.TopKAccuracy > 0);
}
public void KMeans()
{
    var env = new MLContext(seed: 0, conc: 1);
    var dataFile = GetDataPath(TestDatasets.iris.trainFilename);
    var dataSource = new MultiFileSource(dataFile);
    var reader = TextLoaderStatic.CreateReader(env,
        c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));
    KMeansModelParameters pred = null;

    // Cluster the iris features into 3 clusters on a single thread; the trained
    // model is captured through onFit.
    var pipeline = reader.MakeNewEstimator()
        .AppendCacheCheckpoint()
        .Append(r => (label: r.label.ToKey(), r.features))
        .Append(r => (r.label, r.features, preds: env.Clustering.Trainers.KMeans(
            r.features,
            clustersCount: 3,
            onFit: p => pred = p,
            advancedSettings: s => s.NumThreads = 1)));
    var pipe = reader.Append(pipeline);

    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);

    VBuffer<float>[] centroids = default;
    int k;
    pred.GetClusterCentroids(ref centroids, out k);
    Assert.True(k == 3);

    var scored = model.Read(dataSource);

    // Full evaluation: score, label, and features all supplied.
    var metrics = env.Clustering.Evaluate(scored, r => r.preds.score, r => r.label, r => r.features);
    Assert.NotNull(metrics);
    Assert.InRange(metrics.AvgMinScore, 0.5262, 0.5264);
    Assert.InRange(metrics.Nmi, 0.73, 0.77);
    Assert.InRange(metrics.Dbi, 0.662, 0.667);

    // Without features, DBI comes back as zero.
    metrics = env.Clustering.Evaluate(scored, r => r.preds.score, label: r => r.label);
    Assert.NotNull(metrics);
    Assert.InRange(metrics.AvgMinScore, 0.5262, 0.5264);
    Assert.True(metrics.Dbi == 0.0);

    // Without a label, NMI comes back as NaN.
    metrics = env.Clustering.Evaluate(scored, r => r.preds.score, features: r => r.features);
    Assert.True(double.IsNaN(metrics.Nmi));

    // Score only: no NMI, no DBI.
    metrics = env.Clustering.Evaluate(scored, r => r.preds.score);
    Assert.NotNull(metrics);
    Assert.InRange(metrics.AvgMinScore, 0.5262, 0.5264);
    Assert.True(double.IsNaN(metrics.Nmi));
    Assert.True(metrics.Dbi == 0.0);
}
public static void Bar() { DataReader <IMultiStreamSource, T> Foo1 <T>(Func <TextLoaderStatic.Context, T> m) { IHostEnvironment env = null; // We ought to fail here. return(TextLoaderStatic.CreateReader(env, m)); } DataReader <IMultiStreamSource, T> Foo2 <[IsShape] T>(Func <TextLoaderStatic.Context, T> m)
public static void FastTreeRegression()
{
    // Download the housing regression dataset; this creates housing.txt in the
    // working directory (open it to inspect the raw data).
    string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

    // Create a new ML context, for ML.NET operations. It can be used for
    // exception tracking and logging, as well as the source of randomness.
    var mlContext = new MLContext();

    // Describe the text format: label in column 0, six feature columns after it.
    var reader = TextLoaderStatic.CreateReader(mlContext, c => (
        label: c.LoadFloat(0),
        features: c.LoadFloat(1, 6)
    ), separator: '\t', hasHeader: true);

    var data = reader.Read(dataFile);

    // The trained tree-ensemble parameters are captured via the onFit callback.
    FastTreeRegressionModelParameters pred = null;

    var learningPipeline = reader.MakeNewEstimator()
        .Append(r => (r.label, score: mlContext.Regression.Trainers.FastTree(
            r.label,
            r.features,
            numTrees: 100,              // try: (int) 20-2000
            numLeaves: 20,              // try: (int) 2-128
            minDatapointsInLeaves: 10,  // try: (int) 1-100
            learningRate: 0.2,          // try: (float) 0.025-0.4
            onFit: p => pred = p)));

    // 5-fold cross-validation; average the per-fold regression metrics.
    var cvResults = mlContext.Regression.CrossValidate(data, learningPipeline, r => r.label, numFolds: 5);
    var averagedMetrics = (
        L1: cvResults.Select(r => r.metrics.L1).Average(),
        L2: cvResults.Select(r => r.metrics.L2).Average(),
        LossFn: cvResults.Select(r => r.metrics.LossFn).Average(),
        Rms: cvResults.Select(r => r.metrics.Rms).Average(),
        RSquared: cvResults.Select(r => r.metrics.RSquared).Average()
    );
    Console.WriteLine($"L1 - {averagedMetrics.L1}");                // 3.091095
    Console.WriteLine($"L2 - {averagedMetrics.L2}");                // 20.351073
    Console.WriteLine($"LossFunction - {averagedMetrics.LossFn}");  // 20.351074
    Console.WriteLine($"RMS - {averagedMetrics.Rms}");              // 4.478358
    Console.WriteLine($"RSquared - {averagedMetrics.RSquared}");    // 0.754977
}
public void DropSlotsTransform() { string dataPath = GetDataPath("breast-cancer.txt"); var reader = TextLoaderStatic.CreateReader(ML, ctx => ( ScalarFloat: ctx.LoadFloat(1), ScalarDouble: ctx.LoadDouble(1), VectorFloat: ctx.LoadFloat(1, 4), VectorDouble: ctx.LoadDouble(4, 8) )); var data = reader.Read(new MultiFileSource(dataPath)).AsDynamic; var columns = new[]
public void TestGetColumn()
{
    var path = GetDataPath(TestDatasets.breastCancer.trainFilename);
    var env = new MLContext();
    var data = TextLoaderStatic.CreateReader(env, ctx => (
        floatScalar: ctx.LoadFloat(1),
        floatVector: ctx.LoadFloat(2, 6),
        stringScalar: ctx.LoadText(4),
        stringVector: ctx.LoadText(5, 7)
    )).Read(path);

    // Asserts that the action throws one of the expected argument/state errors,
    // possibly wrapped (arbitrarily deep) in a TargetInvocationException.
    void MustFail(Action action)
    {
        try
        {
            action();
            Assert.False(true); // should have thrown
        }
        catch (ArgumentOutOfRangeException) { }
        catch (InvalidOperationException) { }
        catch (TargetInvocationException ex)
        {
            // Walk to the innermost exception and verify it is one we expect.
            Exception e = ex;
            while (e.InnerException != null)
                e = e.InnerException;
            Assert.True(e is ArgumentOutOfRangeException || e is InvalidOperationException);
            Assert.True(e.IsMarked());
        }
    }

    // Compatible type requests succeed.
    var enum1 = data.AsDynamic.GetColumn<float>(env, "floatScalar").ToArray();
    var enum2 = data.AsDynamic.GetColumn<float[]>(env, "floatVector").ToArray();
    var enum3 = data.AsDynamic.GetColumn<VBuffer<float>>(env, "floatVector").ToArray();
    var enum4 = data.AsDynamic.GetColumn<string>(env, "stringScalar").ToArray();
    var enum5 = data.AsDynamic.GetColumn<string[]>(env, "stringVector").ToArray();

    // Mismatched type requests must fail.
    MustFail(() => data.AsDynamic.GetColumn<float[]>(env, "floatScalar"));
    MustFail(() => data.AsDynamic.GetColumn<int[]>(env, "floatVector"));
    MustFail(() => data.AsDynamic.GetColumn<int>(env, "floatScalar"));
    MustFail(() => data.AsDynamic.GetColumn<int?>(env, "floatScalar"));
    MustFail(() => data.AsDynamic.GetColumn<string>(env, "floatScalar"));

    // Statically-typed accessors.
    var enum8 = data.GetColumn(r => r.floatScalar);
    var enum9 = data.GetColumn(r => r.floatVector);
    var enum10 = data.GetColumn(r => r.stringScalar);
    var enum11 = data.GetColumn(r => r.stringVector);
}
public void MultiClassNaiveBayesTrainer()
{
    var env = new MLContext(seed: 0);
    var dataFile = GetDataPath(TestDatasets.iris.trainFilename);
    var dataSource = new MultiFileSource(dataFile);
    var ctx = new MulticlassClassificationContext(env);
    var reader = TextLoaderStatic.CreateReader(env,
        c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));
    MultiClassNaiveBayesModelParameters pred = null;

    // Key-ify the label, then train naive Bayes, capturing the model via onFit.
    var pipeline = reader.MakeNewEstimator()
        .Append(r => (label: r.label.ToKey(), r.features))
        .Append(r => (r.label, preds: ctx.Trainers.MultiClassNaiveBayesTrainer(
            r.label,
            r.features,
            onFit: p => pred = p)));
    var pipe = reader.Append(pipeline);

    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);

    // Iris: 3 labels and 4 features; the histograms must at least cover those counts.
    int[] labelHistogram = default;
    int[][] featureHistogram = default;
    pred.GetLabelHistogram(ref labelHistogram, out int labelCount1);
    pred.GetFeatureHistogram(ref featureHistogram, out int labelCount2, out int featureCount);
    Assert.True(labelCount1 == 3 && labelCount1 == labelCount2 && labelCount1 <= labelHistogram.Length);
    for (int i = 0; i < labelCount1; i++)
        Assert.True(featureCount == 4 && (featureCount <= featureHistogram[i].Length));

    var scored = model.Read(dataSource);
    // Just output some data on the schema for fun.
    var schema = scored.AsDynamic.Schema;
    for (int c = 0; c < schema.Count; ++c)
        Console.WriteLine($"{schema[c].Name}, {schema[c].Type}");

    var metrics = ctx.Evaluate(scored, r => r.label, r => r.preds, 2);
    Assert.True(metrics.LogLoss > 0);
    Assert.True(metrics.TopKAccuracy > 0);
}
public void CategoricalHashStatic()
{
    string dataFile = GetDataPath("breast-cancer.txt");
    var reader = TextLoaderStatic.CreateReader(Env, ctx => (
        ScalarString: ctx.LoadText(1),
        VectorString: ctx.LoadText(1, 4)));
    var input = reader.Read(dataFile);

    // An in-memory enumerable with an incompatible schema, used as invalid input.
    var wrongCollection = new[] {
        new TestClass() { A = "1", B = "2", C = "3", },
        new TestClass() { A = "4", B = "5", C = "6" } };
    var badInput = ML.Data.ReadFromEnumerable(wrongCollection);

    // Exercise every one-hot-hash output kind over scalar, vector, and
    // variable-length vector inputs.
    var pipeline = input.MakeNewEstimator()
        .Append(row => (
            row.ScalarString,
            row.VectorString,
            // Create a VarVector column
            VarVectorString: row.ScalarString.TokenizeText()))
        .Append(row => (
            A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
            B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
            C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag),
            D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin),
            E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin),
            F: row.VarVectorString.OneHotHashEncoding()
        ));
    TestEstimatorCore(pipeline.AsDynamic, input.AsDynamic, invalidInput: badInput);

    var resultPath = GetOutputPath("CategoricalHash", "featurized.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        var preview = TakeFilter.Create(Env, pipeline.Fit(input).Transform(input).AsDynamic, 4);
        var kept = ColumnSelectingTransformer.CreateKeep(Env, preview, new[] { "A", "B", "C", "D", "E", "F" });
        using (var stream = File.Create(resultPath))
            DataSaverUtils.SaveDataView(channel, textSaver, kept, stream, keepHidden: true);
    }
    CheckEquality("CategoricalHash", "featurized.tsv");
    Done();
}
public void SdcaBinaryClassification()
{
    var env = new MLContext(seed: 0);
    var dataFile = GetDataPath(TestDatasets.breastCancer.trainFilename);
    var dataSource = new MultiFileSource(dataFile);
    var ctx = new BinaryClassificationContext(env);
    var reader = TextLoaderStatic.CreateReader(env,
        c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

    // Capture both the linear model and its calibrator through onFit.
    LinearBinaryModelParameters pred = null;
    ParameterMixingCalibratedPredictor cali = null;

    var pipeline = reader.MakeNewEstimator()
        .Append(r => (r.label, preds: ctx.Trainers.Sdca(
            r.label,
            r.features,
            maxIterations: 2,
            onFit: (p, c) => { pred = p; cali = c; },
            advancedSettings: s => s.NumThreads = 1)));
    var pipe = reader.Append(pipeline);

    // onFit must not fire until Fit is actually called.
    Assert.Null(pred);
    Assert.Null(cali);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);
    Assert.NotNull(cali);
    // 9 input features, so we ought to have 9 weights.
    Assert.Equal(9, pred.Weights.Count);

    var scored = model.Read(dataSource);
    var metrics = ctx.Evaluate(scored, r => r.label, r => r.preds);
    // Run a sanity check against a few of the metrics.
    Assert.InRange(metrics.Accuracy, 0, 1);
    Assert.InRange(metrics.Auc, 0, 1);
    Assert.InRange(metrics.Auprc, 0, 1);
    Assert.InRange(metrics.LogLoss, 0, double.PositiveInfinity);
    Assert.InRange(metrics.Entropy, 0, double.PositiveInfinity);

    // Just output some data on the schema for fun.
    var schema = scored.AsDynamic.Schema;
    for (int c = 0; c < schema.Count; ++c)
        Console.WriteLine($"{schema[c].Name}, {schema[c].Type}");
}
public void LdaWorkout()
{
    IHostEnvironment env = new MLContext(seed: 42, conc: 1);
    string dataFile = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var input = TextLoaderStatic.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(dataFile);
    var badInput = TextLoaderStatic.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(dataFile);

    // Bag-of-words followed by a 10-topic LDA over the counts.
    var pipeline = new WordBagEstimator(env, "text", "bag_of_words")
        .Append(new LatentDirichletAllocationEstimator(env, "bag_of_words", "topics", 10,
            numIterations: 10, resetRandomGenerator: true));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // In this test it manifests because of the WordBagEstimator in the estimator chain
    // TestEstimatorCore(pipeline, input.AsDynamic, invalidInput: badInput.AsDynamic);

    var resultPath = GetOutputPath("Text", "ldatopics.tsv");
    using (var channel = env.Start("save"))
    {
        var textSaver = new TextSaver(env, new TextSaver.Arguments { Silent = true, OutputHeader = false, Dense = true });
        var transformer = pipeline.Fit(input.AsDynamic);
        var transformed = transformer.Transform(input.AsDynamic);
        IDataView preview = TakeFilter.Create(env, transformed, 4);
        preview = ColumnSelectingTransformer.CreateKeep(env, preview, new[] { "topics" });
        using (var stream = File.Create(resultPath))
            DataSaverUtils.SaveDataView(channel, textSaver, preview, stream, keepHidden: true);
        // The topic column must be a vector of exactly 10 topics.
        Assert.Equal(10, (preview.Schema[0].Type as VectorType)?.Size);
    }

    // Disabling this check due to the following issue with consistency of output:
    // `seed` specified in ConsoleEnvironment has no effect.
    // https://github.com/dotnet/machinelearning/issues/1004
    // On a single box, setting `s.ResetRandomGenerator = true` works, but it fails on the build server.
    // CheckEquality("Text", "ldatopics.tsv");
    Done();
}
void TestNgramCompatColumns()
{
    // Load a model saved by an older version and verify its transforms still
    // apply to fresh data and produce the expected "Features" column.
    string modelFile = GetDataPath("backcompat/ngram.zip");
    string dataFile = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var input = TextLoaderStatic.CreateReader(ML, ctx => (
        Sentiment: ctx.LoadBool(0),
        SentimentText: ctx.LoadText(1)), hasHeader: true)
        .Read(dataFile);
    using (FileStream stream = File.OpenRead(modelFile))
    {
        var result = ModelFileUtils.LoadTransforms(Env, input.AsDynamic, stream);
        var featureColumn = result.Schema.GetColumnOrNull("Features");
        Assert.NotNull(featureColumn);
    }
}
public void OnlineLinearWorkout()
{
    // Exercises the three online linear learners (OGD regression, averaged perceptron,
    // linear SVM) on the breast-cancer data, including incremental re-training of each
    // trainer on its own previously fitted model.
    var dataPath = GetDataPath("breast-cancer.txt");

    var regressionData = TextLoaderStatic.CreateReader(ML, ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10)))
        .Read(dataPath);

    var regressionPipe = regressionData.MakeNewEstimator()
        .Append(r => (r.Label, Features: r.Features.Normalize()));

    var regressionTrainData = regressionPipe.Fit(regressionData).Transform(regressionData).AsDynamic;

    var ogdTrainer = new OnlineGradientDescentTrainer(ML, "Label", "Features");
    TestEstimatorCore(ogdTrainer, regressionTrainData);
    var ogdModel = ogdTrainer.Fit(regressionTrainData);
    // Continue training from the previously fitted OGD model.
    ogdTrainer.Train(regressionTrainData, ogdModel.Model);

    var binaryData = TextLoaderStatic.CreateReader(ML, ctx => (Label: ctx.LoadBool(0), Features: ctx.LoadFloat(1, 10)))
        .Read(dataPath);

    var binaryPipe = binaryData.MakeNewEstimator()
        .Append(r => (r.Label, Features: r.Features.Normalize()));

    var binaryTrainData = binaryPipe.Fit(binaryData).Transform(binaryData).AsDynamic;

    var apTrainer = new AveragedPerceptronTrainer(ML, "Label", "Features",
        lossFunction: new HingeLoss(), advancedSettings: s =>
        {
            s.LearningRate = 0.5f;
        });
    TestEstimatorCore(apTrainer, binaryTrainData);
    var apModel = apTrainer.Fit(binaryTrainData);
    // Continue training from the previously fitted perceptron model.
    apTrainer.Train(binaryTrainData, apModel.Model);

    var svmTrainer = new LinearSvmTrainer(ML, "Label", "Features");
    TestEstimatorCore(svmTrainer, binaryTrainData);
    var svmModel = svmTrainer.Fit(binaryTrainData);
    // BUGFIX: previously this passed apModel.Model (copy-paste from the perceptron block
    // above); the SVM must continue training from its OWN fitted model.
    svmTrainer.Train(binaryTrainData, svmModel.Model);

    Done();
}
public static void Bar()
{
    // Compile-time/analyzer fixture: these statically-typed mapping shapes are expected
    // to be accepted (or specifically diagnosed) by the static-pipeline analyzer.
    IHostEnvironment env = null;
    var loader = TextLoaderStatic.CreateReader(env, ctx => new
    {
        Label = ctx.LoadBool(0),
        Text = ctx.LoadText(1),
        NumericFeatures = ctx.LoadFloat(2, 5)
    });

    var estimator = loader.MakeNewEstimator();
    // This should work.
    estimator.Append(r => new { r.Text });

    IDataView dataView = null;
    dataView.AssertStatic(env, c => new Class1(c.I4.Scalar, c.Bool.Vector));
    dataView.AssertStatic(env, c => new Class2 { F1 = c.I4.Scalar, F2 = c.Bool.Vector });
    dataView.AssertStatic(env, c => new Class3<Class2>
    {
        F1 = new Class1(c.I4.Scalar, c.Bool.Vector),
        F2 = new Class2 { F1 = c.I4.Scalar, F2 = c.Bool.Vector }
    });
    dataView.AssertStatic(env, c => new Class4 { F1 = c.I4.Scalar });
    dataView.AssertStatic<Class5>(env, c => null);
    dataView.AssertStatic(env, c => new Class6(c.I4.Scalar, c.Bool.Vector));
    dataView.AssertStatic(env, c => new Class7 { F2 = c.Bool.Vector });
    dataView.AssertStatic(env, c => new Class8(c.I4.Scalar, c.Bool.Vector));
    dataView.AssertStatic(env, c => new Class9 { F1 = c.I4.Scalar, F2 = c.Bool.Vector });
    dataView.AssertStatic(env, c => new Class10(c.I4.Scalar, c.Bool.Vector));
    dataView.AssertStatic(env, c => new Class11(c.I4.Scalar, c.Bool.Vector, c.Bool.Vector));

    // This is wrong but should not fail with our diagnostic since there is a deeper problem that the class
    // simply is not there.
    var missingLoader = TextLoaderStatic.CreateReader(env, ctx => new MissingClass(ctx.LoadText(0)));
}
public void TestTensorFlowStaticWithSchema()
{
    // Loads a frozen CIFAR TF graph, derives the expected image dimensions from the
    // graph's input schema, scores 4 images, and checks the 10-class output per row.
    const string modelLocation = "cifar_model/frozen_model.pb";

    var mlContext = new MLContext(seed: 1, conc: 1);
    var tensorFlowModel = TensorFlowUtils.LoadTensorFlowModel(mlContext, modelLocation);
    var schema = tensorFlowModel.GetInputSchema();
    Assert.True(schema.TryGetColumnIndex("Input", out int column));
    var inputType = (VectorType)schema[column].Type;
    var imageHeight = inputType.Dimensions[0];
    var imageWidth = inputType.Dimensions[1];

    var dataFile = GetDataPath("images/images.tsv");
    var imageFolder = Path.GetDirectoryName(dataFile);
    var data = TextLoaderStatic.CreateReader(mlContext, ctx => (
            imagePath: ctx.LoadText(0),
            name: ctx.LoadText(1)))
        .Read(dataFile);

    // Note that CamelCase column names are there to match the TF graph node names.
    var pipe = data.MakeNewEstimator()
        .Append(row => (
            row.name,
            Input: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleave: true)))
        .Append(row => (row.name, Output: row.Input.ApplyTensorFlowGraph(tensorFlowModel)));

    TestEstimatorCore(pipe.AsDynamic, data.AsDynamic);

    var result = pipe.Fit(data).Transform(data).AsDynamic;
    result.Schema.TryGetColumnIndex("Output", out int output);
    using (var cursor = result.GetRowCursor(result.Schema["Output"]))
    {
        var scores = default(VBuffer<float>);
        var getter = cursor.GetGetter<VBuffer<float>>(output);
        var rowCount = 0;
        while (cursor.MoveNext())
        {
            getter(ref scores);
            Assert.Equal(10, scores.Length);
            rowCount++;
        }
        Assert.Equal(4, rowCount);
    }
}
[ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // x86 fails with "An attempt was made to load a program with an incorrect format."
public void OnnxStatic()
{
    // Scores 4 images through the SqueezeNet ONNX model via the static pipeline API
    // and verifies each row yields a 1000-class softmax vector.
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        // ONNX scoring is only exercised on Windows in this test configuration.
        return;
    }

    var modelFile = "squeezenet/00000001/model.onnx";

    var env = new MLContext(conc: 1);
    var imageHeight = 224;
    var imageWidth = 224;
    var dataFile = GetDataPath("images/images.tsv");
    var imageFolder = Path.GetDirectoryName(dataFile);

    var data = TextLoaderStatic.CreateReader(env, ctx => (
            imagePath: ctx.LoadText(0),
            name: ctx.LoadText(1)))
        .Read(dataFile);

    // Note that CamelCase column names are there to match the TF graph node names.
    var pipe = data.MakeNewEstimator()
        .Append(row => (
            row.name,
            data_0: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleaveArgb: true)))
        .Append(row => (row.name, softmaxout_1: row.data_0.ApplyOnnxModel(modelFile)));

    TestEstimatorCore(pipe.AsDynamic, data.AsDynamic);

    var result = pipe.Fit(data).Transform(data).AsDynamic;
    result.Schema.TryGetColumnIndex("softmaxout_1", out int output);
    using (var cursor = result.GetRowCursor(result.Schema["softmaxout_1"]))
    {
        var scores = default(VBuffer<float>);
        var getter = cursor.GetGetter<VBuffer<float>>(output);
        var rowCount = 0;
        while (cursor.MoveNext())
        {
            getter(ref scores);
            Assert.Equal(1000, scores.Length);
            rowCount++;
        }
        Assert.Equal(4, rowCount);
    }
}