// Runs TestEstimatorCore against PrincipalComponentAnalysisEstimator twice:
// first with only rank and seed set, then with a weight column, oversampling
// and centering disabled.
public void PcaWorkout()
{
    // Valid input: "features" is loaded as floats from columns 1..10.
    var validReader = TextLoaderStatic.CreateReader(_env,
        c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadFloat(1, 10)),
        separator: ';', hasHeader: true);
    var validData = validReader.Read(_dataSource);

    // Same layout, except "features" is loaded as text; handed over as the invalid input.
    var invalidReader = TextLoaderStatic.CreateReader(_env,
        c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadText(1, 10)),
        separator: ';', hasHeader: true);
    var invalidData = invalidReader.Read(_dataSource);

    var seededPca = new PrincipalComponentAnalysisEstimator(_env, "pca", "features", rank: 4, seed: 10);
    TestEstimatorCore(seededPca, validData.AsDynamic, invalidInput: invalidData.AsDynamic);

    var weightedPca = new PrincipalComponentAnalysisEstimator(_env, "pca", "features",
        rank: 3, weightColumn: "weight", overSampling: 2, center: false);
    TestEstimatorCore(weightedPca, validData.AsDynamic, invalidInput: invalidData.AsDynamic);

    Done();
}
// Fits a rank-5 PCA estimator, writes the first 4 rows of the "pca" column to
// PCA/pca.tsv and then calls CheckEquality on that file with 4 digits of precision.
public void TestPcaEstimator()
{
    var reader = TextLoaderStatic.CreateReader(_env,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);
    var data = reader.Read(_dataSource);

    var pcaEstimator = new PrincipalComponentAnalysisEstimator(_env, "features", "pca", rank: 5, seed: 1);
    var outputPath = GetOutputPath("PCA", "pca.tsv");

    using (var channel = _env.Start("save"))
    {
        // Fit and transform on the same data, then trim to 4 rows and the "pca" column.
        var transformed = pcaEstimator.Fit(data.AsDynamic).Transform(data.AsDynamic);
        IDataView firstRows = TakeFilter.Create(_env, transformed, 4);
        IDataView pcaOnly = ColumnSelectingTransformer.CreateKeep(_env, firstRows, new[] { "pca" });

        using (var stream = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(channel, _saver, pcaOnly, stream, keepHidden: true);
        }
    }

    CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
    Done();
}
// Builds a static pipeline that loads images, resizes them to 224x224, extracts
// interleaved pixels and runs the ResNet18 DnnImageFeaturizer, then verifies that
// every one of the 4 output rows carries a 512-element "output_1" vector.
public void OnnxStatic()
{
    var env = new MLContext(null, 1);
    var imageHeight = 224;
    var imageWidth = 224;
    var dataFile = GetDataPath("images/images.tsv");
    // Image paths are resolved relative to the folder containing the .tsv file.
    var imageFolder = Path.GetDirectoryName(dataFile);

    var data = TextLoaderStatic.CreateReader(env, ctx => (
        imagePath: ctx.LoadText(0),
        name: ctx.LoadText(1)))
        .Read(dataFile);

    var pipe = data.MakeNewEstimator()
        .Append(row => (
            row.name,
            data_0: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleave: true)))
        .Append(row => (row.name, output_1: row.data_0.DnnImageFeaturizer(m => m.ModelSelector.ResNet18(m.Environment, m.OutputColumn, m.InputColumn))));

    TestEstimatorCore(pipe.AsDynamic, data.AsDynamic);

    var result = pipe.Fit(data).Transform(data).AsDynamic;

    // Fail loudly if the column is missing instead of silently continuing with index 0.
    Assert.True(result.Schema.TryGetColumnIndex("output_1", out int output), "Could not find column 'output_1'");

    using (var cursor = result.GetRowCursor(result.Schema["output_1"]))
    {
        var buffer = default(VBuffer<float>);
        var getter = cursor.GetGetter<VBuffer<float>>(output);
        var numRows = 0;
        while (cursor.MoveNext())
        {
            getter(ref buffer);
            Assert.Equal(512, buffer.Length);
            numRows += 1;
        }

        Assert.Equal(4, numRows);
    }
}
// Exercises FeaturizeText (with an "OutputTokens" column requested) through
// TestEstimatorCore, then saves the first 4 rows of the "Data" and "OutputTokens"
// columns to Text/featurized.tsv and calls CheckEquality on that file.
public void TextFeaturizerWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 is loaded as text.
    var validLoader = TextLoaderStatic.CreateLoader(ML, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = validLoader.Load(sentimentDataPath);

    // Invalid input: the same column is loaded as float instead.
    var invalidLoader = TextLoaderStatic.CreateLoader(ML, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true);
    var invalidData = invalidLoader.Load(sentimentDataPath).AsDynamic;

    var feat = data.MakeNewEstimator()
        .Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options { OutputTokensColumnName = "OutputTokens", }));

    TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("Text", "featurized.tsv");
    using (var channel = ((IHostEnvironment)ML).Start("save"))
    {
        var textSaver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });

        var featurized = feat.Fit(data).Transform(data).AsDynamic;
        var firstRows = ML.Data.TakeRows(featurized, 4);
        var selected = ML.Transforms.SelectColumns("Data", "OutputTokens").Fit(firstRows).Transform(firstRows);

        using (var stream = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(channel, textSaver, selected, stream, keepHidden: true);
        }
    }

    CheckEquality("Text", "featurized.tsv");
    Done();
}
// Cross-validates a two-iteration SDCA multiclass trainer on the iris training file
// and checks that 5 metric sets come back, each with a positive log-loss.
public void CrossValidate()
{
    var env = new MLContext(seed: 0);
    var dataSource = new MultiFileSource(GetDataPath(TestDatasets.iris.trainFilename));
    var ctx = new MulticlassClassificationContext(env);

    var reader = TextLoaderStatic.CreateReader(env,
        c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

    // The text label is mapped to a key before it reaches the trainer.
    var est = reader.MakeNewEstimator()
        .Append(r => (label: r.label.ToKey(), r.features))
        .Append(r => (r.label, preds: ctx.Trainers.Sdca(
            r.label,
            r.features,
            maxIterations: 2)));

    var foldResults = ctx.CrossValidate(reader.Read(dataSource), est, r => r.label);
    var foldMetrics = foldResults.Select(x => x.metrics).ToArray();

    Assert.Equal(5, foldMetrics.Length);
    Assert.True(foldMetrics.All(x => x.LogLoss > 0));
}
// Runs TestEstimatorCore over four SDCA trainers (calibrated binary, non-calibrated
// binary, regression and multiclass), each configured with ConvergenceTolerance = 1e-2,
// all on the same cached breast-cancer data.
public void SdcaWorkout()
{
    var dataPath = GetDataPath("breast-cancer.txt");

    var reader = TextLoaderStatic.CreateReader(Env,
        ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10)));
    var data = reader.Read(dataPath).Cache();

    var binaryOptions = new SdcaBinaryTrainer.Options { ConvergenceTolerance = 1e-2f };
    var binaryTrainer = ML.BinaryClassification.Trainers.StochasticDualCoordinateAscent(binaryOptions);
    TestEstimatorCore(binaryTrainer, data.AsDynamic);

    var nonCalibratedOptions = new SdcaNonCalibratedBinaryTrainer.Options { ConvergenceTolerance = 1e-2f };
    var nonCalibratedBinaryTrainer = ML.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(nonCalibratedOptions);
    TestEstimatorCore(nonCalibratedBinaryTrainer, data.AsDynamic);

    var regressionOptions = new SdcaRegressionTrainer.Options { ConvergenceTolerance = 1e-2f };
    var regressionTrainer = ML.Regression.Trainers.StochasticDualCoordinateAscent(regressionOptions);
    TestEstimatorCore(regressionTrainer, data.AsDynamic);

    var multiClassOptions = new SdcaMultiClassTrainer.Options { ConvergenceTolerance = 1e-2f };
    var mcTrainer = ML.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(multiClassOptions);
    TestEstimatorCore(mcTrainer, data.AsDynamic);

    Done();
}
// Maps two text columns to keys with the dynamic ValueToKeyMappingEstimator, maps
// them back with the static ToValue(), and checks that the result has the same
// schema and values as the original text columns.
public void KeyToValuePigsty()
{
    string dataPath = GetDataPath("breast-cancer.txt");

    var reader = TextLoaderStatic.CreateReader(Env, ctx => (
        ScalarString: ctx.LoadText(1),
        VectorString: ctx.LoadText(1, 4)
    ));
    var data = reader.Read(dataPath);

    // Non-pigsty Term: produce key columns "A" and "B" through the dynamic API.
    var termEstimator = new ValueToKeyMappingEstimator(Env, new[] {
        new ValueToKeyMappingEstimator.ColumnInfo("A", "ScalarString"),
        new ValueToKeyMappingEstimator.ColumnInfo("B", "VectorString") });
    var dynamicData = termEstimator.Fit(data.AsDynamic).Transform(data.AsDynamic);

    // Re-enter the statically typed world, asserting the expected key column shapes.
    var data2 = dynamicData.AssertStatic(Env, ctx => (
        A: ctx.KeyU4.TextValues.Scalar,
        B: ctx.KeyU4.TextValues.Vector));

    var est = data2.MakeNewEstimator()
        .Append(row => (
            ScalarString: row.A.ToValue(),
            VectorString: row.B.ToValue()));

    // The untransformed text data is passed as the invalid input.
    TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic);

    var data2Transformed = est.Fit(data2).Transform(data2).AsDynamic;

    // Check that term and ToValue are round-trippable.
    var originalView = data.AsDynamic;
    var dataLeft = ML.Transforms.SelectColumns(new[] { "ScalarString", "VectorString" })
        .Fit(originalView).Transform(originalView);
    var dataRight = ML.Transforms.SelectColumns(new[] { "ScalarString", "VectorString" })
        .Fit(data2Transformed).Transform(data2Transformed);

    CheckSameSchemas(dataLeft.Schema, dataRight.Schema);
    CheckSameValues(dataLeft, dataRight);
    Done();
}
// Chains WordBagEstimator and WordHashBagEstimator over the "text" column, saves the
// first 4 rows of the text and both bag columns to Text/bag_of_words.tsv and calls
// CheckEquality on that file.
public void WordBagWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validReader = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = validReader.Read(sentimentDataPath);

    // Only referenced by the disabled TestEstimatorCore call below.
    var invalidReader = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true);
    var invalidData = invalidReader.Read(sentimentDataPath);

    var wordBag = new WordBagEstimator(Env, "text", "bag_of_words");
    var est = wordBag.Append(new WordHashBagEstimator(Env, "text", "bag_of_wordshash", invertHash: -1));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

        var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
        IDataView firstRows = TakeFilter.Create(Env, transformed, 4);
        IDataView selected = ColumnSelectingTransformer.CreateKeep(Env, firstRows,
            new[] { "text", "bag_of_words", "bag_of_wordshash" });

        using (var stream = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(channel, textSaver, selected, stream, keepHidden: true);
        }
    }

    CheckEquality("Text", "bag_of_words.tsv");
    Done();
}
// Applies OneHotEncoding with the Ind, Bag and Bin output kinds to a scalar and a
// vector text column, runs TestEstimatorCore, then saves the first 4 rows of the
// outputs A..E to Categorical/featurized.tsv and calls CheckEquality on that file.
public void CategoricalStatic()
{
    string dataPath = GetDataPath("breast-cancer.txt");

    var reader = TextLoaderStatic.CreateLoader(ML, ctx => (
        ScalarString: ctx.LoadText(1),
        VectorString: ctx.LoadText(1, 4)));
    var data = reader.Load(dataPath);

    // An in-memory collection of TestClass objects serves as the invalid input.
    var wrongCollection = new[]
    {
        new TestClass() { A = 1, B = 2, C = 3, },
        new TestClass() { A = 4, B = 5, C = 6 }
    };
    var invalidData = ML.Data.LoadFromEnumerable(wrongCollection);

    var est = data.MakeNewEstimator()
        .Append(row => (
            A: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Ind),
            B: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Ind),
            C: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bag),
            D: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Bin),
            E: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bin)
        ));

    TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("Categorical", "featurized.tsv");

    var encoded = est.Fit(data).Transform(data).AsDynamic;
    var firstRows = ML.Data.TakeRows(encoded, 4);
    var view = ML.Transforms.SelectColumns("A", "B", "C", "D", "E").Fit(firstRows).Transform(firstRows);

    using (var stream = File.Create(outputPath))
    {
        ML.Data.SaveAsText(view, stream, headerRow: true, keepHidden: true);
    }

    CheckEquality("Categorical", "featurized.tsv");
    Done();
}
// Tokenizes "text", maps the tokens to keys, and extracts both n-grams and hashed
// n-grams from those keys. Runs TestEstimatorCore, then saves the first 4 rows to
// Text/ngrams.tsv and calls CheckEquality on that file.
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validReader = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = validReader.Read(sentimentDataPath);

    // Invalid input: "text" loaded as float.
    var invalidReader = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true);
    var invalidData = invalidReader.Read(sentimentDataPath);

    var tokenizer = new WordTokenizingEstimator(Env, "text", "text");
    var est = tokenizer
        .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
        .Append(new NgramExtractingEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashingEstimator(Env, "terms", "ngramshash"));

    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

        var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
        IDataView firstRows = TakeFilter.Create(Env, transformed, 4);
        IDataView selected = ColumnSelectingTransformer.CreateKeep(Env, firstRows,
            new[] { "text", "terms", "ngrams", "ngramshash" });

        using (var stream = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(channel, textSaver, selected, stream, keepHidden: true);
        }
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
// Exercises FeaturizeText (with OutputTokens switched on through advancedSettings)
// via TestEstimatorCore, then saves the first 4 rows of "Data" and
// "Data_TransformedText" to Text/featurized.tsv and calls CheckEquality on that file.
public void TextFeaturizerWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validReader = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = validReader.Read(sentimentDataPath);

    // Invalid input: "text" loaded as float.
    var invalidReader = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true);
    var invalidData = invalidReader.Read(sentimentDataPath).AsDynamic;

    var feat = data.MakeNewEstimator()
        .Append(row => row.text.FeaturizeText(advancedSettings: s => { s.OutputTokens = true; }));

    TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);

    var outputPath = GetOutputPath("Text", "featurized.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

        var featurized = feat.Fit(data).Transform(data).AsDynamic;
        IDataView firstRows = TakeFilter.Create(Env, featurized, 4);
        IDataView selected = ColumnSelectingTransformer.CreateKeep(Env, firstRows,
            new[] { "Data", "Data_TransformedText" });

        using (var stream = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(channel, textSaver, selected, stream, keepHidden: true);
        }
    }

    CheckEquality("Text", "featurized.tsv");
    Done();
}
// Produces a tokenized text column with TextFeaturizingEstimator (no char/word
// feature extractors, no normalization) and runs TestEstimatorCore on the static
// WordEmbeddings() estimator applied to those tokens.
public void TestWordEmbeddings()
{
    var dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename);
    // Resolved but not used further in this method.
    var testDataPath = GetDataPath(TestDatasets.Sentiment.testFilename);

    var reader = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        SentimentText: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(dataPath);

    var featurizerArgs = new TextFeaturizingEstimator.Arguments()
    {
        Column = new TextFeaturizingEstimator.Column
        {
            Name = "SentimentText_Features",
            Source = new[] { "SentimentText" }
        },
        OutputTokens = true,
        KeepPunctuations = false,
        UsePredefinedStopWordRemover = true,
        VectorNormalizer = TextFeaturizingEstimator.TextNormKind.None,
        CharFeatureExtractor = null,
        WordFeatureExtractor = null,
    };
    var dynamicData = TextFeaturizingEstimator.Create(Env, featurizerArgs, data.AsDynamic);

    // Back to the static API, asserting the columns the embedding step relies on.
    var data2 = dynamicData.AssertStatic(Env, ctx => (
        SentimentText_Features_TransformedText: ctx.Text.VarVector,
        SentimentText: ctx.Text.Scalar,
        label: ctx.Bool.Scalar));

    var est = data2.MakeNewEstimator()
        .Append(row => row.SentimentText_Features_TransformedText.WordEmbeddings());

    // The raw, un-featurized data is passed as the invalid input.
    TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic);
    Done();
}
// Normalizes and tokenizes "text", then removes stop words twice from the tokens:
// once with the default list and once with a custom list. Runs TestEstimatorCore,
// saves the first 4 rows to Text/words_without_stopwords.tsv and calls CheckEquality.
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validReader = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = validReader.Read(sentimentDataPath);

    // Invalid input: "text" loaded as float.
    var invalidReader = TextLoaderStatic.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true);
    var invalidData = invalidReader.Read(sentimentDataPath);

    var textCatalog = ML.Transforms.Text;
    var est = textCatalog.NormalizeText("text")
        .Append(textCatalog.TokenizeWords("text", "words"))
        .Append(textCatalog.RemoveDefaultStopWords("words", "NoDefaultStopwords"))
        .Append(textCatalog.RemoveStopWords("words", "NoStopWords", "xbox", "this", "is", "a", "the", "THAT", "bY"));

    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

        var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
        IDataView firstRows = TakeFilter.Create(Env, transformed, 4);
        IDataView selected = ColumnSelectingTransformer.CreateKeep(Env, firstRows,
            new[] { "text", "NoDefaultStopwords", "NoStopWords" });

        using (var stream = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(channel, textSaver, selected, stream, keepHidden: true);
        }
    }

    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}
// Trains an averaged perceptron (hinge loss, 2 iterations) on the breast-cancer data,
// checks that onFit delivered model parameters with 9 weights, and sanity-checks
// that accuracy, AUC and AUPRC fall inside [0, 1].
public void AveragePerceptronCalibration()
{
    var env = new MLContext(seed: 0);
    var dataSource = new MultiFileSource(GetDataPath(TestDatasets.breastCancer.trainFilename));
    var ctx = new BinaryClassificationContext(env);

    var reader = TextLoaderStatic.CreateReader(env,
        c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

    // Filled in by the onFit callback once the trainer has been fitted.
    LinearBinaryModelParameters trainedParameters = null;
    var hingeLoss = new HingeLoss(1);

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, preds: ctx.Trainers.AveragedPerceptron(r.label, r.features, lossFunction: hingeLoss,
            numIterations: 2, onFit: p => trainedParameters = p)));

    var pipe = reader.Append(est);

    // Nothing is trained until Fit runs, so the callback must not have fired yet.
    Assert.Null(trainedParameters);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(trainedParameters);

    // 9 input features, so we ought to have 9 weights.
    Assert.Equal(9, trainedParameters.Weights.Count);

    var scored = model.Read(dataSource);
    var metrics = ctx.Evaluate(scored, r => r.label, r => r.preds);

    // Run a sanity check against a few of the metrics.
    Assert.InRange(metrics.Accuracy, 0, 1);
    Assert.InRange(metrics.Auc, 0, 1);
    Assert.InRange(metrics.Auprc, 0, 1);
}
// Applies LowerVectorSizeWithRandomFourierTransformation(3, true) to an 8-column float
// vector, runs TestEstimatorCore, then saves the first 4 transformed rows to
// Rff/featurized.tsv and calls CheckEquality on that file.
public void RffStatic()
{
    string dataPath = GetDataPath("breast-cancer.txt");

    var reader = TextLoaderStatic.CreateLoader(ML, ctx => (
        VectorFloat: ctx.LoadFloat(1, 8),
        Label: ctx.LoadFloat(0)
    ));
    var data = reader.Load(dataPath);

    var est = data.MakeNewEstimator()
        .Append(row => (
            RffVectorFloat: row.VectorFloat.LowerVectorSizeWithRandomFourierTransformation(3, true),
            row.Label));

    TestEstimatorCore(est.AsDynamic, data.AsDynamic);

    var outputPath = GetOutputPath("Rff", "featurized.tsv");

    var transformed = est.Fit(data).Transform(data).AsDynamic;
    var firstRows = ML.Data.TakeRows(transformed, 4);

    using (var stream = File.Create(outputPath))
    {
        ML.Data.SaveAsText(firstRows, stream, headerRow: true, keepHidden: true);
    }

    CheckEquality("Rff", "featurized.tsv");
    Done();
}
// Chains LpNormalize, GlobalContrastNormalize and VectorWhiteningEstimator over
// "features", runs TestEstimatorCore, then saves the first 4 rows of the three outputs
// (without a header) and calls CheckEquality with 4 digits of precision.
public void LpGcNormAndWhiteningWorkout()
{
    string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);

    var validReader = TextLoaderStatic.CreateReader(ML,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);
    var data = validReader.Read(dataSource);

    // Invalid input: "features" loaded as text.
    var invalidReader = TextLoaderStatic.CreateReader(ML,
        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
        separator: ';', hasHeader: true);
    var invalidData = invalidReader.Read(dataSource);

    var lpNormalizer = ML.Transforms.Projection.LpNormalize("lpnorm", "features");
    var est = lpNormalizer
        .Append(ML.Transforms.Projection.GlobalContrastNormalize("gcnorm", "features"))
        .Append(new VectorWhiteningEstimator(ML, "whitened", "features"));

    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv");
    using (var channel = Env.Start("save"))
    {
        var textSaver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });

        var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
        var firstRows = ML.Data.TakeRows(transformed, 4);
        var selected = ML.Transforms.SelectColumns("lpnorm", "gcnorm", "whitened").Fit(firstRows).Transform(firstRows);

        using (var stream = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(channel, textSaver, selected, stream, keepHidden: true);
        }
    }

    CheckEquality("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
    Done();
}
// Loads two space-separated rows from an in-memory source, verifies the schema and
// the cell values of the loaded view, then appends an estimator that re-maps column
// names and verifies the resulting column types.
public void SimpleTextLoaderCopyColumnsTest()
{
    var env = new MLContext(0);

    const string data = "0 hello 3.14159 -0 2\n"
        + "1 1 2 4 15";
    var dataSource = new BytesStreamSource(data);

    var text = TextLoaderStatic.CreateLoader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1),
        // Open-ended range: if fit correctly, this ought to be equivalent to max of 4, that is, length of 3.
        numericFeatures: ctx.LoadFloat(2, null)),
        dataSource, separator: ' ');

    // The type-safe wrapper around `IDataView` is only useful as an input to the `Fit`
    // functions of the other statically typed wrappers, so the checks below operate
    // on the underlying dynamic `IDataView` instead.
    var textData = text.Load(dataSource).AsDynamic;

    // Asserts that a column with the given name exists in the schema.
    void CheckSchemaHasColumn(DataViewSchema dataSchema, string name)
    {
        Assert.True(dataSchema.GetColumnOrNull(name).HasValue, "Could not find column '" + name + "'");
    }

    var schema = textData.Schema;

    // Every identifier in the loader tuple should have produced a column.
    CheckSchemaHasColumn(schema, "label");
    CheckSchemaHasColumn(schema, "text");
    CheckSchemaHasColumn(schema, "numericFeatures");

    // The columns should carry the expected types.
    Assert.Equal(BooleanDataViewType.Instance, schema["label"].Type);
    Assert.Equal(TextDataViewType.Instance, schema["text"].Type);
    Assert.Equal(new VectorType(NumberDataViewType.Single, 3), schema["numericFeatures"].Type);

    // Walk the rows and inspect the actual values.
    using (var cursor = textData.GetRowCursorForAllColumns())
    {
        var textGetter = cursor.GetGetter<ReadOnlyMemory<char>>(schema["text"]);
        var numericFeaturesGetter = cursor.GetGetter<VBuffer<float>>(schema["numericFeatures"]);
        ReadOnlyMemory<char> currentText = default;
        var labelGetter = cursor.GetGetter<bool>(schema["label"]);
        bool currentLabel = default;
        VBuffer<float> currentNumbers = default;

        // Reads the cursor's current row and compares it with the expected values.
        void CheckValuesSame(bool bl, string tx, float v0, float v1, float v2)
        {
            labelGetter(ref currentLabel);
            textGetter(ref currentText);
            numericFeaturesGetter(ref currentNumbers);

            Assert.True(tx.AsSpan().SequenceEqual(currentText.Span));
            Assert.Equal(bl, currentLabel);
            Assert.Equal(3, currentNumbers.Length);
            Assert.Equal(v0, currentNumbers.GetItemOrDefault(0));
            Assert.Equal(v1, currentNumbers.GetItemOrDefault(1));
            Assert.Equal(v2, currentNumbers.GetItemOrDefault(2));
        }

        Assert.True(cursor.MoveNext(), "Could not move even to first row");
        CheckValuesSame(false, "hello", 3.14159f, -0f, 2f);

        Assert.True(cursor.MoveNext(), "Could not move to second row");
        CheckValuesSame(true, "1", 2f, 4f, 15f);

        Assert.False(cursor.MoveNext(), "Moved to third row, but there should have been only two");
    }

    // Shuffle the names around a little to test the implicit usage of copy columns:
    // "text" now receives the old label, "label" the old numeric features.
    var est = text.MakeNewEstimator().Append(r => (text: r.label, label: r.numericFeatures));
    var newText = text.Append(est);
    var newTextData = newText.Fit(dataSource).Load(dataSource);

    schema = newTextData.AsDynamic.Schema;

    // Both re-mapped columns should exist.
    CheckSchemaHasColumn(schema, "label");
    CheckSchemaHasColumn(schema, "text");

    // Their types should follow the data they were copied from.
    Assert.Equal(BooleanDataViewType.Instance, schema["text"].Type);
    Assert.Equal(new VectorType(NumberDataViewType.Single, 3), schema["label"].Type);
}
// Sample: trains a LightGBM binary classifier on the adult dataset through the
// static pipeline API and prints evaluation metrics for a held-out 10% test split.
public static void LightGbmBinaryClassification()
{
    // Download the adult classification dataset through the samples helper;
    // the helper returns the path of the local file.
    string dataFilePath = SamplesUtils.DatasetUtils.DownloadAdultDataset();

    // Columns as the loader below reads them (zero-based index in the file):
    //  0. Age            (numeric)
    //  1. Workclass      (text)
    //  2. Fnlwgt         (numeric)
    //  3. Education      (text)
    //  4. EducationNum   (numeric)
    //  5. MaritalStatus  (text)
    //  6. Occupation     (text)
    //  7. Relationship   (text)
    //  8. Ethnicity      (text)
    //  9. Sex            (text)
    // 10. CapitalGain    (numeric)
    // 11. CapitalLoss    (numeric)
    // 12. HoursPerWeek   (numeric)
    // 13. NativeCountry  (text)
    // 14. IsOver50K      (boolean, used as the label)

    // Create the MLContext used by every ML.NET operation below.
    var mlContext = new MLContext();

    // Describe the file layout: comma-separated, with a header row.
    var loader = TextLoaderStatic.CreateLoader(
        mlContext,
        c => (
            Age: c.LoadFloat(0),
            Workclass: c.LoadText(1),
            Fnlwgt: c.LoadFloat(2),
            Education: c.LoadText(3),
            EducationNum: c.LoadFloat(4),
            MaritalStatus: c.LoadText(5),
            Occupation: c.LoadText(6),
            Relationship: c.LoadText(7),
            Ethnicity: c.LoadText(8),
            Sex: c.LoadText(9),
            CapitalGain: c.LoadFloat(10),
            CapitalLoss: c.LoadFloat(11),
            HoursPerWeek: c.LoadFloat(12),
            NativeCountry: c.LoadText(13),
            IsOver50K: c.LoadBool(14)),
        separator: ',',
        hasHeader: true);

    // Load everything, then hold 10% back for testing.
    var data = loader.Load(dataFilePath);
    var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);

    // Build the estimator chain:
    //  1. concatenate numeric columns with one-hot encoded categorical columns into "Features";
    //  2. normalize the features and train LightGBM on them;
    //  3. expose the label, the score and the predicted label.
    var learningPipeline = loader.MakeNewEstimator()
        .Append(row => (
            Features: row.Age.ConcatWith(
                row.EducationNum,
                row.MaritalStatus.OneHotEncoding(),
                row.Occupation.OneHotEncoding(),
                row.Relationship.OneHotEncoding(),
                row.Ethnicity.OneHotEncoding(),
                row.Sex.OneHotEncoding(),
                row.HoursPerWeek,
                row.NativeCountry.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10)),
            Label: row.IsOver50K))
        .Append(row => (
            Features: row.Features.Normalize(),
            Label: row.Label,
            Score: mlContext.BinaryClassification.Trainers.LightGbm(
                row.Label,
                row.Features,
                numberOfLeaves: 4,
                minimumExampleCountPerLeaf: 6,
                learningRate: 0.001)))
        .Append(row => (
            Label: row.Label,
            Score: row.Score,
            PredictedLabel: row.Score.predictedLabel));

    // Train on the training split.
    var model = learningPipeline.Fit(trainData);

    // Score the test split and evaluate the predictions.
    var dataWithPredictions = model.Transform(testData);
    var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, row => row.Label, row => row.Score);

    // The trailing numbers are the sample values recorded alongside the original code.
    Console.WriteLine($"Accuracy: {metrics.Accuracy}"); // 0.84
    Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve}"); // 0.89
    Console.WriteLine($"F1 Score: {metrics.F1Score}"); // 0.64
    Console.WriteLine($"Negative Precision: {metrics.NegativePrecision}"); // 0.88
    Console.WriteLine($"Negative Recall: {metrics.NegativeRecall}"); // 0.91
    Console.WriteLine($"Positive Precision: {metrics.PositivePrecision}"); // 0.68
    Console.WriteLine($"Positive Recall: {metrics.PositiveRecall}"); // 0.60
}
// Sample: trains a LightGBM regressor on the housing dataset through the static
// pipeline API, prints the first two feature weights of the trained model and the
// regression metrics for a held-out 10% test split.
public static void Example()
{
    // Download the housing regression dataset through the samples helper;
    // the helper returns the path of the local file.
    string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

    // Create a new ML context for ML.NET operations. It can be used for exception
    // tracking and logging, as well as the source of randomness.
    var mlContext = new MLContext();

    // Describe the file layout: tab-separated, with a header row.
    var loader = TextLoaderStatic.CreateLoader(mlContext, c => (
        label: c.LoadFloat(0),
        features: c.LoadFloat(1, 6)
    ),
    separator: '\t', hasHeader: true);

    // Load everything, then hold 10% back for testing.
    var data = loader.Load(new MultiFileSource(dataFile));
    var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);

    // Receives the trained model parameters through the onFit callback.
    LightGbmRegressionModelParameters trainedParameters = null;

    // Build the estimator.
    var learningPipeline = loader.MakeNewEstimator()
        .Append(r => (r.label, score: mlContext.Regression.Trainers.LightGbm(
            r.label,
            r.features,
            numberOfLeaves: 4,
            minimumExampleCountPerLeaf: 6,
            learningRate: 0.001,
            onFit: p => trainedParameters = p)
        )
    );

    // Train on the training split.
    var model = learningPipeline.Fit(trainData);

    // Inspect the feature weights the model learned.
    VBuffer<float> weights = default;
    trainedParameters.GetFeatureWeights(ref weights);
    var weightsValues = weights.GetValues();
    Console.WriteLine($"weight 0 - {weightsValues[0]}");
    Console.WriteLine($"weight 1 - {weightsValues[1]}");

    // Score the test split and evaluate the predictions.
    var dataWithPredictions = model.Transform(testData);
    var metrics = mlContext.Regression.Evaluate(dataWithPredictions, r => r.label, r => r.score);

    // The trailing numbers are the sample values recorded alongside the original code.
    Console.WriteLine($"L1 - {metrics.MeanAbsoluteError}"); // 4.9669731
    Console.WriteLine($"L2 - {metrics.MeanSquaredError}"); // 51.37296
    Console.WriteLine($"LossFunction - {metrics.LossFunction}"); // 51.37296
    Console.WriteLine($"RMS - {metrics.RootMeanSquaredError}"); // 7.167493
    Console.WriteLine($"RSquared - {metrics.RSquared}"); // 0.079478
}
// Trains a 10-tree FastTree regressor (5 leaves per tree, single-threaded) and pins
// the structure of the first tree: children, split flags, split features,
// thresholds and leaf values.
public void FastTreeRegressionRepresentation()
{
    var env = new MLContext(seed: 0);
    var dataSource = new MultiFileSource(GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename));

    var catalog = new RegressionCatalog(env);
    var reader = TextLoaderStatic.CreateLoader(env,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);

    var opts = new FastTreeRegressionTrainer.Options()
    {
        NumTrees = 10,
        NumLeaves = 5,
        NumThreads = 1
    };

    // Filled in by the onFit callback once the trainer has been fitted.
    FastTreeRegressionModelParameters trainedParameters = null;

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, null, opts,
            onFit: (p) => { trainedParameters = p; })));

    var pipe = reader.Append(est);

    // The callback must not fire before Fit and must have fired after it.
    Assert.Null(trainedParameters);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(trainedParameters);

    var treeCollection = trainedParameters.TrainedTreeEnsemble;
    Assert.Equal(0, treeCollection.Bias);
    Assert.Equal(10, treeCollection.Trees.Count);
    Assert.Equal(10, treeCollection.TreeWeights.Count);

    var firstTree = treeCollection.Trees[0];
    Assert.Equal(4, firstTree.NumNodes);

    // Numerical split. There is no categorical split, so the following vector contains 0 elements.
    var categoricalSplitFeatures = firstTree.GetCategoricalSplitFeaturesAt(0);
    Assert.Equal(0, categoricalSplitFeatures.Count);

    // Numerical split. There is no categorical split, so the following vector contains 0 elements.
    var categoricalSplitFeatureRange = firstTree.GetCategoricalCategoricalSplitFeatureRangeAt(0);
    Assert.Equal(0, categoricalSplitFeatureRange.Count);

    var expectedGtChild = new int[] { 3, 2, -4, -5 };
    Assert.Equal(4, firstTree.GtChild.Count);
    Assert.Equal(expectedGtChild, firstTree.GtChild);

    var expectedLteChild = new int[] { 1, -1, -3, -2 };
    Assert.Equal(4, firstTree.LteChild.Count);
    Assert.Equal(expectedLteChild, firstTree.LteChild);

    var expectedCategoricalSplitFlags = new bool[] { false, false, false, false };
    Assert.Equal(4, firstTree.CategoricalSplitFlags.Count);
    Assert.Equal(expectedCategoricalSplitFlags, firstTree.CategoricalSplitFlags);

    var expectedNumericalSplitFeatureIndexes = new int[] { 0, 10, 2, 10 };
    Assert.Equal(4, firstTree.NumericalSplitFeatureIndexes.Count);
    Assert.Equal(expectedNumericalSplitFeatureIndexes, firstTree.NumericalSplitFeatureIndexes);

    // Thresholds are compared to 6 decimal places.
    var expectedNumericalSplitThresholds = new float[] { 0.14f, -0.645f, -0.095f, 0.31f };
    Assert.Equal(4, firstTree.NumericalSplitThresholds.Count);
    for (int node = 0; node < firstTree.NumericalSplitThresholds.Count; ++node)
    {
        Assert.Equal(expectedNumericalSplitThresholds[node], firstTree.NumericalSplitThresholds[node], 6);
    }

    Assert.Equal(5, firstTree.NumLeaves);

    // Leaf values are compared to 6 decimal places.
    var expectedLeafValues = new double[]
    {
        40.159015006449692,
        80.434805844435061,
        57.072130551545513,
        82.898710076162757,
        104.17547955322266
    };
    Assert.Equal(5, firstTree.LeafValues.Count);
    for (int leaf = 0; leaf < firstTree.LeafValues.Count; ++leaf)
    {
        Assert.Equal(expectedLeafValues[leaf], firstTree.LeafValues[leaf], 6);
    }
}
// Trains a 3-tree FastTree regressor with categorical splits enabled on one-hot
// encoded text features and pins the structure of the first tree: children, split
// flags, split features, thresholds, categorical ranges and leaf values.
public void FastTreeRegressionRepresentationWithCategoricalSplit()
{
    var env = new MLContext(seed: 0);
    var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
    var dataSource = new MultiFileSource(dataPath);

    var catalog = new RegressionCatalog(env);
    // The feature columns are deliberately loaded as text so they can be one-hot encoded below.
    var reader = TextLoaderStatic.CreateLoader(env,
        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
        separator: ';', hasHeader: true);

    // Filled in by the onFit callback once the trainer has been fitted.
    FastTreeRegressionModelParameters pred = null;

    var opts = new FastTreeRegressionTrainer.Options()
    {
        CategoricalSplit = true,
        NumTrees = 3,
        NumLeaves = 5,
        NumThreads = 1,
        // This is the minimal samples to form a split (i.e., generating two extra nodes/leaves). For a small data set,
        // we should set a small value. Otherwise, the trained trees could be empty.
        MinDocumentsInLeafs = 2
    };

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, features: r.features.OneHotEncoding()))
        .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, null, opts,
            onFit: (p) => { pred = p; })));

    var pipe = reader.Append(est);

    // The callback must not fire before Fit and must have fired after it.
    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);

    var treeCollection = pred.TrainedTreeEnsemble;
    Assert.Equal(0, treeCollection.Bias);
    Assert.Equal(3, treeCollection.Trees.Count);
    Assert.Equal(3, treeCollection.TreeWeights.Count);

    var trees = treeCollection.Trees;
    Assert.Equal(4, trees[0].NumNodes);

    var expectedGtChild = new int[] { 3, -3, -4, -5 };
    Assert.Equal(4, trees[0].GtChild.Count);
    Assert.Equal(expectedGtChild, trees[0].GtChild);

    var expectedLteChild = new int[] { 1, 2, -1, -2 };
    Assert.Equal(4, trees[0].LteChild.Count);
    Assert.Equal(expectedLteChild, trees[0].LteChild);

    var expectedCategoricalSplitFlags = new bool[] { true, true, true, true };
    Assert.Equal(4, trees[0].CategoricalSplitFlags.Count);
    Assert.Equal(expectedCategoricalSplitFlags, trees[0].CategoricalSplitFlags);

    var expectedNumericalSplitFeatureIndexes = new int[] { 5312, 2, 2126, 533 };
    Assert.Equal(4, trees[0].NumericalSplitFeatureIndexes.Count);
    Assert.Equal(expectedNumericalSplitFeatureIndexes, trees[0].NumericalSplitFeatureIndexes);

    // Thresholds are compared to 6 decimal places.
    var expectedNumericalSplitThresholds = new float[] { 0.5f, 0.5f, 0.5f, 0.5f };
    Assert.Equal(4, trees[0].NumericalSplitThresholds.Count);
    for (int i = 0; i < trees[0].NumericalSplitThresholds.Count; ++i)
    {
        Assert.Equal(expectedNumericalSplitThresholds[i], trees[0].NumericalSplitThresholds[i], 6);
    }

    // Expected value goes first, matching xUnit's (expected, actual) order and the
    // other assertions in this test, so failure messages label the two sides correctly.
    var expectedCategoricalRanges = new[]
    {
        new int[] { 5312, 5782 },
        new int[] { 2, 417 },
        new int[] { 2126, 2593 },
        new int[] { 533, 983 }
    };
    for (int i = 0; i < expectedCategoricalRanges.Length; ++i)
    {
        var actualCategoricalRange = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(i);
        Assert.Equal(expectedCategoricalRanges[i], actualCategoricalRange);
    }

    // For each node: the number of categorical split features and the first and last one.
    int[] expectedCounts = { 62, 52, 54, 22 };
    int[] expectedStarts = { 5315, 10, 2141, 533 };
    int[] expectedEnds = { 5782, 401, 2558, 874 };
    for (int i = 0; i < trees[0].NumNodes; ++i)
    {
        // Retrieve i-th node's split features.
        var actualCategoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(i);
        Assert.Equal(expectedCounts[i], actualCategoricalSplitFeatures.Count);
        Assert.Equal(expectedStarts[i], actualCategoricalSplitFeatures[0]);
        Assert.Equal(expectedEnds[i], actualCategoricalSplitFeatures[expectedCounts[i] - 1]);
    }

    Assert.Equal(5, trees[0].NumLeaves);

    // Leaf values are compared to 6 decimal places.
    var expectedLeafValues = new double[]
    {
        48.456055413607892,
        86.584156799316418,
        87.017326642027,
        76.381184971185391,
        117.68872643673058
    };
    Assert.Equal(5, trees[0].LeafValues.Count);
    for (int i = 0; i < trees[0].LeafValues.Count; ++i)
    {
        Assert.Equal(expectedLeafValues[i], trees[0].LeafValues[i], 6);
    }
}
/// <summary>
/// Sample showing count-based and mutual-information-based feature selection with the static
/// pipeline API on the breast cancer dataset. Prints the first 10 rows of each selected column.
/// </summary>
public static void FeatureSelectionTransform()
{
    // Download a classification dataset from github.com/dotnet/machinelearning.
    // It will be stored in the same path as the executable.
    string datasetPath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset();

    // Data Preview
    //    1. Label                          0=benign, 1=malignant
    //    2. Clump Thickness                1 - 10
    //    3. Uniformity of Cell Size        1 - 10
    //    4. Uniformity of Cell Shape       1 - 10
    //    5. Marginal Adhesion              1 - 10
    //    6. Single Epithelial Cell Size    1 - 10
    //    7. Bare Nuclei                    1 - 10
    //    8. Bland Chromatin                1 - 10
    //    9. Normal Nucleoli                1 - 10
    //   10. Mitoses                        1 - 10

    // Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
    // as well as the source of randomness.
    var mlContext = new MLContext();

    // Define the loader: specify the data columns and where to find them in the text file.
    // All the feature columns are combined into a single vector column named "Features".
    var loader = TextLoaderStatic.CreateLoader(
        mlContext,
        ctx => (
            Label: ctx.LoadBool(0),
            Features: ctx.LoadFloat(1, 9)
        ),
        separator: '\t',
        hasHeader: true);

    // Use the loader to load the data as an IDataView.
    var data = loader.Load(datasetPath);

    // Define the transformations to apply to the data. An Estimator does not transform data directly:
    // it has to be trained on data using .Fit(), which outputs a Transformer that can transform data.
    //
    // The first step is a CountFeatureSelectingEstimator, which selects slots in a feature vector that have
    // more non-default values than the specified count. It can be used to remove slots with too many
    // missing values.
    //
    // The second step is a MutualInformationFeatureSelectingEstimator, which selects the top k slots in a
    // feature vector based on highest mutual information between that slot and a specified label.
    // It is also possible to specify the parameter `numBins`, which controls the number of bins used in the
    // approximation of the mutual information between features and label.
    var countSelectStep = loader.MakeNewEstimator()
        .Append(row => (
            FeaturesCountSelect: row.Features.SelectFeaturesBasedOnCount(count: 695),
            Label: row.Label
        ));

    var pipeline = countSelectStep
        .Append(row => (
            FeaturesCountSelect: row.FeaturesCountSelect,
            FeaturesMISelect: row.FeaturesCountSelect.SelectFeaturesBasedOnMutualInformation(row.Label, slotsInOutput: 5),
            Label: row.Label
        ));

    // Train the pipeline with .Fit(), then use the resulting transformer to transform the data.
    var transformedData = pipeline.Fit(data).Transform(data);

    // Print the data that results from the transformations.
    var dynamicView = transformedData.AsDynamic;
    var countSelectColumn = dynamicView.GetColumn<VBuffer<float>>(dynamicView.Schema["FeaturesCountSelect"]);
    var miSelectColumn = dynamicView.GetColumn<VBuffer<float>>(dynamicView.Schema["FeaturesMISelect"]);

    PrintColumn("FeaturesCountSelect", countSelectColumn);
    PrintColumn("FeaturesMISelect", miSelectColumn);

    // Below is the output of this code. We see that some slots have been dropped by the first transformation.
    // Among the remaining slots, the second transformation only preserves the top 5 slots based on mutual
    // information with the label column.

    // FeaturesCountSelect column obtained post-transformation.
    // 5 4 4 5 7 3 2 1
    // 3 1 1 1 2 3 1 1
    // 6 8 8 1 3 3 7 1
    // 4 1 1 3 2 3 1 1
    // 8 10 10 8 7 9 7 1
    // 1 1 1 1 2 3 1 1
    // 2 1 2 1 2 3 1 1
    // 2 1 1 1 2 1 1 5
    // 4 2 1 1 2 2 1 1
    // 1 1 1 1 1 3 1 1
    // ===================================================
    // FeaturesMISelect column obtained post-transformation.
    // 4 4 7 3 2
    // 1 1 2 3 1
    // 8 8 3 3 7
    // 1 1 2 3 1
    // 10 10 7 9 7
    // 1 1 2 3 1
    // 1 2 2 3 1
    // 1 1 2 1 1
    // 2 1 2 2 1
    // 1 1 1 3 1
    // ===================================================

    // Small helper that prints the values of a vector column to the console, one row per line.
    // Only the first 10 rows are printed.
    void PrintColumn(string columnName, IEnumerable<VBuffer<float>> rows)
    {
        const int maxRows = 10;

        Console.WriteLine($"{columnName} column obtained post-transformation.");

        int printed = 0;
        foreach (var row in rows)
        {
            foreach (var value in row.GetValues())
            {
                Console.Write($"{value}\t");
            }

            Console.WriteLine("");

            printed++;
            if (printed >= maxRows)
            {
                break;
            }
        }

        Console.WriteLine("===================================================");
    }
}