// This example requires installation of the additional nuget package
// <a href="https://www.nuget.org/packages/Microsoft.ML.FastTree/">Microsoft.ML.FastTree</a>.
//
// Trains a FastTree binary classifier on the "adult" dataset through the statically-typed
// pipeline API, evaluates it on a held-out 10% split and prints the resulting metrics.
public static void FastTreeBinaryClassification()
{
    // Download the classification dataset from github.com/dotnet/machinelearning.
    // It is stored in the same path as the executable.
    string adultDataPath = SamplesUtils.DatasetUtils.DownloadAdultDataset();

    // Column layout, as consumed by the loader below (index: name, type):
    //    0: age (numeric)
    //    1: workclass (text/categorical)
    //    2: fnlwgt (numeric)
    //    3: education (text/categorical)
    //    4: education-num (numeric)
    //    5: marital-status (text/categorical)
    //    6: occupation (text/categorical)
    //    7: relationship (text/categorical)
    //    8: ethnicity (text/categorical)
    //    9: sex (text/categorical)
    //   10: capital-gain (numeric)
    //   11: capital-loss (numeric)
    //   12: hours-per-week (numeric)
    //   13: native-country-region (text/categorical)
    //   14: [Label] IsOver50K (boolean)

    // The MLContext is the entry point needed to build the pipeline.
    var context = new MLContext();

    // Statically-typed loader; the tuple element names become the column names.
    var reader = TextLoaderStatic.CreateLoader(
        context,
        col => (
            Age: col.LoadFloat(0),
            Workclass: col.LoadText(1),
            Fnlwgt: col.LoadFloat(2),
            Education: col.LoadText(3),
            EducationNum: col.LoadFloat(4),
            MaritalStatus: col.LoadText(5),
            Occupation: col.LoadText(6),
            Relationship: col.LoadText(7),
            Ethnicity: col.LoadText(8),
            Sex: col.LoadText(9),
            CapitalGain: col.LoadFloat(10),
            CapitalLoss: col.LoadFloat(11),
            HoursPerWeek: col.LoadFloat(12),
            NativeCountry: col.LoadText(13),
            IsOver50K: col.LoadBool(14)),
        separator: ',',
        hasHeader: true);

    // Load the data and hold 10% of it back for testing.
    var allRows = reader.Load(adultDataPath);
    var (trainSet, testSet) = context.BinaryClassification.TrainTestSplit(allRows, testFraction: 0.1);

    // Stage 1: assemble the feature vector and pick the label.
    // Note that Workclass, Fnlwgt, Education, CapitalGain and CapitalLoss are loaded
    // but are not part of the feature vector.
    var featurization = reader.MakeNewEstimator()
        .Append(r => (
            Features: r.Age.ConcatWith(
                r.EducationNum,
                r.MaritalStatus.OneHotEncoding(),
                r.Occupation.OneHotEncoding(),
                r.Relationship.OneHotEncoding(),
                r.Ethnicity.OneHotEncoding(),
                r.Sex.OneHotEncoding(),
                r.HoursPerWeek,
                r.NativeCountry.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10)),
            Label: r.IsOver50K));

    // Stage 2: normalize the features and attach the FastTree trainer.
    var training = featurization
        .Append(r => (
            Features: r.Features.Normalize(),
            Label: r.Label,
            Score: context.BinaryClassification.Trainers.FastTree(
                r.Label,
                r.Features,
                numberOfTrees: 100,               // try: (int) 20-2000
                numberOfLeaves: 20,               // try: (int) 2-128
                minimumExampleCountPerLeaf: 10,   // try: (int) 1-100
                learningRate: 0.2)));             // try: (float) 0.025-0.4

    // Stage 3: keep only the label, the trainer output and the predicted label.
    var learningPipeline = training
        .Append(r => (
            Label: r.Label,
            Score: r.Score,
            PredictedLabel: r.Score.predictedLabel));

    // Fit the complete pipeline to the training split.
    var trainedModel = learningPipeline.Fit(trainSet);

    // Score the held-out split and evaluate the predictions.
    var scoredTestSet = trainedModel.Transform(testSet);
    var metrics = context.BinaryClassification.Evaluate(
        scoredTestSet,
        r => r.Label,
        r => r.Score);

    // Trailing numbers are the values reported for a sample run.
    Console.WriteLine($"Accuracy: {metrics.Accuracy}"); // 0.84
    Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve}"); // 0.89
    Console.WriteLine($"F1 Score: {metrics.F1Score}"); // 0.64
    Console.WriteLine($"Negative Precision: {metrics.NegativePrecision}"); // 0.88
    Console.WriteLine($"Negative Recall: {metrics.NegativeRecall}"); // 0.91
    Console.WriteLine($"Positive Precision: {metrics.PositivePrecision}"); // 0.68
    Console.WriteLine($"Positive Recall: {metrics.PositiveRecall}"); // 0.60
}
// Loads a two-row, space-separated text source through the statically-typed loader,
// verifies the resulting schema and row values, and then checks that re-mapping
// columns in an appended estimator (implicit copy-columns) yields the expected schema.
public void SimpleTextLoaderCopyColumnsTest()
{
    var env = new MLContext(0);

    const string data = "0 hello 3.14159 -0 2\n"
        + "1 1 2 4 15";
    var dataSource = new BytesStreamSource(data);

    var staticLoader = TextLoaderStatic.CreateLoader(
        env,
        ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1),
            // If fit correctly, this ought to be equivalent to max of 4, that is, length of 3.
            numericFeatures: ctx.LoadFloat(2, null)),
        dataSource,
        separator: ' ');

    // While there is a type-safe wrapper for `IDataView`, it is currently only useful as an
    // input to the `Fit` functions of the other statically typed wrappers. So, for now,
    // operate over the underlying dynamic `IDataView`.
    var loadedView = staticLoader.Load(dataSource).AsDynamic;

    // Fails the test when the given schema has no column with the given name.
    void AssertSchemaHasColumn(DataViewSchema dataSchema, string name)
    {
        Assert.True(dataSchema.GetColumnOrNull(name).HasValue, "Could not find column '" + name + "'");
    }

    var loadedSchema = loadedView.Schema;

    // First verify that the columns are there. There ought to be at least one column
    // corresponding to each identifier in the tuple.
    AssertSchemaHasColumn(loadedSchema, "label");
    AssertSchemaHasColumn(loadedSchema, "text");
    AssertSchemaHasColumn(loadedSchema, "numericFeatures");

    // Next verify they have the expected types.
    Assert.Equal(BooleanDataViewType.Instance, loadedSchema["label"].Type);
    Assert.Equal(TextDataViewType.Instance, loadedSchema["text"].Type);
    Assert.Equal(new VectorType(NumberDataViewType.Single, 3), loadedSchema["numericFeatures"].Type);

    // Next actually inspect the data, row by row.
    using (var rowCursor = loadedView.GetRowCursorForAllColumns())
    {
        var readText = rowCursor.GetGetter<ReadOnlyMemory<char>>(loadedSchema["text"]);
        var readNumeric = rowCursor.GetGetter<VBuffer<float>>(loadedSchema["numericFeatures"]);
        var readLabel = rowCursor.GetGetter<bool>(loadedSchema["label"]);

        // Buffers the getters write the current row's values into.
        bool currentLabel = default;
        ReadOnlyMemory<char> currentText = default;
        VBuffer<float> currentNumeric = default;

        // Reads the cursor's current row and compares it against the expected values.
        void AssertCurrentRow(bool expectedLabel, string expectedText, float expected0, float expected1, float expected2)
        {
            readLabel(ref currentLabel);
            readText(ref currentText);
            readNumeric(ref currentNumeric);

            Assert.True(expectedText.AsSpan().SequenceEqual(currentText.Span));
            Assert.Equal(expectedLabel, currentLabel);
            Assert.Equal(3, currentNumeric.Length);
            Assert.Equal(expected0, currentNumeric.GetItemOrDefault(0));
            Assert.Equal(expected1, currentNumeric.GetItemOrDefault(1));
            Assert.Equal(expected2, currentNumeric.GetItemOrDefault(2));
        }

        Assert.True(rowCursor.MoveNext(), "Could not move even to first row");
        AssertCurrentRow(false, "hello", 3.14159f, -0f, 2f);

        Assert.True(rowCursor.MoveNext(), "Could not move to second row");
        AssertCurrentRow(true, "1", 2f, 4f, 15f);

        Assert.False(rowCursor.MoveNext(), "Moved to third row, but there should have been only two");
    }

    // The next step, where the names are shuffled around a little bit, tests the
    // implicit usage of copy columns: `text` takes the old `label` column and
    // `label` takes the old `numericFeatures` column.
    var renamingEstimator = staticLoader.MakeNewEstimator()
        .Append(r => (text: r.label, label: r.numericFeatures));
    var renamingLoader = staticLoader.Append(renamingEstimator);
    var renamedView = renamingLoader.Fit(dataSource).Load(dataSource);
    var renamedSchema = renamedView.AsDynamic.Schema;

    // First verify that the columns are there.
    AssertSchemaHasColumn(renamedSchema, "label");
    AssertSchemaHasColumn(renamedSchema, "text");

    // Next verify they have the expected (swapped) types.
    Assert.Equal(BooleanDataViewType.Instance, renamedSchema["text"].Type);
    Assert.Equal(new VectorType(NumberDataViewType.Single, 3), renamedSchema["label"].Type);
}