Example #1
0
        // This example requires installing an additional NuGet package: <a href="https://www.nuget.org/packages/Microsoft.ML.FastTree/">Microsoft.ML.FastTree</a>.
        /// <summary>
        /// Trains a FastTree binary classifier on the adult dataset using the statically
        /// typed pipeline API, evaluates it on a held-out 10% split, and prints the
        /// resulting classification metrics to the console.
        /// </summary>
        public static void FastTreeBinaryClassification()
        {
            // Fetch the classification dataset from github.com/dotnet/machinelearning.
            // Per the sample's original note, the file ends up next to the executable.
            string adultDataPath = SamplesUtils.DatasetUtils.DownloadAdultDataset();

            // Columns available in the dataset (grouped by kind; the positional
            // indices actually used for loading are given in the loader below):
            //   label:             IsOver50K (boolean)
            //   text/categorical:  workclass, education, marital-status, occupation,
            //                      relationship, ethnicity, sex, native-country-region
            //   numeric:           age, fnlwgt, education-num, capital-gain,
            //                      capital-loss, hours-per-week

            // The MLContext is shared by the loader, the trainer catalog and the evaluator.
            var context = new MLContext();

            // Describe the file layout: one named, typed column per positional index.
            // The file is comma separated and starts with a header row.
            var reader = TextLoaderStatic.CreateLoader(
                context,
                col => (
                    Age: col.LoadFloat(0),
                    Workclass: col.LoadText(1),
                    Fnlwgt: col.LoadFloat(2),
                    Education: col.LoadText(3),
                    EducationNum: col.LoadFloat(4),
                    MaritalStatus: col.LoadText(5),
                    Occupation: col.LoadText(6),
                    Relationship: col.LoadText(7),
                    Ethnicity: col.LoadText(8),
                    Sex: col.LoadText(9),
                    CapitalGain: col.LoadFloat(10),
                    CapitalLoss: col.LoadFloat(11),
                    HoursPerWeek: col.LoadFloat(12),
                    NativeCountry: col.LoadText(13),
                    IsOver50K: col.LoadBool(14)),
                separator: ',',
                hasHeader: true);

            // Load the data, then hold back 10% of it for testing.
            var fullData = reader.Load(adultDataPath);
            var (trainSet, testSet) = context.BinaryClassification.TrainTestSplit(fullData, testFraction: 0.1);

            // Stage 1: assemble a single Features vector. Categorical columns are
            // one-hot encoded; the native-country encoding is additionally filtered
            // by count. The boolean IsOver50K column becomes the Label.
            var featurization = reader.MakeNewEstimator()
                .Append(r => (
                    Features: r.Age.ConcatWith(
                        r.EducationNum,
                        r.MaritalStatus.OneHotEncoding(),
                        r.Occupation.OneHotEncoding(),
                        r.Relationship.OneHotEncoding(),
                        r.Ethnicity.OneHotEncoding(),
                        r.Sex.OneHotEncoding(),
                        r.HoursPerWeek,
                        r.NativeCountry.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10)),
                    Label: r.IsOver50K));

            // Stage 2: normalize the features and attach the FastTree trainer.
            var training = featurization
                .Append(r => (
                    Features: r.Features.Normalize(),
                    Label: r.Label,
                    Score: context.BinaryClassification.Trainers.FastTree(
                        r.Label,
                        r.Features,
                        numberOfTrees: 100,             // try: (int) 20-2000
                        numberOfLeaves: 20,             // try: (int) 2-128
                        minimumExampleCountPerLeaf: 10, // try: (int) 1-100
                        learningRate: 0.2)));           // try: (float) 0.025-0.4

            // Stage 3: keep only the label, the trainer output and the predicted label.
            var pipeline = training
                .Append(r => (
                    Label: r.Label,
                    Score: r.Score,
                    PredictedLabel: r.Score.predictedLabel));

            // Fit the whole pipeline on the training split.
            var trainedModel = pipeline.Fit(trainSet);

            // Score the held-out split and compute binary classification metrics.
            var scoredTestSet = trainedModel.Transform(testSet);
            var metrics = context.BinaryClassification.Evaluate(scoredTestSet, r => r.Label, r => r.Score);

            // Overall quality. Trailing comments are the values noted in the sample.
            Console.WriteLine($"Accuracy: {metrics.Accuracy}");                    // 0.84
            Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve}");                // 0.89
            Console.WriteLine($"F1 Score: {metrics.F1Score}");                     // 0.64

            // Per-class precision and recall.
            Console.WriteLine($"Negative Precision: {metrics.NegativePrecision}"); // 0.88
            Console.WriteLine($"Negative Recall: {metrics.NegativeRecall}");       // 0.91
            Console.WriteLine($"Positive Precision: {metrics.PositivePrecision}"); // 0.68
            Console.WriteLine($"Positive Recall: {metrics.PositiveRecall}");       // 0.60
        }
Example #2
0
        /// <summary>
        /// Loads two space-separated rows through the statically typed text loader,
        /// verifies the resulting schema and row values, then appends an estimator
        /// that re-maps column names (implicit copy-columns) and verifies the new schema.
        /// </summary>
        public void SimpleTextLoaderCopyColumnsTest()
        {
            // Two rows, space separated: a bool, a text token, then three floats.
            const string rawRows = "0 hello 3.14159 -0 2\n"
                                   + "1 1 2 4 15";

            var mlContext = new MLContext(0);
            var source = new BytesStreamSource(rawRows);

            var loader = TextLoaderStatic.CreateLoader(
                mlContext,
                c => (
                    label: c.LoadBool(0),
                    text: c.LoadText(1),
                    // No upper column index is given. Once inferred from the data this
                    // ought to be equivalent to a max of 4, i.e. a vector of length 3.
                    numericFeatures: c.LoadFloat(2, null)),
                source,
                separator: ' ');

            // The statically typed data view wrapper is currently only useful as an
            // input to the Fit functions of other statically typed wrappers, so the
            // checks below operate on the underlying dynamic IDataView instead.
            var loadedView = loader.Load(source).AsDynamic;
            var loadedSchema = loadedView.Schema;

            // Fails the test when the named column is absent from the given schema.
            void AssertColumnExists(DataViewSchema dataSchema, string columnName)
            {
                Assert.True(dataSchema.GetColumnOrNull(columnName).HasValue, "Could not find column '" + columnName + "'");
            }

            // Every identifier in the loader tuple should have produced a column...
            AssertColumnExists(loadedSchema, "label");
            AssertColumnExists(loadedSchema, "text");
            AssertColumnExists(loadedSchema, "numericFeatures");

            // ...and each column should carry the expected type.
            Assert.Equal(BooleanDataViewType.Instance, loadedSchema["label"].Type);
            Assert.Equal(TextDataViewType.Instance, loadedSchema["text"].Type);
            Assert.Equal(new VectorType(NumberDataViewType.Single, 3), loadedSchema["numericFeatures"].Type);

            // Walk the rows and compare the actual values.
            using (var rowCursor = loadedView.GetRowCursorForAllColumns())
            {
                var readText = rowCursor.GetGetter<ReadOnlyMemory<char>>(loadedSchema["text"]);
                var readNumeric = rowCursor.GetGetter<VBuffer<float>>(loadedSchema["numericFeatures"]);
                var readLabel = rowCursor.GetGetter<bool>(loadedSchema["label"]);

                // Buffers the getters write into for the cursor's current row.
                bool actualLabel = default;
                ReadOnlyMemory<char> actualText = default;
                VBuffer<float> actualNumeric = default;

                // Reads the cursor's current row and compares it with the expected values.
                void AssertCurrentRow(bool expectedLabel, string expectedText, float first, float second, float third)
                {
                    readLabel(ref actualLabel);
                    readText(ref actualText);
                    readNumeric(ref actualNumeric);

                    Assert.True(expectedText.AsSpan().SequenceEqual(actualText.Span));
                    Assert.Equal(expectedLabel, actualLabel);
                    Assert.Equal(3, actualNumeric.Length);

                    float[] expectedNumeric = { first, second, third };
                    for (int slot = 0; slot < expectedNumeric.Length; slot++)
                    {
                        Assert.Equal(expectedNumeric[slot], actualNumeric.GetItemOrDefault(slot));
                    }
                }

                Assert.True(rowCursor.MoveNext(), "Could not move even to first row");
                AssertCurrentRow(false, "hello", 3.14159f, -0f, 2f);

                Assert.True(rowCursor.MoveNext(), "Could not move to second row");
                AssertCurrentRow(true, "1", 2f, 4f, 15f);

                Assert.False(rowCursor.MoveNext(), "Moved to third row, but there should have been only two");
            }

            // Now shuffle the names around: "text" takes the old label column and
            // "label" takes the numeric vector. This exercises the implicit use of
            // copy columns.
            var renamingEstimator = loader.MakeNewEstimator().Append(r => (text: r.label, label: r.numericFeatures));
            var renamingLoader = loader.Append(renamingEstimator);
            var renamedView = renamingLoader.Fit(source).Load(source);
            var renamedSchema = renamedView.AsDynamic.Schema;

            // Both re-mapped names must exist...
            AssertColumnExists(renamedSchema, "label");
            AssertColumnExists(renamedSchema, "text");

            // ...and must now carry the types of the columns they were mapped from.
            Assert.Equal(BooleanDataViewType.Instance, renamedSchema["text"].Type);
            Assert.Equal(new VectorType(NumberDataViewType.Single, 3), renamedSchema["label"].Type);
        }