Example #1
        public void PcaWorkout()
        {
            var data = TextLoaderStatic.CreateReader(_env,
                                                     c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadFloat(1, 10)),
                                                     separator: ';', hasHeader: true)
                       .Read(_dataSource);

            var invalidData = TextLoaderStatic.CreateReader(_env,
                                                            c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadText(1, 10)),
                                                            separator: ';', hasHeader: true)
                              .Read(_dataSource);

            var est = new PrincipalComponentAnalysisEstimator(_env, "pca", "features", rank: 4, seed: 10);

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var estNonDefaultArgs = new PrincipalComponentAnalysisEstimator(_env, "pca", "features", rank: 3, weightColumn: "weight", overSampling: 2, center: false);

            TestEstimatorCore(estNonDefaultArgs, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            Done();
        }
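
For comparison, the released dynamic API exposes the same transform as ProjectToPrincipalComponents. A minimal sketch, assuming an MLContext named mlContext and an IDataView named dataView with a "features" vector column (names here are illustrative, not from the test above):

        // Sketch only: dynamic-API counterpart of the PCA estimator above.
        // Assumes `mlContext` (MLContext) and `dataView` (IDataView with a "features" column).
        var pcaEstimator = mlContext.Transforms.ProjectToPrincipalComponents(
            outputColumnName: "pca", inputColumnName: "features", rank: 4, seed: 10);
        var pcaData = pcaEstimator.Fit(dataView).Transform(dataView);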
Example #2
        public void TestPcaEstimator()
        {
            var data = TextLoaderStatic.CreateReader(_env,
                                                     c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                     separator: ';', hasHeader: true)
                       .Read(_dataSource);

            var est        = new PrincipalComponentAnalysisEstimator(_env, "features", "pca", rank: 5, seed: 1);
            var outputPath = GetOutputPath("PCA", "pca.tsv");

            using (var ch = _env.Start("save"))
            {
                IDataView savedData = TakeFilter.Create(_env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(_env, savedData, new[] { "pca" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, _saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
            Done();
        }
        public void OnnxStatic()
        {
            var env         = new MLContext(null, 1);
            var imageHeight = 224;
            var imageWidth  = 224;
            var dataFile    = GetDataPath("images/images.tsv");
            var imageFolder = Path.GetDirectoryName(dataFile);

            var data = TextLoaderStatic.CreateReader(env, ctx => (
                                                         imagePath: ctx.LoadText(0),
                                                         name: ctx.LoadText(1)))
                       .Read(dataFile);

            var pipe = data.MakeNewEstimator()
                       .Append(row => (
                                   row.name,
                                   data_0: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleave: true)))
                       .Append(row => (row.name, output_1: row.data_0.DnnImageFeaturizer(m => m.ModelSelector.ResNet18(m.Environment, m.OutputColumn, m.InputColumn))));

            TestEstimatorCore(pipe.AsDynamic, data.AsDynamic);

            var result = pipe.Fit(data).Transform(data).AsDynamic;

            // Read back the featurized column and check its dimensionality.
            using (var cursor = result.GetRowCursor(result.Schema["output_1"]))
            {
                var buffer  = default(VBuffer<float>);
                var getter  = cursor.GetGetter<VBuffer<float>>(result.Schema["output_1"]);
                var numRows = 0;
                while (cursor.MoveNext())
                {
                    getter(ref buffer);
                    Assert.Equal(512, buffer.Length);
                    numRows += 1;
                }
                Assert.Equal(4, numRows);
            }
        }
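
The same image pipeline can be outlined with the dynamic API. This is a hedged sketch rather than a drop-in replacement: it presumes the Microsoft.ML.ImageAnalytics and DnnImageFeaturizer packages, and reuses the MLContext (`env`) and `imageFolder` from the method above:

        // Sketch only: dynamic-API outline of the image featurization above.
        // Assumes `env` (MLContext) and `imageFolder` as defined in OnnxStatic.
        var imagePipeline = env.Transforms.LoadImages("image", imageFolder, "imagePath")
            .Append(env.Transforms.ResizeImages("image", imageWidth: 224, imageHeight: 224))
            .Append(env.Transforms.ExtractPixels("data_0", "image", interleavePixelColors: true))
            .Append(env.Transforms.DnnFeaturizeImage("output_1",
                m => m.ModelSelector.ResNet18(m.Environment, m.OutputColumn, m.InputColumn), "data_0"));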
Example #4
        public void TextFeaturizerWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Load(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Load(sentimentDataPath)
                              .AsDynamic;

            var feat = data.MakeNewEstimator()
                       .Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options {
                OutputTokensColumnName = "OutputTokens",
            }));

            TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "featurized.tsv");

            using (var ch = ((IHostEnvironment)ML).Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true
                });
                var savedData = ML.Data.TakeRows(feat.Fit(data).Transform(data).AsDynamic, 4);
                savedData = ML.Transforms.SelectColumns("Data", "OutputTokens").Fit(savedData).Transform(savedData);

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "featurized.tsv");
            Done();
        }
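
In the dynamic API the same featurization is a single call on the text catalog. A sketch, assuming `mlContext` is an MLContext and the loaded data has a "text" column:

        // Sketch only: FeaturizeText via the dynamic API, with the same options object.
        var textEstimator = mlContext.Transforms.Text.FeaturizeText(
            "Features",
            new TextFeaturizingEstimator.Options { OutputTokensColumnName = "OutputTokens" },
            "text");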
Example #5
        public void CrossValidate()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.iris.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var ctx    = new MulticlassClassificationContext(env);
            var reader = TextLoaderStatic.CreateReader(env,
                                                       c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

            var est = reader.MakeNewEstimator()
                      .Append(r => (label: r.label.ToKey(), r.features))
                      .Append(r => (r.label, preds: ctx.Trainers.Sdca(
                                        r.label,
                                        r.features,
                                        maxIterations: 2)));

            var results = ctx.CrossValidate(reader.Read(dataSource), est, r => r.label)
                          .Select(x => x.metrics).ToArray();

            Assert.Equal(5, results.Length);
            Assert.True(results.All(x => x.LogLoss > 0));
        }
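
The dynamic API offers the same five-fold cross-validation directly on the task catalog. A sketch under the assumption that `mlContext` is an MLContext, `data` is the loaded IDataView, and `pipeline` is an IEstimator<ITransformer> producing predictions from "Label"/"Features" columns:

        // Sketch only: cross-validation via the dynamic API (needs System.Linq for Average).
        var cvResults = mlContext.MulticlassClassification.CrossValidate(
            data, pipeline, numberOfFolds: 5, labelColumnName: "Label");
        var averageLogLoss = cvResults.Average(r => r.Metrics.LogLoss);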
Example #6
        public void SdcaWorkout()
        {
            var dataPath = GetDataPath("breast-cancer.txt");

            var data = TextLoaderStatic.CreateReader(Env, ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10)))
                       .Read(dataPath).Cache();

            var binaryTrainer = ML.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
                new SdcaBinaryTrainer.Options {
                ConvergenceTolerance = 1e-2f
            });

            TestEstimatorCore(binaryTrainer, data.AsDynamic);

            var nonCalibratedBinaryTrainer = ML.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(
                new SdcaNonCalibratedBinaryTrainer.Options {
                ConvergenceTolerance = 1e-2f
            });

            TestEstimatorCore(nonCalibratedBinaryTrainer, data.AsDynamic);

            var regressionTrainer = ML.Regression.Trainers.StochasticDualCoordinateAscent(
                new SdcaRegressionTrainer.Options {
                ConvergenceTolerance = 1e-2f
            });

            TestEstimatorCore(regressionTrainer, data.AsDynamic);

            var mcTrainer = ML.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(
                new SdcaMultiClassTrainer.Options {
                ConvergenceTolerance = 1e-2f
            });

            TestEstimatorCore(mcTrainer, data.AsDynamic);

            Done();
        }
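
In the released 1.x API these SDCA entry points were renamed. A hedged sketch of the binary and regression counterparts with the same convergence tolerance, assuming `mlContext` is an MLContext:

        // Sketch only: renamed SDCA trainers in the 1.x dynamic API.
        var sdcaBinary = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
            new SdcaLogisticRegressionBinaryTrainer.Options { ConvergenceTolerance = 1e-2f });
        var sdcaRegression = mlContext.Regression.Trainers.Sdca(
            new SdcaRegressionTrainer.Options { ConvergenceTolerance = 1e-2f });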
Example #7
        public void KeyToValuePigsty()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var    reader   = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                ScalarString: ctx.LoadText(1),
                                                                VectorString: ctx.LoadText(1, 4)
                                                                ));

            var data = reader.Read(dataPath);

            // Non-pigsty Term.
            var dynamicData = new ValueToKeyMappingEstimator(Env, new[] {
                new ValueToKeyMappingEstimator.ColumnInfo("A", "ScalarString"),
                new ValueToKeyMappingEstimator.ColumnInfo("B", "VectorString")
            })
                              .Fit(data.AsDynamic).Transform(data.AsDynamic);

            var data2 = dynamicData.AssertStatic(Env, ctx => (
                                                     A: ctx.KeyU4.TextValues.Scalar,
                                                     B: ctx.KeyU4.TextValues.Vector));

            var est = data2.MakeNewEstimator()
                      .Append(row => (
                                  ScalarString: row.A.ToValue(),
                                  VectorString: row.B.ToValue()));

            TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic);

            var data2Transformed = est.Fit(data2).Transform(data2).AsDynamic;
            // Check that term and ToValue are round-trippable.
            var dataLeft  = ML.Transforms.SelectColumns(new[] { "ScalarString", "VectorString" }).Fit(data.AsDynamic).Transform(data.AsDynamic);
            var dataRight = ML.Transforms.SelectColumns(new[] { "ScalarString", "VectorString" }).Fit(data2Transformed).Transform(data2Transformed);

            CheckSameSchemas(dataLeft.Schema, dataRight.Schema);
            CheckSameValues(dataLeft, dataRight);
            Done();
        }
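
The same value-to-key round trip can be sketched with the dynamic conversion catalog, assuming an MLContext named mlContext and the two text columns loaded above (output names are illustrative):

        // Sketch only: key/value round trip via the dynamic API.
        var keyRoundTrip = mlContext.Transforms.Conversion.MapValueToKey("A", "ScalarString")
            .Append(mlContext.Transforms.Conversion.MapValueToKey("B", "VectorString"))
            .Append(mlContext.Transforms.Conversion.MapKeyToValue("ScalarString2", "A"))
            .Append(mlContext.Transforms.Conversion.MapKeyToValue("VectorString2", "B"));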
        public void WordBagWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(Env, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordBagEstimator(Env, "text", "bag_of_words").
                      Append(new WordHashBagEstimator(Env, "text", "bag_of_wordshash", invertHash: -1));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "bag_of_words.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "bag_of_words", "bag_of_wordshash" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "bag_of_words.tsv");
            Done();
        }
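
For reference, the dynamic text catalog exposes the same two featurizers. A sketch, assuming `mlContext` is an MLContext and the data has a "text" column:

        // Sketch only: plain and hashed word bags via the dynamic API.
        var bagPipeline = mlContext.Transforms.Text.ProduceWordBags("bag_of_words", "text")
            .Append(mlContext.Transforms.Text.ProduceHashedWordBags("bag_of_wordshash", "text"));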
        public void CategoricalStatic()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var    reader   = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                                ScalarString: ctx.LoadText(1),
                                                                VectorString: ctx.LoadText(1, 4)));
            var data            = reader.Load(dataPath);
            var wrongCollection = new[]
            {
                new TestClass() { A = 1, B = 2, C = 3 },
                new TestClass() { A = 4, B = 5, C = 6 }
            };

            var invalidData = ML.Data.LoadFromEnumerable(wrongCollection);
            var est         = data.MakeNewEstimator().
                              Append(row => (
                                         A: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Ind),
                                         B: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Ind),
                                         C: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bag),
                                         D: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Bin),
                                         E: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bin)
                                         ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Categorical", "featurized.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4);
            var view       = ML.Transforms.SelectColumns("A", "B", "C", "D", "E").Fit(savedData).Transform(savedData);

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(view, fs, headerRow: true, keepHidden: true);

            CheckEquality("Categorical", "featurized.tsv");
            Done();
        }
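
The dynamic categorical catalog covers the same output kinds (Indicator, Bag, Binary). A hedged sketch over the same two string columns, assuming `mlContext` is an MLContext:

        // Sketch only: one-hot encodings via the dynamic API.
        var oneHot = mlContext.Transforms.Categorical.OneHotEncoding(
                "A", "ScalarString", OneHotEncodingEstimator.OutputKind.Indicator)
            .Append(mlContext.Transforms.Categorical.OneHotEncoding(
                "C", "VectorString", OneHotEncodingEstimator.OutputKind.Bag))
            .Append(mlContext.Transforms.Categorical.OneHotEncoding(
                "D", "ScalarString", OneHotEncodingEstimator.OutputKind.Binary));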
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(Env, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(Env, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
                      .Append(new NgramExtractingEstimator(Env, "terms", "ngrams"))
                      .Append(new NgramHashingEstimator(Env, "terms", "ngramshash"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "terms", "ngrams", "ngramshash" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
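
A sketch of the equivalent n-gram chain in the dynamic API, under the same assumptions (an MLContext named mlContext and a "text" column; intermediate names are illustrative):

        // Sketch only: tokenize, map to keys, then extract plain and hashed n-grams.
        var ngramPipeline = mlContext.Transforms.Text.TokenizeIntoWords("tokens", "text")
            .Append(mlContext.Transforms.Conversion.MapValueToKey("terms", "tokens"))
            .Append(mlContext.Transforms.Text.ProduceNgrams("ngrams", "terms"))
            .Append(mlContext.Transforms.Text.ProduceHashedNgrams("ngramshash", "terms"));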
        public void TextFeaturizerWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(Env, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath)
                              .AsDynamic;

            var feat = data.MakeNewEstimator()
                       .Append(row => row.text.FeaturizeText(advancedSettings: s => { s.OutputTokens = true; }));

            TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, feat.Fit(data).Transform(data).AsDynamic, 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "Data", "Data_TransformedText" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "featurized.tsv");
            Done();
        }
        public void TestWordEmbeddings()
        {
            var dataPath     = GetDataPath(TestDatasets.Sentiment.trainFilename);
            var testDataPath = GetDataPath(TestDatasets.Sentiment.testFilename);

            var data = TextLoaderStatic.CreateReader(Env, ctx => (
                                                         label: ctx.LoadBool(0),
                                                         SentimentText: ctx.LoadText(1)), hasHeader: true)
                       .Read(dataPath);

            var dynamicData = TextFeaturizingEstimator.Create(Env, new TextFeaturizingEstimator.Arguments()
            {
                Column = new TextFeaturizingEstimator.Column
                {
                    Name   = "SentimentText_Features",
                    Source = new[] { "SentimentText" }
                },
                OutputTokens                 = true,
                KeepPunctuations             = false,
                UsePredefinedStopWordRemover = true,
                VectorNormalizer             = TextFeaturizingEstimator.TextNormKind.None,
                CharFeatureExtractor         = null,
                WordFeatureExtractor         = null,
            }, data.AsDynamic);

            var data2 = dynamicData.AssertStatic(Env, ctx => (
                                                     SentimentText_Features_TransformedText: ctx.Text.VarVector,
                                                     SentimentText: ctx.Text.Scalar,
                                                     label: ctx.Bool.Scalar));

            var est = data2.MakeNewEstimator()
                      .Append(row => row.SentimentText_Features_TransformedText.WordEmbeddings());

            TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic);
            Done();
        }
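
The dynamic API exposes pretrained embeddings through ApplyWordEmbedding. A sketch, assuming `mlContext` is an MLContext and "Tokens" is a variable-size text vector column (for instance, the transformed-text output produced above):

        // Sketch only: pretrained word embeddings via the dynamic API.
        var embedding = mlContext.Transforms.Text.ApplyWordEmbedding(
            "Features", "Tokens",
            WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding);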
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(Env, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);
            var est = ML.Transforms.Text.NormalizeText("text")
                      .Append(ML.Transforms.Text.TokenizeWords("text", "words"))
                      .Append(ML.Transforms.Text.RemoveDefaultStopWords("words", "NoDefaultStopwords"))
                      .Append(ML.Transforms.Text.RemoveStopWords("words", "NoStopWords", "xbox", "this", "is", "a", "the", "THAT", "bY"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "NoDefaultStopwords", "NoStopWords" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }
Example #14
        public void AveragePerceptronCalibration()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.breastCancer.trainFilename);
            var dataSource = new MultiFileSource(dataPath);
            var ctx        = new BinaryClassificationContext(env);

            var reader = TextLoaderStatic.CreateReader(env,
                                                       c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            LinearBinaryModelParameters pred = null;

            var loss = new HingeLoss(1);

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, preds: ctx.Trainers.AveragedPerceptron(r.label, r.features, lossFunction: loss,
                                                                                    numIterations: 2, onFit: p => pred = p)));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);
            // 9 input features, so we ought to have 9 weights.
            Assert.Equal(9, pred.Weights.Count);

            var data = model.Read(dataSource);

            var metrics = ctx.Evaluate(data, r => r.label, r => r.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
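
A hedged sketch of the same trainer through the dynamic API, assuming `mlContext` is an MLContext and the default "Label"/"Features" column names:

        // Sketch only: averaged perceptron via the dynamic API.
        var apTrainer = mlContext.BinaryClassification.Trainers.AveragedPerceptron(
            labelColumnName: "Label", featureColumnName: "Features",
            lossFunction: new HingeLoss(1), numberOfIterations: 2);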
Example #15
        public void RffStatic()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var    reader   = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                                VectorFloat: ctx.LoadFloat(1, 8),
                                                                Label: ctx.LoadFloat(0)
                                                                ));

            var data = reader.Load(dataPath);

            var est = data.MakeNewEstimator()
                      .Append(row => (
                                  RffVectorFloat: row.VectorFloat.LowerVectorSizeWithRandomFourierTransformation(3, true), row.Label));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic);

            var outputPath = GetOutputPath("Rff", "featurized.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data).Transform(data).AsDynamic, 4);

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);
            CheckEquality("Rff", "featurized.tsv");
            Done();
        }
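
In the released API, random Fourier features surface as the ApproximatedKernelMap transform. A sketch, assuming `mlContext` is an MLContext and that this transform is available in the version at hand:

        // Sketch only: random Fourier features via the dynamic API.
        var rff = mlContext.Transforms.ApproximatedKernelMap(
            "RffVectorFloat", "VectorFloat", rank: 3, useCosAndSinBases: true);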
        public void LpGcNormAndWhiteningWorkout()
        {
            string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var    data       = TextLoaderStatic.CreateReader(ML,
                                                              c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                              separator: ';', hasHeader: true)
                                .Read(dataSource);

            var invalidData = TextLoaderStatic.CreateReader(ML,
                                                            c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                            separator: ';', hasHeader: true)
                              .Read(dataSource);

            var est = ML.Transforms.Projection.LpNormalize("lpnorm", "features")
                      .Append(ML.Transforms.Projection.GlobalContrastNormalize("gcnorm", "features"))
                      .Append(new VectorWhiteningEstimator(ML, "whitened", "features"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ML.Transforms.SelectColumns("lpnorm", "gcnorm", "whitened").Fit(savedData).Transform(savedData);

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
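
The 1.x dynamic API renames these projections. A sketch, assuming `mlContext` is an MLContext and that VectorWhiten (from the Microsoft.ML.Mkl.Components package) is referenced:

        // Sketch only: Lp-norm, global-contrast, and whitening via the dynamic API.
        var projections = mlContext.Transforms.NormalizeLpNorm("lpnorm", "features")
            .Append(mlContext.Transforms.NormalizeGlobalContrast("gcnorm", "features"))
            .Append(mlContext.Transforms.VectorWhiten("whitened", "features"));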
        public void SimpleTextLoaderCopyColumnsTest()
        {
            var env = new MLContext(0);

            const string data = "0 hello 3.14159 -0 2\n"
                                + "1 1 2 4 15";
            var dataSource = new BytesStreamSource(data);

            var text = TextLoaderStatic.CreateLoader(env, ctx => (
                                                         label: ctx.LoadBool(0),
                                                         text: ctx.LoadText(1),
                                                         numericFeatures: ctx.LoadFloat(2, null)), // If fit correctly, this ought to infer a maximum column index of 4, that is, a vector of length 3.
                                                     dataSource, separator: ' ');

            // While we have a type-safe wrapper for `IDataView`, it is utterly useless except as an input to the `Fit` functions
            // of the other statically typed wrappers. We perhaps ought to make it useful in its own right, but perhaps not now.
            // For now, just operate over the actual `IDataView`.
            var textData = text.Load(dataSource).AsDynamic;

            Action<DataViewSchema, string> CheckSchemaHasColumn = (dataSchema, name) =>
            {
                Assert.True(dataSchema.GetColumnOrNull(name).HasValue, "Could not find column '" + name + "'");
            };

            var schema = textData.Schema;

            // First verify that the columns are there. There ought to be a column corresponding to each identifier in the tuple.
            CheckSchemaHasColumn(schema, "label");
            CheckSchemaHasColumn(schema, "text");
            CheckSchemaHasColumn(schema, "numericFeatures");
            // Next verify they have the expected types.
            Assert.Equal(BooleanDataViewType.Instance, schema["label"].Type);
            Assert.Equal(TextDataViewType.Instance, schema["text"].Type);
            Assert.Equal(new VectorType(NumberDataViewType.Single, 3), schema["numericFeatures"].Type);
            // Next actually inspect the data.
            using (var cursor = textData.GetRowCursorForAllColumns())
            {
                var textGetter            = cursor.GetGetter<ReadOnlyMemory<char>>(schema["text"]);
                var numericFeaturesGetter = cursor.GetGetter<VBuffer<float>>(schema["numericFeatures"]);
                ReadOnlyMemory<char> textVal = default;
                var labelGetter           = cursor.GetGetter<bool>(schema["label"]);
                bool labelVal             = default;
                VBuffer<float> numVal     = default;

                void CheckValuesSame(bool bl, string tx, float v0, float v1, float v2)
                {
                    labelGetter(ref labelVal);
                    textGetter(ref textVal);
                    numericFeaturesGetter(ref numVal);
                    Assert.True(tx.AsSpan().SequenceEqual(textVal.Span));
                    Assert.Equal((bool)bl, labelVal);
                    Assert.Equal(3, numVal.Length);
                    Assert.Equal(v0, numVal.GetItemOrDefault(0));
                    Assert.Equal(v1, numVal.GetItemOrDefault(1));
                    Assert.Equal(v2, numVal.GetItemOrDefault(2));
                }

                Assert.True(cursor.MoveNext(), "Could not move even to first row");
                CheckValuesSame(false, "hello", 3.14159f, -0f, 2f);
                Assert.True(cursor.MoveNext(), "Could not move to second row");
                CheckValuesSame(true, "1", 2f, 4f, 15f);
                Assert.False(cursor.MoveNext(), "Moved to third row, but there should have been only two");
            }

            // The next step, where we shuffle the names around a little bit, tests the
            // implicit use of copy columns.

            var est         = text.MakeNewEstimator().Append(r => (text: r.label, label: r.numericFeatures));
            var newText     = text.Append(est);
            var newTextData = newText.Fit(dataSource).Load(dataSource);

            schema = newTextData.AsDynamic.Schema;
            // First verify that the columns are there. There ought to be a column corresponding to each identifier in the tuple.
            CheckSchemaHasColumn(schema, "label");
            CheckSchemaHasColumn(schema, "text");
            // Next verify they have the expected types.
            Assert.Equal(BooleanDataViewType.Instance, schema["text"].Type);
            Assert.Equal(new VectorType(NumberDataViewType.Single, 3), schema["label"].Type);
        }
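
The same loader can be written against the dynamic API with explicit column definitions; the open-ended range below mirrors LoadFloat(2, null) above. A sketch, assuming `mlContext` is an MLContext:

        // Sketch only: equivalent dynamic-API text loader.
        var textLoader = mlContext.Data.CreateTextLoader(new[]
        {
            new TextLoader.Column("label", DataKind.Boolean, 0),
            new TextLoader.Column("text", DataKind.String, 1),
            new TextLoader.Column("numericFeatures", DataKind.Single,
                new[] { new TextLoader.Range(2, null) }),
        }, separatorChar: ' ');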
        public static void LightGbmBinaryClassification()
        {
            // Downloading a classification dataset from github.com/dotnet/machinelearning.
            // It will be stored in the same path as the executable
            string dataFilePath = SamplesUtils.DatasetUtils.DownloadAdultDataset();

            // Data Preview
            // 1. Column [Label]: IsOver50K (boolean)
            // 2. Column: workclass (text/categorical)
            // 3. Column: education (text/categorical)
            // 4. Column: marital-status (text/categorical)
            // 5. Column: occupation (text/categorical)
            // 6. Column: relationship (text/categorical)
            // 7. Column: ethnicity (text/categorical)
            // 8. Column: sex (text/categorical)
            // 9. Column: native-country-region (text/categorical)
            // 10. Column: age (numeric)
            // 11. Column: fnlwgt (numeric)
            // 12. Column: education-num (numeric)
            // 13. Column: capital-gain (numeric)
            // 14. Column: capital-loss (numeric)
            // 15. Column: hours-per-week (numeric)

            // Creating the ML.NET MLContext object, needed for the pipeline
            var mlContext = new MLContext();

            // Creating Data Loader with the initial schema based on the format of the data
            var loader = TextLoaderStatic.CreateLoader(
                mlContext,
                c => (
                    Age: c.LoadFloat(0),
                    Workclass: c.LoadText(1),
                    Fnlwgt: c.LoadFloat(2),
                    Education: c.LoadText(3),
                    EducationNum: c.LoadFloat(4),
                    MaritalStatus: c.LoadText(5),
                    Occupation: c.LoadText(6),
                    Relationship: c.LoadText(7),
                    Ethnicity: c.LoadText(8),
                    Sex: c.LoadText(9),
                    CapitalGain: c.LoadFloat(10),
                    CapitalLoss: c.LoadFloat(11),
                    HoursPerWeek: c.LoadFloat(12),
                    NativeCountry: c.LoadText(13),
                    IsOver50K: c.LoadBool(14)),
                separator: ',',
                hasHeader: true);

            // Load the data, and leave 10% out, so we can use them for testing
            var data = loader.Load(dataFilePath);

            var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);

            // Create the Estimator
            var learningPipeline = loader.MakeNewEstimator()
                                   .Append(row => (
                                               Features: row.Age.ConcatWith(
                                                   row.EducationNum,
                                                   row.MaritalStatus.OneHotEncoding(),
                                                   row.Occupation.OneHotEncoding(),
                                                   row.Relationship.OneHotEncoding(),
                                                   row.Ethnicity.OneHotEncoding(),
                                                   row.Sex.OneHotEncoding(),
                                                   row.HoursPerWeek,
                                                   row.NativeCountry.OneHotEncoding().SelectFeaturesBasedOnCount(count: 10)),
                                               Label: row.IsOver50K))
                                   .Append(row => (
                                               Features: row.Features.Normalize(),
                                               Label: row.Label,
                                               Score: mlContext.BinaryClassification.Trainers.LightGbm(
                                                   row.Label,
                                                   row.Features,
                                                   numberOfLeaves: 4,
                                                   minimumExampleCountPerLeaf: 6,
                                                   learningRate: 0.001)))
                                   .Append(row => (
                                               Label: row.Label,
                                               Score: row.Score,
                                               PredictedLabel: row.Score.predictedLabel));

            // Fit this Pipeline to the Training Data
            var model = learningPipeline.Fit(trainData);

            // Evaluate how the model is doing on the test data
            var dataWithPredictions = model.Transform(testData);

            var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, row => row.Label, row => row.Score);

            Console.WriteLine($"Accuracy: {metrics.Accuracy}");                    // 0.84
            Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve}");                // 0.89
            Console.WriteLine($"F1 Score: {metrics.F1Score}");                     // 0.64

            Console.WriteLine($"Negative Precision: {metrics.NegativePrecision}"); // 0.88
            Console.WriteLine($"Negative Recall: {metrics.NegativeRecall}");       // 0.91
            Console.WriteLine($"Positive Precision: {metrics.PositivePrecision}"); // 0.68
            Console.WriteLine($"Positive Recall: {metrics.PositiveRecall}");       // 0.60
        }
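
The same trainer configuration can be sketched with the dynamic API's options object (assumes `mlContext` is an MLContext and the Microsoft.ML.LightGbm package is referenced):

        // Sketch only: LightGBM binary trainer via the dynamic API.
        var lightGbm = mlContext.BinaryClassification.Trainers.LightGbm(
            new LightGbmBinaryTrainer.Options
            {
                NumberOfLeaves = 4,
                MinimumExampleCountPerLeaf = 6,
                LearningRate = 0.001,
            });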
        public static void Example()
        {
            // Downloading a regression dataset from github.com/dotnet/machinelearning.
            // This will create a housing.txt file in the filesystem.
            // You can open the file to see the data.
            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
            // as well as the source of randomness.
            var mlContext = new MLContext();

            // Creating a data loader, based on the format of the data
            var loader = TextLoaderStatic.CreateLoader(mlContext, c => (
                                                           label: c.LoadFloat(0),
                                                           features: c.LoadFloat(1, 6)
                                                           ),
                                                       separator: '\t', hasHeader: true);

            // Load the data, and leave 10% out, so we can use them for testing
            var data = loader.Load(new MultiFileSource(dataFile));

            var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);

            // The predictor that gets produced out of training
            LightGbmRegressionModelParameters pred = null;

            // Create the estimator
            var learningPipeline = loader.MakeNewEstimator()
                                   .Append(r => (r.label, score: mlContext.Regression.Trainers.LightGbm(
                                                     r.label,
                                                     r.features,
                                                     numberOfLeaves: 4,
                                                     minimumExampleCountPerLeaf: 6,
                                                     learningRate: 0.001,
                                                     onFit: p => pred = p)
                                                 )
                                           );

            // Fit this pipeline to the training data
            var model = learningPipeline.Fit(trainData);

            // Check the weights that the model learned
            VBuffer<float> weights = default;

            pred.GetFeatureWeights(ref weights);

            var weightsValues = weights.GetValues();

            Console.WriteLine($"weight 0 - {weightsValues[0]}");
            Console.WriteLine($"weight 1 - {weightsValues[1]}");

            // Evaluate how the model is doing on the test data
            var dataWithPredictions = model.Transform(testData);
            var metrics             = mlContext.Regression.Evaluate(dataWithPredictions, r => r.label, r => r.score);

            Console.WriteLine($"L1 - {metrics.MeanAbsoluteError}");      // 4.9669731
            Console.WriteLine($"L2 - {metrics.MeanSquaredError}");       // 51.37296
            Console.WriteLine($"LossFunction - {metrics.LossFunction}"); // 51.37296
            Console.WriteLine($"RMS - {metrics.RootMeanSquaredError}");  // 7.167493
            Console.WriteLine($"RSquared - {metrics.RSquared}");         // 0.079478
        }
        public void FastTreeRegressionRepresentation()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var catalog = new RegressionCatalog(env);

            var reader = TextLoaderStatic.CreateLoader(env,
                                                       c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                       separator: ';', hasHeader: true);

            var opts = new FastTreeRegressionTrainer.Options()
            {
                NumTrees   = 10,
                NumLeaves  = 5,
                NumThreads = 1
            };

            FastTreeRegressionModelParameters pred = null;

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, null, opts,
                                                                              onFit: (p) => { pred = p; })));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);

            var treeCollection = pred.TrainedTreeEnsemble;

            Assert.Equal(0, treeCollection.Bias);
            Assert.Equal(10, treeCollection.Trees.Count);
            Assert.Equal(10, treeCollection.TreeWeights.Count);

            var trees = treeCollection.Trees;

            Assert.Equal(4, trees[0].NumNodes);

            // Numerical split. There is no categorical split, so the following vector contains zero elements.
            var categoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(0);

            Assert.Equal(0, categoricalSplitFeatures.Count);

            // Numerical split. There is no categorical split, so the following vector contains zero elements.
            var categoricalSplitFeatureRange = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(0);

            Assert.Equal(0, categoricalSplitFeatureRange.Count);

            var expectedGtChild = new int[] { 3, 2, -4, -5 };

            Assert.Equal(4, trees[0].GtChild.Count);
            Assert.Equal(expectedGtChild, trees[0].GtChild);

            var expectedLteChild = new int[] { 1, -1, -3, -2 };

            Assert.Equal(4, trees[0].LteChild.Count);
            Assert.Equal(expectedLteChild, trees[0].LteChild);

            var expectedCategoricalSplitFlags = new bool[] { false, false, false, false };

            Assert.Equal(4, trees[0].CategoricalSplitFlags.Count);
            Assert.Equal(expectedCategoricalSplitFlags, trees[0].CategoricalSplitFlags);

            var expectedNumericalSplitFeatureIndexes = new int[] { 0, 10, 2, 10 };

            Assert.Equal(4, trees[0].NumericalSplitFeatureIndexes.Count);
            Assert.Equal(expectedNumericalSplitFeatureIndexes, trees[0].NumericalSplitFeatureIndexes);

            var expectedNumericalSplitThresholds = new float[] { 0.14f, -0.645f, -0.095f, 0.31f };

            Assert.Equal(4, trees[0].NumericalSplitThresholds.Count);
            for (int i = 0; i < trees[0].NumericalSplitThresholds.Count; ++i)
            {
                Assert.Equal(expectedNumericalSplitThresholds[i], trees[0].NumericalSplitThresholds[i], 6);
            }

            Assert.Equal(5, trees[0].NumLeaves);

            var expectedLeafValues = new double[] { 40.159015006449692, 80.434805844435061, 57.072130551545513, 82.898710076162757, 104.17547955322266 };

            Assert.Equal(5, trees[0].LeafValues.Count);
            for (int i = 0; i < trees[0].LeafValues.Count; ++i)
            {
                Assert.Equal(expectedLeafValues[i], trees[0].LeafValues[i], 6);
            }
        }
        public void FastTreeRegressionRepresentationWithCategoricalSplit()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var catalog = new RegressionCatalog(env);

            var reader = TextLoaderStatic.CreateLoader(env,
                                                       c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                       separator: ';', hasHeader: true);

            FastTreeRegressionModelParameters pred = null;

            var opts = new FastTreeRegressionTrainer.Options()
            {
                CategoricalSplit = true,
                NumTrees         = 3,
                NumLeaves        = 5,
                NumThreads       = 1,
                // This is the minimum number of samples required to form a split (i.e., to generate two extra nodes/leaves).
                // For a small data set we should use a small value; otherwise the trained trees could be empty.
                MinDocumentsInLeafs = 2
            };

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, features: r.features.OneHotEncoding()))
                      .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, null, opts,
                                                                              onFit: (p) => { pred = p; })));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);

            var treeCollection = pred.TrainedTreeEnsemble;

            Assert.Equal(0, treeCollection.Bias);
            Assert.Equal(3, treeCollection.Trees.Count);
            Assert.Equal(3, treeCollection.TreeWeights.Count);

            var trees = treeCollection.Trees;

            Assert.Equal(4, trees[0].NumNodes);

            var expectedGtChild = new int[] { 3, -3, -4, -5 };

            Assert.Equal(4, trees[0].GtChild.Count);
            Assert.Equal(expectedGtChild, trees[0].GtChild);

            var expectedLteChild = new int[] { 1, 2, -1, -2 };

            Assert.Equal(4, trees[0].LteChild.Count);
            Assert.Equal(expectedLteChild, trees[0].LteChild);

            var expectedCategoricalSplitFlags = new bool[] { true, true, true, true };

            Assert.Equal(4, trees[0].CategoricalSplitFlags.Count);
            Assert.Equal(expectedCategoricalSplitFlags, trees[0].CategoricalSplitFlags);

            var expectedNumericalSplitFeatureIndexes = new int[] { 5312, 2, 2126, 533 };

            Assert.Equal(4, trees[0].NumericalSplitFeatureIndexes.Count);
            Assert.Equal(expectedNumericalSplitFeatureIndexes, trees[0].NumericalSplitFeatureIndexes);

            var expectedNumericalSplitThresholds = new float[] { 0.5f, 0.5f, 0.5f, 0.5f };

            Assert.Equal(4, trees[0].NumericalSplitThresholds.Count);
            for (int i = 0; i < trees[0].NumericalSplitThresholds.Count; ++i)
            {
                Assert.Equal(expectedNumericalSplitThresholds[i], trees[0].NumericalSplitThresholds[i], 6);
            }

            var actualCategoricalRanges0 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(0);

            Assert.Equal(actualCategoricalRanges0, new int[] { 5312, 5782 });

            var actualCategoricalRanges1 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(1);

            Assert.Equal(actualCategoricalRanges1, new int[] { 2, 417 });

            var actualCategoricalRanges2 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(2);

            Assert.Equal(actualCategoricalRanges2, new int[] { 2126, 2593 });

            var actualCategoricalRanges3 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(3);

            Assert.Equal(actualCategoricalRanges3, new int[] { 533, 983 });

            int[] expectedCounts = { 62, 52, 54, 22 };
            int[] expectedStarts = { 5315, 10, 2141, 533 };
            int[] expectedEnds   = { 5782, 401, 2558, 874 };
            for (int i = 0; i < trees[0].NumNodes; ++i)
            {
                // Retrieve i-th node's split features.
                var actualCategoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(i);
                Assert.Equal(expectedCounts[i], actualCategoricalSplitFeatures.Count);
                Assert.Equal(expectedStarts[i], actualCategoricalSplitFeatures[0]);
                Assert.Equal(expectedEnds[i], actualCategoricalSplitFeatures[expectedCounts[i] - 1]);
            }

            Assert.Equal(5, trees[0].NumLeaves);

            var expectedLeafValues = new double[] { 48.456055413607892, 86.584156799316418, 87.017326642027, 76.381184971185391, 117.68872643673058 };

            Assert.Equal(5, trees[0].LeafValues.Count);
            for (int i = 0; i < trees[0].LeafValues.Count; ++i)
            {
                Assert.Equal(expectedLeafValues[i], trees[0].LeafValues[i], 6);
            }
        }
Example #22
        public static void FeatureSelectionTransform()
        {
            // Downloading a classification dataset from github.com/dotnet/machinelearning.
            // It will be stored in the same path as the executable
            string dataFilePath = SamplesUtils.DatasetUtils.DownloadBreastCancerDataset();

            // Data Preview
            //    1. Label                         0=benign, 1=malignant
            //    2. Clump Thickness               1 - 10
            //    3. Uniformity of Cell Size       1 - 10
            //    4. Uniformity of Cell Shape      1 - 10
            //    5. Marginal Adhesion             1 - 10
            //    6. Single Epithelial Cell Size   1 - 10
            //    7. Bare Nuclei                   1 - 10
            //    8. Bland Chromatin               1 - 10
            //    9. Normal Nucleoli               1 - 10
            //   10. Mitoses                       1 - 10

            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
            // as well as the source of randomness.
            var ml = new MLContext();

            // First, we define the loader: specify the data columns and where to find them in the text file. Notice that we combine
            // all the feature columns into a single vector-valued column named "Features".
            var loader = TextLoaderStatic.CreateLoader(ml, c => (
                                                           Label: c.LoadBool(0),
                                                           Features: c.LoadFloat(1, 9)
                                                           ),
                                                       separator: '\t', hasHeader: true);

            // Then, we use the loader to load the data as an IDataView.
            var data = loader.Load(dataFilePath);

            // Second, we define the transformations that we apply on the data. Remember that an Estimator does not transform data
            // directly, but it needs to be trained on data using .Fit(), and it will output a Transformer, which can transform data.

            // In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default
            // values than the specified count. This transformation can be used to remove slots with too many missing values.
            // We also define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature
            // vector based on highest mutual information between that slot and a specified label. Notice that it is possible to
            // specify the parameter `numBins', which controls the number of bins used in the approximation of the mutual information
            // between features and label.
            var pipeline = loader.MakeNewEstimator()
                           .Append(r => (
                                       FeaturesCountSelect: r.Features.SelectFeaturesBasedOnCount(count: 695),
                                       Label: r.Label
                                       ))
                           .Append(r => (
                                       FeaturesCountSelect: r.FeaturesCountSelect,
                                       FeaturesMISelect: r.FeaturesCountSelect.SelectFeaturesBasedOnMutualInformation(r.Label, slotsInOutput: 5),
                                       Label: r.Label
                                       ));


            // The pipeline can then be trained, using .Fit(), and the resulting transformer can be used to transform data.
            var transformedData = pipeline.Fit(data).Transform(data);

            // Small helper to print the data inside a column, in the console. Only prints the first 10 rows.
            Action<string, IEnumerable<VBuffer<float>>> printHelper = (columnName, column) =>
            {
                Console.WriteLine($"{columnName} column obtained post-transformation.");
                int count = 0;
                foreach (var row in column)
                {
                    foreach (var value in row.GetValues())
                    {
                        Console.Write($"{value}\t");
                    }
                    Console.WriteLine("");
                    count++;
                    if (count >= 10)
                    {
                        break;
                    }
                }

                Console.WriteLine("===================================================");
            };

            // Print the data that results from the transformations.
            var countSelectColumn = transformedData.AsDynamic.GetColumn<VBuffer<float>>(transformedData.AsDynamic.Schema["FeaturesCountSelect"]);
            var MISelectColumn    = transformedData.AsDynamic.GetColumn<VBuffer<float>>(transformedData.AsDynamic.Schema["FeaturesMISelect"]);

            printHelper("FeaturesCountSelect", countSelectColumn);
            printHelper("FeaturesMISelect", MISelectColumn);

            // Below is the output of this code. We see that some slots have been dropped by the first transformation.
            // Among the remaining slots, the second transformation preserves only the top 5 slots based on mutual
            // information with the label column.

            // FeaturesCountSelect column obtained post-transformation.
            // 5       4       4       5       7       3       2       1
            // 3       1       1       1       2       3       1       1
            // 6       8       8       1       3       3       7       1
            // 4       1       1       3       2       3       1       1
            // 8       10      10      8       7       9       7       1
            // 1       1       1       1       2       3       1       1
            // 2       1       2       1       2       3       1       1
            // 2       1       1       1       2       1       1       5
            // 4       2       1       1       2       2       1       1
            // 1       1       1       1       1       3       1       1
            // ===================================================
            // FeaturesMISelect column obtained post-transformation.
            // 4       4       7       3       2
            // 1       1       2       3       1
            // 8       8       3       3       7
            // 1       1       2       3       1
            // 10      10      7       9       7
            // 1       1       2       3       1
            // 1       2       2       3       1
            // 1       1       2       1       1
            // 2       1       2       2       1
            // 1       1       1       3       1
            // ===================================================
        }
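
A hedged sketch of the same two selections through the dynamic feature-selection catalog, assuming `ml` is the MLContext created above and the data has "Features"/"Label" columns:

        // Sketch only: count-based and mutual-information feature selection via the dynamic API.
        var selection = ml.Transforms.FeatureSelection
            .SelectFeaturesBasedOnCount("FeaturesCountSelect", "Features", count: 695)
            .Append(ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(
                "FeaturesMISelect", "FeaturesCountSelect", labelColumnName: "Label", slotsInOutput: 5));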