Example #1
0
        public void FeatureSelectionWorkout()
        {
            // Builds a bag-of-words over the sentiment text, then applies
            // count-based feature selection (threshold 10) and mutual-information
            // feature selection against the label; the first four rows are
            // compared with the checked-in baseline.
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = TextLoaderStatic.CreateReader(ML, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadText(1)), hasHeader: true)
                .Read(sentimentDataPath);

            // Same file with "text" loaded as float, mirroring the other workouts.
            // NOTE(review): unused here — TestEstimatorCore is not invoked in this test.
            var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadFloat(1)), hasHeader: true)
                .Read(sentimentDataPath);

            var est = new WordBagEstimator(ML, "text", "bag_of_words")
                .AppendCacheCheckpoint(ML)
                .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words", "bag_of_words_count", 10)
                    .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words", "bag_of_words_mi", labelColumn: "label")));

            var outputPath = GetOutputPath("FeatureSelection", "featureselection.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
                // Consistency fix: use the same ML.Data.TakeRows / SelectColumns
                // helpers as the other workouts in this file rather than the
                // lower-level TakeFilter / ColumnSelectingTransformer factories.
                var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ML.Transforms.SelectColumns("bag_of_words_count", "bag_of_words_mi").Fit(savedData).Transform(savedData);

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("FeatureSelection", "featureselection.tsv");
            Done();
        }
        public void TokenizeWithSeparators()
        {
            // Tokenizes the sentiment text on a custom separator set and compares
            // the first four rows of the "words" column to the checked-in baseline.
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var loaded = TextLoaderStatic.CreateReader(Env, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadText(1)), hasHeader: true)
                .Read(dataPath).AsDynamic;

            var tokenizer = new WordTokenizingEstimator(Env, "words", "text", separators: new[] { ' ', '?', '!', '.', ',' });
            var fewRows = ML.Data.TakeRows(tokenizer.Fit(loaded).Transform(loaded), 4);
            var selected = ML.Transforms.SelectColumns("words").Fit(fewRows).Transform(fewRows);

            var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
            var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");

            using (var ch = Env.Start("save"))
            using (var fs = File.Create(outputPath))
                DataSaverUtils.SaveDataView(ch, textSaver, selected, fs, keepHidden: true);

            CheckEquality("Text", "tokenizedWithSeparators.tsv");
            Done();
        }
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            // Pipeline: normalize text -> tokenize -> remove default stop words
            // and remove a custom stop-word list; the first four rows are compared
            // against the checked-in baseline.
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var goodData = TextLoaderStatic.CreateReader(ML, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadText(1)), hasHeader: true)
                .Read(sentimentDataPath);

            // Same file, "text" loaded as float — serves as the invalid input.
            var badData = TextLoaderStatic.CreateReader(ML, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadFloat(1)), hasHeader: true)
                .Read(sentimentDataPath);

            var pipeline = ML.Transforms.Text.NormalizeText("text")
                .Append(ML.Transforms.Text.TokenizeWords("words", "text"))
                .Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words"))
                .Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", "THAT", "bY"));

            TestEstimatorCore(pipeline, goodData.AsDynamic, invalidInput: badData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
            var fewRows = ML.Data.TakeRows(pipeline.Fit(goodData.AsDynamic).Transform(goodData.AsDynamic), 4);

            fewRows = ML.Transforms.SelectColumns("text", "NoDefaultStopwords", "NoStopWords").Fit(fewRows).Transform(fewRows);
            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(fewRows, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }
Example #4
0
        public void SimpleImageSmokeTest()
        {
            // Smoke-tests the static image pipeline (load -> grayscale -> resize
            // to 10x8 -> pixel extraction) by inspecting the reported output schema.
            var env = new MLContext(0);

            var reader = TextLoaderStatic.CreateReader(env,
                ctx => ctx.LoadText(0).LoadAsImage().AsGrayscale().Resize(10, 8).ExtractPixels());

            var schema = reader.AsDynamic.GetOutputSchema();

            Assert.True(schema.TryGetColumnIndex("Data", out int col), "Could not find 'Data' column");
            var type = schema[col].Type;
            var vecType = type as VectorType;

            // The pixel column must be a known-size R4 vector with three dimensions.
            Assert.True(vecType?.Size > 0, $"Type was supposed to be known size vector but was instead '{type}'");
            Assert.Equal(NumberType.R4, vecType.ItemType);
            Assert.Equal(3, vecType.Dimensions.Length);
            Assert.Equal(3, vecType.Dimensions[0]);
            Assert.Equal(8, vecType.Dimensions[1]);
            Assert.Equal(10, vecType.Dimensions[2]);

            // Also exercise the two-stage form: load the image first, then apply
            // the same transforms via an estimator appended to the reader.
            var readAsImage = TextLoaderStatic.CreateReader(env,
                ctx => ctx.LoadText(0).LoadAsImage());
            var imageEst = readAsImage.MakeNewEstimator().Append(r => r.AsGrayscale().Resize(10, 8).ExtractPixels());
            var pipe = readAsImage.Append(imageEst);
        }
        public void WhiteningWorkout()
        {
            // Chains default whitening ("whitened1") with PCA whitening limited
            // to five components ("whitened2"); four rows compared to baseline.
            string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var goodData = TextLoaderStatic.CreateReader(ML,
                    c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                    separator: ';', hasHeader: true)
                .Read(dataSource);

            // Features loaded as text — schema mismatch used as the invalid input.
            var badData = TextLoaderStatic.CreateReader(ML,
                    c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                    separator: ';', hasHeader: true)
                .Read(dataSource);

            var pipeline = new VectorWhiteningEstimator(ML, "whitened1", "features")
                .Append(new VectorWhiteningEstimator(ML, "whitened2", "features", kind: WhiteningKind.Pca, pcaNum: 5));

            TestEstimatorCore(pipeline, goodData.AsDynamic, invalidInput: badData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });
                var fewRows = ML.Data.TakeRows(pipeline.Fit(goodData.AsDynamic).Transform(goodData.AsDynamic), 4);
                fewRows = ML.Transforms.SelectColumns("whitened1", "whitened2").Fit(fewRows).Transform(fewRows);

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, fewRows, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
        public void GcnWorkout()
        {
            // Global contrast normalization: once with defaults ("gcnNorm1") and
            // once with std-dev scaling, no mean subtraction, and scale 3
            // ("gcnNorm2"); four rows compared to the checked-in baseline.
            string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var goodData = TextLoaderStatic.CreateReader(ML,
                    c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                    separator: ';', hasHeader: true)
                .Read(dataSource);

            // Features loaded as text — schema mismatch used as the invalid input.
            var badData = TextLoaderStatic.CreateReader(ML,
                    c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                    separator: ';', hasHeader: true)
                .Read(dataSource);

            var pipeline = ML.Transforms.Projection.GlobalContrastNormalize("gcnNorm1", "features")
                .Append(ML.Transforms.Projection.GlobalContrastNormalize("gcnNorm2", "features", substractMean: false, useStdDev: true, scale: 3));

            TestEstimatorCore(pipeline, goodData.AsDynamic, invalidInput: badData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "gcnNorm.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });
                var fewRows = ML.Data.TakeRows(pipeline.Fit(goodData.AsDynamic).Transform(goodData.AsDynamic), 4);
                fewRows = ML.Transforms.SelectColumns("gcnNorm1", "gcnNorm2").Fit(fewRows).Transform(fewRows);

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, fewRows, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "gcnNorm.tsv", digitsOfPrecision: 4);
            Done();
        }
        public void WordBagWorkout()
        {
            // Produces a plain word-bag column and a hashed word-bag column (with
            // hash inversion enabled); four rows are compared against the baseline.
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var goodData = TextLoaderStatic.CreateReader(ML, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadText(1)), hasHeader: true)
                .Read(sentimentDataPath);

            var badData = TextLoaderStatic.CreateReader(ML, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadFloat(1)), hasHeader: true)
                .Read(sentimentDataPath);

            var pipeline = new WordBagEstimator(ML, "bag_of_words", "text")
                .Append(new WordHashBagEstimator(ML, "bag_of_wordshash", "text", invertHash: -1));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(pipeline, goodData.AsDynamic, invalidInput: badData.AsDynamic);

            var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
            var fewRows = ML.Data.TakeRows(pipeline.Fit(goodData.AsDynamic).Transform(goodData.AsDynamic), 4);

            fewRows = ML.Transforms.SelectColumns("text", "bag_of_words", "bag_of_wordshash").Fit(fewRows).Transform(fewRows);

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(fewRows, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "bag_of_words.tsv");
            Done();
        }
Example #8
0
        public void SdcaWorkout()
        {
            // Runs TestEstimatorCore over the binary, regression, and multiclass
            // SDCA trainers on the cached breast-cancer data.
            var dataPath = GetDataPath("breast-cancer.txt");

            var cachedData = TextLoaderStatic.CreateReader(Env, ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10)))
                .Read(dataPath).Cache();

            // A loose convergence tolerance keeps these workouts fast.
            var binaryTrainer = ML.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
                new SdcaBinaryTrainer.Options { ConvergenceTolerance = 1e-2f });
            TestEstimatorCore(binaryTrainer, cachedData.AsDynamic);

            var regressionTrainer = ML.Regression.Trainers.StochasticDualCoordinateAscent(
                new SdcaRegressionTrainer.Options { ConvergenceTolerance = 1e-2f });
            TestEstimatorCore(regressionTrainer, cachedData.AsDynamic);

            var mcTrainer = ML.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(
                new SdcaMultiClassTrainer.Options { ConvergenceTolerance = 1e-2f });
            TestEstimatorCore(mcTrainer, cachedData.AsDynamic);

            Done();
        }
Example #9
0
        public void LpNormWorkout()
        {
            // Lp normalization of the feature vector: default norm ("lpNorm1")
            // and mean-subtracted L1 norm ("lpNorm2"); four rows are compared
            // against the checked-in baseline.
            string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var data = TextLoaderStatic.CreateReader(ML,
                    c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                    separator: ';', hasHeader: true)
                .Read(dataSource);

            // Features loaded as text — schema mismatch used as the invalid input.
            var invalidData = TextLoaderStatic.CreateReader(ML,
                    c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                    separator: ';', hasHeader: true)
                .Read(dataSource);

            var est = new LpNormalizingEstimator(ML, "lpNorm1", "features")
                .Append(new LpNormalizingEstimator(ML, "lpNorm2", "features", normKind: LpNormalizingEstimatorBase.NormalizerKind.L1Norm, substractMean: true));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "lpNorm.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, OutputHeader = false });
                // Consistency fix: use ML.Data.TakeRows / ML.Transforms.SelectColumns
                // (and ML throughout, instead of mixing in Env) like the other
                // workouts in this file, rather than the lower-level TakeFilter /
                // ColumnSelectingTransformer static factories.
                var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ML.Transforms.SelectColumns("lpNorm1", "lpNorm2").Fit(savedData).Transform(savedData);

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "lpNorm.tsv");
            Done();
        }
Example #10
0
        public void TestWordEmbeddings()
        {
            // Featurizes the sentiment text so that only the tokenized-text column
            // is produced, re-asserts the dynamic output as a static schema, and
            // then appends a word-embeddings transform over the token stream.
            var dataPath     = GetDataPath(TestDatasets.Sentiment.trainFilename);
            // NOTE(review): testDataPath is never used in this test.
            var testDataPath = GetDataPath(TestDatasets.Sentiment.testFilename);

            var data = TextLoaderStatic.CreateReader(Env, ctx => (
                                                         label: ctx.LoadBool(0),
                                                         SentimentText: ctx.LoadText(1)), hasHeader: true)
                       .Read(dataPath);
            // Configure featurization to emit tokens only: word/char ngram
            // extraction and vector normalization are all switched off.
            var dynamicData = new TextFeaturizingEstimator(Env, "SentimentText_Features", "SentimentText", args =>
            {
                args.OutputTokens     = true;
                args.KeepPunctuations = false;
                args.UseStopRemover   = true;
                args.VectorNormalizer = TextFeaturizingEstimator.TextNormKind.None;
                args.UseCharExtractor = false;
                args.UseWordExtractor = false;
            }).Fit(data.AsDynamic).Transform(data.AsDynamic);
            // Re-enter static typing by asserting the featurized view's schema.
            var data2 = dynamicData.AssertStatic(Env, ctx => (
                                                     SentimentText_Features_TransformedText: ctx.Text.VarVector,
                                                     SentimentText: ctx.Text.Scalar,
                                                     label: ctx.Bool.Scalar));

            var est = data2.MakeNewEstimator()
                      .Append(row => row.SentimentText_Features_TransformedText.WordEmbeddings());

            // The raw reader output (pre-featurization schema) is the invalid input.
            TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic);
            Done();
        }
Example #11
0
        public void RffStatic()
        {
            // Applies the random Fourier feature transform (3 output dimensions;
            // second argument presumably toggles the cos/sin variant — TODO confirm)
            // to the float vector and saves four rows for baseline comparison.
            string dataPath = GetDataPath("breast-cancer.txt");
            var reader = TextLoaderStatic.CreateReader(Env, ctx => (
                    VectorFloat: ctx.LoadFloat(1, 8),
                    Label: ctx.LoadFloat(0)
                    ));

            var loaded = reader.Read(dataPath);

            var pipeline = loaded.MakeNewEstimator()
                .Append(row => (
                    RffVectorFloat: row.VectorFloat.LowerVectorSizeWithRandomFourierTransformation(3, true), row.Label));

            TestEstimatorCore(pipeline.AsDynamic, loaded.AsDynamic);

            var outputPath = GetOutputPath("Rff", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
                IDataView fewRows = TakeFilter.Create(Env, pipeline.Fit(loaded).Transform(loaded).AsDynamic, 4);
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, fewRows, fs, keepHidden: true);
            }
            CheckEquality("Rff", "featurized.tsv");
            Done();
        }
        public void NgramWorkout()
        {
            // Tokenize -> map tokens to keys -> count-based ngrams and hashed
            // ngrams; four rows are compared against the checked-in baseline.
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var goodData = TextLoaderStatic.CreateReader(ML, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadText(1)), hasHeader: true)
                .Read(sentimentDataPath);

            var badData = TextLoaderStatic.CreateReader(ML, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadFloat(1)), hasHeader: true)
                .Read(sentimentDataPath);

            var pipeline = new WordTokenizingEstimator(ML, "text", "text")
                .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
                .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
                .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"));

            TestEstimatorCore(pipeline, goodData.AsDynamic, invalidInput: badData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");
            var fewRows = ML.Data.TakeRows(pipeline.Fit(goodData.AsDynamic).Transform(goodData.AsDynamic), 4);

            fewRows = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash").Fit(fewRows).Transform(fewRows);

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(fewRows, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
Example #13
0
        // Fixture for the static-pipeline analyzer: it exercises lambda shapes the
        // analyzer should accept and shapes it should reject. The "should not"
        // lines below are intentional negative cases — do not "fix" them.
        public static void Bar()
        {
            IHostEnvironment env = null;
            var text             = TextLoaderStatic.CreateReader(env, ctx => (
                                                                     label: ctx.LoadBool(0),
                                                                     text: ctx.LoadText(1),
                                                                     numericFeatures: ctx.LoadFloat(2, 5)));

            var est = text.MakeNewEstimator();

            // This should work.
            est.Append(r => r.text);
            // These should not.
            est.Append(r => 5);
            est.Append(r => new { r.text, bad = 2 });
            // This should work.
            est.Append(r => Tuple.Create(r.text, r.numericFeatures));
            // This should work.
            est.Append(r => (a: r.text, b: r.label, c: (d: r.text, r.label)));
            // This should not, and it should indicate a path to the problematic item.
            est.Append(r => (a: r.text, b: r.label, c: (d: r.text, 5.2f)));

            // Check a different entrance into static land now, with one of the asserts.
            var view = text.Read(null).AsDynamic;

            // Despite the fact that the names are all wrong, this should still work
            // from the point of view of this analyzer.
            view.AssertStatic(env, c => (
                                  stay: c.KeyU4.TextValues.Scalar,
                                  awhile: c.KeyU1.I4Values.Vector));
            // However, this should not.
            view.AssertStatic(env, c => (
                                  and: c.KeyU4.TextValues.Scalar,
                                  listen: 1l));
        }
Example #14
0
        public void KeyToVectorStatic()
        {
            // Maps string columns to key types dynamically, re-asserts the result
            // as a static schema, then converts the keys back to indicator vectors
            // (including the bagged form for the vector column).
            string dataPath = GetDataPath("breast-cancer.txt");
            var reader = TextLoaderStatic.CreateReader(Env, ctx => (
                    ScalarString: ctx.LoadText(1),
                    VectorString: ctx.LoadText(1, 4)
                    ));

            var loaded = reader.Read(dataPath);

            // Non-pigsty Term.
            var keyed = new ValueToKeyMappingEstimator(Env, new[] {
                    new ValueToKeyMappingTransformer.ColumnInfo("ScalarString", "A"),
                    new ValueToKeyMappingTransformer.ColumnInfo("VectorString", "B")
                })
                .Fit(loaded.AsDynamic).Transform(loaded.AsDynamic);

            var staticKeyed = keyed.AssertStatic(Env, ctx => (
                A: ctx.KeyU4.TextValues.Scalar,
                B: ctx.KeyU4.TextValues.Vector));

            var pipeline = staticKeyed.MakeNewEstimator()
                .Append(row => (
                    ScalarString: row.A.ToVector(),
                    VectorString: row.B.ToVector(),
                    VectorBaggedString: row.B.ToBaggedVector()
                    ));

            TestEstimatorCore(pipeline.AsDynamic, staticKeyed.AsDynamic, invalidInput: loaded.AsDynamic);

            Done();
        }
Example #15
0
        public static void SdcaRegression()
        {
            // Trains an SDCA linear regressor on the housing data with a 90/10
            // train/test split, prints two learned weights, and evaluates the
            // model on the held-out portion.

            // Downloading a regression dataset from github.com/dotnet/machinelearning;
            // this creates a housing.txt file on disk which you can open to inspect.
            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

            // MLContext is the entry point for all ML.NET operations.
            var mlContext = new MLContext();

            // A statically-typed reader matching the file layout.
            var reader = TextLoaderStatic.CreateReader(mlContext, c => (
                    label: c.LoadFloat(0),
                    features: c.LoadFloat(1, 6)
                    ),
                separator: '\t', hasHeader: true);

            // Read the data, and leave 10% out, so we can use them for testing.
            var data = reader.Read(dataFile);
            var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1);

            // The trained model parameters are captured via the onFit callback.
            LinearRegressionModelParameters pred = null;

            var learningPipeline = reader.MakeNewEstimator()
                .Append(r => (r.label, score: mlContext.Regression.Trainers.Sdca(
                    r.label,
                    r.features,
                    l1Threshold: 0f,
                    maxIterations: 100,
                    onFit: p => pred = p)));

            // Fit this pipeline to the training data.
            var model = learningPipeline.Fit(trainData);

            // Check the weights that the model learned.
            VBuffer <float> weights = default;
            pred.GetFeatureWeights(ref weights);
            var weightsValues = weights.GetValues();
            Console.WriteLine($"weight 0 - {weightsValues[0]}");
            Console.WriteLine($"weight 1 - {weightsValues[1]}");

            // Evaluate how the model is doing on the held-out test data.
            var dataWithPredictions = model.Transform(testData);
            var metrics = mlContext.Regression.Evaluate(dataWithPredictions, r => r.label, r => r.score);

            Console.WriteLine($"L1 - {metrics.L1}");               // 3.7226085
            Console.WriteLine($"L2 - {metrics.L2}");               // 24.250636
            Console.WriteLine($"LossFunction - {metrics.LossFn}"); // 24.25063
            Console.WriteLine($"RMS - {metrics.Rms}");             // 4.924493
            Console.WriteLine($"RSquared - {metrics.RSquared}");   // 0.565467
        }
Example #16
0
        public void SdcaMulticlass()
        {
            // Trains multiclass SDCA on iris with a custom hinge loss (so the
            // predictions are uncalibrated), checks the learned weights/biases,
            // and evaluates the scored data.
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.iris.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var ctx    = new MulticlassClassificationContext(env);
            var reader = TextLoaderStatic.CreateReader(env,
                                                       c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

            // Populated by the onFit callback during Fit below.
            MulticlassLogisticRegressionModelParameters pred = null;

            var loss = new HingeLoss(1);

            // With a custom loss function we no longer get calibrated predictions.
            var est = reader.MakeNewEstimator()
                      .Append(r => (label: r.label.ToKey(), r.features))
                      .Append(r => (r.label, preds: ctx.Trainers.Sdca(
                                        r.label,
                                        r.features,
                                        maxIterations: 2,
                                        loss: loss, onFit: p => pred = p)));

            var pipe = reader.Append(est);

            // onFit must not have run before Fit, and must have run after.
            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);
            // Expect one weight vector of length 4 (features) per class, 3 classes.
            VBuffer <float>[] weights = default;
            pred.GetWeights(ref weights, out int n);
            Assert.True(n == 3 && n == weights.Length);
            foreach (var w in weights)
            {
                Assert.True(w.Length == 4);
            }

            var biases = pred.GetBiases();

            Assert.True(biases.Count() == 3);

            var data = model.Read(dataSource);

            // Just output some data on the schema for fun.
            var schema = data.AsDynamic.Schema;

            for (int c = 0; c < schema.Count; ++c)
            {
                Console.WriteLine($"{schema[c].Name}, {schema[c].Type}");
            }

            var metrics = ctx.Evaluate(data, r => r.label, r => r.preds, 2);

            Assert.True(metrics.LogLoss > 0);
            Assert.True(metrics.TopKAccuracy > 0);
        }
Example #17
0
        public void KMeans()
        {
            // Trains k-means (k=3, single thread) on iris, checks the centroids,
            // and exercises every overload combination of Clustering.Evaluate
            // (with/without label and features).
            var env        = new MLContext(seed: 0, conc: 1);
            var dataPath   = GetDataPath(TestDatasets.iris.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var reader = TextLoaderStatic.CreateReader(env,
                                                       c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

            // Populated by the onFit callback during Fit below.
            KMeansModelParameters pred = null;

            var est = reader.MakeNewEstimator()
                      .AppendCacheCheckpoint()
                      .Append(r => (label: r.label.ToKey(), r.features))
                      .Append(r => (r.label, r.features, preds: env.Clustering.Trainers.KMeans(r.features, clustersCount: 3, onFit: p => pred = p, advancedSettings: s => s.NumThreads = 1)));

            var pipe = reader.Append(est);

            // onFit must not have run before Fit, and must have run after.
            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);

            VBuffer <float>[] centroids = default;
            int k;

            pred.GetClusterCentroids(ref centroids, out k);

            Assert.True(k == 3);

            var data = model.Read(dataSource);

            // Full evaluation: label enables NMI, features enable DBI.
            var metrics = env.Clustering.Evaluate(data, r => r.preds.score, r => r.label, r => r.features);

            Assert.NotNull(metrics);

            Assert.InRange(metrics.AvgMinScore, 0.5262, 0.5264);
            Assert.InRange(metrics.Nmi, 0.73, 0.77);
            Assert.InRange(metrics.Dbi, 0.662, 0.667);

            // Without features, DBI cannot be computed and comes back as 0.
            metrics = env.Clustering.Evaluate(data, r => r.preds.score, label: r => r.label);
            Assert.NotNull(metrics);

            Assert.InRange(metrics.AvgMinScore, 0.5262, 0.5264);
            Assert.True(metrics.Dbi == 0.0);

            // Without a label, NMI cannot be computed and comes back as NaN.
            metrics = env.Clustering.Evaluate(data, r => r.preds.score, features: r => r.features);
            Assert.True(double.IsNaN(metrics.Nmi));

            // Score-only evaluation: neither NMI nor DBI is available.
            metrics = env.Clustering.Evaluate(data, r => r.preds.score);
            Assert.NotNull(metrics);
            Assert.InRange(metrics.AvgMinScore, 0.5262, 0.5264);
            Assert.True(double.IsNaN(metrics.Nmi));
            Assert.True(metrics.Dbi == 0.0);
        }
Example #18
0
        public static void Bar()
        {
            DataReader <IMultiStreamSource, T> Foo1 <T>(Func <TextLoaderStatic.Context, T> m)
            {
                IHostEnvironment env = null;

                // We ought to fail here.
                return(TextLoaderStatic.CreateReader(env, m));
            }

            DataReader <IMultiStreamSource, T> Foo2 <[IsShape] T>(Func <TextLoaderStatic.Context, T> m)
Example #19
0
        /// <summary>
        /// Sample: trains a FastTree regression model on the downloaded housing dataset and
        /// prints 5-fold cross-validated regression metrics to the console.
        /// </summary>
        public static void FastTreeRegression()
        {
            // Downloading a regression dataset from github.com/dotnet/machinelearning
            // this will create a housing.txt file in the file system where this code will run
            // you can open the file to see the data.
            string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
            // as well as the source of randomness.
            var mlContext = new MLContext();

            // Creating a data reader, based on the format of the data
            var reader = TextLoaderStatic.CreateReader(mlContext, c => (
                                                           label: c.LoadFloat(0),
                                                           features: c.LoadFloat(1, 6)
                                                           ),
                                                       separator: '\t', hasHeader: true);

            // Read the data. Cross-validation below performs its own train/test splitting,
            // so the whole file is handed to CrossValidate as-is.
            var data = reader.Read(dataFile);

            // The predictor that gets produced out of training; captured by the onFit callback below.
            FastTreeRegressionModelParameters pred = null;

            // Create the estimator
            var learningPipeline = reader.MakeNewEstimator()
                                   .Append(r => (r.label, score: mlContext.Regression.Trainers.FastTree(
                                                     r.label,
                                                     r.features,
                                                     numTrees: 100,             // try: (int) 20-2000
                                                     numLeaves: 20,             // try: (int) 2-128
                                                     minDatapointsInLeaves: 10, // try: (int) 1-100
                                                     learningRate: 0.2,         // try: (float) 0.025-0.4
                                                     onFit: p => pred = p)
                                                 )
                                           );

            // 5-fold cross-validation: per-fold metrics are returned, then averaged below.
            var cvResults       = mlContext.Regression.CrossValidate(data, learningPipeline, r => r.label, numFolds: 5);
            var averagedMetrics = (
                L1 : cvResults.Select(r => r.metrics.L1).Average(),
                L2 : cvResults.Select(r => r.metrics.L2).Average(),
                LossFn : cvResults.Select(r => r.metrics.LossFn).Average(),
                Rms : cvResults.Select(r => r.metrics.Rms).Average(),
                RSquared : cvResults.Select(r => r.metrics.RSquared).Average()
                );

            // Expected values (from a previous run) noted at the end of each line.
            Console.WriteLine($"L1 - {averagedMetrics.L1}");               // 3.091095
            Console.WriteLine($"L2 - {averagedMetrics.L2}");               // 20.351073
            Console.WriteLine($"LossFunction - {averagedMetrics.LossFn}"); // 20.351074
            Console.WriteLine($"RMS - {averagedMetrics.Rms}");             // 4.478358
            Console.WriteLine($"RSquared - {averagedMetrics.RSquared}");   // 0.754977
        }
        public void DropSlotsTransform()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var    reader   = TextLoaderStatic.CreateReader(ML, ctx => (
                                                                ScalarFloat: ctx.LoadFloat(1),
                                                                ScalarDouble: ctx.LoadDouble(1),
                                                                VectorFloat: ctx.LoadFloat(1, 4),
                                                                VectorDouble: ctx.LoadDouble(4, 8)
                                                                ));

            var data = reader.Read(new MultiFileSource(dataPath)).AsDynamic;

            var columns = new[]
Example #21
0
        public void TestGetColumn()
        {
            // Exercises GetColumn with compatible and incompatible requested types, through
            // both the dynamic and static APIs.
            var path = GetDataPath(TestDatasets.breastCancer.trainFilename);
            var env  = new MLContext();
            var data = TextLoaderStatic.CreateReader(env, ctx => (
                                                         floatScalar: ctx.LoadFloat(1),
                                                         floatVector: ctx.LoadFloat(2, 6),
                                                         stringScalar: ctx.LoadText(4),
                                                         stringVector: ctx.LoadText(5, 7)
                                                         )).Read(path);

            // The action must throw ArgumentOutOfRangeException or InvalidOperationException,
            // possibly wrapped in TargetInvocationExceptions; the innermost exception must
            // also carry the environment's mark.
            void MustFail(Action action)
            {
                try
                {
                    action();
                    Assert.False(true);
                }
                catch (ArgumentOutOfRangeException) { }
                catch (InvalidOperationException) { }
                catch (TargetInvocationException ex)
                {
                    Exception innermost = ex;
                    while (innermost.InnerException != null)
                        innermost = innermost.InnerException;
                    Assert.True(innermost is ArgumentOutOfRangeException || innermost is InvalidOperationException);
                    Assert.True(innermost.IsMarked());
                }
            }

            // Compatible type requests should all succeed.
            var floatScalars  = data.AsDynamic.GetColumn <float>(env, "floatScalar").ToArray();
            var floatArrays   = data.AsDynamic.GetColumn <float[]>(env, "floatVector").ToArray();
            var floatBuffers  = data.AsDynamic.GetColumn <VBuffer <float> >(env, "floatVector").ToArray();

            var stringScalars = data.AsDynamic.GetColumn <string>(env, "stringScalar").ToArray();
            var stringArrays  = data.AsDynamic.GetColumn <string[]>(env, "stringVector").ToArray();

            // Mismatched type requests must be rejected.
            MustFail(() => data.AsDynamic.GetColumn <float[]>(env, "floatScalar"));
            MustFail(() => data.AsDynamic.GetColumn <int[]>(env, "floatVector"));
            MustFail(() => data.AsDynamic.GetColumn <int>(env, "floatScalar"));
            MustFail(() => data.AsDynamic.GetColumn <int?>(env, "floatScalar"));
            MustFail(() => data.AsDynamic.GetColumn <string>(env, "floatScalar"));

            // Static types.
            var staticFloatScalars  = data.GetColumn(r => r.floatScalar);
            var staticFloatVectors  = data.GetColumn(r => r.floatVector);
            var staticStringScalars = data.GetColumn(r => r.stringScalar);
            var staticStringVectors = data.GetColumn(r => r.stringVector);
        }
Example #22
0
        /// <summary>
        /// Trains the multi-class naive Bayes trainer on the iris dataset through the static
        /// pipeline API and sanity-checks the learned label/feature histograms and metrics.
        /// </summary>
        public void MultiClassNaiveBayesTrainer()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.iris.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var ctx    = new MulticlassClassificationContext(env);
            var reader = TextLoaderStatic.CreateReader(env,
                                                       c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

            MultiClassNaiveBayesModelParameters pred = null;

            // Map the text label to a key, then train; onFit captures the trained model parameters.
            var est = reader.MakeNewEstimator()
                      .Append(r => (label: r.label.ToKey(), r.features))
                      .Append(r => (r.label, preds: ctx.Trainers.MultiClassNaiveBayesTrainer(
                                        r.label,
                                        r.features, onFit: p => pred = p)));

            var pipe = reader.Append(est);

            // onFit must not fire until Fit is actually called.
            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);
            int[]   labelHistogram   = default;
            int[][] featureHistogram = default;
            pred.GetLabelHistogram(ref labelHistogram, out int labelCount1);
            pred.GetFeatureHistogram(ref featureHistogram, out int labelCount2, out int featureCount);
            // Iris has 3 classes and 4 features; both histogram views must agree on the counts.
            Assert.True(labelCount1 == 3 && labelCount1 == labelCount2 && labelCount1 <= labelHistogram.Length);
            for (int i = 0; i < labelCount1; i++)
            {
                Assert.True(featureCount == 4 && (featureCount <= featureHistogram[i].Length));
            }

            var data = model.Read(dataSource);

            // Just output some data on the schema for fun.
            var schema = data.AsDynamic.Schema;

            for (int c = 0; c < schema.Count; ++c)
            {
                Console.WriteLine($"{schema[c].Name}, {schema[c].Type}");
            }

            var metrics = ctx.Evaluate(data, r => r.label, r => r.preds, 2);

            Assert.True(metrics.LogLoss > 0);
            Assert.True(metrics.TopKAccuracy > 0);
        }
        public void CategoricalHashStatic()
        {
            // Exercises every OneHotHashEncoding output kind over scalar, fixed-vector and
            // variable-vector text columns, then compares the saved output to the baseline.
            string dataPath = GetDataPath("breast-cancer.txt");
            var reader = TextLoaderStatic.CreateReader(Env, ctx => (
                                                           ScalarString: ctx.LoadText(1),
                                                           VectorString: ctx.LoadText(1, 4)));
            var staticData = reader.Read(dataPath);

            // Rows with an incompatible schema, used to verify the estimator rejects them.
            var wrongCollection = new[]
            {
                new TestClass() { A = "1", B = "2", C = "3", },
                new TestClass() { A = "4", B = "5", C = "6" }
            };
            var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);

            var pipeline = staticData.MakeNewEstimator()
                           .Append(row => (
                                       row.ScalarString,
                                       row.VectorString,
                                       // Create a VarVector column
                                       VarVectorString: row.ScalarString.TokenizeText()))
                           .Append(row => (
                                       A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
                                       B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
                                       C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag),
                                       D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin),
                                       E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin),
                                       F: row.VarVectorString.OneHotHashEncoding()
                                       ));

            TestEstimatorCore(pipeline.AsDynamic, staticData.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("CategoricalHash", "featurized.tsv");

            // Save the first four featurized rows and diff them against the checked-in baseline.
            using (var ch = Env.Start("save"))
            {
                var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
                var firstRows = TakeFilter.Create(Env, pipeline.Fit(staticData).Transform(staticData).AsDynamic, 4);
                var keptView  = ColumnSelectingTransformer.CreateKeep(Env, firstRows, new[] { "A", "B", "C", "D", "E", "F" });
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, textSaver, keptView, fs, keepHidden: true);
            }

            CheckEquality("CategoricalHash", "featurized.tsv");
            Done();
        }
Example #24
0
        public void SdcaBinaryClassification()
        {
            // Trains a calibrated SDCA binary classifier through the static API and
            // sanity-checks the captured model weights and the evaluation metrics.
            var mlContext  = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.breastCancer.trainFilename);
            var dataSource = new MultiFileSource(dataPath);
            var binaryCtx  = new BinaryClassificationContext(mlContext);

            var reader = TextLoaderStatic.CreateReader(mlContext,
                                                       c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            LinearBinaryModelParameters        linearModel     = null;
            ParameterMixingCalibratedPredictor calibratedModel = null;

            var trainerEst = reader.MakeNewEstimator()
                             .Append(r => (r.label, preds: binaryCtx.Trainers.Sdca(r.label, r.features,
                                                                                   maxIterations: 2,
                                                                                   onFit: (p, c) => { linearModel = p; calibratedModel = c; },
                                                                                   advancedSettings: s => s.NumThreads = 1)));

            var pipe = reader.Append(trainerEst);

            // The onFit callback must only fire once Fit is called.
            Assert.Null(linearModel);
            Assert.Null(calibratedModel);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(linearModel);
            Assert.NotNull(calibratedModel);
            // 9 input features, so we ought to have 9 weights.
            Assert.Equal(9, linearModel.Weights.Count);

            var data = model.Read(dataSource);

            var metrics = binaryCtx.Evaluate(data, r => r.label, r => r.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
            Assert.InRange(metrics.LogLoss, 0, double.PositiveInfinity);
            Assert.InRange(metrics.Entropy, 0, double.PositiveInfinity);

            // Just output some data on the schema for fun.
            var schema = data.AsDynamic.Schema;

            for (int col = 0; col < schema.Count; ++col)
            {
                Console.WriteLine($"{schema[col].Name}, {schema[col].Type}");
            }
        }
        /// <summary>
        /// Workout for the LDA estimator: bag-of-words followed by latent Dirichlet
        /// allocation, saving the first rows of output and verifying the topic vector size.
        /// </summary>
        public void LdaWorkout()
        {
            IHostEnvironment env = new MLContext(seed: 42, conc: 1);
            string           sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = TextLoaderStatic.CreateReader(env, ctx => (
                                                         label: ctx.LoadBool(0),
                                                         text: ctx.LoadText(1)), hasHeader: true)
                       .Read(sentimentDataPath);

            // Same file but with the text column loaded as float — schema-invalid input
            // that would be used to test the estimator's rejection path.
            var invalidData = TextLoaderStatic.CreateReader(env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordBagEstimator(env, "text", "bag_of_words").
                      Append(new LatentDirichletAllocationEstimator(env, "bag_of_words", "topics", 10, numIterations: 10,
                                                                    resetRandomGenerator: true));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // In this test it manifests because of the WordBagEstimator in the estimator chain
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ldatopics.tsv");

            using (var ch = env.Start("save"))
            {
                var saver = new TextSaver(env, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false, Dense = true
                });
                var       transformer     = est.Fit(data.AsDynamic);
                var       transformedData = transformer.Transform(data.AsDynamic);
                IDataView savedData       = TakeFilter.Create(env, transformedData, 4);
                savedData = ColumnSelectingTransformer.CreateKeep(env, savedData, new[] { "topics" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);

                // 10 topics were requested above, so the output vector must have size 10.
                Assert.Equal(10, (savedData.Schema[0].Type as VectorType)?.Size);
            }

            // Disabling this check due to the following issue with consistency of output.
            // `seed` specified in ConsoleEnvironment has no effect.
            // https://github.com/dotnet/machinelearning/issues/1004
            // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
            // CheckEquality("Text", "ldatopics.tsv");
            Done();
        }
        void TestNgramCompatColumns()
        {
            // Verifies that an ngram transform model saved by an older version of ML.NET
            // can still be loaded and produces the expected "Features" column.
            string dropModelPath     = GetDataPath("backcompat/ngram.zip");
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var reader = TextLoaderStatic.CreateReader(ML, ctx => (
                                                           Sentiment: ctx.LoadBool(0),
                                                           SentimentText: ctx.LoadText(1)), hasHeader: true);
            var data = reader.Read(sentimentDataPath);

            using (FileStream modelStream = File.OpenRead(dropModelPath))
            {
                var transformed   = ModelFileUtils.LoadTransforms(Env, data.AsDynamic, modelStream);
                var featureColumn = transformed.Schema.GetColumnOrNull("Features");
                Assert.NotNull(featureColumn);
            }
        }
Example #27
0
        /// <summary>
        /// Workout for the online linear trainers (OGD regression, averaged perceptron and
        /// linear SVM binary classifiers): fit each one, then continue incremental training
        /// from its own previously fitted model.
        /// </summary>
        public void OnlineLinearWorkout()
        {
            var dataPath = GetDataPath("breast-cancer.txt");

            var regressionData = TextLoaderStatic.CreateReader(ML, ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10)))
                                 .Read(dataPath);

            var regressionPipe = regressionData.MakeNewEstimator()
                                 .Append(r => (r.Label, Features: r.Features.Normalize()));

            var regressionTrainData = regressionPipe.Fit(regressionData).Transform(regressionData).AsDynamic;

            var ogdTrainer = new OnlineGradientDescentTrainer(ML, "Label", "Features");

            TestEstimatorCore(ogdTrainer, regressionTrainData);
            var ogdModel = ogdTrainer.Fit(regressionTrainData);

            // Continue training from the previously fitted OGD model.
            ogdTrainer.Train(regressionTrainData, ogdModel.Model);

            var binaryData = TextLoaderStatic.CreateReader(ML, ctx => (Label: ctx.LoadBool(0), Features: ctx.LoadFloat(1, 10)))
                             .Read(dataPath);

            var binaryPipe = binaryData.MakeNewEstimator()
                             .Append(r => (r.Label, Features: r.Features.Normalize()));

            var binaryTrainData = binaryPipe.Fit(binaryData).Transform(binaryData).AsDynamic;
            var apTrainer       = new AveragedPerceptronTrainer(ML, "Label", "Features", lossFunction: new HingeLoss(), advancedSettings: s =>
            {
                s.LearningRate = 0.5f;
            });

            TestEstimatorCore(apTrainer, binaryTrainData);

            var apModel = apTrainer.Fit(binaryTrainData);

            // Continue training from the previously fitted averaged perceptron model.
            apTrainer.Train(binaryTrainData, apModel.Model);

            var svmTrainer = new LinearSvmTrainer(ML, "Label", "Features");

            TestEstimatorCore(svmTrainer, binaryTrainData);

            var svmModel = svmTrainer.Fit(binaryTrainData);

            // BUGFIX: continue training from the SVM's own fitted model (svmModel), not the
            // averaged perceptron's — the original passed apModel.Model here by copy-paste,
            // leaving svmModel unused and breaking the pattern used for OGD and AP above.
            svmTrainer.Train(binaryTrainData, svmModel.Model);

            Done();
        }
Example #28
0
        public static void Bar()
        {
            // Compile-time sample for the static-typing diagnostics: nothing here is meant
            // to execute (env and view are deliberately null); only the shapes of the
            // lambdas and classes matter to the analyzer.
            IHostEnvironment env = null;
            var text             = TextLoaderStatic.CreateReader(env, ctx => new
            {
                Label           = ctx.LoadBool(0),
                Text            = ctx.LoadText(1),
                NumericFeatures = ctx.LoadFloat(2, 5)
            });

            var est = text.MakeNewEstimator();

            // This should work.
            est.Append(r => new { r.Text });

            IDataView view = null;

            // Assert the view against a variety of shape classes, exercising constructor
            // arguments, object initializers, nested shapes and generic shapes.
            view.AssertStatic(env, c => new Class1(c.I4.Scalar, c.Bool.Vector));
            view.AssertStatic(env, c => new Class2 {
                F1 = c.I4.Scalar, F2 = c.Bool.Vector
            });
            view.AssertStatic(env, c => new Class3 <Class2>
            {
                F1 = new Class1(c.I4.Scalar, c.Bool.Vector),
                F2 = new Class2 {
                    F1 = c.I4.Scalar, F2 = c.Bool.Vector
                }
            });
            view.AssertStatic(env, c => new Class4 {
                F1 = c.I4.Scalar
            });
            view.AssertStatic <Class5>(env, c => null);
            view.AssertStatic(env, c => new Class6(c.I4.Scalar, c.Bool.Vector));
            view.AssertStatic(env, c => new Class7 {
                F2 = c.Bool.Vector
            });
            view.AssertStatic(env, c => new Class8(c.I4.Scalar, c.Bool.Vector));
            view.AssertStatic(env, c => new Class9 {
                F1 = c.I4.Scalar, F2 = c.Bool.Vector
            });
            view.AssertStatic(env, c => new Class10(c.I4.Scalar, c.Bool.Vector));
            view.AssertStatic(env, c => new Class11(c.I4.Scalar, c.Bool.Vector, c.Bool.Vector));

            // This is wrong but should not fail with our diagnostic since there is a deeper problem that the class
            // simply is not there.
            var text2 = TextLoaderStatic.CreateReader(env, ctx => new MissingClass(ctx.LoadText(0)));
        }
        /// <summary>
        /// Runs the CIFAR TensorFlow model through the static pipeline API, reading the
        /// required image dimensions from the model's own input schema rather than
        /// hard-coding them.
        /// </summary>
        public void TestTensorFlowStaticWithSchema()
        {
            const string modelLocation = "cifar_model/frozen_model.pb";

            var mlContext       = new MLContext(seed: 1, conc: 1);
            var tensorFlowModel = TensorFlowUtils.LoadTensorFlowModel(mlContext, modelLocation);
            var schema          = tensorFlowModel.GetInputSchema();

            // Pull the expected image height/width from the model's "Input" node.
            Assert.True(schema.TryGetColumnIndex("Input", out int column));
            var type        = (VectorType)schema[column].Type;
            var imageHeight = type.Dimensions[0];
            var imageWidth  = type.Dimensions[1];

            var dataFile    = GetDataPath("images/images.tsv");
            var imageFolder = Path.GetDirectoryName(dataFile);

            var data = TextLoaderStatic.CreateReader(mlContext, ctx => (
                                                         imagePath: ctx.LoadText(0),
                                                         name: ctx.LoadText(1)))
                       .Read(dataFile);

            // Note that CamelCase column names are there to match the TF graph node names.
            var pipe = data.MakeNewEstimator()
                       .Append(row => (
                                   row.name,
                                   Input: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleave: true)))
                       .Append(row => (row.name, Output: row.Input.ApplyTensorFlowGraph(tensorFlowModel)));

            TestEstimatorCore(pipe.AsDynamic, data.AsDynamic);

            var result = pipe.Fit(data).Transform(data).AsDynamic;

            // Cursor over the scored output and verify each row's score vector length
            // and the total row count.
            result.Schema.TryGetColumnIndex("Output", out int output);
            using (var cursor = result.GetRowCursor(result.Schema["Output"]))
            {
                var buffer  = default(VBuffer <float>);
                var getter  = cursor.GetGetter <VBuffer <float> >(output);
                var numRows = 0;
                while (cursor.MoveNext())
                {
                    getter(ref buffer);
                    // 10 scores per row — presumably the CIFAR-10 class count; confirm against the model.
                    Assert.Equal(10, buffer.Length);
                    numRows += 1;
                }
                Assert.Equal(4, numRows);
            }
        }
Example #30
0
        [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // x86 fails with "An attempt was made to load a program with an incorrect format."
        public void OnnxStatic()
        {
            // The ONNX scoring used here is only exercised on Windows.
            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
            {
                return;
            }

            var modelFile = "squeezenet/00000001/model.onnx";

            var mlContext   = new MLContext(conc: 1);
            var imageHeight = 224;
            var imageWidth  = 224;
            var dataFile    = GetDataPath("images/images.tsv");
            var imageFolder = Path.GetDirectoryName(dataFile);

            var data = TextLoaderStatic.CreateReader(mlContext, ctx => (
                                                         imagePath: ctx.LoadText(0),
                                                         name: ctx.LoadText(1)))
                       .Read(dataFile);

            // The data_0/softmaxout_1 column names must match the ONNX graph's node names.
            var pipe = data.MakeNewEstimator()
                       .Append(row => (
                                   row.name,
                                   data_0: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleaveArgb: true)))
                       .Append(row => (row.name, softmaxout_1: row.data_0.ApplyOnnxModel(modelFile)));

            TestEstimatorCore(pipe.AsDynamic, data.AsDynamic);

            var result = pipe.Fit(data).Transform(data).AsDynamic;

            result.Schema.TryGetColumnIndex("softmaxout_1", out int outputCol);

            // Cursor over the model output: every row must carry a 1000-element score
            // vector, and all 4 input images must come through.
            using (var cursor = result.GetRowCursor(result.Schema["softmaxout_1"]))
            {
                var scores      = default(VBuffer <float>);
                var scoreGetter = cursor.GetGetter <VBuffer <float> >(outputCol);
                var rowCount    = 0;
                while (cursor.MoveNext())
                {
                    scoreGetter(ref scores);
                    Assert.Equal(1000, scores.Length);
                    rowCount += 1;
                }
                Assert.Equal(4, rowCount);
            }
        }