Example #1
        static void Main(string[] args)
        {
            var onnxModel      = Path.Combine("Assets", "Model", "model.onnx");
            var imagePathsData = Path.Combine("Assets", "Data", "images.tsv");
            var imageFolder    = Path.Combine("Assets", "Images");

            using (var environment = new ConsoleEnvironment())
            {
                var imageHeight = 64;
                var imageWidth  = 64;

                var loader = TextLoader.CreateReader(environment, context => (
                                                         ImagePath: context.LoadText(0),
                                                         Name: context.LoadText(1)),
                                                     separator: '\t',
                                                     hasHeader: false);

                var data = loader.Read(new MultiFileSource(imagePathsData));

                var estimator = loader.MakeNewEstimator()
                                .Append(row => (
                                            Name: row.Name,
                                            input: row.ImagePath.LoadAsImage(imageFolder).AsGrayscale().Resize(imageWidth, imageHeight).ExtractPixels()))
                                .Append(row => (row.Name, EmotionScores: row.input.ApplyOnnxModel(onnxModel)));

                var model = estimator.Fit(data);

                var predictionFunction = model.AsDynamic.MakePredictionFunction<EmotionData, EmotionPrediction>(environment);

                var prediction = predictionFunction.Predict(new EmotionData()
                {
                    ImagePath = "1.jpg"
                });

                int emotion = GetEmotion(prediction.PredictedLabels);

                Console.WriteLine(GetEmotionString(emotion));

                Console.ReadLine();
            }
        }
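Example #1 references an EmotionData input class, an EmotionPrediction output class, and GetEmotion/GetEmotionString helpers that are not shown. A minimal sketch under assumptions follows: the column names are taken from the pipeline above, but the class shapes and the argmax helper are hypothetical, and GetEmotionString (which maps the index onto the model's label set) is omitted.

        public class EmotionData
        {
            // Matches the ImagePath column produced by the loader above.
            public string ImagePath;
        }

        public class EmotionPrediction
        {
            // Assumed mapping to the EmotionScores column produced by the ONNX scorer above.
            [ColumnName("EmotionScores")]
            public float[] PredictedLabels;
        }

        // Hypothetical helper: the index of the highest score is the predicted emotion.
        static int GetEmotion(float[] scores)
        {
            int best = 0;
            for (int i = 1; i < scores.Length; i++)
                if (scores[i] > scores[best])
                    best = i;
            return best;
        }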
Example #2
        [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // LightGBM is 64-bit only
        public void MultiClassLightGBM()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.iris.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var ctx    = new MulticlassClassificationContext(env);
            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

            OvaPredictor pred = null;

            // The onFit callback below captures the trained one-versus-all predictor.
            var est = reader.MakeNewEstimator()
                      .Append(r => (label: r.label.ToKey(), r.features))
                      .Append(r => (r.label, preds: ctx.Trainers.LightGbm(
                                        r.label,
                                        r.features, onFit: p => pred = p)));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);

            var data = model.Read(dataSource);

            // Just output some data on the schema for fun.
            var schema = data.AsDynamic.Schema;

            for (int c = 0; c < schema.ColumnCount; ++c)
            {
                Console.WriteLine($"{schema.GetColumnName(c)}, {schema.GetColumnType(c)}");
            }

            var metrics = ctx.Evaluate(data, r => r.label, r => r.preds, 2);

            Assert.True(metrics.LogLoss > 0);
            Assert.True(metrics.TopKAccuracy > 0);
        }
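Note the onFit pattern used here and in several tests below: the callback captures the trained predictor as a side effect of Fit, which is why pred is asserted null before fitting and non-null afterwards.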
Example #3
        public void ClassifyGithubIssues()
        {
            var env = new LocalEnvironment(new SysRandom(0), verbose: true);

            string dataPath = "corefx-issues-train.tsv";

            // Create reader with specific schema.
            // string :ID, string: Area, string:Title, string:Description
            var reader = TextLoader.CreateReader(env, ctx =>
                                                 (area: ctx.LoadText(1),
                                                  title: ctx.LoadText(2),
                                                  description: ctx.LoadText(3)),
                                                 hasHeader: true);

            var estimator = reader.MakeNewEstimator()
                            .Append(row => (
                                        // Convert string label to key.
                                        label: row.area.Dictionarize(),
                                        // Featurizes 'description'
                                        description: row.description.FeaturizeText(),
                                        // Featurizes 'title'
                                        title: row.title.FeaturizeText()))
                            .Append(row => (
                                        // Concatenate the two features into a vector.
                                        features: row.description.ConcatWith(row.title),
                                        // Preserve the label
                                        label: row.label))
                            .Append(row => row.label.PredictSdcaMultiClass(row.features));

            // Read the data
            var data = reader.Read(dataPath);

            // Fit the data
            var model = estimator.Fit(data);

            string modelPath = "github-Model.zip";

            // The static API doesn't currently have WriteAsync, so saving is left commented out.
            // await model.WriteAsync(modelPath);
        }
Example #4
        [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // LightGBM is 64-bit only
        public void LightGbmBinaryClassification()
        {
            var env        = new ConsoleEnvironment(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.breastCancer.trainFilename);
            var dataSource = new MultiFileSource(dataPath);
            var ctx        = new BinaryClassificationContext(env);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            IPredictorWithFeatureWeights<float> pred = null;

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, preds: ctx.Trainers.LightGbm(r.label, r.features,
                                                                          numBoostRound: 10,
                                                                          numLeaves: 5,
                                                                          learningRate: 0.01,
                                                                          onFit: (p) => { pred = p; })));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);

            // 9 input features, so we ought to have 9 weights.
            VBuffer<float> weights = new VBuffer<float>();

            pred.GetFeatureWeights(ref weights);
            Assert.Equal(9, weights.Length);

            var data = model.Read(dataSource);

            var metrics = ctx.Evaluate(data, r => r.label, r => r.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
Example #5
        [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // LightGBM is 64-bit only
        public void LightGBMRanking()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.adultRanking.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var ctx = new RankingContext(env);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadFloat(0), features: c.LoadFloat(9, 14), groupId: c.LoadText(1)),
                                                 separator: '\t', hasHeader: true);

            LightGbmRankingPredictor pred = null;

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, r.features, groupId: r.groupId.ToKey()))
                      .Append(r => (r.label, r.groupId, score: ctx.Trainers.LightGbm(r.label, r.features, r.groupId, onFit: (p) => { pred = p; })));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);

            var data = model.Read(dataSource);

            var metrics = ctx.Evaluate(data, r => r.label, r => r.groupId, r => r.score);

            Assert.NotNull(metrics);

            Assert.True(metrics.Ndcg.Length == metrics.Dcg.Length && metrics.Dcg.Length == 3);

            Assert.InRange(metrics.Dcg[0], 1.4, 1.6);
            Assert.InRange(metrics.Dcg[1], 1.4, 1.8);
            Assert.InRange(metrics.Dcg[2], 1.4, 1.8);

            Assert.InRange(metrics.Ndcg[0], 36.5, 37);
            Assert.InRange(metrics.Ndcg[1], 36.5, 37);
            Assert.InRange(metrics.Ndcg[2], 36.5, 37);
        }
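The groupId column goes through ToKey() before training because the ranking trainer expects the group id as a key-typed column rather than raw text.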
Example #6
        [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // TensorFlow is 64-bit only
        public void TestTensorFlowStatic()
        {
            var modelLocation = "cifar_model/frozen_model.pb";

            var mlContext   = new MLContext(seed: 1, conc: 1);
            var imageHeight = 32;
            var imageWidth  = 32;
            var dataFile    = GetDataPath("images/images.tsv");
            var imageFolder = Path.GetDirectoryName(dataFile);

            var data = TextLoader.CreateReader(mlContext, ctx => (
                                                   imagePath: ctx.LoadText(0),
                                                   name: ctx.LoadText(1)))
                       .Read(dataFile);

            // Note that CamelCase column names are there to match the TF graph node names.
            var pipe = data.MakeNewEstimator()
                       .Append(row => (
                                   row.name,
                                   Input: row.imagePath.LoadAsImage(imageFolder).Resize(imageHeight, imageWidth).ExtractPixels(interleaveArgb: true)))
                       .Append(row => (row.name, Output: row.Input.ApplyTensorFlowGraph(modelLocation)));

            TestEstimatorCore(pipe.AsDynamic, data.AsDynamic);

            var result = pipe.Fit(data).Transform(data).AsDynamic;

            result.Schema.TryGetColumnIndex("Output", out int output);
            using (var cursor = result.GetRowCursor(col => col == output))
            {
                var buffer  = default(VBuffer<float>);
                var getter  = cursor.GetGetter<VBuffer<float>>(output);
                var numRows = 0;
                while (cursor.MoveNext())
                {
                    getter(ref buffer);
                    Assert.Equal(10, buffer.Length);
                    numRows += 1;
                }
                Assert.Equal(4, numRows);
            }
        }
Example #7
        public void TestWhiteningOldSavingAndLoading()
        {
            var    env        = new ConsoleEnvironment(seed: 0);
            string dataSource = GetDataPath("generated_regression_dataset.csv");
            var    dataView   = TextLoader.CreateReader(env,
                                                        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                        separator: ';', hasHeader: true)
                                .Read(dataSource).AsDynamic;
            var pipe = new VectorWhiteningEstimator(env, "features", "whitened");

            var result      = pipe.Fit(dataView).Transform(dataView);
            var resultRoles = new RoleMappedData(result);

            using (var ms = new MemoryStream())
            {
                TrainUtils.SaveModel(Env, Env.Start("saving"), ms, null, resultRoles);
                ms.Position = 0;
                var loadedView = ModelFileUtils.LoadTransforms(Env, dataView, ms);
            }
            Done();
        }
Example #8
        public void OnlineLinearWorkout()
        {
            var dataPath = GetDataPath("breast-cancer.txt");

            var data = TextLoader.CreateReader(Env, ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10)))
                       .Read(new MultiFileSource(dataPath));

            var pipe = data.MakeNewEstimator()
                       .Append(r => (r.Label, Features: r.Features.Normalize()));

            var trainData = pipe.Fit(data).Transform(data).AsDynamic;

            IEstimator<ITransformer> est = new OnlineGradientDescentTrainer(Env, new OnlineGradientDescentTrainer.Arguments());

            TestEstimatorCore(est, trainData);

            est = new AveragedPerceptronTrainer(Env, new AveragedPerceptronTrainer.Arguments());
            TestEstimatorCore(est, trainData);

            Done();
        }
Example #9
        public void SimpleImageSmokeTest()
        {
            var env = new ConsoleEnvironment(0, verbose: true);

            var reader = TextLoader.CreateReader(env,
                                                 ctx => ctx.LoadText(0).LoadAsImage().AsGrayscale().Resize(10, 8).ExtractPixels());

            var schema = reader.AsDynamic.GetOutputSchema();

            Assert.True(schema.TryGetColumnIndex("Data", out int col), "Could not find 'Data' column");
            var type = schema.GetColumnType(col);

            Assert.True(type.IsKnownSizeVector, $"Type was supposed to be known size vector but was instead '{type}'");
            var vecType = type.AsVector;

            Assert.Equal(NumberType.R4, vecType.ItemType);
            Assert.Equal(3, vecType.DimCount);
            Assert.Equal(3, vecType.GetDim(0));
            Assert.Equal(8, vecType.GetDim(1));
            Assert.Equal(10, vecType.GetDim(2));
        }
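The dimension assertions reflect the channel-major layout ExtractPixels produces: a 10×8 resize yields 3 color planes of 8 rows by 10 columns, hence dimensions (3, 8, 10).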
Example #10
        public void SdcaWorkout()
        {
            var dataPath = GetDataPath("breast-cancer.txt");

            var data = TextLoader.CreateReader(Env, ctx => (Label: ctx.LoadFloat(0), Features: ctx.LoadFloat(1, 10)))
                       .Read(dataPath).Cache();

            var binaryTrainer = new SdcaBinaryTrainer(Env, "Label", "Features", advancedSettings: (s) => s.ConvergenceTolerance = 1e-2f);

            TestEstimatorCore(binaryTrainer, data.AsDynamic);

            var regressionTrainer = new SdcaRegressionTrainer(Env, "Label", "Features", advancedSettings: (s) => s.ConvergenceTolerance = 1e-2f);

            TestEstimatorCore(regressionTrainer, data.AsDynamic);

            var mcTrainer = new SdcaMultiClassTrainer(Env, "Label", "Features", advancedSettings: (s) => s.ConvergenceTolerance = 1e-2f);

            TestEstimatorCore(mcTrainer, data.AsDynamic);

            Done();
        }
Example #11
        public void HogwildSGDBinaryClassification()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.breastCancer.trainFilename);
            var dataSource = new MultiFileSource(dataPath);
            var ctx        = new BinaryClassificationContext(env);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            IPredictorWithFeatureWeights<float> pred = null;

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, preds: ctx.Trainers.StochasticGradientDescentClassificationTrainer(r.label, r.features,
                                                                                                                l2Weight: 0,
                                                                                                                onFit: (p) => { pred = p; },
                                                                                                                advancedSettings: s => s.NumThreads = 1)));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);

            // 9 input features, so we ought to have 9 weights.
            VBuffer<float> weights = new VBuffer<float>();

            pred.GetFeatureWeights(ref weights);
            Assert.Equal(9, weights.Length);

            var data = model.Read(dataSource);

            var metrics = ctx.Evaluate(data, r => r.label, r => r.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
Example #12
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(new MultiFileSource(sentimentDataPath));

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(new MultiFileSource(sentimentDataPath));

            var est = new WordTokenizer(Env, "text", "text")
                      .Append(new TermEstimator(Env, "text", "terms"))
                      .Append(new NgramEstimator(Env, "terms", "ngrams"))
                      .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "text", "terms", "ngrams", "ngramshash");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
Example #13
        public void ConcatWith()
        {
            var env      = new ConsoleEnvironment(seed: 0);
            var dataPath = GetDataPath("iris.data");
            var reader   = TextLoader.CreateReader(env,
                                                   c => (label: c.LoadText(4), values: c.LoadFloat(0, 3), value: c.LoadFloat(2)),
                                                   separator: ',');
            var dataSource = new MultiFileSource(dataPath);
            var data       = reader.Read(dataSource);

            var est = data.MakeNewEstimator()
                      .Append(r => (
                                  r.label, r.values, r.value,
                                  c0: r.label.AsVector(), c1: r.label.ConcatWith(r.label),
                                  c2: r.value.ConcatWith(r.values), c3: r.values.ConcatWith(r.value, r.values)));

            var tdata  = est.Fit(data).Transform(data);
            var schema = tdata.AsDynamic.Schema;

            int[] idx = new int[4];
            for (int i = 0; i < idx.Length; ++i)
            {
                Assert.True(schema.TryGetColumnIndex("c" + i, out idx[i]), $"Could not find col c{i}");
            }
            var types = new VectorType[idx.Length];

            int[] expectedLen = new int[] { 1, 2, 5, 9 };
            for (int i = 0; i < idx.Length; ++i)
            {
                var type = schema.GetColumnType(idx[i]);
                Assert.True(type.VectorSize > 0, $"Col c{i} had unexpected type {type}");
                types[i] = type.AsVector;
                Assert.Equal(expectedLen[i], type.VectorSize);
            }
            Assert.Equal(TextType.Instance, types[0].ItemType);
            Assert.Equal(TextType.Instance, types[1].ItemType);
            Assert.Equal(NumberType.Float, types[2].ItemType);
            Assert.Equal(NumberType.Float, types[3].ItemType);
        }
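The expected lengths follow directly from the concatenations: c0 wraps the scalar label into a length-1 vector, c1 joins two labels (2), c2 is the scalar value plus the 4-slot values vector (5), and c3 is values + value + values (9).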
Example #14
        public void TestPcaEstimator()
        {
            var data = TextLoader.CreateReader(_env,
                                               c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                               separator: ';', hasHeader: true)
                       .Read(_dataSource);

            var est        = new PcaEstimator(_env, "features", "pca", rank: 5, seed: 1);
            var outputPath = GetOutputPath("PCA", "pca.tsv");

            using (var ch = _env.Start("save"))
            {
                IDataView savedData = TakeFilter.Create(_env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(_env, savedData, "pca");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, _saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
            Done();
        }
Example #15
        public void PcaWorkout()
        {
            var data = TextLoader.CreateReader(_env,
                                               c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadFloat(1, 10)),
                                               separator: ';', hasHeader: true)
                       .Read(_dataSource);

            var invalidData = TextLoader.CreateReader(_env,
                                                      c => (label: c.LoadFloat(11), weight: c.LoadFloat(0), features: c.LoadText(1, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(_dataSource);

            var est = new PcaEstimator(_env, "features", "pca", rank: 4, seed: 10);

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var estNonDefaultArgs = new PcaEstimator(_env, "features", "pca", rank: 3, weightColumn: "weight", overSampling: 2, center: false);

            TestEstimatorCore(estNonDefaultArgs, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            Done();
        }
Example #16
        public void PrincipalComponentAnalysis()
        {
            var env        = new ConsoleEnvironment(seed: 0);
            var dataPath   = GetDataPath("generated_regression_dataset.csv");
            var dataSource = new MultiFileSource(dataPath);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                 separator: ';', hasHeader: true);
            var data = reader.Read(dataSource);

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label,
                                    pca: r.features.ToPrincipalComponents(rank: 5)));
            var tdata  = est.Fit(data).Transform(data);
            var schema = tdata.AsDynamic.Schema;

            Assert.True(schema.TryGetColumnIndex("pca", out int pcaCol));
            var type = schema.GetColumnType(pcaCol);

            Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);
        }
Example #17
        public void SdcaBinaryClassification()
        {
            var env        = new TlcEnvironment(seed: 0);
            var dataPath   = GetDataPath("breast-cancer.txt");
            var dataSource = new MultiFileSource(dataPath);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            LinearBinaryPredictor pred = null;
            ParameterMixingCalibratedPredictor cali = null;

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, preds: r.label.PredictSdcaBinaryClassification(r.features,
                                                                                            maxIterations: 2,
                                                                                            onFit: (p, c) => { pred = p; cali = c; })));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            Assert.Null(cali);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);
            Assert.NotNull(cali);
            // 9 input features, so we ought to have 9 weights.
            Assert.Equal(9, pred.Weights2.Count);

            var data = model.Read(dataSource);

            // Just output some data on the schema for fun.
            var rows   = DataViewUtils.ComputeRowCount(data.AsDynamic);
            var schema = data.AsDynamic.Schema;

            for (int c = 0; c < schema.ColumnCount; ++c)
            {
                Console.WriteLine($"{schema.GetColumnName(c)}, {schema.GetColumnType(c)}");
            }
        }
Example #18
        public void AveragePerceptronNoCalibration()
        {
            var env        = new ConsoleEnvironment(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.breastCancer.trainFilename);
            var dataSource = new MultiFileSource(dataPath);
            var ctx        = new BinaryClassificationContext(env);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            LinearBinaryPredictor pred = null;

            var loss = new HingeLoss(new HingeLoss.Arguments()
            {
                Margin = 1
            });

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, preds: ctx.Trainers.AveragedPerceptron(r.label, r.features, lossFunction: loss,
                                                                                    numIterations: 2, onFit: p => pred = p)));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);
            // 9 input features, so we ought to have 9 weights.
            Assert.Equal(9, pred.Weights2.Count);

            var data = model.Read(dataSource);

            var metrics = ctx.Evaluate(data, r => r.label, r => r.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
Example #19
        public void PcaWorkout()
        {
            var    env        = new ConsoleEnvironment(seed: 1, conc: 1);
            string dataSource = GetDataPath("generated_regression_dataset.csv");
            var    data       = TextLoader.CreateReader(env,
                                                        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                        separator: ';', hasHeader: true)
                                .Read(new MultiFileSource(dataSource));

            var invalidData = TextLoader.CreateReader(env,
                                                      c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(new MultiFileSource(dataSource));

            var est = new PcaEstimator(env, "features", "pca", rank: 5, advancedSettings: s => {
                s.Seed = 1;
            });

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("PCA", "pca.tsv");

            using (var ch = env.Start("save"))
            {
                var saver = new TextSaver(env, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                IDataView savedData = TakeFilter.Create(env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(env, savedData, "pca");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("PCA", "pca.tsv");
            Done();
        }
Example #20
        public static void Train()
        {
            using (var env = new LocalEnvironment(1974))
            {
                /*env.AddListener((messageSource, message) =>
                 *  Console.WriteLine($"{messageSource.ShortName}: {message.Message} ({message.Kind})"));*/
                env.AddListener(ConsoleLogger);

                var classification = new MulticlassClassificationContext(env);

                var reader = TextLoader.CreateReader(env, ctx => (
                                                         Sentence: ctx.LoadText(1),
                                                         Label: ctx.LoadText(0)
                                                         ),
                                                     separator: ',');

                var trainData = reader.Read(new MultiFileSource(TrainDataPath));

                var pipeline = reader.MakeNewEstimator()
                               .Append(r => (
                                           Label: r.Label.ToKey(),
                                           Features: r.Sentence.FeaturizeText()))
                               .Append(r => (
                                           r.Label,
                                           Predictions: classification.Trainers.Sdca(r.Label, r.Features)
                                           ))
                               .Append(r => r.Predictions.predictedLabel.ToValue());

                Console.WriteLine("=============== Training model ===============");

                var model = pipeline.Fit(trainData).AsDynamic;

                using (var fs = new FileStream(ModelPath, FileMode.Create, FileAccess.Write, FileShare.Write))
                    model.SaveTo(env, fs);

                Console.WriteLine("=============== End training ===============");
                Console.WriteLine("The model is saved to {0}", ModelPath);
            }
        }
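Train passes a ConsoleLogger method that is not shown. Judging from the commented-out inline listener above it, a minimal sketch might look like the following; the exact message type can differ across ML.NET versions.

        // Hypothetical listener mirroring the commented-out inline version above.
        private static void ConsoleLogger(IMessageSource messageSource, ChannelMessage message)
            => Console.WriteLine($"{messageSource.ShortName}: {message.Message} ({message.Kind})");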
Example #21
        public void ToKey()
        {
            var env      = new ConsoleEnvironment(seed: 0);
            var dataPath = GetDataPath("iris.data");
            var reader   = TextLoader.CreateReader(env,
                                                   c => (label: c.LoadText(4), values: c.LoadFloat(0, 3)),
                                                   separator: ',');
            var dataSource = new MultiFileSource(dataPath);
            var data       = reader.Read(dataSource);

            var est = data.MakeNewEstimator()
                      .Append(r => (labelKey: r.label.ToKey(), valuesKey: r.values.ToKey(onFit: m => { })))
                      .Append(r => (r.labelKey, r.valuesKey, valuesKeyKey: r.valuesKey.ToKey()));

            var tdata  = est.Fit(data).Transform(data);
            var schema = tdata.AsDynamic.Schema;

            Assert.True(schema.TryGetColumnIndex("labelKey", out int labelCol));
            Assert.True(schema.TryGetColumnIndex("valuesKey", out int valuesCol));
            Assert.True(schema.TryGetColumnIndex("valuesKeyKey", out int valuesKeyCol));

            Assert.Equal(3, schema.GetColumnType(labelCol).KeyCount);
            Assert.True(schema.GetColumnType(valuesCol).ItemType.IsKey);
            Assert.True(schema.GetColumnType(valuesKeyCol).ItemType.IsKey);

            var labelKeyType     = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, labelCol);
            var valuesKeyType    = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, valuesCol);
            var valuesKeyKeyType = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, valuesKeyCol);

            Assert.NotNull(labelKeyType);
            Assert.NotNull(valuesKeyType);
            Assert.NotNull(valuesKeyKeyType);
            Assert.True(labelKeyType.IsVector && labelKeyType.ItemType == TextType.Instance);
            Assert.True(valuesKeyType.IsVector && valuesKeyType.ItemType == NumberType.Float);
            Assert.True(valuesKeyKeyType.IsVector && valuesKeyKeyType.ItemType == NumberType.Float);
            // Because they're over exactly the same data, they ought to have the same cardinality and everything.
            Assert.True(valuesKeyKeyType.Equals(valuesKeyType));
        }
Example #22
        public void TestWordEmbeddings()
        {
            var dataPath     = GetDataPath(ScenariosTests.SentimentDataPath);
            var testDataPath = GetDataPath(ScenariosTests.SentimentTestPath);

            var data = TextLoader.CreateReader(Env, ctx => (
                                                   label: ctx.LoadBool(0),
                                                   SentimentText: ctx.LoadText(1)), hasHeader: true)
                       .Read(new MultiFileSource(dataPath));

            var dynamicData = TextTransform.Create(Env, new TextTransform.Arguments()
            {
                Column = new TextTransform.Column
                {
                    Name   = "SentimentText_Features",
                    Source = new[] { "SentimentText" }
                },
                KeepDiacritics       = false,
                KeepPunctuations     = false,
                TextCase             = Runtime.TextAnalytics.TextNormalizerTransform.CaseNormalizationMode.Lower,
                OutputTokens         = true,
                StopWordsRemover     = new Runtime.TextAnalytics.PredefinedStopWordsRemoverFactory(),
                VectorNormalizer     = TextTransform.TextNormKind.None,
                CharFeatureExtractor = null,
                WordFeatureExtractor = null,
            }, data.AsDynamic);

            var data2 = dynamicData.AssertStatic(Env, ctx => (
                                                     SentimentText_Features_TransformedText: ctx.Text.VarVector,
                                                     SentimentText: ctx.Text.Scalar,
                                                     label: ctx.Bool.Scalar));

            var est = data2.MakeNewEstimator()
                      .Append(row => row.SentimentText_Features_TransformedText.WordEmbeddings());

            TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic);
            Done();
        }
Example #23
        public void CrossValidate()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.iris.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var ctx    = new MulticlassClassificationContext(env);
            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

            var est = reader.MakeNewEstimator()
                      .Append(r => (label: r.label.ToKey(), r.features))
                      .Append(r => (r.label, preds: ctx.Trainers.Sdca(
                                        r.label,
                                        r.features,
                                        maxIterations: 2)));

            var results = ctx.CrossValidate(reader.Read(dataSource), est, r => r.label)
                          .Select(x => x.metrics).ToArray();

            Assert.Equal(5, results.Length);
            Assert.True(results.All(x => x.LogLoss > 0));
        }
Example #24
        static void Main(string[] args)
        {
            var testDataFile = TestData.PrepareTestDataAndReturnPath(23, 61, 72);

            for (int i = 0; i < 5; i++)
            {
                var env            = new LocalEnvironment();
                var classification = new MulticlassClassificationContext(env);
                var reader         = TextLoader.CreateReader(env, ctx => (Label: ctx.LoadText(0), Text: ctx.LoadText(1)), separator: ',', hasHeader: false);
                var data           = reader.Read(new MultiFileSource(testDataFile));

                var learningPipeline = reader.MakeNewEstimator()
                                       .Append(r => (Label: r.Label.ToKey(), Features: r.Text.FeaturizeText(advancedSettings: s =>
                {
                    s.KeepDiacritics = false;
                    //s.KeepNumbers = false;
                    s.KeepPunctuations = false;
                    s.TextCase = TextNormalizerTransform.CaseNormalizationMode.Lower;
                    s.TextLanguage = TextTransform.Language.Dutch;
                    s.VectorNormalizer = TextTransform.TextNormKind.LInf;
                })))
                                       .Append(r => (Label: r.Label, Predictions: classification.Trainers.Sdca(r.Label, r.Features)));

                var (trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2);
                var model   = learningPipeline.Fit(trainData);
                var metrics = classification.Evaluate(model.Transform(testData), r => r.Label, r => r.Predictions);
                Console.WriteLine(metrics.AccuracyMicro);
                Console.WriteLine(metrics.AccuracyMacro);

                //var cvResults = classification.CrossValidate(data, learningPipeline, r => r.Label, numFolds: 5);
                //var microAccuracies = cvResults.Select(r => r.metrics.AccuracyMicro);
                //Console.WriteLine(microAccuracies.Average());
                //var macroAccuracies = cvResults.Select(r => r.metrics.AccuracyMacro);
                //Console.WriteLine(macroAccuracies.Average());
                Console.WriteLine("-----------");
            }
        }
Example #25
        public void TextFeaturizerWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(new MultiFileSource(sentimentDataPath));

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(new MultiFileSource(sentimentDataPath))
                              .AsDynamic;

            //var feat = Estimator.MakeNew(data)
            //     .Append(row => row.text.FeaturizeText(advancedSettings: s => { s.OutputTokens = true; }));
            var feat = new TextTransform(Env, "text", "Data", advancedSettings: s => { s.OutputTokens = true; });

            TestEstimatorCore(feat, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, feat.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "Data", "Data_TransformedText");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "featurized.tsv");
            Done();
        }
Example #26
        public void LpGcNormAndWhitening()
        {
            var env        = new ConsoleEnvironment(seed: 0);
            var dataPath   = GetDataPath("generated_regression_dataset.csv");
            var dataSource = new MultiFileSource(dataPath);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                 separator: ';', hasHeader: true);
            var data = reader.Read(dataSource);

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label,
                                    lpnorm: r.features.LpNormalize(),
                                    gcnorm: r.features.GlobalContrastNormalize(),
                                    zcawhitened: r.features.ZcaWhitening(),
                                    pcswhitened: r.features.PcaWhitening()));
            var tdata  = est.Fit(data).Transform(data);
            var schema = tdata.AsDynamic.Schema;

            Assert.True(schema.TryGetColumnIndex("lpnorm", out int lpnormCol));
            var type = schema.GetColumnType(lpnormCol);

            Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);

            Assert.True(schema.TryGetColumnIndex("gcnorm", out int gcnormCol));
            type = schema.GetColumnType(gcnormCol);
            Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);

            Assert.True(schema.TryGetColumnIndex("zcawhitened", out int zcawhitenedCol));
            type = schema.GetColumnType(zcawhitenedCol);
            Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);

            Assert.True(schema.TryGetColumnIndex("pcswhitened", out int pcswhitenedCol));
            type = schema.GetColumnType(pcswhitenedCol);
            Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);
        }
Example #27
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(Env, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
                      .Append(new NgramCountingEstimator(Env, "terms", "ngrams"))
                      .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "terms", "ngrams", "ngramshash" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
Example #28
        public void LpGcNormAndWhiteningWorkout()
        {
            var    env        = new ConsoleEnvironment(seed: 0);
            string dataSource = GetDataPath("generated_regression_dataset.csv");
            var    data       = TextLoader.CreateReader(env,
                                                        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                        separator: ';', hasHeader: true)
                                .Read(new MultiFileSource(dataSource));

            var invalidData = TextLoader.CreateReader(env,
                                                      c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(new MultiFileSource(dataSource));

            var est = new LpNormalizer(env, "features", "lpnorm")
                      .Append(new GlobalContrastNormalizer(env, "features", "gcnorm"))
                      .Append(new Whitening(env, "features", "whitened"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "lpnorm_gcnorm_whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "lpnorm", "gcnorm", "whitened");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
Example #29
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(new MultiFileSource(sentimentDataPath));

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(new MultiFileSource(sentimentDataPath));

            var est = new TextNormalizer(Env, "text")
                      .Append(new WordTokenizer(Env, "text", "words"))
                      .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "text", "words_without_stopwords");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }
Example #30
        public void Normalizer()
        {
            var env        = new ConsoleEnvironment(seed: 0);
            var dataPath   = GetDataPath("generated_regression_dataset.csv");
            var dataSource = new MultiFileSource(dataPath);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                 separator: ';', hasHeader: true);
            var data = reader.Read(dataSource);

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, r.features, bin: r.features.NormalizeByBinning(), mm: r.features.Normalize()));
            var tdata = est.Fit(data).Transform(data);

            var schema = tdata.AsDynamic.Schema;

            Assert.True(schema.TryGetColumnIndex("features", out int featCol));
            Assert.True(schema.TryGetColumnIndex("bin", out int binCol));
            Assert.True(schema.TryGetColumnIndex("mm", out int mmCol));
            Assert.False(schema.IsNormalized(featCol));
            Assert.True(schema.IsNormalized(binCol));
            Assert.True(schema.IsNormalized(mmCol));
        }
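The IsNormalized assertions read the schema metadata that normalizing transforms attach to their output columns: the raw features column carries no such annotation, while both the binning and min-max outputs do.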