/// <summary>
/// Fits a bag-of-words pipeline followed by count-based and mutual-information-based
/// feature selection on the wikipedia detox sample, writes the first four rows of the
/// "bag_of_words_count" and "bag_of_words_mi" columns to featureselection.tsv and then
/// calls CheckEquality on that file.
/// </summary>
public void FeatureSelectionWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = TextLoaderStatic.CreateReader(ML, ctx => (
                                                         label: ctx.LoadBool(0),
                                                         text: ctx.LoadText(1)), hasHeader: true)
                       .Read(sentimentDataPath);

            // This test never calls TestEstimatorCore, so the "invalid input" view that used to
            // be loaded here was dead code and has been removed.

            // The two selectors are chained to each other and appended as a single unit after
            // the cache checkpoint.
            var est = new WordBagEstimator(ML, "text", "bag_of_words")
                      .AppendCacheCheckpoint(ML)
                      .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words", "bag_of_words_count", 10)
                              .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words", "bag_of_words_mi", labelColumn: "label")));

            var outputPath = GetOutputPath("FeatureSelection", "featureselection.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true
                });

                // Keep only the first four transformed rows and the two selected-feature columns.
                IDataView savedData = TakeFilter.Create(ML, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "bag_of_words_count", "bag_of_words_mi" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("FeatureSelection", "featureselection.tsv");
            Done();
        }
        /// <summary>
        /// Chains a word tokenizer, a character tokenizer and a KeyToValueMappingEstimator on
        /// "chars" over the wikipedia detox sample, runs TestEstimatorCore on the chain, writes
        /// the first four rows of "text", "words" and "chars" to tokenized.tsv and calls
        /// CheckEquality on that file.
        /// </summary>
        public void TextTokenizationWorkout()
        {
            string detoxPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            // Column 1 loaded as text: the input the estimator chain is fitted on.
            var data = TextLoaderStatic
                       .CreateReader(ML, c => (label: c.LoadBool(0), text: c.LoadText(1)), hasHeader: true)
                       .Read(detoxPath);

            // Same file with column 1 loaded as float; handed to TestEstimatorCore as invalidInput.
            var invalidData = TextLoaderStatic
                              .CreateReader(ML, c => (label: c.LoadBool(0), text: c.LoadFloat(1)), hasHeader: true)
                              .Read(detoxPath);

            var wordTokenizer = new WordTokenizingEstimator(ML, "words", "text");
            var charTokenizer = new TokenizingByCharactersEstimator(ML, "chars", "text");
            var charKeyToValue = new KeyToValueMappingEstimator(ML, "chars");
            var est = wordTokenizer.Append(charTokenizer).Append(charKeyToValue);

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "tokenized.tsv");

            // Fit and apply the chain, then keep the first four rows and the three columns of interest.
            var fitted = est.Fit(data.AsDynamic);
            var rows = ML.Data.TakeRows(fitted.Transform(data.AsDynamic), 4);
            rows = ColumnSelectingTransformer.CreateKeep(ML, rows, new[] { "text", "words", "chars" });

            using (var fs = File.Create(outputPath))
            {
                ML.Data.SaveAsText(rows, fs, headerRow: true, keepHidden: true);
            }

            CheckEquality("Text", "tokenized.tsv");
            Done();
        }
Пример #3
0
        /// <summary>
        /// Applies two GlobalContrastNormalizingEstimator configurations to the "features" column
        /// of the generated regression dataset, runs TestEstimatorCore on the chain, writes the
        /// first four rows of "gcnNorm1" and "gcnNorm2" to gcnNorm.tsv and calls CheckEquality
        /// with four digits of precision.
        /// </summary>
        [ConditionalFact(typeof(BaseTestBaseline), nameof(BaseTestBaseline.LessThanNetCore30OrNotNetCore))] // netcore3.0 output differs from Baseline
        public void GcnWorkout()
        {
            string regressionPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);

            // Columns 0..10 loaded as a float vector: the valid input.
            var data = TextLoader
                       .CreateReader(ML,
                                     ctx => (label: ctx.LoadFloat(11), features: ctx.LoadFloat(0, 10)),
                                     separator: ';', hasHeader: true)
                       .Read(regressionPath);

            // Same columns loaded as text; passed to TestEstimatorCore as invalidInput.
            var invalidData = TextLoader
                              .CreateReader(ML,
                                            ctx => (label: ctx.LoadFloat(11), features: ctx.LoadText(0, 10)),
                                            separator: ';', hasHeader: true)
                              .Read(regressionPath);

            var defaultGcn = new GlobalContrastNormalizingEstimator(ML, "features", "gcnNorm1");
            var customGcn = new GlobalContrastNormalizingEstimator(ML, "features", "gcnNorm2", substractMean: false, useStdDev: true, scale: 3);
            var est = defaultGcn.Append(customGcn);

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "gcnNorm.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                };
                var saver = new TextSaver(ML, saverArgs);

                // First four transformed rows, restricted to the two normalized columns.
                var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(ML, transformed, 4);
                savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "gcnNorm1", "gcnNorm2" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("NormalizerEstimator", "gcnNorm.tsv", digitsOfPrecision: 4);
            Done();
        }
Пример #4
0
        /// <summary>
        /// Applies two VectorWhiteningEstimator configurations (the default one and a PCA one with
        /// pcaNum 5) to the "features" column of the generated regression dataset, runs
        /// TestEstimatorCore on the chain, writes the first four rows of "whitened1" and
        /// "whitened2" to whitened.tsv and calls CheckEquality with four digits of precision.
        /// </summary>
        public void WhiteningWorkout()
        {
            string regressionPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);

            // Columns 0..10 loaded as a float vector: the valid input.
            var data = TextLoader
                       .CreateReader(ML,
                                     ctx => (label: ctx.LoadFloat(11), features: ctx.LoadFloat(0, 10)),
                                     separator: ';', hasHeader: true)
                       .Read(regressionPath);

            // Same columns loaded as text; passed to TestEstimatorCore as invalidInput.
            var invalidData = TextLoader
                              .CreateReader(ML,
                                            ctx => (label: ctx.LoadFloat(11), features: ctx.LoadText(0, 10)),
                                            separator: ';', hasHeader: true)
                              .Read(regressionPath);

            var defaultWhitening = new VectorWhiteningEstimator(ML, "features", "whitened1");
            var pcaWhitening = new VectorWhiteningEstimator(ML, "features", "whitened2", kind: WhiteningKind.Pca, pcaNum: 5);
            var est = defaultWhitening.Append(pcaWhitening);

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                };
                var saver = new TextSaver(Env, saverArgs);

                // First four transformed rows, restricted to the two whitened columns.
                var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(Env, transformed, 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "whitened1", "whitened2" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("NormalizerEstimator", "whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
        /// <summary>
        /// Applies two LpNormalize configurations (the default one and an L1-norm one with
        /// subMean enabled) to the "features" column of the generated regression dataset, runs
        /// TestEstimatorCore on the chain, writes the first four rows of "lpNorm1" and "lpNorm2"
        /// to lpNorm.tsv and calls CheckEquality on that file.
        /// </summary>
        public void LpNormWorkout()
        {
            string regressionPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);

            // Columns 0..10 loaded as a float vector: the valid input.
            var data = TextLoaderStatic
                       .CreateReader(ML,
                                     ctx => (label: ctx.LoadFloat(11), features: ctx.LoadFloat(0, 10)),
                                     separator: ';', hasHeader: true)
                       .Read(regressionPath);

            // Same columns loaded as text; passed to TestEstimatorCore as invalidInput.
            var invalidData = TextLoaderStatic
                              .CreateReader(ML,
                                            ctx => (label: ctx.LoadFloat(11), features: ctx.LoadText(0, 10)),
                                            separator: ';', hasHeader: true)
                              .Read(regressionPath);

            var defaultNorm = ML.Transforms.Projection.LpNormalize("lpNorm1", "features");
            var l1Norm = ML.Transforms.Projection.LpNormalize("lpNorm2", "features", normKind: LpNormalizingEstimatorBase.NormalizerKind.L1Norm, subMean: true);
            var est = defaultNorm.Append(l1Norm);

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "lpNorm.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                };
                var saver = new TextSaver(ML, saverArgs);

                // First four transformed rows, restricted to the two normalized columns.
                var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(ML, transformed, 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "lpNorm1", "lpNorm2" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("NormalizerEstimator", "lpNorm.tsv");
            Done();
        }
        /// <summary>
        /// Tokenizes the "text" column of the wikipedia detox sample into "words" using the
        /// separators ' ', '?', '!', '.' and ',', writes the first four rows of "words" to
        /// tokenizedWithSeparators.tsv and calls CheckEquality on that file.
        /// </summary>
        public void TokenizeWithSeparators()
        {
            string detoxPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = TextLoaderStatic
                       .CreateReader(Env, c => (label: c.LoadBool(0), text: c.LoadText(1)), hasHeader: true)
                       .Read(detoxPath).AsDynamic;

            var separators = new[] { ' ', '?', '!', '.', ',' };
            var est = new WordTokenizingEstimator(Env, "text", "words", separators: separators);

            // Fit and apply the tokenizer, then keep four rows of the "words" column only.
            var tokenized = est.Fit(data).Transform(data);
            var outdata = TakeFilter.Create(Env, tokenized, 4);
            var savedData = ColumnSelectingTransformer.CreateKeep(Env, outdata, new[] { "words" });

            var saverArgs = new TextSaver.Arguments {
                Silent = true
            };
            var saver = new TextSaver(Env, saverArgs);
            var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");

            using (var ch = Env.Start("save"))
            using (var fs = File.Create(outputPath))
            {
                DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "tokenizedWithSeparators.tsv");
            Done();
        }
Пример #7
0
        /// <summary>
        /// Normalizes, tokenizes and removes default stop words from "SentimentText", extracts
        /// word embeddings from the resulting "CleanWords" column with the Sswe pretrained model
        /// kind, runs TestEstimatorCore on the extractor, writes the first four rows of
        /// "WordEmbeddings" to wordEmbeddings.tsv and calls CheckEquality on that file.
        /// </summary>
        public void TestWordEmbeddings()
        {
            var dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename);

            var loaderArgs = new TextLoader.Arguments()
            {
                Separator = "\t",
                HasHeader = true,
                Columns   = new[]
                {
                    new TextLoader.Column("Label", DataKind.BL, 0),
                    new TextLoader.Column("SentimentText", DataKind.Text, 1)
                }
            };
            // NOTE(review): dataPath already comes from GetDataPath and is passed through it
            // again here; kept as-is, confirm whether the second call is intended.
            var data = new TextLoader(ML, loaderArgs).Read(GetDataPath(dataPath));

            // Text clean-up stage that produces the "CleanWords" column fed to the extractor.
            var normalize = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false);
            var est = normalize
                      .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
                      .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
            var words = est.Fit(data).Transform(data);

            var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", "CleanWords", modelKind: WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe);

            // The raw loader output is what gets passed as invalidInput.
            TestEstimatorCore(pipe, words, invalidInput: data);

            var outputPath = GetOutputPath("Text", "wordEmbeddings.tsv");

            var embedded = pipe.Fit(words).Transform(words);
            var rows = ML.Data.TakeRows(embedded, 4);
            rows = ColumnSelectingTransformer.CreateKeep(ML, rows, new[] { "WordEmbeddings" });

            using (var fs = File.Create(outputPath))
            {
                ML.Data.SaveAsText(rows, fs, headerRow: true, keepHidden: true);
            }

            CheckEquality("Text", "wordEmbeddings.tsv");
            Done();
        }
        /// <summary>
        /// Normalizes and tokenizes the "text" column of the wikipedia detox sample, removes the
        /// default stop words into "NoDefaultStopwords" and a custom stop-word list into
        /// "NoStopWords", runs TestEstimatorCore on the chain, writes the first four rows of
        /// "text", "NoDefaultStopwords" and "NoStopWords" to words_without_stopwords.tsv and
        /// calls CheckEquality on that file.
        /// </summary>
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            string detoxPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            // Column 1 loaded as text: the valid input.
            var data = TextLoaderStatic
                       .CreateReader(ML, c => (label: c.LoadBool(0), text: c.LoadText(1)), hasHeader: true)
                       .Read(detoxPath);

            // Same file with column 1 loaded as float; passed to TestEstimatorCore as invalidInput.
            var invalidData = TextLoaderStatic
                              .CreateReader(ML, c => (label: c.LoadBool(0), text: c.LoadFloat(1)), hasHeader: true)
                              .Read(detoxPath);

            var textCatalog = ML.Transforms.Text;
            var est = textCatalog.NormalizeText("text")
                      .Append(textCatalog.TokenizeWords("words", "text"))
                      .Append(textCatalog.RemoveDefaultStopWords("NoDefaultStopwords", "words"))
                      .Append(textCatalog.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", "THAT", "bY"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");

            // Fit and apply the chain, then keep four rows of the three columns of interest.
            var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
            var rows = ML.Data.TakeRows(transformed, 4);
            rows = ColumnSelectingTransformer.CreateKeep(Env, rows, new[] { "text", "NoDefaultStopwords", "NoStopWords" });

            using (var fs = File.Create(outputPath))
            {
                ML.Data.SaveAsText(rows, fs, headerRow: true, keepHidden: true);
            }

            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }
        /// <summary>
        /// Chains a WordBagEstimator and a WordHashBagEstimator (invertHash: -1) over the "text"
        /// column of the wikipedia detox sample, writes the first four rows of "text",
        /// "bag_of_words" and "bag_of_wordshash" to bag_of_words.tsv and calls CheckEquality on
        /// that file. The TestEstimatorCore check is disabled; see the issue link in the body.
        /// </summary>
        public void WordBagWorkout()
        {
            string detoxPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            // Column 1 loaded as text: the input the chain is fitted on.
            var data = TextLoaderStatic
                       .CreateReader(ML, c => (label: c.LoadBool(0), text: c.LoadText(1)), hasHeader: true)
                       .Read(detoxPath);

            // Same file with column 1 loaded as float. Only referenced by the disabled
            // TestEstimatorCore call below.
            var invalidData = TextLoaderStatic
                              .CreateReader(ML, c => (label: c.LoadBool(0), text: c.LoadFloat(1)), hasHeader: true)
                              .Read(detoxPath);

            var wordBag = new WordBagEstimator(ML, "bag_of_words", "text");
            var hashedWordBag = new WordHashBagEstimator(ML, "bag_of_wordshash", "text", invertHash: -1);
            var est = wordBag.Append(hashedWordBag);

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "bag_of_words.tsv");

            // Fit and apply the chain, then keep four rows of the three columns of interest.
            var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
            var rows = ML.Data.TakeRows(transformed, 4);
            rows = ColumnSelectingTransformer.CreateKeep(ML, rows, new[] { "text", "bag_of_words", "bag_of_wordshash" });

            using (var fs = File.Create(outputPath))
            {
                ML.Data.SaveAsText(rows, fs, headerRow: true, keepHidden: true);
            }

            CheckEquality("Text", "bag_of_words.tsv");
            Done();
        }
Пример #10
0
        /// <summary>
        /// Shapes the per-instance data of <paramref name="perInst"/> before it is handed to
        /// GetPerInstanceMetricsCore: guarantees an "Instance" column (copied from the name
        /// column, or generated when there is none) and keeps only the fold-index column (when
        /// present), "Instance", the weight column (when present) and the columns returned by
        /// GetPerInstanceColumnsToSave.
        /// </summary>
        /// <param name="perInst">Per-instance data together with its role-mapped schema.</param>
        /// <returns>The result of GetPerInstanceMetricsCore on the reshaped view.</returns>
        private IDataView WrapPerInstance(RoleMappedData perInst)
        {
            var view = perInst.Data;

            // copyPairs: (source, new name) columns to duplicate; keepNames: columns that survive
            // the final selection, in the order they are added below.
            var copyPairs = new List <(string Source, string Name)>();
            var keepNames = new List <string>();

            // A fold-index column (present for cross-validation results) is carried over as-is.
            if (perInst.Schema.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.FoldIndex, out int foldCol))
            {
                keepNames.Add(MetricKinds.ColumnNames.FoldIndex);
            }

            // Maml always outputs a name column: copy the existing one to "Instance", or generate
            // a counter-based "Instance" column when the schema has no name column.
            if (perInst.Schema.Name != null)
            {
                copyPairs.Add((perInst.Schema.Name.Name, "Instance"));
            }
            else
            {
                var numberArgs = new GenerateNumberTransform.Arguments();
                numberArgs.Column = new[] { new GenerateNumberTransform.Column() { Name = "Instance" } };
                numberArgs.UseCounter = true;
                view = new GenerateNumberTransform(Host, numberArgs, view);
            }
            keepNames.Add("Instance");

            // Maml outputs the weight column if it exists.
            if (perInst.Schema.Weight != null)
            {
                keepNames.Add(perInst.Schema.Weight.Name);
            }

            // Remaining columns are whatever the evaluator asks to save.
            foreach (var evaluatorColumn in GetPerInstanceColumnsToSave(perInst.Schema))
            {
                keepNames.Add(evaluatorColumn);
            }

            // Apply the copies first so that "Instance" exists when the keep-list is applied.
            var copier = new ColumnsCopyingTransformer(Host, copyPairs.ToArray());
            view = copier.Transform(view);
            view = ColumnSelectingTransformer.CreateKeep(Host, view, keepNames.ToArray());
            return GetPerInstanceMetricsCore(view, perInst.Schema);
        }
Пример #11
0
        /// <summary>
        /// Builds a statically-typed pipeline that applies OneHotHashEncoding with several
        /// output kinds to scalar, vector and tokenized text columns of breast-cancer.txt, runs
        /// TestEstimatorCore on it, writes the first four rows of columns "A" through "F" to
        /// featurized.tsv and calls CheckEquality on that file.
        /// </summary>
        public void CategoricalHashStatic()
        {
            string breastCancerPath = GetDataPath("breast-cancer.txt");
            var reader = TextLoader.CreateReader(Env, ctx => (
                                                     ScalarString: ctx.LoadText(1),
                                                     VectorString: ctx.LoadText(1, 4)));
            var data = reader.Read(breastCancerPath);

            // Two TestClass rows turned into a data view and used as invalidInput below.
            var wrongCollection = new[]
            {
                new TestClass() { A = "1", B = "2", C = "3", },
                new TestClass() { A = "4", B = "5", C = "6" }
            };
            var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);

            var est = data.MakeNewEstimator().
                      Append(row => (
                                 row.ScalarString,
                                 row.VectorString,
                                 // Create a VarVector column
                                 VarVectorString: row.ScalarString.TokenizeText())).
                      Append(row => (
                                 A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
                                 B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
                                 C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag),
                                 D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin),
                                 E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin),
                                 F: row.VarVectorString.OneHotHashEncoding()
                                 ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("CategoricalHash", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments {
                    Silent = true
                };
                var saver = new TextSaver(Env, saverArgs);

                // First four encoded rows, restricted to the six output columns.
                var savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
                var view = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "A", "B", "C", "D", "E", "F" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
                }
            }

            CheckEquality("CategoricalHash", "featurized.tsv");
            Done();
        }
Пример #12
0
        /// <summary>
        /// Writes a small custom word-embedding model file (four words, five components each),
        /// extracts embeddings for the cleaned "SentimentText" words with it, runs
        /// TestEstimatorCore on the extractor, writes the first ten rows of "WordEmbeddings" and
        /// "CleanWords" to customWordEmbeddings.tsv and calls CheckEquality on that file.
        /// </summary>
        public void TestCustomWordEmbeddings()
        {
            var dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename);
            // NOTE(review): dataPath already comes from GetDataPath and is passed through it
            // again here; kept as-is, confirm whether the second call is intended.
            var data     = new TextLoader(Env,
                                          new TextLoader.Arguments()
            {
                Separator = "\t",
                HasHeader = true,
                Columns   = new[]
                {
                    new TextLoader.Column("Label", DataKind.BL, 0),
                    new TextLoader.Column("SentimentText", DataKind.Text, 1)
                }
            }).Read(GetDataPath(dataPath));

            var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
                      .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
                      .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
            var words             = est.Fit(data).Transform(data);
            var pathToCustomModel = DeleteOutputPath("custommodel.txt");

            // The vector components are formatted with the invariant culture so the model file
            // always uses '.' as the decimal separator. Formatting them with the current culture
            // (as string.Join over boxed floats does) would emit e.g. "1,5" on comma-decimal
            // locales and produce a different file.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;
            const string vectorFormat = "{0} {1} {2} {3} {4}";

            using (StreamWriter file = new StreamWriter(pathToCustomModel))
            {
                file.WriteLine("This is custom file for 4 words with 5 dimentional vector. First line in this file is ignored");
                file.WriteLine("stop" + " " + string.Format(invariant, vectorFormat, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f));
                file.WriteLine("bursts" + " " + string.Format(invariant, vectorFormat, -0.9f, -3f, 7.3f, 1.0f, 12f));
                file.WriteLine("you" + " " + string.Format(invariant, vectorFormat, -1f, -2f, -4f, -6f, -1f));
                file.WriteLine("dude" + " " + string.Format(invariant, vectorFormat, 100f, 0f, 0f, 0f, 0f));
            }
            var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", pathToCustomModel, "CleanWords");

            // The raw loader output is what gets passed as invalidInput.
            TestEstimatorCore(pipe, words, invalidInput: data);

            var outputPath = GetOutputPath("Text", "customWordEmbeddings.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });

                // First ten rows, restricted to the embeddings and the words they came from.
                IDataView savedData = TakeFilter.Create(Env, pipe.Fit(words).Transform(words), 10);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "WordEmbeddings", "CleanWords" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }
            CheckEquality("Text", "customWordEmbeddings.tsv");
            Done();
        }
        /// <summary>
        /// Chains a WordBagEstimator and a LatentDirichletAllocationEstimator (10 topics,
        /// 10 iterations) over the wikipedia detox sample using a dedicated MLContext, writes the
        /// first four rows of "topics" to ldatopics.tsv and asserts that the saved column is a
        /// vector of size 10. The TestEstimatorCore and CheckEquality checks are disabled; see
        /// the issue links in the body.
        /// </summary>
        public void LdaWorkout()
        {
            IHostEnvironment env = new MLContext(seed: 42, conc: 1);
            string detoxPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            // Column 1 loaded as text: the input the chain is fitted on.
            var data = TextLoaderStatic
                       .CreateReader(env, c => (label: c.LoadBool(0), text: c.LoadText(1)), hasHeader: true)
                       .Read(detoxPath);

            // Same file with column 1 loaded as float. Only referenced by the disabled
            // TestEstimatorCore call below.
            var invalidData = TextLoaderStatic
                              .CreateReader(env, c => (label: c.LoadBool(0), text: c.LoadFloat(1)), hasHeader: true)
                              .Read(detoxPath);

            var wordBag = new WordBagEstimator(env, "text", "bag_of_words");
            var lda = new LatentDirichletAllocationEstimator(env, "bag_of_words", "topics", 10, numIterations: 10,
                                                             resetRandomGenerator: true);
            var est = wordBag.Append(lda);

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // In this test it manifests because of the WordBagEstimator in the estimator chain
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ldatopics.tsv");

            using (var ch = env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments {
                    Silent = true, OutputHeader = false, Dense = true
                };
                var saver = new TextSaver(env, saverArgs);

                var transformer = est.Fit(data.AsDynamic);
                var transformedData = transformer.Transform(data.AsDynamic);

                // First four rows, restricted to the "topics" column.
                IDataView savedData = TakeFilter.Create(env, transformedData, 4);
                savedData = ColumnSelectingTransformer.CreateKeep(env, savedData, new[] { "topics" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }

                // Only "topics" was kept, so it is column 0; its vector size should be 10.
                Assert.Equal(10, (savedData.Schema[0].Type as VectorType)?.Size);
            }

            // Disabling this check due to the following issue with consistency of output.
            // `seed` specified in ConsoleEnvironment has no effect.
            // https://github.com/dotnet/machinelearning/issues/1004
            // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
            // CheckEquality("Text", "ldatopics.tsv");
            Done();
        }
        /// <summary>
        /// Builds a statically-typed pipeline that applies ReplaceNaNValues with the Maximum,
        /// Mean and Minimum replacement modes to scalar and vector float/double columns of
        /// breast-cancer.txt, runs TestEstimatorCore on it, writes the first four rows of columns
        /// "A" through "D" to featurized.tsv and calls CheckEquality on that file.
        /// </summary>
        public void NAReplaceStatic()
        {
            string breastCancerPath = GetDataPath("breast-cancer.txt");
            var reader = TextLoader.CreateReader(Env, ctx => (
                                                     ScalarFloat: ctx.LoadFloat(1),
                                                     ScalarDouble: ctx.LoadDouble(1),
                                                     VectorFloat: ctx.LoadFloat(1, 4),
                                                     VectorDoulbe: ctx.LoadDouble(1, 4)
                                                     ));
            var data = reader.Read(breastCancerPath);

            // A single TestClass row turned into a data view and used as invalidInput below.
            var wrongCollection = new[]
            {
                new TestClass()
                {
                    A = 1,
                    B = 3,
                    C = new float[2] { 1, 2 },
                    D = new double[2] { 3, 4 }
                }
            };
            var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);

            var est = data.MakeNewEstimator().
                      Append(row => (
                                 A: row.ScalarFloat.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Maximum),
                                 B: row.ScalarDouble.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Mean),
                                 C: row.VectorFloat.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Mean),
                                 D: row.VectorDoulbe.ReplaceNaNValues(MissingValueReplacingTransformer.ColumnInfo.ReplacementMode.Minimum)
                                 ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("NAReplace", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments {
                    Silent = true
                };
                var saver = new TextSaver(Env, saverArgs);

                // First four transformed rows, restricted to the four output columns.
                var savedData = TakeFilter.Create(Env, est.Fit(data).Transform(data).AsDynamic, 4);
                var view = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "A", "B", "C", "D" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
                }
            }

            CheckEquality("NAReplace", "featurized.tsv");
            Done();
        }
Пример #15
0
        /// <summary>
        /// Projects the "features" column of the file at _dataSource onto principal components
        /// (rank 5, seed 1), writes the first four rows of "pca" to pca.tsv and calls
        /// CheckEquality with four digits of precision.
        /// </summary>
        public void TestPcaEstimator()
        {
            // Columns 0..10 form the feature vector; column 11 is the label.
            var data = TextLoaderStatic
                       .CreateReader(ML,
                                     ctx => (label: ctx.LoadFloat(11), features: ctx.LoadFloat(0, 10)),
                                     separator: ';', hasHeader: true)
                       .Read(_dataSource);

            var est = ML.Transforms.Projection.ProjectToPrincipalComponents("pca", "features", rank: 5, seed: 1);
            var outputPath = GetOutputPath("PCA", "pca.tsv");

            // Fit and apply the projection, then keep four rows of the "pca" column only.
            var projected = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
            var rows = ML.Data.TakeRows(projected, 4);
            rows = ColumnSelectingTransformer.CreateKeep(ML, rows, new[] { "pca" });

            using (var fs = File.Create(outputPath))
            {
                ML.Data.SaveAsText(rows, fs, headerRow: true, keepHidden: true);
            }

            CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
            Done();
        }
        /// <summary>
        /// Builds a statically-typed pipeline that applies OneHotHashEncoding with several
        /// output kinds to scalar, vector and tokenized text columns of breast-cancer.txt, runs
        /// TestEstimatorCore on it, writes the first four rows of columns "A" through "F" to
        /// featurized.tsv via ML.Data.SaveAsText and calls CheckEquality on that file.
        /// </summary>
        public void CategoricalHashStatic()
        {
            string breastCancerPath = GetDataPath("breast-cancer.txt");
            var reader = TextLoaderStatic.CreateReader(ML, ctx => (
                                                           ScalarString: ctx.LoadText(1),
                                                           VectorString: ctx.LoadText(1, 4)));
            var data = reader.Read(breastCancerPath);

            // Two TestClass rows turned into a data view and used as invalidInput below.
            var wrongCollection = new[]
            {
                new TestClass() { A = "1", B = "2", C = "3", },
                new TestClass() { A = "4", B = "5", C = "6" }
            };
            var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);

            var est = data.MakeNewEstimator().
                      Append(row => (
                                 row.ScalarString,
                                 row.VectorString,
                                 // Create a VarVector column
                                 VarVectorString: row.ScalarString.TokenizeText())).
                      Append(row => (
                                 A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
                                 B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
                                 C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag),
                                 D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin),
                                 E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin),
                                 F: row.VarVectorString.OneHotHashEncoding()
                                 ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("CategoricalHash", "featurized.tsv");

            // First four encoded rows, restricted to the six output columns.
            var encoded = est.Fit(data).Transform(data).AsDynamic;
            var savedData = ML.Data.TakeRows(encoded, 4);
            var view = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "A", "B", "C", "D", "E", "F" });

            using (var fs = File.Create(outputPath))
            {
                ML.Data.SaveAsText(view, fs, headerRow: true, keepHidden: true);
            }

            CheckEquality("CategoricalHash", "featurized.tsv");
            Done();
        }
Пример #17
0
        /// <summary>
        /// Applies the static-pipeline <c>ReplaceNaNValues</c> extension to float/double scalar and vector
        /// columns read from breast-cancer.txt, checks the estimator with <c>TestEstimatorCore</c>, and writes
        /// the first four rows of columns A-D to NAReplace/featurized.tsv before calling
        /// <c>CheckEquality</c> on that file.
        /// </summary>
        public void NAReplaceStatic()
        {
            string inputFile = GetDataPath("breast-cancer.txt");

            // NOTE(review): "VectorDoulbe" looks like a typo for "VectorDouble". It is kept verbatim because
            // tuple element names appear to double as column names (A-D below are later selected by name).
            var loader = TextLoaderStatic.CreateReader(ML, ctx => (
                ScalarFloat: ctx.LoadFloat(1),
                ScalarDouble: ctx.LoadDouble(1),
                VectorFloat: ctx.LoadFloat(1, 4),
                VectorDoulbe: ctx.LoadDouble(1, 4)));
            var input = loader.Read(inputFile);

            // A single object of an unrelated class; the resulting view is passed as the invalid input.
            var mismatchedRows = new[]
            {
                new TestClass
                {
                    A = 1,
                    B = 3,
                    C = new float[2] { 1, 2 },
                    D = new double[2] { 3, 4 }
                }
            };
            var invalidInput = ML.Data.ReadFromEnumerable(mismatchedRows);

            // One replacement mode per input column; the tuple names become the output columns A-D.
            var pipeline = input.MakeNewEstimator()
                .Append(row => (
                    A: row.ScalarFloat.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Maximum),
                    B: row.ScalarDouble.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Mean),
                    C: row.VectorFloat.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Mean),
                    D: row.VectorDoulbe.ReplaceNaNValues(MissingValueReplacingEstimator.ColumnInfo.ReplacementMode.Minimum)));

            TestEstimatorCore(pipeline.AsDynamic, input.AsDynamic, invalidInput: invalidInput);

            var outputPath = GetOutputPath("NAReplace", "featurized.tsv");
            var transformed = pipeline.Fit(input).Transform(input);
            var firstRows = ML.Data.TakeRows(transformed.AsDynamic, 4);
            var replacedOnly = ColumnSelectingTransformer.CreateKeep(Env, firstRows, new[] { "A", "B", "C", "D" });

            using (var fs = File.Create(outputPath))
            {
                ML.Data.SaveAsText(replacedOnly, fs, headerRow: true, keepHidden: true);
            }

            CheckEquality("NAReplace", "featurized.tsv");
            Done();
        }
        /// <summary>
        /// Chains word tokenization, value-to-key mapping, n-gram and hashed n-gram estimators over the
        /// "text" column of the wikipedia-detox sample, saves the first four rows of the resulting columns
        /// to Text/ngrams.tsv and calls <c>CheckEquality</c> on that file.
        /// </summary>
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var reader = TextLoader.CreateReader(Env, ctx => (
                label: ctx.LoadBool(0),
                text: ctx.LoadText(1)), hasHeader: true);
            var data = reader.Read(sentimentDataPath);

            // Same file with "text" loaded as float. Only the disabled TestEstimatorCore call below uses it.
            var invalidReader = TextLoader.CreateReader(Env, ctx => (
                label: ctx.LoadBool(0),
                text: ctx.LoadFloat(1)), hasHeader: true);
            var invalidData = invalidReader.Read(sentimentDataPath);

            var tokenizer = new WordTokenizingEstimator(Env, "text", "text");
            var toKeys = new ValueToKeyMappingEstimator(Env, "text", "terms");
            var ngrams = new NgramEstimator(Env, "terms", "ngrams");
            var hashedNgrams = new NgramHashEstimator(Env, "terms", "ngramshash");
            var est = tokenizer
                .Append(toKeys)
                .Append(ngrams)
                .Append(hashedNgrams);

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments { Silent = true };
                var saver = new TextSaver(Env, saverArgs);

                var model = est.Fit(data.AsDynamic);
                IDataView firstRows = TakeFilter.Create(Env, model.Transform(data.AsDynamic), 4);
                IDataView selected = ColumnSelectingTransformer.CreateKeep(Env, firstRows, new[] { "text", "terms", "ngrams", "ngramshash" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, selected, fs, keepHidden: true);
                }
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
Пример #19
0
        /// <summary>
        /// Fits a rank-5 <c>PrincipalComponentAnalysisEstimator</c> (seed 1) on the "features" column of
        /// <c>_dataSource</c>, saves the first four rows of the "pca" column to PCA/pca.tsv and calls
        /// <c>CheckEquality</c> on that file with four digits of precision.
        /// </summary>
        public void TestPcaEstimator()
        {
            var reader = TextLoader.CreateReader(
                _env,
                c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                separator: ';',
                hasHeader: true);
            var input = reader.Read(_dataSource);

            var pcaEstimator = new PrincipalComponentAnalysisEstimator(_env, "features", "pca", rank: 5, seed: 1);
            var outputPath = GetOutputPath("PCA", "pca.tsv");

            using (var ch = _env.Start("save"))
            {
                var projected = pcaEstimator.Fit(input.AsDynamic).Transform(input.AsDynamic);
                IDataView firstRows = TakeFilter.Create(_env, projected, 4);
                IDataView pcaOnly = ColumnSelectingTransformer.CreateKeep(_env, firstRows, new[] { "pca" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, _saver, pcaOnly, fs, keepHidden: true);
                }
            }

            CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
            Done();
        }
        /// <summary>
        /// Tokenizes the "text" column of the wikipedia-detox sample into words and characters, checks the
        /// estimator chain with <c>TestEstimatorCore</c>, saves the first four rows of "text", "words" and
        /// "chars" to Text/tokenized.tsv and calls <c>CheckEquality</c> on that file.
        /// </summary>
        public void TextTokenizationWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var reader = TextLoaderStatic.CreateReader(Env, ctx => (
                label: ctx.LoadBool(0),
                text: ctx.LoadText(1)), hasHeader: true);
            var data = reader.Read(sentimentDataPath);

            // Same file, but "text" is loaded as float so it can serve as the invalid input.
            var invalidReader = TextLoaderStatic.CreateReader(Env, ctx => (
                label: ctx.LoadBool(0),
                text: ctx.LoadFloat(1)), hasHeader: true);
            var invalidData = invalidReader.Read(sentimentDataPath);

            var wordTokenizer = new WordTokenizingEstimator(Env, "text", "words");
            var charTokenizer = new TokenizingByCharactersEstimator(Env, "text", "chars");
            var charsToValues = new KeyToValueMappingEstimator(Env, "chars");
            var est = wordTokenizer
                .Append(charTokenizer)
                .Append(charsToValues);

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "tokenized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments { Silent = true };
                var saver = new TextSaver(Env, saverArgs);

                var model = est.Fit(data.AsDynamic);
                IDataView firstRows = TakeFilter.Create(Env, model.Transform(data.AsDynamic), 4);
                IDataView selected = ColumnSelectingTransformer.CreateKeep(Env, firstRows, new[] { "text", "words", "chars" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, selected, fs, keepHidden: true);
                }
            }

            CheckEquality("Text", "tokenized.tsv");
            Done();
        }
        /// <summary>
        /// Runs the static-pipeline <c>FeaturizeText</c> extension (with <c>OutputTokens</c> enabled) over the
        /// "text" column of the wikipedia-detox sample, checks it with <c>TestEstimatorCore</c>, saves the
        /// first four rows of "Data" and "Data_TransformedText" to Text/featurized.tsv and calls
        /// <c>CheckEquality</c> on that file.
        /// </summary>
        public void TextFeaturizerWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var reader = TextLoaderStatic.CreateReader(Env, ctx => (
                label: ctx.LoadBool(0),
                text: ctx.LoadText(1)), hasHeader: true);
            var data = reader.Read(sentimentDataPath);

            // Same file, but "text" is loaded as float; the dynamic view is used as the invalid input.
            var invalidReader = TextLoaderStatic.CreateReader(Env, ctx => (
                label: ctx.LoadBool(0),
                text: ctx.LoadFloat(1)), hasHeader: true);
            var invalidData = invalidReader.Read(sentimentDataPath).AsDynamic;

            var feat = data.MakeNewEstimator()
                .Append(row => row.text.FeaturizeText(advancedSettings: s => { s.OutputTokens = true; }));

            TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments { Silent = true };
                var saver = new TextSaver(Env, saverArgs);

                var featurized = feat.Fit(data).Transform(data);
                IDataView firstRows = TakeFilter.Create(Env, featurized.AsDynamic, 4);
                IDataView selected = ColumnSelectingTransformer.CreateKeep(Env, firstRows, new[] { "Data", "Data_TransformedText" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, selected, fs, keepHidden: true);
                }
            }

            CheckEquality("Text", "featurized.tsv");
            Done();
        }
Пример #22
0
        /// <summary>
        /// Maps two text columns of breast-cancer.txt to keys with <c>ValueToKeyMappingEstimator</c>, maps
        /// them back with the static-pipeline <c>ToValue</c> extension, and checks that the round-tripped
        /// columns have the same schema and values as the originals.
        /// </summary>
        public void KeyToValuePigsty()
        {
            string inputFile = GetDataPath("breast-cancer.txt");
            var loader = TextLoaderStatic.CreateReader(Env, ctx => (
                ScalarString: ctx.LoadText(1),
                VectorString: ctx.LoadText(1, 4)));
            var original = loader.Read(inputFile);

            // Non-pigsty Term: the key columns A and B are produced through the dynamic views.
            var termColumns = new[]
            {
                new ValueToKeyMappingTransformer.ColumnInfo("A", "ScalarString"),
                new ValueToKeyMappingTransformer.ColumnInfo("B", "VectorString")
            };
            var termEstimator = new ValueToKeyMappingEstimator(Env, termColumns);
            var keyed = termEstimator.Fit(original.AsDynamic).Transform(original.AsDynamic);

            // Re-attach a static schema describing the two key columns.
            var keyedStatic = keyed.AssertStatic(Env, ctx => (
                A: ctx.KeyU4.TextValues.Scalar,
                B: ctx.KeyU4.TextValues.Vector));

            var backToValues = keyedStatic.MakeNewEstimator()
                .Append(row => (
                    ScalarString: row.A.ToValue(),
                    VectorString: row.B.ToValue()));

            // The un-keyed original data serves as the invalid input for the key-to-value estimator.
            TestEstimatorCore(backToValues.AsDynamic, keyedStatic.AsDynamic, invalidInput: original.AsDynamic);

            // Check that term and ToValue are round-trippable.
            var expected = ColumnSelectingTransformer.CreateKeep(Env, original.AsDynamic, new[] { "ScalarString", "VectorString" });
            var roundTripped = backToValues.Fit(keyedStatic).Transform(keyedStatic).AsDynamic;
            var actual = ColumnSelectingTransformer.CreateKeep(Env, roundTripped, new[] { "ScalarString", "VectorString" });

            CheckSameSchemas(expected.Schema, actual.Schema);
            CheckSameValues(expected, actual);
            Done();
        }
        /// <summary>
        /// Chains Lp normalization, global contrast normalization and vector whitening over the "features"
        /// column of the generated regression dataset, checks the chain with <c>TestEstimatorCore</c>, saves
        /// the first four rows of the three output columns to
        /// NormalizerEstimator/lpnorm_gcnorm_whitened.tsv and calls <c>CheckEquality</c> on that file with
        /// four digits of precision.
        /// </summary>
        public void LpGcNormAndWhiteningWorkout()
        {
            string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);

            var reader = TextLoaderStatic.CreateReader(
                ML,
                c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                separator: ';',
                hasHeader: true);
            var input = reader.Read(dataSource);

            // Same file, but "features" is loaded as text so it can serve as the invalid input.
            var invalidReader = TextLoaderStatic.CreateReader(
                ML,
                c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                separator: ';',
                hasHeader: true);
            var invalidInput = invalidReader.Read(dataSource);

            var lpNorm = ML.Transforms.Projection.LpNormalize("lpnorm", "features");
            var gcNorm = ML.Transforms.Projection.GlobalContrastNormalize("gcnorm", "features");
            var whitening = new VectorWhiteningEstimator(ML, "whitened", "features");
            var pipeline = lpNorm
                .Append(gcNorm)
                .Append(whitening);

            TestEstimatorCore(pipeline, input.AsDynamic, invalidInput: invalidInput.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments { Silent = true, OutputHeader = false };
                var saver = new TextSaver(ML, saverArgs);

                var model = pipeline.Fit(input.AsDynamic);
                var output = ML.Data.TakeRows(model.Transform(input.AsDynamic), 4);
                output = ColumnSelectingTransformer.CreateKeep(ML, output, new[] { "lpnorm", "gcnorm", "whitened" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, output, fs, keepHidden: true);
                }
            }

            CheckEquality("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
Пример #24
0
        /// <summary>
        /// Concatenates scalar, fixed-size and variable-end float columns of adult.tiny.with-schema.txt with
        /// <c>ColumnConcatenatingEstimator</c>, asserts the resulting vector types and sizes on the first ten
        /// rows, then saves columns f1-f4 to Transform/Concat/Concat1.tsv and calls <c>CheckEquality</c> on
        /// that file.
        /// </summary>
        void TestConcat()
        {
            string dataPath = GetDataPath("adult.tiny.with-schema.txt");

            var source = new MultiFileSource(dataPath);
            var loaderArgs = new TextLoader.Arguments
            {
                Columns = new[]
                {
                    new TextLoader.Column("float1", DataKind.R4, 9),
                    new TextLoader.Column("float4", DataKind.R4, new[]
                    {
                        new TextLoader.Range(9),
                        new TextLoader.Range(10),
                        new TextLoader.Range(11),
                        new TextLoader.Range(12)
                    }),
                    new TextLoader.Column("float6", DataKind.R4, new[]
                    {
                        new TextLoader.Range(9),
                        new TextLoader.Range(10),
                        new TextLoader.Range(11),
                        new TextLoader.Range(12, 14)
                    }),
                    // Open-ended range starting at column 14 with a variable end.
                    new TextLoader.Column("vfloat", DataKind.R4, new[]
                    {
                        new TextLoader.Range(14, null) { AutoEnd = false, VariableEnd = true }
                    })
                },
                Separator = "\t",
                HasHeader = true
            };
            var loader = new TextLoader(Env, loaderArgs, new MultiFileSource(dataPath));
            var view = loader.Read(source);

            // Looks a column up by name, failing the test when it is missing, and returns its type.
            ColumnType ColumnTypeOf(Schema schema, string name)
            {
                Assert.True(schema.TryGetColumnIndex(name, out int index), $"Could not find '{name}'");
                return schema[index].Type;
            }

            // Asserts that the named column of the current view is an R4 vector of the given size.
            void AssertFloatVector(string name, int expectedSize)
            {
                ColumnType type = ColumnTypeOf(view.Schema, name);
                Assert.True(type is VectorType vec && vec.ItemType == NumberType.R4 && vec.Size == expectedSize);
            }

            var pipe = new ColumnConcatenatingEstimator(Env, "f1", "float1")
                .Append(new ColumnConcatenatingEstimator(Env, "f2", "float1", "float1"))
                .Append(new ColumnConcatenatingEstimator(Env, "f3", "float4", "float1"))
                .Append(new ColumnConcatenatingEstimator(Env, "f4", "float6", "vfloat", "float1"));

            view = TakeFilter.Create(Env, view, 10);
            view = pipe.Fit(view).Transform(view);

            AssertFloatVector("f1", 1);
            AssertFloatVector("f2", 2);
            AssertFloatVector("f3", 5);
            // f4 includes the variable-end "vfloat" column; the expected Size is 0
            // (presumably the marker for an unknown length - confirm against VectorType).
            AssertFloatVector("f4", 0);

            view = ColumnSelectingTransformer.CreateKeep(Env, view, new[] { "f1", "f2", "f3", "f4" });

            var subdir = Path.Combine("Transform", "Concat");
            var outputPath = GetOutputPath(subdir, "Concat1.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments { Silent = true, Dense = true };
                var saver = new TextSaver(Env, saverArgs);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: false);
                }
            }

            CheckEquality(subdir, "Concat1.tsv");
            Done();
        }
Пример #25
0
        /// <summary>
        /// Loads the data through <c>CreateAndSaveLoader</c>, optionally restricts it to the comma-separated
        /// columns in <c>ImplOptions.Columns</c>, collects the indices of the columns the chosen saver can
        /// write, limits the output to <c>ImplOptions.Rows</c> rows when that value is positive, and emits
        /// the result: directly via <c>TextSaver.WriteData</c> for a text saver, otherwise by saving into a
        /// memory stream and logging its text through <paramref name="ch"/>.
        /// </summary>
        /// <param name="ch">Channel used for informational messages; asserted to be non-null.</param>
        private void RunCore(IChannel ch)
        {
            Host.AssertValue(ch);
            IDataView view = CreateAndSaveLoader();

            // Narrow to the requested columns, ignoring empty entries in the list.
            if (!string.IsNullOrWhiteSpace(ImplOptions.Columns))
            {
                var requested = ImplOptions.Columns
                    .Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries)
                    .ToArray();
                if (Utils.Size(requested) > 0)
                {
                    view = ColumnSelectingTransformer.CreateKeep(Host, view, requested);
                }
            }

            // Use the configured saver when one is given; otherwise fall back to a text saver.
            IDataSaver saver = ImplOptions.Saver != null
                ? ImplOptions.Saver.CreateComponent(Host)
                : (IDataSaver)new TextSaver(Host, new TextSaver.Arguments() { Dense = ImplOptions.Dense });

            // Gather the indices of the columns that will be written.
            var savable = new List<int>();
            for (int col = 0; col < view.Schema.Count; col++)
            {
                // Hidden columns are skipped unless KeepHidden is set.
                if (!ImplOptions.KeepHidden && view.Schema[col].IsHidden)
                {
                    continue;
                }

                if (!saver.IsColumnSavable(view.Schema[col].Type))
                {
                    ch.Info(MessageSensitivity.Schema, "The column '{0}' will not be written as it has unsavable column type.", view.Schema[col].Name);
                    continue;
                }

                savable.Add(col);
            }
            Host.NotSensitive().Check(savable.Count > 0, "No valid columns to save");

            // Send the first N lines to console.
            if (ImplOptions.Rows > 0)
            {
                var takeOptions = new SkipTakeFilter.TakeOptions() { Count = ImplOptions.Rows };
                view = SkipTakeFilter.Create(Host, takeOptions, view);
            }

            // If it is a text saver, utilize a special utility for this purpose.
            if (saver is TextSaver textSaver)
            {
                textSaver.WriteData(view, true, savable.ToArray());
                return;
            }

            // Any other saver writes into memory; the bytes are then read back as text and logged.
            using (MemoryStream buffer = new MemoryStream())
            {
                using (Stream wrapStream = new SubsetStream(buffer))
                {
                    saver.SaveData(wrapStream, view, savable.ToArray());
                }

                buffer.Seek(0, SeekOrigin.Begin);
                using (StreamReader reader = new StreamReader(buffer))
                {
                    string result = reader.ReadToEnd();
                    ch.Info(MessageSensitivity.UserData | MessageSensitivity.Schema, result);
                }
            }
        }