public void NormalizerWithOnFit()
        {
            var env        = new ConsoleEnvironment(seed: 0);
            var dataPath   = GetDataPath("generated_regression_dataset.csv");
            var dataSource = new MultiFileSource(dataPath);

            var reader = TextLoader.CreateReader(env,
                                                 c => c.LoadFloat(0, 2),
                                                 separator: ';', hasHeader: true);
            var data = reader.Read(dataSource);

            // These will be populated once we call fit. They must be initialized:
            // assignment inside the onFit lambdas does not count for definite
            // assignment, so reading them below would otherwise be CS0165.
            ImmutableArray<float> mm = default;
            ImmutableArray<float> ss = default;
            ImmutableArray<ImmutableArray<float>> bb = default;

            var est = reader.MakeNewEstimator()
                      .Append(r => (r,
                                    ncdf: r.NormalizeByCumulativeDistribution(onFit: (m, s) => mm = m),
                                    n: r.NormalizeByMeanVar(onFit: (s, o) => { ss = s; Assert.Empty(o); }),
                                    b: r.NormalizeByBinning(onFit: b => bb = b)));
            var tdata = est.Fit(data).Transform(data);

            // One entry per normalized column (three float columns were loaded).
            Assert.Equal(3, mm.Length);
            Assert.Equal(3, ss.Length);
            Assert.Equal(3, bb.Length);

            // Just for fun, let's also write out some of the lines of the data to the console.
            using (var stream = new MemoryStream())
            {
                IDataView v = new ChooseColumnsTransform(env, tdata.AsDynamic, "r", "ncdf", "n", "b");
                v = TakeFilter.Create(env, v, 10);
                var saver = new TextSaver(env, new TextSaver.Arguments()
                {
                    Dense        = true,
                    Separator    = ",",
                    OutputHeader = false
                });
                saver.SaveData(stream, v, Utils.GetIdentityPermutation(v.Schema.ColumnCount));
                Console.WriteLine(Encoding.UTF8.GetString(stream.ToArray()));
            }
        }
        public void NgramWorkout()
        {
            // Read the sentiment data twice: once with the expected schema, and
            // once with a deliberately wrong 'text' type for validation testing.
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var data = TextLoader.CreateReader(Env, ctx => (
                                                   label: ctx.LoadBool(0),
                                                   text: ctx.LoadText(1)), hasHeader: true)
                       .Read(new MultiFileSource(dataPath));

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(new MultiFileSource(dataPath));

            // Tokenize, map tokens to term keys, then build both plain and hashed ngrams.
            var pipeline = new WordTokenizer(Env, "text", "text")
                           .Append(new TermEstimator(Env, "text", "terms"))
                           .Append(new NgramEstimator(Env, "terms", "ngrams"))
                           .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                var transformed = pipeline.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(Env, transformed, 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "text", "terms", "ngrams", "ngramshash");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
        // Example #3
        public void TestPcaEstimator()
        {
            // Load label (column 11) and the 11 feature columns from the shared dataset.
            var data = TextLoader.CreateReader(_env,
                                               c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                               separator: ';', hasHeader: true)
                       .Read(_dataSource);

            var pca        = new PcaEstimator(_env, "features", "pca", rank: 5, seed: 1);
            var outputPath = GetOutputPath("PCA", "pca.tsv");

            using (var ch = _env.Start("save"))
            {
                // Baseline only the first few rows of the projected column.
                var transformed = pca.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(_env, transformed, 4);
                savedData = new ChooseColumnsTransform(_env, savedData, "pca");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, _saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("PCA", "pca.tsv", digitsOfPrecision: 4);
            Done();
        }
        // Example #4
        public void PcaWorkout()
        {
            var    env      = new ConsoleEnvironment(seed: 1, conc: 1);
            string dataFile = GetDataPath("generated_regression_dataset.csv");

            // Valid input: numeric features. Invalid input: text features, to
            // exercise schema validation.
            var data = TextLoader.CreateReader(env,
                                               c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                               separator: ';', hasHeader: true)
                       .Read(new MultiFileSource(dataFile));

            var invalidData = TextLoader.CreateReader(env,
                                                      c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(new MultiFileSource(dataFile));

            // Fix the seed through advanced settings so the projection is deterministic.
            var pca = new PcaEstimator(env, "features", "pca", rank: 5, advancedSettings: s => {
                s.Seed = 1;
            });

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(pca, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("PCA", "pca.tsv");

            using (var ch = env.Start("save"))
            {
                var saver = new TextSaver(env, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                var transformed = pca.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(env, transformed, 4);
                savedData = new ChooseColumnsTransform(env, savedData, "pca");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("PCA", "pca.tsv");
            Done();
        }
        public void LpGcNormAndWhiteningWorkout()
        {
            var    env      = new ConsoleEnvironment(seed: 0);
            string dataFile = GetDataPath("generated_regression_dataset.csv");

            var data = TextLoader.CreateReader(env,
                                               c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                               separator: ';', hasHeader: true)
                       .Read(new MultiFileSource(dataFile));

            // Same file with text features: used only as invalid input for validation.
            var invalidData = TextLoader.CreateReader(env,
                                                      c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(new MultiFileSource(dataFile));

            // Chain the three vector normalizers, each writing to its own output column.
            var pipeline = new LpNormalizer(env, "features", "lpnorm")
                           .Append(new GlobalContrastNormalizer(env, "features", "gcnorm"))
                           .Append(new Whitening(env, "features", "whitened"));

            TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "lpnorm_gcnorm_whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                var transformed = pipeline.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(Env, transformed, 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "lpnorm", "gcnorm", "whitened");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
        public void TextFeaturizerWorkout()
        {
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var data = TextLoader.CreateReader(Env, ctx => (
                                                   label: ctx.LoadBool(0),
                                                   text: ctx.LoadText(1)), hasHeader: true)
                       .Read(new MultiFileSource(dataPath));

            // Same file loaded with 'text' as float: invalid input for validation.
            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(new MultiFileSource(dataPath))
                              .AsDynamic;

            // Full text featurizer; OutputTokens also emits the tokenized text column.
            var featurizer = new TextTransform(Env, "text", "Data", advancedSettings: s => { s.OutputTokens = true; });

            TestEstimatorCore(featurizer, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                var transformed = featurizer.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(Env, transformed, 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "Data", "Data_TransformedText");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "featurized.tsv");
            Done();
        }
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var data = TextLoader.CreateReader(Env, ctx => (
                                                   label: ctx.LoadBool(0),
                                                   text: ctx.LoadText(1)), hasHeader: true)
                       .Read(new MultiFileSource(dataPath));

            // 'text' loaded as float makes this input invalid for the pipeline below.
            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(new MultiFileSource(dataPath));

            // Normalize text in place, tokenize it, then strip stopwords.
            var pipeline = new TextNormalizer(Env, "text")
                           .Append(new WordTokenizer(Env, "text", "words"))
                           .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));

            TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                var transformed = pipeline.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(Env, transformed, 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "text", "words_without_stopwords");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }
        public void KeyToValuePigsty()
        {
            // Verifies that mapping strings to keys (dynamic API) and then keys
            // back to values (static "pigsty" API) round-trips the original data.
            string dataPath = GetDataPath("breast-cancer.txt");
            // Column 1 is read twice: once as a scalar and once inside a vector span.
            var    reader   = TextLoader.CreateReader(Env, ctx => (
                                                          ScalarString: ctx.LoadText(1),
                                                          VectorString: ctx.LoadText(1, 4)
                                                          ));

            var data = reader.Read(dataPath);

            // Non-pigsty Term.
            // Map both string columns to key columns "A" and "B" via the dynamic API.
            var dynamicData = new ValueToKeyMappingEstimator(Env, new[] {
                new TermTransform.ColumnInfo("ScalarString", "A"),
                new TermTransform.ColumnInfo("VectorString", "B")
            })
                              .Fit(data.AsDynamic).Transform(data.AsDynamic);

            // Re-enter the static world, asserting the key types produced above.
            var data2 = dynamicData.AssertStatic(Env, ctx => (
                                                     A: ctx.KeyU4.TextValues.Scalar,
                                                     B: ctx.KeyU4.TextValues.Vector));

            // Map the keys back to their text values, restoring the original column names.
            var est = data2.MakeNewEstimator()
                      .Append(row => (
                                  ScalarString: row.A.ToValue(),
                                  VectorString: row.B.ToValue()));

            TestEstimatorCore(est.AsDynamic, data2.AsDynamic, invalidInput: data.AsDynamic);

            // Check that term and ToValue are round-trippable.
            var dataLeft  = new ChooseColumnsTransform(Env, data.AsDynamic, "ScalarString", "VectorString");
            var dataRight = new ChooseColumnsTransform(Env, est.Fit(data2).Transform(data2).AsDynamic, "ScalarString", "VectorString");

            CheckSameSchemas(dataLeft.Schema, dataRight.Schema);
            CheckSameValues(dataLeft, dataRight);
            Done();
        }
        // Example #9
        void TestConcat()
        {
            // Verifies that ConcatEstimator produces vectors of the expected size
            // for scalar, fixed-vector, and variable-vector inputs.
            string dataPath = GetDataPath("adult.test");

            var source = new MultiFileSource(dataPath);
            // Reuse 'source' for the loader instead of constructing a second
            // MultiFileSource over the same path.
            var loader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[] {
                    new TextLoader.Column("float1", DataKind.R4, 0),
                    new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }),
                    // "vfloat" ends with a variable-length range, so its type has unknown size.
                    new TextLoader.Column("vfloat", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10, null)
                                                                         {
                                                                             AutoEnd = false, VariableEnd = true
                                                                         } })
                },
                Separator = ",",
                HasHeader = true
            }, source);
            var data = loader.Read(source);

            // Looks up a column by name and returns its type, asserting it exists.
            // Named GetColumnType to avoid shadowing object.GetType().
            ColumnType GetColumnType(ISchema schema, string name)
            {
                Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
                return schema.GetColumnType(cIdx);
            }

            var pipe = new ConcatEstimator(Env, "f1", "float1")
                       .Append(new ConcatEstimator(Env, "f2", "float1", "float1"))
                       .Append(new ConcatEstimator(Env, "f3", "float4", "float1"))
                       .Append(new ConcatEstimator(Env, "f4", "vfloat", "float1"));

            data = TakeFilter.Create(Env, data, 10);
            data = pipe.Fit(data).Transform(data);

            ColumnType t;

            // scalar -> size-1 vector
            t = GetColumnType(data.Schema, "f1");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 1);
            // scalar + scalar -> size-2 vector
            t = GetColumnType(data.Schema, "f2");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 2);
            // size-4 vector + scalar -> size-5 vector
            t = GetColumnType(data.Schema, "f3");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 5);
            // variable vector + scalar -> variable vector (VectorSize == 0)
            t = GetColumnType(data.Schema, "f4");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 0);

            data = new ChooseColumnsTransform(Env, data, "f1", "f2", "f3", "f4");

            var subdir     = Path.Combine("Transform", "Concat");
            var outputPath = GetOutputPath(subdir, "Concat1.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true, Dense = true
                });
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, data, fs, keepHidden: false);
            }

            CheckEquality(subdir, "Concat1.tsv");
            Done();
        }