Esempio n. 1
0
        public void LpGcNormAndWhiteningWorkout()
        {
            string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var    data       = TextLoader.CreateReader(ML,
                                                        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                        separator: ';', hasHeader: true)
                                .Read(dataSource);

            var invalidData = TextLoader.CreateReader(ML,
                                                      c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(dataSource);

            var est = new LpNormalizingEstimator(ML, "features", "lpnorm")
                      .Append(new GlobalContrastNormalizingEstimator(ML, "features", "gcnorm"))
                      .Append(new VectorWhiteningEstimator(ML, "features", "whitened"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                IDataView savedData = TakeFilter.Create(ML, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = SelectColumnsTransform.CreateKeep(ML, savedData, new[] { "lpnorm", "gcnorm", "whitened" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
Esempio n. 2
0
        public void TextTokenizationWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(Env, "text", "words")
                      .Append(new CharacterTokenizingEstimator(Env, "text", "chars"))
                      .Append(new KeyToValueEstimator(Env, "chars"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "tokenized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = SelectColumnsTransform.CreateKeep(Env, savedData, "text", "words", "chars");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "tokenized.tsv");
            Done();
        }
Esempio n. 3
0
        public void TextFeaturizerWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath)
                              .AsDynamic;

            var feat = data.MakeNewEstimator()
                       .Append(row => row.text.FeaturizeText(advancedSettings: s => { s.OutputTokens = true; }));

            TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, feat.Fit(data).Transform(data).AsDynamic, 4);
                savedData = SelectColumnsTransform.CreateKeep(Env, savedData, "Data", "Data_TransformedText");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "featurized.tsv");
            Done();
        }
        public void WhiteningWorkout()
        {
            var    env        = new ConsoleEnvironment(seed: 0);
            string dataSource = GetDataPath("generated_regression_dataset.csv");
            var    data       = TextLoader.CreateReader(env,
                                                        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                        separator: ';', hasHeader: true)
                                .Read(dataSource);

            var invalidData = TextLoader.CreateReader(env,
                                                      c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(dataSource);

            var est = new VectorWhiteningEstimator(env, "features", "whitened1")
                      .Append(new VectorWhiteningEstimator(env, "features", "whitened2", kind: WhiteningKind.Pca, pcaNum: 5));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = SelectColumnsTransform.CreateKeep(Env, savedData, "whitened1", "whitened2");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
Esempio n. 5
0
        void TestConcat()
        {
            string dataPath = GetDataPath("adult.test");

            var source = new MultiFileSource(dataPath);
            var loader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[] {
                    new TextLoader.Column("float1", DataKind.R4, 0),
                    new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }),
                    new TextLoader.Column("float6", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10, 12) }),
                    new TextLoader.Column("vfloat", DataKind.R4, new[] { new TextLoader.Range(14, null)
                                                                         {
                                                                             AutoEnd = false, VariableEnd = true
                                                                         } })
                },
                Separator = ",",
                HasHeader = true
            }, new MultiFileSource(dataPath));
            var data = loader.Read(source);

            ColumnType GetType(Schema schema, string name)
            {
                Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
                return(schema.GetColumnType(cIdx));
            }

            var pipe = new ColumnConcatenatingEstimator(Env, "f1", "float1")
                       .Append(new ColumnConcatenatingEstimator(Env, "f2", "float1", "float1"))
                       .Append(new ColumnConcatenatingEstimator(Env, "f3", "float4", "float1"))
                       .Append(new ColumnConcatenatingEstimator(Env, "f4", "float6", "vfloat", "float1"));

            data = TakeFilter.Create(Env, data, 10);
            data = pipe.Fit(data).Transform(data);

            ColumnType t;

            t = GetType(data.Schema, "f1");
            Assert.True(t is VectorType vt1 && vt1.ItemType == NumberType.R4 && vt1.Size == 1);
            t = GetType(data.Schema, "f2");
            Assert.True(t is VectorType vt2 && vt2.ItemType == NumberType.R4 && vt2.Size == 2);
            t = GetType(data.Schema, "f3");
            Assert.True(t is VectorType vt3 && vt3.ItemType == NumberType.R4 && vt3.Size == 5);
            t = GetType(data.Schema, "f4");
            Assert.True(t is VectorType vt4 && vt4.ItemType == NumberType.R4 && vt4.Size == 0);

            data = SelectColumnsTransform.CreateKeep(Env, data, new[] { "f1", "f2", "f3", "f4" });

            var subdir     = Path.Combine("Transform", "Concat");
            var outputPath = GetOutputPath(subdir, "Concat1.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true, Dense = true
                });
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, data, fs, keepHidden: false);
            }

            CheckEquality(subdir, "Concat1.tsv");
            Done();
        }
Esempio n. 6
0
        private void RunCore(IChannel ch)
        {
            Host.AssertValue(ch);
            IDataView data = CreateAndSaveLoader();

            if (!string.IsNullOrWhiteSpace(Args.Columns))
            {
                var keepColumns = Args.Columns
                                  .Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries).ToArray();
                if (Utils.Size(keepColumns) > 0)
                {
                    data = SelectColumnsTransform.CreateKeep(Host, data, keepColumns);
                }
            }

            IDataSaver saver;

            if (Args.Saver != null)
            {
                saver = Args.Saver.CreateComponent(Host);
            }
            else
            {
                saver = new TextSaver(Host, new TextSaver.Arguments()
                {
                    Dense = Args.Dense
                });
            }
            var cols = new List <int>();

            for (int i = 0; i < data.Schema.ColumnCount; i++)
            {
                if (!Args.KeepHidden && data.Schema.IsHidden(i))
                {
                    continue;
                }
                var type = data.Schema.GetColumnType(i);
                if (saver.IsColumnSavable(type))
                {
                    cols.Add(i);
                }
                else
                {
                    ch.Info(MessageSensitivity.Schema, "The column '{0}' will not be written as it has unsavable column type.", data.Schema.GetColumnName(i));
                }
            }
            Host.NotSensitive().Check(cols.Count > 0, "No valid columns to save");

            // Send the first N lines to console.
            if (Args.Rows > 0)
            {
                var args = new SkipTakeFilter.TakeArguments()
                {
                    Count = Args.Rows
                };
                data = SkipTakeFilter.Create(Host, args, data);
            }
            var textSaver = saver as TextSaver;

            // If it is a text saver, utilize a special utility for this purpose.
            if (textSaver != null)
            {
                textSaver.WriteData(data, true, cols.ToArray());
            }
            else
            {
                using (MemoryStream mem = new MemoryStream())
                {
                    using (Stream wrapStream = new SubsetStream(mem))
                        saver.SaveData(wrapStream, data, cols.ToArray());
                    mem.Seek(0, SeekOrigin.Begin);
                    using (StreamReader reader = new StreamReader(mem))
                    {
                        string result = reader.ReadToEnd();
                        ch.Info(MessageSensitivity.UserData | MessageSensitivity.Schema, result);
                    }
                }
            }
        }