Esempio n. 1
0
        public void TextFeaturizerWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath)
                              .AsDynamic;

            var feat = data.MakeNewEstimator()
                       .Append(row => row.text.FeaturizeText(advancedSettings: s => { s.OutputTokens = true; }));

            TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "featurized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, feat.Fit(data).Transform(data).AsDynamic, 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "Data", "Data_TransformedText" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "featurized.tsv");
            Done();
        }
Esempio n. 2
0
        public void TextTokenizationWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(Env, "text", "words")
                      .Append(new TokenizingByCharactersEstimator(Env, "text", "chars"))
                      .Append(new KeyToValueMappingEstimator(Env, "chars"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "tokenized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "words", "chars" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "tokenized.tsv");
            Done();
        }
        private void TestFeatureContribution(
            ITrainerEstimator <ISingleFeaturePredictionTransformer <IPredictor>, IPredictor> trainer,
            IDataView data,
            string testFile,
            int precision = 6)
        {
            // Train the model.
            var model = trainer.Fit(data);

            // Extract the predictor, check that it supports feature contribution.
            var predictor = model.Model as ICalculateFeatureContribution;

            Assert.NotNull(predictor);

            // Calculate feature contributions.
            var est = new FeatureContributionCalculatingEstimator(ML, predictor, "Features", numPositiveContributions: 3, numNegativeContributions: 0)
                      .Append(new FeatureContributionCalculatingEstimator(ML, predictor, "Features", numPositiveContributions: 0, numNegativeContributions: 3))
                      .Append(new FeatureContributionCalculatingEstimator(ML, predictor, "Features", numPositiveContributions: 1, numNegativeContributions: 1))
                      .Append(new FeatureContributionCalculatingEstimator(ML, predictor, "Features", numPositiveContributions: 1, numNegativeContributions: 1, normalize: false));

            TestEstimatorCore(est, data);
            // Verify output.
            var outputPath = GetOutputPath("FeatureContribution", testFile + ".tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }
            CheckEquality("FeatureContribution", testFile + ".tsv", digitsOfPrecision: precision);
            Done();
        }
        public void LpGcNormAndWhiteningWorkout()
        {
            string dataSource = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var    data       = TextLoader.CreateReader(ML,
                                                        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                        separator: ';', hasHeader: true)
                                .Read(dataSource);

            var invalidData = TextLoader.CreateReader(ML,
                                                      c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(dataSource);

            var est = new LpNormalizingEstimator(ML, "features", "lpnorm")
                      .Append(new GlobalContrastNormalizingEstimator(ML, "features", "gcnorm"))
                      .Append(new VectorWhiteningEstimator(ML, "features", "whitened"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                IDataView savedData = TakeFilter.Create(ML, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "lpnorm", "gcnorm", "whitened" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(new MultiFileSource(sentimentDataPath));

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(new MultiFileSource(sentimentDataPath));

            var est = new TextNormalizer(Env, "text")
                      .Append(new WordTokenizer(Env, "text", "words"))
                      .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "text", "words_without_stopwords");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }
        public void WhiteningWorkout()
        {
            var    env        = new ConsoleEnvironment(seed: 0);
            string dataSource = GetDataPath("generated_regression_dataset.csv");
            var    data       = TextLoader.CreateReader(env,
                                                        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                        separator: ';', hasHeader: true)
                                .Read(dataSource);

            var invalidData = TextLoader.CreateReader(env,
                                                      c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                      separator: ';', hasHeader: true)
                              .Read(dataSource);

            var est = new VectorWhiteningEstimator(env, "features", "whitened1")
                      .Append(new VectorWhiteningEstimator(env, "features", "whitened2", kind: WhiteningKind.Pca, pcaNum: 5));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("NormalizerEstimator", "whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true, OutputHeader = false
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = SelectColumnsTransform.CreateKeep(Env, savedData, new[] { "whitened1", "whitened2" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("NormalizerEstimator", "whitened.tsv", digitsOfPrecision: 4);
            Done();
        }
Esempio n. 7
0
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(Env, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);
            var est = ML.Transforms.Text.NormalizeText("text")
                      .Append(ML.Transforms.Text.TokenizeWords("words", "text"))
                      .Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words"))
                      .Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", "THAT", "bY"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "NoDefaultStopwords", "NoStopWords" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }
Esempio n. 8
0
        PrepareData(MLContext mlContext)
        {
            IDataView data      = null;
            IDataView trainData = null;
            IDataView testData  = null;

            // Step one: read the data as an IDataView.
            // Create the reader: define the data columns
            // and where to find them in the text file.
            var reader = new TextLoader(mlContext, new TextLoader.Arguments
            {
                Column = new[] {
                    // A boolean column depicting the 'label'.
                    new TextLoader.Column("Label", DataKind.BL, 30),
                    // 29 Features V1..V28 + Amount
                    new TextLoader.Column("V1", DataKind.R4, 1),
                    new TextLoader.Column("V2", DataKind.R4, 2),
                    new TextLoader.Column("V3", DataKind.R4, 3),
                    new TextLoader.Column("V4", DataKind.R4, 4),
                    new TextLoader.Column("V5", DataKind.R4, 5),
                    new TextLoader.Column("V6", DataKind.R4, 6),
                    new TextLoader.Column("V7", DataKind.R4, 7),
                    new TextLoader.Column("V8", DataKind.R4, 8),
                    new TextLoader.Column("V9", DataKind.R4, 9),
                    new TextLoader.Column("V10", DataKind.R4, 10),
                    new TextLoader.Column("V11", DataKind.R4, 11),
                    new TextLoader.Column("V12", DataKind.R4, 12),
                    new TextLoader.Column("V13", DataKind.R4, 13),
                    new TextLoader.Column("V14", DataKind.R4, 14),
                    new TextLoader.Column("V15", DataKind.R4, 15),
                    new TextLoader.Column("V16", DataKind.R4, 16),
                    new TextLoader.Column("V17", DataKind.R4, 17),
                    new TextLoader.Column("V18", DataKind.R4, 18),
                    new TextLoader.Column("V19", DataKind.R4, 19),
                    new TextLoader.Column("V20", DataKind.R4, 20),
                    new TextLoader.Column("V21", DataKind.R4, 21),
                    new TextLoader.Column("V22", DataKind.R4, 22),
                    new TextLoader.Column("V23", DataKind.R4, 23),
                    new TextLoader.Column("V24", DataKind.R4, 24),
                    new TextLoader.Column("V25", DataKind.R4, 25),
                    new TextLoader.Column("V26", DataKind.R4, 26),
                    new TextLoader.Column("V27", DataKind.R4, 27),
                    new TextLoader.Column("V28", DataKind.R4, 28),
                    new TextLoader.Column("Amount", DataKind.R4, 29),
                },
                // First line of the file is a header, not a data row.
                HasHeader = true,
                Separator = ","
            });


            // We know that this is a Binary Classification task,
            // so we create a Binary Classification context:
            // it will give us the algorithms we need,
            // as well as the evaluation procedure.
            var classification = new BinaryClassificationContext(mlContext);

            if (!File.Exists(Path.Combine(_outputPath, "testData.idv")) &&
                !File.Exists(Path.Combine(_outputPath, "trainData.idv")))
            {
                // Split the data 80:20 into train and test sets, train and evaluate.

                data = reader.Read(new MultiFileSource(_dataSetFile));
                ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (source)");
                ConsoleHelpers.InspectData(mlContext, data, 4);



                // Can't do stratification when column type is a boolean, is this an issue?
                //(trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2, stratificationColumn: "Label");
                (trainData, testData) = classification.TrainTestSplit(data, testFraction: 0.2);

                // save test split
                IHostEnvironment env = (IHostEnvironment)mlContext;
                using (var ch = env.Start("SaveData"))
                    using (var file = env.CreateOutputFile(Path.Combine(_outputPath, "testData.idv")))
                    {
                        var saver = new BinarySaver(mlContext, new BinarySaver.Arguments());
                        DataSaverUtils.SaveDataView(ch, saver, testData, file);
                    }

                // save train split
                using (var ch = ((IHostEnvironment)env).Start("SaveData"))
                    using (var file = env.CreateOutputFile(Path.Combine(_outputPath, "trainData.idv")))
                    {
                        var saver = new BinarySaver(mlContext, new BinarySaver.Arguments());
                        DataSaverUtils.SaveDataView(ch, saver, trainData, file);
                    }
            }
            else
            {
                // Load splited data
                var binTrainData = new BinaryLoader(mlContext, new BinaryLoader.Arguments(), new MultiFileSource(Path.Combine(_outputPath, "trainData.idv")));
                var trainRoles   = new RoleMappedData(binTrainData, roles: TransactionObservation.Roles());
                trainData = trainRoles.Data;


                var binTestData = new BinaryLoader(mlContext, new BinaryLoader.Arguments(), new MultiFileSource(Path.Combine(_outputPath, "testData.idv")));
                var testRoles   = new RoleMappedData(binTestData, roles: TransactionObservation.Roles());
                testData = testRoles.Data;
            }

            ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (traindata)");
            ConsoleHelpers.InspectData(mlContext, trainData, 4);

            ConsoleHelpers.ConsoleWriteHeader("Show 4 transactions fraud (true) and 4 transactions not fraud (false) -  (testData)");
            ConsoleHelpers.InspectData(mlContext, testData, 4);

            return(classification, reader, trainData, testData);
        }
        public void TestConvertWorkout()
        {
            var data = new[] { new TestClass()
                               {
                                   A = 1, B = new int[2] {
                                       1, 4
                                   }
                               },
                               new TestClass()
                               {
                                   A = 2, B = new int[2] {
                                       3, 4
                                   }
                               } };
            var dataView = ComponentCreation.CreateDataView(Env, data);
            var pipe     = new ConvertingEstimator(Env, columns: new[] { new ConvertingTransform.ColumnInfo("A", "ConvA", DataKind.R4),
                                                                         new ConvertingTransform.ColumnInfo("B", "ConvB", DataKind.R4) });

            TestEstimatorCore(pipe, dataView);
            var allTypesData = new[]
            {
                new TestPrimitiveClass()
                {
                    AA = new [] { "a", "b" },
                    AB = new [] { false, true },
                    AC = new [] { -1, 1 },
                    AD = new uint[] { 0, 1 },
                    AE = new byte[] { 0, 1 },
                    AF = new sbyte[] { -1, 1 },
                    AG = new short[] { -1, 1 },
                    AH = new ushort[] { 0, 1 },
                    AK = new long[] { -1, 1 },
                    AL = new ulong[] { 0, 1 },
                    AM = new float[] { 1.0f, 1.0f, },
                    AN = new double[] { 1.0d, 1.0d, }
                },
                new TestPrimitiveClass()
                {
                    AA = new [] { "0", "1" },
                    AB = new [] { false, true },
                    AC = new [] { int.MinValue, int.MaxValue },
                    AD = new uint[] { uint.MinValue, uint.MaxValue },
                    AE = new byte[] { byte.MinValue, byte.MaxValue },
                    AF = new sbyte[] { sbyte.MinValue, sbyte.MaxValue },
                    AG = new short[] { short.MinValue, short.MaxValue },
                    AH = new ushort[] { ushort.MinValue, ushort.MaxValue },
                    AK = new long[] { long.MinValue, long.MaxValue },
                    AL = new ulong[] { ulong.MinValue, ulong.MaxValue },
                    AM = new float[] { float.MinValue, float.MaxValue, },
                    AN = new double[] { double.MinValue, double.MaxValue, }
                }
            };

            var allTypesDataView = ComponentCreation.CreateDataView(Env, allTypesData);
            var allTypesPipe     = new ConvertingEstimator(Env, columns: new[] {
                new ConvertingTransform.ColumnInfo("AA", "ConvA", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AB", "ConvB", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AC", "ConvC", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AD", "ConvD", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AE", "ConvE", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AF", "ConvF", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AG", "ConvG", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AH", "ConvH", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AK", "ConvK", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AL", "ConvL", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AM", "ConvM", DataKind.R4),
                new ConvertingTransform.ColumnInfo("AN", "ConvN", DataKind.R4)
            }
                                                           );

            TestEstimatorCore(allTypesPipe, allTypesDataView);

            var outputPath = GetOutputPath("Convert", "Types.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                var savedData = TakeFilter.Create(Env, allTypesPipe.Fit(allTypesDataView).Transform(allTypesDataView), 2);
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Convert", "Types.tsv");
            Done();
        }
Esempio n. 10
0
        public void NormalizerWorkout()
        {
            string dataPath = GetDataPath(TestDatasets.iris.trainFilename);

            var loader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[] {
                    new TextLoader.Column("float1", DataKind.R4, 1),
                    new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(1, 4) }),
                    new TextLoader.Column("double1", DataKind.R8, 1),
                    new TextLoader.Column("double4", DataKind.R8, new[] { new TextLoader.Range(1, 4) }),
                    new TextLoader.Column("int1", DataKind.I4, 0),
                    new TextLoader.Column("float0", DataKind.R4, new[] { new TextLoader.Range {
                                                                             Min = 1, VariableEnd = true
                                                                         } }),
                },
                HasHeader = true
            }, new MultiFileSource(dataPath));

            var est = new NormalizingEstimator(Env,
                                               new NormalizingEstimator.MinMaxColumn("float1"),
                                               new NormalizingEstimator.MinMaxColumn("float4"),
                                               new NormalizingEstimator.MinMaxColumn("double1"),
                                               new NormalizingEstimator.MinMaxColumn("double4"),
                                               new NormalizingEstimator.BinningColumn("float1", "float1bin"),
                                               new NormalizingEstimator.BinningColumn("float4", "float4bin"),
                                               new NormalizingEstimator.BinningColumn("double1", "double1bin"),
                                               new NormalizingEstimator.BinningColumn("double4", "double4bin"),
                                               new NormalizingEstimator.SupervisedBinningColumn("float1", "float1supervisedbin", labelColumn: "int1"),
                                               new NormalizingEstimator.SupervisedBinningColumn("float4", "float4supervisedbin", labelColumn: "int1"),
                                               new NormalizingEstimator.SupervisedBinningColumn("double1", "double1supervisedbin", labelColumn: "int1"),
                                               new NormalizingEstimator.SupervisedBinningColumn("double4", "double4supervisedbin", labelColumn: "int1"),
                                               new NormalizingEstimator.MeanVarColumn("float1", "float1mv"),
                                               new NormalizingEstimator.MeanVarColumn("float4", "float4mv"),
                                               new NormalizingEstimator.MeanVarColumn("double1", "double1mv"),
                                               new NormalizingEstimator.MeanVarColumn("double4", "double4mv"),
                                               new NormalizingEstimator.LogMeanVarColumn("float1", "float1lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("float4", "float4lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("double1", "double1lmv"),
                                               new NormalizingEstimator.LogMeanVarColumn("double4", "double4lmv"));

            var data = loader.Read(dataPath);

            var badData1 = new ColumnCopyingTransformer(Env, ("int1", "float1")).Transform(data);
            var badData2 = new ColumnCopyingTransformer(Env, ("float0", "float4")).Transform(data);

            TestEstimatorCore(est, data, null, badData1);
            TestEstimatorCore(est, data, null, badData2);

            var outputPath = GetOutputPath("NormalizerEstimator", "normalized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                using (var fs = File.Create(outputPath))
                {
                    var dataView = ColumnSelectingTransformer.CreateDrop(Env, est.Fit(data).Transform(data), "float0");
                    DataSaverUtils.SaveDataView(ch, saver, dataView, fs, keepHidden: true);
                }
            }

            CheckEquality("NormalizerEstimator", "normalized.tsv");

            Done();
        }
Esempio n. 11
0
        void TestConcat()
        {
            string dataPath = GetDataPath("adult.tiny.with-schema.txt");

            var source = new MultiFileSource(dataPath);
            var loader = new TextLoader(ML, new TextLoader.Options
            {
                Columns = new[] {
                    new TextLoader.Column("float1", DataKind.Single, 9),
                    new TextLoader.Column("float4", DataKind.Single, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12) }),
                    new TextLoader.Column("float6", DataKind.Single, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12, 14) }),
                    new TextLoader.Column("vfloat", DataKind.Single, new[] { new TextLoader.Range(14, null)
                                                                             {
                                                                                 AutoEnd = false, VariableEnd = true
                                                                             } })
                },
                Separator = "\t",
                HasHeader = true
            }, new MultiFileSource(dataPath));
            var data = loader.Load(source);

            DataViewType GetType(DataViewSchema schema, string name)
            {
                Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
                return(schema[cIdx].Type);
            }

            var pipe = ML.Transforms.Concatenate("f1", "float1")
                       .Append(ML.Transforms.Concatenate("f2", "float1", "float1"))
                       .Append(ML.Transforms.Concatenate("f3", "float4", "float1"))
                       .Append(ML.Transforms.Concatenate("f4", "float6", "vfloat", "float1"));

            data = ML.Data.TakeRows(data, 10);
            data = pipe.Fit(data).Transform(data);

            DataViewType t;

            t = GetType(data.Schema, "f1");
            Assert.True(t is VectorDataViewType vt1 && vt1.ItemType == NumberDataViewType.Single && vt1.Size == 1);
            t = GetType(data.Schema, "f2");
            Assert.True(t is VectorDataViewType vt2 && vt2.ItemType == NumberDataViewType.Single && vt2.Size == 2);
            t = GetType(data.Schema, "f3");
            Assert.True(t is VectorDataViewType vt3 && vt3.ItemType == NumberDataViewType.Single && vt3.Size == 5);
            t = GetType(data.Schema, "f4");
            Assert.True(t is VectorDataViewType vt4 && vt4.ItemType == NumberDataViewType.Single && vt4.Size == 0);

            data = ML.Transforms.SelectColumns("f1", "f2", "f3", "f4").Fit(data).Transform(data);

            var subdir     = Path.Combine("Transform", "Concat");
            var outputPath = GetOutputPath(subdir, "Concat1.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true, Dense = true
                });
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, data, fs, keepHidden: false);
            }

            CheckEquality(subdir, "Concat1.tsv");
            Done();
        }
Esempio n. 12
0
        void TestConcat()
        {
            string dataPath = GetDataPath("adult.test");

            var source = new MultiFileSource(dataPath);
            var loader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[] {
                    new TextLoader.Column("float1", DataKind.R4, 0),
                    new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }),
                    new TextLoader.Column("vfloat", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10, null)
                                                                         {
                                                                             AutoEnd = false, VariableEnd = true
                                                                         } })
                },
                Separator = ",",
                HasHeader = true
            }, new MultiFileSource(dataPath));
            var data = loader.Read(source);

            ColumnType GetType(ISchema schema, string name)
            {
                Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
                return(schema.GetColumnType(cIdx));
            }

            var pipe = new ConcatEstimator(Env, "f1", "float1")
                       .Append(new ConcatEstimator(Env, "f2", "float1", "float1"))
                       .Append(new ConcatEstimator(Env, "f3", "float4", "float1"))
                       .Append(new ConcatEstimator(Env, "f4", "vfloat", "float1"));

            data = TakeFilter.Create(Env, data, 10);
            data = pipe.Fit(data).Transform(data);

            ColumnType t;

            t = GetType(data.Schema, "f1");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 1);
            t = GetType(data.Schema, "f2");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 2);
            t = GetType(data.Schema, "f3");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 5);
            t = GetType(data.Schema, "f4");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 0);

            data = new ChooseColumnsTransform(Env, data, "f1", "f2", "f3", "f4");

            var subdir     = Path.Combine("Transform", "Concat");
            var outputPath = GetOutputPath(subdir, "Concat1.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true, Dense = true
                });
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, data, fs, keepHidden: false);
            }

            CheckEquality(subdir, "Concat1.tsv");
            Done();
        }