Esempio n. 1
0
        /// <summary>
        /// Fits a word-bag estimator followed by an LDA estimator on the wikipedia detox sample, saves the
        /// first four rows of the "topics" column as text and asserts that the column is a vector of size 10.
        /// </summary>
        public void LdaWorkout()
        {
            IHostEnvironment env = new MLContext(seed: 42, conc: 1);
            string detoxPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            // Expected layout: boolean label in column 0, text in column 1.
            var textReader = TextLoaderStatic.CreateReader(env, ctx => (
                                                               label: ctx.LoadBool(0),
                                                               text: ctx.LoadText(1)), hasHeader: true);
            var data = textReader.Read(detoxPath);

            // Same file, but column 1 is loaded as float instead of text.
            // Only referenced by the disabled TestEstimatorCore call below.
            var floatReader = TextLoaderStatic.CreateReader(env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true);
            var invalidData = floatReader.Read(detoxPath);

            var wordBag = new WordBagEstimator(env, "bag_of_words", "text");
            var lda = new LatentDirichletAllocationEstimator(env, "topics", "bag_of_words", 10, numIterations: 10,
                                                             resetRandomGenerator: true);
            var est = wordBag.Append(lda);

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // In this test it manifests because of the WordBagEstimator in the estimator chain
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ldatopics.tsv");

            using (var ch = env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments
                {
                    Silent = true,
                    OutputHeader = false,
                    Dense = true
                };
                var saver = new TextSaver(env, saverArgs);

                // Fit on the full data, then keep four rows and only the "topics" column.
                var model = est.Fit(data.AsDynamic);
                var savedData = ML.Data.TakeRows(model.Transform(data.AsDynamic), 4);
                var topicSelector = ML.Transforms.SelectColumns("topics").Fit(savedData);
                savedData = topicSelector.Transform(savedData);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }

                // A non-vector column type makes the left side null and fails the assertion.
                var topicsType = savedData.Schema[0].Type as VectorType;
                Assert.Equal(10, topicsType?.Size);
            }

            // Disabling this check due to the following issue with consistency of output.
            // `seed` specified in ConsoleEnvironment has no effect.
            // https://github.com/dotnet/machinelearning/issues/1004
            // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
            // CheckEquality("Text", "ldatopics.tsv");
            Done();
        }
Esempio n. 2
0
        /// <summary>
        /// Saves a fully populated <see cref="PaymentInfo"/> through <see cref="TextSaver"/> and expects the
        /// returned result to report success.
        /// </summary>
        public void TestSavePaymentInfo()
        {
            // Arrange
            var payment = new PaymentInfo
            {
                BSB = "this is the bsb",
                AccountName = "account name is here",
                AccountNumber = "what's the number",
                Reference = "take reference",
                Amount = 1123.45
            };
            var textSaver = new TextSaver();

            // Act
            BpActionResult outcome = textSaver.SavePaymentInfo(payment);

            // Assert
            Assert.AreEqual(true, outcome.Success);
        }
        /// <summary>
        /// Fits a word-bag estimator followed by an LDA estimator on the wikipedia detox sample, saves the
        /// first four rows of the "topics" column as text and asserts that the column is a vector of size 10.
        /// </summary>
        public void LdaWorkout()
        {
            var env = new ConsoleEnvironment(seed: 42, conc: 1);
            string detoxPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            // Expected layout: boolean label in column 0, text in column 1.
            var textReader = TextLoader.CreateReader(env, ctx => (
                                                         label: ctx.LoadBool(0),
                                                         text: ctx.LoadText(1)), hasHeader: true);
            var data = textReader.Read(detoxPath);

            // Same file, but column 1 is loaded as float instead of text.
            // Only referenced by the disabled TestEstimatorCore call below.
            var floatReader = TextLoader.CreateReader(env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true);
            var invalidData = floatReader.Read(detoxPath);

            var wordBag = new WordBagEstimator(env, "text", "bag_of_words");
            var lda = new LdaEstimator(env, "bag_of_words", "topics", 10, advancedSettings: s =>
            {
                s.NumIterations = 10;
                s.ResetRandomGenerator = true;
            });
            var est = wordBag.Append(lda);

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ldatopics.tsv");

            using (var ch = env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments
                {
                    Silent = true,
                    OutputHeader = false,
                    Dense = true
                };
                var saver = new TextSaver(env, saverArgs);

                // Fit on the full data, then keep four rows and only the "topics" column.
                var model = est.Fit(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(env, model.Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(env, savedData, new[] { "topics" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }

                // A non-vector column type makes the left side null and fails the assertion.
                var topicsType = savedData.Schema.GetColumnType(0) as VectorType;
                Assert.Equal(10, topicsType?.Size);
            }

            // Disabling this check due to the following issue with consistency of output.
            // `seed` specified in ConsoleEnvironment has no effect.
            // https://github.com/dotnet/machinelearning/issues/1004
            // On single box, setting `s.ResetRandomGenerator = true` works but fails on build server
            // CheckEquality("Text", "ldatopics.tsv");
            Done();
        }
Esempio n. 4
0
        /// <summary>
        /// Builds a statically-typed missing-value replacement estimator over scalar and vector float/double
        /// columns, runs it through <c>TestEstimatorCore</c>, then saves four transformed rows of columns
        /// A-D and compares the file against the baseline.
        /// </summary>
        public void NAReplaceStatic()
        {
            string dataPath = GetDataPath("breast-cancer.txt");

            // Column 1 is loaded four ways: scalar/vector and float/double.
            var reader = TextLoader.CreateReader(Env, ctx => (
                                                     ScalarFloat: ctx.LoadFloat(1),
                                                     ScalarDouble: ctx.LoadDouble(1),
                                                     VectorFloat: ctx.LoadFloat(1, 4),
                                                     VectorDoulbe: ctx.LoadDouble(1, 4)
                                                     ));
            var data = reader.Read(new MultiFileSource(dataPath));

            // A single in-memory row used as the invalid input for TestEstimatorCore.
            var wrongRow = new TestClass()
            {
                A = 1,
                B = 3,
                C = new float[2] { 1, 2 },
                D = new double[2] { 3, 4 }
            };
            var wrongCollection = new[] { wrongRow };
            var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);

            var est = data.MakeNewEstimator().Append(row => (
                A: row.ScalarFloat.ReplaceWithMissingValues(NAReplaceTransform.ColumnInfo.ReplacementMode.Maximum),
                B: row.ScalarDouble.ReplaceWithMissingValues(NAReplaceTransform.ColumnInfo.ReplacementMode.Mean),
                C: row.VectorFloat.ReplaceWithMissingValues(NAReplaceTransform.ColumnInfo.ReplacementMode.Mean),
                D: row.VectorDoulbe.ReplaceWithMissingValues(NAReplaceTransform.ColumnInfo.ReplacementMode.Minimum)
                ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("NAReplace", "featurized.tsv");
            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

                // Keep the first four transformed rows and only the produced columns.
                var transformed = est.Fit(data).Transform(data);
                IDataView savedData = TakeFilter.Create(Env, transformed.AsDynamic, 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "A", "B", "C", "D");

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("NAReplace", "featurized.tsv");
            Done();
        }
Esempio n. 5
0
        /// <summary>
        /// Applies value-to-key mapping to float, double, int and text columns (scalar and vector) loaded
        /// from "adult.test", saves the first ten transformed rows and compares them against the baseline.
        /// </summary>
        void TestDifferentTypes()
        {
            string dataPath = GetDataPath("adult.test");

            var columns = new[]
            {
                new TextLoader.Column("float1", DataKind.R4, 0),
                new TextLoader.Column("float4", DataKind.R4, new[]
                {
                    new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10)
                }),
                new TextLoader.Column("double1", DataKind.R8, 0),
                new TextLoader.Column("double4", DataKind.R8, new[]
                {
                    new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10)
                }),
                new TextLoader.Column("int1", DataKind.I4, 0),
                new TextLoader.Column("text1", DataKind.TX, 1),
                new TextLoader.Column("text2", DataKind.TX, new[] { new TextLoader.Range(1), new TextLoader.Range(3) }),
            };
            var loaderArgs = new TextLoader.Arguments
            {
                Column = columns,
                Separator = ",",
                HasHeader = true
            };
            var loader = new TextLoader(Env, loaderArgs, new MultiFileSource(dataPath));

            // One mapping per loaded column.
            var mappings = new[]
            {
                new ValueToKeyMappingTransformer.ColumnInfo("float1", "TermFloat1"),
                new ValueToKeyMappingTransformer.ColumnInfo("float4", "TermFloat4"),
                new ValueToKeyMappingTransformer.ColumnInfo("double1", "TermDouble1"),
                new ValueToKeyMappingTransformer.ColumnInfo("double4", "TermDouble4"),
                new ValueToKeyMappingTransformer.ColumnInfo("int1", "TermInt1"),
                new ValueToKeyMappingTransformer.ColumnInfo("text1", "TermText1"),
                new ValueToKeyMappingTransformer.ColumnInfo("text2", "TermText2")
            };
            var pipe = new ValueToKeyMappingEstimator(Env, mappings);

            // Only the first ten rows are used for both fitting and saving.
            var data = loader.Read(dataPath);
            data = TakeFilter.Create(Env, data, 10);

            var outputPath = GetOutputPath("Term", "Term.tsv");
            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
                using (var fs = File.Create(outputPath))
                {
                    var keyed = pipe.Fit(data).Transform(data);
                    DataSaverUtils.SaveDataView(ch, saver, keyed, fs, keepHidden: true);
                }
            }

            CheckEquality("Term", "Term.tsv");
            Done();
        }
Esempio n. 6
0
        /// <summary>
        /// Maps string columns of "iris.txt" to keys, then exercises <see cref="KeyToValueMappingEstimator"/>
        /// on them: once through <c>TestEstimatorCore</c> with two invalid inputs, and once by saving the
        /// transformed data and comparing it against the baseline.
        /// </summary>
        public void KeyToValueWorkout()
        {
            string dataPath = GetDataPath("iris.txt");

            var loaderOptions = new TextLoader.Options
            {
                Columns = new[]
                {
                    new TextLoader.Column("ScalarString", DataKind.String, 1),
                    new TextLoader.Column("VectorString", DataKind.String, new[] { new TextLoader.Range(1, 4) }),
                    new TextLoader.Column("BareKey", DataKind.UInt32, new[] { new TextLoader.Range(0) }, new KeyCount(6))
                }
            };
            var reader = new TextLoader(Env, loaderOptions);
            var data = reader.Read(dataPath);

            // Produce key columns "A" and "B" from the string columns.
            var toKey = new ValueToKeyMappingEstimator(Env, new[]
            {
                new ValueToKeyMappingEstimator.ColumnInfo("A", "ScalarString"),
                new ValueToKeyMappingEstimator.ColumnInfo("B", "VectorString")
            });
            data = toKey.Fit(data).Transform(data);

            // Invalid inputs: "A" overwritten with BareKey, and "B" overwritten with the raw strings.
            var badData1 = new ColumnCopyingTransformer(Env, ("A", "BareKey")).Transform(data);
            var badData2 = new ColumnCopyingTransformer(Env, ("B", "VectorString")).Transform(data);

            var est = new KeyToValueMappingEstimator(Env, ("A_back", "A"), ("B_back", "B"));

            TestEstimatorCore(est, data, invalidInput: badData1);
            TestEstimatorCore(est, data, invalidInput: badData2);

            var outputPath = GetOutputPath("KeyToValue", "featurized.tsv");
            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
                IDataView savedData = est.Fit(data).Transform(data);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("KeyToValue", "featurized.tsv");
            Done();
        }
        /// <summary>
        /// Applies value-to-key mapping to float, double, int and text columns (scalar and vector) loaded
        /// from "adult.tiny.with-schema.txt", saves the first ten transformed rows and compares them
        /// against the baseline.
        /// </summary>
        void TestDifferentTypes()
        {
            string dataPath = GetDataPath("adult.tiny.with-schema.txt");

            var columns = new[]
            {
                new TextLoader.Column("float1", DataKind.Single, 9),
                new TextLoader.Column("float4", DataKind.Single, new[]
                {
                    new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12)
                }),
                new TextLoader.Column("double1", DataKind.Double, 9),
                new TextLoader.Column("double4", DataKind.Double, new[]
                {
                    new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12)
                }),
                new TextLoader.Column("int1", DataKind.Int32, 9),
                new TextLoader.Column("text1", DataKind.String, 1),
                new TextLoader.Column("text2", DataKind.String, new[] { new TextLoader.Range(1), new TextLoader.Range(2) }),
            };
            var loaderOptions = new TextLoader.Options
            {
                Columns = columns,
                Separator = "\t",
                HasHeader = true
            };
            var loader = new TextLoader(ML, loaderOptions, new MultiFileSource(dataPath));

            // One mapping per loaded column.
            var mappings = new[]
            {
                new ValueToKeyMappingEstimator.ColumnInfo("TermFloat1", "float1"),
                new ValueToKeyMappingEstimator.ColumnInfo("TermFloat4", "float4"),
                new ValueToKeyMappingEstimator.ColumnInfo("TermDouble1", "double1"),
                new ValueToKeyMappingEstimator.ColumnInfo("TermDouble4", "double4"),
                new ValueToKeyMappingEstimator.ColumnInfo("TermInt1", "int1"),
                new ValueToKeyMappingEstimator.ColumnInfo("TermText1", "text1"),
                new ValueToKeyMappingEstimator.ColumnInfo("TermText2", "text2")
            };
            var pipe = new ValueToKeyMappingEstimator(ML, mappings);

            // Only the first ten rows are used for both fitting and saving.
            var data = loader.Read(dataPath);
            data = ML.Data.TakeRows(data, 10);

            var outputPath = GetOutputPath("Term", "Term.tsv");
            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
                using (var fs = File.Create(outputPath))
                {
                    var keyed = pipe.Fit(data).Transform(data);
                    DataSaverUtils.SaveDataView(ch, saver, keyed, fs, keepHidden: true);
                }
            }

            CheckEquality("Term", "Term.tsv");
            Done();
        }
Esempio n. 8
0
        /// <summary>
        /// Writes the schema of <paramref name="idv"/> to a text file.
        /// </summary>
        /// <remarks>
        /// The data that is actually saved is an <see cref="EmptyDataView"/> built from the input schema, and
        /// the saver is configured with <c>OutputSchema = true</c>, <c>OutputHeader = false</c> and
        /// <c>Dense = true</c>. Only columns that are not hidden and that the saver reports as savable are
        /// passed to it.
        /// </remarks>
        /// <param name="idv">Data view whose schema is saved.</param>
        /// <param name="path">Destination file. It is created, or truncated if it already exists.</param>
        /// <param name="host">Host used to construct the empty data view and the saver.</param>
        private static void SaveIdvSchemaToFile(IDataView idv, string path, IHost host)
        {
            var emptyDataView = new EmptyDataView(host, idv.Schema);
            var saverArgs     = new TextSaver.Arguments
            {
                OutputHeader = false,
                OutputSchema = true,
                Dense        = true
            };
            IDataSaver saver = new TextSaver(host, saverArgs);

            // File.Create truncates an existing file. File.OpenWrite would keep the old bytes beyond the
            // newly written content when the previous file was longer, leaving stale data after the schema.
            using (var fs = File.Create(path))
            {
                saver.SaveData(fs, emptyDataView, Utils.GetIdentityPermutation(emptyDataView.Schema.Count)
                               .Where(x => !emptyDataView.Schema[x].IsHidden && saver.IsColumnSavable(emptyDataView.Schema[x].Type))
                               .ToArray());
            }
        }
Esempio n. 9
0
        /// <summary>
        /// Runs the missing-value indicator estimator over scalar and vector float/double columns of
        /// "breast-cancer.txt", validates it with <c>TestEstimatorCore</c>, then saves four transformed rows
        /// and compares the file against the baseline.
        /// </summary>
        public void NAIndicatorFileOutput()
        {
            string dataPath = GetDataPath("breast-cancer.txt");

            // Column 1 is loaded four ways: scalar/vector and float/double.
            var reader = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                           ScalarFloat: ctx.LoadFloat(1),
                                                           ScalarDouble: ctx.LoadDouble(1),
                                                           VectorFloat: ctx.LoadFloat(1, 4),
                                                           VectorDoulbe: ctx.LoadDouble(1, 4)
                                                           ));
            var data = reader.Load(new MultiFileSource(dataPath)).AsDynamic;

            // A single in-memory row used as the invalid input for TestEstimatorCore.
            var wrongRow = new TestClass()
            {
                A = 1,
                B = 3,
                C = new float[2] { 1, 2 },
                D = new double[2] { 3, 4 }
            };
            var wrongCollection = new[] { wrongRow };
            var invalidData = ML.Data.LoadFromEnumerable(wrongCollection);

            var columnPairs = new[]
            {
                new InputOutputColumnPair("A", "ScalarFloat"),
                new InputOutputColumnPair("B", "ScalarDouble"),
                new InputOutputColumnPair("C", "VectorFloat"),
                new InputOutputColumnPair("D", "VectorDoulbe")
            };
            var est = ML.Transforms.IndicateMissingValues(columnPairs);

            TestEstimatorCore(est, data, invalidInput: invalidData);

            var outputPath = GetOutputPath("NAIndicator", "featurized.tsv");
            using (var ch = ((IHostEnvironment)ML).Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
                var transformed = est.Fit(data).Transform(data);
                var savedData = ML.Data.TakeRows(transformed, 4);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("NAIndicator", "featurized.tsv");
            Done();
        }
        /// <summary>
        /// Runs the missing-value indicator estimator over scalar and vector float/double columns of the
        /// breast-cancer training file, validates it with <c>TestEstimatorCore</c>, then saves four
        /// transformed rows and compares the file against the baseline.
        /// </summary>
        public void NAIndicatorFileOutput()
        {
            string dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename);

            // Column 1 is loaded four ways: scalar/vector and float/double.
            var columns = new[]
            {
                new TextLoader.Column("ScalarFloat", DataKind.Single, 1),
                new TextLoader.Column("ScalarDouble", DataKind.Double, 1),
                new TextLoader.Column("VectorFloat", DataKind.Single, 1, 4),
                new TextLoader.Column("VectorDoulbe", DataKind.Double, 1, 4)
            };
            var data = ML.Data.LoadFromTextFile(dataPath, columns);

            // A single in-memory row used as the invalid input for TestEstimatorCore.
            var wrongRow = new TestClass()
            {
                A = 1,
                B = 3,
                C = new float[2] { 1, 2 },
                D = new double[2] { 3, 4 }
            };
            var wrongCollection = new[] { wrongRow };
            var invalidData = ML.Data.LoadFromEnumerable(wrongCollection);

            var columnPairs = new[]
            {
                new InputOutputColumnPair("A", "ScalarFloat"),
                new InputOutputColumnPair("B", "ScalarDouble"),
                new InputOutputColumnPair("C", "VectorFloat"),
                new InputOutputColumnPair("D", "VectorDoulbe")
            };
            var est = ML.Transforms.IndicateMissingValues(columnPairs);

            TestEstimatorCore(est, data, invalidInput: invalidData);

            var outputPath = GetOutputPath("NAIndicator", "featurized.tsv");
            using (var ch = ((IHostEnvironment)ML).Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });
                var transformed = est.Fit(data).Transform(data);
                var savedData = ML.Data.TakeRows(transformed, 4);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("NAIndicator", "featurized.tsv");
            Done();
        }
Esempio n. 11
0
        /// <summary>
        /// Builds a statically-typed one-hot encoding estimator with every scalar and vector output kind
        /// used below, validates it with <c>TestEstimatorCore</c>, then saves four transformed rows of
        /// columns A-E and compares the file against the baseline.
        /// </summary>
        public void CategoricalStatic()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var reader = TextLoaderStatic.CreateReader(Env, ctx => (
                                                           ScalarString: ctx.LoadText(1),
                                                           VectorString: ctx.LoadText(1, 4)));
            var data = reader.Read(dataPath);

            // Two in-memory rows used as the invalid input for TestEstimatorCore.
            var firstWrongRow = new TestClass() { A = 1, B = 2, C = 3, };
            var secondWrongRow = new TestClass() { A = 4, B = 5, C = 6 };
            var wrongCollection = new[] { firstWrongRow, secondWrongRow };
            var invalidData = ML.Data.ReadFromEnumerable(wrongCollection);

            var est = data.MakeNewEstimator().Append(row => (
                A: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Ind),
                B: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Ind),
                C: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bag),
                D: row.ScalarString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotScalarOutputKind.Bin),
                E: row.VectorString.OneHotEncoding(outputKind: CategoricalStaticExtensions.OneHotVectorOutputKind.Bin)
                ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Categorical", "featurized.tsv");
            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

                // Keep the first four transformed rows and only the produced columns.
                var transformed = est.Fit(data).Transform(data);
                var savedData = TakeFilter.Create(Env, transformed.AsDynamic, 4);
                var selector = new ColumnSelectingTransformer(Env, new string[] { "A", "B", "C", "D", "E" }, null, false);
                var view = selector.Transform(savedData);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
                }
            }

            CheckEquality("Categorical", "featurized.tsv");
            Done();
        }
        /// <summary>
        /// Builds a statically-typed one-hot hash encoding estimator with every scalar and vector output
        /// kind used below, validates it with <c>TestEstimatorCore</c>, then saves four transformed rows of
        /// columns A-E and compares the file against the baseline.
        /// </summary>
        public void CategoricalHashStatic()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var reader = TextLoader.CreateReader(Env, ctx => (
                                                     ScalarString: ctx.LoadText(1),
                                                     VectorString: ctx.LoadText(1, 4)));
            var data = reader.Read(dataPath);

            // Two in-memory rows used as the invalid input for TestEstimatorCore.
            var firstWrongRow = new TestClass() { A = "1", B = "2", C = "3", };
            var secondWrongRow = new TestClass() { A = "4", B = "5", C = "6" };
            var wrongCollection = new[] { firstWrongRow, secondWrongRow };
            var invalidData = ComponentCreation.CreateDataView(Env, wrongCollection);

            var est = data.MakeNewEstimator().Append(row => (
                A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
                B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),
                C: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bag),
                D: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Bin),
                E: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Bin)
                ));

            TestEstimatorCore(est.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("CategoricalHash", "featurized.tsv");
            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

                // Keep the first four transformed rows and only the produced columns.
                var transformed = est.Fit(data).Transform(data);
                var savedData = TakeFilter.Create(Env, transformed.AsDynamic, 4);
                var view = SelectColumnsTransform.CreateKeep(Env, savedData, new[] { "A", "B", "C", "D", "E" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, view, fs, keepHidden: true);
                }
            }

            CheckEquality("CategoricalHash", "featurized.tsv");
            Done();
        }
Esempio n. 13
0
        /// <summary>
        /// Fits three statically-typed normalizers with <c>onFit</c> callbacks on a three-slot float vector
        /// and checks that each callback delivered an array of length 3. The first ten transformed rows
        /// are then written to the console.
        /// </summary>
        public void NormalizerWithOnFit()
        {
            var env = new ConsoleEnvironment(seed: 0);
            var dataSource = new MultiFileSource(GetDataPath("generated_regression_dataset.csv"));

            var reader = TextLoader.CreateReader(env,
                                                 c => c.LoadFloat(0, 2),
                                                 separator: ';', hasHeader: true);
            var data = reader.Read(dataSource);

            // Filled in by the onFit callbacks when Fit is called below.
            ImmutableArray <float> mm;
            ImmutableArray <float> ss;
            ImmutableArray <ImmutableArray <float> > bb;

            // The lambda parameter name "r" also names the pass-through column selected further down.
            var est = reader.MakeNewEstimator()
                      .Append(r => (r,
                                    ncdf: r.NormalizeByCumulativeDistribution(onFit: (m, s) => mm = m),
                                    n: r.NormalizeByMeanVar(onFit: (s, o) => { ss = s; Assert.Empty(o); }),
                                    b: r.NormalizeByBinning(onFit: b => bb = b)));
            var model = est.Fit(data);
            var tdata = model.Transform(data);

            Assert.Equal(3, mm.Length);
            Assert.Equal(3, ss.Length);
            Assert.Equal(3, bb.Length);

            // Just for fun, dump some of the transformed rows to the console.
            using (var stream = new MemoryStream())
            {
                IDataView preview = new ChooseColumnsTransform(env, tdata.AsDynamic, "r", "ncdf", "n", "b");
                preview = TakeFilter.Create(env, preview, 10);

                var saverArgs = new TextSaver.Arguments()
                {
                    Dense = true,
                    Separator = ",",
                    OutputHeader = false
                };
                var saver = new TextSaver(env, saverArgs);

                var allColumns = Utils.GetIdentityPermutation(preview.Schema.ColumnCount);
                saver.SaveData(stream, preview, allColumns);

                var text = Encoding.UTF8.GetString(stream.ToArray());
                Console.WriteLine(text);
            }
        }
Esempio n. 14
0
        /// <summary>
        /// Checks that we can load IDataViews defined with unknown cardinality KeyType: reads
        /// "schema-codec-test.idv", saves it as text and compares the result against the baseline.
        /// </summary>
        public void OldKeyTypeCodecTest()
        {
            // schema-codec-test.idv was generated with the following command before simplifying the KeyType:
            // dotnet MML.dll savedata loader=text{col=A:U4[0-2]:0 col=B:U4[0-5]:0 col=C:U1[0-10]:0 col=D:U2[0-*]:0 col=E:U4[0-*]:0 col=F:U8[0-*]:0} dout=codectest.idv
            var data = ML.Data.ReadFromBinary(GetDataPath("schema-codec-test.idv"));

            var outputPath = GetOutputPath("BinaryLoaderSaver", "OldKeyTypeCodecTest.txt");

            // The previously constructed TextSaver local was never used (SaveAsText does the saving), so it
            // has been removed. The "save" channel is not passed to SaveAsText either, but it is still opened
            // and disposed around the write exactly as before.
            using (var ch = Env.Start("save"))
            {
                using (var fs = File.Create(outputPath))
                {
                    ML.Data.SaveAsText(data, fs);
                }
            }

            CheckEquality("BinaryLoaderSaver", "OldKeyTypeCodecTest.txt");
            Done();
        }
        /// <summary>
        /// Writes a data view to a stream in text format.
        /// </summary>
        /// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog; supplies the environment.</param>
        /// <param name="data">The data view to write.</param>
        /// <param name="stream">The destination stream.</param>
        /// <param name="separatorChar">The character placed between columns.</param>
        /// <param name="headerRow">Whether a header row is written.</param>
        /// <param name="schema">Whether the header comment describing the schema is written.</param>
        /// <param name="keepHidden">Whether hidden columns are kept in the output.</param>
        public static void SaveAsText(this DataOperationsCatalog catalog,
                                      IDataView data,
                                      Stream stream,
                                      char separatorChar = TextLoader.DefaultArguments.Separator,
                                      bool headerRow     = TextLoader.DefaultArguments.HasHeader,
                                      bool schema        = true,
                                      bool keepHidden    = false)
        {
            // Validate arguments in declaration order before touching the environment.
            Contracts.CheckValue(catalog, nameof(catalog));
            Contracts.CheckValue(data, nameof(data));
            Contracts.CheckValue(stream, nameof(stream));

            var host = catalog.GetEnvironment();
            var saverOptions = new TextSaver.Arguments
            {
                Separator = separatorChar.ToString(),
                OutputHeader = headerRow,
                OutputSchema = schema
            };
            var textSaver = new TextSaver(host, saverOptions);

            using (var channel = host.Start("Saving data"))
            {
                DataSaverUtils.SaveDataView(channel, textSaver, data, stream, keepHidden);
            }
        }
Esempio n. 16
0
        /// <summary>
        /// Creates a lambda filter over the breast-cancer examples, checks that its row count is unknown,
        /// re-applies it to a fresh data view and saves the "Label" column of the result as text.
        /// </summary>
        public void LambdaTransformCreate()
        {
            var env = new MLContext(seed: 42);
            var examples = ReadBreastCancerExamples();
            var sourceView = env.CreateDataView(examples);

            var filter = LambdaTransform.CreateFilter<BreastCancerExample, object>(
                env,
                sourceView,
                (input, state) => input.Label == 0,
                null);

            Assert.Null(filter.GetRowCount());

            // Re-apply the same filter to a new data view built from the same examples.
            var applied = env.CreateDataView(examples);
            applied = ApplyTransformUtils.ApplyAllTransformsToData(env, filter, applied);

            var saver = new TextSaver(env, new TextSaver.Arguments());
            Assert.True(applied.Schema.TryGetColumnIndex("Label", out int label));

            var outputPath = GetOutputPath(OutputRelativePath, "lambda-output.tsv");
            using (var fs = File.Create(outputPath))
            {
                saver.SaveData(fs, applied, label);
            }
        }
Esempio n. 17
0
        /// <summary>
        /// Removes every occurrence of a symbol from the words of a text.
        /// </summary>
        /// <param name="deleteSymblol">The symbol to remove.</param>
        /// <param name="text">The text whose <c>Words</c> entries are rewritten in place.</param>
        public static void Delete(string deleteSymblol, ref TextSaver text)
        {
            int position = 0;        // Index of the current word inside text.Words.
            bool removedAny = false; // Set once at least one word contained the symbol.

            foreach (string word in text.Words)
            {
                if (word.Contains(deleteSymblol))
                {
                    text.Words[position] = word.Replace(deleteSymblol, "");
                    removedAny = true;
                }

                position++;
            }

            // NOTE(review): Synchro() presumably propagates the edited words back to the text — confirm.
            text.Synchro();

            if (removedAny)
            {
                return;
            }

            Console.WriteLine($"This file does not have this \"{deleteSymblol}\"");
        }
Esempio n. 18
0
        /// <summary>
        /// Exercises the word-embeddings extractor: loads the sentiment training file,
        /// normalizes and tokenizes the text, removes default stop words, runs the
        /// estimator checks, and saves 4 rows of the "WordEmbeddings" column to
        /// "wordEmbeddings.tsv" before calling CheckEquality on that file.
        /// </summary>
        public void TestWordEmbeddings()
        {
            // GetDataPath already resolves the data file location; the resolved path is
            // passed straight to Read below instead of being resolved a second time.
            var dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename);
            var data     = new TextLoader(Env,
                                          new TextLoader.Arguments()
            {
                Separator = "\t",
                HasHeader = true,
                Columns   = new[]
                {
                    new TextLoader.Column("Label", DataKind.BL, 0),
                    new TextLoader.Column("SentimentText", DataKind.Text, 1)
                }
            }).Read(dataPath);

            // Text preparation: normalize -> tokenize -> drop default stop words.
            var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
                      .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
                      .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
            var words = est.Fit(data).Transform(data);

            var pipe = ML.Transforms.Text.ExtractWordEmbeddings("WordEmbeddings", "CleanWords", modelKind: WordEmbeddingsExtractingTransformer.PretrainedModelKind.Sswe);

            // The raw data (without the "CleanWords" column) serves as the invalid input.
            TestEstimatorCore(pipe, words, invalidInput: data);

            var outputPath = GetOutputPath("Text", "wordEmbeddings.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, pipe.Fit(words).Transform(words), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "WordEmbeddings" });

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }
            CheckEquality("Text", "wordEmbeddings.tsv");
            Done();
        }
        /// <summary>
        /// Tokenizes the "text" column with an explicit separator set, keeps 4 rows of the
        /// resulting "words" column, saves them to "tokenizedWithSeparators.tsv" and calls
        /// CheckEquality on that file.
        /// </summary>
        public void TokenizeWithSeparators()
        {
            string inputPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var loader = TextLoaderStatic.CreateLoader(Env, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadText(1)), hasHeader: true);
            var data = loader.Load(inputPath).AsDynamic;

            // Split on space and on the listed punctuation characters.
            var separators = new[] { ' ', '?', '!', '.', ',' };
            var tokenizer = new WordTokenizingEstimator(Env, "words", "text", separators: separators);

            var tokenized = tokenizer.Fit(data).Transform(data);
            var firstRows = ML.Data.TakeRows(tokenized, 4);
            var wordsOnly = ML.Transforms.SelectColumns("words").Fit(firstRows).Transform(firstRows);

            var textSaver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
            var targetPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");

            using (var channel = Env.Start("save"))
            using (var output = File.Create(targetPath))
            {
                DataSaverUtils.SaveDataView(channel, textSaver, wordsOnly, output, keepHidden: true);
            }

            CheckEquality("Text", "tokenizedWithSeparators.tsv");
            Done();
        }
Esempio n. 20
0
        /// <summary>
        /// Saves <paramref name="idv"/> to <paramref name="path"/>, choosing the saver from
        /// the file extension: ".csv", ".tsv" and ".txt" use a <see cref="TextSaver"/>
        /// (comma separator for ".csv", tab otherwise); any other extension uses a
        /// <see cref="BinarySaver"/> and additionally writes a ".schema" file next to the
        /// data file. Nothing is written when <paramref name="path"/> equals STDNULL.
        /// </summary>
        /// <param name="idv">The data view to save.</param>
        /// <param name="path">Destination file path; an existing file is overwritten.</param>
        /// <param name="host">Host passed to the savers and to the schema writer.</param>
        private static void SaveIdvToFile(IDataView idv, string path, IHost host)
        {
            if (path == STDNULL)
            {
                return;
            }

            var        extension = Path.GetExtension(path);
            IDataSaver saver;

            if (extension != ".csv" && extension != ".tsv" && extension != ".txt")
            {
                saver = new BinarySaver(host, new BinarySaver.Arguments());

                // Path.ChangeExtension keeps the directory part exactly as given, so a bare
                // file name stays relative instead of becoming a root-anchored path (which
                // concatenating GetDirectoryName + separator would produce).
                var schemaFilePath = Path.ChangeExtension(path, ".schema");
                SaveIdvSchemaToFile(idv, schemaFilePath, host);
            }
            else
            {
                var saverArgs = new TextSaver.Arguments
                {
                    OutputHeader = true,
                    OutputSchema = true,
                    Dense        = true,

                    Separator = extension == ".csv" ? "comma" : "tab"
                };
                saver = new TextSaver(host, saverArgs);
            }

            // Only columns that are not hidden and whose type the saver can write are saved.
            var columnIndices = Utils.GetIdentityPermutation(idv.Schema.Count)
                                .Where(x => !idv.Schema[x].IsHidden && saver.IsColumnSavable(idv.Schema[x].Type))
                                .ToArray();

            // File.Create truncates an existing file. File.OpenWrite does not, so a shorter
            // payload would leave stale bytes from the previous content at the end.
            using (var fs = File.Create(path))
            {
                saver.SaveData(fs, idv, columnIndices);
            }
        }
Esempio n. 21
0
        /// <summary>
        /// Fits a rank-5 PCA estimator on the generated regression dataset, saves 4 rows of
        /// the "pca" column to "pca.tsv" (no header) and calls CheckEquality on that file.
        /// </summary>
        public void PcaWorkout()
        {
            var    env        = new ConsoleEnvironment(seed: 1, conc: 1);
            string dataSource = GetDataPath("generated_regression_dataset.csv");

            var validReader = TextLoader.CreateReader(env,
                                                      c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                      separator: ';', hasHeader: true);
            var data = validReader.Read(new MultiFileSource(dataSource));

            // Same file, but the feature columns are loaded as text instead of floats.
            var invalidReader = TextLoader.CreateReader(env,
                                                        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                        separator: ';', hasHeader: true);
            var invalidData = invalidReader.Read(new MultiFileSource(dataSource));

            var est = new PcaEstimator(env, "features", "pca", rank: 5, advancedSettings: settings => {
                settings.Seed = 1;
            });

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("PCA", "pca.tsv");

            using (var ch = env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments { Silent = true, OutputHeader = false };
                var saver     = new TextSaver(env, saverArgs);

                var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
                var firstRows   = TakeFilter.Create(env, transformed, 4);
                IDataView savedData = new ChooseColumnsTransform(env, firstRows, "pca");

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("PCA", "pca.tsv");
            Done();
        }
Esempio n. 22
0
        /// <summary>
        /// Chains word tokenization, term mapping, n-gram and hashed n-gram extraction over
        /// the sentiment data, saves 4 rows of the produced columns to "ngrams.tsv" and calls
        /// CheckEquality on that file.
        /// </summary>
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var validReader = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadText(1)), hasHeader: true);
            var data = validReader.Read(new MultiFileSource(sentimentDataPath));

            // Same file, but the text column is loaded as a float.
            var invalidReader = TextLoader.CreateReader(Env, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadFloat(1)), hasHeader: true);
            var invalidData = invalidReader.Read(new MultiFileSource(sentimentDataPath));

            var est = new WordTokenizer(Env, "text", "text")
                      .Append(new TermEstimator(Env, "text", "terms"))
                      .Append(new NgramEstimator(Env, "terms", "ngrams"))
                      .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

                var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
                var firstRows   = TakeFilter.Create(Env, transformed, 4);
                IDataView savedData = new ChooseColumnsTransform(Env, firstRows, "text", "terms", "ngrams", "ngramshash");

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
        /// <summary>
        /// Runs the text featurizer over the sentiment data with an output-tokens column,
        /// performs the estimator checks, saves 4 rows of the "Data" and "OutputTokens"
        /// columns to "featurized.tsv" and calls CheckEquality on that file.
        /// </summary>
        public void TextFeaturizerWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var validColumns = new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.String, 1)
            };
            var data = ML.Data.LoadFromTextFile(sentimentDataPath, validColumns,
                                                hasHeader: true, allowQuoting: true);

            // Same file, but the text column is declared as Single.
            var invalidColumns = new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.Single, 1)
            };
            var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, invalidColumns,
                                                       hasHeader: true, allowQuoting: true);

            var featurizerOptions = new TextFeaturizingEstimator.Options {
                OutputTokensColumnName = "OutputTokens"
            };
            var feat = ML.Transforms.Text.FeaturizeText("Data", featurizerOptions, new[] { "text" });

            TestEstimatorCore(feat, data, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "featurized.tsv");

            using (var ch = ((IHostEnvironment)ML).Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });

                var savedData = ML.Data.TakeRows(feat.Fit(data).Transform(data), 4);
                var columnSelector = ML.Transforms.SelectColumns("Data", "OutputTokens");
                savedData = columnSelector.Fit(savedData).Transform(savedData);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("Text", "featurized.tsv");
            Done();
        }
Esempio n. 24
0
        /// <summary>
        /// Writes the role/column-name associations of <paramref name="schema"/> into
        /// <paramref name="rep"/> as a two-column ("Role", "Column") text data view.
        /// </summary>
        internal static void SaveRoleMappings(IHostEnvironment env, IChannel ch, RoleMappedSchema schema, RepositoryWriter rep)
        {
            // REVIEW: Should we also save this stuff, for instance, in some portion of the
            // score command or transform?
            Contracts.AssertValue(env);
            env.AssertValue(ch);
            ch.AssertValue(schema);

            // OrderBy is a stable sort, so columns sharing one role keep their relative order.
            var orderedMappings = schema.GetColumnRoleNames()
                                  .OrderBy(r => r.Key.Value)
                                  .ToArray();
            string[] roleNames   = orderedMappings.Select(mapping => mapping.Key.Value).ToArray();
            string[] columnNames = orderedMappings.Select(mapping => mapping.Value).ToArray();

            var viewBuilder = new ArrayDataViewBuilder(env);
            viewBuilder.AddColumn("Role", roleNames);
            viewBuilder.AddColumn("Column", columnNames);

            using (var entry = rep.CreateEntry(DirTrainingInfo, RoleMappingFile))
            {
                // REVIEW: The role mappings should stay easy for a human to read and edit,
                // which is why the text saver is used. The trade-off is that special
                // characters such as '\n' in a role or column name will not round-trip
                // through the text saver/loader; such names are not expected in practice.
                var saverArgs = new TextSaver.Arguments()
                {
                    Dense = true, Silent = true
                };
                var textSaver = new TextSaver(env, saverArgs);

                var view = viewBuilder.GetDataView();
                var allColumns = Utils.GetIdentityPermutation(view.Schema.ColumnCount);
                textSaver.SaveData(entry.Stream, view, allColumns);
            }
        }
Esempio n. 25
0
        /// <summary>
        /// Static-pipeline variant of the text featurizer workout: featurizes the "text"
        /// column with an output-tokens column, performs the estimator checks, saves 4 rows
        /// of the "Data" and "OutputTokens" columns to "featurized.tsv" and calls
        /// CheckEquality on that file.
        /// </summary>
        public void TextFeaturizerWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var validLoader = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadText(1)), hasHeader: true);
            var data = validLoader.Load(sentimentDataPath);

            // Same file, but the text column is loaded as a float.
            var invalidLoader = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                                  label: ctx.LoadBool(0),
                                                                  text: ctx.LoadFloat(1)), hasHeader: true);
            var invalidData = invalidLoader.Load(sentimentDataPath).AsDynamic;

            var feat = data.MakeNewEstimator()
                       .Append(row => row.text.FeaturizeText(options: new TextFeaturizingEstimator.Options {
                OutputTokensColumnName = "OutputTokens",
            }));

            TestEstimatorCore(feat.AsDynamic, data.AsDynamic, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "featurized.tsv");

            using (var ch = ((IHostEnvironment)ML).Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });

                var savedData = ML.Data.TakeRows(feat.Fit(data).Transform(data).AsDynamic, 4);
                var columnSelector = ML.Transforms.SelectColumns("Data", "OutputTokens");
                savedData = columnSelector.Fit(savedData).Transform(savedData);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("Text", "featurized.tsv");
            Done();
        }
Esempio n. 26
0
            /// <summary>
            /// Initializes the state from the owning <see cref="TextSaver"/>: copies its host
            /// and formatting settings, stores the writer and value pipes, and allocates the
            /// working buffers.
            /// </summary>
            /// <param name="parent">Saver supplying the host, separator and density settings.</param>
            /// <param name="writer">Destination text writer.</param>
            /// <param name="pipes">Value writers; their count sizes the per-column arrays.</param>
            /// <param name="hasHeader">Requested header flag; combined with the parent's header setting.</param>
            public State(TextSaver parent, TextWriter writer, ValueWriter[] pipes, bool hasHeader)
            {
                Contracts.AssertValue(parent);
                Contracts.AssertValue(parent._host);
                _host = parent._host;
                _host.AssertValue(writer);
                _host.AssertValue(pipes);

                // Output targets supplied by the caller.
                _writer = writer;
                _pipes  = pipes;

                // Formatting settings taken from the parent saver.
                _sepStr  = parent._sepStr;
                _sepChar = parent._sepChar;
                _dense   = parent._forceDense;

                // A header is only produced when both the caller and the parent ask for one.
                _hasHeader = parent._outputHeader && hasHeader;

                // Per-column arrays hold one entry more than there are pipes.
                int columnEntries = _pipes.Length + 1;
                _mpcoldst  = new int[columnEntries];
                _mpcolslot = new int[columnEntries];

                // Initial sizes of the character and per-slot buffers.
                _rgch         = new char[1024];
                _mpslotdst    = new int[128];
                _mpslotichLim = new int[128];
            }
        /// <summary>
        /// Builds a bag-of-words followed by count-based and mutual-information feature
        /// selection, saves 4 rows of the two selected-feature columns to
        /// "featureselection.tsv" and calls CheckEquality on that file.
        /// </summary>
        public void FeatureSelectionWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var validColumns = new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.String, 1)
            };
            var data = ML.Data.LoadFromTextFile(sentimentDataPath, validColumns,
                                                hasHeader: true, allowQuoting: true, allowSparse: true);

            // Same file, but the text column is declared as Single.
            var invalidColumns = new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.Single, 1)
            };
            var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, invalidColumns,
                                                       hasHeader: true, allowQuoting: true, allowSparse: true);

            var est = new WordBagEstimator(ML, "bag_of_words", "text")
                      .AppendCacheCheckpoint(ML)
                      .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words_count", "bag_of_words", 10)
                              .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumnName: "label")));

            var outputPath = GetOutputPath("FeatureSelection", "featureselection.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true });

                var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);
                var columnSelector = ML.Transforms.SelectColumns("bag_of_words_count", "bag_of_words_mi");
                savedData = columnSelector.Fit(savedData).Transform(savedData);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("FeatureSelection", "featureselection.tsv");
            Done();
        }
        /// <summary>
        /// Chains word tokenization, value-to-key mapping, n-gram extraction and n-gram
        /// hashing over the sentiment data, performs the estimator checks, saves 4 rows of
        /// the produced columns to "ngrams.tsv" and calls CheckEquality on that file.
        /// </summary>
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var validReader = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadText(1)), hasHeader: true);
            var data = validReader.Read(sentimentDataPath);

            // Same file, but the text column is loaded as a float.
            var invalidReader = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                  label: ctx.LoadBool(0),
                                                                  text: ctx.LoadFloat(1)), hasHeader: true);
            var invalidData = invalidReader.Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(Env, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
                      .Append(new NgramExtractingEstimator(Env, "terms", "ngrams"))
                      .Append(new NgramHashingEstimator(Env, "terms", "ngramshash"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

                var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(Env, transformed, 4);
                var keptColumns = new[] { "text", "terms", "ngrams", "ngramshash" };
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, keptColumns);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
        /// <summary>
        /// Chains a word-bag and a hashed word-bag estimator over the sentiment data, saves
        /// 4 rows of the produced columns to "bag_of_words.tsv" and calls CheckEquality on
        /// that file.
        /// </summary>
        public void WordBagWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            var validReader = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadText(1)), hasHeader: true);
            var data = validReader.Read(sentimentDataPath);

            // Same file, but the text column is loaded as a float.
            var invalidReader = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                  label: ctx.LoadBool(0),
                                                                  text: ctx.LoadFloat(1)), hasHeader: true);
            var invalidData = invalidReader.Read(sentimentDataPath);

            var est = new WordBagEstimator(Env, "text", "bag_of_words")
                      .Append(new WordHashBagEstimator(Env, "text", "bag_of_wordshash", invertHash: -1));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "bag_of_words.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

                var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
                IDataView savedData = TakeFilter.Create(Env, transformed, 4);
                var keptColumns = new[] { "text", "bag_of_words", "bag_of_wordshash" };
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, keptColumns);

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("Text", "bag_of_words.tsv");
            Done();
        }
Esempio n. 30
0
        /// <summary>
        /// Chains Lp normalization, global contrast normalization and whitening over the
        /// generated regression dataset, performs the estimator checks, saves 4 rows of the
        /// three produced columns to "lpnorm_gcnorm_whitened.tsv" (no header) and calls
        /// CheckEquality on that file with 4 digits of precision.
        /// </summary>
        public void LpGcNormAndWhiteningWorkout()
        {
            // NOTE(review): loading and the estimators use this local environment, while the
            // saving code below uses the Env member - confirm the mix is intentional.
            var    env        = new ConsoleEnvironment(seed: 0);
            string dataSource = GetDataPath("generated_regression_dataset.csv");

            var validReader = TextLoader.CreateReader(env,
                                                      c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                      separator: ';', hasHeader: true);
            var data = validReader.Read(new MultiFileSource(dataSource));

            // Same file, but the feature columns are loaded as text instead of floats.
            var invalidReader = TextLoader.CreateReader(env,
                                                        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                        separator: ';', hasHeader: true);
            var invalidData = invalidReader.Read(new MultiFileSource(dataSource));

            var est = new LpNormalizer(env, "features", "lpnorm")
                      .Append(new GlobalContrastNormalizer(env, "features", "gcnorm"))
                      .Append(new Whitening(env, "features", "whitened"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "lpnorm_gcnorm_whitened.tsv");

            using (var ch = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments { Silent = true, OutputHeader = false };
                var saver     = new TextSaver(Env, saverArgs);

                var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
                var firstRows   = TakeFilter.Create(Env, transformed, 4);
                IDataView savedData = new ChooseColumnsTransform(Env, firstRows, "lpnorm", "gcnorm", "whitened");

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("Text", "lpnorm_gcnorm_whitened.tsv", digitsOfPrecision: 4);
            Done();
        }