        public void TokenizeWithSeparators()
        {
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data     = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadText(1)), hasHeader: true)
                              .Read(dataPath).AsDynamic;

            var est       = new WordTokenizingEstimator(Env, "text", "words", separators: new[] { ' ', '?', '!', '.', ',' });
            var outdata   = TakeFilter.Create(Env, est.Fit(data).Transform(data), 4);
            var savedData = ColumnSelectingTransformer.CreateKeep(Env, outdata, new[] { "words" });

            var saver = new TextSaver(Env, new TextSaver.Arguments {
                Silent = true
            });
            var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");

            using (var ch = Env.Start("save"))
            {
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }
            CheckEquality("Text", "tokenizedWithSeparators.tsv");
            Done();
        }
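A quick way to see what the separators option changes: by default the word tokenizer splits on whitespace only, so punctuation stays attached to the preceding token; listing punctuation characters as separators splits it off. The sketch below is a minimal, hypothetical illustration of that behavior through the TokenizeIntoWords catalog extension (assuming ML.NET 1.x; the Input and Output helper classes are invented for this example).

        // requires: using System; using System.Linq; using Microsoft.ML;
        public class Input  { public string   Text  { get; set; } }
        public class Output { public string[] Words { get; set; } }

        public static void SeparatorDemo()
        {
            var ml   = new MLContext();
            var data = ml.Data.LoadFromEnumerable(new[] {
                new Input { Text = "Hello, world! Is this tokenized?" }
            });

            // Split on whitespace and on the listed punctuation characters.
            var pipeline = ml.Transforms.Text.TokenizeIntoWords(
                "Words", "Text", separators: new[] { ' ', '?', '!', '.', ',' });

            var transformed = pipeline.Fit(data).Transform(data);
            var row         = ml.Data.CreateEnumerable<Output>(transformed, reuseRowObject: false).First();

            // Prints: Hello | world | Is | this | tokenized
            // (consecutive separators do not produce empty tokens).
            Console.WriteLine(string.Join(" | ", row.Words));
        }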
        public void TextTokenizationWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(ML, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(ML, "words", "text")
                      .Append(new TokenizingByCharactersEstimator(ML, "chars", "text"))
                      .Append(new KeyToValueMappingEstimator(ML, "chars"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "tokenized.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);

            savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "text", "words", "chars" });

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "tokenized.tsv");
            Done();
        }
Example #3
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Load(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Load(sentimentDataPath);

            var est = new WordTokenizingEstimator(ML, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
                      .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
                      .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);

            savedData = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash").Fit(savedData).Transform(savedData);

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
        public void TokenizeWithSeparators()
        {
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data     = ML.Data.LoadFromTextFile(dataPath, new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.String, 1)
            }, hasHeader: true);

            var est       = new WordTokenizingEstimator(Env, "words", "text", separators: new[] { ' ', '?', '!', '.', ',' });
            var outdata   = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);
            var savedData = ML.Transforms.SelectColumns("words").Fit(outdata).Transform(outdata);

            var saver = new TextSaver(Env, new TextSaver.Arguments {
                Silent = true
            });
            var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");

            using (var ch = Env.Start("save"))
            {
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }
            CheckEquality("Text", "tokenizedWithSeparators.tsv");
            Done();
        }
        public void TextTokenizationWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.String, 1)
            }, hasHeader: true);

            var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.Single, 1)
            }, hasHeader: true);

            var est = new WordTokenizingEstimator(ML, "words", "text")
                      .Append(new TokenizingByCharactersEstimator(ML, "chars", "text"))
                      .Append(new KeyToValueMappingEstimator(ML, "chars"));

            TestEstimatorCore(est, data, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "tokenized.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);

            savedData = ML.Transforms.SelectColumns("text", "words", "chars").Fit(savedData).Transform(savedData);

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "tokenized.tsv");
            Done();
        }
Example #6
        public void WordTokenizeWorkout()
        {
            var data = new[] {
                new TestClass() {
                    A = "This is a good sentence.",
                    B = new string[2] { "Much words", "Wow So Cool" }
                }
            };
            var dataView    = ML.Data.ReadFromEnumerable(data);
            var invalidData = new[] {
                new TestWrong() {
                    A = 1,
                    B = new float[2] { 2, 3 }
                }
            };
            var invalidDataView = ML.Data.ReadFromEnumerable(invalidData);
            var pipe            = new WordTokenizingEstimator(Env, new[] {
                new WordTokenizingEstimator.ColumnInfo("TokenizeA", "A"),
                new WordTokenizingEstimator.ColumnInfo("TokenizeB", "B"),
            });

            TestEstimatorCore(pipe, dataView, invalidInput: invalidDataView);

            // Reuse the pipe trained on dataView in TestEstimatorCore to make predictions.
            var result = pipe.Fit(dataView).Transform(dataView);

            // Extract the transformed result of the first row (the only row we have because data contains only one TestClass) as a native class.
            var nativeResult = ML.CreateEnumerable<NativeResult>(result, false).First();

            // Check the tokenization of A. Expected result is { "This", "is", "a", "good", "sentence." }.
            var tokenizeA = new[] { "This", "is", "a", "good", "sentence." };

            Assert.True(tokenizeA.Length == nativeResult.TokenizeA.Length);
            for (int i = 0; i < tokenizeA.Length; ++i)
            {
                Assert.Equal(tokenizeA[i], nativeResult.TokenizeA[i]);
            }

            // Check the tokenization of B. Expected result is { "Much", "words", "Wow", "So", "Cool" }. One might expect
            // a 2-D array { { "Much", "words" }, { "Wow", "So", "Cool" } } instead, but note that ML.NET may flatten
            // higher-dimensional outputs into a single vector.
            var tokenizeB = new[] { "Much", "words", "Wow", "So", "Cool" };

            Assert.True(tokenizeB.Length == nativeResult.TokenizeB.Length);
            for (int i = 0; i < tokenizeB.Length; ++i)
            {
                Assert.Equal(tokenizeB[i], nativeResult.TokenizeB[i]);
            }

            Done();
        }
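The WordTokenizeWorkout test above references TestClass, TestWrong, and NativeResult without defining them. Reconstructed purely from how the test uses them, they presumably look like the following (a hedged sketch; the repository's exact definitions may differ, for instance by using public fields instead of properties):

        public class TestClass
        {
            public string   A { get; set; }   // tokenized into "TokenizeA"
            public string[] B { get; set; }   // tokenized into "TokenizeB"
        }

        public class TestWrong
        {
            // Same column names but numeric types, so the estimator must reject this schema.
            public float   A { get; set; }
            public float[] B { get; set; }
        }

        public class NativeResult
        {
            // Output columns produced by the WordTokenizingEstimator.
            public string[] TokenizeA { get; set; }
            public string[] TokenizeB { get; set; }
        }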
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(Env, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
                      .Append(new NgramEstimator(Env, "terms", "ngrams"))
                      .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "terms", "ngrams", "ngramshash" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
        public void TextTokenizationWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(Env, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(Env, "text", "words")
                      .Append(new TokenizingByCharactersEstimator(Env, "text", "chars"))
                      .Append(new KeyToValueMappingEstimator(Env, "chars"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "tokenized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "words", "chars" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "tokenized.tsv");
            Done();
        }
Example #9
        public void TestOldSavingAndLoading()
        {
            var data = new[] {
                new TestClass() {
                    A = "This is a good sentence.",
                    B = new string[2] { "Much words", "Wow So Cool" }
                }
            };

            var dataView = ML.Data.ReadFromEnumerable(data);
            var pipe     = new WordTokenizingEstimator(Env, new[] {
                new WordTokenizingEstimator.ColumnInfo("TokenizeA", "A"),
                new WordTokenizingEstimator.ColumnInfo("TokenizeB", "B"),
            });
            var result      = pipe.Fit(dataView).Transform(dataView);
            var resultRoles = new RoleMappedData(result);

            using (var ms = new MemoryStream())
            {
                TrainUtils.SaveModel(Env, Env.Start("saving"), ms, null, resultRoles);
                ms.Position = 0;
                var loadedView = ModelFileUtils.LoadTransforms(Env, dataView, ms);
            }
        }
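TestOldSavingAndLoading goes through the low-level TrainUtils.SaveModel / ModelFileUtils.LoadTransforms path. For comparison, this is a hedged sketch of the same round trip through the newer MLContext.Model API (assuming ML.NET 1.x and the TestClass shape reconstructed earlier):

        public static void SaveAndLoadViaModelCatalog()
        {
            // requires: using System.IO; using Microsoft.ML;
            var ml       = new MLContext();
            var dataView = ml.Data.LoadFromEnumerable(new[] {
                new TestClass { A = "This is a good sentence.", B = new[] { "Much words", "Wow So Cool" } }
            });

            var model = ml.Transforms.Text.TokenizeIntoWords("TokenizeA", "A").Fit(dataView);

            using (var ms = new MemoryStream())
            {
                // Serialize the fitted transformer together with the schema it expects as input.
                ml.Model.Save(model, dataView.Schema, ms);
                ms.Position = 0;

                // Reload the transformer (and the recorded input schema) and apply it again.
                var loaded      = ml.Model.Load(ms, out var inputSchema);
                var transformed = loaded.Transform(dataView);
            }
        }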
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.String, 1)
            }, hasHeader: true, allowQuoting: true);

            var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.Single, 1)
            }, hasHeader: true, allowQuoting: true);

            var est = new WordTokenizingEstimator(ML, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
                      .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
                      .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"))
                      // Also exercise invert hashing; however, only the original non-inverted
                      // column is written to the baseline file.
                      .Append(new NgramHashingEstimator(ML, "ngramshashinvert", "terms", maximumNumberOfInverts: 2));

            TestEstimatorCore(est, data, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);

            savedData = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash").Fit(savedData).Transform(savedData);

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
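The tests above instantiate the estimators directly. In application code the same n-gram pipeline is usually composed through the MLContext catalog extensions; a hedged sketch, assuming the ML.NET 1.x catalog names TokenizeIntoWords, MapValueToKey, ProduceNgrams, and ProduceHashedNgrams:

        public static void NgramPipelineViaCatalog()
        {
            // requires: using Microsoft.ML;
            var ml = new MLContext();

            var ngramPipeline = ml.Transforms.Text.TokenizeIntoWords("text", "text")
                                .Append(ml.Transforms.Conversion.MapValueToKey("terms", "text"))
                                // Count-based n-grams over the key column; with the default useAllLengths: true,
                                // ngramLength: 2 emits unigrams as well as bigrams.
                                .Append(ml.Transforms.Text.ProduceNgrams("ngrams", "terms", ngramLength: 2))
                                // Hash-based n-grams bound the feature space without building a dictionary.
                                .Append(ml.Transforms.Text.ProduceHashedNgrams("ngramshash", "terms"));
        }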
Example #11
        public static void KeyToValueValueToKey()
        {
            // Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
            // as well as serving as the source of randomness.
            var ml = new MLContext();

            // Get a small dataset as an IEnumerable and load it into an ML.NET data view.
            IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData();
            var trainData = ml.Data.ReadFromEnumerable(data);

            // Preview of one of the columns of the topics data.
            // The Review column contains the keys associated with a particular body of text.
            //
            // Review
            // "animals birds cats dogs fish horse"
            // "horse birds house fish duck cats"
            // "car truck driver bus pickup"
            // "car truck driver bus pickup horse"

            // A pipeline to convert the terms of the 'Review' column into keys,
            // making use of the default settings.
            string defaultColumnName = "DefaultKeys";
            // REVIEW create through the catalog extension
            var default_pipeline = new WordTokenizingEstimator(ml, "Review")
                                   .Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));

            // Another pipeline that customizes the advanced settings of the ValueToKeyMappingEstimator.
            // We can change maxNumKeys to limit how many keys get generated out of the set of words,
            // and control the order in which they are evaluated by changing sort from the default Occurrence
            // (the order in which they are encountered) to Value (alphabetical).
            string customizedColumnName = "CustomizedKeys";
            var    customized_pipeline  = new WordTokenizingEstimator(ml, "Review")
                                          .Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));

            // The transformed data.
            var transformedData_default    = default_pipeline.Fit(trainData).Transform(trainData);
            var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData);

            // Small helper to print the text inside the columns, in the console.
            Action<string, IEnumerable<VBuffer<uint>>> printHelper = (columnName, column) =>
            {
                Console.WriteLine($"{columnName} column obtained post-transformation.");
                foreach (var row in column)
                {
                    foreach (var value in row.GetValues())
                    {
                        Console.Write($"{value} ");
                    }
                    Console.WriteLine("");
                }

                Console.WriteLine("===================================================");
            };

            // Preview of the DefaultKeys column obtained after processing the input.
            var defaultColumn = transformedData_default.GetColumn<VBuffer<uint>>(ml, defaultColumnName);

            printHelper(defaultColumnName, defaultColumn);

            // DefaultKeys column obtained post-transformation.
            //
            // 1 2 3 4 5 6
            // 6 2 7 5 8 3
            // 9 10 11 12 13 3
            // 9 10 11 12 13 6

            // Previewing the CustomizedKeys column obtained after processing the input.
            var customizedColumn = transformedData_customized.GetColumn<VBuffer<uint>>(ml, customizedColumnName);

            printHelper(customizedColumnName, customizedColumn);

            // CustomizedKeys column obtained post-transformation.
            //
            // 1 2 4 5 7 8
            // 8 2 9 7 6 4
            // 3 10 0 0 0 4
            // 3 10 0 0 0 8

            // Retrieve the original values by appending the KeyToValue estimator to the existing pipeline
            // to convert the keys back to strings.
            var pipeline = default_pipeline.Append(ml.Transforms.Conversion.MapKeyToValue(defaultColumnName));

            transformedData_default = pipeline.Fit(trainData).Transform(trainData);

            // Preview of the DefaultKeys column after mapping the keys back to their original values.
            var originalColumnBack = transformedData_default.GetColumn<VBuffer<ReadOnlyMemory<char>>>(ml, defaultColumnName);

            foreach (var row in originalColumnBack)
            {
                foreach (var value in row.GetValues())
                {
                    Console.Write($"{value} ");
                }
                Console.WriteLine("");
            }

            // DefaultKeys column obtained post-transformation.
            //
            // animals birds cats dogs fish horse
            // horse birds house fish duck cats
            // car truck driver bus pickup cats
            // car truck driver bus pickup horse
        }
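Addressing the REVIEW note in the sample above, the same two pipelines can be expressed through the catalog extensions. A hedged sketch, assuming ML.NET 1.x, where the maxNumKeys and sort parameters were renamed maximumNumberOfKeys and keyOrdinality:

        public static void KeyToValueValueToKeyViaCatalog()
        {
            // requires: using Microsoft.ML; using Microsoft.ML.Transforms;
            var ml = new MLContext();

            var defaultPipeline = ml.Transforms.Text.TokenizeIntoWords("Review")
                                  .Append(ml.Transforms.Conversion.MapValueToKey("DefaultKeys", "Review"));

            var customizedPipeline = ml.Transforms.Text.TokenizeIntoWords("Review")
                                     .Append(ml.Transforms.Conversion.MapValueToKey("CustomizedKeys", "Review",
                                                                                    maximumNumberOfKeys: 10,
                                                                                    keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue));

            // Keys map back to their original string values the same way as before:
            var roundTripPipeline = defaultPipeline.Append(ml.Transforms.Conversion.MapKeyToValue("DefaultKeys"));
        }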