Example #1
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Load(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Load(sentimentDataPath);

            var est = new WordTokenizingEstimator(ML, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
                      .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
                      .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);

            savedData = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash").Fit(savedData).Transform(savedData);

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
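        // For comparison: a minimal sketch of the same tokenize -> key -> n-gram
        // pipeline written against the MLContext transform catalog instead of the
        // estimator constructors used in the test above. This assumes the
        // ML.NET 1.x catalog API surface.
        var ml = new MLContext();
        var pipeline = ml.Transforms.Text.TokenizeIntoWords("text", "text")
            .Append(ml.Transforms.Conversion.MapValueToKey("terms", "text"))
            .Append(ml.Transforms.Text.ProduceNgrams("ngrams", "terms"))
            .Append(ml.Transforms.Text.ProduceHashedNgrams("ngramshash", "terms"));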
        public void TokenizeWithSeparators()
        {
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data     = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadText(1)), hasHeader: true)
                              .Read(dataPath).AsDynamic;

            var est       = new WordTokenizingEstimator(Env, "text", "words", separators: new[] { ' ', '?', '!', '.', ',' });
            var outdata   = TakeFilter.Create(Env, est.Fit(data).Transform(data), 4);
            var savedData = ColumnSelectingTransformer.CreateKeep(Env, outdata, new[] { "words" });

            var saver = new TextSaver(Env, new TextSaver.Arguments {
                Silent = true
            });
            var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");

            using (var ch = Env.Start("save"))
            {
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }
            CheckEquality("Text", "tokenizedWithSeparators.tsv");
            Done();
        }
        public void TextTokenizationWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(ML, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(ML, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(ML, "words", "text")
                      .Append(new TokenizingByCharactersEstimator(ML, "chars", "text"))
                      .Append(new KeyToValueMappingEstimator(ML, "chars"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "tokenized.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);

            savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "text", "words", "chars" });

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "tokenized.tsv");
            Done();
        }
        public void TokenizeWithSeparators()
        {
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = ML.Data.LoadFromTextFile(dataPath, new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.String, 1)
            }, hasHeader: true);

            var est       = new WordTokenizingEstimator(Env, "words", "text", separators: new[] { ' ', '?', '!', '.', ',' });
            var outdata   = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);
            var savedData = ML.Transforms.SelectColumns("words").Fit(outdata).Transform(outdata);

            var saver = new TextSaver(Env, new TextSaver.Arguments {
                Silent = true
            });
            var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");

            using (var ch = Env.Start("save"))
            {
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }
            CheckEquality("Text", "tokenizedWithSeparators.tsv");
            Done();
        }
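        // Catalog form of the custom-separator tokenization above; a sketch
        // assuming the ML.NET 1.x TokenizeIntoWords extension.
        var ml = new MLContext();
        var tokenize = ml.Transforms.Text.TokenizeIntoWords(
            "words", "text", separators: new[] { ' ', '?', '!', '.', ',' });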
        public void TextTokenizationWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.String, 1)
            }, hasHeader: true);

            var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.Single, 1)
            }, hasHeader: true);

            var est = new WordTokenizingEstimator(ML, "words", "text")
                      .Append(new TokenizingByCharactersEstimator(ML, "chars", "text"))
                      .Append(new KeyToValueMappingEstimator(ML, "chars"));

            TestEstimatorCore(est, data, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "tokenized.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);

            savedData = ML.Transforms.SelectColumns("text", "words", "chars").Fit(savedData).Transform(savedData);

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "tokenized.tsv");
            Done();
        }
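        // Once transformed, the token column can be pulled back into managed
        // objects. A sketch continuing from savedData above, assuming the
        // GetColumn extension from Microsoft.ML.Data handles the variable-size
        // text vector as string[]:
        foreach (var row in savedData.GetColumn<string[]>("words"))
            Console.WriteLine(string.Join(" | ", row));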
Example #6
        public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(RegistrationName);
            h.CheckValue(args, nameof(args));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

            // Compose the WordBagTransform from a tokenize transform,
            // followed by a NgramExtractionTransform.
            // Since WordBagTransform is a many-to-one column transform, for each
            // WordBagTransform.Column with multiple sources, we first apply a ConcatTransform.

            // REVIEW: In order to not get ngrams that cross between vector slots, we need to
            // enable tokenize transforms to insert a special token between slots.

            // REVIEW: In order to make it possible to output separate bags for different columns
            // using the same dictionary, we need to find a way to make ConcatTransform remember the boundaries.

            var tokenizeColumns = new WordTokenizingTransformer.ColumnInfo[args.Column.Length];

            var extractorArgs =
                new NgramExtractorTransform.Arguments()
                {
                    MaxNumTerms = args.MaxNumTerms,
                    NgramLength = args.NgramLength,
                    SkipLength = args.SkipLength,
                    AllLengths = args.AllLengths,
                    Weighting = args.Weighting,
                    Column = new NgramExtractorTransform.Column[args.Column.Length]
                };

            for (int iinfo = 0; iinfo < args.Column.Length; iinfo++)
            {
                var column = args.Column[iinfo];
                h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name));
                h.CheckUserArg(Utils.Size(column.Source) > 0, nameof(column.Source));
                h.CheckUserArg(column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source));

                tokenizeColumns[iinfo] = new WordTokenizingTransformer.ColumnInfo(column.Source.Length > 1 ? column.Name : column.Source[0], column.Name);

                extractorArgs.Column[iinfo] =
                    new NgramExtractorTransform.Column()
                    {
                        Name = column.Name,
                        Source = column.Name,
                        MaxNumTerms = column.MaxNumTerms,
                        NgramLength = column.NgramLength,
                        SkipLength = column.SkipLength,
                        Weighting = column.Weighting,
                        AllLengths = column.AllLengths
                    };
            }

            IDataView view = input;
            view = NgramExtractionUtils.ApplyConcatOnSources(h, args.Column, view);
            view = new WordTokenizingEstimator(env, tokenizeColumns).Fit(view).Transform(view);
            return NgramExtractorTransform.Create(h, extractorArgs, view);
        }
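        // The composition above (concat -> tokenize -> n-gram extraction) is what
        // the word-bag catalog extension wraps. A hedged sketch of the one-call
        // form, assuming the ML.NET 1.x API:
        var ml = new MLContext();
        var wordBags = ml.Transforms.Text.ProduceWordBags("WordBag", "text");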
Example #7
        public void WordTokenizeWorkout()
        {
            var data = new[] { new TestClass() { A = "This is a good sentence.", B = new string[2] { "Much words", "Wow So Cool" } } };
            var dataView    = ML.Data.ReadFromEnumerable(data);
            var invalidData = new[] { new TestWrong() { A = 1, B = new float[2] { 2, 3 } } };
            var invalidDataView = ML.Data.ReadFromEnumerable(invalidData);
            var pipe            = new WordTokenizingEstimator(Env, new[] {
                new WordTokenizingEstimator.ColumnInfo("TokenizeA", "A"),
                new WordTokenizingEstimator.ColumnInfo("TokenizeB", "B"),
            });

            TestEstimatorCore(pipe, dataView, invalidInput: invalidDataView);

            // Reuse the pipe trained on dataView in TestEstimatorCore to make prediction.
            var result = pipe.Fit(dataView).Transform(dataView);

            // Extract the transformed result of the first row (the only row we have because data contains only one TestClass) as a native class.
            var nativeResult = ML.CreateEnumerable<NativeResult>(result, false).First();

            // Check the tokenization of A. Expected result is { "This", "is", "a", "good", "sentence." }.
            var tokenizeA = new[] { "This", "is", "a", "good", "sentence." };

            Assert.True(tokenizeA.Length == nativeResult.TokenizeA.Length);
            for (int i = 0; i < tokenizeA.Length; ++i)
            {
                Assert.Equal(tokenizeA[i], nativeResult.TokenizeA[i]);
            }

            // Check the tokenization of B. Expected result is { "Much", "words", "Wow", "So", "Cool" }. One may expect
            // a 2-D array { { "Much", "words" }, { "Wow", "So", "Cool" } }, but note that ML.NET flattens outputs that
            // would otherwise be higher-dimensional tensors.
            var tokenizeB = new[] { "Much", "words", "Wow", "So", "Cool" };

            Assert.True(tokenizeB.Length == nativeResult.TokenizeB.Length);
            for (int i = 0; i < tokenizeB.Length; ++i)
            {
                Assert.Equal(tokenizeB[i], nativeResult.TokenizeB[i]);
            }

            Done();
        }
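        // NativeResult is not shown in this excerpt; inferred from its usage
        // above, it presumably looks like this (property names match the output
        // column names):
        private sealed class NativeResult
        {
            public string[] TokenizeA { get; set; }
            public string[] TokenizeB { get; set; }
        }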
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoader.CreateReader(Env, ctx => (
                                                      label: ctx.LoadBool(0),
                                                      text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(Env, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
                      .Append(new NgramEstimator(Env, "terms", "ngrams"))
                      .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

            // The following call fails because of the following issue
            // https://github.com/dotnet/machinelearning/issues/969
            // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "terms", "ngrams", "ngramshash" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
        public void TextTokenizationWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var    data = TextLoaderStatic.CreateReader(Env, ctx => (
                                                            label: ctx.LoadBool(0),
                                                            text: ctx.LoadText(1)), hasHeader: true)
                          .Read(sentimentDataPath);

            var invalidData = TextLoaderStatic.CreateReader(Env, ctx => (
                                                                label: ctx.LoadBool(0),
                                                                text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(sentimentDataPath);

            var est = new WordTokenizingEstimator(Env, "text", "words")
                      .Append(new TokenizingByCharactersEstimator(Env, "text", "chars"))
                      .Append(new KeyToValueMappingEstimator(Env, "chars"));

            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "tokenized.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "words", "chars" });

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "tokenized.tsv");
            Done();
        }
Example #10
        public void TestOldSavingAndLoading()
        {
            var data = new[] { new TestClass() { A = "This is a good sentence.", B = new string[2] { "Much words", "Wow So Cool" } } };

            var dataView = ML.Data.ReadFromEnumerable(data);
            var pipe     = new WordTokenizingEstimator(Env, new[] {
                new WordTokenizingEstimator.ColumnInfo("TokenizeA", "A"),
                new WordTokenizingEstimator.ColumnInfo("TokenizeB", "B"),
            });
            var result      = pipe.Fit(dataView).Transform(dataView);
            var resultRoles = new RoleMappedData(result);

            using (var ms = new MemoryStream())
            {
                TrainUtils.SaveModel(Env, Env.Start("saving"), ms, null, resultRoles);
                ms.Position = 0;
                var loadedView = ModelFileUtils.LoadTransforms(Env, dataView, ms);
            }
        }
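        // On current ML.NET releases the same round trip is typically done via
        // the model operations catalog; a minimal sketch, assuming the 1.x API:
        var transformer = pipe.Fit(dataView);
        using (var ms = new MemoryStream())
        {
            ML.Model.Save(transformer, dataView.Schema, ms);
            ms.Position = 0;
            var loaded = ML.Model.Load(ms, out var inputSchema);
        }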
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.String, 1)
            }, hasHeader: true, allowQuoting: true);

            var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] {
                new TextLoader.Column("label", DataKind.Boolean, 0),
                new TextLoader.Column("text", DataKind.Single, 1)
            }, hasHeader: true, allowQuoting: true);

            var est = new WordTokenizingEstimator(ML, "text", "text")
                      .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
                      .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
                      .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"))
                      // Also have a situation where we use invert hashing. However we only write
                      // the original non-inverted column to the actual baseline file.
                      .Append(new NgramHashingEstimator(ML, "ngramshashinvert", "terms", maximumNumberOfInverts: 2));

            TestEstimatorCore(est, data, invalidInput: invalidData);

            var outputPath = GetOutputPath("Text", "ngrams.tsv");
            var savedData  = ML.Data.TakeRows(est.Fit(data).Transform(data), 4);

            savedData = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash").Fit(savedData).Transform(savedData);

            using (var fs = File.Create(outputPath))
                ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
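        // With maximumNumberOfInverts set, the hashed column carries readable
        // slot names built from the original n-grams. A sketch of inspecting
        // them, assuming the public GetSlotNames extension used in the ML.NET
        // samples, continuing from est and data above:
        var transformed = est.Fit(data).Transform(data);
        VBuffer<ReadOnlyMemory<char>> slotNames = default;
        transformed.Schema["ngramshashinvert"].GetSlotNames(ref slotNames);
        foreach (var name in slotNames.DenseValues())
            Console.WriteLine(name);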
Example #12
        public void WordTokenizeWorkout()
        {
            var data = new[] { new TestClass() { A = "This is a good sentence.", B = new string[2] { "Much words", "Wow So Cool" } } };
            var dataView    = ComponentCreation.CreateDataView(Env, data);
            var invalidData = new[] { new TestWrong() { A = 1, B = new float[2] { 2, 3 } } };
            var invalidDataView = ComponentCreation.CreateDataView(Env, invalidData);
            var pipe            = new WordTokenizingEstimator(Env, new[] {
                new WordTokenizeTransform.ColumnInfo("A", "TokenizeA"),
                new WordTokenizeTransform.ColumnInfo("B", "TokenizeB"),
            });

            TestEstimatorCore(pipe, dataView, invalidInput: invalidDataView);
            Done();
        }
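        // TestClass and TestWrong are not shown in this excerpt; inferred from
        // the initializers above, they presumably look like:
        private sealed class TestClass
        {
            public string A;
            public string[] B;
        }

        private sealed class TestWrong
        {
            public float A;
            public float[] B;
        }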
        internal static ITransformer CreateTransformer(IHostEnvironment env, Options options, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(RegistrationName);

            h.CheckValue(options, nameof(options));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

            // To each input column of the WordHashBagTransform, a tokenize transform is applied,
            // followed by a WordHashVectorizeTransform.
            // Since WordHashBagTransform is a many-to-one column transform, for each
            // WordHashBagTransform.Column we may need to define multiple tokenize transform columns.
            // NgramHashExtractorTransform may need to define an identical number of HashTransform.Columns.
            // The intermediate columns are dropped at the end by using a DropColumnsTransform.
            IDataView view = input;

            var uniqueSourceNames = NgramExtractionUtils.GenerateUniqueSourceNames(h, options.Columns, view.Schema);

            Contracts.Assert(uniqueSourceNames.Length == options.Columns.Length);

            var tokenizeColumns = new List<WordTokenizingEstimator.ColumnOptions>();
            var extractorCols   = new NgramHashExtractingTransformer.Column[options.Columns.Length];
            var colCount        = options.Columns.Length;
            List<string> tmpColNames = new List<string>();

            for (int iinfo = 0; iinfo < colCount; iinfo++)
            {
                var column      = options.Columns[iinfo];
                int srcCount    = column.Source.Length;
                var curTmpNames = new string[srcCount];
                Contracts.Assert(uniqueSourceNames[iinfo].Length == options.Columns[iinfo].Source.Length);
                for (int isrc = 0; isrc < srcCount; isrc++)
                {
                    tokenizeColumns.Add(new WordTokenizingEstimator.ColumnOptions(curTmpNames[isrc] = uniqueSourceNames[iinfo][isrc], options.Columns[iinfo].Source[isrc]));
                }

                tmpColNames.AddRange(curTmpNames);
                extractorCols[iinfo] =
                    new NgramHashExtractingTransformer.Column
                {
                    Name                   = column.Name,
                    Source                 = curTmpNames,
                    NumberOfBits           = column.NumberOfBits,
                    NgramLength            = column.NgramLength,
                    Seed                   = column.Seed,
                    SkipLength             = column.SkipLength,
                    Ordered                = column.Ordered,
                    MaximumNumberOfInverts = column.MaximumNumberOfInverts,
                    FriendlyNames          = options.Columns[iinfo].Source,
                    UseAllLengths          = column.UseAllLengths
                };
            }

            ITransformer t1 = new WordTokenizingEstimator(env, tokenizeColumns.ToArray()).Fit(view);

            var featurizeArgs =
                new NgramHashExtractingTransformer.Options
            {
                UseAllLengths          = options.UseAllLengths,
                NumberOfBits           = options.NumberOfBits,
                NgramLength            = options.NgramLength,
                SkipLength             = options.SkipLength,
                Ordered                = options.Ordered,
                Seed                   = options.Seed,
                Columns                = extractorCols.ToArray(),
                MaximumNumberOfInverts = options.MaximumNumberOfInverts
            };

            view = t1.Transform(view);
            ITransformer t2 = NgramHashExtractingTransformer.Create(h, featurizeArgs, view);

            // Since we added columns with new names, we need to explicitly drop them before we return the transformer chain.
            ITransformer t3 = new ColumnSelectingTransformer(env, null, tmpColNames.ToArray());

            return new TransformerChain<ITransformer>(new[] { t1, t2, t3 });
        }
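        // The tokenize -> hashed-n-gram -> drop-temp-columns chain built here is
        // what the hashed word-bag catalog extension exposes in one call. A
        // hedged sketch, assuming the ML.NET 1.x API:
        var ml = new MLContext();
        var hashedBags = ml.Transforms.Text.ProduceHashedWordBags(
            "HashedBag", "text", numberOfBits: 16, ngramLength: 1);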
Example #14
        public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register(RegistrationName);

            h.CheckValue(args, nameof(args));
            h.CheckValue(input, nameof(input));
            h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified");

            // To each input column of the WordHashBagTransform, a tokenize transform is applied,
            // followed by a WordHashVectorizeTransform.
            // Since WordHashBagTransform is a many-to-one column transform, for each
            // WordHashBagTransform.Column we may need to define multiple tokenize transform columns.
            // NgramHashExtractorTransform may need to define an identical number of HashTransform.Columns.
            // The intermediate columns are dropped at the end by using a DropColumnsTransform.
            IDataView view = input;

            var uniqueSourceNames = NgramExtractionUtils.GenerateUniqueSourceNames(h, args.Column, view.Schema);

            Contracts.Assert(uniqueSourceNames.Length == args.Column.Length);

            var tokenizeColumns = new List<WordTokenizingTransformer.ColumnInfo>();
            var extractorCols   = new NgramHashExtractingTransformer.Column[args.Column.Length];
            var colCount        = args.Column.Length;
            List<string> tmpColNames = new List<string>();

            for (int iinfo = 0; iinfo < colCount; iinfo++)
            {
                var column      = args.Column[iinfo];
                int srcCount    = column.Source.Length;
                var curTmpNames = new string[srcCount];
                Contracts.Assert(uniqueSourceNames[iinfo].Length == args.Column[iinfo].Source.Length);
                for (int isrc = 0; isrc < srcCount; isrc++)
                {
                    tokenizeColumns.Add(new WordTokenizingTransformer.ColumnInfo(args.Column[iinfo].Source[isrc], curTmpNames[isrc] = uniqueSourceNames[iinfo][isrc]));
                }

                tmpColNames.AddRange(curTmpNames);
                extractorCols[iinfo] =
                    new NgramHashExtractingTransformer.Column
                {
                    Name          = column.Name,
                    Source        = curTmpNames,
                    HashBits      = column.HashBits,
                    NgramLength   = column.NgramLength,
                    Seed          = column.Seed,
                    SkipLength    = column.SkipLength,
                    Ordered       = column.Ordered,
                    InvertHash    = column.InvertHash,
                    FriendlyNames = args.Column[iinfo].Source,
                    AllLengths    = column.AllLengths
                };
            }

            view = new WordTokenizingEstimator(env, tokenizeColumns.ToArray()).Fit(view).Transform(view);

            var featurizeArgs =
                new NgramHashExtractingTransformer.Arguments
            {
                AllLengths  = args.AllLengths,
                HashBits    = args.HashBits,
                NgramLength = args.NgramLength,
                SkipLength  = args.SkipLength,
                Ordered     = args.Ordered,
                Seed        = args.Seed,
                Column      = extractorCols.ToArray(),
                InvertHash  = args.InvertHash
            };

            view = NgramHashExtractingTransformer.Create(h, featurizeArgs, view);

            // Since we added columns with new names, we need to explicitly drop them before we return the IDataTransform.
            return ColumnSelectingTransformer.CreateDrop(h, view, tmpColNames.ToArray());
        }
Example #15
        public static void KeyToValueValueToKey()
        {
            // Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
            // and as the source of randomness.
            var ml = new MLContext();

            // Get a small dataset as an IEnumerable and load it into an ML.NET data set.
            IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData();
            var trainData = ml.Data.ReadFromEnumerable(data);

            // Preview of one of the columns of the topics data.
            // The Review column contains the terms associated with a particular body of text.
            //
            // Review
            // "animals birds cats dogs fish horse"
            // "horse birds house fish duck cats"
            // "car truck driver bus pickup"
            // "car truck driver bus pickup horse"

            // A pipeline to convert the terms of the 'Review' column into keys,
            // making use of default settings.
            string defaultColumnName = "DefaultKeys";
            // REVIEW create through the catalog extension
            var default_pipeline = new WordTokenizingEstimator(ml, "Review")
                                   .Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));

            // Another pipeline that customizes the advanced settings of the ValueToKeyMappingEstimator.
            // We can change maxNumKeys to limit how many keys get generated out of the set of words,
            // and control the order in which they get evaluated by changing sort from the default
            // Occurrence (the order in which they are encountered) to Value (alphabetical).
            string customizedColumnName = "CustomizedKeys";
            var    customized_pipeline  = new WordTokenizingEstimator(ml, "Review")
                                          .Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));

            // The transformed data.
            var transformedData_default    = default_pipeline.Fit(trainData).Transform(trainData);
            var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData);

            // Small helper to print the text inside the columns, in the console.
            Action<string, IEnumerable<VBuffer<uint>>> printHelper = (columnName, column) =>
            {
                Console.WriteLine($"{columnName} column obtained post-transformation.");
                foreach (var row in column)
                {
                    foreach (var value in row.GetValues())
                    {
                        Console.Write($"{value} ");
                    }
                    Console.WriteLine("");
                }

                Console.WriteLine("===================================================");
            };

            // Preview of the DefaultKeys column obtained after processing the input.
            var defaultColumn = transformedData_default.GetColumn<VBuffer<uint>>(ml, defaultColumnName);

            printHelper(defaultColumnName, defaultColumn);

            // DefaultKeys column obtained post-transformation.
            //
            // 1 2 3 4 5 6
            // 6 2 7 5 8 3
            // 9 10 11 12 13 3
            // 9 10 11 12 13 6

            // Previewing the CustomizedKeys column obtained after processing the input.
            var customizedColumn = transformedData_customized.GetColumn<VBuffer<uint>>(ml, customizedColumnName);

            printHelper(customizedColumnName, customizedColumn);

            // CustomizedKeys column obtained post-transformation.
            //
            // 1 2 4 5 7 8
            // 8 2 9 7 6 4
            // 3 10 0 0 0 4
            // 3 10 0 0 0 8

            // Retrieve the original values by appending the KeyToValue estimator to the existing pipelines
            // to convert the keys back to the strings.
            var pipeline = default_pipeline.Append(ml.Transforms.Conversion.MapKeyToValue(defaultColumnName));

            transformedData_default = pipeline.Fit(trainData).Transform(trainData);

            // Preview of the DefaultKeys column with the keys mapped back to the original values.
            var originalColumnBack = transformedData_default.GetColumn<VBuffer<ReadOnlyMemory<char>>>(ml, defaultColumnName);

            foreach (var row in originalColumnBack)
            {
                foreach (var value in row.GetValues())
                {
                    Console.Write($"{value} ");
                }
                Console.WriteLine("");
            }

            // DefaultKeys column obtained post-transformation.
            //
            // animals birds cats dogs fish horse
            // horse birds house fish duck cats
            // car truck driver bus pickup cats
            // car truck driver bus pickup horse
        }
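        // SampleTopicsData comes from Microsoft.ML.SamplesUtils and is not shown
        // here; inferred from the preview above, the relevant shape is roughly:
        public class SampleTopicsData
        {
            public string Review { get; set; }
            // Other columns of the topics dataset are elided.
        }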