public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoaderStatic.CreateLoader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Load(sentimentDataPath);

    var invalidData = TextLoaderStatic.CreateLoader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true)
        .Load(sentimentDataPath);

    var est = new WordTokenizingEstimator(ML, "text", "text")
        .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
        .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
        .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
    savedData = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash").Fit(savedData).Transform(savedData);

    using (var fs = File.Create(outputPath))
        ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true);

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
public void TokenizeWithSeparators()
{
    string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var data = TextLoaderStatic.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true)
        .Read(dataPath).AsDynamic;

    var est = new WordTokenizingEstimator(Env, "text", "words", separators: new[] { ' ', '?', '!', '.', ',' });
    var outdata = TakeFilter.Create(Env, est.Fit(data).Transform(data), 4);
    var savedData = ColumnSelectingTransformer.CreateKeep(Env, outdata, new[] { "words" });

    var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
    var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");
    using (var ch = Env.Start("save"))
    {
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "tokenizedWithSeparators.tsv");
    Done();
}
public void TextTokenizationWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); var data = TextLoaderStatic.CreateReader(ML, ctx => ( label: ctx.LoadBool(0), text: ctx.LoadText(1)), hasHeader: true) .Read(sentimentDataPath); var invalidData = TextLoaderStatic.CreateReader(ML, ctx => ( label: ctx.LoadBool(0), text: ctx.LoadFloat(1)), hasHeader: true) .Read(sentimentDataPath); var est = new WordTokenizingEstimator(ML, "words", "text") .Append(new TokenizingByCharactersEstimator(ML, "chars", "text")) .Append(new KeyToValueMappingEstimator(ML, "chars")); TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); var outputPath = GetOutputPath("Text", "tokenized.tsv"); var savedData = ML.Data.TakeRows(est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); savedData = ColumnSelectingTransformer.CreateKeep(ML, savedData, new[] { "text", "words", "chars" }); using (var fs = File.Create(outputPath)) ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true); CheckEquality("Text", "tokenized.tsv"); Done(); }
public void TokenizeWithSeparators() { string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); var data = ML.Data.LoadFromTextFile(dataPath, new[] { new TextLoader.Column("label", DataKind.Boolean, 0), new TextLoader.Column("text", DataKind.String, 1) }, hasHeader: true); var est = new WordTokenizingEstimator(Env, "words", "text", separators: new[] { ' ', '?', '!', '.', ',' }); var outdata = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); var savedData = ML.Transforms.SelectColumns("words").Fit(outdata).Transform(outdata); var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true }); var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv"); using (var ch = Env.Start("save")) { using (var fs = File.Create(outputPath)) DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true); } CheckEquality("Text", "tokenizedWithSeparators.tsv"); Done(); }
public void TextTokenizationWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { new TextLoader.Column("label", DataKind.Boolean, 0), new TextLoader.Column("text", DataKind.String, 1) }, hasHeader: true); var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { new TextLoader.Column("label", DataKind.Boolean, 0), new TextLoader.Column("text", DataKind.Single, 1) }, hasHeader: true); var est = new WordTokenizingEstimator(ML, "words", "text") .Append(new TokenizingByCharactersEstimator(ML, "chars", "text")) .Append(new KeyToValueMappingEstimator(ML, "chars")); TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("Text", "tokenized.tsv"); var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("text", "words", "chars").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true); CheckEquality("Text", "tokenized.tsv"); Done(); }
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) { Contracts.CheckValue(env, nameof(env)); var h = env.Register(RegistrationName); h.CheckValue(args, nameof(args)); h.CheckValue(input, nameof(input)); h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified"); // Compose the WordBagTransform from a tokenize transform, // followed by a NgramExtractionTransform. // Since WordBagTransform is a many-to-one column transform, for each // WordBagTransform.Column with multiple sources, we first apply a ConcatTransform. // REVIEW: In order to not get ngrams that cross between vector slots, we need to // enable tokenize transforms to insert a special token between slots. // REVIEW: In order to make it possible to output separate bags for different columns // using the same dictionary, we need to find a way to make ConcatTransform remember the boundaries. var tokenizeColumns = new WordTokenizingTransformer.ColumnInfo[args.Column.Length]; var extractorArgs = new NgramExtractorTransform.Arguments() { MaxNumTerms = args.MaxNumTerms, NgramLength = args.NgramLength, SkipLength = args.SkipLength, AllLengths = args.AllLengths, Weighting = args.Weighting, Column = new NgramExtractorTransform.Column[args.Column.Length] }; for (int iinfo = 0; iinfo < args.Column.Length; iinfo++) { var column = args.Column[iinfo]; h.CheckUserArg(!string.IsNullOrWhiteSpace(column.Name), nameof(column.Name)); h.CheckUserArg(Utils.Size(column.Source) > 0, nameof(column.Source)); h.CheckUserArg(column.Source.All(src => !string.IsNullOrWhiteSpace(src)), nameof(column.Source)); tokenizeColumns[iinfo] = new WordTokenizingTransformer.ColumnInfo(column.Source.Length > 1 ? column.Name : column.Source[0], column.Name); extractorArgs.Column[iinfo] = new NgramExtractorTransform.Column() { Name = column.Name, Source = column.Name, MaxNumTerms = column.MaxNumTerms, NgramLength = column.NgramLength, SkipLength = column.SkipLength, Weighting = column.Weighting, AllLengths = column.AllLengths }; } IDataView view = input; view = NgramExtractionUtils.ApplyConcatOnSources(h, args.Column, view); view = new WordTokenizingEstimator(env, tokenizeColumns).Fit(view).Transform(view); return NgramExtractorTransform.Create(h, extractorArgs, view); }
public void WordTokenizeWorkout() { var data = new[] { new TestClass() { A = "This is a good sentence.", B = new string[2] { "Much words", "Wow So Cool" } } }; var dataView = ML.Data.ReadFromEnumerable(data); var invalidData = new[] { new TestWrong() { A = 1, B = new float[2] { 2, 3 } } }; var invalidDataView = ML.Data.ReadFromEnumerable(invalidData); var pipe = new WordTokenizingEstimator(Env, new[] { new WordTokenizingEstimator.ColumnInfo("TokenizeA", "A"), new WordTokenizingEstimator.ColumnInfo("TokenizeB", "B"), }); TestEstimatorCore(pipe, dataView, invalidInput: invalidDataView); // Reuse the pipe trained on dataView in TestEstimatorCore to make prediction. var result = pipe.Fit(dataView).Transform(dataView); // Extract the transformed result of the first row (the only row we have because data contains only one TestClass) as a native class. var nativeResult = ML.CreateEnumerable <NativeResult>(result, false).First(); // Check the tokenization of A. Expected result is { "This", "is", "a", "good", "sentence." }. var tokenizeA = new[] { "This", "is", "a", "good", "sentence." }; Assert.True(tokenizeA.Length == nativeResult.TokenizeA.Length); for (int i = 0; i < tokenizeA.Length; ++i) { Assert.Equal(tokenizeA[i], nativeResult.TokenizeA[i]); } // Check the tokenization of B. Expected result is { "Much", "words", "Wow", "So", "Cool" }. One may think that the expected output // should be a 2-D array { { "Much", "words"}, { "Wow", "So", "Cool" } }, but please note that ML.NET may flatten all outputs if // they are high-dimension tensors. var tokenizeB = new[] { "Much", "words", "Wow", "So", "Cool" }; Assert.True(tokenizeB.Length == nativeResult.TokenizeB.Length); for (int i = 0; i < tokenizeB.Length; ++i) { Assert.Equal(tokenizeB[i], nativeResult.TokenizeB[i]); } Done(); }
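This workout and the save/load and older-API tests below reference TestClass, TestWrong, and NativeResult, which are defined elsewhere in the test project. A minimal sketch consistent with how they are used here follows; the field types are inferred from the initializers and assertions, and the real definitions may carry additional members or attributes.

// Hypothetical stand-ins for the tests' helper classes, shaped after their usage above.
internal sealed class TestClass
{
    public string A;        // single text column to tokenize
    public string[] B;      // vector of text values to tokenize
}

internal sealed class TestWrong
{
    public float A;         // intentionally non-text types so the estimator rejects this schema
    public float[] B;
}

internal sealed class NativeResult
{
    public string[] TokenizeA;   // tokens produced from A
    public string[] TokenizeB;   // tokens produced from B, flattened across the two input strings
}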
public void NgramWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); var data = TextLoader.CreateReader(Env, ctx => ( label: ctx.LoadBool(0), text: ctx.LoadText(1)), hasHeader: true) .Read(sentimentDataPath); var invalidData = TextLoader.CreateReader(Env, ctx => ( label: ctx.LoadBool(0), text: ctx.LoadFloat(1)), hasHeader: true) .Read(sentimentDataPath); var est = new WordTokenizingEstimator(Env, "text", "text") .Append(new ValueToKeyMappingEstimator(Env, "text", "terms")) .Append(new NgramEstimator(Env, "terms", "ngrams")) .Append(new NgramHashEstimator(Env, "terms", "ngramshash")); // The following call fails because of the following issue // https://github.com/dotnet/machinelearning/issues/969 // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); var outputPath = GetOutputPath("Text", "ngrams.tsv"); using (var ch = Env.Start("save")) { var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true }); IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "terms", "ngrams", "ngramshash" }); using (var fs = File.Create(outputPath)) DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true); } CheckEquality("Text", "ngrams.tsv"); Done(); }
public void TextTokenizationWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); var data = TextLoaderStatic.CreateReader(Env, ctx => ( label: ctx.LoadBool(0), text: ctx.LoadText(1)), hasHeader: true) .Read(sentimentDataPath); var invalidData = TextLoaderStatic.CreateReader(Env, ctx => ( label: ctx.LoadBool(0), text: ctx.LoadFloat(1)), hasHeader: true) .Read(sentimentDataPath); var est = new WordTokenizingEstimator(Env, "text", "words") .Append(new TokenizingByCharactersEstimator(Env, "text", "chars")) .Append(new KeyToValueMappingEstimator(Env, "chars")); TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); var outputPath = GetOutputPath("Text", "tokenized.tsv"); using (var ch = Env.Start("save")) { var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true }); IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4); savedData = ColumnSelectingTransformer.CreateKeep(Env, savedData, new[] { "text", "words", "chars" }); using (var fs = File.Create(outputPath)) DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true); } CheckEquality("Text", "tokenized.tsv"); Done(); }
public void TestOldSavingAndLoading()
{
    var data = new[] { new TestClass() { A = "This is a good sentence.", B = new string[2] { "Much words", "Wow So Cool" } } };
    var dataView = ML.Data.ReadFromEnumerable(data);
    var pipe = new WordTokenizingEstimator(Env, new[]
    {
        new WordTokenizingEstimator.ColumnInfo("TokenizeA", "A"),
        new WordTokenizingEstimator.ColumnInfo("TokenizeB", "B"),
    });
    var result = pipe.Fit(dataView).Transform(dataView);
    var resultRoles = new RoleMappedData(result);
    using (var ms = new MemoryStream())
    {
        TrainUtils.SaveModel(Env, Env.Start("saving"), ms, null, resultRoles);
        ms.Position = 0;
        var loadedView = ModelFileUtils.LoadTransforms(Env, dataView, ms);
    }
}
public void NgramWorkout() { string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv"); var data = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { new TextLoader.Column("label", DataKind.Boolean, 0), new TextLoader.Column("text", DataKind.String, 1) }, hasHeader: true, allowQuoting: true); var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, new[] { new TextLoader.Column("label", DataKind.Boolean, 0), new TextLoader.Column("text", DataKind.Single, 1) }, hasHeader: true, allowQuoting: true); var est = new WordTokenizingEstimator(ML, "text", "text") .Append(new ValueToKeyMappingEstimator(ML, "terms", "text")) .Append(new NgramExtractingEstimator(ML, "ngrams", "terms")) .Append(new NgramHashingEstimator(ML, "ngramshash", "terms")) // Also have a situation where we use invert hashing. However we only write // the original non-inverted column to the actual baseline file. .Append(new NgramHashingEstimator(ML, "ngramshashinvert", "terms", maximumNumberOfInverts: 2)); TestEstimatorCore(est, data, invalidInput: invalidData); var outputPath = GetOutputPath("Text", "ngrams.tsv"); var savedData = ML.Data.TakeRows(est.Fit(data).Transform(data), 4); savedData = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash").Fit(savedData).Transform(savedData); using (var fs = File.Create(outputPath)) ML.Data.SaveAsText(savedData, fs, headerRow: true, keepHidden: true); CheckEquality("Text", "ngrams.tsv"); Done(); }
public void WordTokenizeWorkout() { var data = new[] { new TestClass() { A = "This is a good sentence.", B = new string[2] { "Much words", "Wow So Cool" } } }; var dataView = ComponentCreation.CreateDataView(Env, data); var invalidData = new[] { new TestWrong() { A = 1, B = new float[2] { 2, 3 } } }; var invalidDataView = ComponentCreation.CreateDataView(Env, invalidData); var pipe = new WordTokenizingEstimator(Env, new[] { new WordTokenizeTransform.ColumnInfo("A", "TokenizeA"), new WordTokenizeTransform.ColumnInfo("B", "TokenizeB"), }); TestEstimatorCore(pipe, dataView, invalidInput: invalidDataView); Done(); }
internal static ITransformer CreateTransformer(IHostEnvironment env, Options options, IDataView input)
{
    Contracts.CheckValue(env, nameof(env));
    var h = env.Register(RegistrationName);
    h.CheckValue(options, nameof(options));
    h.CheckValue(input, nameof(input));
    h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns), "Columns must be specified");

    // To each input column to the WordHashBagTransform, a tokenize transform is applied,
    // followed by applying WordHashVectorizeTransform.
    // Since WordHashBagTransform is a many-to-one column transform, for each
    // WordHashBagTransform.Column we may need to define multiple tokenize transform columns.
    // NgramHashExtractorTransform may need to define an identical number of HashTransform.Columns.
    // The intermediate columns are dropped at the end of using a DropColumnsTransform.
    IDataView view = input;

    var uniqueSourceNames = NgramExtractionUtils.GenerateUniqueSourceNames(h, options.Columns, view.Schema);
    Contracts.Assert(uniqueSourceNames.Length == options.Columns.Length);

    var tokenizeColumns = new List<WordTokenizingEstimator.ColumnOptions>();
    var extractorCols = new NgramHashExtractingTransformer.Column[options.Columns.Length];
    var colCount = options.Columns.Length;
    List<string> tmpColNames = new List<string>();
    for (int iinfo = 0; iinfo < colCount; iinfo++)
    {
        var column = options.Columns[iinfo];
        int srcCount = column.Source.Length;
        var curTmpNames = new string[srcCount];
        Contracts.Assert(uniqueSourceNames[iinfo].Length == options.Columns[iinfo].Source.Length);
        for (int isrc = 0; isrc < srcCount; isrc++)
            tokenizeColumns.Add(new WordTokenizingEstimator.ColumnOptions(curTmpNames[isrc] = uniqueSourceNames[iinfo][isrc], options.Columns[iinfo].Source[isrc]));

        tmpColNames.AddRange(curTmpNames);
        extractorCols[iinfo] =
            new NgramHashExtractingTransformer.Column
            {
                Name = column.Name,
                Source = curTmpNames,
                NumberOfBits = column.NumberOfBits,
                NgramLength = column.NgramLength,
                Seed = column.Seed,
                SkipLength = column.SkipLength,
                Ordered = column.Ordered,
                MaximumNumberOfInverts = column.MaximumNumberOfInverts,
                FriendlyNames = options.Columns[iinfo].Source,
                UseAllLengths = column.UseAllLengths
            };
    }

    ITransformer t1 = new WordTokenizingEstimator(env, tokenizeColumns.ToArray()).Fit(view);

    var featurizeArgs =
        new NgramHashExtractingTransformer.Options
        {
            UseAllLengths = options.UseAllLengths,
            NumberOfBits = options.NumberOfBits,
            NgramLength = options.NgramLength,
            SkipLength = options.SkipLength,
            Ordered = options.Ordered,
            Seed = options.Seed,
            Columns = extractorCols.ToArray(),
            MaximumNumberOfInverts = options.MaximumNumberOfInverts
        };

    view = t1.Transform(view);
    ITransformer t2 = NgramHashExtractingTransformer.Create(h, featurizeArgs, view);

    // Since we added columns with new names, we need to explicitly drop them before we return the IDataTransform.
    ITransformer t3 = new ColumnSelectingTransformer(env, null, tmpColNames.ToArray());

    return new TransformerChain<ITransformer>(new[] { t1, t2, t3 });
}
public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) { Contracts.CheckValue(env, nameof(env)); var h = env.Register(RegistrationName); h.CheckValue(args, nameof(args)); h.CheckValue(input, nameof(input)); h.CheckUserArg(Utils.Size(args.Column) > 0, nameof(args.Column), "Columns must be specified"); // To each input column to the WordHashBagTransform, a tokenize transform is applied, // followed by applying WordHashVectorizeTransform. // Since WordHashBagTransform is a many-to-one column transform, for each // WordHashBagTransform.Column we may need to define multiple tokenize transform columns. // NgramHashExtractorTransform may need to define an identical number of HashTransform.Columns. // The intermediate columns are dropped at the end of using a DropColumnsTransform. IDataView view = input; var uniqueSourceNames = NgramExtractionUtils.GenerateUniqueSourceNames(h, args.Column, view.Schema); Contracts.Assert(uniqueSourceNames.Length == args.Column.Length); var tokenizeColumns = new List <WordTokenizingTransformer.ColumnInfo>(); var extractorCols = new NgramHashExtractingTransformer.Column[args.Column.Length]; var colCount = args.Column.Length; List <string> tmpColNames = new List <string>(); for (int iinfo = 0; iinfo < colCount; iinfo++) { var column = args.Column[iinfo]; int srcCount = column.Source.Length; var curTmpNames = new string[srcCount]; Contracts.Assert(uniqueSourceNames[iinfo].Length == args.Column[iinfo].Source.Length); for (int isrc = 0; isrc < srcCount; isrc++) { tokenizeColumns.Add(new WordTokenizingTransformer.ColumnInfo(args.Column[iinfo].Source[isrc], curTmpNames[isrc] = uniqueSourceNames[iinfo][isrc])); } tmpColNames.AddRange(curTmpNames); extractorCols[iinfo] = new NgramHashExtractingTransformer.Column { Name = column.Name, Source = curTmpNames, HashBits = column.HashBits, NgramLength = column.NgramLength, Seed = column.Seed, SkipLength = column.SkipLength, Ordered = column.Ordered, InvertHash = column.InvertHash, FriendlyNames = args.Column[iinfo].Source, AllLengths = column.AllLengths }; } view = new WordTokenizingEstimator(env, tokenizeColumns.ToArray()).Fit(view).Transform(view); var featurizeArgs = new NgramHashExtractingTransformer.Arguments { AllLengths = args.AllLengths, HashBits = args.HashBits, NgramLength = args.NgramLength, SkipLength = args.SkipLength, Ordered = args.Ordered, Seed = args.Seed, Column = extractorCols.ToArray(), InvertHash = args.InvertHash }; view = NgramHashExtractingTransformer.Create(h, featurizeArgs, view); // Since we added columns with new names, we need to explicitly drop them before we return the IDataTransform. return(ColumnSelectingTransformer.CreateDrop(h, view, tmpColNames.ToArray())); }
public static void KeyToValueValueToKey()
{
    // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
    // as well as the source of randomness.
    var ml = new MLContext();

    // Get a small dataset as an IEnumerable and load it into ML.NET data set.
    IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData();
    var trainData = ml.Data.ReadFromEnumerable(data);

    // Preview of one of the columns of the topics data.
    // The Review column contains the keys associated with a particular body of text.
    //
    // Review
    // "animals birds cats dogs fish horse"
    // "horse birds house fish duck cats"
    // "car truck driver bus pickup"
    // "car truck driver bus pickup horse"

    // A pipeline to convert the terms of the 'Review' column, making use of default settings.
    string defaultColumnName = "DefaultKeys";
    // REVIEW create through the catalog extension
    var default_pipeline = new WordTokenizingEstimator(ml, "Review")
        .Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));

    // Another pipeline that customizes the advanced settings of the ValueToKeyMappingEstimator.
    // We can change maxNumKeys to limit how many keys get generated out of the set of words,
    // and change the order in which they get evaluated by switching sort from the default Occurrence
    // (the order in which they are encountered) to Value, i.e. alphabetical.
    string customizedColumnName = "CustomizedKeys";
    var customized_pipeline = new WordTokenizingEstimator(ml, "Review")
        .Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));

    // The transformed data.
    var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData);
    var transformedData_customized = customized_pipeline.Fit(trainData).Transform(trainData);

    // Small helper to print the text inside the columns to the console.
    Action<string, IEnumerable<VBuffer<uint>>> printHelper = (columnName, column) =>
    {
        Console.WriteLine($"{columnName} column obtained post-transformation.");
        foreach (var row in column)
        {
            foreach (var value in row.GetValues())
                Console.Write($"{value} ");
            Console.WriteLine("");
        }

        Console.WriteLine("===================================================");
    };

    // Preview of the DefaultKeys column obtained after processing the input.
    var defaultColumn = transformedData_default.GetColumn<VBuffer<uint>>(ml, defaultColumnName);
    printHelper(defaultColumnName, defaultColumn);

    // DefaultKeys column obtained post-transformation.
    //
    // 1 2 3 4 5 6
    // 6 2 7 5 8 3
    // 9 10 11 12 13 3
    // 9 10 11 12 13 6

    // Preview of the CustomizedKeys column obtained after processing the input.
    var customizedColumn = transformedData_customized.GetColumn<VBuffer<uint>>(ml, customizedColumnName);
    printHelper(customizedColumnName, customizedColumn);

    // CustomizedKeys column obtained post-transformation.
    //
    // 1 2 4 5 7 8
    // 8 2 9 7 6 4
    // 3 10 0 0 0 4
    // 3 10 0 0 0 8

    // Retrieve the original values by appending the KeyToValue estimator to the existing pipeline
    // to convert the keys back to the strings.
    var pipeline = default_pipeline.Append(ml.Transforms.Conversion.MapKeyToValue(defaultColumnName));
    transformedData_default = pipeline.Fit(trainData).Transform(trainData);

    // Preview of the DefaultKeys column obtained after mapping the keys back to their values.
    var originalColumnBack = transformedData_default.GetColumn<VBuffer<ReadOnlyMemory<char>>>(ml, defaultColumnName);

    foreach (var row in originalColumnBack)
    {
        foreach (var value in row.GetValues())
            Console.Write($"{value} ");
        Console.WriteLine("");
    }

    // DefaultKeys column obtained post-transformation.
    //
    // animals birds cats dogs fish horse
    // horse birds house fish duck cats
    // car truck driver bus pickup cats
    // car truck driver bus pickup horse
}
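The sample above pulls its input from SamplesUtils.DatasetUtils.GetTopicsData(). A minimal stand-in consistent with the Review values previewed in the comments follows; the class and method shapes here are assumptions, and the real SamplesUtils helper may expose additional columns.

// Hypothetical replacement for the SamplesUtils.DatasetUtils helper, shaped after the preview comments above.
public sealed class SampleTopicsData
{
    public string Review { get; set; }
}

public static IEnumerable<SampleTopicsData> GetTopicsData()
{
    yield return new SampleTopicsData { Review = "animals birds cats dogs fish horse" };
    yield return new SampleTopicsData { Review = "horse birds house fish duck cats" };
    yield return new SampleTopicsData { Review = "car truck driver bus pickup" };
    yield return new SampleTopicsData { Review = "car truck driver bus pickup horse" };
}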