/// <summary>
/// Fits a <c>WordTokenizingEstimator</c> that maps "text" to "words" with the separators
/// ' ', '?', '!', '.', ',' over the wikipedia-detox sample, saves the "words" column of the
/// TakeFilter(4) output to tokenizedWithSeparators.tsv and calls CheckEquality on that file.
/// </summary>
public void TokenizeWithSeparators()
{
    string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is loaded as a bool label, column 1 as text; the file has a header row.
    var reader = TextLoaderStatic.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(dataPath).AsDynamic;

    var tokenizer = new WordTokenizingEstimator(Env, "text", "words", separators: new[] { ' ', '?', '!', '.', ',' });
    var tokenized = tokenizer.Fit(data).Transform(data);
    var outdata = TakeFilter.Create(Env, tokenized, 4);
    var wordsOnly = ColumnSelectingTransformer.CreateKeep(Env, outdata, new[] { "words" });

    var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
    var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");
    using (var ch = Env.Start("save"))
    {
        using (var fs = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(ch, saver, wordsOnly, fs, keepHidden: true);
        }
    }

    CheckEquality("Text", "tokenizedWithSeparators.tsv");
    Done();
}
/// <summary>
/// Runs TestEstimatorCore on a word tokenizer + character tokenizer + key-to-value chain,
/// then saves the "text", "words" and "chars" columns of the TakeRows(4) output to
/// tokenized.tsv and calls CheckEquality on that file.
/// </summary>
public void TextTokenizationWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validReader = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true);
    var data = validReader.Read(sentimentDataPath);

    // Same file, but column 1 is loaded as float so it can serve as the invalid input.
    var invalidReader = TextLoaderStatic.CreateReader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true);
    var invalidData = invalidReader.Read(sentimentDataPath);

    var est = new WordTokenizingEstimator(ML, "words", "text")
        .Append(new TokenizingByCharactersEstimator(ML, "chars", "text"))
        .Append(new KeyToValueMappingEstimator(ML, "chars"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "tokenized.tsv");
    var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
    var toSave = ML.Data.TakeRows(transformed, 4);
    toSave = ColumnSelectingTransformer.CreateKeep(ML, toSave, new[] { "text", "words", "chars" });

    using (var fs = File.Create(outputPath))
    {
        ML.Data.SaveAsText(toSave, fs, headerRow: true, keepHidden: true);
    }

    CheckEquality("Text", "tokenized.tsv");
    Done();
}
/// <summary>
/// Runs TestEstimatorCore on a tokenizer -> value-to-key -> n-gram / hashed n-gram chain,
/// then saves the "text", "terms", "ngrams" and "ngramshash" columns of the TakeRows(4)
/// output to ngrams.tsv and calls CheckEquality on that file.
/// </summary>
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validLoader = TextLoaderStatic.CreateLoader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true);
    var data = validLoader.Load(sentimentDataPath);

    // Same file, but column 1 is loaded as float so it can serve as the invalid input.
    var invalidLoader = TextLoaderStatic.CreateLoader(ML, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true);
    var invalidData = invalidLoader.Load(sentimentDataPath);

    var est = new WordTokenizingEstimator(ML, "text", "text")
        .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
        .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
        .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
    var toSave = ML.Data.TakeRows(transformed, 4);
    var columnSelector = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash");
    toSave = columnSelector.Fit(toSave).Transform(toSave);

    using (var fs = File.Create(outputPath))
    {
        ML.Data.SaveAsText(toSave, fs, headerRow: true, keepHidden: true);
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
/// <summary>
/// Fits a <c>WordTokenizingEstimator</c> that produces "words" from "text" with the separators
/// ' ', '?', '!', '.', ',' over the wikipedia-detox sample, saves the "words" column of the
/// TakeRows(4) output to tokenizedWithSeparators.tsv and calls CheckEquality on that file.
/// </summary>
public void TokenizeWithSeparators()
{
    string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Column 0 is the boolean label, column 1 the raw text; the file has a header row.
    var columns = new[]
    {
        new TextLoader.Column("label", DataKind.Boolean, 0),
        new TextLoader.Column("text", DataKind.String, 1)
    };
    var data = ML.Data.LoadFromTextFile(dataPath, columns, hasHeader: true);

    var tokenizer = new WordTokenizingEstimator(Env, "words", "text", separators: new[] { ' ', '?', '!', '.', ',' });
    var tokenized = tokenizer.Fit(data).Transform(data);
    var outdata = ML.Data.TakeRows(tokenized, 4);
    var wordsSelector = ML.Transforms.SelectColumns("words");
    var wordsOnly = wordsSelector.Fit(outdata).Transform(outdata);

    var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
    var outputPath = GetOutputPath("Text", "tokenizedWithSeparators.tsv");
    using (var ch = Env.Start("save"))
    {
        using (var fs = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(ch, saver, wordsOnly, fs, keepHidden: true);
        }
    }

    CheckEquality("Text", "tokenizedWithSeparators.tsv");
    Done();
}
/// <summary>
/// Runs TestEstimatorCore on a word tokenizer + character tokenizer + key-to-value chain,
/// then saves the "text", "words" and "chars" columns of the TakeRows(4) output to
/// tokenized.tsv and calls CheckEquality on that file.
/// </summary>
public void TextTokenizationWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validColumns = new[]
    {
        new TextLoader.Column("label", DataKind.Boolean, 0),
        new TextLoader.Column("text", DataKind.String, 1)
    };
    var data = ML.Data.LoadFromTextFile(sentimentDataPath, validColumns, hasHeader: true);

    // Same file, but "text" is declared as Single so it can serve as the invalid input.
    var invalidColumns = new[]
    {
        new TextLoader.Column("label", DataKind.Boolean, 0),
        new TextLoader.Column("text", DataKind.Single, 1)
    };
    var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, invalidColumns, hasHeader: true);

    var est = new WordTokenizingEstimator(ML, "words", "text")
        .Append(new TokenizingByCharactersEstimator(ML, "chars", "text"))
        .Append(new KeyToValueMappingEstimator(ML, "chars"));
    TestEstimatorCore(est, data, invalidInput: invalidData);

    var outputPath = GetOutputPath("Text", "tokenized.tsv");
    var transformed = est.Fit(data).Transform(data);
    var toSave = ML.Data.TakeRows(transformed, 4);
    var columnSelector = ML.Transforms.SelectColumns("text", "words", "chars");
    toSave = columnSelector.Fit(toSave).Transform(toSave);

    using (var fs = File.Create(outputPath))
    {
        ML.Data.SaveAsText(toSave, fs, headerRow: true, keepHidden: true);
    }

    CheckEquality("Text", "tokenized.tsv");
    Done();
}
/// <summary>
/// Checks <c>WordTokenizingEstimator</c> on an in-memory row with a scalar string column ("A")
/// and a string-array column ("B"): runs TestEstimatorCore with valid and invalid input, then
/// verifies the tokens produced for both columns.
/// </summary>
public void WordTokenizeWorkout()
{
    var data = new[] { new TestClass() { A = "This is a good sentence.", B = new string[2] { "Much words", "Wow So Cool" } } };
    var dataView = ML.Data.ReadFromEnumerable(data);

    // Numeric columns under the same names serve as the invalid input.
    var invalidData = new[] { new TestWrong() { A = 1, B = new float[2] { 2, 3 } } };
    var invalidDataView = ML.Data.ReadFromEnumerable(invalidData);

    var pipe = new WordTokenizingEstimator(Env, new[]{
            new WordTokenizingEstimator.ColumnInfo("TokenizeA", "A"),
            new WordTokenizingEstimator.ColumnInfo("TokenizeB", "B"),
        });

    TestEstimatorCore(pipe, dataView, invalidInput: invalidDataView);

    // Fit the same estimator on dataView (a fresh Fit call) and transform dataView to get the output.
    var result = pipe.Fit(dataView).Transform(dataView);

    // Extract the transformed result of the first row (the only row we have because data contains
    // only one TestClass) as a native class.
    var nativeResult = ML.CreateEnumerable<NativeResult>(result, false).First();

    // Check the tokenization of A. Expected result is { "This", "is", "a", "good", "sentence." }.
    var tokenizeA = new[] { "This", "is", "a", "good", "sentence." };
    // Assert.Equal (rather than Assert.True on ==) reports both lengths when the check fails.
    Assert.Equal(tokenizeA.Length, nativeResult.TokenizeA.Length);
    for (int i = 0; i < tokenizeA.Length; ++i)
    {
        Assert.Equal(tokenizeA[i], nativeResult.TokenizeA[i]);
    }

    // Check the tokenization of B. Expected result is { "Much", "words", "Wow", "So", "Cool" }.
    // One may think that the expected output should be a 2-D array
    // { { "Much", "words" }, { "Wow", "So", "Cool" } }, but please note that ML.NET may flatten
    // all outputs if they are high-dimension tensors.
    var tokenizeB = new[] { "Much", "words", "Wow", "So", "Cool" };
    Assert.Equal(tokenizeB.Length, nativeResult.TokenizeB.Length);
    for (int i = 0; i < tokenizeB.Length; ++i)
    {
        Assert.Equal(tokenizeB[i], nativeResult.TokenizeB[i]);
    }

    Done();
}
/// <summary>
/// Fits a tokenizer -> value-to-key -> n-gram / hashed n-gram chain on the wikipedia-detox
/// sample, saves the "text", "terms", "ngrams" and "ngramshash" columns of the TakeFilter(4)
/// output to ngrams.tsv and calls CheckEquality on that file.
/// The TestEstimatorCore check is disabled here (see the linked issue below).
/// </summary>
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validReader = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true);
    var data = validReader.Read(sentimentDataPath);

    // Only referenced by the disabled TestEstimatorCore call below; still loaded as in the original.
    var invalidReader = TextLoader.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true);
    var invalidData = invalidReader.Read(sentimentDataPath);

    var est = new WordTokenizingEstimator(Env, "text", "text")
        .Append(new ValueToKeyMappingEstimator(Env, "text", "terms"))
        .Append(new NgramEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
        IDataView toSave = TakeFilter.Create(Env, transformed, 4);
        toSave = ColumnSelectingTransformer.CreateKeep(Env, toSave, new[] { "text", "terms", "ngrams", "ngramshash" });

        using (var fs = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(ch, saver, toSave, fs, keepHidden: true);
        }
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
/// <summary>
/// Runs TestEstimatorCore on a word tokenizer + character tokenizer + key-to-value chain,
/// then saves the "text", "words" and "chars" columns of the TakeFilter(4) output to
/// tokenized.tsv through a silent TextSaver and calls CheckEquality on that file.
/// </summary>
public void TextTokenizationWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validReader = TextLoaderStatic.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadText(1)), hasHeader: true);
    var data = validReader.Read(sentimentDataPath);

    // Same file, but column 1 is loaded as float so it can serve as the invalid input.
    var invalidReader = TextLoaderStatic.CreateReader(Env, ctx => (
            label: ctx.LoadBool(0),
            text: ctx.LoadFloat(1)), hasHeader: true);
    var invalidData = invalidReader.Read(sentimentDataPath);

    var est = new WordTokenizingEstimator(Env, "text", "words")
        .Append(new TokenizingByCharactersEstimator(Env, "text", "chars"))
        .Append(new KeyToValueMappingEstimator(Env, "chars"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "tokenized.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        var transformed = est.Fit(data.AsDynamic).Transform(data.AsDynamic);
        IDataView toSave = TakeFilter.Create(Env, transformed, 4);
        toSave = ColumnSelectingTransformer.CreateKeep(Env, toSave, new[] { "text", "words", "chars" });

        using (var fs = File.Create(outputPath))
        {
            DataSaverUtils.SaveDataView(ch, saver, toSave, fs, keepHidden: true);
        }
    }

    CheckEquality("Text", "tokenized.tsv");
    Done();
}
/// <summary>
/// Round-trips a fitted word-tokenizing transform through TrainUtils.SaveModel and
/// ModelFileUtils.LoadTransforms using an in-memory stream. The test has no explicit
/// assertions; it passes as long as saving and loading do not throw.
/// </summary>
public void TestOldSavingAndLoading()
{
    var data = new[] { new TestClass() { A = "This is a good sentence.", B = new string[2] { "Much words", "Wow So Cool" } } };
    var dataView = ML.Data.ReadFromEnumerable(data);
    var pipe = new WordTokenizingEstimator(Env, new[]{
            new WordTokenizingEstimator.ColumnInfo("TokenizeA", "A"),
            new WordTokenizingEstimator.ColumnInfo("TokenizeB", "B"),
        });
    var result = pipe.Fit(dataView).Transform(dataView);
    var resultRoles = new RoleMappedData(result);

    using (var ms = new MemoryStream())
    {
        // Dispose the channel once saving is finished instead of leaking it.
        using (var ch = Env.Start("saving"))
        {
            TrainUtils.SaveModel(Env, ch, ms, null, resultRoles);
        }

        // Rewind so the loader reads the model from the beginning of the stream.
        ms.Position = 0;

        // The returned view is not inspected; only successful loading is exercised.
        ModelFileUtils.LoadTransforms(Env, dataView, ms);
    }
}
/// <summary>
/// Runs TestEstimatorCore on a tokenizer -> value-to-key -> n-gram / hashed n-gram chain
/// (including one hashed n-gram column configured with maximumNumberOfInverts: 2), then saves
/// the "text", "terms", "ngrams" and "ngramshash" columns of the TakeRows(4) output to
/// ngrams.tsv and calls CheckEquality on that file.
/// </summary>
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    var validColumns = new[]
    {
        new TextLoader.Column("label", DataKind.Boolean, 0),
        new TextLoader.Column("text", DataKind.String, 1)
    };
    var data = ML.Data.LoadFromTextFile(sentimentDataPath, validColumns, hasHeader: true, allowQuoting: true);

    // Same file, but "text" is declared as Single so it can serve as the invalid input.
    var invalidColumns = new[]
    {
        new TextLoader.Column("label", DataKind.Boolean, 0),
        new TextLoader.Column("text", DataKind.Single, 1)
    };
    var invalidData = ML.Data.LoadFromTextFile(sentimentDataPath, invalidColumns, hasHeader: true, allowQuoting: true);

    var est = new WordTokenizingEstimator(ML, "text", "text")
        .Append(new ValueToKeyMappingEstimator(ML, "terms", "text"))
        .Append(new NgramExtractingEstimator(ML, "ngrams", "terms"))
        .Append(new NgramHashingEstimator(ML, "ngramshash", "terms"))
        // Also have a situation where we use invert hashing. However we only write
        // the original non-inverted column to the actual baseline file.
        .Append(new NgramHashingEstimator(ML, "ngramshashinvert", "terms", maximumNumberOfInverts: 2));
    TestEstimatorCore(est, data, invalidInput: invalidData);

    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    var transformed = est.Fit(data).Transform(data);
    var toSave = ML.Data.TakeRows(transformed, 4);
    var columnSelector = ML.Transforms.SelectColumns("text", "terms", "ngrams", "ngramshash");
    toSave = columnSelector.Fit(toSave).Transform(toSave);

    using (var fs = File.Create(outputPath))
    {
        ML.Data.SaveAsText(toSave, fs, headerRow: true, keepHidden: true);
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
/// <summary>
/// Sample: tokenizes the 'Review' column of the topics data, maps the tokens to keys with default
/// and with customized settings, prints both key columns to the console, and finally maps the
/// default keys back to their original values and prints those.
/// </summary>
public static void KeyToValueValueToKey()
{
    // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
    // as well as the source of randomness.
    var ml = new MLContext();

    // Get a small dataset as an IEnumerable and load it into an ML.NET data set.
    IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData();
    var trainData = ml.Data.ReadFromEnumerable(data);

    // Preview of one of the columns of the topics data.
    // The Review column contains the keys associated with a particular body of text.
    //
    // Review
    // "animals birds cats dogs fish horse"
    // "horse birds house fish duck cats"
    // "car truck driver bus pickup"
    // "car truck driver bus pickup horse"

    // A pipeline that converts the terms of the 'Review' column into keys, making use of default settings.
    string defaultColumnName = "DefaultKeys";
    // REVIEW create through the catalog extension
    var defaultPipeline = new WordTokenizingEstimator(ml, "Review")
        .Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));

    // Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator.
    // We can change maxNumKeys to limit how many keys will get generated out of the set of words,
    // and condition the order in which they get evaluated by changing sort from the default Occurrence
    // (order in which they get encountered) to value/alphabetically.
    string customizedColumnName = "CustomizedKeys";
    var customizedPipeline = new WordTokenizingEstimator(ml, "Review")
        .Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));

    // The transformed data.
    var defaultTransformed = defaultPipeline.Fit(trainData).Transform(trainData);
    var customizedTransformed = customizedPipeline.Fit(trainData).Transform(trainData);

    // Small helper to print the keys inside a column to the console, one row per line.
    void PrintKeys(string columnName, IEnumerable<VBuffer<uint>> column)
    {
        Console.WriteLine($"{columnName} column obtained post-transformation.");
        foreach (var keyRow in column)
        {
            foreach (var key in keyRow.GetValues())
            {
                Console.Write($"{key} ");
            }
            Console.WriteLine("");
        }
        Console.WriteLine("===================================================");
    }

    // Preview of the DefaultKeys column obtained after processing the input.
    var defaultColumn = defaultTransformed.GetColumn<VBuffer<uint>>(ml, defaultColumnName);
    PrintKeys(defaultColumnName, defaultColumn);
    // DefaultKeys column obtained post-transformation.
    //
    // 1 2 3 4 5 6
    // 6 2 7 5 8 3
    // 9 10 11 12 13 3
    // 9 10 11 12 13 6

    // Previewing the CustomizedKeys column obtained after processing the input.
    var customizedColumn = customizedTransformed.GetColumn<VBuffer<uint>>(ml, customizedColumnName);
    PrintKeys(customizedColumnName, customizedColumn);
    // CustomizedKeys column obtained post-transformation.
    //
    // 1 2 4 5 7 8
    // 8 2 9 7 6 4
    // 3 10 0 0 0 4
    // 3 10 0 0 0 8

    // Retrieve the original values, by appending the KeyToValue estimator to the existing pipeline
    // to convert the keys back to the strings.
    var roundTripPipeline = defaultPipeline.Append(ml.Transforms.Conversion.MapKeyToValue(defaultColumnName));
    defaultTransformed = roundTripPipeline.Fit(trainData).Transform(trainData);

    // Preview of the DefaultKeys column obtained after mapping the keys back to values.
    var originalColumnBack = defaultTransformed.GetColumn<VBuffer<ReadOnlyMemory<char>>>(ml, defaultColumnName);
    foreach (var textRow in originalColumnBack)
    {
        foreach (var token in textRow.GetValues())
        {
            Console.Write($"{token} ");
        }
        Console.WriteLine("");
    }
    // DefaultKeys column obtained post-transformation.
    //
    // animals birds cats dogs fish horse
    // horse birds house fish duck cats
    // car truck driver bus pickup cats
    // car truck driver bus pickup horse
}