/// <summary>
/// Builds a text-featurization + SDCA logistic-regression pipeline and fits it
/// on the given training split. Returns the trained transformer chain.
/// </summary>
/// <param name="mlContext">Shared ML.NET context.</param>
/// <param name="splitTrainSet">Training portion of the data.</param>
public static ITransformer BuildAndTrainModel(MLContext mlContext, IDataView splitTrainSet)
{
    // Featurization settings (defaults taken from the docs.microsoft tutorial).
    var featurizeOptions = new TextFeaturizingEstimator.Options()
    {
        // Emit the tokenized words alongside the numeric feature vector.
        OutputTokensColumnName = "OutputTokens",
        CaseMode = TextNormalizingEstimator.CaseMode.Lower,
        // Strip English stop words with the built-in remover.
        StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options()
        {
            Language = TextFeaturizingEstimator.Language.English
        },
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true },
        CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
    };

    // Featurize both the tweet text and the reply-to text into "Features",
    // then append the binary classification trainer.
    var inputColumns = new string[] { nameof(TweetData.Text), nameof(TweetData.ReplyToText) };
    var pipeline = mlContext.Transforms.Text
        .FeaturizeText(outputColumnName: "Features", options: featurizeOptions, inputColumnNames: inputColumns)
        .Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
            labelColumnName: "Label", featureColumnName: "Features"));

    Console.WriteLine("=============== Creating and Training the Model ===============");
    var trainedModel = pipeline.Fit(splitTrainSet);
    Console.WriteLine("======================= End of training =======================");
    Console.WriteLine();
    return trainedModel;
}
/// <summary>
/// Verifies that the predefined stop-word remover drops stop words
/// from the emitted token column.
/// </summary>
public void TextFeaturizerWithPredefinedStopWordRemoverTest()
{
    // Two rows: one containing English stop words, one without.
    var samples = new[]
    {
        new TestClass() { A = "This is some text with english stop words", OutputTokens = null },
        new TestClass() { A = "No stop words", OutputTokens = null }
    };
    var dataView = ML.Data.LoadFromEnumerable(samples);

    var featurizeOptions = new TextFeaturizingEstimator.Options()
    {
        StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),
        OutputTokensColumnName = "OutputTokens"
    };

    var model = ML.Transforms.Text.FeaturizeText("Features", featurizeOptions, "A").Fit(dataView);
    var engine = model.CreatePredictionEngine <TestClass, TestClass>(ML);

    // Stop words ("this", "is", "with", ...) must be absent from the token output.
    var firstPrediction = engine.Predict(samples[0]);
    Assert.Equal("text english stop words", string.Join(" ", firstPrediction.OutputTokens));

    var secondPrediction = engine.Predict(samples[1]);
    Assert.Equal("stop words", string.Join(" ", secondPrediction.OutputTokens));
}
/// <summary>
/// Helper for the KeepPunctuations option: when true the token output must match the
/// input verbatim; when false, punctuation characters are stripped from row 0.
/// Row 1 is expected to contain no punctuation either way.
/// </summary>
private void TestKeepPunctuations(IDataView dataView, TestClass[] data, bool keepPunctuations)
{
    // CaseMode.None so the token text can be compared against the raw input.
    var options = new TextFeaturizingEstimator.Options()
    {
        KeepPunctuations = keepPunctuations,
        CaseMode = TextNormalizingEstimator.CaseMode.None,
        OutputTokensColumnName = "OutputTokens"
    };
    var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
    var model = pipeline.Fit(dataView);
    var engine = model.CreatePredictionEngine <TestClass, TestClass>(ML);
    var prediction1 = engine.Predict(data[0]);
    var prediction2 = engine.Predict(data[1]);
    if (keepPunctuations)
    {
        Assert.Equal(data[0].A, string.Join(" ", prediction1.OutputTokens));
        Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputTokens));
    }
    else
    {
        // FIX: inside a regex character class '|' is a LITERAL, not alternation.
        // The old pattern "[,|_|'|\"|;|\\.]" therefore also stripped literal '|'
        // characters from the expected string. List only the intended punctuation.
        var expected = Regex.Replace(data[0].A, "[,_'\";.]", "");
        Assert.Equal(expected, string.Join(" ", prediction1.OutputTokens));
        Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputTokens));
    }
}
/// <summary>
/// Helper for the KeepDiacritics option: row 0 carries diacritics (expected to be
/// folded away when the option is false); row 1 must be unchanged either way.
/// </summary>
private void TestKeepDiacritics(IDataView dataView, TestClass[] data, bool keepDiacritics)
{
    // Preserve case so tokens can be compared against the raw input text.
    var options = new TextFeaturizingEstimator.Options()
    {
        KeepDiacritics = keepDiacritics,
        CaseMode = TextNormalizingEstimator.CaseMode.None,
        OutputTokensColumnName = "OutputTokens"
    };
    var engine = ML.Transforms.Text.FeaturizeText("Features", options, "A")
        .Fit(dataView)
        .CreatePredictionEngine <TestClass, TestClass>(ML);

    var tokens1 = string.Join(" ", engine.Predict(data[0]).OutputTokens);
    var tokens2 = string.Join(" ", engine.Predict(data[1]).OutputTokens);

    // When diacritics are removed, row 0 collapses to the plain-ASCII sentence.
    var expected1 = keepDiacritics ? data[0].A : "This is some text with diacritics";
    Assert.Equal(expected1, tokens1);
    Assert.Equal(data[1].A, tokens2);
}
/// <summary>
/// Helper for the CaseMode option: the emitted tokens must equal the input text
/// transformed by the requested casing (Upper, Lower, or None).
/// </summary>
private void TestCaseMode(IDataView dataView, TestClass[] data, TextNormalizingEstimator.CaseMode caseMode)
{
    var options = new TextFeaturizingEstimator.Options()
    {
        CaseMode = caseMode,
        OutputTokensColumnName = "OutputTokens"
    };
    var engine = ML.Transforms.Text.FeaturizeText("Features", options, "A")
        .Fit(dataView)
        .CreatePredictionEngine <TestClass, TestClass>(ML);

    var prediction1 = engine.Predict(data[0]);
    var prediction2 = engine.Predict(data[1]);

    // Mirror the configured case mode on the raw input to build the expectation.
    string ApplyCase(string text) => caseMode switch
    {
        TextNormalizingEstimator.CaseMode.Upper => text.ToUpper(),
        TextNormalizingEstimator.CaseMode.Lower => text.ToLower(),
        _ => text, // CaseMode.None leaves the text untouched
    };

    Assert.Equal(ApplyCase(data[0].A), string.Join(" ", prediction1.OutputTokens));
    Assert.Equal(ApplyCase(data[1].A), string.Join(" ", prediction2.OutputTokens));
}
// Trains an SDCA logistic-regression sentiment model on the wikipedia-detox test
// file and returns a scorer transform over the featurized (uncached) data.
private static IDataScorerTransform _TrainSentiment()
{
    bool normalize = true;
    // Loader schema: boolean label in column 0, raw text in column 1; tab-separated with header.
    var args = new TextLoader.Options()
    {
        Separators = new[] { '\t' },
        HasHeader = true,
        Columns = new[]
        {
            new TextLoader.Column("Label", DataKind.Boolean, 0),
            new TextLoader.Column("SentimentText", DataKind.String, 1)
        }
    };
    // Featurization: lower-cased, punctuation and diacritics removed, word bigrams
    // plus char trigrams, L2-normalized (normalize flag), tokens emitted as "tokens".
    var args2 = new TextFeaturizingEstimator.Options()
    {
        KeepDiacritics = false,
        KeepPunctuations = false,
        CaseMode = TextNormalizingEstimator.CaseMode.Lower,
        OutputTokensColumnName = "tokens",
        Norm = normalize ? TextFeaturizingEstimator.NormFunction.L2 : TextFeaturizingEstimator.NormFunction.None,
        CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true },
    };
    var trainFilename = FileHelper.GetTestFile("wikipedia-detox-250-line-data.tsv");
    // NOTE(review): the 'using' was deliberately commented out, so the environment is
    // never disposed here — presumably the returned scorer still needs it alive.
    // Confirm ownership/lifetime before re-enabling disposal.
    /*using (*/ var env = EnvHelper.NewTestEnvironment(seed: 1, conc: 1);
    {
        // Pipeline: load the TSV, then featurize the text columns.
        var loader = new TextLoader(env, args).Load(new MultiFileSource(trainFilename));
        var trans = TextFeaturizingEstimator.Create(env, args2, loader);

        // Train on a cached view so SDCA's multiple data passes don't re-run the transform.
        var trainer = new SdcaLogisticRegressionBinaryTrainer(env, new SdcaLogisticRegressionBinaryTrainer.Options
        {
            LabelColumnName = "Label",
            FeatureColumnName = "Features"
        });
        var cached = new Microsoft.ML.Data.CacheDataView(env, trans, prefetch: null);
        var predictor = trainer.Fit(cached);
        // Train roles come from the cached view; score roles from the raw transform —
        // the scorer is built over the uncached pipeline on purpose.
        var trainRoles = new RoleMappedData(cached, label: "Label", feature: "Features");
        var scoreRoles = new RoleMappedData(trans, label: "Label", feature: "Features");
        return(ScoreUtils.GetScorer(predictor.Model, scoreRoles, env, trainRoles.Schema));
    }
}
/// <summary>
/// Verifies that FeaturizeText with no explicit input column reads from the column
/// that shares the output name ("Features"), producing raw word-unigram counts.
/// </summary>
public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames()
{
    var data = new[]
    {
        new TestClass2() { Features = "This is some text in english", OutputTokens = null },
        new TestClass2() { Features = "This is another example", OutputTokens = null }
    };
    var dataView = ML.Data.LoadFromEnumerable(data);

    // Word unigrams only, no char n-grams, no normalization — counts come out raw.
    var options = new TextFeaturizingEstimator.Options()
    {
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
        CharFeatureExtractor = null,
        Norm = TextFeaturizingEstimator.NormFunction.None,
        OutputTokensColumnName = "OutputTokens"
    };

    // No input column names supplied: the input defaults to the "Features" column.
    var pipeline = ML.Transforms.Text.FeaturizeText("Features", options);
    dataView = pipeline.Fit(dataView).Transform(dataView);

    VBuffer <float> features = default;
    float[][] transformed = { null, null };
    var expected = new float[][]
    {
        new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
        new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
    };

    using (var cursor = dataView.GetRowCursor(dataView.Schema))
    {
        // FIX: fetch the getter once, outside the loop — the original re-created it
        // on every row, which is wasteful and against the cursoring API's usage pattern.
        var featureGetter = cursor.GetGetter <VBuffer <float> >(cursor.Schema["Features"]);
        var i = 0;
        while (cursor.MoveNext())
        {
            featureGetter(ref features);
            transformed[i] = features.DenseValues().ToArray();
            i++;
        }
    }

    Assert.Equal(expected[0], transformed[0]);
    Assert.Equal(expected[1], transformed[1]);
}
/// <summary>
/// Prints the word and char feature-extractor settings of the given text-featurizing
/// options as a console table.
/// </summary>
/// <param name="tfo">Options to print; either extractor may legitimately be null.</param>
public static void PrintTextFeauturizingOptions(TextFeaturizingEstimator.Options tfo)
{
    ConsoleHelper.Write(ConsoleColor.White, "Parametre for Text-Feauturizing");
    var table = new ConsoleTable("", "NGram-længde", "SkipLength", "Use All Length", "Weighting");
    table.Options.EnableCount = false;
    // FIX: WordFeatureExtractor/CharFeatureExtractor can be null (callers elsewhere
    // set CharFeatureExtractor = null); the original dereferenced them unconditionally
    // and threw a NullReferenceException.
    AddExtractorRow(table, "Word", tfo.WordFeatureExtractor);
    AddExtractorRow(table, "Char", tfo.CharFeatureExtractor);
    table.Write();
    Console.WriteLine("");
}

// Adds one table row for an extractor, printing placeholders when it is disabled (null).
private static void AddExtractorRow(ConsoleTable table, string label, WordBagEstimator.Options extractor)
{
    if (extractor is null)
    {
        table.AddRow(label, "-", "-", "-", "-");
        return;
    }
    table.AddRow(label, extractor.NgramLength, extractor.SkipLength, extractor.UseAllLengths, extractor.Weighting);
}
/// <summary>
/// Demo: featurizes a tiny three-document corpus with TF-IDF-weighted word bags
/// and prints every (document, word, weight) triple.
/// </summary>
static void Main(string[] args)
{
    // Small corpus for demonstrating TF-IDF weighting.
    var documents = new List <Text>
    {
        new Text { Data = "apple apple orange grape" },
        new Text { Data = "grape apple melon" },
        new Text { Data = "grape banana melon" }
    };

    var mlContext = new MLContext();
    var inputData = mlContext.Data.LoadFromEnumerable(documents);

    // Drop diacritics/punctuation/numbers and stop words; weight word bags by TF-IDF.
    var featurizingOptions = new TextFeaturizingEstimator.Options
    {
        KeepDiacritics = false,
        KeepPunctuations = false,
        KeepNumbers = false,
        StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),
        WordFeatureExtractor = new WordBagEstimator.Options() { Weighting = NgramExtractingEstimator.WeightingCriteria.TfIdf },
        CharFeatureExtractor = null
    };

    var vectorizer = mlContext.Transforms.Text.FeaturizeText("TfIDFWeights", options: featurizingOptions, inputColumnNames: "Data");
    var transformedData = vectorizer.Fit(inputData).Transform(inputData);

    // Weight vectors plus slot names (one slot per vocabulary word).
    var weightColumn = transformedData.GetColumn <VBuffer <float> >("TfIDFWeights");
    VBuffer <ReadOnlyMemory <char> > slotNames = default;
    transformedData.Schema["TfIDFWeights"].GetSlotNames(slotNames: ref slotNames);
    var vocabulary = slotNames.DenseValues().ToArray();

    var docIndex = 0;
    foreach (var weights in weightColumn)
    {
        for (int slot = 0; slot < weights.Length; slot++)
        {
            Console.WriteLine($"doc:{docIndex} word '{vocabulary[slot]}' {weights.GetItemOrDefault(slot)}");
        }
        docIndex++;
    }

    Console.ReadLine();
}
/// <summary>
/// Verifies char-unigram featurization with L2 normalization against
/// feature vectors previously produced by ML.NET for these inputs.
/// </summary>
public void TextFeaturizerWithL2NormTest()
{
    var samples = new[]
    {
        new TestClass() { A = "abc xyz", OutputTokens = null },
        new TestClass() { A = "xyz", OutputTokens = null }
    };
    var dataView = ML.Data.LoadFromEnumerable(samples);

    var options = new TextFeaturizingEstimator.Options()
    {
        CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
        Norm = TextFeaturizingEstimator.NormFunction.L2,
        OutputTokensColumnName = "OutputTokens"
    };
    var engine = ML.Transforms.Text.FeaturizeText("Features", options, "A")
        .Fit(dataView)
        .CreatePredictionEngine <TestClass, TestClass>(ML);

    // Row 0: tokens unchanged; feature values pinned to previously observed output.
    var prediction = engine.Predict(samples[0]);
    Assert.Equal(samples[0].A, string.Join(" ", prediction.OutputTokens));
    var exp1 = 0.333333343f;
    var exp2 = 0.707106769f;
    var expected = new float[] { exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp2, exp2 };
    Assert.Equal(expected, prediction.Features);

    // Row 1 shares the learned vocabulary; only the slots present in "xyz" are non-zero.
    prediction = engine.Predict(samples[1]);
    exp1 = 0.4472136f;
    Assert.Equal(samples[1].A, string.Join(" ", prediction.OutputTokens));
    expected = new float[] { exp1, 0.0f, 0.0f, 0.0f, 0.0f, exp1, exp1, exp1, exp1, 0.0f, 1.0f };
    Assert.Equal(expected, prediction.Features);
}
/// <summary>
/// Verifies word-unigram featurization (no char n-grams, no normalization):
/// tokens are lower-cased and features are raw unigram counts.
/// </summary>
public void TextFeaturizerWithWordFeatureExtractorTest()
{
    var samples = new[]
    {
        new TestClass() { A = "This is some text in english", OutputTokens = null },
        new TestClass() { A = "This is another example", OutputTokens = null }
    };
    var dataView = ML.Data.LoadFromEnumerable(samples);

    var options = new TextFeaturizingEstimator.Options()
    {
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
        CharFeatureExtractor = null,
        Norm = TextFeaturizingEstimator.NormFunction.None,
        OutputTokensColumnName = "OutputTokens"
    };
    var engine = ML.Transforms.Text.FeaturizeText("Features", options, "A")
        .Fit(dataView)
        .CreatePredictionEngine <TestClass, TestClass>(ML);

    // Row 0: six words, all distinct → six 1s; the two trailing slots belong to row 1's vocabulary.
    var prediction = engine.Predict(samples[0]);
    Assert.Equal(samples[0].A.ToLower(), string.Join(" ", prediction.OutputTokens));
    var expected = new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f };
    Assert.Equal(expected, prediction.Features);

    // Row 1 shares "this"/"is" with row 0 and adds two new words.
    prediction = engine.Predict(samples[1]);
    Assert.Equal(samples[1].A.ToLower(), string.Join(" ", prediction.OutputTokens));
    expected = new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f };
    Assert.Equal(expected, prediction.Features);
}
/// <summary>
/// Sample: featurizes a small set of sentences with 'FeaturizeText' and prints the
/// resulting feature-vector length, the first ten feature values, and the tokens.
/// </summary>
public static void Example()
{
    // The MLContext is the entry point for all ML.NET operations (also the
    // source of randomness and exception/log tracking).
    var mlContext = new MLContext();

    // In-memory training samples.
    var samples = new List <TextData>()
    {
        new TextData() { Text = "ML.NET's FeaturizeText API uses a composition of several basic transforms to convert text into numeric features." },
        new TextData() { Text = "This API can be used as a featurizer to perform text classification." },
        new TextData() { Text = "There are a number of approaches to text classification." },
        new TextData() { Text = "One of the simplest and most common approaches is called “Bag of Words”." },
        new TextData() { Text = "Text classification can be used for a wide variety of tasks" },
        new TextData() { Text = "such as sentiment analysis, topic detection, intent identification etc." },
    };
    var dataview = mlContext.Data.LoadFromEnumerable(samples);

    // 'FeaturizeText' instantiates a 'TextFeaturizingEstimator'; the length of the
    // output feature vector depends on these settings.
    var options = new TextFeaturizingEstimator.Options()
    {
        // Emit the tokenized words alongside the numeric features.
        OutputTokensColumnName = "OutputTokens",
        CaseMode = TextNormalizingEstimator.CaseMode.Lower,
        // Remove English stop words with the built-in remover.
        StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options()
        {
            Language = TextFeaturizingEstimator.Language.English
        },
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true },
        CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
    };
    var textPipeline = mlContext.Transforms.Text.FeaturizeText("Features", options, "Text");

    // Fit the pipeline and build a prediction engine for single-row featurization.
    var textTransformer = textPipeline.Fit(dataview);
    var predictionEngine = mlContext.Model.CreatePredictionEngine <TextData, TransformedTextData>(textTransformer);

    // Featurize the first sample and report the results.
    var prediction = predictionEngine.Predict(samples[0]);
    Console.WriteLine($"Number of Features: {prediction.Features.Length}");

    Console.Write("Features: ");
    for (int i = 0; i < 10; i++)
    {
        Console.Write($"{prediction.Features[i]:F4} ");
    }
    Console.WriteLine($"\nTokens: {string.Join(",", prediction.OutputTokens)}");

    // Expected output:
    //   Number of Features: 282
    //   Features: 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.1881 ...
    //   Tokens: ml.net's,featurizetext,api,uses,composition,basic,transforms,convert,text,numeric,features.
}
/// <summary>
/// Featurizes every post from the repository (word bigrams + char trigrams, English
/// stop words removed) and collects a Feature per post with its values and tokens.
/// Persistence is currently disabled (see commented block at the end).
/// </summary>
public void MakeData()
{
    var posts = this.repository.Posts();
    var dataView = this.mlContext.Data.LoadFromEnumerable(posts);

    // 'FeaturizeText' instantiates a 'TextFeaturizingEstimator'; the length of the
    // output feature vector depends on these settings.
    var options = new TextFeaturizingEstimator.Options()
    {
        // Also output tokenized words
        OutputTokensColumnName = "OutputTokens",
        CaseMode = TextNormalizingEstimator.CaseMode.Lower,
        // Use ML.NET's built-in stop word remover
        StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options()
        {
            Language = TextFeaturizingEstimator.Language.English
        },
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true },
        CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
    };
    var textPipeline = this.mlContext.Transforms.Text.FeaturizeText("Features", options, "Text");

    // Fit the pipeline and build a prediction engine for single-row featurization.
    var textTransformer = textPipeline.Fit(dataView);
    var predictionEngine = this.mlContext.Model.CreatePredictionEngine <TextData, TransformedTextData>(textTransformer);

    var features = new BlockingCollection <Feature>();

    // FIX: PredictionEngine is NOT thread-safe, but the original called Predict
    // concurrently from Parallel.ForEach. Serialize access to the engine with a
    // lock; building the Feature objects still runs in parallel.
    var engineGate = new object();
    Parallel.ForEach(posts, p =>
    {
        try
        {
            Console.WriteLine($"Now predicting {p.Id}");
            TransformedTextData postPrediction;
            lock (engineGate)
            {
                postPrediction = predictionEngine.Predict(new TextData { PostId = p.Id, Text = p.Body });
            }
            var feature = new Feature();
            postPrediction.Features.Each(pf => feature.Values.Add(new FeatureValue { Value = pf }));
            postPrediction.OutputTokens.Each(ot => feature.Tokens.Add(new FeatureToken { Token = ot }));
            features.Add(feature);
        }
        catch (Exception e)
        {
            this.logger.LogError(e, e.Message);
        }
    });

    // Persistence intentionally disabled; re-enable when the data context is ready.
    //using (var context = new SEJapaneseDataContext())
    //{
    //    context.Features.AddRange(features.ToList());
    //    context.SaveChanges();
    //    // context.BulkInsert(features.ToList());
    //}
}