Пример #1
0
        /// <summary>
        /// Builds a pipeline that featurizes the tweet text columns and trains an SDCA
        /// logistic-regression binary classifier on the result, then fits it.
        /// </summary>
        /// <param name="mlContext">Context used to create the featurizer and the trainer.</param>
        /// <param name="splitTrainSet">Training data the pipeline is fitted on.</param>
        /// <returns>The fitted model (featurizer followed by the trained classifier).</returns>
        public static ITransformer BuildAndTrainModel(MLContext mlContext, IDataView splitTrainSet)
        {
            // The option values below were taken from the docs.microsoft.com tutorial.
            var englishStopWords = new StopWordsRemovingEstimator.Options()
            {
                Language = TextFeaturizingEstimator.Language.English
            };
            // Word uni- and bi-grams (all lengths up to 2).
            var wordNgrams = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true };
            // Character tri-grams only.
            var charNgrams = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false };

            var featurizerOptions = new TextFeaturizingEstimator.Options()
            {
                // Also expose the tokenized words in their own column.
                OutputTokensColumnName  = "OutputTokens",
                CaseMode                = TextNormalizingEstimator.CaseMode.Lower,
                // Use ML.NET's built-in stop word remover.
                StopWordsRemoverOptions = englishStopWords,
                WordFeatureExtractor    = wordNgrams,
                CharFeatureExtractor    = charNgrams,
            };

            // Both text columns are combined into the single "Features" vector.
            var textColumns = new[] { nameof(TweetData.Text), nameof(TweetData.ReplyToText) };
            var featurizer  = mlContext.Transforms.Text.FeaturizeText("Features", featurizerOptions, textColumns);
            var trainer     = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(
                labelColumnName: "Label",
                featureColumnName: "Features");
            var trainingPipeline = featurizer.Append(trainer);

            Console.WriteLine("=============== Creating and Training the Model ===============");
            var trainedModel = trainingPipeline.Fit(splitTrainSet);

            Console.WriteLine("======================= End of training =======================");
            Console.WriteLine();
            return trainedModel;
        }
        /// <summary>
        /// Verifies that featurizing with a default-constructed stop-word remover drops the
        /// stop words from the emitted "OutputTokens" column.
        /// </summary>
        public void TextFeaturizerWithPredefinedStopWordRemoverTest()
        {
            var samples = new[]
            {
                new TestClass() { A = "This is some text with english stop words", OutputTokens = null },
                new TestClass() { A = "No stop words", OutputTokens = null }
            };
            var input = ML.Data.LoadFromEnumerable(samples);

            var featurizerOptions = new TextFeaturizingEstimator.Options()
            {
                StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),
                OutputTokensColumnName  = "OutputTokens"
            };

            // Fit on the two rows above and wrap the fitted model in a prediction engine.
            var fitted = ML.Transforms.Text.FeaturizeText("Features", featurizerOptions, "A").Fit(input);
            var engine = fitted.CreatePredictionEngine <TestClass, TestClass>(ML);

            var first = engine.Predict(samples[0]);
            Assert.Equal("text english stop words", string.Join(" ", first.OutputTokens));

            var second = engine.Predict(samples[1]);
            Assert.Equal("stop words", string.Join(" ", second.OutputTokens));
        }
        /// <summary>
        /// Featurizes the first two rows of <paramref name="data"/> with the requested
        /// punctuation setting and checks the emitted tokens.
        /// </summary>
        /// <param name="dataView">Data the featurizer is fitted on.</param>
        /// <param name="data">Rows to predict on; only indices 0 and 1 are used.</param>
        /// <param name="keepPunctuations">
        /// When true the tokens must reproduce the input text unchanged; when false the first row
        /// is expected with its punctuation stripped. The second row is expected unchanged in
        /// both modes.
        /// </param>
        private void TestKeepPunctuations(IDataView dataView, TestClass[] data, bool keepPunctuations)
        {
            var options = new TextFeaturizingEstimator.Options()
            {
                KeepPunctuations       = keepPunctuations,
                CaseMode               = TextNormalizingEstimator.CaseMode.None,
                OutputTokensColumnName = "OutputTokens"
            };
            var pipeline    = ML.Transforms.Text.FeaturizeText("Features", options, "A");
            var model       = pipeline.Fit(dataView);
            var engine      = model.CreatePredictionEngine <TestClass, TestClass>(ML);
            var prediction1 = engine.Predict(data[0]);
            var prediction2 = engine.Predict(data[1]);

            // Inside a character class '|' is a literal pipe, not alternation, so the set lists
            // only the punctuation characters themselves: , _ ' " ; .
            var expected1 = keepPunctuations
                ? data[0].A
                : Regex.Replace(data[0].A, "[,_'\";.]", "");

            Assert.Equal(expected1, string.Join(" ", prediction1.OutputTokens));
            Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputTokens));
        }
        /// <summary>
        /// Featurizes the first two rows of <paramref name="data"/> with the requested
        /// diacritics setting and checks the emitted tokens.
        /// </summary>
        /// <param name="dataView">Data the featurizer is fitted on.</param>
        /// <param name="data">Rows to predict on; only indices 0 and 1 are used.</param>
        /// <param name="keepDiacritics">
        /// When true the first row's tokens must equal its input text; when false they must equal
        /// the fixed diacritic-free sentence. The second row is expected unchanged in both modes.
        /// </param>
        private void TestKeepDiacritics(IDataView dataView, TestClass[] data, bool keepDiacritics)
        {
            var featurizerOptions = new TextFeaturizingEstimator.Options()
            {
                KeepDiacritics         = keepDiacritics,
                CaseMode               = TextNormalizingEstimator.CaseMode.None,
                OutputTokensColumnName = "OutputTokens"
            };

            var fitted = ML.Transforms.Text.FeaturizeText("Features", featurizerOptions, "A").Fit(dataView);
            var engine = fitted.CreatePredictionEngine <TestClass, TestClass>(ML);
            var first  = engine.Predict(data[0]);
            var second = engine.Predict(data[1]);

            // Only the expectation for the first row depends on the flag.
            string expectedFirst;
            if (!keepDiacritics)
            {
                expectedFirst = "This is some text with diacritics";
            }
            else
            {
                expectedFirst = data[0].A;
            }

            Assert.Equal(expectedFirst, string.Join(" ", first.OutputTokens));
            Assert.Equal(data[1].A, string.Join(" ", second.OutputTokens));
        }
        /// <summary>
        /// Featurizes the first two rows of <paramref name="data"/> with the requested case mode
        /// and checks that the emitted tokens match the correspondingly cased input text.
        /// </summary>
        /// <param name="dataView">Data the featurizer is fitted on.</param>
        /// <param name="data">Rows to predict on; only indices 0 and 1 are used.</param>
        /// <param name="caseMode">Case normalization to apply and to verify.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="caseMode"/> is not Upper, Lower or None.
        /// </exception>
        private void TestCaseMode(IDataView dataView, TestClass[] data, TextNormalizingEstimator.CaseMode caseMode)
        {
            var options = new TextFeaturizingEstimator.Options()
            {
                CaseMode = caseMode,
                OutputTokensColumnName = "OutputTokens"
            };
            var pipeline    = ML.Transforms.Text.FeaturizeText("Features", options, "A");
            var model       = pipeline.Fit(dataView);
            var engine      = model.CreatePredictionEngine <TestClass, TestClass>(ML);
            var prediction1 = engine.Predict(data[0]);
            var prediction2 = engine.Predict(data[1]);

            string expected1;
            string expected2;

            // Culture-invariant casing keeps the expectation independent of the machine's
            // current culture (for example the Turkish dotted/dotless 'i').
            switch (caseMode)
            {
                case TextNormalizingEstimator.CaseMode.Upper:
                    expected1 = data[0].A.ToUpperInvariant();
                    expected2 = data[1].A.ToUpperInvariant();
                    break;

                case TextNormalizingEstimator.CaseMode.Lower:
                    expected1 = data[0].A.ToLowerInvariant();
                    expected2 = data[1].A.ToLowerInvariant();
                    break;

                case TextNormalizingEstimator.CaseMode.None:
                    expected1 = data[0].A;
                    expected2 = data[1].A;
                    break;

                default:
                    // Fail loudly instead of comparing against a null expectation.
                    throw new ArgumentOutOfRangeException(nameof(caseMode), caseMode, "Unsupported case mode.");
            }

            Assert.Equal(expected1, string.Join(" ", prediction1.OutputTokens));
            Assert.Equal(expected2, string.Join(" ", prediction2.OutputTokens));
        }
Пример #6
0
        /// <summary>
        /// Trains an SDCA logistic-regression sentiment model on the
        /// "wikipedia-detox-250-line-data.tsv" test file and returns a scorer for it.
        /// </summary>
        /// <returns>
        /// A scorer built from the trained predictor, bound to the featurized (non-cached) data.
        /// </returns>
        private static IDataScorerTransform _TrainSentiment()
        {
            bool applyL2Norm = true;

            // Tab-separated file with a header row: column 0 is the label, column 1 the text.
            var loaderOptions = new TextLoader.Options()
            {
                Separators = new[] { '\t' },
                HasHeader  = true,
                Columns    = new[]
                {
                    new TextLoader.Column("Label", DataKind.Boolean, 0),
                    new TextLoader.Column("SentimentText", DataKind.String, 1)
                }
            };

            var charTrigrams = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false };
            var wordBigrams  = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true };

            var featurizerOptions = new TextFeaturizingEstimator.Options()
            {
                KeepDiacritics         = false,
                KeepPunctuations       = false,
                CaseMode               = TextNormalizingEstimator.CaseMode.Lower,
                OutputTokensColumnName = "tokens",
                Norm = applyL2Norm
                    ? TextFeaturizingEstimator.NormFunction.L2
                    : TextFeaturizingEstimator.NormFunction.None,
                CharFeatureExtractor = charTrigrams,
                WordFeatureExtractor = wordBigrams,
            };

            var trainFilename = FileHelper.GetTestFile("wikipedia-detox-250-line-data.tsv");

            // The environment was once wrapped in a using statement; it is no longer disposed here.
            var env = EnvHelper.NewTestEnvironment(seed: 1, conc: 1);

            // Pipeline: load the file, then featurize the text.
            var source     = new MultiFileSource(trainFilename);
            var loaded     = new TextLoader(env, loaderOptions).Load(source);
            var featurized = TextFeaturizingEstimator.Create(env, featurizerOptions, loaded);

            // Train on a cached copy of the featurized data.
            var trainerOptions = new SdcaLogisticRegressionBinaryTrainer.Options
            {
                LabelColumnName   = "Label",
                FeatureColumnName = "Features"
            };
            var trainer = new SdcaLogisticRegressionBinaryTrainer(env, trainerOptions);

            var cachedView = new Microsoft.ML.Data.CacheDataView(env, featurized, prefetch: null);
            var predictor  = trainer.Fit(cachedView);

            // The training schema comes from the cached view; scoring runs over the uncached data.
            var trainRoles = new RoleMappedData(cachedView, label: "Label", feature: "Features");
            var scoreRoles = new RoleMappedData(featurized, label: "Label", feature: "Features");
            return ScoreUtils.GetScorer(predictor.Model, scoreRoles, env, trainRoles.Schema);
        }
Пример #7
0
        /// <summary>
        /// Verifies word uni-gram featurization when the featurizer is given only the column name
        /// "Features", which is also the name of the text field on the input rows.
        /// </summary>
        public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames()
        {
            var samples = new[]
            {
                new TestClass2() { Features = "This is some text in english", OutputTokens = null },
                new TestClass2() { Features = "This is another example", OutputTokens = null }
            };
            var input = ML.Data.LoadFromEnumerable(samples);

            var featurizerOptions = new TextFeaturizingEstimator.Options()
            {
                WordFeatureExtractor   = new WordBagEstimator.Options() { NgramLength = 1 },
                CharFeatureExtractor   = null,
                Norm                   = TextFeaturizingEstimator.NormFunction.None,
                OutputTokensColumnName = "OutputTokens"
            };

            var featurizer = ML.Transforms.Text.FeaturizeText("Features", featurizerOptions);
            var output     = featurizer.Fit(input).Transform(input);

            // One expected vector per input row, compared slot by slot below.
            var expected = new float[][]
            {
                new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
                new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
            };
            var actual = new float[2][];

            VBuffer <float> buffer = default;
            using (var cursor = output.GetRowCursor(output.Schema))
            {
                var readFeatures = cursor.GetGetter <VBuffer <float> >(cursor.Schema["Features"]);
                for (var row = 0; cursor.MoveNext(); row++)
                {
                    readFeatures(ref buffer);
                    // Copy out of the reused buffer before advancing to the next row.
                    actual[row] = buffer.DenseValues().ToArray();
                }
            }

            Assert.Equal(expected[0], actual[0]);
            Assert.Equal(expected[1], actual[1]);
        }
Пример #8
0
        /// <summary>
        /// Writes a console table with the n-gram settings of the word and character feature
        /// extractors configured on <paramref name="tfo"/>.
        /// </summary>
        /// <param name="tfo">Text featurizing options to print.</param>
        /// <exception cref="ArgumentNullException"><paramref name="tfo"/> is null.</exception>
        public static void PrintTextFeauturizingOptions(TextFeaturizingEstimator.Options tfo)
        {
            if (tfo == null)
            {
                throw new ArgumentNullException(nameof(tfo));
            }

            ConsoleHelper.Write(ConsoleColor.White, "Parametre for Text-Feauturizing");
            var table = new ConsoleTable("", "NGram-længde", "SkipLength", "Use All Length", "Weighting");

            table.Options.EnableCount = false;

            AddExtractorRow(table, "Word", tfo.WordFeatureExtractor);
            AddExtractorRow(table, "Char", tfo.CharFeatureExtractor);
            table.Write();
            Console.WriteLine("");
        }

        /// <summary>
        /// Adds one row describing <paramref name="extractor"/>, or a placeholder row when the
        /// extractor is not set.
        /// </summary>
        private static void AddExtractorRow(ConsoleTable table, string label, WordBagEstimator.Options extractor)
        {
            // An extractor may legitimately be null (callers assign null to switch one off);
            // print dashes instead of dereferencing it.
            if (extractor == null)
            {
                table.AddRow(label, "-", "-", "-", "-");
                return;
            }

            table.AddRow(label, extractor.NgramLength, extractor.SkipLength,
                         extractor.UseAllLengths, extractor.Weighting);
        }
Пример #9
0
        /// <summary>
        /// Computes TF-IDF weighted word features for three tiny documents and prints, for every
        /// document, the weight of each vocabulary word. Waits for Enter before returning.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var documents = new List <Text>
            {
                new Text { Data = "apple apple orange grape" },
                new Text { Data = "grape apple melon" },
                new Text { Data = "grape banana melon" }
            };

            var ml    = new MLContext();
            var input = ml.Data.LoadFromEnumerable(documents);

            // Word bag with TF-IDF weighting; character n-grams are switched off.
            var tfIdfWordBag = new WordBagEstimator.Options()
            {
                Weighting = NgramExtractingEstimator.WeightingCriteria.TfIdf
            };
            var featurizerOptions = new TextFeaturizingEstimator.Options
            {
                KeepDiacritics          = false,
                KeepPunctuations        = false,
                KeepNumbers             = false,
                StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(),
                WordFeatureExtractor    = tfIdfWordBag,
                CharFeatureExtractor    = null
            };

            var featurizer  = ml.Transforms.Text.FeaturizeText("TfIDFWeights", options: featurizerOptions, inputColumnNames: "Data");
            var transformed = featurizer.Fit(input).Transform(input);
            var weightRows  = transformed.GetColumn <VBuffer <float> >("TfIDFWeights");

            // The slot names of the output column are the vocabulary words, in slot order.
            VBuffer <ReadOnlyMemory <char> > slotNames = default;
            transformed.Schema["TfIDFWeights"].GetSlotNames(slotNames: ref slotNames);
            var vocabulary = slotNames.DenseValues().ToArray();

            var doc = 0;
            foreach (var weights in weightRows)
            {
                var slot = 0;
                foreach (var weight in weights.DenseValues())
                {
                    Console.WriteLine($"doc:{doc} word '{vocabulary[slot]}' {weight}");
                    slot++;
                }
                doc++;
            }
            Console.ReadLine();
        }
        /// <summary>
        /// Verifies the L2-normalized feature vectors and the output tokens produced when the
        /// character extractor is limited to uni-grams.
        /// </summary>
        public void TextFeaturizerWithL2NormTest()
        {
            var samples = new[]
            {
                new TestClass() { A = "abc xyz", OutputTokens = null },
                new TestClass() { A = "xyz", OutputTokens = null }
            };
            var input = ML.Data.LoadFromEnumerable(samples);

            var featurizerOptions = new TextFeaturizingEstimator.Options()
            {
                CharFeatureExtractor   = new WordBagEstimator.Options() { NgramLength = 1 },
                Norm                   = TextFeaturizingEstimator.NormFunction.L2,
                OutputTokensColumnName = "OutputTokens"
            };

            var fitted = ML.Transforms.Text.FeaturizeText("Features", featurizerOptions, "A").Fit(input);
            var engine = fitted.CreatePredictionEngine <TestClass, TestClass>(ML);

            // Row 0: nine slots at 1/3 followed by two slots at 1/sqrt(2).
            const float third       = 0.333333343f;
            const float invSqrtTwo  = 0.707106769f;
            var         firstResult = engine.Predict(samples[0]);

            Assert.Equal(samples[0].A, string.Join(" ", firstResult.OutputTokens));
            var expectedFirst = new float[]
            {
                third, third, third, third, third, third, third, third, third, invSqrtTwo, invSqrtTwo
            };
            Assert.Equal(expectedFirst, firstResult.Features);

            // Row 1: five slots at 1/sqrt(5), zeros elsewhere, and a final slot at 1.
            const float invSqrtFive  = 0.4472136f;
            var         secondResult = engine.Predict(samples[1]);

            Assert.Equal(samples[1].A, string.Join(" ", secondResult.OutputTokens));
            var expectedSecond = new float[]
            {
                invSqrtFive, 0.0f, 0.0f, 0.0f, 0.0f, invSqrtFive, invSqrtFive, invSqrtFive, invSqrtFive, 0.0f, 1.0f
            };
            Assert.Equal(expectedSecond, secondResult.Features);
        }
        /// <summary>
        /// Verifies word uni-gram featurization without normalization: the output tokens must be
        /// the lower-cased input text and the feature vector must flag the words present.
        /// </summary>
        public void TextFeaturizerWithWordFeatureExtractorTest()
        {
            var data = new[] { new TestClass()
                               {
                                   A = "This is some text in english", OutputTokens = null
                               },
                               new TestClass()
                               {
                                   A = "This is another example", OutputTokens = null
                               } };
            var dataView = ML.Data.LoadFromEnumerable(data);

            var options = new TextFeaturizingEstimator.Options()
            {
                WordFeatureExtractor = new WordBagEstimator.Options()
                {
                    NgramLength = 1
                },
                CharFeatureExtractor = null,
                Norm = TextFeaturizingEstimator.NormFunction.None,
                OutputTokensColumnName = "OutputTokens"
            };
            var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
            var model    = pipeline.Fit(dataView);
            var engine   = model.CreatePredictionEngine <TestClass, TestClass>(ML);

            var prediction = engine.Predict(data[0]);

            // Culture-invariant lower-casing keeps the expectation independent of the machine's
            // current culture (for example the Turkish dotted/dotless 'i').
            Assert.Equal(data[0].A.ToLowerInvariant(), string.Join(" ", prediction.OutputTokens));
            var expected = new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f };

            Assert.Equal(expected, prediction.Features);

            prediction = engine.Predict(data[1]);
            Assert.Equal(data[1].A.ToLowerInvariant(), string.Join(" ", prediction.OutputTokens));
            expected = new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f };
            Assert.Equal(expected, prediction.Features);
        }
        /// <summary>
        /// Demonstrates FeaturizeText with custom options: fits the featurizer on six sample
        /// sentences, featurizes the first one and prints the feature count, the first ten
        /// feature values and the output tokens.
        /// </summary>
        public static void Example()
        {
            // The ML context is the entry point for all ML.NET operations; it also serves for
            // exception tracking, logging and as the source of randomness.
            var mlContext = new MLContext();

            // Small in-memory dataset.
            var samples = new List <TextData>()
            {
                new TextData() { Text = "ML.NET's FeaturizeText API uses a composition of several basic transforms to convert text into numeric features." },
                new TextData() { Text = "This API can be used as a featurizer to perform text classification." },
                new TextData() { Text = "There are a number of approaches to text classification." },
                new TextData() { Text = "One of the simplest and most common approaches is called “Bag of Words”." },
                new TextData() { Text = "Text classification can be used for a wide variety of tasks" },
                new TextData() { Text = "such as sentiment analysis, topic detection, intent identification etc." },
            };

            // Expose the samples as an IDataView for fitting.
            var dataview = mlContext.Data.LoadFromEnumerable(samples);

            // Featurizer settings. The length of the resulting feature vector depends on them.
            var englishStopWords = new StopWordsRemovingEstimator.Options()
            {
                Language = TextFeaturizingEstimator.Language.English
            };
            var wordNgrams = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true };
            var charNgrams = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false };

            var featurizerOptions = new TextFeaturizingEstimator.Options()
            {
                // Also output the tokenized words.
                OutputTokensColumnName  = "OutputTokens",
                CaseMode                = TextNormalizingEstimator.CaseMode.Lower,
                // Use ML.NET's built-in stop word remover.
                StopWordsRemoverOptions = englishStopWords,
                WordFeatureExtractor    = wordNgrams,
                CharFeatureExtractor    = charNgrams,
            };

            // Build the text-to-features pipeline and fit it to the data.
            var featurizer = mlContext.Transforms.Text.FeaturizeText("Features", featurizerOptions, "Text");
            var fitted     = featurizer.Fit(dataview);

            // A prediction engine applies the fitted transform to a single row.
            var engine      = mlContext.Model.CreatePredictionEngine <TextData, TransformedTextData>(fitted);
            var transformed = engine.Predict(samples[0]);

            // Length of the feature vector.
            Console.WriteLine($"Number of Features: {transformed.Features.Length}");

            // First ten feature values, then the tokens.
            Console.Write("Features: ");
            var shown = 0;
            while (shown < 10)
            {
                Console.Write($"{transformed.Features[shown]:F4}  ");
                shown++;
            }

            Console.WriteLine($"\nTokens: {string.Join(",", transformed.OutputTokens)}");

            //  Expected output:
            //   Number of Features: 282
            //   Features: 0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.1881 ...
            //   Tokens: ml.net's,featurizetext,api,uses,composition,basic,transforms,convert,text,numeric,features.
        }
Пример #13
0
        /// <summary>
        /// Fits a text featurizer on all posts from the repository, then featurizes every post in
        /// parallel and collects the resulting feature values and tokens.
        /// </summary>
        /// <remarks>
        /// A failure on an individual post is logged and that post is skipped. The collected
        /// features are not persisted yet; see the TODO at the end of the method.
        /// </remarks>
        public void MakeData()
        {
            var posts    = this.repository.Posts();
            var dataView = this.mlContext.Data.LoadFromEnumerable(posts);

            // A pipeline for converting text into numeric features.
            // The length of the output feature vector depends on these settings.
            var options = new TextFeaturizingEstimator.Options()
            {
                // Also output tokenized words
                OutputTokensColumnName = "OutputTokens",
                CaseMode = TextNormalizingEstimator.CaseMode.Lower,
                // Use ML.NET's built-in stop word remover
                StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options()
                {
                    Language = TextFeaturizingEstimator.Language.English
                },
                WordFeatureExtractor = new WordBagEstimator.Options()
                {
                    NgramLength = 2, UseAllLengths = true
                },
                CharFeatureExtractor = new WordBagEstimator.Options()
                {
                    NgramLength = 3, UseAllLengths = false
                },
            };
            var textPipeline = this.mlContext.Transforms.Text.FeaturizeText("Features", options, "Text");

            // Fit to data.
            var textTransformer = textPipeline.Fit(dataView);

            using (var features = new BlockingCollection <Feature>())
            {
                Parallel.ForEach(
                    posts,
                    // PredictionEngine is not thread-safe, so a single shared instance must not be
                    // called from several workers at once. Each worker creates its own engine.
                    () => this.mlContext.Model.CreatePredictionEngine <TextData, TransformedTextData>(textTransformer),
                    (p, loopState, engine) =>
                    {
                        try
                        {
                            Console.WriteLine($"Now predicting {p.Id}");
                            var postPrediction = engine.Predict(new TextData
                            {
                                PostId = p.Id,
                                Text   = p.Body
                            });
                            var feature = new Feature();
                            postPrediction.Features.Each(pf => feature.Values.Add(new FeatureValue {
                                Value = pf
                            }));
                            postPrediction.OutputTokens.Each(ot => feature.Tokens.Add(new FeatureToken {
                                Token = ot
                            }));
                            features.Add(feature);
                        }
                        catch (Exception e)
                        {
                            // Use a constant message template: passing e.Message as the template
                            // breaks formatting whenever the message itself contains braces.
                            this.logger.LogError(e, "Failed to featurize post {PostId}: {Message}", p.Id, e.Message);
                        }

                        // Hand the engine back so the same worker reuses it for its next post.
                        return engine;
                    },
                    // Release the per-worker engine if it is disposable. The cast goes through
                    // object so this compiles whether or not the engine type declares IDisposable.
                    engine => ((object)engine as IDisposable)?.Dispose());

                // TODO: the collected features are currently discarded. Persisting them was
                // sketched as:
                //using (var context = new SEJapaneseDataContext())
                //{
                //    context.Features.AddRange(features.ToList());
                //    context.SaveChanges();
                //    //  context.BulkInsert(features.ToList());
                //}
            }
        }