예제 #1
0
        /// <summary>
        /// Rebuilds the text-normalization state held by this instance: a fresh
        /// <c>MLContext</c>, an empty sample list with its data view, the
        /// normalization estimator, the fitted transformer and the prediction engine.
        /// </summary>
        /// <param name="caseMode">Case handling passed through to <c>NormalizeText</c>.</param>
        /// <param name="keepDiacritics">Forwarded as the <c>keepDiacritics</c> argument.</param>
        /// <param name="keepPuncuations">
        /// Forwarded as the <c>keepPunctuations</c> argument. The parameter name is
        /// misspelled but is kept as-is so callers using named arguments still compile.
        /// </param>
        /// <param name="keepNumbers">Forwarded as the <c>keepNumbers</c> argument.</param>
        private void InitializeTextNormalizer(TextNormalizingEstimator.CaseMode caseMode = TextNormalizingEstimator.CaseMode.Lower,
                                              bool keepDiacritics  = false,
                                              bool keepPuncuations = false,
                                              bool keepNumbers     = false)
        {
            _mlContext = new MLContext();

            // The estimator is fitted against a data view built from an empty list.
            _emptySamplesList = new List<TextData>();
            _emptyDataView = _mlContext.Data.LoadFromEnumerable(_emptySamplesList);

            // Maps the "Text" column to the "NormalizedText" column.
            var textTransforms = _mlContext.Transforms.Text;
            _normTextPipeline = textTransforms.NormalizeText(
                "NormalizedText",
                "Text",
                caseMode,
                keepDiacritics: keepDiacritics,
                keepPunctuations: keepPuncuations,
                keepNumbers: keepNumbers);

            _normTextTransformer = _normTextPipeline.Fit(_emptyDataView);

            var modelOperations = _mlContext.Model;
            _predictionEngine = modelOperations.CreatePredictionEngine<TextData, TransformedTextData>(_normTextTransformer);
        }
        /// <summary>
        /// Workout for a pipeline that normalizes the "text" column, tokenizes it into
        /// "words" and removes stop words into "words_without_stopwords". The pipeline is
        /// passed to <c>TestEstimatorCore</c> with a valid and an invalid input, then a
        /// slice of its output is written to a TSV file and checked with
        /// <c>CheckEquality</c>.
        /// </summary>
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            string dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            // Valid input: column 0 loaded as a bool label, column 1 loaded as text.
            var validReader = TextLoader.CreateReader(
                Env,
                ctx => (label: ctx.LoadBool(0), text: ctx.LoadText(1)),
                hasHeader: true);
            var data = validReader.Read(dataPath);

            // Invalid input: same file, but column 1 is loaded as float instead of text.
            var invalidReader = TextLoader.CreateReader(
                Env,
                ctx => (label: ctx.LoadBool(0), text: ctx.LoadFloat(1)),
                hasHeader: true);
            var invalidData = invalidReader.Read(dataPath);

            var normalizer = new TextNormalizingEstimator(Env, "text");
            var tokenizer = new WordTokenizingEstimator(Env, "text", "words");
            var stopwordRemover = new StopwordRemover(Env, "words", "words_without_stopwords");
            var pipeline = normalizer.Append(tokenizer).Append(stopwordRemover);

            TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");

            using (var channel = Env.Start("save"))
            {
                var saverArgs = new TextSaver.Arguments { Silent = true };
                var saver = new TextSaver(Env, saverArgs);

                var fitted = pipeline.Fit(data.AsDynamic);
                var transformed = fitted.Transform(data.AsDynamic);

                // NOTE(review): the 4 is presumably a row count for the filter — confirm against TakeFilter.
                IDataView firstRows = TakeFilter.Create(Env, transformed, 4);
                IDataView keptColumns = ColumnSelectingTransformer.CreateKeep(
                    Env,
                    firstRows,
                    new[] { "text", "words_without_stopwords" });

                using (var stream = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(channel, saver, keptColumns, stream, keepHidden: true);
                }
            }

            // NOTE(review): presumably compares the written file with a stored baseline — confirm in the test base class.
            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }