/// <summary>
/// Creates a reconciler that remembers the text normalization settings it is given.
/// </summary>
/// <param name="textCase">Case mode stored in <c>_textCase</c>.</param>
/// <param name="keepDiacritics">Flag stored in <c>_keepDiacritics</c>.</param>
/// <param name="keepPunctuations">Flag stored in <c>_keepPunctuations</c>.</param>
/// <param name="keepNumbers">Flag stored in <c>_keepNumbers</c>.</param>
public Reconciler(TextNormalizingEstimator.CaseMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers)
 {
     // Copy every setting into its backing field in one assignment.
     (_textCase, _keepDiacritics, _keepPunctuations, _keepNumbers) =
         (textCase, keepDiacritics, keepPunctuations, keepNumbers);
 }
        /// <summary>
        /// Featurizes column "A" with the requested case mode and verifies that the produced
        /// output tokens, joined with single spaces, equal the input text cased the same way.
        /// </summary>
        /// <param name="dataView">Data used to fit the featurizing pipeline.</param>
        /// <param name="data">Samples to predict on; the first two elements are checked.</param>
        /// <param name="caseMode">Case mode passed to the featurizer options.</param>
        private void TestCaseMode(IDataView dataView, TestClass[] data, TextNormalizingEstimator.CaseMode caseMode)
        {
            var options = new TextFeaturizingEstimator.Options()
            {
                CaseMode = caseMode,
                OutputTokensColumnName = "OutputTokens"
            };
            var pipeline    = ML.Transforms.Text.FeaturizeText("Features", options, "A");
            var model       = pipeline.Fit(dataView);
            var engine      = model.CreatePredictionEngine<TestClass, TestClass>(ML);
            var prediction1 = engine.Predict(data[0]);
            var prediction2 = engine.Predict(data[1]);

            // Stay null for an unrecognized case mode so the assertions below fail.
            string expected1 = null;
            string expected2 = null;

            // Text normalization is documented as casing with invariant-culture rules, so the
            // expectation must not depend on the current culture of the machine running the test
            // (culture-sensitive ToUpper/ToLower differ for some cultures, e.g. the Turkish "i").
            switch (caseMode)
            {
                case TextNormalizingEstimator.CaseMode.Upper:
                    expected1 = data[0].A.ToUpperInvariant();
                    expected2 = data[1].A.ToUpperInvariant();
                    break;

                case TextNormalizingEstimator.CaseMode.Lower:
                    expected1 = data[0].A.ToLowerInvariant();
                    expected2 = data[1].A.ToLowerInvariant();
                    break;

                case TextNormalizingEstimator.CaseMode.None:
                    expected1 = data[0].A;
                    expected2 = data[1].A;
                    break;
            }

            Assert.Equal(expected1, string.Join(" ", prediction1.OutputTokens));
            Assert.Equal(expected2, string.Join(" ", prediction2.OutputTokens));
        }
Beispiel #3
0
 /// <summary>
 /// Creates a text normalizer by forwarding every setting to <c>InitializeTextNormalizer</c>.
 /// </summary>
 /// <param name="caseMode">Case mode to forward; defaults to <c>Lower</c>.</param>
 /// <param name="keepDiacritics">Forwarded unchanged; defaults to <c>false</c>.</param>
 /// <param name="keepPuncuations">Forwarded unchanged; defaults to <c>false</c>.</param>
 /// <param name="keepNumbers">Forwarded unchanged; defaults to <c>false</c>.</param>
 public TextNormalizer(TextNormalizingEstimator.CaseMode caseMode = TextNormalizingEstimator.CaseMode.Lower,
                       bool keepDiacritics  = false,
                       bool keepPuncuations = false,
                       bool keepNumbers     = false)
     => InitializeTextNormalizer(caseMode: caseMode,
                                 keepDiacritics: keepDiacritics,
                                 keepPuncuations: keepPuncuations,
                                 keepNumbers: keepNumbers);
Beispiel #4
0
        /// <summary>
        /// Sets up the ML context, an empty data view, a text-normalizing pipeline fitted on that
        /// empty view, and a prediction engine created from the fitted transformer.
        /// </summary>
        /// <param name="caseMode">Case mode handed to the normalizer; defaults to <c>Lower</c>.</param>
        /// <param name="keepDiacritics">Passed as the normalizer's <c>keepDiacritics</c> argument.</param>
        /// <param name="keepPuncuations">Passed as the normalizer's <c>keepPunctuations</c> argument.</param>
        /// <param name="keepNumbers">Passed as the normalizer's <c>keepNumbers</c> argument.</param>
        private void InitializeTextNormalizer(TextNormalizingEstimator.CaseMode caseMode = TextNormalizingEstimator.CaseMode.Lower,
                                              bool keepDiacritics  = false,
                                              bool keepPuncuations = false,
                                              bool keepNumbers     = false)
        {
            _mlContext = new MLContext();

            // The pipeline is fitted on a data view loaded from an empty sample list.
            _emptySamplesList = new List<TextData>();
            var dataCatalog = _mlContext.Data;
            _emptyDataView = dataCatalog.LoadFromEnumerable(_emptySamplesList);

            // Text normalizer configured with the column names "NormalizedText" and "Text".
            var textCatalog = _mlContext.Transforms.Text;
            _normTextPipeline = textCatalog.NormalizeText(
                "NormalizedText",
                "Text",
                caseMode,
                keepDiacritics: keepDiacritics,
                keepPunctuations: keepPuncuations,
                keepNumbers: keepNumbers);
            _normTextTransformer = _normTextPipeline.Fit(_emptyDataView);

            // Prediction engine mapping TextData to TransformedTextData through the fitted transformer.
            var modelCatalog = _mlContext.Model;
            _predictionEngine = modelCatalog.CreatePredictionEngine<TextData, TransformedTextData>(_normTextTransformer);
        }
 /// <summary>
 /// Normalizes the text of a column by changing its case and by removing diacritical marks,
 /// punctuation marks and/or numbers, depending on the supplied settings.
 /// </summary>
 /// <param name="input">The column to apply to.</param>
 /// <param name="caseMode">Casing text using the rules of the invariant culture. Defaults to <c>Lower</c>.</param>
 /// <param name="keepDiacritics">Whether to keep diacritical marks or remove them. Defaults to <c>false</c>.</param>
 /// <param name="keepPunctuations">Whether to keep punctuation marks or remove them. Defaults to <c>true</c>.</param>
 /// <param name="keepNumbers">Whether to keep numbers or remove them. Defaults to <c>true</c>.</param>
 /// <returns>A new <c>OutPipelineColumn</c> built from <paramref name="input"/> and the given settings.</returns>
 public static Scalar<string> NormalizeText(this Scalar<string> input,
                                            TextNormalizingEstimator.CaseMode caseMode = TextNormalizingEstimator.CaseMode.Lower,
                                            bool keepDiacritics   = false,
                                            bool keepPunctuations = true,
                                            bool keepNumbers      = true)
 {
     var normalizedColumn = new OutPipelineColumn(input, caseMode, keepDiacritics, keepPunctuations, keepNumbers);
     return normalizedColumn;
 }
 /// <summary>
 /// Creates the output column: a <c>Reconciler</c> configured with the given settings and the
 /// input column are handed to the base constructor, and the input column is kept in <c>Input</c>.
 /// </summary>
 /// <param name="input">Source column; passed to the base constructor and stored in <c>Input</c>.</param>
 /// <param name="textCase">Case mode given to the reconciler.</param>
 /// <param name="keepDiacritics">Flag given to the reconciler.</param>
 /// <param name="keepPunctuations">Flag given to the reconciler.</param>
 /// <param name="keepNumbers">Flag given to the reconciler.</param>
 public OutPipelineColumn(Scalar<string> input, TextNormalizingEstimator.CaseMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers)
     : base(
         new Reconciler(
             textCase: textCase,
             keepDiacritics: keepDiacritics,
             keepPunctuations: keepPunctuations,
             keepNumbers: keepNumbers),
         input)
     => Input = input;