/// <summary>
/// Deserialization constructor: restores the normalization settings from <paramref name="ctx"/>.
/// </summary>
/// <param name="host">Host forwarded to the base constructor.</param>
/// <param name="ctx">Model load context whose reader supplies the serialized settings.</param>
/// <param name="input">Input data view forwarded to the base constructor.</param>
private TextNormalizerTransform(IHost host, ModelLoadContext ctx, IDataView input)
            : base(host, ctx, input, TestIsTextItem)
        {
            Host.AssertValue(ctx);

            using (var channel = Host.Start("Deserialization"))
            {
                channel.AssertNonEmpty(Infos);

                // *** Binary format ***
                // <base>
                //   byte: case
                //   bool: whether to keep diacritics
                //   bool: whether to keep punctuations
                //   bool: whether to keep numbers
                var reader = ctx.Reader;

                // The case mode is stored as a single byte; reject values that do not
                // map to a declared CaseNormalizationMode member.
                _case = (CaseNormalizationMode)reader.ReadByte();
                channel.CheckDecode(Enum.IsDefined(typeof(CaseNormalizationMode), _case));

                // The three flags follow in exactly this order.
                _keepDiacritics = reader.ReadBoolByte();
                _keepPunctuations = reader.ReadBoolByte();
                _keepNumbers = reader.ReadBoolByte();

                channel.Done();
            }

            Metadata.Seal();
        }
예제 #2
0
 /// <summary>
 /// Stores the text normalization options on this reconciler.
 /// </summary>
 /// <param name="textCase">Case normalization mode to store.</param>
 /// <param name="keepDiacritics">Value stored as the keep-diacritics flag.</param>
 /// <param name="keepPunctuations">Value stored as the keep-punctuations flag.</param>
 /// <param name="keepNumbers">Value stored as the keep-numbers flag.</param>
 public Reconciler(
     CaseNormalizationMode textCase,
     bool keepDiacritics,
     bool keepPunctuations,
     bool keepNumbers)
 {
     // Casing option.
     _textCase = textCase;

     // Character-class retention flags.
     _keepNumbers = keepNumbers;
     _keepPunctuations = keepPunctuations;
     _keepDiacritics = keepDiacritics;
 }
예제 #3
0
            /// <summary>
            /// Builds the applier parameters from the settings and components held by <paramref name="parent"/>.
            /// </summary>
            /// <param name="parent">Transform whose host, advanced settings, extractors, stop-words remover and dictionary are read.</param>
            public TransformApplierParams(TextTransform parent)
            {
                var host = parent._host;
                var settings = parent.AdvancedSettings;

                // Both enum-typed settings must map to declared members.
                host.Check(Enum.IsDefined(typeof(Language), settings.TextLanguage));
                host.Check(Enum.IsDefined(typeof(CaseNormalizationMode), settings.TextCase));

                // Extractor factories are optional; a missing extractor yields a null factory.
                WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary);
                CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary);

                // Values copied from the advanced settings.
                Language = settings.TextLanguage;
                TextCase = settings.TextCase;
                KeepDiacritics = settings.KeepDiacritics;
                KeepPunctuations = settings.KeepPunctuations;
                KeepNumbers = settings.KeepNumbers;
                OutputTextTokens = settings.OutputTokens;
                VectorNormalizer = settings.VectorNormalizer;

                // Values copied directly from the parent.
                StopWordsRemover = parent._stopWordsRemover;
                Dictionary = parent._dictionary;
            }
        /// <summary>
        /// Creates the transform from user-supplied arguments.
        /// </summary>
        /// <param name="env">Host environment forwarded to the base constructor.</param>
        /// <param name="args">Arguments providing the columns, the case mode and the keep flags; checked for null via <c>Contracts.CheckRef</c>.</param>
        /// <param name="input">Input data view forwarded to the base constructor.</param>
        public TextNormalizerTransform(IHostEnvironment env, Arguments args, IDataView input)
            : base(env, RegistrationName, Contracts.CheckRef(args, nameof(args)).Column, input, TestIsTextItem)
        {
            Host.AssertNonEmpty(Infos);
            Host.Assert(Infos.Length == Utils.Size(args.Column));

            using (var ch = Host.Start("Construction"))
            {
                // Reject case modes that do not map to a declared enum member.
                ch.CheckUserArg(Enum.IsDefined(typeof(CaseNormalizationMode), args.TextCase),
                                nameof(args.TextCase), "Invalid case normalization mode");

                _case             = args.TextCase;
                _keepDiacritics   = args.KeepDiacritics;
                _keepPunctuations = args.KeepPunctuations;
                _keepNumbers      = args.KeepNumbers;

                // Mark the channel as completed before it is disposed, as the
                // deserialization constructor of this class does.
                ch.Done();
            }
            Metadata.Seal();
        }
 /// <summary>
 /// Validates <paramref name="args"/> and builds the applier parameters from them.
 /// </summary>
 /// <param name="host">Host used for the checks and for creating the extractor components.</param>
 /// <param name="args">Arguments supplying columns, extractors, language, casing and keep flags.</param>
 public TransformApplierParams(IHost host, Arguments args)
 {
     Contracts.AssertValue(host);

     // User-argument validation: columns are mandatory, and there must be something to produce.
     host.CheckUserArg(args.Column != null, nameof(args.Column), "Columns must be specified");
     host.CheckUserArg(
         args.WordFeatureExtractor != null || args.CharFeatureExtractor != null || args.OutputTokens,
         nameof(args.WordFeatureExtractor),
         "At least one feature extractor or OutputTokens must be specified.");

     // Both enum-typed arguments must map to declared members.
     host.Check(Enum.IsDefined(typeof(Language), args.Language));
     host.Check(Enum.IsDefined(typeof(CaseNormalizationMode), args.TextCase));

     // Extractor factories are optional; a missing extractor yields a null factory.
     WordExtractorFactory = args.WordFeatureExtractor?.CreateComponent(host, args.Dictionary);
     CharExtractorFactory = args.CharFeatureExtractor?.CreateComponent(host, args.Dictionary);

     // Normalization options.
     Language = args.Language;
     TextCase = args.TextCase;
     KeepDiacritics = args.KeepDiacritics;
     KeepPunctuations = args.KeepPunctuations;
     KeepNumbers = args.KeepNumbers;

     // Remaining pass-through values.
     VectorNormalizer = args.VectorNormalizer;
     StopWordsRemover = args.StopWordsRemover;
     OutputTextTokens = args.OutputTokens;
     Dictionary = args.Dictionary;
 }
예제 #6
0
 /// <summary>
 /// Normalizes the input text: optionally changes its case and removes diacritical marks,
 /// punctuation marks and/or numbers.
 /// </summary>
 /// <param name="input">The text column to normalize.</param>
 /// <param name="textCase">Casing text using the rules of the invariant culture. Defaults to <see cref="CaseNormalizationMode.Lower"/>.</param>
 /// <param name="keepDiacritics">Whether to keep diacritical marks or remove them. Defaults to <c>false</c>.</param>
 /// <param name="keepPunctuations">Whether to keep punctuation marks or remove them. Defaults to <c>true</c>.</param>
 /// <param name="keepNumbers">Whether to keep numbers or remove them. Defaults to <c>true</c>.</param>
 /// <returns>A new pipeline column built from <paramref name="input"/> and the given options.</returns>
 public static Scalar<string> NormalizeText(
     this Scalar<string> input,
     CaseNormalizationMode textCase = CaseNormalizationMode.Lower,
     bool keepDiacritics = false,
     bool keepPunctuations = true,
     bool keepNumbers = true)
 {
     return new OutPipelineColumn(input, textCase, keepDiacritics, keepPunctuations, keepNumbers);
 }
예제 #7
0
 /// <summary>
 /// Creates the output column, passing a <c>Reconciler</c> built from the normalization
 /// options to the base constructor and remembering the input column.
 /// </summary>
 /// <param name="input">The input text column; also stored in <c>Input</c>.</param>
 /// <param name="textCase">Case normalization mode passed to the reconciler.</param>
 /// <param name="keepDiacritics">Keep-diacritics flag passed to the reconciler.</param>
 /// <param name="keepPunctuations">Keep-punctuations flag passed to the reconciler.</param>
 /// <param name="keepNumbers">Keep-numbers flag passed to the reconciler.</param>
 public OutPipelineColumn(
     Scalar<string> input,
     CaseNormalizationMode textCase,
     bool keepDiacritics,
     bool keepPunctuations,
     bool keepNumbers)
     : base(
         new Reconciler(textCase, keepDiacritics, keepPunctuations, keepNumbers),
         input)
 {
     Input = input;
 }