/// <summary>
/// Deserialization constructor: restores the normalization settings from the reader of
/// <paramref name="ctx"/>.
/// </summary>
/// <param name="host">Host forwarded to the base constructor.</param>
/// <param name="ctx">Model load context whose reader supplies the serialized settings.</param>
/// <param name="input">Input data view forwarded to the base constructor.</param>
private TextNormalizerTransform(IHost host, ModelLoadContext ctx, IDataView input)
    : base(host, ctx, input, TestIsTextItem)
{
    Host.AssertValue(ctx);

    using (var ch = Host.Start("Deserialization"))
    {
        // *** Binary format ***
        // <base>
        // byte: case
        // bool: whether to keep diacritics
        // bool: whether to keep punctuations
        // bool: whether to keep numbers
        ch.AssertNonEmpty(Infos);

        var reader = ctx.Reader;

        // The case mode is stored as one byte; reject anything that is not a defined enum member.
        var caseMode = (CaseNormalizationMode)reader.ReadByte();
        ch.CheckDecode(Enum.IsDefined(typeof(CaseNormalizationMode), caseMode));
        _case = caseMode;

        // The three flags follow, in the order given by the binary format above.
        _keepDiacritics = reader.ReadBoolByte();
        _keepPunctuations = reader.ReadBoolByte();
        _keepNumbers = reader.ReadBoolByte();

        ch.Done();
    }

    Metadata.Seal();
}
/// <summary>
/// Stores the text normalization options in the reconciler's fields.
/// </summary>
/// <param name="textCase">Case normalization mode to store.</param>
/// <param name="keepDiacritics">Stored "keep diacritics" flag.</param>
/// <param name="keepPunctuations">Stored "keep punctuations" flag.</param>
/// <param name="keepNumbers">Stored "keep numbers" flag.</param>
public Reconciler(
    CaseNormalizationMode textCase,
    bool keepDiacritics,
    bool keepPunctuations,
    bool keepNumbers)
{
    _textCase = textCase;
    _keepDiacritics = keepDiacritics;
    _keepPunctuations = keepPunctuations;
    _keepNumbers = keepNumbers;
}
/// <summary>
/// Builds the applier parameters from an existing <see cref="TextTransform"/>, copying its
/// advanced settings, stop-words remover and dictionary, and creating the word/char
/// extractor components when the corresponding extractor is present on the parent.
/// </summary>
/// <param name="parent">Transform whose configuration is copied.</param>
public TransformApplierParams(TextTransform parent)
{
    var host = parent._host;
    var settings = parent.AdvancedSettings;
    var dictionary = parent._dictionary;

    // Validate both enum-valued settings before anything is created from them.
    host.Check(Enum.IsDefined(typeof(Language), settings.TextLanguage));
    host.Check(Enum.IsDefined(typeof(CaseNormalizationMode), settings.TextCase));

    // A null extractor on the parent yields a null factory here.
    WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, dictionary);
    CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, dictionary);

    // Values taken directly from the parent.
    StopWordsRemover = parent._stopWordsRemover;
    Dictionary = dictionary;

    // Values taken from the parent's advanced settings.
    VectorNormalizer = settings.VectorNormalizer;
    Language = settings.TextLanguage;
    TextCase = settings.TextCase;
    KeepDiacritics = settings.KeepDiacritics;
    KeepPunctuations = settings.KeepPunctuations;
    KeepNumbers = settings.KeepNumbers;
    OutputTextTokens = settings.OutputTokens;
}
/// <summary>
/// Public constructor: creates the transform from user-supplied <paramref name="args"/>.
/// </summary>
/// <param name="env">Host environment forwarded to the base constructor.</param>
/// <param name="args">
/// Arguments providing the columns and the normalization options. Checked for null via
/// <c>Contracts.CheckRef</c> before its columns are passed to the base constructor.
/// </param>
/// <param name="input">Input data view forwarded to the base constructor.</param>
public TextNormalizerTransform(IHostEnvironment env, Arguments args, IDataView input)
    : base(env, RegistrationName, Contracts.CheckRef(args, nameof(args)).Column, input, TestIsTextItem)
{
    Host.AssertNonEmpty(Infos);
    Host.Assert(Infos.Length == Utils.Size(args.Column));

    using (var ch = Host.Start("Construction"))
    {
        // Reject case modes that are not defined members of the enum.
        ch.CheckUserArg(Enum.IsDefined(typeof(CaseNormalizationMode), args.TextCase),
            nameof(args.TextCase), "Invalid case normalization mode");

        _case = args.TextCase;
        _keepDiacritics = args.KeepDiacritics;
        _keepPunctuations = args.KeepPunctuations;
        _keepNumbers = args.KeepNumbers;

        // Signal completion on the channel before it is disposed, matching the
        // deserialization constructor, which also calls Done() on its channel.
        ch.Done();
    }

    Metadata.Seal();
}
/// <summary>
/// Builds the applier parameters from user-supplied <paramref name="args"/>, validating
/// them and creating the word/char extractor components when the corresponding extractor
/// is specified.
/// </summary>
/// <param name="host">Host used for the checks and passed to the component factories.</param>
/// <param name="args">Arguments whose values are validated and copied.</param>
public TransformApplierParams(IHost host, Arguments args)
{
    Contracts.AssertValue(host);

    host.CheckUserArg(args.Column != null, nameof(args.Column), "Columns must be specified");

    // Something must be produced: word features, char features, or the raw tokens.
    bool producesOutput =
        args.WordFeatureExtractor != null
        || args.CharFeatureExtractor != null
        || args.OutputTokens;
    host.CheckUserArg(producesOutput, nameof(args.WordFeatureExtractor),
        "At least one feature extractor or OutputTokens must be specified.");

    // Both enum-valued arguments must be defined members of their enums.
    host.Check(Enum.IsDefined(typeof(Language), args.Language));
    host.Check(Enum.IsDefined(typeof(CaseNormalizationMode), args.TextCase));

    var dictionary = args.Dictionary;

    // An unspecified extractor yields a null factory.
    WordExtractorFactory = args.WordFeatureExtractor?.CreateComponent(host, dictionary);
    CharExtractorFactory = args.CharFeatureExtractor?.CreateComponent(host, dictionary);

    // Remaining values are copied from the arguments unchanged.
    VectorNormalizer = args.VectorNormalizer;
    Language = args.Language;
    StopWordsRemover = args.StopWordsRemover;
    TextCase = args.TextCase;
    KeepDiacritics = args.KeepDiacritics;
    KeepPunctuations = args.KeepPunctuations;
    KeepNumbers = args.KeepNumbers;
    OutputTextTokens = args.OutputTokens;
    Dictionary = dictionary;
}
/// <summary>
/// Normalizes the input text by changing its case and by removing diacritical marks,
/// punctuation marks and/or numbers, depending on the options given.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="textCase">Casing text using the rules of the invariant culture.</param>
/// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
/// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
/// <param name="keepNumbers">Whether to keep numbers or remove them.</param>
/// <returns>A new pipeline column built from <paramref name="input"/> and the given options.</returns>
public static Scalar<string> NormalizeText(
    this Scalar<string> input,
    CaseNormalizationMode textCase = CaseNormalizationMode.Lower,
    bool keepDiacritics = false,
    bool keepPunctuations = true,
    bool keepNumbers = true)
{
    return new OutPipelineColumn(input, textCase, keepDiacritics, keepPunctuations, keepNumbers);
}
/// <summary>
/// Creates the output column, passing a new <see cref="Reconciler"/> configured with the
/// given options (and <paramref name="input"/> as its dependency) to the base constructor.
/// </summary>
/// <param name="input">Source column; also stored in <c>Input</c>.</param>
/// <param name="textCase">Case normalization mode handed to the reconciler.</param>
/// <param name="keepDiacritics">"Keep diacritics" flag handed to the reconciler.</param>
/// <param name="keepPunctuations">"Keep punctuations" flag handed to the reconciler.</param>
/// <param name="keepNumbers">"Keep numbers" flag handed to the reconciler.</param>
public OutPipelineColumn(
    Scalar<string> input,
    CaseNormalizationMode textCase,
    bool keepDiacritics,
    bool keepPunctuations,
    bool keepNumbers)
    : base(
        new Reconciler(
            textCase: textCase,
            keepDiacritics: keepDiacritics,
            keepPunctuations: keepPunctuations,
            keepNumbers: keepNumbers),
        input)
{
    Input = input;
}