/// <summary>
/// Builds the regex-based recognizer for variable tokens from the variables
/// word list obtained through the resource accessor.
/// </summary>
/// <param name="accessor">The resource data accessor the variables list is read from</param>
/// <param name="actualCulture">The culture used to decide whether a match must be followed by a non-word character</param>
/// <returns>The variable recognizer, or null if the variables list contains no entries</returns>
private RegexRecognizer CreateVariableRecognizer(Core.Resources.IResourceDataAccessor accessor, System.Globalization.CultureInfo actualCulture)
{
	Wordlist variables = new Wordlist();

	// TODO also create the recognizer if no variables are defined/available?
	// NOTE: the list is looked up with _Culture (the configured culture), not actualCulture
	using (System.IO.Stream stream = accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.Variables, true))
	{
		if (stream != null)
		{
			variables.Load(stream);
		}
	}

	// nothing to recognize - signal this to the caller with null
	if (variables.Count == 0)
	{
		return null;
	}

	// TODO set context restrictions of the recognizer
	RegexRecognizer recognizer = new RegexRecognizer(TokenType.Variable, 100, "VAR", "DEFAULT_VAR_REGOCNIZER");

	CharacterSet firstChars;
	string pattern = variables.GetRegularExpression(out firstChars);
	recognizer.Add(pattern, firstChars);

	bool blankSeparatesWords = Core.CultureInfoExtensions.UseBlankAsWordSeparator(actualCulture);
	recognizer.OnlyIfFollowedByNonwordCharacter = blankSeparatesWords;

	return recognizer;
}
/// <summary>
/// Builds the currency recognizer from the currency symbols word list
/// obtained through the resource accessor.
/// </summary>
/// <param name="accessor">The resource data accessor the currency symbols are read from</param>
/// <param name="actualCulture">The culture used both for the resource lookup and for creating the recognizer</param>
/// <returns>The currency recognizer, or null if the currency symbols list contains no entries</returns>
private Recognizer CreateCurrencyRecognizer(Core.Resources.IResourceDataAccessor accessor, System.Globalization.CultureInfo actualCulture)
{
	Wordlist currencySymbols = new Wordlist();

	using (System.IO.Stream stream = accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true))
	{
		if (stream != null)
		{
			currencySymbols.Load(stream);
		}
	}

	// without any symbols there is nothing to recognize
	if (currencySymbols.Count == 0)
	{
		return null;
	}

	Recognizer recognizer = CurrencyRegexRecognizer.Create(actualCulture, currencySymbols, 100);
	return recognizer;
}
/// <summary>
/// Initialize the tokenizer parameters from the tokenizer setup information.
/// The resource data accessor is handed to the recognizer factories that load
/// language resources (date/time, number and measurement recognizers, the
/// variables list and the fallback recognizer).
/// </summary>
/// <param name="setup">The tokenizer setup to use</param>
/// <param name="accessor">A resource data accessor; if null, a new
/// <see cref="ResourceFileResourceAccessor"/> is used instead</param>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="setup"/> is null</exception>
public TokenizerParameters(TokenizerSetup setup, Core.Resources.IResourceDataAccessor accessor)
{
	if (setup == null)
	{
		throw new ArgumentNullException("setup");
	}

	// from here on the accessor is guaranteed to be non-null
	if (accessor == null)
	{
		accessor = new ResourceFileResourceAccessor();
	}

	_BreakOnWhitespace = setup.BreakOnWhitespace;
	_CreateWhitespaceTokens = setup.CreateWhitespaceTokens;
	_Culture = Core.CultureInfoExtensions.GetCultureInfo(setup.CultureName);
	_Recognizers = new List<Recognizer>();
	_ReclassifyAcronyms = false;

	// we need to determine a region-qualified culture since neutral cultures
	// don't have date/time/number pattern info
	System.Globalization.CultureInfo actualCulture = _Culture;
	if (_Culture.IsNeutralCulture)
	{
		actualCulture = Core.CultureInfoExtensions.GetRegionQualifiedCulture(_Culture);
	}

	if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeDates) != 0)
	{
		AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
			DateTimePatternType.ShortDate | DateTimePatternType.LongDate, 100));
	}

	if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeTimes) != 0)
	{
		AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
			DateTimePatternType.ShortTime | DateTimePatternType.LongTime, 100));
	}

	if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeNumbers) != 0)
	{
		if (UseRXNumberRecognizer)
		{
			AddRecognizer(NumberRegexRecognizer.Create(actualCulture, 100));
		}
		else
		{
			AddRecognizer(NumberFSTRecognizer.Create(accessor, actualCulture, 100));
		}

		// it does not make sense to recognize ordinal numbers if they don't become placeables and
		// don't participate in auto-localization. They'd also need to be auto-localized (3. -> 3rd)
		bool createOrdinalNumberRecognizer = false;

		if (createOrdinalNumberRecognizer)
		{
			// add special recognizer for ordinal numbers if ordinal followers are available
			// [0-9]+\. (?=[OrdinalFollowers])
			if (accessor.GetResourceStatus(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true) != Core.Resources.ResourceStatus.NotAvailable)
			{
				Wordlist ordinalFollowers = new Wordlist();
				CharacterSet dummy;

				// dispose the resource stream once the list is loaded and tolerate a
				// missing stream, the same way the variable/currency lists are loaded
				using (System.IO.Stream data = accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true))
				{
					if (data != null)
					{
						ordinalFollowers.Load(data);
					}
				}

				if (ordinalFollowers.Count > 0)
				{
					string ordinalNumbersRX = "[0-9]+\\.(?=[ \u00A0]"
						+ ordinalFollowers.GetRegularExpression(out dummy)
						+ "\\s)";
					RegexRecognizer ordinalNumbersRecognizer
						= new RegexRecognizer(TokenType.Word, 100, "ORDINALNUMBER", "Ordinal Number Recognizer");

					// an ordinal number always starts with a digit
					CharacterSet ordinalFirst = new CharacterSet();
					ordinalFirst.Add('0', '9');
					ordinalNumbersRecognizer.Add(ordinalNumbersRX, ordinalFirst);
					AddRecognizer(ordinalNumbersRecognizer);
				}
			}
		}
	}
	else
	{
		// TODO should we still add a rudimentary recognizer for alpha-numerals?
	}

	// TODO other recognizer types (for builtin token classes)

	if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms) != 0)
	{
		RegexRecognizer recog = CreateAcronymRecognizer(actualCulture, 100);
		if (recog != null)
		{
			_ReclassifyAcronyms = true;
			AddRecognizer(recog);
		}

		// this shouldn't be in the "acronym" setting but it's too late for a UI change...
		recog = CreateUriRecognizer(actualCulture, 100);
		AddRecognizer(recog);

		// TODO make IP address recognizer optional?
		AddRecognizer(CreateIPAddressRecognizer(actualCulture, 101));

		// AddRecognizer(CreateHeadingNumberRecognizer(actualCulture, 50));
	}

	if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeVariables) != 0)
	{
		// best effort: a failure to load the variables list must not prevent
		// the tokenizer parameters from being constructed
		try
		{
			RegexRecognizer recog = CreateVariableRecognizer(accessor, actualCulture);
			if (recog != null)
			{
				AddRecognizer(recog);
			}
		}
		catch // (System.Exception e)
		{
			// nop - ignore errors
		}
	}

	if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements) != 0)
	{
		Recognizer recog;
		if (UseRXMeasurementRecognizer)
		{
			recog = MeasureRegexRecognizer.Create(actualCulture, 100);
		}
		else
		{
			recog = MeasureFSTRecognizer.Create(accessor, actualCulture, 100);
		}
		AddRecognizer(recog);

		// disable for the time being due to performance issues
		//if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true) != Core.Resources.ResourceStatus.NotAvailable)
		//{
		//	recog = CreateCurrencyRecognizer(accessor, actualCulture);
		//	AddRecognizer(recog);
		//}
	}

#if false
	// TODO NOTE this slows down the performance too much - need to find a better way
	if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true) != Core.Resources.ResourceStatus.NotAvailable)
	{
		// add an abbreviation recognizer
		Wordlist abbreviations = new Wordlist();
		CharacterSet first;
		abbreviations.Load(accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true));
		string abbreviationsRX = abbreviations.GetRegularExpression(out first) + @"(?=\W)";
		RegexRecognizer abbreviationsRecognizer = new RegexRecognizer(TokenType.Abbreviation, 101, "ABBREVIATION", "Abbreviation Recognizer");
		abbreviationsRecognizer.Add(abbreviationsRX, first);
		AddRecognizer(abbreviationsRecognizer);
	}
#endif

	{
		// the fallback recognizer is always added, regardless of the builtin recognizer flags
		Recognizer recog;
		bool split = setup.SeparateClitics && Core.CultureInfoExtensions.UsesClitics(_Culture);
		recog = CreateDefaultFallbackRecognizer(split, accessor);
		AddRecognizer(recog);
	}

	SortRecognizers();
}