/// <summary>
/// Creates a measurement recognizer for the given culture. The compiled FST is fetched
/// through the resource accessor when the accessor reports it as available; otherwise
/// the FST is built from scratch via <c>CreateFST</c>.
/// </summary>
/// <param name="culture">The culture to recognize measurements for. Must be non-null,
/// must not be a neutral culture, and must provide number format information.</param>
/// <param name="priority">The recognizer priority, passed on to the base constructor.</param>
/// <param name="accessor">The resource data accessor used to look up the compiled FST.
/// If null, a new <c>ResourceFileResourceAccessor</c> is used instead.</param>
/// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="culture"/> is a neutral culture
/// or has no number format information.</exception>
/// <exception cref="Core.LanguagePlatformException">The accessor reported the FST resource
/// as available but returned no data for it.</exception>
public MeasureFSTRecognizer(System.Globalization.CultureInfo culture, int priority, Core.Resources.IResourceDataAccessor accessor)
    : base(TokenType.Measurement, priority, "Measurement", "MeasureFSTRecognizer")
{
    if (culture == null)
    {
        // name the offending parameter so the failure is diagnosable from the message alone
        throw new ArgumentNullException("culture");
    }

    if (culture.IsNeutralCulture)
    {
        throw new ArgumentException("Cannot compute measurement patterns for neutral cultures");
    }

    if (culture.NumberFormat == null)
    {
        throw new ArgumentException("No number format info available for the specified culture");
    }

    if (accessor == null)
    {
        accessor = new ResourceFileResourceAccessor();
    }

    LanguagePlatform.Lingua.FST.FST fst;

    bool compiledFstAvailable =
        accessor.GetResourceStatus(culture, Core.Resources.LanguageResourceType.MeasurementFST, true)
            != Core.Resources.ResourceStatus.NotAvailable;

    if (compiledFstAvailable)
    {
        // TODO should _Culture be set to the _actual_ culture of the loaded FST, i.e.
        //  the invariant culture for the generic/canonical one?
        byte[] data = accessor.GetResourceData(culture, Core.Resources.LanguageResourceType.MeasurementFST, true);
        if (data == null)
        {
            // the status check promised a resource, but none was delivered
            throw new Core.LanguagePlatformException(Core.ErrorCode.ResourceNotAvailable);
        }

        fst = LanguagePlatform.Lingua.FST.FST.Create(data);
    }
    else
    {
        // no precompiled FST: compute it for this culture
        fst = CreateFST(culture, Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture));
    }

    _FSTRecognizer = new FSTRecognizer(fst, culture);
}
/// <summary>
/// Builds the tokenizer parameters from a tokenizer setup: copies the whitespace
/// settings, resolves the culture, and registers one recognizer per enabled builtin
/// recognizer flag, followed by the default fallback recognizer. The recognizer list
/// is sorted once all recognizers have been added.
/// The resource data accessor is only used for retrieving variable values.
/// </summary>
/// <param name="setup">The tokenizer setup to use</param>
/// <param name="accessor">A resource data accessor, to initialize the variables list.
/// If null, a new <c>ResourceFileResourceAccessor</c> is used.</param>
/// <exception cref="ArgumentNullException"><paramref name="setup"/> is null.</exception>
public TokenizerParameters(TokenizerSetup setup, Core.Resources.IResourceDataAccessor accessor)
{
    if (setup == null)
    {
        throw new ArgumentNullException("setup");
    }

    if (accessor == null)
    {
        accessor = new ResourceFileResourceAccessor();
    }

    // priority shared by most of the builtin recognizers registered below
    const int standardPriority = 100;

    _BreakOnWhitespace = setup.BreakOnWhitespace;
    _CreateWhitespaceTokens = setup.CreateWhitespaceTokens;
    _Culture = Core.CultureInfoExtensions.GetCultureInfo(setup.CultureName);
    _Recognizers = new List<Recognizer>();
    _ReclassifyAcronyms = false;

    // neutral cultures don't carry date/time/number pattern info, so the
    // recognizers are created for a region-qualified culture instead
    System.Globalization.CultureInfo actualCulture = _Culture.IsNeutralCulture
        ? Core.CultureInfoExtensions.GetRegionQualifiedCulture(_Culture)
        : _Culture;

    Core.Tokenization.BuiltinRecognizers enabled = setup.BuiltinRecognizers;

    // --- dates ---
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeDates) != 0)
    {
        AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
            DateTimePatternType.ShortDate | DateTimePatternType.LongDate, standardPriority));
    }

    // --- times ---
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeTimes) != 0)
    {
        AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
            DateTimePatternType.ShortTime | DateTimePatternType.LongTime, standardPriority));
    }

    // --- numbers ---
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeNumbers) != 0)
    {
        if (!UseRXNumberRecognizer)
        {
            AddRecognizer(NumberFSTRecognizer.Create(accessor, actualCulture, standardPriority));
        }
        else
        {
            AddRecognizer(NumberRegexRecognizer.Create(actualCulture, standardPriority));
        }

        // Ordinal numbers are currently switched off: recognizing them is pointless
        // unless they become placeables and take part in auto-localization
        // (which would also require localizing them, e.g. 3. -> 3rd).
        bool createOrdinalNumberRecognizer = false;

        // special recognizer for ordinal numbers, only if ordinal followers are available:
        // [0-9]+\. (?=[OrdinalFollowers])
        if (createOrdinalNumberRecognizer
            && accessor.GetResourceStatus(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true)
                != Core.Resources.ResourceStatus.NotAvailable)
        {
            Wordlist followers = new Wordlist();
            CharacterSet ignoredFirstChars;

            followers.Load(accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true));

            if (followers.Count > 0)
            {
                string ordinalPattern = "[0-9]+\\.(?=[ \u00A0]"
                    + followers.GetRegularExpression(out ignoredFirstChars)
                    + "\\s)";

                CharacterSet leadingDigits = new CharacterSet();
                leadingDigits.Add('0', '9');

                RegexRecognizer ordinalRecognizer = new RegexRecognizer(TokenType.Word, standardPriority,
                    "ORDINALNUMBER", "Ordinal Number Recognizer");
                ordinalRecognizer.Add(ordinalPattern, leadingDigits);

                AddRecognizer(ordinalRecognizer);
            }
        }
    }
    else
    {
        // TODO should we still add a rudimentary recognizer for alpha-numerals?
    }

    // TODO other recognizer types (for builtin token classes)

    // --- acronyms (plus URIs and IP addresses) ---
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms) != 0)
    {
        RegexRecognizer acronymRecognizer = CreateAcronymRecognizer(actualCulture, standardPriority);
        if (acronymRecognizer != null)
        {
            _ReclassifyAcronyms = true;
            AddRecognizer(acronymRecognizer);
        }

        // URIs don't really belong under the "acronym" setting, but it's too late
        // for a UI change...
        RegexRecognizer uriRecognizer = CreateUriRecognizer(actualCulture, standardPriority);
        AddRecognizer(uriRecognizer);

        // TODO make IP address recognizer optional?
        AddRecognizer(CreateIPAddressRecognizer(actualCulture, 101));

        // heading number recognizer is currently not registered:
        // AddRecognizer(CreateHeadingNumberRecognizer(actualCulture, 50));
    }

    // --- variables ---
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeVariables) != 0 && accessor != null)
    {
        try
        {
            RegexRecognizer variableRecognizer = CreateVariableRecognizer(accessor, actualCulture);
            if (variableRecognizer != null)
            {
                AddRecognizer(variableRecognizer);
            }
        }
        catch // (System.Exception e)
        {
            // nop - errors while setting up the variable recognizer are ignored
        }
    }

    // --- measurements ---
    if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements) != 0)
    {
        Recognizer measurementRecognizer = UseRXMeasurementRecognizer
            ? (Recognizer)MeasureRegexRecognizer.Create(actualCulture, standardPriority)
            : MeasureFSTRecognizer.Create(accessor, actualCulture, standardPriority);

        AddRecognizer(measurementRecognizer);

        // currency recognizer is disabled for the time being due to performance issues
        //if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true) != Core.Resources.ResourceStatus.NotAvailable)
        //{
        //    recog = CreateCurrencyRecognizer(accessor, actualCulture);
        //    AddRecognizer(recog);
        //}
    }

#if false
    // TODO NOTE this slows down the performance too much - need to find a better way
    if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true) != Core.Resources.ResourceStatus.NotAvailable)
    {
        // add an abbreviation recognizer
        Wordlist abbreviations = new Wordlist();
        CharacterSet first;
        abbreviations.Load(accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true));
        string abbreviationsRX = abbreviations.GetRegularExpression(out first) + @"(?=\W)";
        RegexRecognizer abbreviationsRecognizer = new RegexRecognizer(TokenType.Abbreviation, 101, "ABBREVIATION", "Abbreviation Recognizer");
        abbreviationsRecognizer.Add(abbreviationsRX, first);
        AddRecognizer(abbreviationsRecognizer);
    }
#endif

    // --- default fallback recognizer (always registered) ---
    bool separateClitics = setup.SeparateClitics && Core.CultureInfoExtensions.UsesClitics(_Culture);
    Recognizer fallbackRecognizer = CreateDefaultFallbackRecognizer(separateClitics, accessor);
    AddRecognizer(fallbackRecognizer);

    SortRecognizers();
}