/// <summary>
/// Creates a number recognizer for the given culture and applies the
/// additional default options to it.
/// </summary>
/// <param name="culture">The culture handed to the recognizer and used to configure its options.</param>
/// <param name="priority">The priority handed to the recognizer's constructor.</param>
/// <returns>The configured recognizer.</returns>
public static Recognizer Create(System.Globalization.CultureInfo culture, int priority)
{
    NumberFSTRecognizer recognizer = new NumberFSTRecognizer(culture, priority);
    SetAdditionalOptions(recognizer, culture);
    return recognizer;
}
/// <summary>
/// Creates a number recognizer for the given culture, passing the supplied
/// resource data accessor to the recognizer's constructor, and applies the
/// additional default options to it.
/// </summary>
/// <param name="resourceDataAccessor">The resource data accessor handed to the recognizer's constructor.</param>
/// <param name="culture">The culture handed to the recognizer and used to configure its options.</param>
/// <param name="priority">The priority handed to the recognizer's constructor.</param>
/// <returns>The configured recognizer.</returns>
public static Recognizer Create(Core.Resources.IResourceDataAccessor resourceDataAccessor, System.Globalization.CultureInfo culture, int priority)
{
    NumberFSTRecognizer recognizer = new NumberFSTRecognizer(culture, priority, resourceDataAccessor);
    SetAdditionalOptions(recognizer, culture);
    return recognizer;
}
/// <summary>
/// Applies the options shared by all <c>Create</c> overloads to a freshly
/// constructed recognizer.
/// </summary>
/// <param name="result">The recognizer to configure; modified in place.</param>
/// <param name="culture">The culture queried for its word separation behaviour.</param>
private static void SetAdditionalOptions(NumberFSTRecognizer result, System.Globalization.CultureInfo culture)
{
    // The "followed by non-word character" requirement mirrors whether the
    // culture uses blanks as word separators.
    bool blankSeparatesWords = Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture);
    result.OnlyIfFollowedByNonwordCharacter = blankSeparatesWords;

    // Make sure there is a terminator set before registering the hyphen.
    if (result.AdditionalTerminators == null)
    {
        result.AdditionalTerminators = new Sdl.LanguagePlatform.Core.CharacterSet();
    }

    result.AdditionalTerminators.Add('-');
    // TODO other math symbols?

    result.OverrideFallbackRecognizer = true;
}
// NOTE parsing does not yet support canonical number FSTs since the culture
// is not inspected for primary/secondary separators. The only source for
// determination whether primary or alternate separators are used is the FST
// output. This, however, will always be the alternate separator indicator for
// canonical recognizers.

/// <summary>
/// Splits a recognized measurement into its numeric part and its unit part
/// and builds a measure token from them.
/// </summary>
/// <param name="surface">The matched surface text.</param>
/// <param name="output">
/// The FST output for <paramref name="surface"/>; asserted (debug builds only)
/// to have the same length. The first <c>'U'</c> in it marks where the numeric
/// part ends.
/// </param>
/// <returns>A measure token covering the whole <paramref name="surface"/>.</returns>
/// <exception cref="FormatException">
/// <paramref name="output"/> contains no <c>'U'</c>, or starts with one (which
/// would leave the numeric part empty).
/// </exception>
private Core.Tokenization.MeasureToken Parse(string surface, string output)
{
    System.Diagnostics.Debug.Assert(surface != null && output != null && surface.Length == output.Length);

    int unitStart = output.IndexOf('U');
    if (unitStart <= 0)
    {
        // A specific exception type instead of the bare System.Exception;
        // FormatException still derives from Exception, so existing
        // catch (Exception) handlers keep working.
        throw new FormatException("Invalid measurement format");
    }

    string numericSurface = surface.Substring(0, unitStart);
    string numericOutput = output.Substring(0, unitStart);

    // Skip any whitespace between number and unit, remembering the first
    // whitespace character as the unit separator ('\0' means "none").
    char unitSeparator = '\0';
    for (; unitStart < surface.Length && Char.IsWhiteSpace(surface[unitStart]); ++unitStart)
    {
        if (unitSeparator == '\0')
        {
            unitSeparator = surface[unitStart];
        }
    }

    string unitPart = surface.Substring(unitStart);

    NumberToken number = NumberFSTRecognizer.ParseNumber(numericSurface, numericOutput);
    Core.Tokenization.Unit unit = Core.Tokenization.PhysicalUnit.Find(unitPart, _FSTRecognizer.Culture);

    return new Core.Tokenization.MeasureToken(surface, number, unit, unitPart, unitSeparator);
}
/// <summary>
/// Builds the tokenizer parameters (culture, whitespace handling and the
/// sorted list of recognizers) from the given tokenizer setup.
/// </summary>
/// <param name="setup">The tokenizer setup to read the settings from. Must not be null.</param>
/// <param name="accessor">
/// The resource data accessor handed to the date/time, number, measurement,
/// variable and fallback recognizer factories. When null, a
/// <c>ResourceFileResourceAccessor</c> is created and used instead.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="setup"/> is null.</exception>
public TokenizerParameters(TokenizerSetup setup, Core.Resources.IResourceDataAccessor accessor)
{
    if (setup == null)
    {
        throw new ArgumentNullException("setup");
    }

    // Fall back to the file-based accessor when the caller did not supply one.
    accessor = accessor ?? new ResourceFileResourceAccessor();

    _BreakOnWhitespace = setup.BreakOnWhitespace;
    _CreateWhitespaceTokens = setup.CreateWhitespaceTokens;
    _Culture = Core.CultureInfoExtensions.GetCultureInfo(setup.CultureName);
    _Recognizers = new List<Recognizer>();
    _ReclassifyAcronyms = false;

    // we need to determine a region-qualified culture since neutral cultures
    // don't have date/time/number pattern info
    System.Globalization.CultureInfo qualifiedCulture = _Culture.IsNeutralCulture
        ? Core.CultureInfoExtensions.GetRegionQualifiedCulture(_Culture)
        : _Culture;

    // Decode the requested builtin recognizers once.
    Core.Tokenization.BuiltinRecognizers enabled = setup.BuiltinRecognizers;
    bool wantDates = (enabled & Core.Tokenization.BuiltinRecognizers.RecognizeDates) != 0;
    bool wantTimes = (enabled & Core.Tokenization.BuiltinRecognizers.RecognizeTimes) != 0;
    bool wantNumbers = (enabled & Core.Tokenization.BuiltinRecognizers.RecognizeNumbers) != 0;
    bool wantAcronyms = (enabled & Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms) != 0;
    bool wantVariables = (enabled & Core.Tokenization.BuiltinRecognizers.RecognizeVariables) != 0;
    bool wantMeasurements = (enabled & Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements) != 0;

    if (wantDates)
    {
        AddRecognizer(DateTimeRecognizer.Create(accessor, qualifiedCulture,
            DateTimePatternType.ShortDate | DateTimePatternType.LongDate, 100));
    }

    if (wantTimes)
    {
        AddRecognizer(DateTimeRecognizer.Create(accessor, qualifiedCulture,
            DateTimePatternType.ShortTime | DateTimePatternType.LongTime, 100));
    }

    if (wantNumbers)
    {
        if (UseRXNumberRecognizer)
        {
            AddRecognizer(NumberRegexRecognizer.Create(qualifiedCulture, 100));
        }
        else
        {
            AddRecognizer(NumberFSTRecognizer.Create(accessor, qualifiedCulture, 100));
        }

        // it does not make sense to recognize ordinal numbers if they don't become placeables and
        // don't participate in auto-localization. They'd also need to be auto-localized (3. -> 3rd)
        bool createOrdinalNumberRecognizer = false;

        // add special recognizer for ordinal numbers if ordinal followers are available
        // [0-9]+\. (?=[OrdinalFollowers])
        if (createOrdinalNumberRecognizer
            && accessor.GetResourceStatus(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true)
                != Core.Resources.ResourceStatus.NotAvailable)
        {
            Wordlist ordinalFollowers = new Wordlist();
            CharacterSet unusedFirstChars;
            ordinalFollowers.Load(accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true));

            if (ordinalFollowers.Count > 0)
            {
                string ordinalPattern = "[0-9]+\\.(?=[ \u00A0]"
                    + ordinalFollowers.GetRegularExpression(out unusedFirstChars)
                    + "\\s)";

                RegexRecognizer ordinalRecognizer
                    = new RegexRecognizer(TokenType.Word, 100, "ORDINALNUMBER", "Ordinal Number Recognizer");

                CharacterSet digits = new CharacterSet();
                digits.Add('0', '9');

                ordinalRecognizer.Add(ordinalPattern, digits);
                AddRecognizer(ordinalRecognizer);
            }
        }
    }
    else
    {
        // TODO should we still add a rudimentary recognizer for alpha-numerals?
    }

    // TODO other recognizer types (for builtin token classes)

    if (wantAcronyms)
    {
        RegexRecognizer acronymRecognizer = CreateAcronymRecognizer(qualifiedCulture, 100);
        if (acronymRecognizer != null)
        {
            _ReclassifyAcronyms = true;
            AddRecognizer(acronymRecognizer);
        }

        // this shouldn't be in the "acronym" setting but it's too late for a UI change...
        RegexRecognizer uriRecognizer = CreateUriRecognizer(qualifiedCulture, 100);
        AddRecognizer(uriRecognizer);

        // TODO make IP address recognizer optional?
        AddRecognizer(CreateIPAddressRecognizer(qualifiedCulture, 101));

        // AddRecognizer(CreateHeadingNumberRecognizer(qualifiedCulture, 50));
    }

    if (wantVariables && accessor != null)
    {
        try
        {
            RegexRecognizer variableRecognizer = CreateVariableRecognizer(accessor, qualifiedCulture);
            if (variableRecognizer != null)
            {
                AddRecognizer(variableRecognizer);
            }
        }
        catch // (System.Exception e)
        {
            // nop - ignore errors
        }
    }

    if (wantMeasurements)
    {
        Recognizer measureRecognizer;
        if (UseRXMeasurementRecognizer)
        {
            measureRecognizer = MeasureRegexRecognizer.Create(qualifiedCulture, 100);
        }
        else
        {
            measureRecognizer = MeasureFSTRecognizer.Create(accessor, qualifiedCulture, 100);
        }

        AddRecognizer(measureRecognizer);

        // disable for the time being due to performance issues
        //if (accessor.GetResourceStatus(qualifiedCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true) != Core.Resources.ResourceStatus.NotAvailable)
        //{
        //    measureRecognizer = CreateCurrencyRecognizer(accessor, qualifiedCulture);
        //    AddRecognizer(measureRecognizer);
        //}
    }

#if false
    // TODO NOTE this slows down the performance too much - need to find a better way
    if (accessor.GetResourceStatus(qualifiedCulture, Core.Resources.LanguageResourceType.Abbreviations, true) != Core.Resources.ResourceStatus.NotAvailable)
    {
        // add an abbreviation recognizer
        Wordlist abbreviations = new Wordlist();
        CharacterSet first;
        abbreviations.Load(accessor.ReadResourceData(qualifiedCulture, Core.Resources.LanguageResourceType.Abbreviations, true));
        string abbreviationsRX = abbreviations.GetRegularExpression(out first) + @"(?=\W)";
        RegexRecognizer abbreviationsRecognizer = new RegexRecognizer(TokenType.Abbreviation, 101, "ABBREVIATION", "Abbreviation Recognizer");
        abbreviationsRecognizer.Add(abbreviationsRX, first);
        AddRecognizer(abbreviationsRecognizer);
    }
#endif

    // The fallback recognizer is always added, regardless of the enabled flags.
    bool splitClitics = setup.SeparateClitics && Core.CultureInfoExtensions.UsesClitics(_Culture);
    Recognizer fallbackRecognizer = CreateDefaultFallbackRecognizer(splitClitics, accessor);
    AddRecognizer(fallbackRecognizer);

    SortRecognizers();
}
/// <summary>
/// Generates the number FST for the given culture by delegating to
/// <c>NumberFSTRecognizer.CreateFST</c>.
/// </summary>
/// <param name="ci">The culture to generate the FST for.</param>
/// <returns>The FST returned by <c>NumberFSTRecognizer.CreateFST</c>.</returns>
public static FST.FST GenerateNumberFST(System.Globalization.CultureInfo ci)
{
    // The second argument reflects whether the culture separates words with blanks.
    bool blankIsWordSeparator = Core.CultureInfoExtensions.UseBlankAsWordSeparator(ci);
    return NumberFSTRecognizer.CreateFST(ci, blankIsWordSeparator);
}