/// <summary>
/// Creates the fallback recognizer that matches the two-letter ISO language
/// of the current culture.
/// </summary>
/// <param name="separateClitics">Passed on to the generic fallback recognizer only;
/// the ja/zh and th/km recognizers are constructed without it.</param>
/// <param name="accessor">Resource data accessor handed to the created recognizer.</param>
/// <returns>A fallback recognizer created with token type Unknown and priority 0.</returns>
private DefaultFallbackRecognizer CreateDefaultFallbackRecognizer(bool separateClitics, Core.Resources.IResourceDataAccessor accessor)
{
    // A fallback recognizer is always lowest prio
    // TODO should this check whether there's already one?
    string language = _Culture.TwoLetterISOLanguageName;

    if (language == "ja" || language == "zh")
    {
        return new DefaultJAZHFallbackRecognizer(TokenType.Unknown, 0, _Culture, accessor);
    }

    // "km" is Khmer, which shares the Thai recognizer
    if (language == "th" || language == "km")
    {
        return new DefaultThaiFallbackRecognizer(TokenType.Unknown, 0, _Culture, accessor);
    }

    return new DefaultFallbackRecognizer(TokenType.Unknown, 0, _Culture, accessor, separateClitics);
}
/// <summary>
/// Builds a regex recognizer for the variables word list stored for the
/// tokenizer's culture.
/// </summary>
/// <param name="accessor">Accessor used to read the Variables resource (read for <c>_Culture</c>).</param>
/// <param name="actualCulture">Culture used to decide whether a match must be followed by a non-word character.</param>
/// <returns>The recognizer, or null when the word list is empty or the resource stream is null.</returns>
private RegexRecognizer CreateVariableRecognizer(Core.Resources.IResourceDataAccessor accessor, System.Globalization.CultureInfo actualCulture)
{
    Wordlist variables = new Wordlist();

    // TODO also create the recognizer if no variables are defined/available?
    using (System.IO.Stream stream = accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.Variables, true))
    {
        if (stream != null)
        {
            variables.Load(stream);
        }
    }

    if (variables.Count == 0)
    {
        return null;
    }

    // TODO set context restrictions of the recognizer
    RegexRecognizer recognizer = new RegexRecognizer(TokenType.Variable, 100, "VAR", "DEFAULT_VAR_REGOCNIZER");

    CharacterSet firstCharacters;
    string pattern = variables.GetRegularExpression(out firstCharacters);
    recognizer.Add(pattern, firstCharacters);

    bool blankSeparatesWords = Core.CultureInfoExtensions.UseBlankAsWordSeparator(actualCulture);
    recognizer.OnlyIfFollowedByNonwordCharacter = blankSeparatesWords;

    return recognizer;
}
/// <summary>
/// Loads a tokenizer setup from the TokenizerSettings resource that the
/// accessor provides for the given culture.
/// </summary>
/// <param name="accessor">Accessor the settings stream is read from.</param>
/// <param name="culture">Culture whose tokenizer settings are requested.</param>
/// <returns>The result of the stream-based <c>Load</c> overload.</returns>
public static TokenizerSetup Load(Core.Resources.IResourceDataAccessor accessor, System.Globalization.CultureInfo culture)
{
    System.IO.Stream settings = accessor.ReadResourceData(
        culture,
        Core.Resources.LanguageResourceType.TokenizerSettings,
        true);

    // The stream is disposed once the setup has been read from it.
    using (settings)
    {
        TokenizerSetup setup = Load(settings);
        return setup;
    }
}
/// <summary>
/// Creates an FST-based number recognizer for the given culture and applies
/// the additional culture-dependent options to it.
/// </summary>
/// <param name="resourceDataAccessor">Accessor passed to the recognizer's constructor.</param>
/// <param name="culture">Culture the recognizer is created for.</param>
/// <param name="priority">Priority assigned to the recognizer.</param>
/// <returns>The configured number recognizer.</returns>
public static Recognizer Create(Core.Resources.IResourceDataAccessor resourceDataAccessor, System.Globalization.CultureInfo culture, int priority)
{
    NumberFSTRecognizer recognizer = new NumberFSTRecognizer(culture, priority, resourceDataAccessor);

    SetAdditionalOptions(recognizer, culture);

    return recognizer;
}
/// <summary>
/// Creates a date/time recognizer from the patterns obtained for the culture
/// and the requested pattern types.
/// </summary>
/// <param name="resourceDataAccessor">Accessor passed to the pattern computer.</param>
/// <param name="culture">Culture whose date/time patterns are used.</param>
/// <param name="types">Pattern types to include.</param>
/// <param name="priority">Priority assigned to the recognizer.</param>
/// <returns>The configured date/time recognizer.</returns>
public static Recognizer Create(Core.Resources.IResourceDataAccessor resourceDataAccessor, System.Globalization.CultureInfo culture, Core.Tokenization.DateTimePatternType types, int priority)
{
    Recognizer recognizer = new DateTimeRecognizer(
        priority,
        DateTimePatternComputer.GetPatterns(culture, resourceDataAccessor, types));

    bool blankSeparatesWords = Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture);
    recognizer.OnlyIfFollowedByNonwordCharacter = blankSeparatesWords;

    return recognizer;
}
/// <summary>
/// Loads the compiled measurement FST from the resources when the accessor
/// reports one as available; otherwise creates the FST from scratch.
/// </summary>
/// <param name="culture">A non-null, non-neutral culture with number format information.</param>
/// <param name="priority">Priority passed to the base recognizer.</param>
/// <param name="accessor">Resource data accessor; when null, a <c>ResourceFileResourceAccessor</c> is used.</param>
/// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
/// <exception cref="ArgumentException">The culture is neutral or has no number format.</exception>
/// <exception cref="Core.LanguagePlatformException">The FST resource is reported as
/// available but the accessor returns no data for it.</exception>
public MeasureFSTRecognizer(System.Globalization.CultureInfo culture, int priority, Core.Resources.IResourceDataAccessor accessor)
    : base(TokenType.Measurement, priority, "Measurement", "MeasureFSTRecognizer")
{
    if (culture == null)
    {
        // Name the offending parameter, as the other constructors in this code base do.
        throw new ArgumentNullException("culture");
    }

    if (culture.IsNeutralCulture)
    {
        throw new ArgumentException("Cannot compute measurement patterns for neutral cultures");
    }

    if (culture.NumberFormat == null)
    {
        throw new ArgumentException("No number format info available for the specified culture");
    }

    if (accessor == null)
    {
        accessor = new ResourceFileResourceAccessor();
    }

    LanguagePlatform.Lingua.FST.FST fst;

    if (accessor.GetResourceStatus(culture, Core.Resources.LanguageResourceType.MeasurementFST, true) != Core.Resources.ResourceStatus.NotAvailable)
    {
        // TODO should _Culture be set to the _actual_ culture of the loaded FST, i.e.
        //  the invariant culture for the generic/canonical one?
        byte[] data = accessor.GetResourceData(culture, Core.Resources.LanguageResourceType.MeasurementFST, true);
        if (data == null)
        {
            // The status check promised a resource, so missing data is an error rather
            // than a reason to silently fall back to computing the FST.
            throw new Core.LanguagePlatformException(Core.ErrorCode.ResourceNotAvailable);
        }

        fst = LanguagePlatform.Lingua.FST.FST.Create(data);
    }
    else
    {
        fst = CreateFST(culture, Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture));
    }

    _FSTRecognizer = new FSTRecognizer(fst, culture);
}
/// <summary>
/// Builds a currency recognizer from the CurrencySymbols word list stored
/// for the given culture.
/// </summary>
/// <param name="accessor">Accessor used to read the currency symbols resource.</param>
/// <param name="actualCulture">Culture the resource is read for and the recognizer is created with.</param>
/// <returns>The recognizer, or null when no currency symbols were loaded.</returns>
private Recognizer CreateCurrencyRecognizer(Core.Resources.IResourceDataAccessor accessor, System.Globalization.CultureInfo actualCulture)
{
    Wordlist currencySymbols = new Wordlist();

    using (System.IO.Stream stream = accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true))
    {
        if (stream != null)
        {
            currencySymbols.Load(stream);
        }
    }

    if (currencySymbols.Count != 0)
    {
        return CurrencyRegexRecognizer.Create(actualCulture, currencySymbols, 100);
    }

    return null;
}
/// <summary>
/// Returns the date/time patterns for a culture: loaded through the accessor
/// when one is supplied and loading yields a result, computed otherwise.
/// </summary>
/// <param name="culture">Culture the patterns are requested for.</param>
/// <param name="accessor">Optional resource data accessor; null skips the load attempt.</param>
/// <param name="types">Pattern types to return.</param>
/// <returns>The loaded patterns, or the computed ones when loading was skipped or returned null.</returns>
public static List<CalendarDateTimePatterns> GetPatterns(System.Globalization.CultureInfo culture, Core.Resources.IResourceDataAccessor accessor, Core.Tokenization.DateTimePatternType types)
{
    DateTimePatternComputer computer = new DateTimePatternComputer(culture, accessor);

    if (accessor != null)
    {
        // TODO support multiple calendars
        List<CalendarDateTimePatterns> loaded = computer.LoadPatterns(types, false);
        if (loaded != null)
        {
            return loaded;
        }
    }

    // TODO support multiple calendars
    return computer.ComputePatterns(types, false);
}
/// <summary>
/// Creates the Japanese fallback recognizer. Clitic separation is passed to
/// the base constructor as false; the default word regex and punctuation
/// character set are initialized here.
/// </summary>
/// <param name="t">Token type passed to the base constructor.</param>
/// <param name="priority">Priority passed to the base constructor.</param>
/// <param name="culture">Culture passed to the base constructor.</param>
/// <param name="dataAccessor">Resource data accessor passed to the base constructor.</param>
public DefaultJapaneseFallbackRecognizer(Core.Tokenization.TokenType t, int priority, System.Globalization.CultureInfo culture, Core.Resources.IResourceDataAccessor dataAccessor)
    : base(t, priority, culture, dataAccessor, false)
{
    // TODO outsource the pattern into resource assembly, or make externally configurable?
    _DefaultWordRegex = new System.Text.RegularExpressions.Regex(
        _DEFAULT_WORD_RX,
        System.Text.RegularExpressions.RegexOptions.ExplicitCapture);

    _IsFallbackRecognizer = true;

    // The parser takes its read position by reference; parsing starts at offset 0.
    int parsePosition = 0;
    _DefaultPunctCharset = CharacterSetParser.Parse(_DEFAULT_PUNC_CS, ref parsePosition);
}
/// <summary>
/// Creates the Thai fallback recognizer (also used for Khmer). Clitic
/// separation is passed to the base constructor as false.
/// </summary>
/// <param name="t">Token type passed to the base constructor.</param>
/// <param name="priority">Priority passed to the base constructor.</param>
/// <param name="culture">Culture passed to the base constructor.</param>
/// <param name="dataAccessor">Resource data accessor passed to the base constructor.</param>
public DefaultThaiFallbackRecognizer(
    Core.Tokenization.TokenType t,
    int priority,
    System.Globalization.CultureInfo culture,
    Core.Resources.IResourceDataAccessor dataAccessor)
    : base(t, priority, culture, dataAccessor, false)
{
    // No "th"-only assertion on the culture: the recognizer is also used for Khmer now.
    // System.Diagnostics.Debug.Assert(culture.TwoLetterISOLanguageName.Equals("th", StringComparison.OrdinalIgnoreCase));
    this._IsFallbackRecognizer = true;
}
/// <summary>
/// Builds a composite resource data accessor from the application's
/// configuration section. When the section is missing or empty, or no accessor
/// ends up in the result, the default accessor is added instead.
/// </summary>
/// <returns>A composite accessor containing at least the default accessor.</returns>
/// <exception cref="Core.LanguagePlatformException">
/// A configured type cannot be resolved, does not implement
/// <c>IResourceDataAccessor</c>, is abstract, or cannot be instantiated.
/// </exception>
public static CompositeResourceDataAccessor Load()
{
    CompositeResourceDataAccessor result = new CompositeResourceDataAccessor(false);

    ResourceDataAccessorConfigurationSection section
        = System.Configuration.ConfigurationManager.GetSection(ConfigurationSectionName) as ResourceDataAccessorConfigurationSection;

    if (section == null || section.ResourceDataAccessors == null || section.ResourceDataAccessors.Count == 0)
    {
        // create default/fallback collection
        result.AddDefaultAccessor();
        return result;
    }

    // Failures propagate unchanged to the caller; the former catch-and-rethrow
    // wrapper around the loop body had no effect and was removed.
    for (int e = 0; e < section.ResourceDataAccessors.Count; ++e)
    {
        ResourceDataAccessorConfigurationElement element = section.ResourceDataAccessors[e];

        System.Type rdaType = Type.GetType(element.Type);
        if (rdaType == null)
        {
            throw new Core.LanguagePlatformException(Core.ErrorCode.ConfigurationCannotResolveType,
                Core.FaultStatus.Fatal, element.Type);
        }

        // The type must list the accessor interface among its implemented interfaces.
        bool implementsAccessor
            = System.Array.IndexOf(rdaType.GetInterfaces(), typeof(Core.Resources.IResourceDataAccessor)) >= 0;
        if (!implementsAccessor)
        {
            throw new Core.LanguagePlatformException(Core.ErrorCode.ConfigurationInvalidType,
                Core.FaultStatus.Fatal, element.Type);
        }

        if (rdaType.IsAbstract)
        {
            throw new Core.LanguagePlatformException(Core.ErrorCode.ConfigurationAbstractType,
                Core.FaultStatus.Fatal, element.Type);
        }

        // A configured parameter is passed as the single constructor argument;
        // without one, the parameterless constructor is requested.
        // We could check the constructors to test whether they match the parameter
        object[] constructorArguments = String.IsNullOrEmpty(element.Parameter)
            ? null
            : new object[] { element.Parameter };

        object instance = rdaType.Assembly.CreateInstance(rdaType.FullName,
            false,
            System.Reflection.BindingFlags.CreateInstance,
            null,
            constructorArguments,
            System.Globalization.CultureInfo.CurrentCulture,
            null);

        // "as" yields null for a null instance as well, so one check covers both
        // "could not instantiate" and "could not cast".
        Core.Resources.IResourceDataAccessor rda = instance as Core.Resources.IResourceDataAccessor;
        if (rda == null)
        {
            throw new Core.LanguagePlatformException(Core.ErrorCode.ConfigurationCannotInstantiateOrCastType,
                Core.FaultStatus.Fatal, element.Type);
        }

        result.Add(rda);
    }

    if (result.Count == 0)
    {
        result.AddDefaultAccessor();
    }

    return result;
}
/// <summary>
/// Stores the culture and the resource data accessor in the instance fields.
/// </summary>
/// <param name="culture">Culture stored in <c>_Culture</c>.</param>
/// <param name="accessor">Resource data accessor stored in <c>_Accessor</c>.</param>
private DateTimePatternComputer(
    System.Globalization.CultureInfo culture,
    Core.Resources.IResourceDataAccessor accessor)
{
    this._Culture = culture;
    this._Accessor = accessor;
}
/// <summary>
/// Creates the fallback recognizer used for Japanese and Chinese. Clitic
/// separation is passed to the base constructor as false.
/// </summary>
/// <param name="t">Token type passed to the base constructor.</param>
/// <param name="priority">Priority passed to the base constructor.</param>
/// <param name="culture">Culture passed to the base constructor.</param>
/// <param name="dataAccessor">Resource data accessor passed to the base constructor.</param>
public DefaultJAZHFallbackRecognizer(
    Core.Tokenization.TokenType t,
    int priority,
    System.Globalization.CultureInfo culture,
    Core.Resources.IResourceDataAccessor dataAccessor)
    : base(t, priority, culture, dataAccessor, false)
{
    this._IsFallbackRecognizer = true;
}
/// <summary>
/// Creates the generic fallback recognizer: stores the culture, loads the
/// language resources when an accessor is supplied, and builds a trie of the
/// culture's leading clitics when there are any.
/// </summary>
/// <param name="t">Token type passed to the base constructor.</param>
/// <param name="priority">Priority passed to the base constructor.</param>
/// <param name="culture">Culture stored in <c>_Culture</c>.</param>
/// <param name="dataAccessor">Optional resource data accessor; null leaves <c>_Resources</c> unset.</param>
/// <param name="separateClitics">NOTE(review): not referenced anywhere in this constructor body — confirm whether it is meant to be stored.</param>
public DefaultFallbackRecognizer(Core.Tokenization.TokenType t, int priority, System.Globalization.CultureInfo culture, Core.Resources.IResourceDataAccessor dataAccessor, bool separateClitics)
    : base(t, priority, null, "DefaultFallbackRecognizer")
{
    // NOTE the token type is ignored in the implementation as the fallback
    // recognizer will deliver multiple token types
    _Culture = culture;

    if (dataAccessor != null)
    {
        _Resources = new Sdl.LanguagePlatform.Lingua.Resources.LanguageResources(_Culture, dataAccessor);
    }

    _IsFallbackRecognizer = true;

    // TODO test performance of building up the trie and matching instead of using
    // StartsWith() on the list of clitics
    HashSet<string> clitics = Core.CultureInfoExtensions.GetLeadingClitics(_Culture);
    if (clitics == null)
    {
        return;
    }

    _LeadingClitics = new Trie<char, int>();

    // Each clitic is stored with a running index, in enumeration order.
    int index = 0;
    foreach (string clitic in clitics)
    {
        _LeadingClitics.Add(clitic.ToCharArray(), index);
        index++;
    }
}
/// <summary>
/// Initialize the tokenizer parameters from the tokenizer setup information.
/// The resource data accessor is handed to the date/time, number, measurement,
/// variable and fallback recognizers that are created here.
/// </summary>
/// <param name="setup">The tokenizer setup to use</param>
/// <param name="accessor">A resource data accessor; when null, a
/// <c>ResourceFileResourceAccessor</c> is used</param>
/// <exception cref="ArgumentNullException"><paramref name="setup"/> is null.</exception>
public TokenizerParameters(TokenizerSetup setup, Core.Resources.IResourceDataAccessor accessor)
{
    if (setup == null)
    {
        throw new ArgumentNullException("setup");
    }

    if (accessor == null)
    {
        accessor = new ResourceFileResourceAccessor();
    }

    _BreakOnWhitespace = setup.BreakOnWhitespace;
    _CreateWhitespaceTokens = setup.CreateWhitespaceTokens;
    _Culture = Core.CultureInfoExtensions.GetCultureInfo(setup.CultureName);

    _Recognizers = new List<Recognizer>();
    _ReclassifyAcronyms = false;

    // we need to determine a region-qualified culture since neutral cultures
    //  don't have date/time/number pattern info
    System.Globalization.CultureInfo actualCulture = _Culture;
    if (_Culture.IsNeutralCulture)
    {
        actualCulture = Core.CultureInfoExtensions.GetRegionQualifiedCulture(_Culture);
    }

    if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeDates) != 0)
    {
        AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
            DateTimePatternType.ShortDate | DateTimePatternType.LongDate, 100));
    }

    if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeTimes) != 0)
    {
        AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
            DateTimePatternType.ShortTime | DateTimePatternType.LongTime, 100));
    }

    if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeNumbers) != 0)
    {
        if (UseRXNumberRecognizer)
        {
            AddRecognizer(NumberRegexRecognizer.Create(actualCulture, 100));
        }
        else
        {
            AddRecognizer(NumberFSTRecognizer.Create(accessor, actualCulture, 100));
        }

        // it does not make sense to recognize ordinal numbers if they don't become placeables and
        //  don't participate in auto-localization. They'd also need to be auto-localized (3. -> 3rd)
        bool createOrdinalNumberRecognizer = false;

        if (createOrdinalNumberRecognizer)
        {
            // add special recognizer for ordinal numbers if ordinal followers are available
            // [0-9]+\. (?=[OrdinalFollowers])
            if (accessor.GetResourceStatus(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true) != Core.Resources.ResourceStatus.NotAvailable)
            {
                Wordlist ordinalFollowers = new Wordlist();
                CharacterSet dummy;

                // Dispose the resource stream after loading and tolerate a null stream,
                // matching how the variable and currency word lists are loaded.
                using (System.IO.Stream data = accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true))
                {
                    if (data != null)
                    {
                        ordinalFollowers.Load(data);
                    }
                }

                if (ordinalFollowers.Count > 0)
                {
                    string ordinalNumbersRX = "[0-9]+\\.(?=[ \u00A0]"
                        + ordinalFollowers.GetRegularExpression(out dummy)
                        + "\\s)";

                    RegexRecognizer ordinalNumbersRecognizer
                        = new RegexRecognizer(TokenType.Word, 100, "ORDINALNUMBER", "Ordinal Number Recognizer");
                    CharacterSet ordinalFirst = new CharacterSet();
                    ordinalFirst.Add('0', '9');
                    ordinalNumbersRecognizer.Add(ordinalNumbersRX, ordinalFirst);
                    AddRecognizer(ordinalNumbersRecognizer);
                }
            }
        }
    }
    else
    {
        // TODO should we still add a rudimentary recognizer for alpha-numerals?
    }

    // TODO other recognizer types (for builtin token classes)

    if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms) != 0)
    {
        RegexRecognizer recog = CreateAcronymRecognizer(actualCulture, 100);
        if (recog != null)
        {
            _ReclassifyAcronyms = true;
            AddRecognizer(recog);
        }

        // this shouldn't be in the "acronym" setting but it's too late for a UI change...
        recog = CreateUriRecognizer(actualCulture, 100);
        AddRecognizer(recog);

        // TODO make IP address recognizer optional?
        AddRecognizer(CreateIPAddressRecognizer(actualCulture, 101));

        // AddRecognizer(CreateHeadingNumberRecognizer(actualCulture, 50));
    }

    if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeVariables) != 0)
    {
        // accessor cannot be null here: it was replaced by a default accessor above.
        try
        {
            RegexRecognizer recog = CreateVariableRecognizer(accessor, actualCulture);
            if (recog != null)
            {
                AddRecognizer(recog);
            }
        }
        catch // (System.Exception e)
        {
            // nop - ignore errors: variable recognition is deliberately best-effort
        }
    }

    if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements) != 0)
    {
        Recognizer recog;
        if (UseRXMeasurementRecognizer)
        {
            recog = MeasureRegexRecognizer.Create(actualCulture, 100);
        }
        else
        {
            recog = MeasureFSTRecognizer.Create(accessor, actualCulture, 100);
        }
        AddRecognizer(recog);

        // disable for the time being due to performance issues
        //if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true) != Core.Resources.ResourceStatus.NotAvailable)
        //{
        //	recog = CreateCurrencyRecognizer(accessor, actualCulture);
        //	AddRecognizer(recog);
        //}
    }

#if false
    // TODO NOTE this slows down the performance too much - need to find a better way
    if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true) != Core.Resources.ResourceStatus.NotAvailable)
    {
        // add an abbreviation recognizer
        Wordlist abbreviations = new Wordlist();
        CharacterSet first;
        abbreviations.Load(accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true));
        string abbreviationsRX = abbreviations.GetRegularExpression(out first) + @"(?=\W)";
        RegexRecognizer abbreviationsRecognizer = new RegexRecognizer(TokenType.Abbreviation, 101, "ABBREVIATION", "Abbreviation Recognizer");
        abbreviationsRecognizer.Add(abbreviationsRX, first);
        AddRecognizer(abbreviationsRecognizer);
    }
#endif

    {
        // The fallback recognizer is always added, regardless of the builtin recognizer flags.
        Recognizer recog;
        bool split = setup.SeparateClitics && Core.CultureInfoExtensions.UsesClitics(_Culture);
        recog = CreateDefaultFallbackRecognizer(split, accessor);
        AddRecognizer(recog);
    }

    SortRecognizers();
}
/// <summary>
/// Creates the Chinese fallback recognizer. Clitic separation is passed to the
/// base constructor as false; the default punctuation character set is
/// initialized here.
/// </summary>
/// <param name="t">Token type passed to the base constructor.</param>
/// <param name="priority">Priority passed to the base constructor.</param>
/// <param name="culture">Culture passed to the base constructor.</param>
/// <param name="dataAccessor">Resource data accessor passed to the base constructor.</param>
public DefaultChineseFallbackRecognizer(
    Core.Tokenization.TokenType t,
    int priority,
    System.Globalization.CultureInfo culture,
    Core.Resources.IResourceDataAccessor dataAccessor)
    : base(t, priority, culture, dataAccessor, false)
{
    _IsFallbackRecognizer = true;

    // The parser takes its read position by reference; parsing starts at offset 0.
    int parsePosition = 0;
    _DefaultPunctCharset = CharacterSetParser.Parse(_DEFAULT_PUNC_CS, ref parsePosition);
}
/// <summary>
/// Creates a tokenizer from a tokenizer setup by first building the
/// <c>TokenizerParameters</c> and then delegating to the constructor that
/// takes them.
/// </summary>
/// <param name="setup">Tokenizer setup passed to <c>TokenizerParameters</c>.</param>
/// <param name="resourceDataAccessor">Resource data accessor passed to <c>TokenizerParameters</c>.</param>
public Tokenizer(
    TokenizerSetup setup,
    Core.Resources.IResourceDataAccessor resourceDataAccessor)
    : this(new TokenizerParameters(setup, resourceDataAccessor))
{
    // All initialization happens in the delegated constructor.
}