/// <summary>
        /// Builds the fallback recognizer matching the language of <c>_Culture</c>.
        /// </summary>
        /// <param name="separateClitics">Forwarded to the generic fallback recognizer; not passed
        /// to the Japanese/Chinese or Thai/Khmer variants.</param>
        /// <param name="accessor">Resource data accessor handed to the created recognizer.</param>
        /// <returns>A fallback recognizer created with <c>TokenType.Unknown</c> and priority 0.</returns>
        private DefaultFallbackRecognizer CreateDefaultFallbackRecognizer(bool separateClitics,
                                                                          Core.Resources.IResourceDataAccessor accessor)
        {
            // A fallback recognizer always gets the lowest priority (0).
            // TODO should this check whether there's already one?

            string language = _Culture.TwoLetterISOLanguageName;

            if (language == "ja" || language == "zh")
            {
                return new DefaultJAZHFallbackRecognizer(TokenType.Unknown, 0, _Culture, accessor);
            }

            // "km" is Khmer, which shares the Thai implementation
            if (language == "th" || language == "km")
            {
                return new DefaultThaiFallbackRecognizer(TokenType.Unknown, 0, _Culture, accessor);
            }

            return new DefaultFallbackRecognizer(TokenType.Unknown, 0, _Culture, accessor, separateClitics);
        }
        /// <summary>
        /// Creates a regex-based recognizer for the variables word list, or returns null
        /// when no variables could be loaded.
        /// </summary>
        /// <param name="accessor">Accessor the variables resource is read from.</param>
        /// <param name="actualCulture">Culture used to decide whether a match must be followed
        /// by a non-word character.</param>
        /// <returns>The configured recognizer, or null if the word list is empty.</returns>
        private RegexRecognizer CreateVariableRecognizer(Core.Resources.IResourceDataAccessor accessor,
                                                         System.Globalization.CultureInfo actualCulture)
        {
            // TODO also create the recognizer if no variables are defined/available?

            Wordlist variables = new Wordlist();

            // Note: the resource lookup uses _Culture, not actualCulture.
            using (System.IO.Stream stream = accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.Variables, true))
            {
                if (stream != null)
                {
                    variables.Load(stream);
                }
            }

            if (variables.Count == 0)
            {
                return null;
            }

            // TODO set context restrictions of the recognizer
            RegexRecognizer result = new RegexRecognizer(TokenType.Variable, 100, "VAR", "DEFAULT_VAR_REGOCNIZER");

            CharacterSet firstChars;
            string pattern = variables.GetRegularExpression(out firstChars);
            result.Add(pattern, firstChars);

            bool blankSeparatesWords = Core.CultureInfoExtensions.UseBlankAsWordSeparator(actualCulture);
            result.OnlyIfFollowedByNonwordCharacter = blankSeparatesWords;

            return result;
        }
Exemple #3
0
 /// <summary>
 /// Loads a tokenizer setup from the tokenizer-settings resource of the given culture.
 /// </summary>
 /// <param name="accessor">Accessor used to open the resource stream.</param>
 /// <param name="culture">Culture whose tokenizer settings are requested.</param>
 /// <returns>The result of the stream-based <c>Load</c> overload.</returns>
 public static TokenizerSetup Load(Core.Resources.IResourceDataAccessor accessor,
                                   System.Globalization.CultureInfo culture)
 {
     TokenizerSetup setup;

     // The stream is disposed as soon as the setup has been read from it.
     using (System.IO.Stream settingsStream =
                accessor.ReadResourceData(culture, Core.Resources.LanguageResourceType.TokenizerSettings, true))
     {
         setup = Load(settingsStream);
     }

     return setup;
 }
        /// <summary>
        /// Creates an FST-based number recognizer and applies the culture-dependent
        /// additional options to it.
        /// </summary>
        /// <param name="resourceDataAccessor">Accessor passed to the recognizer's constructor.</param>
        /// <param name="culture">Culture the recognizer is created for.</param>
        /// <param name="priority">Priority assigned to the recognizer.</param>
        /// <returns>The configured recognizer.</returns>
        public static Recognizer Create(Core.Resources.IResourceDataAccessor resourceDataAccessor,
                                        System.Globalization.CultureInfo culture, int priority)
        {
            NumberFSTRecognizer recognizer = new NumberFSTRecognizer(culture, priority, resourceDataAccessor);
            SetAdditionalOptions(recognizer, culture);
            return recognizer;
        }
Exemple #5
0
        /// <summary>
        /// Creates a date/time recognizer for the requested pattern types.
        /// </summary>
        /// <param name="resourceDataAccessor">Accessor forwarded to the pattern computer.</param>
        /// <param name="culture">Culture the patterns are obtained for.</param>
        /// <param name="types">The date/time pattern types to recognize.</param>
        /// <param name="priority">Priority assigned to the recognizer.</param>
        /// <returns>The configured recognizer.</returns>
        public static Recognizer Create(Core.Resources.IResourceDataAccessor resourceDataAccessor,
                                        System.Globalization.CultureInfo culture,
                                        Core.Tokenization.DateTimePatternType types, int priority)
        {
            var patterns = DateTimePatternComputer.GetPatterns(culture, resourceDataAccessor, types);
            Recognizer recognizer = new DateTimeRecognizer(priority, patterns);

            // A trailing non-word character is only required where UseBlankAsWordSeparator says so.
            bool blankSeparatesWords = Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture);
            recognizer.OnlyIfFollowedByNonwordCharacter = blankSeparatesWords;

            return recognizer;
        }
Exemple #6
0
        /// <summary>
        /// Creates a measurement recognizer backed by an FST. If the accessor reports the
        /// measurement FST resource as available, the compiled FST is loaded from it;
        /// otherwise the FST is created from scratch for the culture.
        /// </summary>
        /// <param name="culture">A non-null, non-neutral culture.</param>
        /// <param name="priority">Priority passed on to the base recognizer.</param>
        /// <param name="accessor">Resource accessor; when null, a new
        /// <c>ResourceFileResourceAccessor</c> is used instead.</param>
        /// <exception cref="ArgumentNullException"><paramref name="culture"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="culture"/> is a neutral culture
        /// or has no number format information.</exception>
        /// <exception cref="Core.LanguagePlatformException">The FST resource is reported as
        /// available but the accessor returned no data for it.</exception>
        public MeasureFSTRecognizer(System.Globalization.CultureInfo culture, int priority,
                                    Core.Resources.IResourceDataAccessor accessor)
            : base(TokenType.Measurement, priority, "Measurement", "MeasureFSTRecognizer")
        {
            if (culture == null)
            {
                // Name the offending parameter so the failure is diagnosable from the message.
                throw new ArgumentNullException("culture");
            }
            if (culture.IsNeutralCulture)
            {
                throw new ArgumentException("Cannot compute measurement patterns for neutral cultures");
            }
            if (culture.NumberFormat == null)
            {
                throw new ArgumentException("No number format info available for the specified culture");
            }
            if (accessor == null)
            {
                accessor = new ResourceFileResourceAccessor();
            }

            LanguagePlatform.Lingua.FST.FST fst;

            if (accessor.GetResourceStatus(culture, Core.Resources.LanguageResourceType.MeasurementFST, true) !=
                Core.Resources.ResourceStatus.NotAvailable)
            {
                // TODO should _Culture be set to the _actual_ culture of the loaded FST, i.e.
                //  the invariant culture for the generic/canonical one?

                byte[] data = accessor.GetResourceData(culture, Core.Resources.LanguageResourceType.MeasurementFST, true);
                if (data == null)
                {
                    // The status check said the resource exists, so missing data is an error,
                    // not a reason to fall back to building the FST.
                    throw new Core.LanguagePlatformException(Core.ErrorCode.ResourceNotAvailable);
                }

                fst = LanguagePlatform.Lingua.FST.FST.Create(data);
            }
            else
            {
                fst = CreateFST(culture, Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture));
            }

            _FSTRecognizer = new FSTRecognizer(fst, culture);
        }
        /// <summary>
        /// Creates a currency recognizer from the currency-symbols word list of the given
        /// culture, or returns null when no symbols could be loaded.
        /// </summary>
        /// <param name="accessor">Accessor the currency symbols resource is read from.</param>
        /// <param name="actualCulture">Culture used for the resource lookup and the recognizer.</param>
        /// <returns>The recognizer (priority 100), or null if the word list is empty.</returns>
        private Recognizer CreateCurrencyRecognizer(Core.Resources.IResourceDataAccessor accessor,
                                                    System.Globalization.CultureInfo actualCulture)
        {
            Wordlist symbols = new Wordlist();

            using (System.IO.Stream stream = accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true))
            {
                if (stream != null)
                {
                    symbols.Load(stream);
                }
            }

            if (symbols.Count != 0)
            {
                return CurrencyRegexRecognizer.Create(actualCulture, symbols, 100);
            }

            return null;
        }
Exemple #8
0
        /// <summary>
        /// Returns the date/time patterns for a culture. When an accessor is supplied the
        /// patterns are loaded first; if nothing was loaded (or no accessor was given)
        /// they are computed instead.
        /// </summary>
        /// <param name="culture">Culture the patterns are requested for.</param>
        /// <param name="accessor">Optional resource accessor; may be null.</param>
        /// <param name="types">The pattern types to return.</param>
        /// <returns>The loaded patterns if available, otherwise the computed ones.</returns>
        public static List<CalendarDateTimePatterns> GetPatterns(System.Globalization.CultureInfo culture,
                                                                 Core.Resources.IResourceDataAccessor accessor,
                                                                 Core.Tokenization.DateTimePatternType types)
        {
            DateTimePatternComputer computer = new DateTimePatternComputer(culture, accessor);

            if (accessor != null)
            {
                // TODO support multiple calendars
                List<CalendarDateTimePatterns> loaded = computer.LoadPatterns(types, false);
                if (loaded != null)
                {
                    return loaded;
                }
            }

            // TODO support multiple calendars
            return computer.ComputePatterns(types, false);
        }
        /// <summary>
        /// Creates the Japanese fallback recognizer. The base recognizer is constructed with
        /// its last argument set to false; the default word regex and the default punctuation
        /// character set are then initialized from the class' pattern constants.
        /// </summary>
        public DefaultJapaneseFallbackRecognizer(Core.Tokenization.TokenType t, int priority, System.Globalization.CultureInfo culture, Core.Resources.IResourceDataAccessor dataAccessor)
            : base(t, priority, culture, dataAccessor, false)
        {
            // TODO outsource the pattern into resource assembly, or make externally configurable?
            System.Text.RegularExpressions.RegexOptions wordOptions =
                System.Text.RegularExpressions.RegexOptions.ExplicitCapture;
            this._DefaultWordRegex = new System.Text.RegularExpressions.Regex(_DEFAULT_WORD_RX, wordOptions);

            this._IsFallbackRecognizer = true;

            // Parsing starts at the beginning of the character-set definition.
            int parsePosition = 0;
            this._DefaultPunctCharset = CharacterSetParser.Parse(_DEFAULT_PUNC_CS, ref parsePosition);
        }
Exemple #10
0
 /// <summary>
 /// Creates the Thai fallback recognizer. The base recognizer is constructed with its
 /// last argument set to false, and the instance is flagged as a fallback recognizer.
 /// </summary>
 public DefaultThaiFallbackRecognizer(Core.Tokenization.TokenType t, int priority, System.Globalization.CultureInfo culture, Core.Resources.IResourceDataAccessor dataAccessor)
     : base(t, priority, culture, dataAccessor, false)
 {
     // This recognizer is also used for Khmer now, so the culture is deliberately not
     // asserted to be "th" any more.
     this._IsFallbackRecognizer = true;
 }
Exemple #11
0
        /// <summary>
        /// Builds a composite accessor from the configuration section named by
        /// <c>ConfigurationSectionName</c>. Each configured element is resolved to a type,
        /// validated, instantiated and added to the composite. When the section is missing
        /// or empty, or no accessor ended up in the composite, the default accessor is added.
        /// </summary>
        /// <returns>The populated composite accessor.</returns>
        /// <exception cref="Core.LanguagePlatformException">A configured type cannot be
        /// resolved, does not implement <c>IResourceDataAccessor</c>, is abstract, or cannot
        /// be instantiated.</exception>
        public static CompositeResourceDataAccessor Load()
        {
            CompositeResourceDataAccessor result
                = new CompositeResourceDataAccessor(false);

            ResourceDataAccessorConfigurationSection section = System.Configuration.ConfigurationManager.GetSection(ConfigurationSectionName)
                                                               as ResourceDataAccessorConfigurationSection;

            if (section == null ||
                section.ResourceDataAccessors == null ||
                section.ResourceDataAccessors.Count == 0)
            {
                // create default/fallback collection
                result.AddDefaultAccessor();
                return(result);
            }

            // Any failure below propagates to the caller unchanged; the former
            // catch-and-rethrow wrapper added nothing and has been removed.
            for (int e = 0; e < section.ResourceDataAccessors.Count; ++e)
            {
                ResourceDataAccessorConfigurationElement element
                    = section.ResourceDataAccessors[e];

                string typeName = element.Type;

                System.Type rdaType = Type.GetType(typeName);
                if (rdaType == null)
                {
                    throw new Core.LanguagePlatformException(Core.ErrorCode.ConfigurationCannotResolveType, Core.FaultStatus.Fatal, typeName);
                }

                // The type must list IResourceDataAccessor among its implemented interfaces.
                bool implementsAccessor =
                    System.Array.IndexOf(rdaType.GetInterfaces(), typeof(Core.Resources.IResourceDataAccessor)) >= 0;
                if (!implementsAccessor)
                {
                    throw new Core.LanguagePlatformException(Core.ErrorCode.ConfigurationInvalidType, Core.FaultStatus.Fatal, typeName);
                }

                if (rdaType.IsAbstract)
                {
                    throw new Core.LanguagePlatformException(Core.ErrorCode.ConfigurationAbstractType, Core.FaultStatus.Fatal, typeName);
                }

                // The optional configuration parameter, when present, is passed as the single
                // constructor argument; otherwise the parameterless constructor is used.
                object[] constructorArgs = String.IsNullOrEmpty(element.Parameter)
                                               ? null
                                               : new object[] { element.Parameter };

                object instance = rdaType.Assembly.CreateInstance(rdaType.FullName, false, System.Reflection.BindingFlags.CreateInstance,
                                                                  null, constructorArgs,
                                                                  System.Globalization.CultureInfo.CurrentCulture, null);

                // We could check the constructors to test whether they match the parameter

                // A null instance and a failed cast are reported with the same error code.
                Core.Resources.IResourceDataAccessor rda = instance as Core.Resources.IResourceDataAccessor;
                if (rda == null)
                {
                    throw new Core.LanguagePlatformException(Core.ErrorCode.ConfigurationCannotInstantiateOrCastType, Core.FaultStatus.Fatal, typeName);
                }

                result.Add(rda);
            }

            if (result.Count == 0)
            {
                result.AddDefaultAccessor();
            }

            return(result);
        }
Exemple #12
0
 /// <summary>
 /// Stores the culture and the resource accessor for later use by this computer.
 /// </summary>
 /// <param name="culture">Culture kept in <c>_Culture</c>.</param>
 /// <param name="accessor">Accessor kept in <c>_Accessor</c>; callers in this file may pass null.</param>
 private DateTimePatternComputer(System.Globalization.CultureInfo culture,
                                 Core.Resources.IResourceDataAccessor accessor)
 {
     this._Culture = culture;
     this._Accessor = accessor;
 }
 /// <summary>
 /// Creates the shared Japanese/Chinese fallback recognizer. The base recognizer is
 /// constructed with its last argument set to false, and the instance is flagged as a
 /// fallback recognizer.
 /// </summary>
 public DefaultJAZHFallbackRecognizer(Core.Tokenization.TokenType t, int priority, System.Globalization.CultureInfo culture, Core.Resources.IResourceDataAccessor dataAccessor)
     : base(t, priority, culture, dataAccessor, false)
 {
     this._IsFallbackRecognizer = true;
 }
        /// <summary>
        /// Creates the generic fallback recognizer: stores the culture, creates the language
        /// resources when an accessor is supplied, and builds a trie of the culture's leading
        /// clitics if any are defined.
        /// </summary>
        /// <param name="t">Token type passed to the base class.</param>
        /// <param name="priority">Priority passed to the base class.</param>
        /// <param name="culture">Culture the recognizer works for.</param>
        /// <param name="dataAccessor">Optional accessor for the language resources; may be null.</param>
        /// <param name="separateClitics">NOTE(review): not read anywhere in this constructor body;
        /// confirm whether it is meant to be stored.</param>
        public DefaultFallbackRecognizer(Core.Tokenization.TokenType t, int priority, System.Globalization.CultureInfo culture, Core.Resources.IResourceDataAccessor dataAccessor, bool separateClitics)
            : base(t, priority, null, "DefaultFallbackRecognizer")
        {
            // NOTE the token type is ignored in the implementation as the fallback recognizer will deliver multiple token types
            _Culture = culture;
            if (dataAccessor != null)
            {
                _Resources = new Sdl.LanguagePlatform.Lingua.Resources.LanguageResources(_Culture, dataAccessor);
            }

            _IsFallbackRecognizer = true;

            // TODO test performance of building up the trie and matching instead of using StartsWith() on
            //  the list of clitics
            HashSet<string> clitics = Core.CultureInfoExtensions.GetLeadingClitics(_Culture);
            if (clitics == null)
            {
                return;
            }

            // Each clitic is stored with a running index as its value.
            _LeadingClitics = new Trie<char, int>();
            int index = 0;
            foreach (string clitic in clitics)
            {
                _LeadingClitics.Add(clitic.ToCharArray(), index);
                ++index;
            }
        }
        /// <summary>
        /// Initialize the tokenizer parameters from the tokenizer setup information.
        /// Recognizers are added according to the flags in <c>setup.BuiltinRecognizers</c>,
        /// a fallback recognizer is always added last, and the recognizer list is then sorted.
        /// The resource data accessor is passed to the date/time, FST-based number and
        /// measurement recognizers, the variable recognizer and the fallback recognizer.
        /// </summary>
        /// <param name="setup">The tokenizer setup to use</param>
        /// <param name="accessor">A resource data accessor; when null, a new
        /// <c>ResourceFileResourceAccessor</c> is used instead</param>
        /// <exception cref="ArgumentNullException"><paramref name="setup"/> is null.</exception>
        public TokenizerParameters(TokenizerSetup setup, Core.Resources.IResourceDataAccessor accessor)
        {
            if (setup == null)
            {
                throw new ArgumentNullException("setup");
            }

            // From here on the accessor is never null.
            if (accessor == null)
            {
                accessor = new ResourceFileResourceAccessor();
            }

            _BreakOnWhitespace      = setup.BreakOnWhitespace;
            _CreateWhitespaceTokens = setup.CreateWhitespaceTokens;
            _Culture = Core.CultureInfoExtensions.GetCultureInfo(setup.CultureName);

            _Recognizers        = new List <Recognizer>();
            _ReclassifyAcronyms = false;

            // we need to determine a region-qualified culture since neutral cultures
            //  don't have date/time/number pattern info
            System.Globalization.CultureInfo actualCulture = _Culture;
            if (_Culture.IsNeutralCulture)
            {
                actualCulture = Core.CultureInfoExtensions.GetRegionQualifiedCulture(_Culture);
            }

            // Dates: short and long date patterns, priority 100
            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeDates) != 0)
            {
                AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
                                                        DateTimePatternType.ShortDate | DateTimePatternType.LongDate, 100));
            }

            // Times: short and long time patterns, priority 100
            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeTimes) != 0)
            {
                AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
                                                        DateTimePatternType.ShortTime | DateTimePatternType.LongTime, 100));
            }

            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeNumbers) != 0)
            {
                // UseRXNumberRecognizer selects the regex-based recognizer over the FST-based one
                if (UseRXNumberRecognizer)
                {
                    AddRecognizer(NumberRegexRecognizer.Create(actualCulture, 100));
                }
                else
                {
                    AddRecognizer(NumberFSTRecognizer.Create(accessor, actualCulture, 100));
                }

                // it does not make sense to recognize ordinal numbers if they don't become placeables and
                //  don't participate in auto-localization. They'd also need to be auto-localized (3. -> 3rd)
                // The flag is hard-coded to false, so the block below never executes.
                bool createOrdinalNumberRecognizer = false;
                if (createOrdinalNumberRecognizer)
                {
                    // add special recognizer for ordinal numbers if ordinal followers are available
                    // [0-9]+\. (?=[OrdinalFollowers])
                    if (accessor.GetResourceStatus(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true) != Core.Resources.ResourceStatus.NotAvailable)
                    {
                        Wordlist     ordinalFollowers = new Wordlist();
                        CharacterSet dummy;
                        // NOTE(review): the stream returned by ReadResourceData is not disposed here
                        ordinalFollowers.Load(accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true));
                        if (ordinalFollowers.Count > 0)
                        {
                            string          ordinalNumbersRX         = "[0-9]+\\.(?=[ \u00A0]" + ordinalFollowers.GetRegularExpression(out dummy) + "\\s)";
                            RegexRecognizer ordinalNumbersRecognizer = new RegexRecognizer(TokenType.Word, 100, "ORDINALNUMBER", "Ordinal Number Recognizer");
                            CharacterSet    ordinalFirst             = new CharacterSet();
                            ordinalFirst.Add('0', '9');
                            ordinalNumbersRecognizer.Add(ordinalNumbersRX, ordinalFirst);
                            AddRecognizer(ordinalNumbersRecognizer);
                        }
                    }
                }
            }
            else
            {
                // TODO should we still add a rudimentary recognizer for alpha-numerals?
            }

            // TODO other recognizer types (for builtin token classes)
            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms) != 0)
            {
                RegexRecognizer recog = CreateAcronymRecognizer(actualCulture, 100);
                if (recog != null)
                {
                    _ReclassifyAcronyms = true;
                    AddRecognizer(recog);
                }

                // this shouldn't be in the "acronym" setting but it's too late for a UI change...
                // NOTE(review): unlike the acronym recognizer, the URI recognizer is added without a null check
                recog = CreateUriRecognizer(actualCulture, 100);
                AddRecognizer(recog);

                // TODO make IP address recognizer optional?
                // The IP address recognizer gets priority 101, above the 100 used for the others.
                AddRecognizer(CreateIPAddressRecognizer(actualCulture, 101));
                // AddRecognizer(CreateHeadingNumberRecognizer(actualCulture, 50));
            }

            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeVariables) != 0)
            {
                // accessor was defaulted above, so this check is always true at this point
                if (accessor != null)
                {
                    try
                    {
                        RegexRecognizer recog = CreateVariableRecognizer(accessor, actualCulture);
                        if (recog != null)
                        {
                            AddRecognizer(recog);
                        }
                    }
                    catch                     // (System.Exception e)
                    {
                        // nop - ignore errors
                        // (best effort: a failure here only means no variable recognizer is added)
                    }
                }
            }

            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements) != 0)
            {
                Recognizer recog;

                // UseRXMeasurementRecognizer selects the regex-based recognizer over the FST-based one
                if (UseRXMeasurementRecognizer)
                {
                    recog = MeasureRegexRecognizer.Create(actualCulture, 100);
                }
                else
                {
                    recog = MeasureFSTRecognizer.Create(accessor, actualCulture, 100);
                }

                AddRecognizer(recog);

                // disable for the time being due to performance issues
                //if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true) != Core.Resources.ResourceStatus.NotAvailable)
                //{
                //    recog = CreateCurrencyRecognizer(accessor, actualCulture);
                //    AddRecognizer(recog);
                //}
            }

#if false
            // TODO NOTE this slows down the performance too much - need to find a better way

            if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true) != Core.Resources.ResourceStatus.NotAvailable)
            {
                // add an abbreviation recognizer
                Wordlist     abbreviations = new Wordlist();
                CharacterSet first;
                abbreviations.Load(accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true));
                string          abbreviationsRX         = abbreviations.GetRegularExpression(out first) + @"(?=\W)";
                RegexRecognizer abbreviationsRecognizer = new RegexRecognizer(TokenType.Abbreviation, 101, "ABBREVIATION", "Abbreviation Recognizer");
                abbreviationsRecognizer.Add(abbreviationsRX, first);
                AddRecognizer(abbreviationsRecognizer);
            }
#endif

            // The fallback recognizer is added unconditionally, independent of the builtin flags.
            {
                Recognizer recog;

                // Clitics are only separated when the setup asks for it and the culture uses them.
                bool split = setup.SeparateClitics && Core.CultureInfoExtensions.UsesClitics(_Culture);
                recog = CreateDefaultFallbackRecognizer(split, accessor);
                AddRecognizer(recog);
            }

            SortRecognizers();
        }
Exemple #16
0
        /// <summary>
        /// Creates the Chinese fallback recognizer. The base recognizer is constructed with
        /// its last argument set to false; the default punctuation character set is parsed
        /// from the class' character-set constant.
        /// </summary>
        public DefaultChineseFallbackRecognizer(Core.Tokenization.TokenType t, int priority, System.Globalization.CultureInfo culture, Core.Resources.IResourceDataAccessor dataAccessor)
            : base(t, priority, culture, dataAccessor, false)
        {
            this._IsFallbackRecognizer = true;

            // Parsing starts at the beginning of the character-set definition.
            int parsePosition = 0;
            this._DefaultPunctCharset = CharacterSetParser.Parse(_DEFAULT_PUNC_CS, ref parsePosition);
        }
Exemple #17
0
 /// <summary>
 /// Creates a tokenizer from a tokenizer setup by first building the
 /// <c>TokenizerParameters</c> for it and delegating to the parameter-based constructor.
 /// </summary>
 /// <param name="setup">The tokenizer setup forwarded to <c>TokenizerParameters</c>.</param>
 /// <param name="resourceDataAccessor">The resource data accessor forwarded to <c>TokenizerParameters</c>.</param>
 public Tokenizer(TokenizerSetup setup, Core.Resources.IResourceDataAccessor resourceDataAccessor)
     : this(new TokenizerParameters(setup, resourceDataAccessor))
 {
 }