コード例 #1
0
        /// <summary>
        /// Builds the regex recognizer for variable tokens from the variables word list
        /// exposed by the resource accessor.
        /// </summary>
        /// <param name="accessor">The resource data accessor the variables word list is read from</param>
        /// <param name="actualCulture">The culture that decides whether a match must be
        /// followed by a non-word character</param>
        /// <returns>The configured recognizer, or <c>null</c> if the word list is empty
        /// (which includes the case where no resource data is available)</returns>
        private RegexRecognizer CreateVariableRecognizer(Core.Resources.IResourceDataAccessor accessor,
                                                         System.Globalization.CultureInfo actualCulture)
        {
            // TODO also create the recognizer if no variables are defined/available?

            // Note: the word list is looked up with the _Culture field, not with actualCulture.
            Wordlist variables = new Wordlist();
            using (System.IO.Stream stream = accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.Variables, true))
            {
                if (stream != null)
                {
                    variables.Load(stream);
                }
            }

            // Nothing to recognize - signal that to the caller with a null result.
            if (variables.Count == 0)
            {
                return null;
            }

            // TODO set context restrictions of the recognizer
            RegexRecognizer variableRecognizer = new RegexRecognizer(TokenType.Variable, 100, "VAR", "DEFAULT_VAR_REGOCNIZER");

            CharacterSet firstCharacters;
            string pattern = variables.GetRegularExpression(out firstCharacters);
            variableRecognizer.Add(pattern, firstCharacters);

            bool blankSeparatesWords = Core.CultureInfoExtensions.UseBlankAsWordSeparator(actualCulture);
            variableRecognizer.OnlyIfFollowedByNonwordCharacter = blankSeparatesWords;

            return variableRecognizer;
        }
コード例 #2
0
        /// <summary>
        /// Builds the currency recognizer from the currency symbols word list exposed by
        /// the resource accessor for the given culture.
        /// </summary>
        /// <param name="accessor">The resource data accessor the currency symbols are read from</param>
        /// <param name="actualCulture">The culture used both for the resource lookup and
        /// for creating the recognizer</param>
        /// <returns>The recognizer, or <c>null</c> if the currency symbol list is empty
        /// (which includes the case where no resource data is available)</returns>
        private Recognizer CreateCurrencyRecognizer(Core.Resources.IResourceDataAccessor accessor,
                                                    System.Globalization.CultureInfo actualCulture)
        {
            Wordlist currencySymbols = new Wordlist();

            using (System.IO.Stream stream = accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true))
            {
                if (stream != null)
                {
                    currencySymbols.Load(stream);
                }
            }

            // Without any symbols there is nothing to recognize.
            if (currencySymbols.Count == 0)
            {
                return null;
            }

            Recognizer currencyRecognizer = CurrencyRegexRecognizer.Create(actualCulture, currencySymbols, 100);
            return currencyRecognizer;
        }
コード例 #3
0
        /// <summary>
        /// Initialize the tokenizer parameters from the tokenizer setup information.
        /// The resource data accessor is handed to the date/time, number (FST), measurement (FST)
        /// and fallback recognizer factories, and is used to read the variables word list.
        /// </summary>
        /// <param name="setup">The tokenizer setup to use</param>
        /// <param name="accessor">A resource data accessor. If <c>null</c>, a new
        /// <c>ResourceFileResourceAccessor</c> is used instead.</param>
        /// <exception cref="ArgumentNullException"><paramref name="setup"/> is <c>null</c>.</exception>
        public TokenizerParameters(TokenizerSetup setup, Core.Resources.IResourceDataAccessor accessor)
        {
            if (setup == null)
            {
                throw new ArgumentNullException("setup");
            }

            // From here on the accessor is guaranteed to be non-null.
            if (accessor == null)
            {
                accessor = new ResourceFileResourceAccessor();
            }

            _BreakOnWhitespace      = setup.BreakOnWhitespace;
            _CreateWhitespaceTokens = setup.CreateWhitespaceTokens;
            _Culture = Core.CultureInfoExtensions.GetCultureInfo(setup.CultureName);

            _Recognizers        = new List <Recognizer>();
            _ReclassifyAcronyms = false;

            // we need to determine a region-qualified culture since neutral cultures
            //  don't have date/time/number pattern info
            System.Globalization.CultureInfo actualCulture = _Culture;
            if (_Culture.IsNeutralCulture)
            {
                actualCulture = Core.CultureInfoExtensions.GetRegionQualifiedCulture(_Culture);
            }

            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeDates) != 0)
            {
                AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
                                                        DateTimePatternType.ShortDate | DateTimePatternType.LongDate, 100));
            }

            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeTimes) != 0)
            {
                AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
                                                        DateTimePatternType.ShortTime | DateTimePatternType.LongTime, 100));
            }

            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeNumbers) != 0)
            {
                if (UseRXNumberRecognizer)
                {
                    AddRecognizer(NumberRegexRecognizer.Create(actualCulture, 100));
                }
                else
                {
                    AddRecognizer(NumberFSTRecognizer.Create(accessor, actualCulture, 100));
                }

                // it does not make sense to recognize ordinal numbers if they don't become placeables and
                //  don't participate in auto-localization. They'd also need to be auto-localized (3. -> 3rd)
                // The flag is hard-wired to false, so the block below is currently inactive.
                bool createOrdinalNumberRecognizer = false;
                if (createOrdinalNumberRecognizer)
                {
                    // add special recognizer for ordinal numbers if ordinal followers are available
                    // [0-9]+\. (?=[OrdinalFollowers])
                    if (accessor.GetResourceStatus(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true) != Core.Resources.ResourceStatus.NotAvailable)
                    {
                        Wordlist     ordinalFollowers = new Wordlist();
                        CharacterSet dummy;

                        // Dispose the resource stream once the word list has been loaded.
                        using (System.IO.Stream ordinalData = accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true))
                        {
                            ordinalFollowers.Load(ordinalData);
                        }

                        if (ordinalFollowers.Count > 0)
                        {
                            string          ordinalNumbersRX         = "[0-9]+\\.(?=[ \u00A0]" + ordinalFollowers.GetRegularExpression(out dummy) + "\\s)";
                            RegexRecognizer ordinalNumbersRecognizer = new RegexRecognizer(TokenType.Word, 100, "ORDINALNUMBER", "Ordinal Number Recognizer");
                            CharacterSet    ordinalFirst             = new CharacterSet();
                            ordinalFirst.Add('0', '9');
                            ordinalNumbersRecognizer.Add(ordinalNumbersRX, ordinalFirst);
                            AddRecognizer(ordinalNumbersRecognizer);
                        }
                    }
                }
            }
            else
            {
                // TODO should we still add a rudimentary recognizer for alpha-numerals?
            }

            // TODO other recognizer types (for builtin token classes)
            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms) != 0)
            {
                RegexRecognizer recog = CreateAcronymRecognizer(actualCulture, 100);
                if (recog != null)
                {
                    _ReclassifyAcronyms = true;
                    AddRecognizer(recog);
                }

                // this shouldn't be in the "acronym" setting but it's too late for a UI change...
                recog = CreateUriRecognizer(actualCulture, 100);
                AddRecognizer(recog);

                // TODO make IP address recognizer optional?
                AddRecognizer(CreateIPAddressRecognizer(actualCulture, 101));
                // AddRecognizer(CreateHeadingNumberRecognizer(actualCulture, 50));
            }

            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeVariables) != 0)
            {
                // No null check on the accessor is needed here: it was replaced by a
                // default instance above if the caller passed null.
                try
                {
                    RegexRecognizer recog = CreateVariableRecognizer(accessor, actualCulture);
                    if (recog != null)
                    {
                        AddRecognizer(recog);
                    }
                }
                catch                     // (System.Exception e)
                {
                    // Deliberately best-effort: a failure to build the variable recognizer
                    // is ignored and the tokenizer is set up without it.
                }
            }

            if ((setup.BuiltinRecognizers & Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements) != 0)
            {
                Recognizer recog;

                if (UseRXMeasurementRecognizer)
                {
                    recog = MeasureRegexRecognizer.Create(actualCulture, 100);
                }
                else
                {
                    recog = MeasureFSTRecognizer.Create(accessor, actualCulture, 100);
                }

                AddRecognizer(recog);

                // disable for the time being due to performance issues
                //if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true) != Core.Resources.ResourceStatus.NotAvailable)
                //{
                //    recog = CreateCurrencyRecognizer(accessor, actualCulture);
                //    AddRecognizer(recog);
                //}
            }

#if false
            // TODO NOTE this slows down the performance too much - need to find a better way

            if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true) != Core.Resources.ResourceStatus.NotAvailable)
            {
                // add an abbreviation recognizer
                Wordlist     abbreviations = new Wordlist();
                CharacterSet first;

                // Dispose the resource stream once the word list has been loaded.
                using (System.IO.Stream abbreviationData = accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true))
                {
                    abbreviations.Load(abbreviationData);
                }

                string          abbreviationsRX         = abbreviations.GetRegularExpression(out first) + @"(?=\W)";
                RegexRecognizer abbreviationsRecognizer = new RegexRecognizer(TokenType.Abbreviation, 101, "ABBREVIATION", "Abbreviation Recognizer");
                abbreviationsRecognizer.Add(abbreviationsRX, first);
                AddRecognizer(abbreviationsRecognizer);
            }
#endif

            // The fallback recognizer is always added, independent of the builtin recognizer flags.
            {
                Recognizer recog;

                bool split = setup.SeparateClitics && Core.CultureInfoExtensions.UsesClitics(_Culture);
                recog = CreateDefaultFallbackRecognizer(split, accessor);
                AddRecognizer(recog);
            }

            SortRecognizers();
        }