/// <summary>
/// Builds the default measurement recognizer for <paramref name="culture"/>.
/// Number patterns are obtained from <c>NumberRegexRecognizer.ComputeRXPatterns</c>, passed through
/// <c>AugmentPatterns</c>, and registered in up to three groups: the culture's default separators,
/// swapped separators (only if the culture's separator combination is swappable), and en-US
/// separators (only if <c>NumberPatternComputer.DoAddENUSSeparators</c> returns true).
/// </summary>
/// <param name="culture">Culture passed to the recognizer and to the pattern computation.</param>
/// <param name="priority">Not read by this method; the recognizer is constructed with the literal 100.</param>
/// <returns>The configured recognizer, or <c>null</c> if any step throws.</returns>
public static Recognizer Create(System.Globalization.CultureInfo culture, int priority)
        {
            try
            {
                // TODO support non-blank languages for unit separation
                MeasureRegexRecognizer recognizer = new MeasureRegexRecognizer(100, "DEFAULT_MEASURE_RECOGNIZER", culture);

                // Culture-default separators. Augmentation doesn't change FIRST(), so the set returned
                // by the pattern computer is reused for every augmented pattern (the number regex
                // pattern computer returns just one pattern anyway).
                // NOTE(review): the third argument to Add (2 / 1 / 0) looks like a per-pattern priority - confirm.
                Core.CharacterSet defaultFirst;
                List <string> defaultPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.CultureDefault, out defaultFirst);
                AugmentPatterns(defaultPatterns, culture);
                foreach (string pattern in defaultPatterns)
                {
                    recognizer.Add(pattern, defaultFirst, 2);
                }

                // Swapped separators, only when the culture's separator combination allows swapping.
                if (new SeparatorCombination(culture, false).IsSwappable())
                {
                    Core.CharacterSet swappedFirst;
                    List <string> swappedPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.Swapped, out swappedFirst);
                    AugmentPatterns(swappedPatterns, culture);
                    foreach (string pattern in swappedPatterns)
                    {
                        recognizer.Add(pattern, swappedFirst, 1);
                    }
                }

                // en-US separators, only when the pattern computer asks for them.
                if (NumberPatternComputer.DoAddENUSSeparators(culture))
                {
                    Core.CharacterSet enUsFirst;
                    List <string> enUsPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.EnUS, out enUsFirst);
                    AugmentPatterns(enUsPatterns, culture);
                    foreach (string pattern in enUsPatterns)
                    {
                        recognizer.Add(pattern, enUsFirst, 0);
                    }
                }

                // otherwise "123 ABC" will be recognized as "123 A" "BC" in Japanese
                recognizer.OnlyIfFollowedByNonwordCharacter = true;

                return recognizer;
            }
            catch
            {
                // Any failure is reported to the caller as "no recognizer".
                return null;
            }
        }
Example #2
0
        /// <summary>
        /// Builds the default number recognizer for <paramref name="culture"/>.
        /// Patterns from <c>ComputeRXPatterns</c> are registered for the culture's default separators,
        /// then for swapped separators (only if the separator combination is swappable), then for
        /// en-US separators (only if <c>NumberPatternComputer.DoAddENUSSeparators</c> returns true).
        /// </summary>
        /// <param name="culture">Culture whose <c>NumberFormat</c> is passed to the recognizer and which drives the pattern computation.</param>
        /// <param name="priority">Not read by this method; the recognizer is constructed with the literal 100.</param>
        /// <returns>The configured recognizer, or <c>null</c> if any step throws.</returns>
        public static Recognizer Create(System.Globalization.CultureInfo culture, int priority)
        {
            try
            {
                NumberRegexRecognizer recognizer = new NumberRegexRecognizer(100, "DEFAULT_NUMBER_RECOGNIZER", culture.NumberFormat);

                // Receives the FIRST() set of each pattern group via the out parameter.
                CharacterSet firstChars;

                foreach (string pattern in ComputeRXPatterns(culture, NumberSeparatorMode.CultureDefault, out firstChars))
                {
                    recognizer.Add(pattern, firstChars, 2);
                }

                if (new SeparatorCombination(culture, false).IsSwappable())
                {
                    foreach (string pattern in ComputeRXPatterns(culture, NumberSeparatorMode.Swapped, out firstChars))
                    {
                        recognizer.Add(pattern, firstChars, 1);
                    }
                }

                if (NumberPatternComputer.DoAddENUSSeparators(culture))
                {
                    // NOTE(review): en-US patterns are added with 1 here, the same value as the swapped
                    // patterns - confirm this is intended.
                    foreach (string pattern in ComputeRXPatterns(culture, NumberSeparatorMode.EnUS, out firstChars))
                    {
                        recognizer.Add(pattern, firstChars, 1);
                    }
                }

                recognizer.OnlyIfFollowedByNonwordCharacter = CultureInfoExtensions.UseBlankAsWordSeparator(culture);

                // '-' is registered as an additional terminator.
                recognizer.AdditionalTerminators = new CharacterSet();
                recognizer.AdditionalTerminators.Add('-');                 // TODO other math symbols?

                recognizer.OverrideFallbackRecognizer = true;

                return recognizer;
            }
            catch
            {
                // Any failure is reported to the caller as "no recognizer".
                return null;
            }
        }
Example #3
0
        /// <summary>
        /// Builds the default currency recognizer for <paramref name="culture"/>.
        /// The culture-default number patterns are combined with the regular expression generated from
        /// <paramref name="currencySymbols"/> via <c>AugmentPatterns</c>. Only the culture's default
        /// separators are registered.
        /// </summary>
        /// <param name="culture">Culture supplying the currency pattern and driving the number pattern computation.</param>
        /// <param name="currencySymbols">Word list from which the currency-symbol expression and its FIRST() set are generated.</param>
        /// <param name="priority">Not read by this method; the recognizer is constructed with the literal 100.</param>
        /// <returns>The configured recognizer, or <c>null</c> if any step throws.</returns>
        public static Recognizer Create(System.Globalization.CultureInfo culture,
                                        Core.Wordlist currencySymbols,
                                        int priority)
        {
            try
            {
                // TODO support non-blank languages for unit separation

                Core.CharacterSet symbolFirst;
                string symbolsRx = currencySymbols.GetRegularExpression(out symbolFirst);

                // .NET CurrencyPositivePattern: 0 = "$n", 1 = "n$", 2 = "$ n", 3 = "n $".
                // Even values therefore put the symbol in front of the number.
                int positivePattern = culture.NumberFormat.CurrencyPositivePattern;
                bool currencyPrecedesNumber = positivePattern % 2 == 0;
                // NOTE(review): computed but not read anywhere in this method.
                bool currencyIsSeparated = positivePattern >= 2;

                CurrencyRegexRecognizer recognizer = new CurrencyRegexRecognizer(100, "DEFAULT_CURRENCY_RECOGNIZER", culture);

                // augmentation doesn't change FIRST()
                // TODO use currency pattern instead of number pattern?
                Core.CharacterSet numberFirst;
                List <string> numberPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.CultureDefault, out numberFirst);

                if (currencyPrecedesNumber)
                {
                    // The symbol's FIRST() set is merged into the number's set when the symbol leads.
                    numberFirst.Add(symbolFirst);
                }

                AugmentPatterns(numberPatterns, symbolsRx, culture);

                // use the same first for all patterns (the number regex pattern computer returns just one pattern anyway)
                foreach (string pattern in numberPatterns)
                {
                    recognizer.Add(pattern, numberFirst, 2);
                }

                // Be strict for currencies (only flexible for measurements and numbers):
                // no patterns are registered here for NumberSeparatorMode.Swapped or
                // NumberSeparatorMode.EnUS.

                recognizer.OnlyIfFollowedByNonwordCharacter = Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture);

                return recognizer;
            }
            catch
            {
                // Any failure is reported to the caller as "no recognizer".
                return null;
            }
        }
        /// <summary>
        /// Initialize the tokenizer parameters from the tokenizer setup information:
        /// copies the whitespace settings, resolves the culture, registers the built-in
        /// recognizers selected by <c>setup.BuiltinRecognizers</c>, always registers the default
        /// fallback recognizer, and finally sorts the recognizer list.
        /// The resource data accessor is passed to the date/time, FST number and measurement,
        /// variable and fallback recognizer factories.
        /// </summary>
        /// <param name="setup">The tokenizer setup to use</param>
        /// <param name="accessor">A resource data accessor; if <c>null</c>, a new <c>ResourceFileResourceAccessor</c> is used</param>
        /// <exception cref="ArgumentNullException"><paramref name="setup"/> is <c>null</c>.</exception>
        public TokenizerParameters(TokenizerSetup setup, Core.Resources.IResourceDataAccessor accessor)
        {
            if (setup == null)
            {
                throw new ArgumentNullException("setup");
            }

            // Fall back to the file-based accessor when the caller supplies none.
            accessor = accessor ?? new ResourceFileResourceAccessor();

            _BreakOnWhitespace      = setup.BreakOnWhitespace;
            _CreateWhitespaceTokens = setup.CreateWhitespaceTokens;
            _Culture = Core.CultureInfoExtensions.GetCultureInfo(setup.CultureName);

            _Recognizers        = new List <Recognizer>();
            _ReclassifyAcronyms = false;

            // we need to determine a region-qualified culture since neutral cultures
            //  don't have date/time/number pattern info
            System.Globalization.CultureInfo actualCulture = _Culture.IsNeutralCulture
                ? Core.CultureInfoExtensions.GetRegionQualifiedCulture(_Culture)
                : _Culture;

            Core.Tokenization.BuiltinRecognizers enabled = setup.BuiltinRecognizers;

            // --- dates ---
            if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeDates) != 0)
            {
                AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
                                                        DateTimePatternType.ShortDate | DateTimePatternType.LongDate, 100));
            }

            // --- times ---
            if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeTimes) != 0)
            {
                AddRecognizer(DateTimeRecognizer.Create(accessor, actualCulture,
                                                        DateTimePatternType.ShortTime | DateTimePatternType.LongTime, 100));
            }

            // --- numbers ---
            if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeNumbers) != 0)
            {
                if (UseRXNumberRecognizer)
                {
                    AddRecognizer(NumberRegexRecognizer.Create(actualCulture, 100));
                }
                else
                {
                    AddRecognizer(NumberFSTRecognizer.Create(accessor, actualCulture, 100));
                }

                // it does not make sense to recognize ordinal numbers if they don't become placeables and
                //  don't participate in auto-localization. They'd also need to be auto-localized (3. -> 3rd)
                // The switch below is hard-wired to false, so the ordinal branch never runs.
                bool createOrdinalNumberRecognizer = false;

                // add special recognizer for ordinal numbers if ordinal followers are available
                // [0-9]+\. (?=[OrdinalFollowers])
                if (createOrdinalNumberRecognizer
                    && accessor.GetResourceStatus(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true) != Core.Resources.ResourceStatus.NotAvailable)
                {
                    Wordlist     followers = new Wordlist();
                    CharacterSet unusedFirst;
                    followers.Load(accessor.ReadResourceData(_Culture, Core.Resources.LanguageResourceType.OrdinalFollowers, true));
                    if (followers.Count > 0)
                    {
                        string          ordinalRx         = "[0-9]+\\.(?=[ \u00A0]" + followers.GetRegularExpression(out unusedFirst) + "\\s)";
                        RegexRecognizer ordinalRecognizer = new RegexRecognizer(TokenType.Word, 100, "ORDINALNUMBER", "Ordinal Number Recognizer");
                        CharacterSet    digits            = new CharacterSet();
                        digits.Add('0', '9');
                        ordinalRecognizer.Add(ordinalRx, digits);
                        AddRecognizer(ordinalRecognizer);
                    }
                }
            }
            // TODO should we still add a rudimentary recognizer for alpha-numerals
            //  when number recognition is switched off?

            // TODO other recognizer types (for builtin token classes)

            // --- acronyms (plus URIs and IP addresses) ---
            if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms) != 0)
            {
                RegexRecognizer acronymRecognizer = CreateAcronymRecognizer(actualCulture, 100);
                if (acronymRecognizer != null)
                {
                    _ReclassifyAcronyms = true;
                    AddRecognizer(acronymRecognizer);
                }

                // this shouldn't be in the "acronym" setting but it's too late for a UI change...
                RegexRecognizer uriRecognizer = CreateUriRecognizer(actualCulture, 100);
                AddRecognizer(uriRecognizer);

                // TODO make IP address recognizer optional?
                AddRecognizer(CreateIPAddressRecognizer(actualCulture, 101));
                // AddRecognizer(CreateHeadingNumberRecognizer(actualCulture, 50));
            }

            // --- variables ---
            if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeVariables) != 0
                && accessor != null)
            {
                try
                {
                    RegexRecognizer variableRecognizer = CreateVariableRecognizer(accessor, actualCulture);
                    if (variableRecognizer != null)
                    {
                        AddRecognizer(variableRecognizer);
                    }
                }
                catch
                {
                    // nop - ignore errors
                }
            }

            // --- measurements ---
            if ((enabled & Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements) != 0)
            {
                Recognizer measureRecognizer = UseRXMeasurementRecognizer
                    ? MeasureRegexRecognizer.Create(actualCulture, 100)
                    : MeasureFSTRecognizer.Create(accessor, actualCulture, 100);

                AddRecognizer(measureRecognizer);

                // disable for the time being due to performance issues
                //if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.CurrencySymbols, true) != Core.Resources.ResourceStatus.NotAvailable)
                //{
                //    recog = CreateCurrencyRecognizer(accessor, actualCulture);
                //    AddRecognizer(recog);
                //}
            }

#if false
            // TODO NOTE this slows down the performance too much - need to find a better way

            if (accessor.GetResourceStatus(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true) != Core.Resources.ResourceStatus.NotAvailable)
            {
                // add an abbreviation recognizer
                Wordlist     abbreviations = new Wordlist();
                CharacterSet first;
                abbreviations.Load(accessor.ReadResourceData(actualCulture, Core.Resources.LanguageResourceType.Abbreviations, true));
                string          abbreviationsRX         = abbreviations.GetRegularExpression(out first) + @"(?=\W)";
                RegexRecognizer abbreviationsRecognizer = new RegexRecognizer(TokenType.Abbreviation, 101, "ABBREVIATION", "Abbreviation Recognizer");
                abbreviationsRecognizer.Add(abbreviationsRX, first);
                AddRecognizer(abbreviationsRecognizer);
            }
#endif

            // --- fallback (always registered) ---
            // Clitics are split only when the setup asks for it and the culture uses them.
            bool separateClitics = setup.SeparateClitics && Core.CultureInfoExtensions.UsesClitics(_Culture);
            Recognizer fallbackRecognizer = CreateDefaultFallbackRecognizer(separateClitics, accessor);
            AddRecognizer(fallbackRecognizer);

            SortRecognizers();
        }