Exemplo n.º 1
0
        /// <summary>
        /// Attempts to parse a character set from <paramref name="input"/>, starting
        /// at position <paramref name="p"/>. If the parse succeeds, <paramref name="p"/>
        /// is updated to point to the first character after the closing bracket
        /// in the input.
        /// </summary>
        /// <param name="input">The input string for the parser</param>
        /// <param name="p">The current position in the input string; overwritten
        /// with the parser's final position once parsing has succeeded</param>
        /// <returns>A character set, if the parse is successful. If no valid character
        /// set can be parsed, an exception will be thrown.</returns>
        public static Core.CharacterSet Parse(string input, ref int p)
        {
            ParsePosition cursor = new ParsePosition(p);
            Core.CharacterSet parsed = new CharacterSetParser(input, cursor).Parse();

            // Only reached when parsing did not throw, so p is left untouched on failure.
            p = cursor.Position;
            return parsed;
        }
        /// <summary>
        /// Builds the default measure recognizer for <paramref name="culture"/>.
        /// Number patterns are computed for the culture's default separators and,
        /// where the respective checks allow it, for swapped and en-US separators.
        /// Every pattern list is passed through <c>AugmentPatterns</c> before its
        /// entries are added to the recognizer.
        /// </summary>
        /// <param name="culture">The culture the patterns are computed for.</param>
        /// <param name="priority">Not referenced by this method; the recognizer is
        /// constructed with the literal value 100.
        /// NOTE(review): confirm whether this parameter was meant to replace that literal.</param>
        /// <returns>The configured recognizer, or <c>null</c> if any exception is
        /// thrown while it is being built.</returns>
        public static Recognizer Create(System.Globalization.CultureInfo culture, int priority)
        {
            try
            {
                // TODO support non-blank languages for unit separation

                MeasureRegexRecognizer recognizer = new MeasureRegexRecognizer(100, "DEFAULT_MEASURE_RECOGNIZER", culture);

                // Augmenting the patterns doesn't change FIRST(), so the set reported by the
                // pattern computer is shared by all patterns of a mode (the number regex
                // pattern computer returns just one pattern anyway).
                Core.CharacterSet firstSet;
                List <string> rxPatterns;

                // Culture default separators (added with value 2).
                rxPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.CultureDefault, out firstSet);
                AugmentPatterns(rxPatterns, culture);
                foreach (string rx in rxPatterns)
                {
                    recognizer.Add(rx, firstSet, 2);
                }

                // Swapped separators (added with value 1).
                if (new SeparatorCombination(culture, false).IsSwappable())
                {
                    rxPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.Swapped, out firstSet);
                    AugmentPatterns(rxPatterns, culture);
                    foreach (string rx in rxPatterns)
                    {
                        recognizer.Add(rx, firstSet, 1);
                    }
                }

                // en-US separators (added with value 0).
                if (NumberPatternComputer.DoAddENUSSeparators(culture))
                {
                    rxPatterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.EnUS, out firstSet);
                    AugmentPatterns(rxPatterns, culture);
                    foreach (string rx in rxPatterns)
                    {
                        recognizer.Add(rx, firstSet, 0);
                    }
                }

                // otherwise "123 ABC" will be recognized as "123 A" "BC" in Japanese
                recognizer.OnlyIfFollowedByNonwordCharacter = true;

                return recognizer;
            }
            catch
            {
                // Best effort: any failure while building the recognizer yields null.
                return null;
            }
        }
Exemplo n.º 3
0
 /// <summary>
 /// Initializes the recognizer from the supplied values. The additional
 /// terminators start out as <c>null</c> and both fallback flags as <c>false</c>.
 /// </summary>
 /// <param name="t">Stored as the recognizer's token type.</param>
 /// <param name="priority">Stored as the recognizer's priority.</param>
 /// <param name="tokenClassName">Stored as the token class name.</param>
 /// <param name="recognizerName">Stored as the recognizer name.</param>
 /// <param name="autoSubstitutable">Stored as the auto-substitutable flag.</param>
 public Recognizer(
     Core.Tokenization.TokenType t,
     int priority,
     string tokenClassName,
     string recognizerName,
     bool autoSubstitutable)
 {
     // Values supplied by the caller.
     _Type = t;
     _Priority = priority;
     _TokenClassName = tokenClassName;
     _RecognizerName = recognizerName;
     _AutoSubstitutable = autoSubstitutable;

     // Fixed defaults.
     _AdditionalTerminators = null;
     _IsFallbackRecognizer = false;
     _OverrideFallbackRecognizer = false;
 }
Exemplo n.º 4
0
        /// <summary>
        /// Builds the default currency recognizer for <paramref name="culture"/>.
        /// The number patterns for the culture's default separators are combined
        /// with the regular expression obtained from <paramref name="currencySymbols"/>
        /// via <c>AugmentPatterns</c> and added to the recognizer.
        /// </summary>
        /// <param name="culture">The culture whose number format and currency
        /// pattern are used.</param>
        /// <param name="currencySymbols">The word list that supplies the currency
        /// regular expression and its FIRST() character set.</param>
        /// <param name="priority">Not referenced by this method; the recognizer is
        /// constructed with the literal value 100.
        /// NOTE(review): confirm whether this parameter was meant to replace that literal.</param>
        /// <returns>The configured recognizer, or <c>null</c> if any exception is
        /// thrown while it is being built (including a <c>null</c> argument being
        /// dereferenced).</returns>
        public static Recognizer Create(System.Globalization.CultureInfo culture,
                                        Core.Wordlist currencySymbols,
                                        int priority)
        {
            try
            {
                // TODO support non-blank languages for unit separation

                Core.CharacterSet curFirst     = null;
                string            currenciesRx = currencySymbols.GetRegularExpression(out curFirst);

                // CurrencyPositivePattern: 0 = "$n", 1 = "n$", 2 = "$ n", 3 = "n $",
                // so the even values place the symbol in front of the number.
                int  currencyPattern        = culture.NumberFormat.CurrencyPositivePattern;
                bool currencyPrecedesNumber = (currencyPattern % 2) == 0;

                CurrencyRegexRecognizer result = new CurrencyRegexRecognizer(100, "DEFAULT_CURRENCY_RECOGNIZER", culture);

                Core.CharacterSet first = null;
                // augmentation doesn't change FIRST()
                // TODO use currency pattern instead of number pattern?
                List <string> patterns = NumberRegexRecognizer.ComputeRXPatterns(culture, NumberSeparatorMode.CultureDefault, out first);

                if (currencyPrecedesNumber)
                {
                    // A match may begin with a currency symbol, so include its FIRST() set as well.
                    first.Add(curFirst);
                }

                AugmentPatterns(patterns, currenciesRx, culture);

                foreach (string p in patterns)
                {
                    // use the same first for all patterns (the number regex pattern computer returns just one pattern anyway)
                    result.Add(p, first, 2);
                }

                // Be strict for currencies (only flexible for measurements and numbers):
                // the swapped-separator and en-US-separator pattern variants are
                // deliberately not added here.

                result.OnlyIfFollowedByNonwordCharacter
                    = Core.CultureInfoExtensions.UseBlankAsWordSeparator(culture);

                return(result);
            }
            catch
            {
                // Best effort: any failure while building the recognizer yields null.
                return(null);
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Parses a bracketed character set at the current position. The set starts
        /// with '[', optionally followed by '^' to negate it, and then contains any
        /// mix of single characters, ranges (<c>a-z</c>) and named classes
        /// (<c>[:name:]</c>) until the closing ']'. The position ends up just
        /// behind that closing bracket.
        /// </summary>
        /// <returns>The character set described by the input.</returns>
        /// <exception cref="Core.LanguagePlatformException">Thrown with
        /// <c>TokenizerInvalidCharacterSet</c> when the input ends before the set is
        /// closed or a class name is missing or unknown. <c>Expect</c> and
        /// <c>ScanChar</c> are defined elsewhere and presumably raise errors of
        /// their own - TODO confirm.</exception>
        private Core.CharacterSet Parse()
        {
            Core.CharacterSet result = new Core.CharacterSet();

            Expect('[');
            if (LookingAt() == '^')
            {
                result.Negated = true;
                _Position.Advance();
            }

            const int startState     = 0;
            const int charclassState = 1;
            const int finalState     = 99;

            int state = startState;

            while (state != finalState)
            {
                char lookAt = LookingAt();

                switch (state)
                {
                case startState:
                    switch (lookAt)
                    {
                    case '[':
                        // possible character class
                        _Position.Advance();
                        if (LookingAt() == ':')
                        {
                            _Position.Advance();
                            state = charclassState;
                        }
                        else if (LookingAt() != '\0')
                        {
                            // not a class after all: treat '[' as a literal and let the
                            // next iteration handle the character behind it (don't advance)
                            result.Add('[');
                        }
                        // at end of input nothing is added; the next iteration
                        // sees '\0' and reports the error
                        break;

                    case '\0':
                        // unexpected end
                        throw new Core.LanguagePlatformException(Core.ErrorCode.TokenizerInvalidCharacterSet, _Input);

                    case ']':
                        // right bracket - move on to final state
                        _Position.Advance();
                        state = finalState;
                        break;

                    default:
                    {
                        // a plain character (escaped, simple, or Unicode hex)
                        char lower = ScanChar();

                        if (LookingAt() == '-')
                        {
                            _Position.Advance();
                            if (LookingAt() == ']')
                            {
                                // dash directly before the closing bracket - add a literal dash
                                // to the charset, skip ], and go to final state
                                result.Add(lower);
                                result.Add('-');
                                _Position.Advance();
                                state = finalState;
                            }
                            else
                            {
                                // range lower-upper
                                char upper = ScanChar();
                                result.Add(lower, upper);
                            }
                        }
                        else
                        {
                            result.Add(lower);
                        }
                    }
                    break;
                    }
                    break;

                case charclassState:
                {
                    // just got '[' followed by ':'

                    if (LookingAt() == '\0')
                    {
                        throw new Core.LanguagePlatformException(Core.ErrorCode.TokenizerInvalidCharacterSet, _Input);
                    }

                    StringBuilder className = new StringBuilder();
                    while (char.IsLetter(LookingAt()))
                    {
                        className.Append(LookingAt());
                        _Position.Advance();
                    }

                    if (className.Length == 0)
                    {
                        throw new Core.LanguagePlatformException(Core.ErrorCode.TokenizerInvalidCharacterSet, _Input);
                    }

                    // Lower-case with the invariant culture: the culture-sensitive ToLower()
                    // maps 'I' to a dotless 'ı' under Turkish/Azeri cultures, which would make
                    // the lookup of otherwise valid class names fail.
                    Nullable <System.Globalization.UnicodeCategory> category =
                        Core.CharacterProperties.GetUnicodeCategoryFromName(className.ToString().ToLowerInvariant());

                    if (!category.HasValue)
                    {
                        throw new Core.LanguagePlatformException(Core.ErrorCode.TokenizerInvalidCharacterSet, _Input);
                    }

                    result.Add(category.Value);

                    Expect(':');
                    Expect(']');

                    state = startState;
                }
                break;

                default:
                    // unexpected state
                    throw new Core.LanguagePlatformException(Core.ErrorCode.TokenizerInvalidCharacterSet, _Input);
                }
            }

            return(result);
        }