コード例 #1
0
        // See: http://www.codeproject.com/csharp/spart.asp?df=100&forumid=30315&select=1812016#xx1812016xx
        public void pbWithKleene()
        {
            Rule bracketBody        = new Rule();
            Rule bracketedThenParen = new Rule();

            // Text between the brackets: ([aA-zZ0-9 ]+)
            bracketBody.Parser = Ops.OneOrMore(Ops.Choice(Prims.LetterOrDigit, ' '));

            // Whole construct: '[' bracketBody ']' followed by any number of spaces and then '('
            bracketedThenParen.Parser = Ops.Sequence('[', bracketBody, ']', Ops.ZeroOrMore(' '), '(');

            // The construct has to be accepted both with and without a space before '('.
            string[] inputs = new string[] { "[all] (", "[all](" };
            foreach (string input in inputs)
            {
                Assert.IsTrue(bracketedThenParen.Parse(new StringScanner(input)).Success);
            }
        }
コード例 #2
0
ファイル: IcuRulesParser.cs プロジェクト: vkarthim/libpalaso
        /// <summary>
        /// Builds the Spart parser objects that make up the ICU collation rules grammar and
        /// stores each one in the corresponding field of this class.
        /// </summary>
        /// <remarks>
        /// The grammar is built bottom-up: every field is assigned before the later productions
        /// that read it, so the order of the statements in this method matters.
        /// Each production is preceded by a comment giving its intended EBNF-style definition.
        /// When <c>_useDebugger</c> is true, a Spart debugger writing to <c>Console.Out</c> is
        /// created and a selection of the rules is added to it.
        /// </remarks>
        private void DefineParsingRules()
        {
            // someWhiteSpace ::= WS+
            // optionalWhiteSpace ::= WS*
            _someWhiteSpace     = Ops.OneOrMore(Prims.WhiteSpace);
            _optionalWhiteSpace = Ops.ZeroOrMore(Prims.WhiteSpace);

            // Valid escaping formats (from http://www.icu-project.org/userguide/Collate_Customization.html )
            //
            // Most of the characters can be used as parts of rules.
            // However, whitespace characters will be skipped over,
            // and all ASCII characters that are not digits or letters
            // are considered to be part of syntax. In order to use
            // these characters in rules, they need to be escaped.
            // Escaping can be done in several ways:
            // * Single characters can be escaped using backslash \ (U+005C).
            // * Strings can be escaped by putting them between single quotes 'like this'.
            // * Single quote can be quoted using two single quotes ''.
            // Because Unicode escape sequences are allowed in LDML, we need to handle those as well.
            // escapeSequence ::= '\' U[A-F0-9]{8} | u[A-F0-9]{4} | anyChar
            // In the code below each of the three alternatives starts with its own '\\', and the
            // 8-digit 'U' form is listed before the 4-digit 'u' form and the catch-all anyChar form.
            _escapeSequence = Ops.Choice(new Parser[] {
                Ops.Sequence('\\', Ops.Sequence('U', Prims.HexDigit, Prims.HexDigit, Prims.HexDigit,
                                                Prims.HexDigit, Prims.HexDigit, Prims.HexDigit, Prims.HexDigit,
                                                Prims.HexDigit)),
                Ops.Sequence('\\', Ops.Sequence('u', Prims.HexDigit, Prims.HexDigit, Prims.HexDigit,
                                                Prims.HexDigit)),
                Ops.Sequence('\\', Ops.Expect("icu0002", "Invalid escape sequence.", Prims.AnyChar))
            });

            // singleQuoteLiteral ::= "''"
            // quotedStringCharacter ::= AllChars - "'"
            // quotedString ::= "'" (singleQuoteLiteral | quotedStringCharacter)+ "'"
            _singleQuoteLiteral    = Prims.Str("''");
            _quotedStringCharacter = Prims.AnyChar - '\'';
            _quotedString          = Ops.Sequence('\'', Ops.OneOrMore(_singleQuoteLiteral | _quotedStringCharacter),
                                                  Ops.Expect("icu0003", "Quoted string without matching end-quote.", '\''));

            // Any alphanumeric ASCII character and all characters above the ASCII range are valid data characters
            // normalCharacter ::= [A-Za-z0-9] | [U+0080-U+1FFFFF]
            // dataCharacter ::= normalCharacter | singleQuoteLiteral | escapeSequence
            // dataString ::= (dataCharacter | quotedString) (WS? (dataCharacter | quotedString))*
            // NOTE(review): the range in the code ends at char.MaxValue (U+FFFF, the largest single
            // UTF-16 code unit), not at U+1FFFFF as written in the production above.
            // NOTE(review): Prims.LetterOrDigit may accept more than ASCII alphanumerics - confirm against Spart.
            _normalCharacter = Prims.LetterOrDigit | Prims.Range('\u0080', char.MaxValue);
            _dataCharacter   = _normalCharacter | _singleQuoteLiteral | _escapeSequence;
            _dataString      = new Spart.Parsers.NonTerminal.Rule(Ops.List(_dataCharacter | _quotedString, _optionalWhiteSpace));

            // firstOrLast ::= 'first' | 'last'
            // primarySecondaryTertiary ::= 'primary' | 'secondary' | 'tertiary'
            // indirectOption ::= (primarySecondaryTertiary WS 'ignorable') | 'variable' | 'regular' | 'implicit' | 'trailing'
            // indirectPosition ::= '[' WS? firstOrLast WS indirectOption WS? ']'
            // According to the LDML spec, "implicit" should not be allowed in a reset element, but we're not going to check that
            _firstOrLast = Ops.Choice("first", "last");
            _primarySecondaryTertiary = Ops.Choice("primary", "secondary", "tertiary");
            _indirectOption           = Ops.Choice(Ops.Sequence(_primarySecondaryTertiary, _someWhiteSpace, "ignorable"),
                                                   "variable", "regular", "implicit", "trailing");
            _indirectPosition = new Spart.Parsers.NonTerminal.Rule(Ops.Sequence('[', _optionalWhiteSpace,
                                                                                Ops.Expect("icu0004", "Invalid indirect position specifier: unknown option",
                                                                                           Ops.Sequence(_firstOrLast, _someWhiteSpace, _indirectOption)), _optionalWhiteSpace,
                                                                                Ops.Expect("icu0005", "Indirect position specifier missing closing ']'", ']')));

            // top ::= '[' WS? 'top' WS? ']'
            // [top] is a deprecated element in ICU and should be replaced by indirect positioning.
            _top = Ops.Sequence('[', _optionalWhiteSpace, "top", _optionalWhiteSpace, ']');

            // simpleElement ::= indirectPosition | dataString
            _simpleElement = new Spart.Parsers.NonTerminal.Rule("simpleElement", _indirectPosition | _dataString);

            // expansion ::= WS? '/' WS? simpleElement
            // NOTE(review): the rule ids "extend" and "context" below differ from the field names
            // (_expansion, _prefix); presumably other code looks the rules up by id - verify before renaming.
            _expansion = new Spart.Parsers.NonTerminal.Rule("extend", Ops.Sequence(_optionalWhiteSpace, '/', _optionalWhiteSpace,
                                                                                   Ops.Expect("icu0007", "Invalid expansion: Data missing after '/'", _simpleElement)));
            // prefix ::= simpleElement WS? '|' WS?
            _prefix = new Spart.Parsers.NonTerminal.Rule("context", Ops.Sequence(_simpleElement, _optionalWhiteSpace, '|', _optionalWhiteSpace));
            // extendedElement ::= (prefix simpleElement expansion?) | (prefix? simpleElement expansion)
            // The unary '!' applied to a parser stands where the production above has '?'
            // (presumably Spart's "optional" operator - confirm against the library).
            _extendedElement = Ops.Sequence(_prefix, _simpleElement, !_expansion) |
                               Ops.Sequence(!_prefix, _simpleElement, _expansion);

            // beforeOption ::= '1' | '2' | '3'
            // beforeSpecifier ::= '[' WS? 'before' WS beforeOption WS? ']'
            _beforeOption    = Ops.Choice('1', '2', '3');
            _beforeSpecifier = Ops.Sequence('[', _optionalWhiteSpace, "before", _someWhiteSpace,
                                            Ops.Expect("icu0010", "Invalid 'before' specifier: Invalid or missing option", _beforeOption), _optionalWhiteSpace,
                                            Ops.Expect("icu0011", "Invalid 'before' specifier: Missing closing ']'", ']'));

            // The difference operator initially caused some problems with parsing.  The spart library doesn't
            // handle situations where the first choice is the beginning of the second choice.
            // Ex:  differenceOperator = "<" | "<<" | "<<<" | "="     DOES NOT WORK!
            // That will fail to parse both the << and <<< operators because it always thinks it should match <.
            // However, differenceOperator = "<<<" | "<<" | "<" | "=" will work because it tries to match <<< first.
            // I'm using this strange production with the optional '<' characters because it also works and doesn't
            // depend on order.  It is less likely for someone to change it and unknowingly mess it up.
            // differenceOperator ::=  ('<' '<'? '<'?) | '='
            _differenceOperator = Ops.Sequence('<', !Prims.Ch('<'), !Prims.Ch('<')) | Prims.Ch('=');

            // simpleDifference ::= differenceOperator WS? simpleElement
            // extendedDifference ::= differenceOperator WS? extendedElement
            // difference ::= simpleDifference | extendedDifference
            // NOTE: Due to the implementation of the parser, extendedDifference MUST COME BEFORE simpleDifference in the difference definition
            // NOTE(review): the rule id "x" below differs from the field name _extendedDifference;
            // presumably other code depends on that id - verify before renaming.
            _simpleDifference = new Spart.Parsers.NonTerminal.Rule("simpleDifference", Ops.Sequence(_differenceOperator,
                                                                                                    _optionalWhiteSpace, _simpleElement));
            _extendedDifference = new Spart.Parsers.NonTerminal.Rule("x", Ops.Sequence(_differenceOperator,
                                                                                       _optionalWhiteSpace, _extendedElement));
            _difference = _extendedDifference | _simpleDifference;

            // reset ::= '&' WS? ((beforeSpecifier? WS? simpleElement) | top)
            // In the code, the 'top' alternative is listed first, ahead of the simpleElement form.
            _reset = new Spart.Parsers.NonTerminal.Rule("reset", Ops.Sequence('&', _optionalWhiteSpace,
                                                                              _top | Ops.Sequence(!_beforeSpecifier, _optionalWhiteSpace, _simpleElement)));

            // This option is a weird one, as it can come at any place in a rule and sets the preceding
            // dataString as the variable top option in the settings element.  So, it has to look at the
            // data for the preceding element to know its own value, but leaves the preceding and any
            // succeeding elements as if the variable top option wasn't there.  Go figure.
            // Also, it's really probably only valid following a simpleDifference or reset with a dataString
            // and not an indirect position, but checking for all that in the grammar would be very convoluted, so
            // we'll do it in the semantic action and throw.  Yuck.
            // optionVariableTop ::= '<' WS? '[' WS? 'variable' WS? 'top' WS? ']'
            _optionVariableTop = Ops.Sequence('<', _optionalWhiteSpace, '[', _optionalWhiteSpace, "variable",
                                              _optionalWhiteSpace, "top", _optionalWhiteSpace, ']');

            // oneRule ::= reset (WS? (optionVariableTop | difference))*
            // optionVariableTop is listed before difference; both start with '<', so the order is
            // presumably significant here as well (see the differenceOperator discussion above).
            _oneRule = new Spart.Parsers.NonTerminal.Rule("oneRule", Ops.Sequence(_reset, Ops.ZeroOrMore(Ops.Sequence(_optionalWhiteSpace,
                                                                                                                      _optionVariableTop | _difference))));

            // Option notes:
            // * The 'strength' option is specified in ICU as having valid values 1-4 and 'I'.  In the LDML spec, it
            //   seems to indicate that valid values in ICU are 1-5, so I am accepting both and treating 'I' and '5'
            //   as the same.  I'm also accepting 'I' and 'i', although my approach is, in general, to be case-sensitive.
            // * The 'numeric' option is not mentioned on the ICU website, but it is implied as being acceptable ICU
            //   in the LDML spec, so I am supporting it here.
            // * There are LDML options 'match-boundaries' and 'match-style' that are not in ICU, so they are not listed here.
            // * The UCA spec seems to indicate that there is a 'locale' option which is not mentioned in either the
            //   LDML or ICU specs, so I am not supporting it here.  It could be referring to the 'base' element that
            //   is an optional part of the 'collation' element in LDML.
            // optionOnOff ::= 'on' | 'off'
            // optionAlternate ::= 'alternate' WS ('non-ignorable' | 'shifted')
            // optionBackwards ::= 'backwards' WS ('1' | '2')
            // optionNormalization ::= 'normalization' WS optionOnOff
            // optionCaseLevel ::= 'caseLevel' WS optionOnOff
            // optionCaseFirst ::= 'caseFirst' WS ('off' | 'upper' | 'lower')
            // optionStrength ::= 'strength' WS ('1' | '2' | '3' | '4' | 'I' | 'i' | '5')
            // optionHiraganaQ ::= 'hiraganaQ' WS optionOnOff
            // optionNumeric ::= 'numeric' WS optionOnOff
            // characterSet ::= '[' (AnyChar - ']')* ']'
            // optionSuppressContractions ::= 'suppress' WS 'contractions' WS characterSet
            // optionOptimize ::= 'optimize' WS characterSet
            // option ::= '[' WS? (optionAlternate | optionBackwards | optionNormalization | optionCaseLevel
            //            | optionCaseFirst | optionStrength | optionHiraganaQ | optionNumeric
            //            | optionSuppressContractions | optionOptimize) WS? ']'
            _optionOnOff                = Ops.Choice("on", "off");
            _optionAlternate            = Ops.Sequence("alternate", _someWhiteSpace, Ops.Choice("non-ignorable", "shifted"));
            _optionBackwards            = Ops.Sequence("backwards", _someWhiteSpace, Ops.Choice('1', '2'));
            _optionNormalization        = Ops.Sequence("normalization", _someWhiteSpace, _optionOnOff);
            _optionCaseLevel            = Ops.Sequence("caseLevel", _someWhiteSpace, _optionOnOff);
            _optionCaseFirst            = Ops.Sequence("caseFirst", _someWhiteSpace, Ops.Choice("off", "upper", "lower"));
            _optionStrength             = Ops.Sequence("strength", _someWhiteSpace, Ops.Choice('1', '2', '3', '4', 'I', 'i', '5'));
            _optionHiraganaQ            = Ops.Sequence("hiraganaQ", _someWhiteSpace, _optionOnOff);
            _optionNumeric              = Ops.Sequence("numeric", _someWhiteSpace, _optionOnOff);
            _characterSet               = Ops.Sequence('[', Ops.ZeroOrMore(Prims.AnyChar - ']'), ']');
            _optionSuppressContractions = Ops.Sequence("suppress", _someWhiteSpace, "contractions", _someWhiteSpace,
                                                       _characterSet);
            _optionOptimize = Ops.Sequence("optimize", _someWhiteSpace, _characterSet);
            _option         = new Spart.Parsers.NonTerminal.Rule("option", Ops.Sequence('[', _optionalWhiteSpace, _optionAlternate |
                                                                                        _optionBackwards | _optionNormalization | _optionCaseLevel | _optionCaseFirst |
                                                                                        _optionStrength | _optionHiraganaQ | _optionNumeric | _optionSuppressContractions |
                                                                                        _optionOptimize, _optionalWhiteSpace, ']'));

            // I don't know if ICU requires all options first (it's unclear), but I am. :)
            // icuRules ::= WS? (option WS?)* (oneRule WS?)* EOF
            // The trailing Prims.End is wrapped in Ops.Expect with code "icu0015" and the message "Invalid ICU rules."
            _icuRules = new Spart.Parsers.NonTerminal.Rule("icuRules", Ops.Sequence(_optionalWhiteSpace,
                                                                                    Ops.ZeroOrMore(Ops.Sequence(_option, _optionalWhiteSpace)),
                                                                                    Ops.ZeroOrMore(Ops.Sequence(_oneRule, _optionalWhiteSpace)), Ops.Expect("icu0015", "Invalid ICU rules.", Prims.End)));

            // Optional diagnostics: create a debugger that writes to the console and add the
            // rules listed below to it.
            if (_useDebugger)
            {
                _debugger  = new Spart.Debug.Debugger(Console.Out);
                _debugger += _option;
                _debugger += _oneRule;
                _debugger += _reset;
                _debugger += _simpleElement;
                _debugger += _simpleDifference;
                _debugger += _extendedDifference;
                _debugger += _dataString;
            }
        }