// Example 1
// 0
        // private methods

        /// <summary>
        /// Parses the pattern from the current position (<c>ptr</c>) up to the end of the
        /// enclosing group and appends the result to <paramref name="group"/>.
        /// </summary>
        /// <remarks>
        /// Every loop iteration (1) reads one expression or one literal character,
        /// (2) wraps it in a <c>Repetition</c> when a quantifier follows and (3) appends it
        /// to the alternative currently being built. Consecutive literal characters are
        /// collected into a single <c>Literal</c>.
        /// A group counts as top level when it is a <c>RegularExpression</c>; a top-level
        /// group must not see a ')', every other group must be terminated by one.
        /// </remarks>
        /// <param name="group">Group that receives the parsed content.</param>
        /// <param name="options">
        /// Options in effect at the start of the group. The local copy is passed by reference to
        /// <c>ParseGroupingConstruct</c> and may therefore change while the group is parsed.
        /// </param>
        /// <param name="assertion">
        /// When non-null, the alternatives are stored in its <c>TrueExpression</c> and
        /// <c>FalseExpression</c> and the assertion itself is appended to <paramref name="group"/>;
        /// when null, alternatives are gathered in an <c>Alternation</c>.
        /// </param>
        private void ParseGroup(Group group, RegexOptions options, Assertion assertion)
        {
            bool is_top_level = group is RegularExpression;

            Alternation alternation = null;
            string      literal     = null;     // pending run of literal characters, null when empty

            Group      current = new Group();   // alternative currently being built
            Expression expr    = null;
            bool       closed  = false;         // set once the terminating ')' has been consumed

            while (true)
            {
                ConsumeWhitespace(IsIgnorePatternWhitespace(options));
                if (ptr >= pattern.Length)
                {
                    break;
                }

                // (1) Parse for Expressions

                char ch = pattern[ptr++];

                switch (ch)
                {
                case '^': {
                    Position pos =
                        IsMultiline(options) ? Position.StartOfLine : Position.Start;
                    expr = new PositionAssertion(pos);
                    break;
                }

                case '$': {
                    Position pos =
                        IsMultiline(options) ? Position.EndOfLine : Position.End;
                    expr = new PositionAssertion(pos);
                    break;
                }

                case '.': {
                    Category cat =
                        IsSingleline(options) ? Category.AnySingleline : Category.Any;
                    expr = new CharacterClass(cat, false);
                    break;
                }

                case '\\': {
                    // A non-negative result is a plain character escape; otherwise try the
                    // special escapes and finally fall back to the escaped character itself.
                    int c = ParseEscape(false);
                    if (c >= 0)
                    {
                        ch = (char)c;
                    }
                    else
                    {
                        expr = ParseSpecial(options);

                        if (expr == null)
                        {
                            ch = pattern[ptr++];                                        // default escape
                        }
                    }
                    break;
                }

                case '[': {
                    expr = ParseCharacterClass(options);
                    break;
                }

                case '(': {
                    bool ignore = IsIgnoreCase(options);
                    expr = ParseGroupingConstruct(ref options);
                    if (expr == null)
                    {
                        // The construct produced no expression but may have changed the
                        // options. The pending literal was read under the previous options,
                        // so it is flushed with the ignore-case flag saved before the
                        // construct was parsed, not with the new one.
                        if (literal != null && IsIgnoreCase(options) != ignore)
                        {
                            current.AppendExpression(new Literal(literal, ignore));
                            literal = null;
                        }

                        continue;
                    }
                    break;
                }

                case ')': {
                    closed = true;
                    goto EndOfGroup;
                }

                case '|': {
                    if (literal != null)
                    {
                        current.AppendExpression(new Literal(literal, IsIgnoreCase(options)));
                        literal = null;
                    }

                    if (assertion != null)
                    {
                        // An assertion only has a true and a false branch, so a single '|'
                        // is the maximum. The branch following the last '|' is stored at
                        // EndOfGroup; accepting a second '|' here would let that final
                        // store silently overwrite the middle branch.
                        if (assertion.TrueExpression == null)
                        {
                            assertion.TrueExpression = current;
                        }
                        else
                        {
                            throw NewParseException("Too many | in (?()|).");
                        }
                    }
                    else
                    {
                        if (alternation == null)
                        {
                            alternation = new Alternation();
                        }

                        alternation.AddAlternative(current);
                    }

                    current = new Group();
                    continue;
                }

                case '*':
                case '+':
                case '?': {
                    // A quantifier at this point has nothing in front of it to repeat.
                    throw NewParseException("Bad quantifier.");
                }

                default:
                    break;                              // literal character
                }

                ConsumeWhitespace(IsIgnorePatternWhitespace(options));

                // (2) Check for Repetitions

                if (ptr < pattern.Length)
                {
                    char k = pattern[ptr];
                    int  min = 0, max = 0;
                    bool lazy    = false;
                    bool haveRep = false;


                    if (k == '?' || k == '*' || k == '+')
                    {
                        ++ptr;
                        haveRep = true;

                        switch (k)
                        {
                        case '?': min = 0; max = 1; break;

                        case '*': min = 0; max = 0x7fffffff; break;

                        case '+': min = 1; max = 0x7fffffff; break;
                        }
                    }
                    else if (k == '{' && ptr + 1 < pattern.Length)
                    {
                        // If the bounds cannot be parsed, rewind so that '{' is handled as
                        // an ordinary character by the next iteration.
                        int saved_ptr = ptr;
                        ++ptr;
                        haveRep = ParseRepetitionBounds(out min, out max, options);
                        if (!haveRep)
                        {
                            ptr = saved_ptr;
                        }
                    }

                    if (haveRep)
                    {
                        // A '?' directly after the quantifier makes it lazy.
                        ConsumeWhitespace(IsIgnorePatternWhitespace(options));
                        if (ptr < pattern.Length && pattern[ptr] == '?')
                        {
                            ++ptr;
                            lazy = true;
                        }

                        //It doesn't make sense to assert a given position more than once.
                        bool ignore_repetition = false;
                        if (expr is PositionAssertion)
                        {
                            ignore_repetition = min > 0 && !lazy;
                            max = 1;
                        }

                        if (!ignore_repetition)
                        {
                            Repetition repetition = new Repetition(min, max, lazy);

                            if (expr == null)
                            {
                                // Only the last character is repeated; any pending literal
                                // is flushed separately in step (3).
                                repetition.Expression = new Literal(ch.ToString(), IsIgnoreCase(options));
                            }
                            else
                            {
                                repetition.Expression = expr;
                            }

                            expr = repetition;
                        }
                    }
                }

                // (3) Append Expression and/or Literal

                if (expr == null)
                {
                    if (literal == null)
                    {
                        literal = "";
                    }
                    literal += ch;
                }
                else
                {
                    if (literal != null)
                    {
                        current.AppendExpression(new Literal(literal, IsIgnoreCase(options)));
                        literal = null;
                    }

                    current.AppendExpression(expr);
                    expr = null;
                }

                if (is_top_level && ptr >= pattern.Length)
                {
                    goto EndOfGroup;
                }
            }

EndOfGroup:
            if (is_top_level && closed)
            {
                throw NewParseException("Too many )'s.");
            }
            if (!is_top_level && !closed)
            {
                throw NewParseException("Not enough )'s.");
            }


            // clean up literals and alternations

            if (literal != null)
            {
                current.AppendExpression(new Literal(literal, IsIgnoreCase(options)));
            }

            if (assertion != null)
            {
                // No '|' seen: the whole content is the true branch; otherwise the content
                // after the '|' is the false branch.
                if (assertion.TrueExpression == null)
                {
                    assertion.TrueExpression = current;
                }
                else
                {
                    assertion.FalseExpression = current;
                }

                group.AppendExpression(assertion);
            }
            else if (alternation != null)
            {
                alternation.AddAlternative(current);
                group.AppendExpression(alternation);
            }
            else
            {
                group.AppendExpression(current);
            }
        }
// Example 2
// 0
        /// <summary>
        /// Parses the escape whose first character is at the current position (<c>ptr</c>)
        /// as a character category, a position assertion or a backreference.
        /// </summary>
        /// <param name="options">
        /// Options in effect; the ECMAScript flag selects the Ecma categories and the
        /// ignore-case flag is passed on to the created references.
        /// </param>
        /// <returns>
        /// The parsed expression, or <c>null</c> when the character is not one of the
        /// escapes handled here. In the <c>null</c> case <c>ptr</c> is restored to the
        /// value it had on entry.
        /// </returns>
        private Expression ParseSpecial(RegexOptions options)
        {
            int        p    = ptr;                      // entry position, restored on failure
            bool       ecma = IsECMAScript(options);
            Expression expr = null;

            // NOTE(review): pattern[ptr] is read without a bounds check; this assumes the
            // caller only gets here with at least one character left - confirm.
            switch (pattern[ptr++])
            {
            // categories

            case 'd':
                expr = new CharacterClass(ecma ? Category.EcmaDigit : Category.Digit, false);
                break;

            case 'w':
                expr = new CharacterClass(ecma ? Category.EcmaWord : Category.Word, false);
                break;

            case 's':
                expr = new CharacterClass(ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
                break;

            case 'p':
                // this is odd - ECMAScript isn't supposed to support Unicode,
                // yet \p{..} compiles and runs under the MS implementation
                // identically to canonical mode. That's why I'm ignoring the
                // value of ecma here.

                expr = new CharacterClass(ParseUnicodeCategory(), false);
                break;

            // upper-case variants: same categories with the second constructor
            // argument set to true

            case 'D':
                expr = new CharacterClass(ecma ? Category.EcmaDigit : Category.Digit, true);
                break;

            case 'W':
                expr = new CharacterClass(ecma ? Category.EcmaWord : Category.Word, true);
                break;

            case 'S':
                expr = new CharacterClass(ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
                break;

            case 'P':
                expr = new CharacterClass(ParseUnicodeCategory(), true);
                break;

            // positions

            case 'A': expr = new PositionAssertion(Position.StartOfString); break;

            case 'Z': expr = new PositionAssertion(Position.End); break;

            case 'z': expr = new PositionAssertion(Position.EndOfString); break;

            case 'G': expr = new PositionAssertion(Position.StartOfScan); break;

            case 'b': expr = new PositionAssertion(Position.Boundary); break;

            case 'B': expr = new PositionAssertion(Position.NonBoundary); break;

            // references

            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9': {
                // Step back so that ParseNumber sees the digit consumed by the switch.
                ptr--;
                int n = ParseNumber(10, 1, 0);
                if (n < 0)
                {
                    ptr = p;
                    return null;
                }

                // FIXME test if number is within number of assigned groups
                // this may present a problem for right-to-left matching

                Reference reference = new BackslashNumber(IsIgnoreCase(options), ecma);
                refs.Add(reference, n.ToString());
                expr = reference;
                break;
            }

            case 'k': {
                // Every index below is guarded so that a pattern which ends right after
                // "\k" or right after the name is reported as a parse error instead of
                // raising an IndexOutOfRangeException.
                if (ptr >= pattern.Length)
                {
                    throw NewParseException("Malformed \\k<...> named backreference.");
                }

                // The name is enclosed either in <...> or in '...'.
                char delim = pattern[ptr++];
                if (delim == '<')
                {
                    delim = '>';
                }
                else if (delim != '\'')
                {
                    throw NewParseException("Malformed \\k<...> named backreference.");
                }

                string name = ParseName();
                if (name == null || ptr >= pattern.Length || pattern[ptr] != delim)
                {
                    throw NewParseException("Malformed \\k<...> named backreference.");
                }

                ++ptr;                                  // consume the closing delimiter
                Reference reference = new Reference(IsIgnoreCase(options));
                refs.Add(reference, name);
                expr = reference;
                break;
            }

            default:
                expr = null;
                break;
            }

            // Nothing recognised: undo the character consumed by the switch.
            if (expr == null)
            {
                ptr = p;
            }

            return expr;
        }