Beispiel #1
0
        /// <summary>
        /// Translates the character that followed a backslash into the set of characters
        /// the escape sequence accepts.
        /// </summary>
        /// <param name="c">The character read immediately after the backslash.</param>
        /// <returns>
        /// The accepted character set. An empty <c>CharSet</c> is returned for an
        /// unrecognised escape, which callers can detect as "unknown".
        /// </returns>
        private CharSet EscapedCharToAcceptCharRange(char c)
        {
            // Escaped operators, brackets, backslash, space and '?' stand for themselves.
            const string selfEscaping = ".*|[]+()\\{} ?";
            if (selfEscaping.IndexOf(c) >= 0)
            {
                return SingleChar(c);
            }

            switch (c)
            {
            // Line-break escapes.
            case 'n': return SingleChar('\n');
            case 'r': return SingleChar('\r');

            // Whitespace and its complement (NUL is never part of the complement).
            case 's': return AllWhitespaceCharacters;
            case 'S': return AllCharactersExceptNull.Except(AllWhitespaceCharacters);

            // Digit classes, built from the escd / escD tables. Per the original note the
            // digit table is not limited to ASCII 0-9 (it includes e.g. Tibetan digits).
            case 'd': return new CharSet(false, escd);
            case 'D': return new CharSet(false, escD);

            // Word classes, built from the escw / escW tables.
            case 'w': return new CharSet(false, escw);
            case 'W': return new CharSet(false, escW);
            }

            // Not a known escape: hand back an empty set (the caller may still add to it).
            return new CharSet();
        }
Beispiel #2
0
        /// <summary>
        /// Reads characters from <c>input</c> until one complete regular expression token
        /// has been recognised and returns it.
        /// </summary>
        /// <remarks>
        /// The lexer state is reset to <c>State.Normal</c> on every call, so each call
        /// produces exactly one token: an operator, an accept token carrying a character
        /// set (single character, escape, '.', or a whole [...] class), or a numbered
        /// repetition ({n}, {n,}, {n,m}; ':' is accepted as an alternative to ',').
        /// </remarks>
        /// <returns>
        /// The next token, or <c>null</c> when there is no more input to read.
        /// </returns>
        /// <exception cref="LexerConstructionException">
        /// Thrown for an unknown escape sequence, a '[' inside an open character class,
        /// a malformed numbered repetition, or when the input ends in the middle of an
        /// escape, character class or numbered repetition.
        /// </exception>
        public RegExToken NextToken()
        {
            // Scratch state for the character class / repetition currently being read.
            var classState = new CharacterClassState();
            var numberedRepetitionState = new NumberedRepetitionState();

            state = State.Normal;

            while (input.Peek() != -1)
            {
                var c = (char)input.Read();

                switch (state)
                {
                case State.Normal:
                    switch (c)
                    {
                    case '\\':
                        state = State.NormalEscaped;
                        break;

                    case '[':
                        state = State.BeginCharacterClass;
                        break;

                    case '{':
                        state = State.NumberedRepetition;
                        break;

                    case '(':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorOpenParanthesis };

                    case ')':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorCloseParanthesis };

                    case '|':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorOr };

                    case '+':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorPlus };

                    case '*':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorMul };

                    case '?':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorQuestion };

                    case '.':
                        return new RegExToken
                        {
                            Type = RegExToken.TokenType.Accept,
                            Characters = AllCharactersExceptNull
                        };

                    default:
                        return new RegExToken
                        {
                            Type = RegExToken.TokenType.Accept,
                            Characters = SingleChar(c)
                        };
                    }
                    break;

                case State.NormalEscaped:
                {
                    // An empty set is how the escape table signals "unknown escape".
                    var characters = EscapedCharToAcceptCharRange(c);
                    if (!characters.Any())
                    {
                        throw new LexerConstructionException(string.Format("Unknown escaped character '{0}'", c));
                    }
                    return new RegExToken
                    {
                        Characters = characters,
                        Type = RegExToken.TokenType.Accept
                    };
                }

                case State.BeginCharacterClass:
                    switch (c)
                    {
                    case '^':
                        if (classState.Negated)
                        {
                            // The class is ALREADY negated, so this second ^ is a
                            // literal member of the class.
                            classState.LastChar = '^';
                            state = State.InsideCharacterClass;
                        }
                        classState.Negated = true;
                        break;

                    case '[':
                    case ']':
                    case '-':
                        // At the very start of a class these are literal members.
                        // Move on to the class body so that the next character does not
                        // overwrite this one and a following ']' can close the class.
                        classState.LastChar = c;
                        state = State.InsideCharacterClass;
                        break;

                    case '\\':
                        state = State.InsideCharacterClassEscaped;
                        break;

                    default:
                        classState.LastChar = c;
                        state = State.InsideCharacterClass;
                        break;
                    }
                    break;

                case State.InsideCharacterClass:
                    switch (c)
                    {
                    case '-':
                        if (classState.LastChar == (char)0)
                        {
                            // Nothing is pending to start a range from (the previous item
                            // was an escape or a completed range), so the dash is a
                            // literal. Entering RangeEnd here would use the NUL sentinel
                            // as a range endpoint.
                            classState.LastChar = c;
                        }
                        else
                        {
                            state = State.RangeEnd;
                        }
                        break;

                    case '[':
                        throw new LexerConstructionException("Opening new character class inside an already open one");

                    case ']':
                        // Flush the pending character, if any, and end the class.
                        if (classState.LastChar != (char)0)
                        {
                            classState.CharsSet.Add(classState.LastChar);
                        }

                        return new RegExToken
                        {
                            Type = RegExToken.TokenType.Accept,
                            Characters = classState.Negated
                                ? AllCharactersExceptNull.Except(classState.CharsSet)
                                : classState.CharsSet
                        };

                    case '\\':
                        state = State.InsideCharacterClassEscaped;
                        break;

                    default:
                        // The previous pending character turned out not to start a range;
                        // commit it and keep the new one pending instead.
                        if (classState.LastChar != 0)
                        {
                            classState.CharsSet.Add(classState.LastChar);
                        }
                        classState.LastChar = c;
                        break;
                    }
                    break;

                case State.InsideCharacterClassEscaped:
                {
                    var characters = EscapedCharToAcceptCharsInClass(c);
                    if (!characters.Any())
                    {
                        throw new LexerConstructionException(string.Format("Unknown escaped character '{0}' in character class", c));
                    }

                    if (classState.LastChar != 0)
                    {
                        classState.CharsSet.Add(classState.LastChar);
                    }

                    classState.CharsSet.UnionWith(characters);

                    // An escape leaves no pending character behind.
                    classState.LastChar = (char)0;
                    state = State.InsideCharacterClass;
                }
                break;

                case State.RangeEnd:
                    switch (c)
                    {
                    case ']':
                        // The - was the last character before the end of the class,
                        // so it is a literal: add both it and the pending character.
                        classState.CharsSet.Add(classState.LastChar);
                        classState.CharsSet.Add('-');

                        return new RegExToken
                        {
                            Type = RegExToken.TokenType.Accept,
                            Characters = classState.Negated
                                ? AllCharactersExceptNull.Except(classState.CharsSet)
                                : classState.CharsSet
                        };

                    default:
                        // Accept the range in either order (a-z or z-a).
                        char lastClassChar = classState.LastChar;
                        char from          = lastClassChar < c ? lastClassChar : c;
                        char to            = lastClassChar < c ? c : lastClassChar;
                        classState.CharsSet.AddRange(from, to);
                        classState.LastChar = (char)0;
                        state = State.InsideCharacterClass;
                        break;
                    }
                    break;

                case State.NumberedRepetition:
                    switch (c)
                    {
                    case '0':           // A leading 0 is accepted.
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    case '8':
                    case '9':
                        numberedRepetitionState.Chars.Add(c);
                        break;

                    case '}':
                    case ':':
                    case ',':
                        // Parse whatever digits have been collected for this part.
                        int reps;

                        // A number is required in the FIRST part but OPTIONAL in the second.
                        if (numberedRepetitionState.Chars.Any() || numberedRepetitionState.CurrentPart == 0)
                        {
                            if (!int.TryParse(new string(numberedRepetitionState.Chars.ToArray()), out reps))
                            {
                                throw new LexerConstructionException("Numbered repetition operator contains operand that is not a number");
                            }
                        }
                        else
                        {
                            // Nothing was specified in the second part:
                            // use the max value to mean "no upper bound".
                            reps = int.MaxValue;
                        }
                        numberedRepetitionState.Chars.Clear();

                        // Store the value in the part it belongs to.
                        if (numberedRepetitionState.CurrentPart == 0)
                        {
                            numberedRepetitionState.MinRepetitions = reps;
                        }
                        else
                        {
                            numberedRepetitionState.MaxRepetitions = reps;
                        }

                        if (c == ':' || c == ',')
                        {
                            ++numberedRepetitionState.CurrentPart;
                            if (numberedRepetitionState.CurrentPart > 1)
                            {
                                throw new LexerConstructionException("More than one , in numbered repetition.");
                            }
                        }
                        else
                        {
                            return new RegExToken
                            {
                                Type = RegExToken.TokenType.NumberedRepeat,
                                MinRepetitions = numberedRepetitionState.MinRepetitions,
                                MaxRepetitions = numberedRepetitionState.MaxRepetitions
                            };
                        }
                        break;

                    default:
                        throw new LexerConstructionException(
                                  string.Format("Illegal character {0} in numbered repetition", c));
                    }
                    break;
                }
            }

            // Every character read in State.Normal either returns a token or opens a
            // construct, so leaving the loop in any other state means the input ended in
            // the middle of an escape, character class or numbered repetition.
            if (state != State.Normal)
            {
                throw new LexerConstructionException(
                          string.Format("Unexpected end of regular expression (lexer state {0})", state));
            }

            // We get here if we try to lex when the expression has ended.
            return null;
        }