Exemplo n.º 1
0
        /// <summary>
        /// Consumes characters from <c>input</c> until one complete regular-expression
        /// token has been recognised, and returns that token.
        /// </summary>
        /// <param name="ignorecase">
        /// Forwarded to <c>SingleChar</c> for ordinary literal characters. When true, a range
        /// inside a character class is added once upper-cased and once lower-cased. Single
        /// characters inside a character class are added exactly as written.
        /// </param>
        /// <returns>
        /// The next token, or <c>null</c> when the input runs out before a token is completed.
        /// </returns>
        /// <exception cref="LexerConstructionException">
        /// Thrown for an unknown escape sequence, a '[' inside an open character class,
        /// or a malformed numbered repetition.
        /// </exception>
        public RegExToken NextToken(bool ignorecase)
        {
            // Scratch state for the token being built; both are recreated on every call.
            CharacterClassState     classState = new CharacterClassState();
            NumberedRepetitionState numberedRepetitionState = new NumberedRepetitionState();

            state = State.Normal;

            while (input.Peek() != -1)
            {
                char c = (char)input.Read();

                switch (state)
                {
                case State.Normal:
                    switch (c)
                    {
                    case '\\':
                        state = State.NormalEscaped;
                        break;

                    case '[':
                        state = State.BeginCharacterClass;
                        break;

                    case '{':
                        state = State.NumberedRepetition;
                        break;

                    case '(':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorOpenParanthesis };

                    case ')':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorCloseParanthesis };

                    case '|':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorOr };

                    case '+':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorPlus };

                    case '*':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorMul };

                    case '?':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorQuestion };

                    case '.':
                        return new RegExToken
                        {
                            Type = RegExToken.TokenType.Accept,
                            Characters = AllCharactersExceptNull
                        };

                    default:
                        return new RegExToken
                        {
                            Type = RegExToken.TokenType.Accept,
                            Characters = SingleChar(c, ignorecase)
                        };
                    }

                    break;

                case State.NormalEscaped:
                {
                    CharSet characters = EscapedCharToAcceptCharRange(c);

                    // An empty set is how the helper reports an escape it does not know.
                    if (!characters.Any())
                    {
                        throw new LexerConstructionException($"Unknown escaped character '{c}'");
                    }

                    return new RegExToken
                    {
                        Characters = characters,
                        Type = RegExToken.TokenType.Accept
                    };
                }

                case State.BeginCharacterClass:
                    switch (c)
                    {
                    case '^':
                        if (classState.Negated)
                        {
                            // The class is ALREADY negated, so this second '^' is a literal caret.
                            classState.LastChar = '^';
                            state = State.InsideCharacterClass;
                        }

                        classState.Negated = true;

                        break;

                    case '\\':
                        state = State.InsideCharacterClassEscaped;

                        break;

                    case '[':
                    case ']':
                    case '-':
                    default:
                        // The first member of a class is always a literal, including '[', ']' and '-'.
                        // Moving on to InsideCharacterClass keeps that literal pending; staying in this
                        // state would let the next character overwrite it and let a later '^' negate.
                        classState.LastChar = c;
                        state = State.InsideCharacterClass;

                        break;
                    }

                    break;

                case State.InsideCharacterClass:
                    switch (c)
                    {
                    case '-':
                        if (classState.LastChar == (char)0)
                        {
                            // Nothing is pending (the hyphen follows an escape or a finished range),
                            // so there is no range start: keep the hyphen as a literal member.
                            classState.LastChar = '-';
                        }
                        else
                        {
                            state = State.RangeEnd;
                        }

                        break;

                    case '[':
                        throw new LexerConstructionException("Opening new character class inside an already open one");

                    case ']':
                        // Flush the pending character, if any, then end the class.
                        if (classState.LastChar != (char)0)
                        {
                            classState.CharsSet.Add(classState.LastChar);
                        }

                        return new RegExToken
                        {
                            Type = RegExToken.TokenType.Accept,
                            Characters = classState.Negated
                                             ? AllCharactersExceptNull.Except(classState.CharsSet)
                                             : classState.CharsSet
                        };

                    case '\\':
                        state = State.InsideCharacterClassEscaped;

                        break;

                    default:
                        // Commit the previous pending character; c becomes the new pending one
                        // because it may still turn out to be the start of a range.
                        if (classState.LastChar != (char)0)
                        {
                            classState.CharsSet.Add(classState.LastChar);
                        }

                        classState.LastChar = c;

                        break;
                    }

                    break;

                case State.InsideCharacterClassEscaped:
                {
                    CharSet characters = EscapedCharToAcceptCharsInClass(c);

                    if (!characters.Any())
                    {
                        throw new LexerConstructionException($"Unknown escaped character '{c}' in character class");
                    }

                    if (classState.LastChar != (char)0)
                    {
                        classState.CharsSet.Add(classState.LastChar);
                    }

                    classState.CharsSet.UnionWith(characters);

                    // (char)0 marks "no pending character".
                    classState.LastChar = (char)0;
                    state = State.InsideCharacterClass;

                    break;
                }

                case State.RangeEnd:
                    // Only entered from InsideCharacterClass with a pending (non-zero) LastChar.
                    switch (c)
                    {
                    case ']':
                        // The '-' sits directly before the end of the class, which means
                        // it is a literal: add both it and the pending character, then end the class.
                        classState.CharsSet.Add(classState.LastChar);
                        classState.CharsSet.Add('-');

                        return new RegExToken
                        {
                            Type = RegExToken.TokenType.Accept,
                            Characters = classState.Negated
                                             ? AllCharactersExceptNull.Except(classState.CharsSet)
                                             : classState.CharsSet
                        };

                    default:
                        // Order the two end points so that a reversed range such as z-a still works.
                        char lastClassChar = classState.LastChar;
                        char from          = lastClassChar < c ? lastClassChar : c;
                        char to            = lastClassChar < c ? c : lastClassChar;

                        if (ignorecase)
                        {
                            // NOTE(review): only the case-mapped end points are used, so characters of the
                            // written range that lie between the two cases are not added - confirm intent.
                            classState.CharsSet.AddRange(char.ToUpper(from), char.ToUpper(to));
                            classState.CharsSet.AddRange(char.ToLower(from), char.ToLower(to));
                        }
                        else
                        {
                            classState.CharsSet.AddRange(from, to);
                        }

                        classState.LastChar = (char)0;

                        state = State.InsideCharacterClass;

                        break;
                    }

                    break;

                case State.NumberedRepetition:
                    switch (c)
                    {
                    case '0':           // A leading 0 is accepted.
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    case '8':
                    case '9':
                        numberedRepetitionState.Chars.Add(c);

                        break;

                    case '}':
                    case ':':
                    case ',':
                        // Parse whatever digits were collected for the part that just ended.
                        int reps;

                        // A number is required in the FIRST part but OPTIONAL in the second.
                        if (numberedRepetitionState.Chars.Any() || numberedRepetitionState.CurrentPart == 0)
                        {
                            if (!int.TryParse(new string(numberedRepetitionState.Chars.ToArray()), out reps))
                            {
                                throw new LexerConstructionException("Numbered repetition operator contains operand that is not a number");
                            }
                        }
                        else
                        {
                            // Nothing was specified in the last part:
                            // use the max value to say that it can be infinite numbers.
                            reps = int.MaxValue;
                        }

                        numberedRepetitionState.Chars.Clear();

                        // Store the value in the slot that belongs to the current part.
                        if (numberedRepetitionState.CurrentPart == 0)
                        {
                            numberedRepetitionState.MinRepetitions = reps;
                        }
                        else
                        {
                            numberedRepetitionState.MaxRepetitions = reps;
                        }

                        if (c == ':' || c == ',')
                        {
                            ++numberedRepetitionState.CurrentPart;

                            if (numberedRepetitionState.CurrentPart > 1)
                            {
                                throw new LexerConstructionException("More than one , in numbered repetition.");
                            }
                        }
                        else
                        {
                            // '}' closes the operator.
                            return new RegExToken
                            {
                                Type           = RegExToken.TokenType.NumberedRepeat,
                                MinRepetitions = numberedRepetitionState.MinRepetitions,
                                MaxRepetitions = numberedRepetitionState.MaxRepetitions
                            };
                        }

                        break;

                    default:
                        throw new LexerConstructionException($"Illegal character {c} in numbered repetition");
                    }

                    break;
                }
            }

            // We get here if we try to lex when the expression has ended.
            // NOTE(review): null is also returned when the input ends in the middle of an escape,
            // a character class or a numbered repetition - consider reporting that as an error.
            return null;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Consumes characters from <c>input</c> until one complete regular-expression
        /// token has been recognised, and returns that token.
        /// </summary>
        /// <returns>
        /// The next token, or <c>null</c> when the input runs out before a token is completed.
        /// </returns>
        /// <exception cref="LexerConstructionException">
        /// Thrown for an unknown escape sequence, a '[' inside an open character class,
        /// or a malformed numbered repetition.
        /// </exception>
        public RegExToken NextToken()
        {
            // Scratch state for the token being built; both are recreated on every call.
            var classState = new CharacterClassState();
            var numberedRepetitionState = new NumberedRepetitionState();
            state = State.Normal;

            while (input.Peek() != -1)
            {
                var c = (char)input.Read();

                switch (state)
                {
                    case State.Normal:
                        switch (c)
                        {
                            case '\\':
                                state = State.NormalEscaped;
                                break;
                            case '[':
                                state = State.BeginCharacterClass;
                                break;
                            case '{':
                                state = State.NumberedRepetition;
                                break;

                            case '(':   return new RegExToken { Type = RegExToken.TokenType.OperatorOpenParanthesis };
                            case ')':   return new RegExToken { Type = RegExToken.TokenType.OperatorCloseParanthesis };
                            case '|':   return new RegExToken { Type = RegExToken.TokenType.OperatorOr };
                            case '+':   return new RegExToken { Type = RegExToken.TokenType.OperatorPlus };
                            case '*':   return new RegExToken { Type = RegExToken.TokenType.OperatorMul };
                            case '?':   return new RegExToken { Type = RegExToken.TokenType.OperatorQuestion };
                            case '.':   return new RegExToken { Type = RegExToken.TokenType.Accept, Characters = AllCharactersExceptNull };
                            default:    return new RegExToken { Type = RegExToken.TokenType.Accept, Characters = SingleChar(c) };
                        }
                        break;

                    case State.NormalEscaped:
                        {
                            var characters = EscapedCharToAcceptCharRange(c);

                            // An empty set is how the helper reports an escape it does not know.
                            if (!characters.Any())
                            {
                                throw new LexerConstructionException(string.Format("Unknown escaped character '{0}'", c));
                            }
                            return new RegExToken { Characters = characters, Type = RegExToken.TokenType.Accept };
                        }

                    case State.BeginCharacterClass:
                        switch (c)
                        {
                            case '^':
                                if (classState.Negated)
                                {
                                    // The class is ALREADY negated, so this second '^' is a literal caret.
                                    classState.LastChar = '^';
                                    state = State.InsideCharacterClass;
                                }
                                classState.Negated = true;
                                break;
                            case '\\':
                                state = State.InsideCharacterClassEscaped;
                                break;
                            case '[':
                            case ']':
                            case '-':
                            default:
                                // The first member of a class is always a literal, including '[', ']' and '-'.
                                // Moving on to InsideCharacterClass keeps that literal pending; staying in this
                                // state would let the next character overwrite it and let a later '^' negate.
                                classState.LastChar = c;
                                state = State.InsideCharacterClass;
                                break;
                        }
                        break;

                    case State.InsideCharacterClass:
                        switch (c)
                        {
                            case '-':
                                if (classState.LastChar == (char)0)
                                {
                                    // Nothing is pending (the hyphen follows an escape or a finished range),
                                    // so there is no range start: keep the hyphen as a literal member.
                                    classState.LastChar = '-';
                                }
                                else
                                {
                                    state = State.RangeEnd;
                                }
                                break;
                            case '[':
                                throw new LexerConstructionException("Opening new character class inside an already open one");
                            case ']':
                                // Flush the pending character, if any, then end the class.
                                if (classState.LastChar != (char)0)
                                {
                                    classState.CharsSet.Add(classState.LastChar);
                                }

                                return new RegExToken
                                {
                                    Type = RegExToken.TokenType.Accept,
                                    Characters = classState.Negated
                                                     ? AllCharactersExceptNull.Except(classState.CharsSet)
                                                     : classState.CharsSet
                                };
                            case '\\':
                                state = State.InsideCharacterClassEscaped;
                                break;
                            default:
                                // Commit the previous pending character; c becomes the new pending one
                                // because it may still turn out to be the start of a range.
                                if (classState.LastChar != (char)0)
                                {
                                    classState.CharsSet.Add(classState.LastChar);
                                }
                                classState.LastChar = c;
                                break;
                        }
                        break;

                    case State.InsideCharacterClassEscaped:
                        {
                            var characters = EscapedCharToAcceptCharsInClass(c);
                            if (!characters.Any())
                            {
                                throw new LexerConstructionException(string.Format("Unknown escaped character '{0}' in character class", c));
                            }

                            if (classState.LastChar != (char)0)
                            {
                                classState.CharsSet.Add(classState.LastChar);
                            }

                            classState.CharsSet.UnionWith(characters);

                            // (char)0 marks "no pending character".
                            classState.LastChar = (char)0;
                            state = State.InsideCharacterClass;
                        }
                        break;

                    case State.RangeEnd:
                        // Only entered from InsideCharacterClass with a pending (non-zero) LastChar.
                        switch (c)
                        {
                            case ']':
                                // The '-' sits directly before the end of the class, which means
                                // it is a literal: add both it and the pending character, then end the class.
                                classState.CharsSet.Add(classState.LastChar);
                                classState.CharsSet.Add('-');

                                return new RegExToken
                                {
                                    Type = RegExToken.TokenType.Accept,
                                    Characters = classState.Negated
                                                     ? AllCharactersExceptNull.Except(classState.CharsSet)
                                                     : classState.CharsSet
                                };

                            default:
                                // Order the two end points so that a reversed range such as z-a still works.
                                char lastClassChar = classState.LastChar;
                                char from = lastClassChar < c ? lastClassChar : c;
                                char to = lastClassChar < c ? c : lastClassChar;
                                classState.CharsSet.AddRange(from, to);
                                classState.LastChar = (char)0;
                                state = State.InsideCharacterClass;
                                break;
                        }
                        break;

                    case State.NumberedRepetition:
                        switch (c)
                        {
                            case '0':   // A leading 0 is accepted.
                            case '1':
                            case '2':
                            case '3':
                            case '4':
                            case '5':
                            case '6':
                            case '7':
                            case '8':
                            case '9':
                                numberedRepetitionState.Chars.Add(c);
                                break;
                            case '}':
                            case ':':
                            case ',':
                                // Parse whatever digits were collected for the part that just ended.
                                int reps;

                                // A number is required in the FIRST part but OPTIONAL in the second.
                                if (numberedRepetitionState.Chars.Any() || numberedRepetitionState.CurrentPart == 0)
                                {
                                    if (!int.TryParse(new string(numberedRepetitionState.Chars.ToArray()), out reps))
                                    {
                                        throw new LexerConstructionException("Numbered repetition operator contains operand that is not a number");
                                    }
                                }
                                else
                                {
                                    // Nothing was specified in the last part:
                                    // use the max value to say that it can be infinite numbers.
                                    reps = int.MaxValue;
                                }
                                numberedRepetitionState.Chars.Clear();

                                // Store the value in the slot that belongs to the current part.
                                if (numberedRepetitionState.CurrentPart == 0)
                                {
                                    numberedRepetitionState.MinRepetitions = reps;
                                }
                                else
                                {
                                    numberedRepetitionState.MaxRepetitions = reps;
                                }

                                if (c == ':' || c == ',')
                                {
                                    ++numberedRepetitionState.CurrentPart;
                                    if (numberedRepetitionState.CurrentPart > 1)
                                    {
                                        throw new LexerConstructionException("More than one , in numbered repetition.");
                                    }
                                }
                                else
                                {
                                    // '}' closes the operator.
                                    return new RegExToken
                                    {
                                        Type = RegExToken.TokenType.NumberedRepeat,
                                        MinRepetitions = numberedRepetitionState.MinRepetitions,
                                        MaxRepetitions = numberedRepetitionState.MaxRepetitions
                                    };
                                }
                                break;
                            default:
                                throw new LexerConstructionException(
                                    string.Format("Illegal character {0} in numbered repetition", c));
                        }
                        break;
                }
            }

            // We get here if we try to lex when the expression has ended.
            // NOTE(review): null is also returned when the input ends in the middle of an escape,
            // a character class or a numbered repetition - consider reporting that as an error.
            return null;
        }