/// <summary>
/// Translates the character that followed a backslash into the set of
/// characters that the escape sequence accepts.
/// </summary>
/// <param name="c">The character read immediately after the backslash.</param>
/// <returns>
/// The accepted character set, or a freshly constructed <see cref="CharSet"/>
/// (which callers treat as "unknown escape") when <paramref name="c"/> is not
/// a recognised escape.
/// </returns>
private CharSet EscapedCharToAcceptCharRange(char c)
{
    // Metacharacters (plus the space) that stand for themselves when escaped.
    const string selfEscaping = ".*|[]+()\\{} ?";
    if (selfEscaping.IndexOf(c) >= 0)
    {
        return SingleChar(c);
    }

    CharSet accepted;
    switch (c)
    {
        case 'd':
            // NOTE(review): escd is defined elsewhere; presumably the digit
            // table, including non-ASCII digits - confirm.
            accepted = new CharSet(false, escd);
            break;

        case 'D':
            // NOTE(review): presumably the complement of the \d table - confirm.
            accepted = new CharSet(false, escD);
            break;

        case 's':
            accepted = AllWhitespaceCharacters;
            break;

        case 'S':
            // Everything except NUL and the whitespace characters.
            accepted = AllCharactersExceptNull.Except(AllWhitespaceCharacters);
            break;

        case 'w':
            accepted = new CharSet(false, escw);
            break;

        case 'W':
            accepted = new CharSet(false, escW);
            break;

        case 'n':
            accepted = SingleChar('\n');
            break;

        case 'r':
            accepted = SingleChar('\r');
            break;

        default:
            // Not a known escape: hand back a new, empty set.
            accepted = new CharSet();
            break;
    }

    return accepted;
}
/// <summary>
/// Reads characters from the input until one complete regular-expression
/// token has been recognised and returns it.
/// </summary>
/// <returns>
/// The next <see cref="RegExToken"/>, or <c>null</c> when the input runs out
/// before a token has been completed. That includes input which ends in the
/// middle of an escape, a character class or a numbered repetition.
/// </returns>
/// <exception cref="LexerConstructionException">
/// Thrown for an unknown escape sequence, a '[' nested inside a character
/// class, or a malformed numbered repetition.
/// </exception>
public RegExToken NextToken()
{
    // Scratch state for the character class / numbered repetition being read.
    var classState = new CharacterClassState();
    var numberedRepetitionState = new NumberedRepetitionState();
    state = State.Normal;

    while (input.Peek() != -1)
    {
        var c = (char)input.Read();

        switch (state)
        {
            case State.Normal:
                switch (c)
                {
                    case '\\':
                        state = State.NormalEscaped;
                        break;
                    case '[':
                        state = State.BeginCharacterClass;
                        break;
                    case '{':
                        state = State.NumberedRepetition;
                        break;
                    case '(':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorOpenParanthesis };
                    case ')':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorCloseParanthesis };
                    case '|':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorOr };
                    case '+':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorPlus };
                    case '*':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorMul };
                    case '?':
                        return new RegExToken { Type = RegExToken.TokenType.OperatorQuestion };
                    case '.':
                        return new RegExToken { Type = RegExToken.TokenType.Accept, Characters = AllCharactersExceptNull };
                    default:
                        return new RegExToken { Type = RegExToken.TokenType.Accept, Characters = SingleChar(c) };
                }
                break;

            case State.NormalEscaped:
            {
                var characters = EscapedCharToAcceptCharRange(c);
                if (!characters.Any())
                {
                    throw new LexerConstructionException(string.Format("Unknown escaped character '{0}'", c));
                }
                return new RegExToken { Characters = characters, Type = RegExToken.TokenType.Accept };
            }

            case State.BeginCharacterClass:
                switch (c)
                {
                    case '^':
                        if (classState.Negated)
                        {
                            // The class is already negated, so this second '^'
                            // is an ordinary member of the class.
                            classState.LastChar = '^';
                            state = State.InsideCharacterClass;
                        }
                        classState.Negated = true;
                        break;
                    case '[':
                    case ']':
                    case '-':
                        // At the very start of a class these are literals.
                        // Move on to the class body so that the next character
                        // flushes this one into the set instead of silently
                        // overwriting it (and so that a following ']' can
                        // close the class).
                        classState.LastChar = c;
                        state = State.InsideCharacterClass;
                        break;
                    case '\\':
                        state = State.InsideCharacterClassEscaped;
                        break;
                    default:
                        classState.LastChar = c;
                        state = State.InsideCharacterClass;
                        break;
                }
                break;

            case State.InsideCharacterClass:
                switch (c)
                {
                    case '-':
                        state = State.RangeEnd;
                        break;
                    case '[':
                        throw new LexerConstructionException("Opening new character class inside an already open one");
                    case ']':
                        // End of class: flush the pending character, if any.
                        if (classState.LastChar != (char)0)
                        {
                            classState.CharsSet.Add(classState.LastChar);
                        }
                        return new RegExToken
                        {
                            Type = RegExToken.TokenType.Accept,
                            Characters = classState.Negated
                                ? AllCharactersExceptNull.Except(classState.CharsSet)
                                : classState.CharsSet
                        };
                    case '\\':
                        state = State.InsideCharacterClassEscaped;
                        break;
                    default:
                        // LastChar is held back one step so that a following
                        // '-' can use it as the start of a range.
                        if (classState.LastChar != 0)
                        {
                            classState.CharsSet.Add(classState.LastChar);
                        }
                        classState.LastChar = c;
                        break;
                }
                break;

            case State.InsideCharacterClassEscaped:
            {
                var characters = EscapedCharToAcceptCharsInClass(c);
                if (!characters.Any())
                {
                    throw new LexerConstructionException(string.Format("Unknown escaped character '{0}' in character class", c));
                }

                if (classState.LastChar != 0)
                {
                    classState.CharsSet.Add(classState.LastChar);
                }

                // An escape contributes a whole set, so afterwards there is no
                // single pending character that could start a range.
                classState.CharsSet.UnionWith(characters);
                classState.LastChar = (char)0;
                state = State.InsideCharacterClass;
            }
            break;

            case State.RangeEnd:
                if (classState.LastChar == (char)0)
                {
                    // The '-' followed a completed range or an escape, so
                    // there is nothing to start a range from. Treat the '-' as
                    // a literal and handle the current character as a normal
                    // class member, rather than building a range from '\0'.
                    classState.CharsSet.Add('-');
                    state = State.InsideCharacterClass;
                    goto case State.InsideCharacterClass;
                }

                switch (c)
                {
                    case ']':
                        // The '-' was the last character before the end of the
                        // class, so it is a literal and the class ends here.
                        classState.CharsSet.Add(classState.LastChar);
                        classState.CharsSet.Add('-');
                        return new RegExToken
                        {
                            Type = RegExToken.TokenType.Accept,
                            Characters = classState.Negated
                                ? AllCharactersExceptNull.Except(classState.CharsSet)
                                : classState.CharsSet
                        };
                    default:
                        // Accept the range in either order (a-z or z-a).
                        char rangeStart = classState.LastChar;
                        char low = rangeStart < c ? rangeStart : c;
                        char high = rangeStart < c ? c : rangeStart;
                        classState.CharsSet.AddRange(low, high);
                        classState.LastChar = (char)0;
                        state = State.InsideCharacterClass;
                        break;
                }
                break;

            case State.NumberedRepetition:
                switch (c)
                {
                    case '0': // A leading zero is accepted.
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    case '8':
                    case '9':
                        numberedRepetitionState.Chars.Add(c);
                        break;
                    case '}':
                    case ':':
                    case ',':
                        // Parse the digits collected so far. A number is
                        // required in the first part but optional in the second.
                        int reps;
                        if (numberedRepetitionState.Chars.Any() || numberedRepetitionState.CurrentPart == 0)
                        {
                            if (!int.TryParse(new string(numberedRepetitionState.Chars.ToArray()), out reps))
                            {
                                throw new LexerConstructionException("Numbered repetition operator contains operand that is not a number");
                            }
                        }
                        else
                        {
                            // Nothing was given in the second part: use the
                            // maximum value to mean "no upper bound".
                            reps = int.MaxValue;
                        }
                        numberedRepetitionState.Chars.Clear();

                        if (numberedRepetitionState.CurrentPart == 0)
                        {
                            numberedRepetitionState.MinRepetitions = reps;
                        }
                        else
                        {
                            numberedRepetitionState.MaxRepetitions = reps;
                        }

                        if (c == ':' || c == ',')
                        {
                            // Separator: move on to the second part.
                            ++numberedRepetitionState.CurrentPart;
                            if (numberedRepetitionState.CurrentPart > 1)
                            {
                                throw new LexerConstructionException("More than one , in numbered repetition.");
                            }
                        }
                        else
                        {
                            return new RegExToken
                            {
                                Type = RegExToken.TokenType.NumberedRepeat,
                                MinRepetitions = numberedRepetitionState.MinRepetitions,
                                MaxRepetitions = numberedRepetitionState.MaxRepetitions
                            };
                        }
                        break;
                    default:
                        throw new LexerConstructionException(
                            string.Format("Illegal character {0} in numbered repetition", c));
                }
                break;
        }
    }

    // We get here when the input ran out before a token was completed.
    return null;
}