/**
 * Parses a regular expression factor, i.e. an atom optionally
 * followed by an atom modifier. This method handles the Fact
 * production in the grammar (see regexp.grammar).
 *
 * The atom is built from a fresh placeholder state. Afterwards
 * the placeholder is either merged into the start state, or the
 * start state is linked to it with an epsilon transition.
 *
 * @param start          the initial NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseFact(NFAState start) {
    NFAState atomStart = new NFAState();
    NFAState atomEnd = ParseAtom(atomStart);
    int next = PeekChar(0);

    // A modifier character directly after the atom applies to it
    if (next == '?' || next == '*' || next == '+' || next == '{') {
        atomEnd = ParseAtomModifier(atomStart, atomEnd);
    }

    // Merge unless the placeholder has incoming transitions and
    // the start state already has outgoing ones
    if (atomStart.incoming.Length == 0 || start.outgoing.Length == 0) {
        atomStart.MergeInto(start);
        if (atomEnd == atomStart) {
            // The placeholder was also the end state, and it has
            // now been replaced by the start state
            return start;
        }
        return atomEnd;
    }
    start.AddOut(new NFAEpsilonTransition(atomStart));
    return atomEnd;
}
/**
 * Parses a regular expression made up of one or more terms
 * separated by '|'. This method handles the Expr production in
 * the grammar (see regexp.grammar).
 *
 * Each term is parsed from its own fresh start state. That state
 * and the term end state are then either merged into, or linked
 * by epsilon transitions to, the shared start and end states.
 *
 * @param start          the initial NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseExpr(NFAState start) {
    NFAState exprEnd = new NFAState();

    while (true) {
        if (PeekChar(0) == '|') {
            ReadChar('|');
        }

        NFAState termStart = new NFAState();
        NFAState termEnd = ParseTerm(termStart);

        // Attach the beginning of the term to the start state
        if (termStart.incoming.Length > 0) {
            start.AddOut(new NFAEpsilonTransition(termStart));
        } else {
            termStart.MergeInto(start);
        }

        // Attach the end of the term to the shared end state
        bool mergeEnd = termEnd.outgoing.Length == 0
                     || (!exprEnd.HasTransitions() && PeekChar(0) != '|');
        if (mergeEnd) {
            termEnd.MergeInto(exprEnd);
        } else {
            termEnd.AddOut(new NFAEpsilonTransition(exprEnd));
        }

        // Stop when no further alternative follows
        if (PeekChar(0) != '|') {
            return exprEnd;
        }
    }
}
/// <summary>
/// Parses a regular expression character set. This method handles
/// the contents of the '[...]' construct in a regular expression.
/// A leading '^' creates the range transition with its first
/// constructor argument set to <c>true</c>. Parsing stops, without
/// reading it, at the first ']' or when no more characters are
/// available.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseCharSet(NFAState start)
{
    NFAState end = new NFAState();
    bool leadingCaret = false;

    if (this.PeekChar(0) == '^')
    {
        this.ReadChar('^');
        leadingCaret = true;
    }

    NFACharRangeTransition range =
        new NFACharRangeTransition(leadingCaret, this.ignoreCase, end);
    start.AddOut(range);

    while (this.PeekChar(0) > 0)
    {
        char first = (char)this.PeekChar(0);

        if (first == ']')
        {
            // The closing bracket is left for the caller
            break;
        }
        if (first == '\\')
        {
            range.AddCharacter(this.ReadEscapeChar());
            continue;
        }

        this.ReadChar(first);

        // A '-' only forms a range when another character other
        // than ']' follows it
        bool isRange = this.PeekChar(0) == '-'
                    && this.PeekChar(1) > 0
                    && this.PeekChar(1) != ']';
        if (isRange)
        {
            this.ReadChar('-');
            char last = this.ReadChar();
            range.AddRange(first, last);
        }
        else
        {
            range.AddCharacter(first);
        }
    }
    return end;
}
/**
 * Adds a string match to this automaton. New states and
 * transitions will be added to extend this automaton to support
 * the specified string.
 *
 * For a case-sensitive match whose first character is below 128,
 * the first state is looked up in (or added to) the initialChar
 * table. Otherwise the first transition is added to the initial
 * state.
 *
 * @param str            the string to match
 * @param ignoreCase     the case-insensitive match flag
 * @param value          the match value
 */
public void AddTextMatch(string str, bool ignoreCase, TokenPattern value) {
    char first = str[0];
    NFAState current;

    if (ignoreCase || first >= 128) {
        current = initial.AddOut(first, ignoreCase, null);
    } else {
        current = initialChar[first];
        if (current == null) {
            current = new NFAState();
            initialChar[first] = current;
        }
    }

    // Extend the automaton with the remaining characters
    int index = 1;
    while (index < str.Length) {
        current = current.AddOut(str[index], ignoreCase, null);
        index++;
    }
    current.value = value;
}
/**
 * Merges all the transitions in this state into another state.
 * Every incoming transition is added to the other state and
 * retargeted to it, and every outgoing transition is added to
 * the other state. The incoming and outgoing fields of this
 * state are set to null afterwards.
 *
 * @param state          the state to merge into
 */
public void MergeInto(NFAState state) {
    int index = 0;

    // Move the incoming transitions and point them at the new state
    while (index < this.incoming.Length) {
        state.AddIn(this.incoming[index]);
        this.incoming[index].state = state;
        index++;
    }
    this.incoming = null;

    // Move the outgoing transitions
    index = 0;
    while (index < this.outgoing.Length) {
        state.AddOut(this.outgoing[index]);
        index++;
    }
    this.outgoing = null;
}
/// <summary>
/// Parses a regular expression character. This method handles
/// a single normal character in a regular expression. A backslash
/// is delegated to <c>ParseEscapeChar</c>, while '^' and '$' are
/// rejected as unsupported special characters.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseChar(NFAState start)
{
    int next = this.PeekChar(0);

    if (next == '\\')
    {
        return this.ParseEscapeChar(start);
    }
    if (next == '^' || next == '$')
    {
        throw new RegExpException(
            RegExpException.ErrorType.UnsupportedSpecialCharacter,
            this.pos,
            this.pattern);
    }

    char literal = this.ReadChar();
    return start.AddOut(literal, this.ignoreCase, new NFAState());
}
/**
 * Parses a regular expression character. This method handles
 * a single normal character in a regular expression. The '^'
 * and '$' characters are rejected as unsupported, and a
 * backslash is handed over to ParseEscapeChar.
 *
 * @param start          the initial NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseChar(NFAState start) {
    int next = PeekChar(0);
    bool unsupported = (next == '^' || next == '$');

    if (unsupported) {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
            pos,
            pattern);
    }
    if (next == '\\') {
        return(ParseEscapeChar(start));
    }
    return(start.AddOut(ReadChar(), ignoreCase, new NFAState()));
}
/// <summary>
/// Parses a regular expression atom. This method handles the
/// Atom production in the grammar (see regexp.grammar). An atom
/// is a '.', a parenthesized expression, a '[...]' character set
/// or a single character.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseAtom(NFAState start)
{
    int next = this.PeekChar(0);

    if (next == '.')
    {
        this.ReadChar('.');
        return start.AddOut(new NFADotTransition(new NFAState()));
    }
    if (next == '(')
    {
        this.ReadChar('(');
        NFAState groupEnd = this.ParseExpr(start);
        this.ReadChar(')');
        return groupEnd;
    }
    if (next == '[')
    {
        this.ReadChar('[');
        NFAState setEnd = this.ParseCharSet(start);
        this.ReadChar(']');
        return setEnd;
    }

    // These characters, and a peek result of -1, cannot start an atom
    bool unexpected = next == -1
                   || next == ')' || next == ']'
                   || next == '{' || next == '}'
                   || next == '?' || next == '*'
                   || next == '+' || next == '|';
    if (unexpected)
    {
        throw new RegExpException(
            RegExpException.ErrorType.UnexpectedCharacter,
            this.pos,
            this.pattern);
    }
    return this.ParseChar(start);
}
/**
 * Parses a regular expression atom. This method handles the
 * Atom production in the grammar (see regexp.grammar). An atom
 * is a '.', a parenthesized expression, a '[...]' character set
 * or a single character.
 *
 * @param start          the initial NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseAtom(NFAState start) {
    NFAState result;

    switch (PeekChar(0)) {
    case '.':
        ReadChar('.');
        result = start.AddOut(new NFADotTransition(new NFAState()));
        break;
    case '(':
        ReadChar('(');
        result = ParseExpr(start);
        ReadChar(')');
        break;
    case '[':
        ReadChar('[');
        result = ParseCharSet(start);
        ReadChar(']');
        break;
    case -1:
    case ')':
    case ']':
    case '{':
    case '}':
    case '?':
    case '*':
    case '+':
    case '|':
        // None of these can start an atom
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            pos,
            pattern);
    default:
        result = ParseChar(start);
        break;
    }
    return(result);
}
/// <summary>
/// Parses a regular expression character escape. This method
/// handles a single character escape in a regular expression.
/// The codes d, D, s, S, w and W following a backslash produce
/// the corresponding character class transition. Any other
/// escape is read with <c>ReadEscapeChar</c> and added as a
/// character transition.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseEscapeChar(NFAState start)
{
    NFAState end = new NFAState();
    char code = '\0';

    if (this.PeekChar(0) == '\\' && this.PeekChar(1) > 0)
    {
        code = (char)this.PeekChar(1);
    }

    // '\0' is never found below, so a missing code also ends up here
    if ("dDsSwW".IndexOf(code) < 0)
    {
        return start.AddOut(this.ReadEscapeChar(), this.ignoreCase, end);
    }

    // Read the backslash and the class code
    this.ReadChar();
    this.ReadChar();

    if (code == 'd')
    {
        return start.AddOut(new NFADigitTransition(end));
    }
    if (code == 'D')
    {
        return start.AddOut(new NFANonDigitTransition(end));
    }
    if (code == 's')
    {
        return start.AddOut(new NFAWhitespaceTransition(end));
    }
    if (code == 'S')
    {
        return start.AddOut(new NFANonWhitespaceTransition(end));
    }
    if (code == 'w')
    {
        return start.AddOut(new NFAWordTransition(end));
    }
    return start.AddOut(new NFANonWordTransition(end));
}
/// <summary>
/// Parses a regular expression made up of one or more terms
/// separated by '|'. This method handles the <c>Expr</c>
/// production in the grammar (see regexp.grammar). Each term is
/// parsed from its own fresh start state. That state and the term
/// end state are then either merged into, or linked by epsilon
/// transitions to, the shared start and end states.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseExpr(NFAState start)
{
    NFAState exprEnd = new NFAState();
    bool moreAlternatives;

    do
    {
        if (this.PeekChar(0) == '|')
        {
            this.ReadChar('|');
        }

        NFAState termStart = new NFAState();
        NFAState termEnd = this.ParseTerm(termStart);

        // Attach the beginning of the term to the start state
        if (termStart.Incoming.Count != 0)
        {
            start.AddOut(new NFAEpsilonTransition(termStart));
        }
        else
        {
            termStart.MergeInto(start);
        }

        // Attach the end of the term to the shared end state
        bool linkByEpsilon = termEnd.Outgoing.Count != 0
                          && (exprEnd.HasTransitions || this.PeekChar(0) == '|');
        if (linkByEpsilon)
        {
            termEnd.AddOut(new NFAEpsilonTransition(exprEnd));
        }
        else
        {
            termEnd.MergeInto(exprEnd);
        }

        moreAlternatives = this.PeekChar(0) == '|';
    }
    while (moreAlternatives);

    return exprEnd;
}
/**
 * Parses a regular expression character escape. This method
 * handles a single character escape in a regular expression.
 * The codes d, D, s, S, w and W following a backslash produce
 * the corresponding character class transition. Any other
 * escape is read with ReadEscapeChar and added as a character
 * transition.
 *
 * @param start          the initial NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseEscapeChar(NFAState start) {
    NFAState end = new NFAState();

    if (PeekChar(0) != '\\' || PeekChar(1) <= 0) {
        return start.AddOut(ReadEscapeChar(), ignoreCase, end);
    }

    char code = (char) PeekChar(1);
    bool isClassCode = code == 'd' || code == 'D'
                    || code == 's' || code == 'S'
                    || code == 'w' || code == 'W';
    if (!isClassCode) {
        return start.AddOut(ReadEscapeChar(), ignoreCase, end);
    }

    // Read the backslash and the class code
    ReadChar();
    ReadChar();

    switch (code) {
    case 'd':
        return start.AddOut(new NFADigitTransition(end));
    case 'D':
        return start.AddOut(new NFANonDigitTransition(end));
    case 's':
        return start.AddOut(new NFAWhitespaceTransition(end));
    case 'S':
        return start.AddOut(new NFANonWhitespaceTransition(end));
    case 'w':
        return start.AddOut(new NFAWordTransition(end));
    default:
        // Only 'W' remains at this point
        return start.AddOut(new NFANonWordTransition(end));
    }
}
/**
 * Parses a regular expression character. This method handles
 * a single normal character in a regular expression. The '^'
 * and '$' characters are rejected as unsupported, and a
 * backslash is handed over to ParseEscapeChar.
 *
 * @param start          the initial NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseChar(NFAState start) {
    int next = PeekChar(0);

    if (next == '^' || next == '$') {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
            pos,
            pattern);
    }
    return (next == '\\')
         ? ParseEscapeChar(start)
         : start.AddOut(ReadChar(), ignoreCase, new NFAState());
}
/**
 * Parses a regular expression character set. This method handles
 * the contents of the '[...]' construct in a regular expression.
 * A leading '^' creates the range transition with its first
 * constructor argument set to true. Parsing stops, without
 * reading it, at the first ']' or when no more characters are
 * available.
 *
 * @param start          the initial NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseCharSet(NFAState start) {
    NFAState end = new NFAState();
    bool leadingCaret = false;

    if (PeekChar(0) == '^') {
        ReadChar('^');
        leadingCaret = true;
    }

    NFACharRangeTransition range =
        new NFACharRangeTransition(leadingCaret, ignoreCase, end);
    start.AddOut(range);

    while (PeekChar(0) > 0) {
        char first = (char) PeekChar(0);

        if (first == ']') {
            // The closing bracket is left for the caller
            break;
        }
        if (first == '\\') {
            range.AddCharacter(ReadEscapeChar());
            continue;
        }

        ReadChar(first);

        // Without a '-' followed by a character other than ']',
        // this is a single character
        if (PeekChar(0) != '-' || PeekChar(1) <= 0 || PeekChar(1) == ']') {
            range.AddCharacter(first);
            continue;
        }

        ReadChar('-');
        char last = ReadChar();
        range.AddRange(first, last);
    }
    return end;
}
/**
 * Parses a regular expression atom modifier. This method handles
 * the AtomModifier production in the grammar (see regexp.grammar).
 *
 * The modifiers '?', '*', '+' and '{min[,[max]]}' are read, but
 * only the repeat counts 0..1, 0..unlimited and 1..unlimited are
 * turned into transitions. Any other count is rejected as an
 * invalid repeat count. A '?' or '+' directly after the modifier
 * (reluctant or possessive form) is rejected as unsupported.
 *
 * @param start          the initial NFA state
 * @param end            the terminal NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseAtomModifier(NFAState start, NFAState end) {
    int min;
    int max;                // -1 means no upper limit
    int firstPos = pos;     // modifier position, used in error reports
    int next;

    // Read min and max
    switch (ReadChar()) {
    case '?':
        min = 0;
        max = 1;
        break;
    case '*':
        min = 0;
        max = -1;
        break;
    case '+':
        min = 1;
        max = -1;
        break;
    case '{':
        min = ReadNumber();
        max = min;
        if (PeekChar(0) == ',') {
            ReadChar(',');
            max = -1;
            if (PeekChar(0) != '}') {
                max = ReadNumber();
            }
        }
        ReadChar('}');
        if (max == 0 || (max > 0 && min > max)) {
            throw new RegExpException(
                RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                firstPos,
                pattern);
        }
        break;
    default:
        // The character has already been read, hence pos - 1
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            pos - 1,
            pattern);
    }

    // Read possessive or reluctant modifiers
    next = PeekChar(0);
    if (next == '?' || next == '+') {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
            pos,
            pattern);
    }

    // Handle supported repeaters
    if (min == 0 && max == 1) {
        return start.AddOut(new NFAEpsilonTransition(end));
    } else if (min == 0 && max == -1) {
        if (end.outgoing.Length == 0) {
            end.MergeInto(start);
        } else {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return start;
    } else if (min == 1 && max == -1) {
        // When a single transition joins start and end, a copy of
        // it from end back to end is used in place of an epsilon
        // transition back to start
        if (start.outgoing.Length == 1
         && end.outgoing.Length == 0
         && end.incoming.Length == 1
         && start.outgoing[0] == end.incoming[0]) {

            end.AddOut(start.outgoing[0].Copy(end));
        } else {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return end;
    } else {
        throw new RegExpException(
            RegExpException.ErrorType.INVALID_REPEAT_COUNT,
            firstPos,
            pattern);
    }
}
/**
 * Parses a regular expression atom. This method handles the
 * Atom production in the grammar (see regexp.grammar). An atom
 * is a '.', a parenthesized expression, a '[...]' character set
 * or a single character.
 *
 * @param start          the initial NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseAtom(NFAState start) {
    const string rejected = ")]{}?*+|";
    int next = PeekChar(0);

    if (next == '.') {
        ReadChar('.');
        return start.AddOut(new NFADotTransition(new NFAState()));
    }
    if (next == '(') {
        ReadChar('(');
        NFAState groupEnd = ParseExpr(start);
        ReadChar(')');
        return groupEnd;
    }
    if (next == '[') {
        ReadChar('[');
        NFAState setEnd = ParseCharSet(start);
        ReadChar(']');
        return setEnd;
    }

    // A peek result of -1 and the rejected characters cannot
    // start an atom
    if (next == -1 || rejected.IndexOf((char) next) >= 0) {
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            pos,
            pattern);
    }
    return ParseChar(start);
}
/**
 * Parses a regular expression factor, i.e. an atom optionally
 * followed by an atom modifier. This method handles the Fact
 * production in the grammar (see regexp.grammar).
 *
 * The atom is built from a fresh inner state. Afterwards the
 * start state is either linked to that state with an epsilon
 * transition, or the inner state is merged into the start state.
 *
 * @param start          the initial NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseFact(NFAState start) {
    NFAState inner = new NFAState();
    NFAState result = ParseAtom(inner);
    bool hasModifier;

    switch (PeekChar(0)) {
    case '?':
    case '*':
    case '+':
    case '{':
        hasModifier = true;
        break;
    default:
        hasModifier = false;
        break;
    }
    if (hasModifier) {
        result = ParseAtomModifier(inner, result);
    }

    // An epsilon link is used only when the inner state has
    // incoming transitions and the start state has outgoing ones
    bool linkByEpsilon = inner.incoming.Length > 0
                      && start.outgoing.Length > 0;
    if (linkByEpsilon) {
        start.AddOut(new NFAEpsilonTransition(inner));
    } else {
        inner.MergeInto(start);
        if (result == inner) {
            // The inner state was also the end state, and it has
            // now been replaced by the start state
            result = start;
        }
    }
    return result;
}
/**
 * Parses a regular expression made up of one or more terms
 * separated by '|'. This method handles the Expr production in
 * the grammar (see regexp.grammar).
 *
 * Each alternative is parsed from its own fresh start state.
 * That state and the alternative's end state are then either
 * merged into, or linked by epsilon transitions to, the shared
 * start and end states.
 *
 * @param start          the initial NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseExpr(NFAState start) {
    NFAState result = new NFAState();
    bool more;

    do {
        if (PeekChar(0) == '|') {
            ReadChar('|');
        }

        NFAState altStart = new NFAState();
        NFAState altEnd = ParseTerm(altStart);

        // Attach the beginning of the alternative
        if (altStart.incoming.Length != 0) {
            start.AddOut(new NFAEpsilonTransition(altStart));
        } else {
            altStart.MergeInto(start);
        }

        // Attach the end of the alternative
        if (altEnd.outgoing.Length != 0
         && (result.HasTransitions() || PeekChar(0) == '|')) {

            altEnd.AddOut(new NFAEpsilonTransition(result));
        } else {
            altEnd.MergeInto(result);
        }

        more = (PeekChar(0) == '|');
    } while (more);

    return result;
}
/**
 * Parses a regular expression atom modifier. This method handles
 * the AtomModifier production in the grammar (see regexp.grammar).
 *
 * The modifiers '?', '*', '+' and '{min[,[max]]}' are read, but
 * only the repeat counts 0..1, 0..unlimited and 1..unlimited are
 * turned into transitions. Any other count is rejected as an
 * invalid repeat count. A '?' or '+' directly after the modifier
 * (reluctant or possessive form) is rejected as unsupported.
 *
 * @param start          the initial NFA state
 * @param end            the terminal NFA state
 *
 * @return the terminating NFA state
 *
 * @throws RegExpException if an error was encountered in the
 *             pattern string
 */
private NFAState ParseAtomModifier(NFAState start, NFAState end) {
    int min;
    int max;                // -1 means no upper limit
    int firstPos = pos;     // modifier position, used in error reports
    int following;

    // Read min and max
    switch (ReadChar()) {
    case '?':
        min = 0;
        max = 1;
        break;
    case '*':
        min = 0;
        max = -1;
        break;
    case '+':
        min = 1;
        max = -1;
        break;
    case '{':
        min = ReadNumber();
        max = min;
        if (PeekChar(0) == ',') {
            ReadChar(',');
            max = -1;
            if (PeekChar(0) != '}') {
                max = ReadNumber();
            }
        }
        ReadChar('}');
        if (max == 0 || (max > 0 && min > max)) {
            throw new RegExpException(
                RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                firstPos,
                pattern);
        }
        break;
    default:
        // The character has already been read, hence pos - 1
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            pos - 1,
            pattern);
    }

    // Read possessive or reluctant modifiers
    following = PeekChar(0);
    if (following == '?' || following == '+') {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
            pos,
            pattern);
    }

    // Handle supported repeaters
    if (min == 0 && max == 1) {
        return(start.AddOut(new NFAEpsilonTransition(end)));
    } else if (min == 0 && max == -1) {
        if (end.outgoing.Length == 0) {
            end.MergeInto(start);
        } else {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return(start);
    } else if (min == 1 && max == -1) {
        // When a single transition joins start and end, a copy of
        // it from end back to end is used in place of an epsilon
        // transition back to start
        if (start.outgoing.Length == 1
         && end.outgoing.Length == 0
         && end.incoming.Length == 1
         && start.outgoing[0] == end.incoming[0]) {

            end.AddOut(start.outgoing[0].Copy(end));
        } else {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return(end);
    } else {
        throw new RegExpException(
            RegExpException.ErrorType.INVALID_REPEAT_COUNT,
            firstPos,
            pattern);
    }
}
/// <summary>
/// Parses a regular expression atom modifier. This method handles
/// the AtomModifier production in the grammar (see regexp.grammar).
/// The modifiers '?', '*', '+' and '{min[,[max]]}' are read, but
/// only the repeat counts 0..1, 0..unlimited and 1..unlimited are
/// turned into transitions. Any other count is rejected as an
/// invalid repeat count. A '?' or '+' directly after the modifier
/// (reluctant or possessive form) is rejected as unsupported.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <param name="end">The terminal NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseAtomModifier(
    NFAState start,
    NFAState end)
{
    int min;
    int max;                    // -1 means no upper limit
    int firstPos = this.pos;    // modifier position, used in error reports

    // Read min and max
    switch (this.ReadChar())
    {
        case '?':
            min = 0;
            max = 1;
            break;
        case '*':
            min = 0;
            max = -1;
            break;
        case '+':
            min = 1;
            max = -1;
            break;
        case '{':
            min = this.ReadNumber();
            max = min;
            if (this.PeekChar(0) == ',')
            {
                this.ReadChar(',');
                max = -1;
                if (this.PeekChar(0) != '}')
                {
                    max = this.ReadNumber();
                }
            }
            this.ReadChar('}');
            if (max == 0 || (max > 0 && min > max))
            {
                throw new RegExpException(
                    RegExpException.ErrorType.InvalidRepeatCount,
                    firstPos,
                    this.pattern);
            }
            break;
        default:
            // The character has already been read, hence pos - 1
            throw new RegExpException(
                RegExpException.ErrorType.UnexpectedCharacter,
                this.pos - 1,
                this.pattern);
    }

    // Read possessive or reluctant modifiers
    int following = this.PeekChar(0);
    if (following == '?' || following == '+')
    {
        throw new RegExpException(
            RegExpException.ErrorType.UnsupportedSpecialCharacter,
            this.pos,
            this.pattern);
    }

    // Handle supported repeaters
    if (min == 0 && max == 1)
    {
        return start.AddOut(new NFAEpsilonTransition(end));
    }
    else if (min == 0 && max == -1)
    {
        if (end.Outgoing.Count == 0)
        {
            end.MergeInto(start);
        }
        else
        {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return start;
    }
    else if (min == 1 && max == -1)
    {
        // When a single transition joins start and end, a copy of
        // it from end back to end is used in place of an epsilon
        // transition back to start
        if (start.Outgoing.Count == 1
            && end.Outgoing.Count == 0
            && end.Incoming.Count == 1
            && start.Outgoing[0] == end.Incoming[0])
        {
            end.AddOut(start.Outgoing[0].Copy(end));
        }
        else
        {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return end;
    }
    else
    {
        throw new RegExpException(
            RegExpException.ErrorType.InvalidRepeatCount,
            firstPos,
            this.pattern);
    }
}