/// <summary>
/// Creates a new character range transition.
/// </summary>
/// <param name="inverse">The inverse match flag</param>
/// <param name="ignoreCase">The case-insensitive match flag</param>
/// <param name="state">The target state (forwarded to the base constructor)</param>
public NFACharRangeTransition(bool inverse,
                              bool ignoreCase,
                              NFAState state)
    : base(state) {

    this.ignoreCase = ignoreCase;
    this.inverse = inverse;
}
/// <summary>
/// Adds a string match to this automaton. New states and
/// transitions are added to extend this automaton to support
/// the specified string.
/// </summary>
/// <param name="str">The string to match; must be non-null and non-empty</param>
/// <param name="ignoreCase">The case-insensitive match flag</param>
/// <param name="value">The match value stored on the final state</param>
/// <exception cref="System.ArgumentNullException">
/// If <paramref name="str"/> is null
/// </exception>
/// <exception cref="System.ArgumentException">
/// If <paramref name="str"/> is empty
/// </exception>
public void AddTextMatch(string str, bool ignoreCase, TokenPattern value) {
    // The first character is read unconditionally below, so reject
    // inputs that have no first character with a descriptive error
    // instead of an IndexOutOfRangeException / NullReferenceException.
    if (str == null) {
        throw new System.ArgumentNullException("str");
    }
    if (str.Length == 0) {
        throw new System.ArgumentException(
            "The string to match must not be empty",
            "str");
    }

    NFAState state;
    char ch = str[0];

    if (ch < 128 && !ignoreCase) {
        // Case-sensitive first characters below 128 are looked up in
        // (and lazily added to) the initialChar table.
        state = initialChar[ch];
        if (state == null) {
            state = new NFAState();
            initialChar[ch] = state;
        }
    } else {
        state = initial.AddOut(ch, ignoreCase, null);
    }

    // Chain the remaining characters; a null target lets AddOut pick
    // or create the next state.
    for (int i = 1; i < str.Length; i++) {
        state = state.AddOut(str[i], ignoreCase, null);
    }
    state.value = value;
}
/// <summary>
/// Creates a new regular expression parser. The regular expression
/// can be either case-sensitive or case-insensitive. Note that the
/// pattern is parsed immediately by this constructor.
/// </summary>
/// <param name="pattern">The regular expression pattern</param>
/// <param name="ignoreCase">The character case ignore flag</param>
/// <exception cref="RegExpException">
/// If the regular expression couldn't be parsed correctly, including
/// when characters remain after the top-level expression
/// </exception>
public TokenRegExpParser(string pattern, bool ignoreCase) {
    this.pattern = pattern;
    this.ignoreCase = ignoreCase;
    this.pos = 0;
    this.end = ParseExpr(start);

    // Everything consumed: the pattern was parsed completely
    if (pos >= pattern.Length) {
        return;
    }
    throw new RegExpException(
        RegExpException.ErrorType.UNEXPECTED_CHARACTER,
        pos,
        pattern);
}
/// <summary>
/// Creates a new non-whitespace character set transition.
/// </summary>
/// <param name="state">The target state (forwarded to the base constructor)</param>
public NFANonWhitespaceTransition(NFAState state)
    : base(state) {
}
/// <summary>
/// Creates a new dot character set transition.
/// </summary>
/// <param name="state">The target state (forwarded to the base constructor)</param>
public NFADotTransition(NFAState state)
    : base(state) {
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>A new epsilon transition leading to the given state</returns>
public override NFATransition Copy(NFAState state) {
    NFATransition duplicate = new NFAEpsilonTransition(state);

    return duplicate;
}
/// <summary>
/// Parses a regular expression atom modifier. This method handles
/// the AtomModifier production in the grammar (see regexp.grammar).
/// Only the repeat counts equivalent to '?', '*' and '+' are
/// supported; any other count is rejected.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <param name="end">The terminal NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseAtomModifier(NFAState start, NFAState end) {
    int modifierPos = pos;
    int min = 0;
    int max = -1;   // -1 stands for "no upper bound"

    // Decode the repeat bounds
    switch (ReadChar()) {
    case '?':
        min = 0; max = 1;
        break;
    case '*':
        min = 0; max = -1;
        break;
    case '+':
        min = 1; max = -1;
        break;
    case '{':
        min = ReadNumber();
        max = min;
        if (PeekChar(0) == ',') {
            ReadChar(',');
            max = (PeekChar(0) != '}') ? ReadNumber() : -1;
        }
        ReadChar('}');
        if (max == 0 || (max > 0 && min > max)) {
            throw new RegExpException(
                RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                modifierPos,
                pattern);
        }
        break;
    default:
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            pos - 1,
            pattern);
    }

    // Reluctant ('?') and possessive ('+') suffixes are not supported
    if (PeekChar(0) == '?' || PeekChar(0) == '+') {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
            pos,
            pattern);
    }

    bool unbounded = (max == -1);

    // Zero or one: allow skipping the atom
    if (min == 0 && max == 1) {
        return start.AddOut(new NFAEpsilonTransition(end));
    }

    // Zero or more: loop back to the start state
    if (min == 0 && unbounded) {
        if (end.outgoing.Length == 0) {
            end.MergeInto(start);
        } else {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return start;
    }

    // One or more: repeat from the end state
    if (min == 1 && unbounded) {
        bool singleSharedTransition =
            start.outgoing.Length == 1 &&
            end.outgoing.Length == 0 &&
            end.incoming.Length == 1 &&
            start.outgoing[0] == end.incoming[0];

        if (singleSharedTransition) {
            // A lone transition can simply be repeated on the end state
            end.AddOut(start.outgoing[0].Copy(end));
        } else {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return end;
    }

    throw new RegExpException(
        RegExpException.ErrorType.INVALID_REPEAT_COUNT,
        modifierPos,
        pattern);
}
/// <summary>
/// Parses a regular expression atom modifier. This method handles
/// the AtomModifier production in the grammar (see regexp.grammar).
/// Repeat counts other than those equivalent to '?', '*' and '+'
/// result in an error.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <param name="end">The terminal NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseAtomModifier(NFAState start, NFAState end) {
    int startPos = pos;
    int min = 0;
    int max = -1;   // -1 means the repetition has no upper limit

    // Read min and max
    switch (ReadChar()) {
    case '?':
        min = 0;
        max = 1;
        break;
    case '*':
        min = 0;
        max = -1;
        break;
    case '+':
        min = 1;
        max = -1;
        break;
    case '{':
        min = ReadNumber();
        max = min;
        if (PeekChar(0) == ',') {
            ReadChar(',');
            max = (PeekChar(0) != '}') ? ReadNumber() : -1;
        }
        ReadChar('}');
        if (max == 0 || (max > 0 && min > max)) {
            throw new RegExpException(
                RegExpException.ErrorType.INVALID_REPEAT_COUNT,
                startPos,
                pattern);
        }
        break;
    default:
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            pos - 1,
            pattern);
    }

    // Neither reluctant ('?') nor possessive ('+') modifiers are supported
    if (PeekChar(0) == '?' || PeekChar(0) == '+') {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
            pos,
            pattern);
    }

    // Handle supported repeaters, one guard per supported form
    if (min == 0 && max == 1) {
        return start.AddOut(new NFAEpsilonTransition(end));
    }
    if (min == 0 && max == -1) {
        if (end.outgoing.Length == 0) {
            end.MergeInto(start);
        } else {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return start;
    }
    if (min == 1 && max == -1) {
        bool loneTransition =
            start.outgoing.Length == 1 &&
            end.outgoing.Length == 0 &&
            end.incoming.Length == 1 &&
            start.outgoing[0] == end.incoming[0];

        if (loneTransition) {
            end.AddOut(start.outgoing[0].Copy(end));
        } else {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return end;
    }

    throw new RegExpException(
        RegExpException.ErrorType.INVALID_REPEAT_COUNT,
        startPos,
        pattern);
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>A new epsilon transition leading to the given state</returns>
public override NFATransition Copy(NFAState state) {
    return new NFAEpsilonTransition(state);
}
/// <summary>
/// Creates a new character transition.
/// </summary>
/// <param name="match">The character to match</param>
/// <param name="state">The target state (forwarded to the base constructor)</param>
public NFACharTransition(char match, NFAState state)
    : base(state) {

    this.match = match;
}
/**
 * Creates a copy of this transition but with another target
 * state. This member is abstract, so each concrete transition
 * type supplies its own implementation.
 *
 * @param state          the new target state
 *
 * @return an identical copy of this transition
 */
public abstract NFATransition Copy(NFAState state);
/// <summary>
/// Creates a new epsilon transition.
/// </summary>
/// <param name="state">The target state (forwarded to the base constructor)</param>
public NFAEpsilonTransition(NFAState state)
    : base(state) {
}
/// <summary>
/// Creates a new state transition and registers it as an incoming
/// transition on the target state.
/// </summary>
/// <param name="state">The target state</param>
public NFATransition(NFAState state) {
    this.state = state;

    // Let the target know about this transition
    state.AddIn(this);
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>A new non-word transition leading to the given state</returns>
public override NFATransition Copy(NFAState state) {
    return new NFANonWordTransition(state);
}
/// <summary>
/// Creates a new non-word character set transition.
/// </summary>
/// <param name="state">The target state (forwarded to the base constructor)</param>
public NFANonWordTransition(NFAState state)
    : base(state) {
}
/// <summary>
/// Parses a regular expression. This method handles the Expr
/// production in the grammar (see regexp.grammar): one or more
/// terms separated by '|', all joined to a common end state.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseExpr(NFAState start) {
    NFAState end = new NFAState();

    do {
        // Skip the separator in front of every alternative but the first
        if (PeekChar(0) == '|') {
            ReadChar('|');
        }

        NFAState termStart = new NFAState();
        NFAState termEnd = ParseTerm(termStart);

        // Attach the alternative to the shared start state
        if (termStart.incoming.Length != 0) {
            start.AddOut(new NFAEpsilonTransition(termStart));
        } else {
            termStart.MergeInto(start);
        }

        // Attach the alternative to the shared end state
        bool mergeEnd = termEnd.outgoing.Length == 0 ||
                        (!end.HasTransitions() && PeekChar(0) != '|');
        if (mergeEnd) {
            termEnd.MergeInto(end);
        } else {
            termEnd.AddOut(new NFAEpsilonTransition(end));
        }
    } while (PeekChar(0) == '|');

    return end;
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>
/// A new character transition matching the same character and
/// leading to the given state
/// </returns>
public override NFATransition Copy(NFAState state) {
    return new NFACharTransition(match, state);
}
/// <summary>
/// Parses a regular expression factor. This method handles the
/// Fact production in the grammar (see regexp.grammar): an atom
/// optionally followed by a modifier. The atom is built on a
/// placeholder state that is afterwards linked or merged into
/// the supplied start state.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseFact(NFAState start) {
    NFAState placeholder = new NFAState();
    NFAState end = ParseAtom(placeholder);

    int next = PeekChar(0);
    if (next == '?' || next == '*' || next == '+' || next == '{') {
        end = ParseAtomModifier(placeholder, end);
    }

    bool keepSeparate = placeholder.incoming.Length > 0 &&
                        start.outgoing.Length > 0;
    if (!keepSeparate) {
        placeholder.MergeInto(start);
        // The placeholder no longer exists on its own after the merge
        return (end == placeholder) ? start : end;
    }

    start.AddOut(new NFAEpsilonTransition(placeholder));
    return end;
}
/// <summary>
/// Parses a regular expression character. This method handles a
/// single normal character in a regular expression. Escapes are
/// delegated to ParseEscapeChar, and the '^' and '$' characters
/// are rejected as unsupported.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseChar(NFAState start) {
    int next = PeekChar(0);

    if (next == '\\') {
        return ParseEscapeChar(start);
    }
    if (next == '^' || next == '$') {
        throw new RegExpException(
            RegExpException.ErrorType.UNSUPPORTED_SPECIAL_CHARACTER,
            pos,
            pattern);
    }

    char literal = ReadChar();
    NFAState target = new NFAState();
    return start.AddOut(literal, ignoreCase, target);
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>A new digit transition leading to the given state</returns>
public override NFATransition Copy(NFAState state) {
    return new NFADigitTransition(state);
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>A new digit transition leading to the given state</returns>
public override NFATransition Copy(NFAState state) {
    NFATransition duplicate = new NFADigitTransition(state);

    return duplicate;
}
/// <summary>
/// Creates a new non-digit character set transition.
/// </summary>
/// <param name="state">The target state (forwarded to the base constructor)</param>
public NFANonDigitTransition(NFAState state)
    : base(state) {
}
/// <summary>
/// Adds a new outgoing character transition. If the target state
/// specified was null and an identical transition already exists,
/// it will be reused and its target returned.
/// </summary>
/// <remarks>
/// For case-insensitive matches one transition is added for the
/// lower-case and one for the upper-case form of the character.
/// The case conversion uses the invariant culture so that the
/// resulting automaton does not depend on the current thread
/// culture (e.g. the Turkish dotted/dotless I mappings).
/// </remarks>
/// <param name="ch">The character to match</param>
/// <param name="ignoreCase">The case-insensitive flag</param>
/// <param name="state">The target state, or null</param>
/// <returns>The transition target state</returns>
public NFAState AddOut(char ch, bool ignoreCase, NFAState state) {
    if (ignoreCase) {
        if (state == null) {
            state = new NFAState();
        }
        // Both case variants lead to the same target state
        AddOut(new NFACharTransition(Char.ToLowerInvariant(ch), state));
        AddOut(new NFACharTransition(Char.ToUpperInvariant(ch), state));
        return state;
    }

    if (state == null) {
        // Reuse the target FindUniqueCharTransition reports for this
        // character, if any, instead of adding a duplicate transition
        state = FindUniqueCharTransition(ch);
        if (state != null) {
            return state;
        }
        state = new NFAState();
    }
    return AddOut(new NFACharTransition(ch, state));
}
/// <summary>
/// Adds a new entry at the end of the queue. When the end of the
/// backing array has been reached, the array is either doubled in
/// size (if the used region starts at index zero) or the used
/// region is moved down to index zero to reclaim the free space
/// at the front.
/// </summary>
/// <param name="state">The state to add</param>
public void AddLast(NFAState state) {
    bool atCapacity = last >= queue.Length;

    if (atCapacity && first <= 0) {
        // No free space at the front: grow the array
        Array.Resize(ref queue, queue.Length * 2);
    } else if (atCapacity) {
        // Shift the used region down and rebase all indices
        int used = last - first;

        Array.Copy(queue, first, queue, 0, used);
        last = used;
        mark -= first;
        first = 0;
    }

    queue[last] = state;
    last++;
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>A new whitespace transition leading to the given state</returns>
public override NFATransition Copy(NFAState state) {
    return new NFAWhitespaceTransition(state);
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>A new non-word transition leading to the given state</returns>
public override NFATransition Copy(NFAState state) {
    NFATransition duplicate = new NFANonWordTransition(state);

    return duplicate;
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// The copy gets the same inverse and ignore-case flags and is
/// assigned the same contents reference as this transition.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>An identical copy of this transition</returns>
public override NFATransition Copy(NFAState state) {
    NFACharRangeTransition duplicate =
        new NFACharRangeTransition(this.inverse, this.ignoreCase, state);

    duplicate.contents = this.contents;
    return duplicate;
}
/// <summary>
/// Merges all the transitions in this state into another state.
/// Incoming transitions are redirected to the other state and
/// outgoing transitions are added to it. Afterwards both
/// transition arrays of this state are set to null.
/// </summary>
/// <param name="state">The state to merge into</param>
public void MergeInto(NFAState state) {
    // Redirect everything that pointed at this state
    for (int idx = 0; idx < incoming.Length; idx++) {
        state.AddIn(incoming[idx]);
        incoming[idx].state = state;
    }
    incoming = null;

    // Hand over everything that left this state
    for (int idx = 0; idx < outgoing.Length; idx++) {
        state.AddOut(outgoing[idx]);
    }
    outgoing = null;
}
/// <summary>
/// Updates the statistical counters for the NFA generated. The
/// state graph is walked recursively through the outgoing
/// transitions, and each state is counted only once.
/// </summary>
/// <param name="state">The current state to visit</param>
/// <param name="visited">The lookup map of visited states</param>
private void UpdateStats(NFAState state, Hashtable visited) {
    // Already counted: nothing to do
    if (visited.Contains(state)) {
        return;
    }

    visited.Add(state, state);
    stateCount++;

    for (int idx = 0; idx < state.outgoing.Length; idx++) {
        transitionCount++;
        if (state.outgoing[idx] is NFAEpsilonTransition) {
            epsilonCount++;
        }
        UpdateStats(state.outgoing[idx].state, visited);
    }
}
/// <summary>
/// Parses a regular expression term. This method handles the Term
/// production in the grammar (see regexp.grammar): a sequence of
/// one or more factors, each continuing from the end state of the
/// previous one.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseTerm(NFAState start) {
    NFAState end = ParseFact(start);

    for (;;) {
        // Stop at the end of input or at a character that cannot
        // start another factor in this term
        switch (PeekChar(0)) {
        case -1:
        case ')':
        case ']':
        case '{':
        case '}':
        case '?':
        case '+':
        case '|':
            return end;
        }
        end = ParseFact(end);
    }
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>A new whitespace transition leading to the given state</returns>
public override NFATransition Copy(NFAState state) {
    NFATransition duplicate = new NFAWhitespaceTransition(state);

    return duplicate;
}
/// <summary>
/// Parses a regular expression atom. This method handles the Atom
/// production in the grammar (see regexp.grammar): a dot, a
/// parenthesized expression, a character set or a single
/// character.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseAtom(NFAState start) {
    int next = PeekChar(0);
    NFAState end;

    if (next == '.') {
        ReadChar('.');
        return start.AddOut(new NFADotTransition(new NFAState()));
    }
    if (next == '(') {
        ReadChar('(');
        end = ParseExpr(start);
        ReadChar(')');
        return end;
    }
    if (next == '[') {
        ReadChar('[');
        end = ParseCharSet(start);
        ReadChar(']');
        return end;
    }

    // End of input and structural characters cannot start an atom
    switch (next) {
    case -1:
    case ')':
    case ']':
    case '{':
    case '}':
    case '?':
    case '*':
    case '+':
    case '|':
        throw new RegExpException(
            RegExpException.ErrorType.UNEXPECTED_CHARACTER,
            pos,
            pattern);
    }

    return ParseChar(start);
}
/// <summary>
/// Parses a regular expression character set. This method handles
/// the contents of the '[...]' construct in a regular expression.
/// The closing ']' is not consumed here.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseCharSet(NFAState start) {
    NFAState end = new NFAState();

    // A leading '^' inverts the set
    bool inverted = (PeekChar(0) == '^');
    if (inverted) {
        ReadChar('^');
    }

    NFACharRangeTransition range =
        new NFACharRangeTransition(inverted, ignoreCase, end);
    start.AddOut(range);

    while (PeekChar(0) > 0) {
        char first = (char) PeekChar(0);

        if (first == ']') {
            return end;
        }
        if (first == '\\') {
            range.AddCharacter(ReadEscapeChar());
            continue;
        }

        ReadChar(first);

        // "a-z" is a range unless the '-' is the last set character
        bool isRange = PeekChar(0) == '-' &&
                       PeekChar(1) > 0 &&
                       PeekChar(1) != ']';
        if (isRange) {
            ReadChar('-');
            char last = ReadChar();
            range.AddRange(first, last);
        } else {
            range.AddCharacter(first);
        }
    }

    return end;
}
/// <summary>
/// Creates a copy of this transition but with another target state.
/// </summary>
/// <param name="state">The new target state</param>
/// <returns>
/// A new character transition matching the same character and
/// leading to the given state
/// </returns>
public override NFATransition Copy(NFAState state) {
    NFATransition duplicate = new NFACharTransition(this.match, state);

    return duplicate;
}
/// <summary>
/// Parses a regular expression character escape. This method
/// handles a single character escape in a regular expression.
/// The class escapes \d, \D, \s, \S, \w and \W become dedicated
/// transitions; any other escape is read with ReadEscapeChar and
/// added as a character transition.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseEscapeChar(NFAState start) {
    NFAState end = new NFAState();

    if (PeekChar(0) == '\\' && PeekChar(1) > 0) {
        NFATransition classTransition = null;

        switch ((char) PeekChar(1)) {
        case 'd':
            classTransition = new NFADigitTransition(end);
            break;
        case 'D':
            classTransition = new NFANonDigitTransition(end);
            break;
        case 's':
            classTransition = new NFAWhitespaceTransition(end);
            break;
        case 'S':
            classTransition = new NFANonWhitespaceTransition(end);
            break;
        case 'w':
            classTransition = new NFAWordTransition(end);
            break;
        case 'W':
            classTransition = new NFANonWordTransition(end);
            break;
        }

        if (classTransition != null) {
            // Consume the backslash and the class letter
            ReadChar();
            ReadChar();
            return start.AddOut(classTransition);
        }
    }

    return start.AddOut(ReadEscapeChar(), ignoreCase, end);
}
/// <summary>
/// Parses a regular expression atom modifier. This method handles
/// the AtomModifier production in the grammar (see regexp.grammar).
/// Only repeat counts equivalent to '?', '*' and '+' are accepted.
/// </summary>
/// <param name="start">The initial NFA state</param>
/// <param name="end">The terminal NFA state</param>
/// <returns>The terminating NFA state</returns>
/// <exception cref="RegExpException">
/// If an error was encountered in the pattern string
/// </exception>
private NFAState ParseAtomModifier(
    NFAState start,
    NFAState end) {
    int modifierPos = this.pos;
    int min = 0;
    int max = -1;   // -1 stands for "no upper bound"

    // Decode the repeat bounds
    switch (this.ReadChar()) {
    case '?':
        min = 0; max = 1;
        break;
    case '*':
        min = 0; max = -1;
        break;
    case '+':
        min = 1; max = -1;
        break;
    case '{':
        min = this.ReadNumber();
        max = min;
        if (this.PeekChar(0) == ',') {
            this.ReadChar(',');
            max = (this.PeekChar(0) != '}') ? this.ReadNumber() : -1;
        }
        this.ReadChar('}');
        if (max == 0 || (max > 0 && min > max)) {
            throw new RegExpException(
                RegExpException.ErrorType.InvalidRepeatCount,
                modifierPos,
                this.pattern);
        }
        break;
    default:
        throw new RegExpException(
            RegExpException.ErrorType.UnexpectedCharacter,
            this.pos - 1,
            this.pattern);
    }

    // Reluctant ('?') and possessive ('+') suffixes are not supported
    if (this.PeekChar(0) == '?' || this.PeekChar(0) == '+') {
        throw new RegExpException(
            RegExpException.ErrorType.UnsupportedSpecialCharacter,
            this.pos,
            this.pattern);
    }

    bool unbounded = (max == -1);

    // Zero or one: allow skipping the atom
    if (min == 0 && max == 1) {
        return start.AddOut(new NFAEpsilonTransition(end));
    }

    // Zero or more: loop back to the start state
    if (min == 0 && unbounded) {
        if (end.Outgoing.Count == 0) {
            end.MergeInto(start);
        } else {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return start;
    }

    // One or more: repeat from the end state
    if (min == 1 && unbounded) {
        bool singleSharedTransition =
            start.Outgoing.Count == 1 &&
            end.Outgoing.Count == 0 &&
            end.Incoming.Count == 1 &&
            start.Outgoing[0] == end.Incoming[0];

        if (singleSharedTransition) {
            end.AddOut(start.Outgoing[0].Copy(end));
        } else {
            end.AddOut(new NFAEpsilonTransition(start));
        }
        return end;
    }

    throw new RegExpException(
        RegExpException.ErrorType.InvalidRepeatCount,
        modifierPos,
        this.pattern);
}