/// <summary>
/// Replace end states in all transitions starting from base: every
/// transition reachable from <paramref name="instat"/> that currently ends
/// in <paramref name="from"/> is redirected to <paramref name="to"/>.
/// </summary>
/// <param name="instat">state the traversal starts from</param>
/// <param name="from">end state to be replaced</param>
/// <param name="to">replacement end state</param>
private static void replaceStates(RegularState instat, RegularState from, RegularState to)
{
	// Depth-first walk over all states reachable from instat.
	Stack <RegularState> pending = new Stack <RegularState>();
	Set <RegularState> seen = new Set <RegularState>();
	pending.Push(instat);
	// Mark 'to' as seen up front so the walk never continues through it.
	seen.Add(to);
	while (pending.Count > 0)
	{
		RegularState current = pending.Pop();
		seen.Add(current);
		foreach (RegularTransition transition in current.getOutTransitions())
		{
			RegularState target = transition.getEndState();
			if (target.Equals(from))
			{
				// Redirect the edge; 'to' itself is never traversed.
				transition.setEndState(to);
			}
			else if (!seen.Contains(target))
			{
				pending.Push(target);
			}
		}
	}
}
/// <summary>
/// Advance the automaton by one input word: take a snapshot of the current
/// active states, expand lambda closures, and rebuild the active-state set
/// with the states reached by matching <paramref name="word"/>.
/// </summary>
/// <param name="word">the input word to match against outgoing transitions</param>
/// <returns>the number of active states after consuming the word, or 0 when a greedy end state was reached</returns>
private int traverseStates(string word)
{
	// Work from a snapshot of the active states; 'states' is cleared and
	// repopulated (via addState) with the successors reached by 'word'.
	Queue <RegularState> queue = new Queue <RegularState>(states);
	Set <RegularState> visited = new Set <RegularState>();
	states.Clear();
	while (queue.Count > 0)
	{
		RegularState state = queue.Dequeue();
		hasGreedyEnd = state.isGreedyEnd();
		if (hasGreedyEnd)
		{
			// Greedy end reached: stop traversing immediately.
			return (0);
		}
		visited.Add(state);
		foreach (RegularTransition rt in state.getOutTransitions())
		{
			if (rt.isEmpty())
			{
				// Lambda transition: expand in place without consuming input.
				RegularState rst = rt.getEndState();
				if (!queue.Contains(rst) && !visited.Contains(rst))
				{
					queue.Enqueue(rst);
				}
			}
			else if (rt.match(word))
			{
				// Word matched: its end state becomes active for the next step.
				addState(rt.getEndState());
			}
		}
	}
	return (states.Count);
}
/// <summary>
/// Process multiplication (?,*,+) of subexpressions by rewiring the
/// expression's transitions relative to <paramref name="endState"/>.
/// </summary>
/// <param name="expr">start state of the subexpression just parsed</param>
/// <param name="lexer">token stream; consumed only when a multiplier is present</param>
/// <param name="endState">the state the subexpression currently ends in</param>
private static void multTransform(RegularState expr, Lexer lexer, RegularState endState)
{
	Token current = lexer.token();
	if (current.type == Token.OPT)
	{
		lexer.nextToken();
		// '?': simply add a lambda so the expression may be skipped
		new RegularTransition(expr, endState);
	}
	else if (current.type == Token.STAR)
	{
		lexer.nextToken();
		// '*': loop back by replacing the end states with self ...
		replaceStates(expr, endState, expr);
		// ... and a lambda makes it optional
		new RegularTransition(expr, endState);
	}
	else if (current.type == Token.PLUS)
	{
		lexer.nextToken();
		// '+': route the expression's exits through a fresh state
		RegularState newEnd = new RegularState();
		replaceStates(expr, endState, newEnd);
		// lambda back to the beginning of the expression (repeat) ...
		new RegularTransition(newEnd, expr);
		// ... or on to the end state (had at least 1 iteration)
		new RegularTransition(newEnd, endState);
	}
}
/// <summary>
/// Set a new start state, detaching this transition from the previous
/// start state (if any) and registering it with the new one.
/// </summary>
/// <param name="newStart">the state this transition should originate from</param>
public void setStartState(RegularState newStart)
{
	RegularState previous = startState;
	if (previous != null)
	{
		// Unregister from the old owner first so the graphs stay consistent.
		previous.removeOutTransition(this);
	}
	startState = newStart;
	newStart.addOutTransition(this);
}
/// <summary>
/// Process a word: consume one token and build a single-transition state
/// labelled with the word's text.
/// </summary>
/// <param name="lexer">token stream; the current token is the word</param>
/// <param name="endState">state the new transition leads to</param>
/// <returns>the freshly created start state for the word</returns>
private static RegularState pWord(Lexer lexer, RegularState endState)
{
	Token word = lexer.token();
	lexer.nextToken();
	RegularState start = new RegularState();
	// The transition registers itself with 'start' on construction.
	RegularTransition edge = new RegularTransition(start, endState);
	edge.addLabel(word.ToString());
	return (start);
}
/// <summary>
/// Process a parenthesized subexpression: parse the alternation inside,
/// require the closing parenthesis, then apply any multiplier (?,*,+).
/// </summary>
/// <param name="lexer">token stream; the current token is the left parenthesis</param>
/// <param name="endState">state the subexpression leads to</param>
/// <returns>the start state of the subexpression</returns>
/// <exception cref="PatternParseException">thrown when the closing parenthesis is missing</exception>
private static RegularState pSubexp(Lexer lexer, RegularState endState)
{
	lexer.nextToken();
	RegularState result = pAlt(lexer, endState);
	if (lexer.token().type != Token.PRIGHT)
	{
		// BUG FIX: the message used the printf-style "%d" placeholder, which
		// String.Format never substitutes; C# composite formatting uses {0}.
		// (Also fixed the "paranthesis" typo.)
		throw new PatternParseException(String.Format("Missing right parenthesis at #{0}", lexer.charPos()));
	}
	lexer.nextToken();
	multTransform(result, lexer, endState);
	return (result);
}
/// <summary>
/// Add a state to the active set together with its lambda closure:
/// every state reachable via empty transitions is added recursively.
/// </summary>
/// <param name="state">the state to activate</param>
private void addState(RegularState state)
{
	// Already active: nothing to do (this also terminates the recursion
	// on lambda cycles).
	if (states.Contains(state))
	{
		return;
	}
	states.Add(state);
	foreach (RegularTransition transition in state.getOutTransitions())
	{
		// Follow lambda transitions so the closure is active as well.
		if (transition.isEmpty())
		{
			addState(transition.getEndState());
		}
	}
}
/// <summary>
/// Resolve greedy ends. A state becomes a greedy end when it has both a
/// wildcard self-loop and a lambda transition to <paramref name="endState"/>,
/// or when it has a lambda transition to a state that is itself a greedy end.
/// Recurses depth-first over all reachable states.
/// </summary>
/// <param name="visited">states already processed; guards against cycles</param>
/// <param name="endState">the automaton's final state</param>
public void resolveGreedyEnd(Set <RegularState> visited, RegularState endState)
{
	bool hasSelfRef = false;
	bool hasEndRef = false;
	visited.Add(this);
	foreach (RegularTransition transition in outTransitions)
	{
		if (transition.isWildcard() && transition.getEndState() == this)
		{
			// wildcard transition looping back onto this state
			hasSelfRef = true;
		}
		else if (transition.isEmpty() && transition.getEndState() == endState)
		{
			// TODO: doesn't check if end state is reachable through lambda
			// transitions
			hasEndRef = true;
		}
		if (visited.Contains(transition.getEndState()))
		{
			continue;
		}
		// depth-first: resolve the successor before inspecting its result
		transition.getEndState().resolveGreedyEnd(visited, endState);
		if (greedyEnd)
		{
			// already flagged; skip the lambda-propagation check below
			continue;
		}
		if (transition.isEmpty())
		{
			if (transition.getEndState().isGreedyEnd())
			{
				// a lambda transition to a greedyEnd state makes this state
				// also a greedyEnd state
				greedyEnd = true;
			}
		}
	}
	if (!greedyEnd)
	{
		greedyEnd = hasSelfRef && hasEndRef;
	}
}
/// <summary>
/// Parse the input string and return a regular automaton.
/// Patterns are unanchored by default: a missing '^' prefix or '$' suffix
/// is padded with ".*"; present anchors are stripped before lexing.
/// </summary>
/// <param name="pattern">the pattern text, optionally anchored with '^' and '$'</param>
/// <returns>the automaton recognizing the pattern</returns>
/// <exception cref="PatternParseException">thrown when the pattern is null or contains trailing garbage</exception>
public static RegularAutomaton parse(string pattern)
{
	if (pattern == null)
	{
		throw new PatternParseException("Pattern can not be null");
	}
	if (!pattern.StartsWith("^"))
	{
		// unanchored start: allow any prefix
		pattern = ".*" + pattern;
	}
	else
	{
		pattern = pattern.Substring(1);
	}
	if (!pattern.EndsWith("$"))
	{
		// unanchored end: allow any suffix
		pattern = pattern + ".*";
	}
	else
	{
		if (pattern.Length > 0)
		{
			pattern = pattern.Substring(0, pattern.Length - 1);
		}
	}
	Lexer lexer = new Lexer(pattern);
	RegularState end = new FinalRegularState();
	RegularState start = pAlt(lexer, end);
	start.resolveGreedyEnd(new Set <RegularState>(), end);
	RegularAutomaton auto = new RegularAutomaton();
	auto.setStartState(start);
	auto.setEndState(end);
	if (lexer.token().type != Token.EOF)
	{
		// BUG FIX: the message used Java-style "%s"/"%d" placeholders, which
		// String.Format does not substitute; use composite formatting ({0},{1}).
		throw new PatternParseException(String.Format("Garbage token '{0}' at #{1}", lexer.token().text, lexer.charPos()));
	}
	return (auto);
}
/// <summary>
/// Return true when the provided state has a path to itself, i.e. some
/// sequence of transitions starting at <paramref name="self"/> leads back
/// to <paramref name="self"/>.
/// </summary>
/// <param name="self">the state to test for a cycle through itself</param>
/// <returns>true when a (direct or indirect) path back to the state exists</returns>
private static bool hasSelfReference(RegularState self)
{
	Stack <RegularState> states = new Stack <RegularState>();
	states.Push(self);
	Set <RegularState> visited = new Set <RegularState>();
	while (states.Count > 0)
	{
		RegularState state = states.Pop();
		visited.Add(state);
		foreach (RegularTransition rt in state.getOutTransitions())
		{
			RegularState end = rt.getEndState();
			if (end.Equals(self))
			{
				return (true);
			}
			// BUG FIX: successors were never pushed, so the loop inspected only
			// the transitions leaving 'self' itself and missed indirect cycles
			// (e.g. self -> A -> self). Descend into unvisited successors.
			if (!visited.Contains(end))
			{
				states.Push(end);
			}
		}
	}
	return (false);
}
/// <summary>
/// Set the new end state of the automaton.
/// </summary>
/// <param name="newEndState">the state that marks the automaton's end</param>
public void setEndState(RegularState newEndState)
{
	this.endState = newEndState;
}
/// <summary>
/// Set the new start state of the automaton.
/// </summary>
/// <param name="newStartState">the state the automaton starts in</param>
public void setStartState(RegularState newStartState)
{
	this.startState = newStartState;
}
/// <summary>
/// Creates an automaton with a given start and end state.
/// </summary>
/// <param name="inStart">the start state</param>
/// <param name="inEnd">the end state</param>
public RegularAutomaton(RegularState inStart, RegularState inEnd)
{
	this.startState = inStart;
	this.endState = inEnd;
}
/// <summary>
/// Create an automaton with a given start state and a newly created
/// (plain, non-final) end state.
/// </summary>
/// <param name="inStart">the start state</param>
public RegularAutomaton(RegularState inStart)
{
	this.startState = inStart;
	this.endState = new RegularState();
}
/// <summary>
/// Process alternatives. This will optimize the state machine where
/// possible. Edges with identical destinations will be merged, but only
/// when they are not a lambda transition or contain a self reference.
/// </summary>
/// <param name="lexer">token stream being parsed</param>
/// <param name="endState">state all alternatives lead to</param>
/// <returns>the start state of the alternation</returns>
private static RegularState pAlt(Lexer lexer, RegularState endState)
{
	// parse "seq | seq | ..." into a list of alternative start states
	List <RegularState> alts = new List <RegularState>();
	alts.Add(pSeq(lexer, endState));
	while (lexer.token().type == Token.OR)
	{
		lexer.nextToken();
		alts.Add(pSeq(lexer, endState));
	}
	if (alts.Count == 1)
	{
		// single alternative: nothing to merge
		return (alts[0]);
	}
	// combine transitions with identical destinations and negation
	RegularState result = new RegularState();
	// group each alternative's outgoing transitions by their destination state
	Dictionary <RegularState, List <RegularTransition> > destMap = new Dictionary <RegularState, List <RegularTransition> >();
	foreach (RegularState state in alts)
	{
		if (hasSelfReference(state))
		{
			// add lambda to self referencing states
			new RegularTransition(result, state);
			continue;
		}
		foreach (RegularTransition rt in state.getOutTransitions())
		{
			List <RegularTransition> dst;
			if (!destMap.TryGetValue(rt.getEndState(), out dst))
			{
				dst = new List <RegularTransition>();
				destMap.Add(rt.getEndState(), dst);
			}
			dst.Add(rt);
		}
	}
	foreach (RegularState key in destMap.Keys)
	{
		List <RegularTransition> value = destMap[key];
		// per destination: at most one plain, one negated, and one lambda
		// transition is created on the merged start state
		RegularTransition regrt = null;
		RegularTransition neqrt = null;
		RegularTransition lambda = null;
		foreach (RegularTransition rt in value)
		{
			if (rt.isEmpty())
			{
				// don't combine lambda transitions with others
				if (lambda == null)
				{
					lambda = new RegularTransition(result, key);
				}
			}
			else if (rt.isNegation())
			{
				// negated transitions merge into a single negated edge
				if (neqrt == null)
				{
					neqrt = new RegularTransition(result, key);
					neqrt.setNegation(true);
				}
				neqrt.addLabels(rt.getLabels());
			}
			else
			{
				// plain transitions merge by unioning their labels
				if (regrt == null)
				{
					regrt = new RegularTransition(result, key);
				}
				regrt.addLabels(rt.getLabels());
			}
		}
	}
	return (result);
}
/// <summary>
/// Process sequences of words and subexpressions. Each parsed item is
/// linked to the previous one by redirecting the previous item's exits
/// to the current item's start state.
/// </summary>
/// <param name="lexer">token stream being parsed</param>
/// <param name="endState">state every item in the sequence eventually leads to</param>
/// <returns>the start state of the sequence (endState for an empty pattern)</returns>
/// <exception cref="PatternParseException">thrown on a missing ']' or an unexpected token</exception>
private static RegularState pSeq(Lexer lexer, RegularState endState)
{
	RegularState result = null;
	RegularState lhs = null;
	while (lexer.token().type == Token.WORD || lexer.token().type == Token.NOT || lexer.token().type == Token.PLEFT
			|| lexer.token().type == Token.DOT)
	{
		RegularState rhs;
		if (lexer.token().type == Token.WORD)
		{
			rhs = pWord(lexer, endState);
		}
		else if (lexer.token().type == Token.DOT)
		{
			// '.' : wildcard transition, optionally multiplied (?,*,+)
			lexer.nextToken();
			rhs = new RegularState();
			RegularTransition transition = new RegularTransition(rhs, endState);
			transition.addLabel(RegularTransition.WILDCARD);
			multTransform(rhs, lexer, endState);
		}
		else if (lexer.token().type == Token.NOT)
		{
			// ![word1,word2,word3,word4,...] : negated word set
			rhs = new RegularState();
			RegularTransition transition = new RegularTransition(rhs, endState);
			transition.setNegation(true);
			lexer.nextToken();
			while (lexer.token().type == Token.WORD)
			{
				transition.addLabel(lexer.token().ToString());
				lexer.nextToken();
				if (lexer.token().type == Token.COMMA)
				{
					lexer.nextToken();
				}
			}
			if (lexer.token().type != Token.SRIGHT)
			{
				// BUG FIX: printf-style "%d" is not substituted by String.Format;
				// use composite formatting ({0}) instead.
				throw new PatternParseException(String.Format("Missing right square bracket at #{0}", lexer.charPos()));
			}
			lexer.nextToken();
			multTransform(rhs, lexer, endState);
		}
		else
		{
			rhs = pSubexp(lexer, endState);
		}
		if (result == null)
		{
			// first item in the sequence is the return value
			result = rhs;
		}
		if (lhs != null)
		{
			// link end state of previous item in the list to current item
			replaceStates(lhs, endState, rhs);
		}
		lhs = rhs;
	}
	if (lexer.token().type == Token.EOF && result == null)
	{
		// empty regex
		return (endState);
	}
	if (result == null)
	{
		// BUG FIX: same placeholder issue as above ("%s"/"%d" -> {0}/{1}).
		throw new PatternParseException(String.Format("Unexpected token '{0}' at #{1}", lexer.token().text, lexer.charPos()));
	}
	return (result);
}
/// <summary>
/// Create a new transition with a given start and end; the transition
/// registers itself with the start state and begins with an empty label set.
/// </summary>
/// <param name="mystart">the state this transition leaves</param>
/// <param name="myend">the state this transition enters</param>
public RegularTransition(RegularState mystart, RegularState myend)
{
	setStartState(mystart);
	setEndState(myend);
	this.labels = new Set <string>();
}