//Appends a single control-character transition after the previous component.
//A fresh start node is linked to endNodeOfPreviousComponent with an epsilon transition; the node
//reached from that start node with the character whose code is controlCode is returned, so the
//caller can keep building from it.
private RegexTrie<T> addSpecialTransition(int controlCode, RegexTrie<T> endNodeOfPreviousComponent)
{
    RegexTrie<T> markerStart = new RegexTrie<T>();
    char markerChar = (char)controlCode;

    //Hook the new start node onto the end of whatever was built before it
    endNodeOfPreviousComponent.AddEpsilonTransition(markerStart);

    //The child reached with the control character is the new end node
    RegexTrie<T> markerEnd = markerStart.GetOrAddChildNode(markerChar, 0);
    return markerEnd;
}
//Wires a component (given by its start and end nodes) to the previous component so that the result
//corresponds to the quantifier described by result.
//  result                       - quantifier description; minOccurs, maxOccurs, starred and zeroInclusive are read
//  startNodeOfCurrentComponent  - entry node of the component being quantified
//  endNodeOfCurrentComponent    - exit node of the component being quantified
//  endNodeOfPreviousComponent   - node the quantified component is attached to
//Returns a new node that is reached by epsilon transitions from every accepted exit point; further
//components are meant to be attached to it.
private RegexTrie<T> joinQuantifiedTrie(
    MinMaxResult result,
    RegexTrie<T> startNodeOfCurrentComponent,
    RegexTrie<T> endNodeOfCurrentComponent,
    RegexTrie<T> endNodeOfPreviousComponent)
{
    //Common exit node, reached by epsilon from every point where the quantifier is satisfied
    RegexTrie<T> exitNode = new RegexTrie<T>();
    //Alternative entry node, only wired in when the quantifier allows zero occurrences
    RegexTrie<T> bypassEntryNode = new RegexTrie<T>();

    bool needsRepetitions = result.minOccurs > 1 || result.maxOccurs > 1;
    if (needsRepetitions)
    {
        //Take a pristine copy first: the original component gets extra epsilon transitions below,
        //and those must not leak into the repetitions.
        KeyValuePair<RegexTrie<T>, RegexTrie<T>> template =
            copyTrie(startNodeOfCurrentComponent, endNodeOfCurrentComponent);

        //Tail of the chain built so far; the first occurrence is the component itself
        RegexTrie<T> chainTail = endNodeOfCurrentComponent;

        //Occurrence 1 already exists, so chaining starts from the second one
        int occurrence = 2;
        while (occurrence <= result.maxOccurs)
        {
            KeyValuePair<RegexTrie<T>, RegexTrie<T>> repetition = copyTrie(template);
            chainTail.AddEpsilonTransition(repetition.Key);

            //Once the minimum has been reached, this repetition may also be the last one
            if (occurrence >= result.minOccurs)
            {
                repetition.Value.AddEpsilonTransition(exitNode);
            }

            chainTail = repetition.Value;
            occurrence++;
        }
    }

    //A minimum below two means a single occurrence is already enough to leave the component
    if (result.minOccurs < 2)
    {
        endNodeOfCurrentComponent.AddEpsilonTransition(exitNode);
    }

    //Starred quantifiers loop from the end of the component back to its start
    if (result.starred)
    {
        endNodeOfCurrentComponent.AddEpsilonTransition(startNodeOfCurrentComponent);
    }

    //Zero-inclusive quantifiers get an entry node that can either enter the component or skip it
    RegexTrie<T> entryNode = startNodeOfCurrentComponent;
    if (result.zeroInclusive)
    {
        bypassEntryNode.AddEpsilonTransition(startNodeOfCurrentComponent);
        bypassEntryNode.AddEpsilonTransition(exitNode);
        entryNode = bypassEntryNode;
    }

    //Finally attach the (possibly wrapped) component to the previous one
    endNodeOfPreviousComponent.AddEpsilonTransition(entryNode);
    return exitNode;
}
//Checks whether sourceTail begins with a quantifier (*, +, ? or {...}) and joins the current
//component to the end of the previous component accordingly.
//  sourceTail                   - the part of the pattern that follows the current component
//  endNodeOfCurrentComponent    - exit node of the current component
//  startNodeOfCurrentComponent  - entry node of the current component
//  endNodeOfPreviousComponent   - node the current component is attached to
//  commonDestination            - not used by this method; kept so existing callers stay valid
//Returns a pair: Key is the number of pattern characters consumed by the quantifier (0 if there is
//none) and Value is the node to which further components will be connected.
//A '{' without a matching '}' is not treated as a quantifier: nothing is consumed, so the caller
//sees the '{' as an ordinary character on its next step.
private KeyValuePair<int, RegexTrie<T>> checkForQuantifier(
    string sourceTail,
    RegexTrie<T> endNodeOfCurrentComponent,
    RegexTrie<T> startNodeOfCurrentComponent,
    RegexTrie<T> endNodeOfPreviousComponent,
    Stack<RegexTrie<T>> commonDestination)
{
    //With nothing left in the pattern there is no quantifier; 'n' is an arbitrary non-special
    //character that selects the default branch.
    char nextChar = string.IsNullOrEmpty(sourceTail) ? 'n' : sourceTail[0];

    switch (nextChar)
    {
        //NOTE(review): the MinMaxResult arguments look like (min, max, starred, zeroInclusive) -
        //confirm against the MinMaxResult constructor.
        case '*':
        {
            return new KeyValuePair<int, RegexTrie<T>>(1, joinQuantifiedTrie(
                new MinMaxResult(1, 1, true, true),
                startNodeOfCurrentComponent,
                endNodeOfCurrentComponent,
                endNodeOfPreviousComponent));
        }

        case '+':
        {
            return new KeyValuePair<int, RegexTrie<T>>(1, joinQuantifiedTrie(
                new MinMaxResult(1, 1, true, false),
                startNodeOfCurrentComponent,
                endNodeOfCurrentComponent,
                endNodeOfPreviousComponent));
        }

        case '?':
        {
            return new KeyValuePair<int, RegexTrie<T>>(1, joinQuantifiedTrie(
                new MinMaxResult(1, 1, false, true),
                startNodeOfCurrentComponent,
                endNodeOfCurrentComponent,
                endNodeOfPreviousComponent));
        }

        case '{':
        {
            int closeBraceIndex = sourceTail.IndexOf('}');

            //No closing brace: Substring would be called with a negative length and throw.
            //Handle it as "no quantifier" instead.
            if (closeBraceIndex < 0)
            {
                goto default;
            }

            //Text between the braces, e.g. "2,5" for "{2,5}"
            string quantString = sourceTail.Substring(1, closeBraceIndex - 1);
            MinMaxResult minMax = findMinMax(quantString);

            //Consume everything up to and including the closing brace
            return new KeyValuePair<int, RegexTrie<T>>(closeBraceIndex + 1, joinQuantifiedTrie(
                minMax,
                startNodeOfCurrentComponent,
                endNodeOfCurrentComponent,
                endNodeOfPreviousComponent));
        }

        default:
        {
            //No quantifier: plain concatenation, nothing consumed
            endNodeOfPreviousComponent.AddEpsilonTransition(startNodeOfCurrentComponent);
            return new KeyValuePair<int, RegexTrie<T>>(0, endNodeOfCurrentComponent);
        }
    }
}
//Parses the pattern in source and adds the corresponding nodes and transitions to trie, storing
//matchobject on the node where the pattern ends.
//  trie               - node the pattern is attached to
//  source             - the pattern text
//  matchobject        - value stored (as a single-element list) in the matches field of the final node
//  endStack           - work stack: end of the previous component at the time each '(' was opened
//  startStack         - work stack: start node of each open group (trie itself at the bottom)
//  commonDestination  - work stack: shared end node for '|' alternatives of each open group, or null
//
//This has to keep track of four nodes: end of the previous component, start of the current component,
//end of the current component and the node that is connected to the end of the current component with
//an epsilon transition (which will become the end of the previous component for the next component).
//
//NOTE(review): on return startStack still holds the trie pushed at the start, and commonDestination
//still holds the initial null when no top-level '|' was seen - confirm callers reset the stacks.
//NOTE(review): a trailing backslash in source is silently ignored (the escape flag is set and never used).
public void AddToRegexTrie(
    RegexTrie<T> trie,
    string source,
    T matchobject,
    Stack<RegexTrie<T>> endStack,
    Stack<RegexTrie<T>> startStack,
    Stack<RegexTrie<T>> commonDestination)
{
    //Current trie being added to
    RegexTrie<T> endNodeOfCurrentComponent = trie;
    //End of previous trie, to which the current trie is joined
    RegexTrie<T> endNodeOfPreviousComponent = trie;
    //Start of the current component
    RegexTrie<T> startNodeOfCurrentComponent = trie;

    //Push trie on the start stack
    startStack.Push(trie);
    //Push a null on the commonDestination stack (replaced if a top-level pipe is found)
    commonDestination.Push(null);

    //Matching group counter.
    //NOTE(review): byte-sized, so it would wrap after 255 top-level groups - confirm that is acceptable.
    byte groupCount = 1;
    //True if previous character was an escape char
    Boolean escapedCharacter = false;
    //Variable for the character at the loop index
    char currentChar;
    //Holds the result of the quantifier check
    KeyValuePair<int, RegexTrie<T>> quantifierCheckResult;

    for (int stringIndex = 0; stringIndex < source.Length; stringIndex++)
    {
        currentChar = source[stringIndex];

        //Check for escape character. Note that this test comes first, so a backslash that
        //follows a backslash just keeps the escape flag set.
        if (currentChar == '\\')
        {
            escapedCharacter = true;
            continue;
        }

        //If the character is escaped, just make it into a trie
        if (escapedCharacter)
        {
            //Special char handling: \t stands for a tab, every other escaped character is literal
            if (currentChar == 't')
            {
                currentChar = '\t';
            }

            startNodeOfCurrentComponent = new RegexTrie<T>();
            endNodeOfCurrentComponent = startNodeOfCurrentComponent.GetOrAddChildNode(currentChar, 0);

            quantifierCheckResult = checkForQuantifier(
                source.Substring(stringIndex + 1),
                endNodeOfCurrentComponent,
                startNodeOfCurrentComponent,
                endNodeOfPreviousComponent,
                commonDestination);
            //Skip over the quantifier characters, if any
            stringIndex += quantifierCheckResult.Key;
            endNodeOfPreviousComponent = quantifierCheckResult.Value;

            escapedCharacter = false;
            continue;
        }

        //Otherwise check the character for special meaning
        switch (currentChar)
        {
            //Open a new group
            case '(':
            {
                //Push the previous trie to the end stack
                endStack.Push(endNodeOfPreviousComponent);
                //Create a new trie
                endNodeOfPreviousComponent = new RegexTrie<T>();
                //Push the newly created trie on the start stack
                startStack.Push(endNodeOfPreviousComponent);
                //Push a null on the common destination stack (to be defined, if pipes are found)
                commonDestination.Push(null);
                break;
            }

            //Close a group
            case ')':
            {
                //If common destination exists, add an epsilon transition to it
                if (commonDestination.Peek() != null)
                {
                    //Connect the end node and common destination as part of the same epsilon closure
                    commonDestination.Peek().AddEpsilonTransition(endNodeOfPreviousComponent);
                    endNodeOfPreviousComponent.AddEpsilonTransition(commonDestination.Peek());
                    //Move the current trie to common destination, as that's where the building will continue.
                    //Pop the common destination, it won't be needed anymore
                    endNodeOfCurrentComponent = commonDestination.Pop();
                }
                else
                {
                    endNodeOfCurrentComponent = endNodeOfPreviousComponent;
                    //Pop the null destination
                    commonDestination.Pop();
                }

                //The whole group is now the current component
                startNodeOfCurrentComponent = startStack.Pop();
                endNodeOfPreviousComponent = endStack.Pop();

                quantifierCheckResult = checkForQuantifier(
                    source.Substring(stringIndex + 1),
                    endNodeOfCurrentComponent,
                    startNodeOfCurrentComponent,
                    endNodeOfPreviousComponent,
                    commonDestination);
                stringIndex += quantifierCheckResult.Key;
                endNodeOfPreviousComponent = quantifierCheckResult.Value;

                //An empty end stack means no group is open any more, i.e. this was a top-level
                //group: add the group number to each node of the trie
                if (endStack.Count == 0)
                {
                    addGroupNumbers(startNodeOfCurrentComponent, groupCount);
                    groupCount++;
                }
                break;
            }

            //Handle square bracket set
            case '[':
            {
                startNodeOfCurrentComponent = new RegexTrie<T>();
                endNodeOfCurrentComponent = new RegexTrie<T>();
                //This skips over the closing square bracket, so there's no need for closing square bracket handling
                stringIndex += handleSquareBracketGroup(
                    startNodeOfCurrentComponent,
                    endNodeOfCurrentComponent,
                    source.Substring(stringIndex + 1));

                quantifierCheckResult = checkForQuantifier(
                    source.Substring(stringIndex + 1),
                    endNodeOfCurrentComponent,
                    startNodeOfCurrentComponent,
                    endNodeOfPreviousComponent,
                    commonDestination);
                stringIndex += quantifierCheckResult.Key;
                endNodeOfPreviousComponent = quantifierCheckResult.Value;
                break;
            }

            //Caret at the start: add a transition with a control character that won't exist in text.
            //Feed the control character when finding matches.
            case '^':
            {
                endNodeOfPreviousComponent = addSpecialTransition(2, endNodeOfPreviousComponent);
                break;
            }

            //Dollar at end: add a transition with a control character that won't exist in text.
            //Feed the control character when finding matches.
            case '$':
            {
                endNodeOfPreviousComponent = addSpecialTransition(3, endNodeOfPreviousComponent);
                break;
            }

            //Period handling
            case '.':
            {
                startNodeOfCurrentComponent = new RegexTrie<T>();
                endNodeOfCurrentComponent = new RegexTrie<T>();
                //Add complement of null character
                startNodeOfCurrentComponent.complementTransitions.Add(
                    new ComplementTransition<T>(
                        0,
                        endNodeOfCurrentComponent,
                        new List<char>(),
                        new List<KeyValuePair<char, char>>()
                        {
                            new KeyValuePair<char, char>((char)0, (char)0)
                        }));

                quantifierCheckResult = checkForQuantifier(
                    source.Substring(stringIndex + 1),
                    endNodeOfCurrentComponent,
                    startNodeOfCurrentComponent,
                    endNodeOfPreviousComponent,
                    commonDestination);
                stringIndex += quantifierCheckResult.Key;
                endNodeOfPreviousComponent = quantifierCheckResult.Value;
                break;
            }

            //Change previous end to common destination and move end node to common start
            case '|':
            {
                if (commonDestination.Peek() == null)
                {
                    //First alternative of this group: its end becomes the common destination
                    commonDestination.Pop();
                    commonDestination.Push(endNodeOfPreviousComponent);
                }
                else
                {
                    //Connect the end node and common destination as part of the same epsilon closure
                    commonDestination.Peek().AddEpsilonTransition(endNodeOfPreviousComponent);
                    endNodeOfPreviousComponent.AddEpsilonTransition(commonDestination.Peek());
                }
                //The next alternative starts from the start node of the enclosing group
                endNodeOfPreviousComponent = startStack.Peek();
                break;
            }

            //Any other character is a literal
            default:
            {
                startNodeOfCurrentComponent = new RegexTrie<T>();
                endNodeOfCurrentComponent = startNodeOfCurrentComponent.GetOrAddChildNode(currentChar, 0);

                quantifierCheckResult = checkForQuantifier(
                    source.Substring(stringIndex + 1),
                    endNodeOfCurrentComponent,
                    startNodeOfCurrentComponent,
                    endNodeOfPreviousComponent,
                    commonDestination);
                stringIndex += quantifierCheckResult.Key;
                endNodeOfPreviousComponent = quantifierCheckResult.Value;
                break;
            }
        }
    }

    //Link end node to the top-level common destination if one exists (pattern had a top-level pipe)
    if (commonDestination.Peek() != null)
    {
        endNodeOfPreviousComponent.AddEpsilonTransition(commonDestination.Peek());
        endNodeOfPreviousComponent = commonDestination.Pop();
    }

    //Store the match object on the node where the pattern ends
    endNodeOfPreviousComponent.matches = new List<T> { matchobject };
}