/// <summary>
/// Appends the translation held at the given trie path's node to the result list.
/// Only the first entry of the node's matches is used.
/// </summary>
/// <param name="triePath">Path whose trie node carries the match and whose groups are inserted into it.</param>
/// <param name="positionAndTranslationOfTerms">Result list that receives the new entry.</param>
/// <param name="endCharacter">End character position stored with the entry; computed by the caller.</param>
private void addTranslation(
    RegexPathWithStartCharacterPosition triePath,
    List<PositionAndTranslation> positionAndTranslationOfTerms,
    int endCharacter)
{
    try
    {
        int startOfTerm = triePath.startCharacterPosition;

        // The contents of the path's groups are inserted into both strings of the match.
        // NOTE(review): the boolean argument differs between the two calls; its meaning
        // is defined by insertGroupsIntoRegex, which is not visible here.
        string translationText = insertGroupsIntoRegex(
            triePath.trie.matches[0].translation, triePath.groups, false);
        string replacementText = insertGroupsIntoRegex(
            triePath.trie.matches[0].replacement, triePath.groups, true);

        PositionAndTranslation entry = new PositionAndTranslation(
            startOfTerm,
            endCharacter,
            translationText,
            replacementText,
            true);
        positionAndTranslationOfTerms.Add(entry);
    }
    catch (Exception e)
    {
        // Failures are reported to the user (stack trace first, then message) and not rethrown.
        MessageBox.Show(e.StackTrace);
        MessageBox.Show(e.Message);
    }
}
/// <summary>
/// Runs the input string through the regex trie one character at a time and returns
/// every match found, together with the start and end positions of the matched term.
/// </summary>
/// <param name="regexTrie">Root of the trie; a fresh path is started from it at position 0 and after every boundary character.</param>
/// <param name="inputString">Text to search. Must not be null.</param>
/// <param name="tokenBoundaryCharacterString">Characters that act as token boundaries. Null is treated as "no boundary characters".</param>
/// <param name="useBoundaryCharacters">When false, every character is treated as a boundary, so matches may start and end anywhere.</param>
/// <returns>The list of matches collected during the traversal, including those that end at the end of the string.</returns>
public List<PositionAndTranslation> FindRegexMatches(
    RegexTrie<TranslationAndReplacement> regexTrie,
    string inputString,
    string tokenBoundaryCharacterString,
    Boolean useBoundaryCharacters)
{
    // Control characters used as trie keys for string-start and string-end relative regexes.
    const char startControlCharacter = (char)2;
    const char endControlCharacter = (char)3;

    // Paths that are still alive after the character processed so far. Each path remembers
    // the character position where it was started and the groups collected along the way.
    // The list is rebuilt on every loop cycle and consumed by the next one.
    List<RegexPathWithStartCharacterPosition> newTriePathsBeingTraversed =
        new List<RegexPathWithStartCharacterPosition>();

    // Translations discovered so far, each with the start and end position of its source term
    // so that the translations can be inserted later.
    List<PositionAndTranslation> positionAndTranslationOfTerms = new List<PositionAndTranslation>();

    // Characters used to tokenize the input (boundary characters). A string enumerates its
    // characters, so the set is built directly; a null string yields an empty set instead of
    // failing, which matters when boundary characters are not used at all.
    HashSet<char> setOfTokenBoundaryCharacters =
        new HashSet<char>(tokenBoundaryCharacterString ?? string.Empty);

    // The initial path starts at the root of the trie at position 0.
    newTriePathsBeingTraversed.Add(new RegexPathWithStartCharacterPosition(0, regexTrie));

    // Feed the start control character to reach the portion of the trie that holds the
    // string-start relative regexes. The start character is never part of a group, so the
    // first (index 0) transition is taken.
    if (regexTrie.characterTransitions.ContainsKey(startControlCharacter))
    {
        newTriePathsBeingTraversed.Add(
            new RegexPathWithStartCharacterPosition(
                0,
                regexTrie.characterTransitions[startControlCharacter][0].destination));
    }

    for (int charIndex = 0; charIndex < inputString.Length; charIndex++)
    {
        // Drop duplicate paths (same trie and groups) before following them further.
        List<RegexPathWithStartCharacterPosition> triePathsBeingTraversed =
            removeDuplicateTrieAndGroups(newTriePathsBeingTraversed);
        newTriePathsBeingTraversed = new List<RegexPathWithStartCharacterPosition>();

        char currentChar = inputString[charIndex];

        // True when the character is a boundary character, or when boundary characters are not used.
        bool isTokenBoundaryCharacter =
            setOfTokenBoundaryCharacters.Contains(currentChar) || !useBoundaryCharacters;

        foreach (RegexPathWithStartCharacterPosition triePath in triePathsBeingTraversed)
        {
            // At a boundary, a path sitting on a node with matches yields a translation.
            // The term ends one character earlier, because the current character is the boundary.
            if (isTokenBoundaryCharacter && triePath.trie.matches != null)
            {
                addTranslation(triePath, positionAndTranslationOfTerms, charIndex - 1);
            }

            // Follow the path over the current character; continuing paths are added to the new list.
            checkWhetherPathContinues(triePath, newTriePathsBeingTraversed, currentChar);
        }

        // After a boundary a new term may begin, so start a fresh path at the next character.
        if (isTokenBoundaryCharacter)
        {
            newTriePathsBeingTraversed.Add(
                new RegexPathWithStartCharacterPosition(charIndex + 1, regexTrie));
        }
    }

    // The end of the string: spans run from start to end character, so the end position
    // of anything matched here is the last character index.
    int lastCharacterIndex = inputString.Length - 1;

    foreach (RegexPathWithStartCharacterPosition triePath in newTriePathsBeingTraversed)
    {
        // Feed the end control character to reach the portion of the trie that holds the
        // string-end relative regexes. The node behind the end character is expected to
        // always carry a translation, so it is added without checking matches.
        if (triePath.trie.characterTransitions.ContainsKey(endControlCharacter))
        {
            RegexPathWithStartCharacterPosition endCharConsumed =
                new RegexPathWithStartCharacterPosition(
                    triePath.startCharacterPosition,
                    triePath.trie.characterTransitions[endControlCharacter][0].destination,
                    triePath.groups);

            addTranslation(endCharConsumed, positionAndTranslationOfTerms, lastCharacterIndex);
        }

        // Paths that end on a node with matches also yield a translation.
        if (triePath.trie.matches != null)
        {
            addTranslation(triePath, positionAndTranslationOfTerms, lastCharacterIndex);
        }
    }

    return positionAndTranslationOfTerms;
}
/// <summary>
/// Tries to advance a single trie path over the current character. For every transition
/// that accepts the character, a continued path (same start position, new destination,
/// updated groups) is appended to the output list. Exact-character transitions are checked
/// first, then range transitions, then complement transitions.
/// </summary>
/// <param name="triePath">The path being followed.</param>
/// <param name="newTriePathBeingTraversed">List that receives every continuation of the path.</param>
/// <param name="currentChar">The character being consumed.</param>
private void checkWhetherPathContinues(
    RegexPathWithStartCharacterPosition triePath,
    List<RegexPathWithStartCharacterPosition> newTriePathBeingTraversed,
    char currentChar)
{
    // Groups collected on this path so far; recordGroups derives the groups for each continuation
    // from these, the transition's group number and the character.
    Dictionary<int, StringBuilder> groupsSoFar = triePath.groups;

    // Exact-character transitions: one character may have several transitions (one per possible group).
    if (triePath.trie.characterTransitions.ContainsKey(currentChar))
    {
        foreach (Transition<TranslationAndReplacement> charTransition
            in triePath.trie.characterTransitions[currentChar])
        {
            Dictionary<int, StringBuilder> groupsAfterStep =
                recordGroups(groupsSoFar, charTransition.groupNumber, currentChar);

            RegexPathWithStartCharacterPosition continuedPath =
                new RegexPathWithStartCharacterPosition(
                    triePath.startCharacterPosition,
                    charTransition.destination,
                    groupsAfterStep);
            newTriePathBeingTraversed.Add(continuedPath);
        }
    }

    // Range transitions: followed when the character lies within [rangeStart, rangeEnd].
    foreach (var rangeTransition in triePath.trie.rangeTransitions)
    {
        bool withinRange =
            currentChar >= rangeTransition.rangeStart && currentChar <= rangeTransition.rangeEnd;
        if (!withinRange)
        {
            continue;
        }

        Dictionary<int, StringBuilder> groupsAfterStep =
            recordGroups(groupsSoFar, rangeTransition.groupNumber, currentChar);

        RegexPathWithStartCharacterPosition continuedPath =
            new RegexPathWithStartCharacterPosition(
                triePath.startCharacterPosition,
                rangeTransition.destination,
                groupsAfterStep);
        newTriePathBeingTraversed.Add(continuedPath);
    }

    // Complement transitions: followed only when the character is neither one of the
    // listed characters nor inside any of the listed ranges.
    foreach (var complementTransition in triePath.trie.complementTransitions)
    {
        bool excluded = complementTransition.characters.Contains(currentChar);

        foreach (var excludedRange in complementTransition.ranges)
        {
            if (currentChar >= excludedRange.Key && currentChar <= excludedRange.Value)
            {
                excluded = true;
            }
        }

        if (excluded)
        {
            continue;
        }

        Dictionary<int, StringBuilder> groupsAfterStep =
            recordGroups(groupsSoFar, complementTransition.groupNumber, currentChar);

        RegexPathWithStartCharacterPosition continuedPath =
            new RegexPathWithStartCharacterPosition(
                triePath.startCharacterPosition,
                complementTransition.destination,
                groupsAfterStep);
        newTriePathBeingTraversed.Add(continuedPath);
    }
}