//Creates a complement transition leading to dest.
//group is stored as the transition's group number; the character list and the
//range list are stored by reference (they are not copied).
public ComplementTransition(
    byte group,
    RegexTrie<T> dest,
    List<char> characters,
    List<KeyValuePair<char, char>> ranges)
{
    groupNumber = group;
    destination = dest;
    this.characters = characters;
    this.ranges = ranges;
}
//Creates a transition leading to dest for the character range that runs from
//rangeStart to rangeEnd. group is stored as the transition's group number.
public RangeTransition(
    byte group,
    RegexTrie<T> dest,
    char rangeStart,
    char rangeEnd)
{
    groupNumber = group;
    destination = dest;
    this.rangeStart = rangeStart;
    this.rangeEnd = rangeEnd;
}
//Creates a complement transition leading to dest that starts out with an empty
//character list and an empty range list.
public ComplementTransition(byte group, RegexTrie<T> dest)
    : this(group, dest, new List<char>(), new List<KeyValuePair<char, char>>())
{
}
//Rebuilds all four tries (exact-match source/replaces and regex source/replaces)
//from the glossary file and the regex file named in the options.
public void loadTries()
{
    //Node numbering starts over for the freshly built regex tries
    RegexTrie<TranslationAndReplacement>.counter = 0;

    char fieldDelimiter = delimiterToChar(this.Options.Delimiter);

    //Exact match tries
    this.exactMatchTrieSource = new Trie();
    this.exactMatchTrieReplaces = new Trie();
    string glossaryFile = this.Options.GlossaryFileName;
    bool matchCase = this.Options.MatchCase == "true";
    this.trieLoader.loadTrieFromFile(
        glossaryFile,
        matchCase,
        fieldDelimiter,
        this.exactMatchTrieSource,
        this.exactMatchTrieReplaces);

    //Regex tries. They are handed over by ref because the loader may swap in
    //a new (determinised) trie instance.
    this.regexTrieSource = new RegexTrie<TranslationAndReplacement>();
    this.regexTrieReplaces = new RegexTrie<TranslationAndReplacement>();
    this.trieLoader.loadRegexTrieFromFile(
        this.Options.RegexFileName,
        fieldDelimiter,
        ref this.regexTrieSource,
        ref this.regexTrieReplaces);
}
//Appends a range transition from this node to dest.
//Range transitions are what character ranges and complements are built from.
public void AddRangeTransition(
    RegexTrie<T> dest,
    char rangeStart,
    char rangeEnd,
    byte groupNumber)
{
    RangeTransition<T> transition =
        new RangeTransition<T>(groupNumber, dest, rangeStart, rangeEnd);
    rangeTransitions.Add(transition);
}
//Adds a transition on the control character with the given code.
//A fresh start node is linked to endNodeOfPreviousComponent by an epsilon
//transition, and the node reached from that start node on the control
//character is returned so that building can continue from it.
private RegexTrie<T> addSpecialTransition(int controlCode, RegexTrie<T> endNodeOfPreviousComponent)
{
    RegexTrie<T> componentStart = new RegexTrie<T>();
    endNodeOfPreviousComponent.AddEpsilonTransition(componentStart);

    char controlCharacter = (char)controlCode;
    return componentStart.GetOrAddChildNode(controlCharacter, 0);
}
//Creates a path that starts following the regex trie at the given character
//position, with an empty group dictionary.
public RegexPathWithStartCharacterPosition(
    int startCharacterPosition,
    RegexTrie<TranslationAndReplacement> trie)
    : this(startCharacterPosition, trie, new Dictionary<int, StringBuilder>())
{
}
//Convenience overload: supplies three fresh, empty stacks so that callers do
//not have to create them. Always returns 0.
public int AddToRegexTrie(RegexTrie<T> trie, string source, T matchobject)
{
    Stack<RegexTrie<T>> endStack = new Stack<RegexTrie<T>>();
    Stack<RegexTrie<T>> startStack = new Stack<RegexTrie<T>>();
    Stack<RegexTrie<T>> commonDestination = new Stack<RegexTrie<T>>();

    AddToRegexTrie(trie, source, matchobject, endStack, startStack, commonDestination);
    return 0;
}
//Creates a term extraction visitor that works on the given glossary trie and
//regex trie, with its own trie processor and an empty term list.
public TermInjectorTranslationProviderElementTermExtractionVisitor(
    TermInjectorTranslationOptions options,
    Trie glossaryTrie,
    RegexTrie<TranslationAndReplacement> regexTrie)
{
    _options = options;
    _trieProcessor = new TrieProcessing();

    //Tries are supplied by the caller and stored as-is
    _trie = glossaryTrie;
    _regexTrie = regexTrie;

    //Terms found during a visit are collected here
    _TermList = new List<PositionAndTranslation>();
}
//Creates a path that starts following the regex trie at the given character
//position and uses the supplied group dictionary.
//The dictionary is stored by reference; no StringBuilder is created here.
public RegexPathWithStartCharacterPosition(
    int startCharacterPosition,
    RegexTrie<TranslationAndReplacement> trie,
    Dictionary<int, StringBuilder> groups)
{
    this.trie = trie;
    this.groups = groups;

    //Position at which following of the regex trie was started
    this.startCharacterPosition = startCharacterPosition;
}
//Adds one character transition to origin for every entry of dfaTransitions.
//An entry maps a (character, group number) pair to the number of the target
//state; target nodes that do not exist yet in allDTries are created on demand.
public void addCharacterTransitionsToDFA(
    Dictionary<KeyValuePair<char, byte>, int> dfaTransitions,
    Dictionary<int, RegexTrie<T>> allDTries,
    RegexTrie<T> origin)
{
    foreach (KeyValuePair<KeyValuePair<char, byte>, int> entry in dfaTransitions)
    {
        int targetState = entry.Value;

        RegexTrie<T> targetNode;
        if (!allDTries.TryGetValue(targetState, out targetNode))
        {
            targetNode = new RegexTrie<T>();
            allDTries[targetState] = targetNode;
        }

        char character = entry.Key.Key;
        byte groupNumber = entry.Key.Value;
        origin.AddCharacterTransition(character, groupNumber, targetNode);
    }
}
//Adds one glossary entry (source, translation, optional replaces field) to the
//normal regex trie or to the fuzzy ("replaces") regex trie.
//  fields[0] - source regex; when non-empty the entry goes to regexTrieSource
//  fields[1] - translation
//  fields[2] - replaces regex (may be missing); used as the key in
//              regexTrieReplaces when fields[0] is empty
//Returns true if the entry was added to the normal trie, false if it was added
//to the fuzzy trie, and null if it was rejected.
//A missing third field is treated as an empty replaces value instead of being
//indexed unconditionally, so two-field entries no longer throw.
public Boolean? addFieldsToRegexTrie(
    List<string> fields,
    RegexTrie<TranslationAndReplacement> regexTrieSource,
    RegexTrie<TranslationAndReplacement> regexTrieReplaces)
{
    bool hasReplacesField = fields.Count > 2;
    string replaces = hasReplacesField ? fields[2] : "";

    //If first field has content, add fields to normal trie
    if (fields[0].Length > 0)
    {
        if (checkValidationErrors(fields[0], this.validationErrors))
        {
            return null;
        }

        //The third field is only validated when it is actually present
        if (hasReplacesField && checkValidationErrors(replaces, this.validationErrors))
        {
            return null;
        }

        this.regexTrieFactory.AddToRegexTrie(
            regexTrieSource,
            fields[0],
            new TranslationAndReplacement(fields[1], replaces));
        return true;
    }

    //No source and no replaces field at all: there is nothing that could serve
    //as a key, so the entry is rejected. No validation error is recorded here
    //because there is no field content to report.
    if (!hasReplacesField)
    {
        return null;
    }

    //Otherwise add fields to fuzzy trie, keyed by the replaces field
    if (checkValidationErrors(replaces, this.validationErrors))
    {
        return null;
    }

    this.regexTrieFactory.AddToRegexTrie(
        regexTrieReplaces,
        replaces,
        new TranslationAndReplacement(fields[1], ""));
    return false;
}
//Adds one complement transition to origin for every entry of dfaTransitions.
//An entry maps a complement transition to the number of its target state; the
//new transition reuses the entry's group number, character list and range list
//and points at the target node, which is created in allDTries when missing.
public void addComplementTransitionsToDFA(
    Dictionary<ComplementTransition<T>, int> dfaTransitions,
    Dictionary<int, RegexTrie<T>> allDTries,
    RegexTrie<T> origin)
{
    foreach (KeyValuePair<ComplementTransition<T>, int> entry in dfaTransitions)
    {
        int targetState = entry.Value;

        RegexTrie<T> targetNode;
        if (!allDTries.TryGetValue(targetState, out targetNode))
        {
            targetNode = new RegexTrie<T>();
            allDTries[targetState] = targetNode;
        }

        ComplementTransition<T> template = entry.Key;
        ComplementTransition<T> dfaTransition = new ComplementTransition<T>(
            template.groupNumber,
            targetNode,
            template.characters,
            template.ranges);
        origin.complementTransitions.Add(dfaTransition);
    }
}
//Adds one range transition to origin for every entry of dfaTransitions.
//An entry maps a range transition to the number of its target state; target
//nodes that do not exist yet in allDTries are created on demand.
public void addRangeTransitionsToDFA(
    Dictionary<RangeTransition<T>, int> dfaTransitions,
    Dictionary<int, RegexTrie<T>> allDTries,
    RegexTrie<T> origin)
{
    foreach (KeyValuePair<RangeTransition<T>, int> entry in dfaTransitions)
    {
        int targetState = entry.Value;

        RegexTrie<T> targetNode;
        if (!allDTries.TryGetValue(targetState, out targetNode))
        {
            targetNode = new RegexTrie<T>();
            allDTries[targetState] = targetNode;
        }

        RangeTransition<T> template = entry.Key;
        origin.AddRangeTransition(
            targetNode,
            template.rangeStart,
            template.rangeEnd,
            template.groupNumber);
    }
}
//Adds a character transition from this node to newTransition on the given key.
//Several transitions may exist for the same key; they are kept in a list per
//key, which is created the first time the key is used.
public void AddCharacterTransition(
    char key,
    byte groupNumber,
    RegexTrie<T> newTransition)
{
    List<Transition<T>> transitionsForKey;
    if (!this.characterTransitions.TryGetValue(key, out transitionsForKey))
    {
        transitionsForKey = new List<Transition<T>>();
        this.characterTransitions.Add(key, transitionsForKey);
    }

    transitionsForKey.Add(new Transition<T>(groupNumber, newTransition));
}
//Creates a term replacement visitor that works on the given glossary trie and
//regex trie. The visitor starts with an empty segment, an empty list of term
//positions and translations, and the "segment changed" flag cleared.
public TermInjectorTranslationProviderElementTermReplacementVisitor(
    TermInjectorTranslationOptions options,
    Trie glossaryTrie,
    RegexTrie<TranslationAndReplacement> regexTrie)
{
    _options = options;
    _segment = new Segment();

    //Tries are supplied by the caller and stored as-is
    _trie = glossaryTrie;
    _regexTrie = regexTrie;

    _trieProcessor = new TrieProcessing();

    //Positions and translations of the terms found so far
    _positionAndTranslationOfTerms = new List<PositionAndTranslation>();

    //Nothing has been changed in the original segment yet
    _originalSegmentChanged = false;
}
//Stamps groupNumber on every character, range and complement transition that
//leaves a not-yet-marked node reachable from trie.
//Each node is flagged through groupsMarked when it is processed, so a node is
//never handled twice and nodes that were already marked are left untouched.
private void addGroupNumbers(RegexTrie<T> trie, byte groupNumber)
{
    //Explicit worklist instead of recursion
    Stack<RegexTrie<T>> pending = new Stack<RegexTrie<T>>();
    pending.Push(trie);

    while (pending.Count > 0)
    {
        RegexTrie<T> node = pending.Pop();
        if (node.groupsMarked)
        {
            continue;
        }
        node.groupsMarked = true;

        //Epsilon transitions carry no group number, only follow them
        foreach (RegexTrie<T> epsilonTarget in node.epsilonTransitions)
        {
            pending.Push(epsilonTarget);
        }

        foreach (List<Transition<T>> transitionsForKey in node.characterTransitions.Values)
        {
            foreach (Transition<T> transition in transitionsForKey)
            {
                transition.groupNumber = groupNumber;
                pending.Push(transition.destination);
            }
        }

        foreach (RangeTransition<T> rangeTransition in node.rangeTransitions)
        {
            rangeTransition.groupNumber = groupNumber;
            pending.Push(rangeTransition.destination);
        }

        foreach (ComplementTransition<T> complementTransition in node.complementTransitions)
        {
            complementTransition.groupNumber = groupNumber;
            pending.Push(complementTransition.destination);
        }
    }
}
//Wires a component (start node .. end node) into the trie according to the
//quantifier described by result, and returns the node from which the next
//component has to continue.
//  - extra occurrences are appended as copies of the component, each copy being
//    linked to the end of the previous occurrence by an epsilon transition
//  - every occurrence whose index has reached minOccurs gets an epsilon
//    transition to the returned exit node
//  - starred adds an epsilon transition from the component's end to its start
//  - zeroInclusive adds an entry node that can skip the component entirely
private RegexTrie<T> joinQuantifiedTrie(
    MinMaxResult result,
    RegexTrie<T> startNodeOfCurrentComponent,
    RegexTrie<T> endNodeOfCurrentComponent,
    RegexTrie<T> endNodeOfPreviousComponent)
{
    //Both helper nodes are always created, exit node first
    RegexTrie<T> exitNode = new RegexTrie<T>();
    RegexTrie<T> skipEntryNode = new RegexTrie<T>();

    bool needsRepetitions = result.minOccurs > 1 || result.maxOccurs > 1;
    if (needsRepetitions)
    {
        //Pristine copy of the component, taken before any quantifier epsilons
        //are attached to the original; all repetitions are cloned from it
        KeyValuePair<RegexTrie<T>, RegexTrie<T>> template =
            copyTrie(startNodeOfCurrentComponent, endNodeOfCurrentComponent);

        //The first occurrence is the component itself, so counting starts at 2
        RegexTrie<T> chainEnd = endNodeOfCurrentComponent;
        for (int occurrence = 2; occurrence <= result.maxOccurs; occurrence++)
        {
            KeyValuePair<RegexTrie<T>, RegexTrie<T>> repetition = copyTrie(template);
            chainEnd.AddEpsilonTransition(repetition.Key);

            if (occurrence >= result.minOccurs)
            {
                repetition.Value.AddEpsilonTransition(exitNode);
            }

            chainEnd = repetition.Value;
        }
    }

    //With fewer than two required occurrences the first occurrence may already
    //finish the component. Done after the copy loop so that the copies do not
    //pick up this transition.
    if (result.minOccurs < 2)
    {
        endNodeOfCurrentComponent.AddEpsilonTransition(exitNode);
    }

    if (result.starred)
    {
        endNodeOfCurrentComponent.AddEpsilonTransition(startNodeOfCurrentComponent);
    }

    if (result.zeroInclusive)
    {
        skipEntryNode.AddEpsilonTransition(startNodeOfCurrentComponent);
        skipEntryNode.AddEpsilonTransition(exitNode);
        startNodeOfCurrentComponent = skipEntryNode;
    }

    //Connect the (possibly replaced) start node to the previous component
    endNodeOfPreviousComponent.AddEpsilonTransition(startNodeOfCurrentComponent);
    return exitNode;
}
//Makes an exact copy of the part of the trie that is reachable from start and
//returns the copies of the start and end nodes as a (start, end) pair.
//Every reachable source node gets exactly one copy and is processed exactly
//once, so each source transition appears exactly once in the copy. A node is
//registered the moment it is first discovered; registering it only when it is
//taken off the stack would let a node that is reachable over several
//transitions be queued and copied more than once.
//Complement transitions in the copy share the character and range lists of the
//source transitions.
//Throws KeyNotFoundException if end is not reachable from start.
private KeyValuePair<RegexTrie<T>, RegexTrie<T>> copyTrie(RegexTrie<T> start, RegexTrie<T> end)
{
    //Source node -> its copy. Doubles as the set of discovered nodes.
    Dictionary<RegexTrie<T>, RegexTrie<T>> copies =
        new Dictionary<RegexTrie<T>, RegexTrie<T>>();

    //Source nodes whose outgoing transitions still have to be copied
    Stack<RegexTrie<T>> pending = new Stack<RegexTrie<T>>();

    RegexTrie<T> startCopy = getOrCreateNodeCopy(start, copies, pending);

    while (pending.Count > 0)
    {
        RegexTrie<T> sourceNode = pending.Pop();
        RegexTrie<T> targetNode = copies[sourceNode];

        //Copy epsilon transitions
        foreach (RegexTrie<T> epsilonTarget in sourceNode.epsilonTransitions)
        {
            targetNode.AddEpsilonTransition(
                getOrCreateNodeCopy(epsilonTarget, copies, pending));
        }

        //Copy character transitions
        foreach (var transitionsForKey in sourceNode.characterTransitions)
        {
            foreach (Transition<T> transition in transitionsForKey.Value)
            {
                targetNode.AddCharacterTransition(
                    transitionsForKey.Key,
                    transition.groupNumber,
                    getOrCreateNodeCopy(transition.destination, copies, pending));
            }
        }

        //Copy range transitions
        foreach (RangeTransition<T> rangeTrans in sourceNode.rangeTransitions)
        {
            targetNode.AddRangeTransition(
                getOrCreateNodeCopy(rangeTrans.destination, copies, pending),
                rangeTrans.rangeStart,
                rangeTrans.rangeEnd,
                rangeTrans.groupNumber);
        }

        //Copy complement transitions
        foreach (ComplementTransition<T> compTrans in sourceNode.complementTransitions)
        {
            targetNode.complementTransitions.Add(new ComplementTransition<T>(
                compTrans.groupNumber,
                getOrCreateNodeCopy(compTrans.destination, copies, pending),
                compTrans.characters,
                compTrans.ranges));
        }
    }

    return new KeyValuePair<RegexTrie<T>, RegexTrie<T>>(startCopy, copies[end]);
}

//Returns the copy that stands for sourceNode. The first time a source node is
//seen its copy is created, recorded in copies, and the source node is queued
//on pending so that its transitions get copied.
private RegexTrie<T> getOrCreateNodeCopy(
    RegexTrie<T> sourceNode,
    Dictionary<RegexTrie<T>, RegexTrie<T>> copies,
    Stack<RegexTrie<T>> pending)
{
    RegexTrie<T> copy;
    if (!copies.TryGetValue(sourceNode, out copy))
    {
        copy = new RegexTrie<T>();
        copies.Add(sourceNode, copy);
        pending.Push(sourceNode);
    }
    return copy;
}
//Pairs a trie node with a group dictionary. Both arguments are stored by
//reference.
public TrieAndGroups(
    RegexTrie<T> trie,
    Dictionary<int, StringBuilder> groups)
{
    this.groups = groups;
    this.trie = trie;
}
//Checks whether sourceTail (the pattern text that follows the current
//component) starts with a quantifier, and joins the current component to the
//end of the previous component accordingly.
//Returns a pair of
//  Key   - number of pattern characters consumed by the quantifier (0 if none),
//          to be added to the caller's string index
//  Value - the node to which the next component has to be connected
//A null or empty tail means "no quantifier"; this is tested directly instead of
//indexing into the string and catching the resulting exception.
//Throws ArgumentOutOfRangeException if a '{' quantifier has no closing '}'.
//commonDestination is not used by this method; it is kept for the callers.
private KeyValuePair<int, RegexTrie<T>> checkForQuantifier(
    string sourceTail,
    RegexTrie<T> endNodeOfCurrentComponent,
    RegexTrie<T> startNodeOfCurrentComponent,
    RegexTrie<T> endNodeOfPreviousComponent,
    Stack<RegexTrie<T>> commonDestination)
{
    //'\0' is not a quantifier, so an empty tail falls through to the default case
    char nextChar = string.IsNullOrEmpty(sourceTail) ? '\0' : sourceTail[0];

    int consumedCharacters;
    MinMaxResult quantifier;

    switch (nextChar)
    {
        case '*':
        {
            quantifier = new MinMaxResult(1, 1, true, true);
            consumedCharacters = 1;
            break;
        }
        case '+':
        {
            quantifier = new MinMaxResult(1, 1, true, false);
            consumedCharacters = 1;
            break;
        }
        case '?':
        {
            quantifier = new MinMaxResult(1, 1, false, true);
            consumedCharacters = 1;
            break;
        }
        case '{':
        {
            int closeBraceIndex = sourceTail.IndexOf('}');
            if (closeBraceIndex < 0)
            {
                //Same exception type that Substring would raise for the
                //negative length, but with a message that names the problem
                throw new ArgumentOutOfRangeException(
                    "sourceTail",
                    "Quantifier starting with '{' has no closing '}'.");
            }

            //Text between the braces, e.g. "2,3"
            string quantString = sourceTail.Substring(1, closeBraceIndex - 1);
            quantifier = findMinMax(quantString);
            consumedCharacters = closeBraceIndex + 1;
            break;
        }
        default:
        {
            //No quantifier: plain epsilon link from the previous component
            endNodeOfPreviousComponent.AddEpsilonTransition(startNodeOfCurrentComponent);
            return new KeyValuePair<int, RegexTrie<T>>(0, endNodeOfCurrentComponent);
        }
    }

    RegexTrie<T> continuationNode = joinQuantifiedTrie(
        quantifier,
        startNodeOfCurrentComponent,
        endNodeOfCurrentComponent,
        endNodeOfPreviousComponent);
    return new KeyValuePair<int, RegexTrie<T>>(consumedCharacters, continuationNode);
}
//Builds a deterministic trie from the non-deterministic trie rooted at regTrie
//and returns the root of the new trie.
//Each state of the result stands for a set of source nodes (an epsilon
//closure). States are numbered in the order in which they are discovered;
//state 0 is the closure of regTrie.
//Every match attached to any source node of a closure is carried over to the
//corresponding new state (not just the first match of each node).
//A root without any outgoing transition of any kind yields a new empty trie.
public RegexTrie<T> determiniseNFA(RegexTrie<T> regTrie)
{
    //If regTrie has no transitions at all, return empty trie
    if (regTrie.characterTransitions.Count == 0
        && regTrie.epsilonTransitions.Count == 0
        && regTrie.rangeTransitions.Count == 0
        && regTrie.complementTransitions.Count == 0)
    {
        return new RegexTrie<T>();
    }

    //States still to be processed, indexed by state number. The helper methods
    //append newly discovered states to this list; processed slots are cleared.
    List<List<RegexTrie<T>>> unmarkedDStates = new List<List<RegexTrie<T>>>();
    //Processed states, in the same order (so index == state number)
    List<List<RegexTrie<T>>> markedDStates = new List<List<RegexTrie<T>>>();
    //Transition table rows, one per processed state, kept in sync with markedDStates
    List<ClosureTransitions<T, int>> dTrans = new List<ClosureTransitions<T, int>>();

    //Maps a state (set of source nodes) to its number so that the existence of
    //a state can be checked quickly; the comparer supplies equality and hashing
    Dictionary<List<RegexTrie<T>>, int> stateExists =
        new Dictionary<List<RegexTrie<T>>, int>(new StateComparer<T>());

    //The start state is the epsilon closure of the root and gets number 0
    List<RegexTrie<T>> startClosure = getEpsilonClosure(new List<RegexTrie<T>>() { regTrie });
    unmarkedDStates.Add(startClosure);
    stateExists.Add(startClosure, 0);

    //Process states in discovery order until no unprocessed state is left.
    //The list grows while the loop runs, so Count is re-read every iteration.
    for (int current = 0; current < unmarkedDStates.Count; current++)
    {
        List<RegexTrie<T>> closure = unmarkedDStates[current];
        markedDStates.Add(closure);
        ClosureTransitions<T, List<RegexTrie<T>>> transitions = getClosureTransitions(closure);
        unmarkedDStates[current] = null;

        //Always add the row, even if it stays empty, to keep the lists in sync
        dTrans.Add(new ClosureTransitions<T, int>());

        addCharacterTransitionsToTransitionTable(
            transitions.normalTransitions,
            dTrans[current].normalTransitions,
            stateExists,
            unmarkedDStates);
        addRangeTransitionsToTransitionTable(
            transitions.rangeTransitions,
            dTrans[current].rangeTransitions,
            stateExists,
            unmarkedDStates);
        addComplementTransitionsToTransitionTable(
            transitions.complementTransitions,
            dTrans[current].complementTransitions,
            stateExists,
            unmarkedDStates);
    }

    //Construct the new trie. A dictionary is used because target nodes are
    //created as soon as a transition refers to them, i.e. non-consecutively.
    Dictionary<int, RegexTrie<T>> allDTries =
        new Dictionary<int, RegexTrie<T>>(markedDStates.Count);

    for (int index = 0; index < markedDStates.Count; index++)
    {
        RegexTrie<T> dfaState;
        if (!allDTries.TryGetValue(index, out dfaState))
        {
            dfaState = new RegexTrie<T>();
            allDTries[index] = dfaState;
        }

        addCharacterTransitionsToDFA(dTrans[index].normalTransitions, allDTries, dfaState);
        addRangeTransitionsToDFA(dTrans[index].rangeTransitions, allDTries, dfaState);
        addComplementTransitionsToDFA(dTrans[index].complementTransitions, allDTries, dfaState);

        //Carry over all matches of all source nodes in this state's closure
        foreach (RegexTrie<T> nfaState in markedDStates[index])
        {
            if (nfaState.matches == null || nfaState.matches.Count == 0)
            {
                continue;
            }

            if (dfaState.matches == null)
            {
                dfaState.matches = new List<T>();
            }

            foreach (T match in nfaState.matches)
            {
                dfaState.matches.Add(match);
            }
        }
    }

    //State 0 is the root of the new trie
    return allDTries[0];
}
//Parses the regex in source and adds it to trie as a chain of components
//joined by epsilon transitions; matchobject is attached to the final node.
//
//While parsing, four nodes are tracked: the end of the previous component, the
//start and the end of the current component, and the node that follows the
//current component (which becomes the end of the previous component for the
//next component).
//
//  endStack          - end nodes of the components that precede open groups
//  startStack        - start nodes of the open groups (bottom entry: trie)
//  commonDestination - per open group, the node where alternatives separated by
//                      '|' meet again; null while the group has no '|'
//
//Supported syntax as implemented here: literals, '\' escapes ('\t' is a tab,
//'\\' is a literal backslash, any other escaped character stands for itself),
//groups '(' ')', alternation '|', sets '[...]', '.', '^', '$' and the
//quantifiers recognised by checkForQuantifier. A trailing unpaired backslash
//is ignored.
public void AddToRegexTrie(
    RegexTrie<T> trie,
    string source,
    T matchobject,
    Stack<RegexTrie<T>> endStack,
    Stack<RegexTrie<T>> startStack,
    Stack<RegexTrie<T>> commonDestination)
{
    //NOTE(review): this node is never used. The allocation is kept because
    //constructing a RegexTrie may have side effects (RegexTrie has a static
    //counter) - confirm before removing.
    RegexTrie<T> newTrie = new RegexTrie<T>();

    //End of the previous component, to which the current component is joined
    RegexTrie<T> endNodeOfPreviousComponent = trie;

    startStack.Push(trie);
    //No alternation seen yet on the top level
    commonDestination.Push(null);

    //Number given to the next top-level group
    byte groupCount = 1;
    //True if the previous character was an unescaped backslash
    bool escapedCharacter = false;

    for (int stringIndex = 0; stringIndex < source.Length; stringIndex++)
    {
        char currentChar = source[stringIndex];
        RegexTrie<T> startNodeOfCurrentComponent;
        RegexTrie<T> endNodeOfCurrentComponent;

        //A backslash starts an escape unless it is itself escaped. Without the
        //second condition "\\" would start a new escape instead of producing a
        //literal backslash.
        if (currentChar == '\\' && !escapedCharacter)
        {
            escapedCharacter = true;
            continue;
        }

        //An escaped character is always a literal
        if (escapedCharacter)
        {
            escapedCharacter = false;

            if (currentChar == 't')
            {
                currentChar = '\t';
            }

            startNodeOfCurrentComponent = new RegexTrie<T>();
            endNodeOfCurrentComponent = startNodeOfCurrentComponent.GetOrAddChildNode(currentChar, 0);
            endNodeOfPreviousComponent = attachComponentWithQuantifier(
                source, ref stringIndex,
                startNodeOfCurrentComponent, endNodeOfCurrentComponent,
                endNodeOfPreviousComponent, commonDestination);
            continue;
        }

        switch (currentChar)
        {
            //Open a new group
            case '(':
            {
                //Remember where the group has to be attached once it is closed
                endStack.Push(endNodeOfPreviousComponent);
                //The group is built from a fresh node, which is also its start
                endNodeOfPreviousComponent = new RegexTrie<T>();
                startStack.Push(endNodeOfPreviousComponent);
                //No alternation seen yet in this group
                commonDestination.Push(null);
                break;
            }

            //Close a group
            case ')':
            {
                RegexTrie<T> groupDestination = commonDestination.Pop();
                if (groupDestination != null)
                {
                    //Make the end of the last alternative and the common
                    //destination part of the same epsilon closure
                    groupDestination.AddEpsilonTransition(endNodeOfPreviousComponent);
                    endNodeOfPreviousComponent.AddEpsilonTransition(groupDestination);
                    endNodeOfCurrentComponent = groupDestination;
                }
                else
                {
                    endNodeOfCurrentComponent = endNodeOfPreviousComponent;
                }

                startNodeOfCurrentComponent = startStack.Pop();
                endNodeOfPreviousComponent = endStack.Pop();

                endNodeOfPreviousComponent = attachComponentWithQuantifier(
                    source, ref stringIndex,
                    startNodeOfCurrentComponent, endNodeOfCurrentComponent,
                    endNodeOfPreviousComponent, commonDestination);

                //An empty end stack means a top-level group was just closed:
                //number the transitions of the whole group
                if (endStack.Count == 0)
                {
                    addGroupNumbers(startNodeOfCurrentComponent, groupCount);
                    groupCount++;
                }
                break;
            }

            //Square bracket set
            case '[':
            {
                startNodeOfCurrentComponent = new RegexTrie<T>();
                endNodeOfCurrentComponent = new RegexTrie<T>();
                //This skips over the closing square bracket, so there is no
                //need for separate closing bracket handling
                stringIndex += handleSquareBracketGroup(
                    startNodeOfCurrentComponent,
                    endNodeOfCurrentComponent,
                    source.Substring(stringIndex + 1));
                endNodeOfPreviousComponent = attachComponentWithQuantifier(
                    source, ref stringIndex,
                    startNodeOfCurrentComponent, endNodeOfCurrentComponent,
                    endNodeOfPreviousComponent, commonDestination);
                break;
            }

            //Caret: transition on control character 2, which is expected not
            //to occur in text and is fed in when finding matches
            case '^':
            {
                endNodeOfPreviousComponent = addSpecialTransition(2, endNodeOfPreviousComponent);
                break;
            }

            //Dollar: same as caret, with control character 3
            case '$':
            {
                endNodeOfPreviousComponent = addSpecialTransition(3, endNodeOfPreviousComponent);
                break;
            }

            //Period: complement of the null character
            case '.':
            {
                startNodeOfCurrentComponent = new RegexTrie<T>();
                endNodeOfCurrentComponent = new RegexTrie<T>();
                startNodeOfCurrentComponent.complementTransitions.Add(
                    new ComplementTransition<T>(
                        0,
                        endNodeOfCurrentComponent,
                        new List<char>(),
                        new List<KeyValuePair<char, char>>()
                        {
                            new KeyValuePair<char, char>((char)0, (char)0)
                        }));
                endNodeOfPreviousComponent = attachComponentWithQuantifier(
                    source, ref stringIndex,
                    startNodeOfCurrentComponent, endNodeOfCurrentComponent,
                    endNodeOfPreviousComponent, commonDestination);
                break;
            }

            //Alternation: the end of the alternative just finished becomes (or
            //is linked to) the common destination, and building restarts from
            //the start of the enclosing group
            case '|':
            {
                if (commonDestination.Peek() == null)
                {
                    commonDestination.Pop();
                    commonDestination.Push(endNodeOfPreviousComponent);
                }
                else
                {
                    commonDestination.Peek().AddEpsilonTransition(endNodeOfPreviousComponent);
                    endNodeOfPreviousComponent.AddEpsilonTransition(commonDestination.Peek());
                }
                endNodeOfPreviousComponent = startStack.Peek();
                break;
            }

            //Plain literal character
            default:
            {
                startNodeOfCurrentComponent = new RegexTrie<T>();
                endNodeOfCurrentComponent = startNodeOfCurrentComponent.GetOrAddChildNode(currentChar, 0);
                endNodeOfPreviousComponent = attachComponentWithQuantifier(
                    source, ref stringIndex,
                    startNodeOfCurrentComponent, endNodeOfCurrentComponent,
                    endNodeOfPreviousComponent, commonDestination);
                break;
            }
        }
    }

    //Top-level alternation: link the last alternative to the common destination
    //and finish there
    if (commonDestination.Peek() != null)
    {
        endNodeOfPreviousComponent.AddEpsilonTransition(commonDestination.Peek());
        endNodeOfPreviousComponent = commonDestination.Pop();
    }

    //Attach the match object to the final node
    endNodeOfPreviousComponent.matches = new List<T> { matchobject };
}

//Joins the component (start .. end) to the previous component, honouring a
//quantifier that may directly follow position stringIndex in source.
//stringIndex is advanced past the quantifier; the returned node is the one the
//next component has to be connected to.
private RegexTrie<T> attachComponentWithQuantifier(
    string source,
    ref int stringIndex,
    RegexTrie<T> startNodeOfCurrentComponent,
    RegexTrie<T> endNodeOfCurrentComponent,
    RegexTrie<T> endNodeOfPreviousComponent,
    Stack<RegexTrie<T>> commonDestination)
{
    KeyValuePair<int, RegexTrie<T>> quantifierCheckResult = checkForQuantifier(
        source.Substring(stringIndex + 1),
        endNodeOfCurrentComponent,
        startNodeOfCurrentComponent,
        endNodeOfPreviousComponent,
        commonDestination);

    stringIndex += quantifierCheckResult.Key;
    return quantifierCheckResult.Value;
}
//Produces the target segment of a fuzzy match with terms injected.
//  1. The fuzzy "current" visitor collects the terms found in segment.
//  2. Terms that carry a replaces value are put into a secondary trie (regex
//     terms into a secondary regex trie, which is determinised if anything was
//     added); the remaining terms become the visitor's new term list.
//  3. The fuzzy replace visitor walks the target segment of the translation
//     proposal using the secondary tries.
//  4. If the InjectNewTermsIntoFuzzies option is "true", the terms found in
//     segment but not in the translation unit's source segment are inserted at
//     the front of the result.
//Returns a duplicate of the visitor's segment, so that results do not all end
//up pointing at the segment constructed last.
private Segment InjectTermsFuzzyMatch(Segment segment, SearchResult result)
{
    //Collect the terms (and their possible replacements) of the current source segment
    _provider.FuzzyCurrentVisitor.Reset();
    foreach (var sourceElement in segment.Elements)
    {
        sourceElement.AcceptSegmentElementVisitor(_provider.FuzzyCurrentVisitor);
    }

    //Secondary tries for the replace visitor
    Trie replacementTrie = _provider.ExactMatchTrieReplaces.Clone();
    RegexTrie<TranslationAndReplacement> replacementRegexTrie =
        new RegexTrie<TranslationAndReplacement>();
    Boolean regexesAdded = false;

    //Terms without a replaces value; they are handled in the comparison phase
    List<PositionAndTranslation> termsWithoutReplacement = new List<PositionAndTranslation>();

    foreach (var term in _provider.FuzzyCurrentVisitor.TermList)
    {
        if (term.Replaces == "")
        {
            termsWithoutReplacement.Add(term);
            continue;
        }

        //The replaces field is used as the source of the secondary entry
        if (term.Regex)
        {
            _provider.RegexTrieFactory.AddToRegexTrie(
                replacementRegexTrie,
                term.Replaces,
                new TranslationAndReplacement(term.Translation, ""));
            regexesAdded = true;
        }
        else
        {
            replacementTrie.AddToTrie(term.Replaces, term.Translation, "");
        }
    }

    //Determinise the regex trie, but only if something was added to it
    if (regexesAdded)
    {
        replacementRegexTrie = _provider.Determiniser.determiniseNFA(replacementRegexTrie);
    }

    //The current visitor keeps only the terms that have no replaces value
    _provider.FuzzyCurrentVisitor.TermList = termsWithoutReplacement;

    //Hand the secondary tries to the replace visitor and run it over the
    //target segment of the translation proposal
    _provider.FuzzyReplaceVisitor.Reset();
    _provider.FuzzyReplaceVisitor.SndTrie = replacementTrie;
    _provider.FuzzyReplaceVisitor.SndRegexTrie = replacementRegexTrie;

    foreach (var targetElement in result.TranslationProposal.TargetSegment.Elements)
    {
        //Null elements are skipped
        if (targetElement == null)
        {
            continue;
        }
        targetElement.AcceptSegmentElementVisitor(_provider.FuzzyReplaceVisitor);
    }

    Segment segmentWithTerms = _provider.FuzzyReplaceVisitor.Segment;

    //Optionally add the terms that occur only in the current source segment
    if (_options.InjectNewTermsIntoFuzzies == "true")
    {
        _provider.FuzzyMatchVisitor.Reset();
        foreach (var memoryElement in result.MemoryTranslationUnit.SourceSegment.Elements)
        {
            memoryElement.AcceptSegmentElementVisitor(_provider.FuzzyMatchVisitor);
        }

        Text newTerms = _provider.FuzzyCurrentVisitor.TermDifference(_provider.FuzzyMatchVisitor);
        if (newTerms.Value.Length > 0)
        {
            segmentWithTerms.Elements.Insert(0, newTerms);
        }
    }

    return segmentWithTerms.Duplicate();
}
//Adds a rule typed in by the user to the rule file and to the in-memory tries.
//ruleString is split on the characters of Options.TermAdditionSeparator into source, translation
//and (optional) replaces fields; a string that does not contain the separator is ignored.
//A rule starting with r\ is treated as a regular expression rule, anything else as an exact
//match rule. Visitors are re-initialized afterwards, except when regex validation fails.
public void addStringToTries(string ruleString)
{
    if (!ruleString.Contains(this.Options.TermAdditionSeparator))
    {
        return;
    }

    List<string> newTerm = ruleString.Split(this.Options.TermAdditionSeparator.ToCharArray()).ToList();

    //Pad missing fields with empty strings so that indexes 0..2 always exist
    while (newTerm.Count < 3)
    {
        newTerm.Add("");
    }

    //Either the first or the third field of the new term has to be non-empty
    if (newTerm[0].Length > 0 || newTerm[2].Length > 0)
    {
        char fileDelimiter = delimiterToChar(this.Options.Delimiter);

        //Regex for converting unicode escape sequences (\uXXXX or \UXXXX) to characters.
        //The backslash must be escaped in the pattern; a single backslash would escape the
        //opening square bracket and match the literal text "[uU]" instead.
        Regex rx = new Regex(@"\\[uU]([0-9a-fA-F]{4})");
        MatchEvaluator unescape = delegate(Match match)
        {
            return ((char)Int32.Parse(match.Groups[1].Value, NumberStyles.HexNumber)).ToString();
        };

        //Regex rules are marked with a leading r\ (length is checked to avoid indexing past the end)
        bool isRegexRule = ruleString.Length > 1 && ruleString[0] == 'r' && ruleString[1] == '\\';

        if (isRegexRule)
        {
            //Remove the regex marker
            newTerm[0] = newTerm[0].Substring(2);

            //Convert the unicode escape sequences in the new term
            List<string> unicodeParsedNewTerm = newTerm.ConvertAll(field => rx.Replace(field, unescape));

            //Validate source and replaces fields
            foreach (int fieldIndex in new[] { 0, 2 })
            {
                try
                {
                    int validationResult = this.regexTrieFactory.validateRegex(unicodeParsedNewTerm[fieldIndex]);
                    if (validationResult != 0)
                    {
                        List<KeyValuePair<string, string>> results = new List<KeyValuePair<string, string>>();
                        results.Add(
                            new KeyValuePair<string, string>(
                                unicodeParsedNewTerm[fieldIndex],
                                this.regexTrieFactory.errorMessages[validationResult]));
                        //NOTE(review): the form is only constructed here; presumably its
                        //constructor displays it - confirm
                        ValidationErrorForm errorForm = new ValidationErrorForm(results);
                        return;
                    }
                }
                catch
                {
                    //NOTE(review): catch-all kept from the original code, where it was meant to
                    //cover a missing field; it also hides any exception thrown by the validator
                }
            }

            if (File.Exists(this.Options.RegexFileName))
            {
                //using guarantees the writer is closed even if a write fails
                using (TextWriter tw = new StreamWriter(this.Options.RegexFileName, true))
                {
                    tw.WriteLine();
                    tw.Write(newTerm[0] + fileDelimiter + newTerm[1] + fileDelimiter + newTerm[2]);
                }
            }
            else if (this.Options.RegexFileName != "" || this.Options.RegexFileName == null)
            {
                MessageBox.Show("Regular expression rule file does not exist", "TermInjector");
                this.Options.RegexFileName = "";
            }

            //Add the rule exactly once and keep the outcome; calling addFieldsToRegexTrie again
            //in the else branch would repeat its side effects.
            //NOTE(review): presumably true means the source trie was extended and false the
            //replaces trie - confirm against addFieldsToRegexTrie
            var addedTo = this.trieLoader.addFieldsToRegexTrie(
                unicodeParsedNewTerm, this.regexTrieSource, this.regexTrieReplaces);
            if (addedTo == true)
            {
                this.regexTrieSource = this.determiniser.determiniseNFA(this.regexTrieSource);
            }
            else if (addedTo == false)
            {
                this.regexTrieReplaces = this.determiniser.determiniseNFA(this.regexTrieReplaces);
            }
        }
        else
        {
            if (File.Exists(this.Options.GlossaryFileName))
            {
                using (TextWriter tw = new StreamWriter(this.Options.GlossaryFileName, true))
                {
                    tw.WriteLine();
                    tw.Write(newTerm[0] + fileDelimiter + newTerm[1] + fileDelimiter + newTerm[2]);
                }
            }
            else if (this.Options.GlossaryFileName != "" || this.Options.GlossaryFileName == null)
            {
                MessageBox.Show("Exact match rule file does not exist", "TermInjector");
            }

            //The file keeps the original casing; only the in-memory keys are lowercased
            if (!matchCaseToBool(this.Options.MatchCase))
            {
                newTerm[0] = newTerm[0].ToLower();
                newTerm[2] = newTerm[2].ToLower();
            }

            //Convert the unicode escape sequences in the new term
            List<string> unicodeParsedNewTerm = newTerm.ConvertAll(field => rx.Replace(field, unescape));

            //Add term to normal or fuzzy trie
            this.trieLoader.addFieldsToTrie(
                unicodeParsedNewTerm, this.exactMatchTrieSource, this.exactMatchTrieReplaces);
        }
    }

    //Update the possible new tries to visitors
    initializeVisitors();
}
//Walks inputString character by character through regexTrie and returns the position and
//translation of every match that addTranslation records along the way.
//When useBoundaryCharacters is false every character counts as a token boundary; otherwise only
//the characters in tokenBoundaryCharacterString do.
public List<PositionAndTranslation> FindRegexMatches(
    RegexTrie<TranslationAndReplacement> regexTrie,
    string inputString,
    string tokenBoundaryCharacterString,
    Boolean useBoundaryCharacters)
{
    //Matches collected so far, with the start and end positions of the source term
    List<PositionAndTranslation> foundTerms = new List<PositionAndTranslation>();

    //Characters that tokenize the input (boundary characters)
    HashSet<char> boundaryChars = new HashSet<char>();
    foreach (char boundaryChar in tokenBoundaryCharacterString)
    {
        boundaryChars.Add(boundaryChar);
    }

    //Paths to follow on the next character; seeded with a path starting at the trie root
    List<RegexPathWithStartCharacterPosition> upcomingPaths = new List<RegexPathWithStartCharacterPosition>();
    upcomingPaths.Add(new RegexPathWithStartCharacterPosition(0, regexTrie));

    //Feed the start control character to reach the string start relative regexes.
    //The start character is never part of a group, so the first transition is taken.
    if (regexTrie.characterTransitions.ContainsKey((char)2))
    {
        upcomingPaths.Add(
            new RegexPathWithStartCharacterPosition(
                0,
                regexTrie.characterTransitions[(char)2][0].destination));
    }

    for (int position = 0; position < inputString.Length; position++)
    {
        //Paths with the same trie and groups are collapsed before being followed
        List<RegexPathWithStartCharacterPosition> activePaths = removeDuplicateTrieAndGroups(upcomingPaths);
        upcomingPaths = new List<RegexPathWithStartCharacterPosition>();

        char symbol = inputString[position];
        bool atBoundary = !useBoundaryCharacters || boundaryChars.Contains(symbol);

        foreach (RegexPathWithStartCharacterPosition activePath in activePaths)
        {
            //A match may only end right before a boundary character
            if (atBoundary && activePath.trie.matches != null)
            {
                addTranslation(activePath, foundTerms, position - 1);
            }

            //Collect the continuations of this path for the current character
            checkWhetherPathContinues(activePath, upcomingPaths, symbol);
        }

        //A new term can begin right after a boundary character, i.e. at the next position
        if (atBoundary)
        {
            upcomingPaths.Add(new RegexPathWithStartCharacterPosition(position + 1, regexTrie));
        }
    }

    //Spans run from start to end character, so the final end position is length - 1
    int lastPosition = inputString.Length - 1;

    //Check the paths still alive after the last character
    foreach (RegexPathWithStartCharacterPosition remainingPath in upcomingPaths)
    {
        //Feed the end control character to reach the string end relative regexes
        if (remainingPath.trie.characterTransitions.ContainsKey((char)3))
        {
            RegexPathWithStartCharacterPosition endCharConsumed =
                new RegexPathWithStartCharacterPosition(
                    remainingPath.startCharacterPosition,
                    remainingPath.trie.characterTransitions[(char)3][0].destination,
                    remainingPath.groups);
            //There's always a translation behind the end character
            addTranslation(endCharConsumed, foundTerms, lastPosition);
        }

        if (remainingPath.trie.matches != null)
        {
            addTranslation(remainingPath, foundTerms, lastPosition);
        }
    }

    return foundTerms;
}
//Add transitions from a square bracket group between the tries given as arguments.
//sourceTail is the regex text following the opening square bracket. A leading ^ makes the group
//a complement, which is collected into a single complement transition on sourceTrie.
//A hyphen forms a range only when it follows a character that can start one and is followed by a
//range end; otherwise (first in the group, directly after a range, or directly before the
//closing bracket) it is added to the set as a literal character.
//Returns the amount of characters consumed from the input string (so the string index can be
//incremented), or -1 if sourceTail is empty or has no closing square bracket.
private int handleSquareBracketGroup(RegexTrie<T> sourceTrie, RegexTrie<T> targetTrie, string sourceTail)
{
    //Nothing to parse: report the same failure value as a missing closing bracket
    if (string.IsNullOrEmpty(sourceTail))
    {
        return -1;
    }

    Boolean isEscaped = false;
    Boolean isComplement = false;
    //True when the character just added to the set may act as the start of a range
    Boolean canStartRange = false;

    //The complement transition to build
    ComplementTransition<T> complement = new ComplementTransition<T>(0, targetTrie);

    //Check whether this is a complement set
    if (sourceTail[0] == '^')
    {
        //Add complement transition to trie
        sourceTrie.complementTransitions.Add(complement);
        isComplement = true;
        sourceTail = sourceTail.Substring(1);
    }

    StringBuilder set = new StringBuilder();

    //First build a string out of the expression, then add every char in it or its complement
    //to the trie as transitions
    for (int stringIndex = 0; stringIndex < sourceTail.Length; stringIndex++)
    {
        char currentChar = sourceTail[stringIndex];

        if (isEscaped)
        {
            set = handleEscaped(set, currentChar);
            isEscaped = false;
            canStartRange = true;
            continue;
        }

        switch (currentChar)
        {
            case '\\':
            {
                isEscaped = true;
                break;
            }
            case '-':
            {
                Boolean hasRangeEnd =
                    stringIndex + 1 < sourceTail.Length && sourceTail[stringIndex + 1] != ']';

                //Without a usable start or end the hyphen is an ordinary character. This avoids
                //indexing before the string, removing from an empty set, and consuming the
                //closing bracket as a range end.
                if (!canStartRange || set.Length == 0 || !hasRangeEnd)
                {
                    set.Append(currentChar);
                    canStartRange = true;
                    break;
                }

                char start = sourceTail[stringIndex - 1];
                char end = sourceTail[stringIndex + 1];

                //Add a range transition, which can be a complement
                if (isComplement)
                {
                    complement.ranges.Add(new KeyValuePair<char, char>(start, end));
                }
                else
                {
                    sourceTrie.AddRangeTransition(targetTrie, start, end, 0);
                }

                //Remove the range start from set
                set.Remove(set.Length - 1, 1);
                //Skip the range end
                stringIndex++;
                //The end of a range cannot start another range
                canStartRange = false;
                break;
            }
            //Add the set or its complement as transitions
            case ']':
            {
                string finalSet = set.ToString();
                if (isComplement)
                {
                    complement.characters.AddRange(finalSet);
                    //One character was consumed by the complement special character, so increment by 2
                    return stringIndex + 2;
                }

                foreach (char c in finalSet)
                {
                    sourceTrie.AddCharacterTransition(c, 0, targetTrie);
                }
                return stringIndex + 1;
            }
            default:
            {
                set.Append(currentChar);
                canStartRange = true;
                break;
            }
        }
    }

    //This should never be reached, as the regex is validated to always contain closing square bracket
    return -1;
}
//Appends newEpsilon to this node's list of epsilon transitions
public void AddEpsilonTransition(RegexTrie<T> newEpsilon)
{
    epsilonTransitions.Add(newEpsilon);
}
//Creates a transition that stores the given group number and destination node
public Transition(byte group, RegexTrie<T> dest)
{
    destination = dest;
    groupNumber = group;
}
//This loads the regex tries from a file to the two regextries given as parameters.
//Each line is split on delimiter into source, translation and (optional) replaces fields, and
//unicode escape sequences (\uXXXX or \UXXXX) are converted after splitting.
//The tries are passed by ref because determinisation returns a new trie instance.
//If the file does not exist nothing is loaded; a message is shown unless fileName is empty.
public void loadRegexTrieFromFile(
    string fileName,
    char delimiter,
    ref RegexTrie<TranslationAndReplacement> regexTrieSource,
    ref RegexTrie<TranslationAndReplacement> regexTrieReplaces)
{
    //Check if file exists, exit method and show a message if it doesn't
    if (!File.Exists(fileName))
    {
        //If the file name is not empty, display alert
        if (fileName != "")
        {
            MessageBox.Show("Regular expression rule file does not exist", "TermInjector");
        }
        return;
    }

    //Regex for converting unicode escape sequences to characters
    Regex rx = new Regex(@"\\[uU]([0-9a-fA-F]{4})");
    MatchEvaluator unescape = delegate(Match match)
    {
        return ((char)Int32.Parse(match.Groups[1].Value, NumberStyles.HexNumber)).ToString();
    };

    //Upper bound for the summed length of the stored path strings (proxy for memory usage)
    const int maxStringMemoryUsage = 2500000;
    int stringMemoryUsage = 0;

    //Counters for checking whether terms are being added
    int lineCount = 0;
    int termCount = 0;

    Boolean addedNormalRegexes = false;
    Boolean addedFuzzyRegexes = false;

    //using closes the reader, so no explicit Close call is needed
    using (StreamReader sourceFile = File.OpenText(fileName))
    {
        while (!sourceFile.EndOfStream)
        {
            //Check if memory usage is within bounds
            if (stringMemoryUsage > maxStringMemoryUsage)
            {
                MessageBox.Show("Regular expression rule file loading stopped due to excessive size: Only part of the regular expression rule file has been loaded.", "TermInjector");
                break;
            }

            //Split the line before Unicode conversion (so as not to accidentally add separators)
            List<string> newTerm = new List<string>();
            foreach (string field in sourceFile.ReadLine().Split(delimiter))
            {
                newTerm.Add(rx.Replace(field, unescape));
            }

            //Check whether the line was valid (at least two fields)
            if (newTerm.Count < 2)
            {
                lineCount++;
                continue;
            }

            //If length of list is two, add empty replaces field so that index 2 always exists
            if (newTerm.Count == 2)
            {
                newTerm.Add("");
            }

            //If both first (source) and third (replaces) fields are empty there is no path
            //to store, so skip to next iteration
            if (newTerm[0].Length == 0 && newTerm[2].Length == 0)
            {
                lineCount++;
                continue;
            }

            //Add the rule exactly once and keep the outcome; calling addFieldsToRegexTrie again
            //in the else branch would repeat its side effects.
            //NOTE(review): presumably true means the source field was used as path and false
            //the replaces field - confirm against addFieldsToRegexTrie
            var addedTo = this.addFieldsToRegexTrie(newTerm, regexTrieSource, regexTrieReplaces);

            //Tally the proxy for memory usage, depending on whether source or replaces
            //field was used as path
            if (addedTo == true)
            {
                stringMemoryUsage += newTerm[0].Length;
                addedNormalRegexes = true;
                termCount++;
                lineCount++;
            }
            else if (addedTo == false)
            {
                stringMemoryUsage += newTerm[2].Length;
                addedFuzzyRegexes = true;
                termCount++;
                lineCount++;
            }
        }
    }

    //Determinise the regex tries; the determiniser returns a new trie, hence the ref parameters
    if (addedNormalRegexes)
    {
        regexTrieSource = this.determiniser.determiniseNFA(regexTrieSource);
    }
    if (addedFuzzyRegexes)
    {
        regexTrieReplaces = this.determiniser.determiniseNFA(regexTrieReplaces);
    }

    //If the proportion of terms stored and lines read is skewed, the wrong delimiter may have been used
    if (lineCount - termCount > (lineCount / 2))
    {
        string delimiterUsed = delimiter == '\t' ? "Tab" : delimiter.ToString();
        MessageBox.Show((string.Format("The amount of regular expression rules stored is small compared to the amount of lines read: {0} lines read, but only {1} regular expression rules found. Are you sure the delimiter character {2} is correct?"
            , lineCount.ToString(), termCount.ToString(), delimiterUsed)), "TermInjector");
    }

    if (this.validationErrors.Count > 0)
    {
        //NOTE(review): the form is only constructed here; presumably its constructor
        //displays it - confirm
        ValidationErrorForm errorForm = new ValidationErrorForm(this.validationErrors);
    }
    this.validationErrors.Clear();
}