Exemplo n.º 1
0
 public ComplementTransition(byte group, RegexTrie <T> dest, List <char> characters, List <KeyValuePair <char, char> > ranges)
 {
     //The character and range lists are stored by reference (no copies are made)
     this.ranges     = ranges;
     this.characters = characters;
     //Group number and destination node of the transition
     destination = dest;
     groupNumber = group;
 }
Exemplo n.º 2
0
 public RangeTransition(byte group, RegexTrie <T> dest, char rangeStart, char rangeEnd)
 {
     //Range bounds, stored as passed in
     this.rangeEnd   = rangeEnd;
     this.rangeStart = rangeStart;
     //Group number and destination node of the transition
     destination = dest;
     groupNumber = group;
 }
Exemplo n.º 3
0
 public ComplementTransition(byte group, RegexTrie <T> dest)
 {
     //Start out with empty character and range lists
     ranges     = new List <KeyValuePair <char, char> >();
     characters = new List <char>();
     //Group number and destination node of the transition
     destination = dest;
     groupNumber = group;
 }
        public void loadTries()
        {
            //Reset the static node counter before any new trie is created
            RegexTrie <TranslationAndReplacement> .counter = 0;

            //The delimiter option is converted to a single character used by both loaders
            char fieldDelimiter = delimiterToChar(Options.Delimiter);

            //Exact match tries: fresh instances which the loader fills from the glossary file.
            //Case is matched only when the MatchCase option is exactly the string "true".
            exactMatchTrieSource   = new Trie();
            exactMatchTrieReplaces = new Trie();
            trieLoader.loadTrieFromFile(
                Options.GlossaryFileName,
                Options.MatchCase == "true",
                fieldDelimiter,
                exactMatchTrieSource,
                exactMatchTrieReplaces);

            //Regex tries: passed by ref, as determinisation needs to return a new trie
            regexTrieSource   = new RegexTrie <TranslationAndReplacement>();
            regexTrieReplaces = new RegexTrie <TranslationAndReplacement>();
            trieLoader.loadRegexTrieFromFile(
                Options.RegexFileName,
                fieldDelimiter,
                ref regexTrieSource,
                ref regexTrieReplaces);
        }
Exemplo n.º 5
0
 //Appends a range transition (used for character ranges and complements) that leads to dest
 public void AddRangeTransition(
     RegexTrie <T> dest,
     char rangeStart,
     char rangeEnd,
     byte groupNumber)
 {
     RangeTransition <T> transition = new RangeTransition <T>(groupNumber, dest, rangeStart, rangeEnd);
     rangeTransitions.Add(transition);
 }
        //Adds a transition on the character whose code is controlCode and returns the node it leads to
        private RegexTrie <T> addSpecialTransition(int controlCode, RegexTrie <T> endNodeOfPreviousComponent)
        {
            //New start node, reachable from the end of the previous component through an epsilon transition
            RegexTrie <T> componentStart = new RegexTrie <T>();
            endNodeOfPreviousComponent.AddEpsilonTransition(componentStart);

            //The child node for the control character is fetched or created
            //(second argument 0: presumably the group number - confirm against GetOrAddChildNode)
            char controlCharacter = (char)controlCode;
            RegexTrie <T> componentEnd = componentStart.GetOrAddChildNode(controlCharacter, 0);
            return(componentEnd);
        }
Exemplo n.º 7
0
        public RegexPathWithStartCharacterPosition(int startCharacterPosition, RegexTrie <TranslationAndReplacement> trie)
        {
            //A new path starts with an empty group dictionary
            this.groups = new Dictionary <int, StringBuilder>();

            //Trie node associated with this path
            this.trie = trie;

            //Position where following of the regex trie was started
            this.startCharacterPosition = startCharacterPosition;
        }
 //Convenience overload: creates the three work stacks itself, so that calling code does not need to,
 //and delegates to the overload that takes them. Always returns 0.
 public int AddToRegexTrie(RegexTrie <T> trie, string source, T matchobject)
 {
     Stack <RegexTrie <T> > endStack          = new Stack <RegexTrie <T> >();
     Stack <RegexTrie <T> > startStack        = new Stack <RegexTrie <T> >();
     Stack <RegexTrie <T> > commonDestination = new Stack <RegexTrie <T> >();

     AddToRegexTrie(trie, source, matchobject, endStack, startStack, commonDestination);
     return(0);
 }
Exemplo n.º 9
0
        public TermInjectorTranslationProviderElementTermExtractionVisitor(
            TermInjectorTranslationOptions options,
            Trie glossaryTrie,
            RegexTrie <TranslationAndReplacement> regexTrie)
        {
            //Keep the options and both tries supplied by the caller
            _options   = options;
            _trie      = glossaryTrie;
            _regexTrie = regexTrie;

            //Objects created for this visitor: a trie processor and an initially empty term list
            _trieProcessor = new TrieProcessing();
            _TermList      = new List <PositionAndTranslation>();
        }
Exemplo n.º 10
0
        public RegexPathWithStartCharacterPosition(
            int startCharacterPosition,
            RegexTrie <TranslationAndReplacement> trie,
            Dictionary <int, StringBuilder> groups)
        {
            //The supplied group dictionary is stored by reference. No string builder is added here;
            //per the original note, builders are only added when a transition with a group number
            //greater than one is encountered.
            this.groups = groups;

            //Trie node associated with this path
            this.trie = trie;

            //Position where following of the regex trie was started
            this.startCharacterPosition = startCharacterPosition;
        }
Exemplo n.º 11
0
        public void addCharacterTransitionsToDFA(
            Dictionary <KeyValuePair <char, byte>, int> dfaTransitions,
            Dictionary <int, RegexTrie <T> > allDTries,
            RegexTrie <T> origin)
        {
            //Each entry maps (character, group number) to the number of the target state
            foreach (KeyValuePair <KeyValuePair <char, byte>, int> entry in dfaTransitions)
            {
                //Fetch the node of the target state, creating and registering it on first use
                RegexTrie <T> destinationNode;
                if (!allDTries.TryGetValue(entry.Value, out destinationNode))
                {
                    destinationNode        = new RegexTrie <T>();
                    allDTries[entry.Value] = destinationNode;
                }

                char transitionCharacter = entry.Key.Key;
                byte transitionGroup     = entry.Key.Value;
                origin.AddCharacterTransition(transitionCharacter, transitionGroup, destinationNode);
            }
        }
Exemplo n.º 12
0
        //Adds one list of fields to the normal regex trie or to the fuzzy regex trie.
        //When fields[0] has content, it is added as the pattern to regexTrieSource together with
        //fields[1] and the optional fields[2]. Otherwise fields[2] is added as the pattern to
        //regexTrieReplaces together with fields[1] and an empty string.
        //Returns true if fields were added to the normal trie, false if added to the fuzzy trie,
        //and null if validation of a pattern field fails.
        public Boolean?addFieldsToRegexTrie(
            List <string> fields,
            RegexTrie <TranslationAndReplacement> regexTrieSource,
            RegexTrie <TranslationAndReplacement> regexTrieReplaces)
        {
            //If first field has content, add fields to normal trie
            if (fields[0].Length > 0)
            {
                if (checkValidationErrors(fields[0], this.validationErrors))
                {
                    return(null);
                }

                //The third field is optional here. Validate it only when it is present and fall back
                //to an empty string otherwise, instead of indexing past the end of the list.
                string thirdField = "";
                if (fields.Count > 2)
                {
                    if (checkValidationErrors(fields[2], this.validationErrors))
                    {
                        return(null);
                    }
                    thirdField = fields[2];
                }

                //add the fields to the trie
                this.regexTrieFactory.AddToRegexTrie(
                    regexTrieSource,
                    fields[0],
                    new TranslationAndReplacement(
                        fields[1],
                        thirdField));
                return(true);
            }

            //Otherwise add fields to fuzzy trie; here the third field is the pattern and is required
            if (checkValidationErrors(fields[2], this.validationErrors))
            {
                return(null);
            }
            this.regexTrieFactory.AddToRegexTrie(
                regexTrieReplaces,
                fields[2],
                new TranslationAndReplacement(
                    fields[1],
                    ""));
            return(false);
        }
Exemplo n.º 13
0
        public void addComplementTransitionsToDFA(
            Dictionary <ComplementTransition <T>, int> dfaTransitions,
            Dictionary <int, RegexTrie <T> > allDTries,
            RegexTrie <T> origin)
        {
            //Each entry maps a complement transition to the number of its target state
            foreach (KeyValuePair <ComplementTransition <T>, int> entry in dfaTransitions)
            {
                //Fetch the node of the target state, creating and registering it on first use
                RegexTrie <T> destinationNode;
                if (!allDTries.TryGetValue(entry.Value, out destinationNode))
                {
                    destinationNode        = new RegexTrie <T>();
                    allDTries[entry.Value] = destinationNode;
                }

                //The new transition reuses the group number, character list and range list of the
                //original, but points to the DFA node
                ComplementTransition <T> template = entry.Key;
                ComplementTransition <T> rebuilt  = new ComplementTransition <T>(
                    template.groupNumber, destinationNode, template.characters, template.ranges);
                origin.complementTransitions.Add(rebuilt);
            }
        }
Exemplo n.º 14
0
        public void addRangeTransitionsToDFA(
            Dictionary <RangeTransition <T>, int> dfaTransitions,
            Dictionary <int, RegexTrie <T> > allDTries,
            RegexTrie <T> origin)
        {
            //Each entry maps a range transition to the number of its target state
            foreach (KeyValuePair <RangeTransition <T>, int> entry in dfaTransitions)
            {
                //Fetch the node of the target state, creating and registering it on first use
                RegexTrie <T> destinationNode;
                if (!allDTries.TryGetValue(entry.Value, out destinationNode))
                {
                    destinationNode        = new RegexTrie <T>();
                    allDTries[entry.Value] = destinationNode;
                }

                //Same range and group number as the original transition, pointing to the DFA node
                RangeTransition <T> template = entry.Key;
                origin.AddRangeTransition(
                    destinationNode, template.rangeStart, template.rangeEnd, template.groupNumber);
            }
        }
Exemplo n.º 15
0
 //Adds a character transition on key to this node. The transition carries groupNumber and
 //leads to newTransition. Several transitions may be stored for the same character.
 public void AddCharacterTransition(
     char key,
     byte groupNumber,
     RegexTrie <T> newTransition)
 {
     //Find the transition list of this character, creating and registering it when absent
     List <Transition <T> > transitionsForKey;
     if (!characterTransitions.TryGetValue(key, out transitionsForKey))
     {
         transitionsForKey = new List <Transition <T> >();
         characterTransitions.Add(key, transitionsForKey);
     }

     transitionsForKey.Add(new Transition <T>(groupNumber, newTransition));
 }
Exemplo n.º 16
0
 public TermInjectorTranslationProviderElementTermReplacementVisitor(
     TermInjectorTranslationOptions options,
     Trie glossaryTrie,
     RegexTrie <TranslationAndReplacement> regexTrie)
 {
     //Keep the options, the glossary trie and the regex trie supplied by the caller
     _options   = options;
     _trie      = glossaryTrie;
     _regexTrie = regexTrie;

     //Objects created for this visitor: a segment and a trie processor
     _segment       = new Segment();
     _trieProcessor = new TrieProcessing();

     //List which will contain the positions and translations of terms
     _positionAndTranslationOfTerms = new List <PositionAndTranslation>();

     //The original segment has not been changed yet
     _originalSegmentChanged = false;
 }
        //Recursively writes groupNumber to every character, range and complement transition that is
        //reachable from trie through nodes which have not been marked yet
        private void addGroupNumbers(RegexTrie <T> trie, byte groupNumber)
        {
            //Only unmarked nodes are processed. The flag is set before recursing, so a node that is
            //reached again (for instance through a loop) is skipped.
            if (!trie.groupsMarked)
            {
                trie.groupsMarked = true;

                //Epsilon transitions are only followed
                foreach (var epsilonTarget in trie.epsilonTransitions)
                {
                    addGroupNumbers(epsilonTarget, groupNumber);
                }

                //Character transitions: one list of transitions per character
                foreach (var transitionsOfCharacter in trie.characterTransitions.Values)
                {
                    foreach (var characterTransition in transitionsOfCharacter)
                    {
                        characterTransition.groupNumber = groupNumber;
                        addGroupNumbers(characterTransition.destination, groupNumber);
                    }
                }

                //Range transitions
                foreach (var rangeTransition in trie.rangeTransitions)
                {
                    rangeTransition.groupNumber = groupNumber;
                    addGroupNumbers(rangeTransition.destination, groupNumber);
                }

                //Complement transitions
                foreach (var complementTransition in trie.complementTransitions)
                {
                    complementTransition.groupNumber = groupNumber;
                    addGroupNumbers(complementTransition.destination, groupNumber);
                }
            }
        }
        //Adds the copies and epsilon transitions required to make a component correspond to its
        //quantifier, connects the component to the end of the previous component and returns the
        //node that is reached once the quantified component has been matched
        private RegexTrie <T> joinQuantifiedTrie(MinMaxResult result,
                                                 RegexTrie <T> startNodeOfCurrentComponent,
                                                 RegexTrie <T> endNodeOfCurrentComponent,
                                                 RegexTrie <T> endNodeOfPreviousComponent)
        {
            //Both helper nodes are always created, the end node first. The end node is the returned
            //exit of the component; the start node is only linked in for zero-inclusive quantifiers.
            RegexTrie <T> epsilonEndNode   = new RegexTrie <T>();
            RegexTrie <T> epsilonStartNode = new RegexTrie <T>();

            //End node of the most recently appended occurrence
            RegexTrie <T> lastOccurrenceEnd = endNodeOfCurrentComponent;

            if (result.minOccurs > 1 || result.maxOccurs > 1)
            {
                //Copy of the component taken before any quantifier transitions are added to it;
                //every further occurrence is copied from this model
                var occurrenceModel = copyTrie(startNodeOfCurrentComponent, endNodeOfCurrentComponent);

                //One occurrence already exists, so counting starts from the second
                int occurrence = 2;
                while (occurrence <= result.maxOccurs)
                {
                    //Chain a new copy after the previous occurrence
                    var occurrenceCopy = copyTrie(occurrenceModel);
                    lastOccurrenceEnd.AddEpsilonTransition(occurrenceCopy.Key);

                    //Once the minimum has been reached, the component may end after this occurrence
                    if (occurrence >= result.minOccurs)
                    {
                        occurrenceCopy.Value.AddEpsilonTransition(epsilonEndNode);
                    }

                    lastOccurrenceEnd = occurrenceCopy.Value;
                    occurrence++;
                }
            }

            //With a minimum below two, the first occurrence alone may end the component.
            //This is done after the copy loop in order to not disturb the copying.
            if (result.minOccurs < 2)
            {
                endNodeOfCurrentComponent.AddEpsilonTransition(epsilonEndNode);
            }

            //A starred quantifier loops from the end of the first occurrence back to its start
            if (result.starred)
            {
                endNodeOfCurrentComponent.AddEpsilonTransition(startNodeOfCurrentComponent);
            }

            //A zero-inclusive quantifier is entered through the epsilon start node, which can
            //either enter the component or skip directly to the epsilon end node
            RegexTrie <T> entryNode = startNodeOfCurrentComponent;
            if (result.zeroInclusive)
            {
                epsilonStartNode.AddEpsilonTransition(startNodeOfCurrentComponent);
                epsilonStartNode.AddEpsilonTransition(epsilonEndNode);
                entryNode = epsilonStartNode;
            }

            //Connect the component to the previous component
            endNodeOfPreviousComponent.AddEpsilonTransition(entryNode);

            return(epsilonEndNode);
        }
        //Copies every node reachable from start and returns the copies of start (Key) and end (Value).
        //Epsilon, character, range and complement transitions are reproduced between the copied nodes;
        //copied complement transitions share the character and range lists of the originals.
        //Each source node is copied exactly once, even when it is reachable along several paths, so
        //no transition is duplicated in the copy.
        //Note: the lookup in the return statement throws KeyNotFoundException if end was not reached
        //from start.
        private KeyValuePair <RegexTrie <T>, RegexTrie <T> > copyTrie(RegexTrie <T> start, RegexTrie <T> end)
        {
            //The start node of the copied trie
            RegexTrie <T> trieCopy = new RegexTrie <T>();

            //Specifies which source node corresponds to which copied node. A node has an entry here
            //exactly when it has been scheduled for copying, so this also serves as the visited set.
            Dictionary <RegexTrie <T>, RegexTrie <T> > nodeCorrespondences = new Dictionary <RegexTrie <T>, RegexTrie <T> >();
            nodeCorrespondences.Add(start, trieCopy);

            //Source nodes whose outgoing transitions still have to be copied (imitates recursion)
            Stack <RegexTrie <T> > pendingNodes = new Stack <RegexTrie <T> >();
            pendingNodes.Push(start);

            while (pendingNodes.Count > 0)
            {
                RegexTrie <T> sourceNode = pendingNodes.Pop();
                RegexTrie <T> targetNode = nodeCorrespondences[sourceNode];

                //Copy epsilon transitions
                foreach (var node in sourceNode.epsilonTransitions)
                {
                    targetNode.AddEpsilonTransition(
                        getOrCreateNodeCopy(node, nodeCorrespondences, pendingNodes));
                }

                //Copy character transitions
                foreach (var transitionsOfCharacter in sourceNode.characterTransitions)
                {
                    foreach (var transition in transitionsOfCharacter.Value)
                    {
                        targetNode.AddCharacterTransition(
                            transitionsOfCharacter.Key,
                            transition.groupNumber,
                            getOrCreateNodeCopy(transition.destination, nodeCorrespondences, pendingNodes));
                    }
                }

                //Copy range transitions
                foreach (var rangeTrans in sourceNode.rangeTransitions)
                {
                    targetNode.AddRangeTransition(
                        getOrCreateNodeCopy(rangeTrans.destination, nodeCorrespondences, pendingNodes),
                        rangeTrans.rangeStart,
                        rangeTrans.rangeEnd,
                        rangeTrans.groupNumber);
                }

                //Copy complement transitions
                foreach (var compTrans in sourceNode.complementTransitions)
                {
                    targetNode.complementTransitions.Add(new ComplementTransition <T>(
                                                             compTrans.groupNumber,
                                                             getOrCreateNodeCopy(compTrans.destination, nodeCorrespondences, pendingNodes),
                                                             compTrans.characters,
                                                             compTrans.ranges));
                }
            }

            //Return the copies of the start node and the end node
            return(new KeyValuePair <RegexTrie <T>, RegexTrie <T> >(trieCopy, nodeCorrespondences[end]));
        }

        //Returns the copy that corresponds to sourceNode. On the first encounter the copy is created,
        //recorded in nodeCorrespondences, and the source node is scheduled for copying (exactly once).
        private RegexTrie <T> getOrCreateNodeCopy(
            RegexTrie <T> sourceNode,
            Dictionary <RegexTrie <T>, RegexTrie <T> > nodeCorrespondences,
            Stack <RegexTrie <T> > pendingNodes)
        {
            RegexTrie <T> nodeCopy;
            if (!nodeCorrespondences.TryGetValue(sourceNode, out nodeCopy))
            {
                nodeCopy = new RegexTrie <T>();
                nodeCorrespondences.Add(sourceNode, nodeCopy);
                pendingNodes.Push(sourceNode);
            }
            return(nodeCopy);
        }
 public TrieAndGroups(RegexTrie <T> trie, Dictionary <int, StringBuilder> groups)
 {
     //Both arguments are stored by reference (no copies are made)
     this.groups = groups;
     this.trie   = trie;
 }
        //Checks for a quantifier at the start of sourceTail and joins the current component to the
        //end of the previous component accordingly.
        //Returns a pair of int (the number of characters the quantifier occupies, used to increment
        //the string index; 0 when there is no quantifier) and the node to which further components
        //will be connected.
        //Throws ArgumentOutOfRangeException when a '{' quantifier has no closing '}'.
        //commonDestination is not used by this method.
        private KeyValuePair <int, RegexTrie <T> > checkForQuantifier(
            string sourceTail,
            RegexTrie <T> endNodeOfCurrentComponent,
            RegexTrie <T> startNodeOfCurrentComponent,
            RegexTrie <T> endNodeOfPreviousComponent,
            Stack <RegexTrie <T> > commonDestination)
        {
            //An empty (or null) tail cannot hold a quantifier: use a non-special character to
            //trigger the default case, without relying on a caught exception for control flow
            char nextChar = string.IsNullOrEmpty(sourceTail) ? 'n' : sourceTail[0];

            //Quantifier description and the number of characters it occupies in sourceTail
            MinMaxResult quantifier;
            int          quantifierLength;

            switch (nextChar)
            {
            case '*':
            {
                quantifier       = new MinMaxResult(1, 1, true, true);
                quantifierLength = 1;
                break;
            }

            case '+':
            {
                quantifier       = new MinMaxResult(1, 1, true, false);
                quantifierLength = 1;
                break;
            }

            case '?':
            {
                quantifier       = new MinMaxResult(1, 1, false, true);
                quantifierLength = 1;
                break;
            }

            case '{':
            {
                int closeBraceIndex = sourceTail.IndexOf('}');
                if (closeBraceIndex < 0)
                {
                    //Same exception type that Substring would raise, but with a usable message
                    throw new ArgumentOutOfRangeException(
                              "sourceTail", "Quantifier starting with '{' has no closing '}'.");
                }

                //The text between the braces describes the quantifier
                string quantString = sourceTail.Substring(1, closeBraceIndex - 1);
                quantifier       = findMinMax(quantString);
                quantifierLength = closeBraceIndex + 1;
                break;
            }

            default:
            {
                //No quantifier: connect the component directly and continue from its end node
                endNodeOfPreviousComponent.AddEpsilonTransition(startNodeOfCurrentComponent);
                return(new KeyValuePair <int, RegexTrie <T> >(0, endNodeOfCurrentComponent));
            }
            }

            return(new KeyValuePair <int, RegexTrie <T> >(quantifierLength, joinQuantifiedTrie(
                                                              quantifier,
                                                              startNodeOfCurrentComponent,
                                                              endNodeOfCurrentComponent,
                                                              endNodeOfPreviousComponent)));
        }
Exemplo n.º 22
0
        public RegexTrie <T> determiniseNFA(RegexTrie <T> regTrie)
        {
            //A trie without character and epsilon transitions is treated as empty
            bool isEmpty = regTrie.characterTransitions.Count == 0 && regTrie.epsilonTransitions.Count == 0;
            if (isEmpty)
            {
                return(new RegexTrie <T>());
            }

            //DFA states (each one a list of NFA nodes). A state is moved from the unmarked list to
            //the marked list when its transitions are processed.
            var unmarkedDStates = new List <List <RegexTrie <T> > >();
            var markedDStates   = new List <List <RegexTrie <T> > >();

            //DFA transition table, one entry per processed state
            var dTrans = new List <ClosureTransitions <T, int> >();

            //Maps a state to its number; the comparer decides state equality (and provides hash codes)
            var stateExists = new Dictionary <List <RegexTrie <T> >, int>(new StateComparer <T>());

            //The epsilon closure of the root is the first state and gets number 0
            var rootOnly = new List <RegexTrie <T> >();
            rootOnly.Add(regTrie);
            List <RegexTrie <T> > epsilonClosure = getEpsilonClosure(rootOnly);
            unmarkedDStates.Add(epsilonClosure);
            stateExists.Add(epsilonClosure, 0);

            //Process states in index order until no unprocessed index remains.
            //The list is passed to the helpers below on every iteration and its count is re-read here.
            for (int stateIndex = 0; stateIndex < unmarkedDStates.Count; stateIndex++)
            {
                //Move the state from unmarked to marked and get its transitions
                List <RegexTrie <T> > currentState = unmarkedDStates[stateIndex];
                markedDStates.Add(currentState);
                ClosureTransitions <T, List <RegexTrie <T> > > transitions = getClosureTransitions(currentState);
                unmarkedDStates[stateIndex] = null;

                //Always add a table entry, so that dTrans stays in sync with the marked states
                var tableEntry = new ClosureTransitions <T, int>();
                dTrans.Add(tableEntry);

                //Handle normal transitions
                addCharacterTransitionsToTransitionTable(
                    transitions.normalTransitions,
                    tableEntry.normalTransitions,
                    stateExists,
                    unmarkedDStates);

                //Handle range transitions
                addRangeTransitionsToTransitionTable(
                    transitions.rangeTransitions,
                    tableEntry.rangeTransitions,
                    stateExists,
                    unmarkedDStates);

                //Handle complement transitions
                addComplementTransitionsToTransitionTable(
                    transitions.complementTransitions,
                    tableEntry.complementTransitions,
                    stateExists,
                    unmarkedDStates);
            }

            //Construct the new trie, adding nodes as necessary.
            //This is a dictionary as the nodes are added non-consecutively.
            var allDTries = new Dictionary <int, RegexTrie <T> >(markedDStates.Count);

            for (int index = 0; index < markedDStates.Count; index++)
            {
                //Fetch the node of the origin state, creating it unless it already exists
                RegexTrie <T> originNode;
                if (!allDTries.TryGetValue(index, out originNode))
                {
                    originNode       = new RegexTrie <T>();
                    allDTries[index] = originNode;
                }

                //Add the transitions of the state to its node
                addCharacterTransitionsToDFA(dTrans[index].normalTransitions, allDTries, originNode);
                addRangeTransitionsToDFA(dTrans[index].rangeTransitions, allDTries, originNode);
                addComplementTransitionsToDFA(dTrans[index].complementTransitions, allDTries, originNode);

                //For every NFA node of the state that has matches, add its first match to the DFA node
                foreach (var state in markedDStates[index])
                {
                    if (state.matches == null)
                    {
                        continue;
                    }

                    T firstMatch = state.matches[0];
                    if (originNode.matches == null)
                    {
                        originNode.matches = new List <T>();
                    }
                    originNode.matches.Add(firstMatch);
                }
            }

            //Return a reference to the root node of the trie
            return(allDTries[0]);
        }
        //Parses the regular expression in source and adds it to trie as an NFA fragment, storing
        //matchobject on the node where the expression ends.
        //This has to keep track of four nodes: end of the previous component, start of the current component,
        //end of the current component and the node that is connected to the end of the current component with
        //an epsilon transition (which will become the end of the previous component for the next component).
        //endStack, startStack and commonDestination are working stacks supplied by the caller; they hold the
        //state of the groups that are currently open.
        public void AddToRegexTrie
            (RegexTrie <T> trie,
            string source,
            T matchobject,
            Stack <RegexTrie <T> > endStack,
            Stack <RegexTrie <T> > startStack,
            Stack <RegexTrie <T> > commonDestination)
        {
            //NOTE(review): this node is never used. It is kept because the RegexTrie constructor is not
            //visible here and may have side effects (e.g. on a node counter) - confirm and remove.
            RegexTrie <T> newTrie = new RegexTrie <T>();

            //End node of the component currently being built
            RegexTrie <T> endNodeOfCurrentComponent = trie;

            //End of previous component, to which the current component is joined
            RegexTrie <T> endNodeOfPreviousComponent = trie;

            //Start node of the component currently being built
            RegexTrie <T> startNodeOfCurrentComponent = trie;

            //Push trie on the start stack
            startStack.Push(trie);

            //Push a null on the commonDestination stack (replaced with a real node if a pipe is found)
            commonDestination.Push(null);

            //Matching group counter.
            byte groupCount = 1;

            //True if previous character was an escape char
            Boolean escapedCharacter = false;

            //Variable for the character at the loop index
            char currentChar;

            //Holds the result of the quantifier check: the amount of characters the quantifier consumed
            //and the node from which building continues
            KeyValuePair <int, RegexTrie <T> > quantifierCheckResult;

            for (int stringIndex = 0; stringIndex < source.Length; stringIndex++)
            {
                currentChar = source[stringIndex];

                //Check for escape character. A backslash that is itself escaped is a literal backslash,
                //so it must fall through to the escaped character handling below.
                if (!escapedCharacter && currentChar == '\\')
                {
                    escapedCharacter = true;
                    continue;
                }

                //If the character is escaped, just make it into a trie
                if (escapedCharacter)
                {
                    //Special char handling
                    if (currentChar == 't')
                    {
                        currentChar = '\t';
                    }
                    startNodeOfCurrentComponent = new RegexTrie <T>();
                    endNodeOfCurrentComponent   = startNodeOfCurrentComponent.GetOrAddChildNode(currentChar, 0);
                    quantifierCheckResult       = checkForQuantifier(
                        source.Substring(stringIndex + 1),
                        endNodeOfCurrentComponent,
                        startNodeOfCurrentComponent,
                        endNodeOfPreviousComponent,
                        commonDestination);
                    stringIndex += quantifierCheckResult.Key;
                    endNodeOfPreviousComponent = quantifierCheckResult.Value;
                    escapedCharacter           = false;
                    continue;
                }
                //Else check character for special meaning
                else
                {
                    switch (currentChar)
                    {
                    //Open a new group
                    case '(':
                    {
                        //Push the previous trie to the end stack
                        endStack.Push(endNodeOfPreviousComponent);
                        //Create a new trie
                        endNodeOfPreviousComponent = new RegexTrie <T>();
                        //Push the newly created trie on the start stack
                        startStack.Push(endNodeOfPreviousComponent);
                        //Push a null trie on the common destination stack (to be defined, if pipes are found)
                        commonDestination.Push(null);
                        break;
                    }

                    //Close a group
                    case ')':
                    {
                        //If common destination exists, add an epsilon transition to it
                        if (commonDestination.Peek() != null)
                        {
                            //Connect the end node and common destination as part of the same epsilon closure
                            commonDestination.Peek().AddEpsilonTransition(endNodeOfPreviousComponent);
                            endNodeOfPreviousComponent.AddEpsilonTransition(commonDestination.Peek());
                            //Move the current trie to common destination, as that's where the building will continue
                            //Pop the common destination, it won't be needed anymore
                            endNodeOfCurrentComponent = commonDestination.Pop();
                        }
                        else
                        {
                            endNodeOfCurrentComponent = endNodeOfPreviousComponent;
                            //Pop the null destination
                            commonDestination.Pop();
                        }

                        startNodeOfCurrentComponent = startStack.Pop();
                        endNodeOfPreviousComponent  = endStack.Pop();

                        quantifierCheckResult = checkForQuantifier(
                            source.Substring(stringIndex + 1),
                            endNodeOfCurrentComponent,
                            startNodeOfCurrentComponent,
                            endNodeOfPreviousComponent,
                            commonDestination);
                        stringIndex += quantifierCheckResult.Key;
                        endNodeOfPreviousComponent = quantifierCheckResult.Value;

                        //If no enclosing group is left on the end stack we're at the top,
                        //so add group number to each node of the trie
                        if (endStack.Count == 0)
                        {
                            addGroupNumbers(startNodeOfCurrentComponent, groupCount);
                            groupCount++;
                        }
                        break;
                    }

                    //Handle square bracket set
                    case '[':
                    {
                        startNodeOfCurrentComponent = new RegexTrie <T>();
                        endNodeOfCurrentComponent   = new RegexTrie <T>();
                        //This skips over the closing square bracket, so there's no need for closing square bracket handling
                        stringIndex          += handleSquareBracketGroup(startNodeOfCurrentComponent, endNodeOfCurrentComponent, source.Substring(stringIndex + 1));
                        quantifierCheckResult = checkForQuantifier(
                            source.Substring(stringIndex + 1),
                            endNodeOfCurrentComponent,
                            startNodeOfCurrentComponent,
                            endNodeOfPreviousComponent,
                            commonDestination);
                        stringIndex += quantifierCheckResult.Key;
                        endNodeOfPreviousComponent = quantifierCheckResult.Value;
                        break;
                    }

                    //Caret at the start: add a transition with a control character that won't exist in text.
                    //Feed the control character when finding matches.
                    case '^':
                    {
                        endNodeOfPreviousComponent = addSpecialTransition(2, endNodeOfPreviousComponent);
                        break;
                    }

                    //Dollar at end: add a transition with a control character that won't exist in text.
                    //Feed the control character when finding matches.
                    case '$':
                    {
                        endNodeOfPreviousComponent = addSpecialTransition(3, endNodeOfPreviousComponent);
                        break;
                    }

                    //Period handling
                    case '.':
                    {
                        startNodeOfCurrentComponent = new RegexTrie <T>();
                        endNodeOfCurrentComponent   = new RegexTrie <T>();

                        //Add complement of null character
                        startNodeOfCurrentComponent.complementTransitions.Add(
                            new ComplementTransition <T>(
                                0,
                                endNodeOfCurrentComponent,
                                new List <char>(),
                                new List <KeyValuePair <char, char> >()
                            {
                                new KeyValuePair <char, char>((char)0, (char)0)
                            }));

                        quantifierCheckResult = checkForQuantifier(
                            source.Substring(stringIndex + 1),
                            endNodeOfCurrentComponent,
                            startNodeOfCurrentComponent,
                            endNodeOfPreviousComponent,
                            commonDestination);
                        stringIndex += quantifierCheckResult.Key;
                        endNodeOfPreviousComponent = quantifierCheckResult.Value;
                        break;
                    }

                    //Change previous end to common destination and move end node to common start
                    case '|':
                    {
                        if (commonDestination.Peek() == null)
                        {
                            //First pipe of this group: the end of the first alternative becomes the common destination
                            commonDestination.Pop();
                            commonDestination.Push(endNodeOfPreviousComponent);
                        }
                        else
                        {
                            //Connect the end node and common destination as part of the same epsilon closure
                            commonDestination.Peek().AddEpsilonTransition(endNodeOfPreviousComponent);
                            endNodeOfPreviousComponent.AddEpsilonTransition(commonDestination.Peek());
                        }
                        //The next alternative starts from the start node of the enclosing group
                        endNodeOfPreviousComponent = startStack.Peek();
                        break;
                    }

                    //Any other character is a literal
                    default:
                    {
                        startNodeOfCurrentComponent = new RegexTrie <T>();
                        endNodeOfCurrentComponent   = startNodeOfCurrentComponent.GetOrAddChildNode(currentChar, 0);
                        quantifierCheckResult       = checkForQuantifier(
                            source.Substring(stringIndex + 1),
                            endNodeOfCurrentComponent,
                            startNodeOfCurrentComponent,
                            endNodeOfPreviousComponent,
                            commonDestination);
                        stringIndex += quantifierCheckResult.Key;
                        endNodeOfPreviousComponent = quantifierCheckResult.Value;
                        break;
                    }
                    }
                }
            }
            //Link end node to the common destination of the top level alternation if one exists
            if (commonDestination.Peek() != null)
            {
                endNodeOfPreviousComponent.AddEpsilonTransition(commonDestination.Peek());
                endNodeOfPreviousComponent = commonDestination.Pop();
            }

            //Add the match object to the node where the expression ends
            endNodeOfPreviousComponent.matches = new List <T> {
                matchobject
            };
        }
Exemplo n.º 24
0
        //Builds the term-injected version of a fuzzy match target segment.
        //The current source segment is scanned for terms; terms that carry a replaces value are loaded
        //into secondary tries which the fuzzy replace visitor then applies to the translation proposal.
        //Optionally the terms that occur only in the current source are inserted at the start of the result.
        private Segment InjectTermsFuzzyMatch(Segment segment, SearchResult result)
        {
            //Collect the terms (and their possible replacements) of the current source segment
            _provider.FuzzyCurrentVisitor.Reset();
            foreach (var sourceElement in segment.Elements)
            {
                sourceElement.AcceptSegmentElementVisitor(_provider.FuzzyCurrentVisitor);
            }

            //Secondary tries for the replace visitor: a copy of the exact match replaces trie
            //and a fresh regex trie
            Trie secondaryTrie = _provider.ExactMatchTrieReplaces.Clone();
            var secondaryRegexTrie = new RegexTrie<TranslationAndReplacement>();
            bool regexTrieModified = false;

            //Terms without a replaces value stay in the visitor's term list
            var termsWithoutReplacement = new List<PositionAndTranslation>();

            foreach (var term in _provider.FuzzyCurrentVisitor.TermList)
            {
                if (term.Replaces == "")
                {
                    //No replaces field, this is handled in the comparison phase
                    termsWithoutReplacement.Add(term);
                    continue;
                }

                //The replaces field acts as the source; regex terms go to the regex trie
                if (term.Regex)
                {
                    var replacement = new TranslationAndReplacement(term.Translation, "");
                    _provider.RegexTrieFactory.AddToRegexTrie(secondaryRegexTrie, term.Replaces, replacement);
                    regexTrieModified = true;
                }
                else
                {
                    secondaryTrie.AddToTrie(term.Replaces, term.Translation, "");
                }
            }

            //The regex trie only needs determinising if something was added to it
            if (regexTrieModified)
            {
                secondaryRegexTrie = _provider.Determiniser.determiniseNFA(secondaryRegexTrie);
            }

            _provider.FuzzyCurrentVisitor.TermList = termsWithoutReplacement;

            //Hand the secondary tries to a freshly reset replace visitor
            _provider.FuzzyReplaceVisitor.Reset();
            _provider.FuzzyReplaceVisitor.SndTrie      = secondaryTrie;
            _provider.FuzzyReplaceVisitor.SndRegexTrie = secondaryRegexTrie;

            //Run the replace visitor over the target segment of the translation proposal
            foreach (var targetElement in result.TranslationProposal.TargetSegment.Elements)
            {
                //Null elements are skipped
                if (targetElement == null)
                {
                    continue;
                }
                targetElement.AcceptSegmentElementVisitor(_provider.FuzzyReplaceVisitor);
            }

            Segment injectedSegment = _provider.FuzzyReplaceVisitor.Segment;

            //Optionally add the terms that are found in the current source but not in the
            //source of the translation memory unit
            if (_options.InjectNewTermsIntoFuzzies == "true")
            {
                _provider.FuzzyMatchVisitor.Reset();
                foreach (var memoryElement in result.MemoryTranslationUnit.SourceSegment.Elements)
                {
                    memoryElement.AcceptSegmentElementVisitor(_provider.FuzzyMatchVisitor);
                }

                Text termsOnlyInCurrent = _provider.FuzzyCurrentVisitor.TermDifference(_provider.FuzzyMatchVisitor);
                if (termsOnlyInCurrent.Value.Length != 0)
                {
                    injectedSegment.Elements.Insert(0, termsOnlyInCurrent);
                }
            }

            //A deep copy is returned; with a shared reference all results would display
            //the last segment constructed
            return injectedSegment.Duplicate();
        }
        //Adds a single rule to the rule files and the in-memory tries.
        //ruleString holds the fields of the rule separated by Options.TermAdditionSeparator; a string
        //that does not contain the separator is ignored. A rule starting with r\ is a regular expression
        //rule: it is validated, appended to the regex rule file and added to the regex tries. Any other
        //rule is appended to the glossary file and added to the exact match tries.
        //The visitors are re-initialized afterwards, except when regex validation fails.
        public void addStringToTries(string ruleString)
        {
            if (!ruleString.Contains(this.Options.TermAdditionSeparator))
            {
                return;
            }

            string[] splitTerm = ruleString.Split(this.Options.TermAdditionSeparator.ToCharArray());

            List <string> newTerm = splitTerm.ToList();
            //If there's only two fields, add an empty field
            if (newTerm.Count == 2)
            {
                newTerm.Add("");
            }

            //Check that either first or the third field of the new term are non-empty
            if ((newTerm[0].Length > 0) || (newTerm[2].Length > 0))
            {
                char fileDelimiter = delimiterToChar(this.Options.Delimiter);
                //Regex for converting unicode escape sequences (\uXXXX or \UXXXX) to characters.
                //The backslash has to be escaped, otherwise the pattern matches a literal "[uU]".
                Regex rx = new Regex(@"\\[uU]([0-9a-fA-F]{4})");
                //Converts a matched escape sequence to the character it stands for
                //(the first two characters of the match are the backslash and the u)
                MatchEvaluator unicodeEscapeToChar = delegate(Match match)
                {
                    return(((char)Int32.Parse(match.Value.Substring(2), NumberStyles.HexNumber)).ToString());
                };

                //Check whether this is a regex term or a normal term
                if (ruleString.StartsWith("r\\", StringComparison.Ordinal))
                {
                    //Remove the regex marker
                    newTerm[0] = newTerm[0].Substring(2);

                    //Convert the unicode escape sequences in the new term
                    List <string> unicodeParsedNewTerm = new List <string>();
                    foreach (string field in newTerm)
                    {
                        unicodeParsedNewTerm.Add(rx.Replace(field, unicodeEscapeToChar));
                    }

                    //Validate source and replaces fields
                    foreach (var num in new List <int> {
                        0, 2
                    })
                    {
                        try
                        {
                            int validationResult = this.regexTrieFactory.validateRegex(unicodeParsedNewTerm[num]);
                            if (validationResult != 0)
                            {
                                List <KeyValuePair <string, string> > results = new List <KeyValuePair <string, string> >();
                                results.Add(
                                    new KeyValuePair <string, string>(unicodeParsedNewTerm[num],
                                                                      this.regexTrieFactory.errorMessages[validationResult]));
                                //NOTE(review): the form is only constructed here, presumably its
                                //constructor displays it - confirm.
                                ValidationErrorForm errorForm = new ValidationErrorForm(results);
                                return;
                            }
                        }
                        catch
                        {
                            //The field does not exist, no need to validate
                        }
                    }

                    if (File.Exists(this.Options.RegexFileName))
                    {
                        //Append the rule to the file; using makes sure the file is closed even if writing fails
                        using (TextWriter tw = new StreamWriter(this.Options.RegexFileName, true))
                        {
                            tw.WriteLine();
                            tw.Write(newTerm[0] + fileDelimiter + newTerm[1] + fileDelimiter + newTerm[2]);
                        }
                    }
                    //Also true for a null file name
                    else if (this.Options.RegexFileName != "")
                    {
                        MessageBox.Show("Regular expression rule file does not exist", "TermInjector");
                        this.Options.RegexFileName = "";
                    }

                    //Add the rule exactly once and determinise the trie selected by the result
                    var addResult = this.trieLoader.addFieldsToRegexTrie(unicodeParsedNewTerm, this.regexTrieSource, this.regexTrieReplaces);
                    if (addResult == true)
                    {
                        this.regexTrieSource =
                            this.determiniser.determiniseNFA(
                                this.regexTrieSource);
                    }
                    else if (addResult == false)
                    {
                        this.regexTrieReplaces =
                            this.determiniser.determiniseNFA(
                                this.regexTrieReplaces);
                    }
                }
                else
                {
                    if (File.Exists(this.Options.GlossaryFileName))
                    {
                        //Append the rule to the file; using makes sure the file is closed even if writing fails
                        using (TextWriter tw = new StreamWriter(this.Options.GlossaryFileName, true))
                        {
                            tw.WriteLine();
                            tw.Write(newTerm[0] + fileDelimiter + newTerm[1] + fileDelimiter + newTerm[2]);
                        }
                    }
                    //Also true for a null file name
                    else if (this.Options.GlossaryFileName != "")
                    {
                        MessageBox.Show("Exact match rule file does not exist", "TermInjector");
                    }
                    //The file keeps the original casing, only the trie entries are lowercased
                    if (!matchCaseToBool(this.Options.MatchCase))
                    {
                        newTerm[0] = newTerm[0].ToLower();
                        newTerm[2] = newTerm[2].ToLower();
                    }

                    //Convert the unicode escape sequences in the new term
                    List <string> unicodeParsedNewTerm = new List <string>();
                    foreach (string field in newTerm)
                    {
                        unicodeParsedNewTerm.Add(rx.Replace(field, unicodeEscapeToChar));
                    }

                    //Add term to normal or fuzzy trie
                    this.trieLoader.addFieldsToTrie(unicodeParsedNewTerm, this.exactMatchTrieSource, this.exactMatchTrieReplaces);
                }
            }

            //Update the possible new tries to visitors
            initializeVisitors();
        }
Exemplo n.º 26
0
        //Runs inputString through regexTrie and returns every match found, with its position.
        //A new path is started from the trie root at position 0 and after each token boundary character;
        //matches are recorded when a path that has matches reaches a boundary character or the end of
        //the input. When useBoundaryCharacters is false every character counts as a boundary.
        public List <PositionAndTranslation> FindRegexMatches
            (RegexTrie <TranslationAndReplacement> regexTrie, string inputString, string tokenBoundaryCharacterString,
            Boolean useBoundaryCharacters)
        {
            //Control characters that stand for the start and the end of the input in the trie
            const char startOfInput = (char)2;
            const char endOfInput   = (char)3;

            //Translations of the discovered terms together with the positions of the source terms
            var foundTerms = new List<PositionAndTranslation>();

            //The characters used to tokenize the input (boundary characters)
            var boundaryCharacters = new HashSet<char>();
            foreach (char boundary in tokenBoundaryCharacterString)
            {
                boundaryCharacters.Add(boundary);
            }

            //Paths that will be followed when the next character is read; starts with the root path
            var upcomingPaths = new List<RegexPathWithStartCharacterPosition>();
            upcomingPaths.Add(new RegexPathWithStartCharacterPosition(0, regexTrie));

            //Regexes relative to the string start are behind the start control character.
            //The start character is never a part of group, so the first transition is taken.
            if (regexTrie.characterTransitions.ContainsKey(startOfInput))
            {
                var afterStart = regexTrie.characterTransitions[startOfInput][0].destination;
                upcomingPaths.Add(new RegexPathWithStartCharacterPosition(0, afterStart));
            }

            for (int position = 0; position < inputString.Length; position++)
            {
                //Paths with the same trie and groups are followed only once
                List<RegexPathWithStartCharacterPosition> activePaths = removeDuplicateTrieAndGroups(upcomingPaths);
                upcomingPaths = new List<RegexPathWithStartCharacterPosition>();

                char current = inputString[position];
                bool atBoundary = !useBoundaryCharacters || boundaryCharacters.Contains(current);

                foreach (RegexPathWithStartCharacterPosition path in activePaths)
                {
                    //A term can only end just before a boundary character
                    if (atBoundary && path.trie.matches != null)
                    {
                        addTranslation(path, foundTerms, position - 1);
                    }

                    //Paths that continue with the current character are carried over to the next round
                    checkWhetherPathContinues(path, upcomingPaths, current);
                }

                if (atBoundary)
                {
                    //A new term may begin at the character following the boundary
                    upcomingPaths.Add(new RegexPathWithStartCharacterPosition(position + 1, regexTrie));
                }
            }

            //Spans are start to end character, so a match at the end of input ends at the last index
            int lastPosition = inputString.Length - 1;
            foreach (RegexPathWithStartCharacterPosition path in upcomingPaths)
            {
                //Regexes relative to the string end are behind the end control character
                if (path.trie.characterTransitions.ContainsKey(endOfInput))
                {
                    var pastEnd = new RegexPathWithStartCharacterPosition(
                        path.startCharacterPosition,
                        path.trie.characterTransitions[endOfInput][0].destination,
                        path.groups);
                    //There's always a translation behind the end character
                    addTranslation(pastEnd, foundTerms, lastPosition);
                }

                if (path.trie.matches != null)
                {
                    addTranslation(path, foundTerms, lastPosition);
                }
            }

            return foundTerms;
        }
        //Add transitions from a square bracket group between the tries given as arguments.
        //sourceTail is the part of the regex that follows the opening square bracket.
        //Returns the amount of characters consumed from the input string (so the string index can be incremented),
        //or -1 if no closing square bracket is found.
        //A hyphen that cannot form a range (first in the set, directly before the closing bracket or
        //directly after a completed range) is treated as a literal hyphen.
        private int handleSquareBracketGroup(RegexTrie <T> sourceTrie, RegexTrie <T> targetTrie, string sourceTail)
        {
            Boolean isEscaped    = false;
            Boolean isComplement = false;
            //True if the previous token added a character to the set, which a following hyphen can use as range start
            Boolean previousCanStartRange = false;
            //The complement transition to build
            ComplementTransition <T> complement = new ComplementTransition <T>(0, targetTrie);

            //Check whether this is a complement set
            if (sourceTail.Length > 0 && sourceTail[0] == '^')
            {
                //Add complement transition to trie
                sourceTrie.complementTransitions.Add(complement);

                isComplement = true;
                sourceTail   = sourceTail.Substring(1);
            }

            StringBuilder set = new StringBuilder();
            char          currentChar;

            //First build a string out of the expression, then add every char in it or its complement to the trie
            //as transitions
            for (int stringIndex = 0; stringIndex < sourceTail.Length; stringIndex++)
            {
                currentChar = sourceTail[stringIndex];

                if (isEscaped)
                {
                    set       = handleEscaped(set, currentChar);
                    isEscaped = false;
                    previousCanStartRange = true;
                    continue;
                }

                switch (currentChar)
                {
                case '\\':
                {
                    isEscaped = true;
                    break;
                }

                case '-':
                {
                    Boolean hasRangeEnd = stringIndex + 1 < sourceTail.Length && sourceTail[stringIndex + 1] != ']';
                    //Without a usable start and end character the hyphen is a literal
                    if (!previousCanStartRange || !hasRangeEnd || set.Length == 0)
                    {
                        set.Append(currentChar);
                        previousCanStartRange = true;
                        break;
                    }

                    //NOTE(review): if the range start or end is an escape sequence, the raw character
                    //of the sequence is used here rather than the character it stands for - confirm.
                    char start = sourceTail[stringIndex - 1];
                    char end   = sourceTail[stringIndex + 1];
                    //Add a range transition, which can be a complement
                    if (isComplement)
                    {
                        complement.ranges.Add(new KeyValuePair <char, char>(start, end));
                    }
                    else
                    {
                        sourceTrie.AddRangeTransition(
                            targetTrie,
                            start,
                            end,
                            0);
                    }
                    //Remove the range start from set
                    set.Remove(set.Length - 1, 1);

                    //skip the range end
                    stringIndex++;
                    //The range is complete, a hyphen directly after it cannot start a new one
                    previousCanStartRange = false;
                    break;
                }

                //Add the set or its complement as transitions
                case ']':
                {
                    string finalSet = set.ToString();
                    if (isComplement)
                    {
                        complement.characters.AddRange(finalSet);
                        //One character was consumed by the complement special character, so increment by 2
                        return(stringIndex + 2);
                    }
                    else
                    {
                        foreach (char c in finalSet)
                        {
                            sourceTrie.AddCharacterTransition(c, 0, targetTrie);
                        }
                        return(stringIndex + 1);
                    }
                }

                default:
                {
                    set.Append(currentChar);
                    previousCanStartRange = true;
                    break;
                }
                }
            }
            //This should never be reached, as the regex is validated to always contain closing square bracket
            return(-1);
        }
Exemplo n.º 28
0
 /// <summary>
 /// Adds the given trie to this object's collection of epsilon transitions.
 /// </summary>
 /// <param name="newEpsilon">Trie to register as an epsilon transition target.</param>
 public void AddEpsilonTransition(RegexTrie <T> newEpsilon)
 {
     epsilonTransitions.Add(newEpsilon);
 }
Exemplo n.º 29
0
 /// <summary>
 /// Creates a transition that stores a group number and a destination trie.
 /// </summary>
 /// <param name="group">Value stored as the transition's group number.</param>
 /// <param name="dest">Trie stored as the transition's destination.</param>
 public Transition(byte group, RegexTrie <T> dest)
 {
     groupNumber = group;
     destination = dest;
 }
Exemplo n.º 30
0
        /// <summary>
        /// Loads regular expression rules from a delimited text file into the two regex tries
        /// given as parameters, and determinises each trie that received at least one rule.
        /// The source trie is used for visiting the source segment, the replaces trie for
        /// visiting the fuzzy match target segment.
        /// </summary>
        /// <param name="fileName">Path of the rule file. If the file does not exist the method
        /// returns without loading anything; an alert is shown unless the name is null or empty.</param>
        /// <param name="delimiter">Character that separates the fields on each line.</param>
        /// <param name="regexTrieSource">Trie for rules whose source field was used as path. Passed
        /// by reference because it is reassigned to the result of determinisation.</param>
        /// <param name="regexTrieReplaces">Trie for rules whose replaces field was used as path.
        /// Passed by reference for the same reason.</param>
        public void loadRegexTrieFromFile(
            string fileName,
            char delimiter,
            ref RegexTrie <TranslationAndReplacement> regexTrieSource,
            ref RegexTrie <TranslationAndReplacement> regexTrieReplaces)
        {
            //Check if file exists, exit method and show a message if it doesn't
            if (!File.Exists(fileName))
            {
                //Stay silent when no file name has been given at all (null or empty)
                if (!string.IsNullOrEmpty(fileName))
                {
                    MessageBox.Show("Regular expression rule file does not exist", "TermInjector");
                }
                return;
            }

            //Upper bound for the summed length of the stored rule fields (proxy for memory usage)
            const int maxStringMemoryUsage = 2500000;

            //Regex for converting unicode escape sequences to characters
            Regex rx = new Regex(@"\\[uU]([0-9a-fA-F]{4})");
            //Group 1 holds the four hex digits that follow the \u or \U prefix
            MatchEvaluator unescape = match =>
                ((char)Int32.Parse(match.Groups[1].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture)).ToString();

            //Counter for restricting glossary size
            int stringMemoryUsage = 0;

            //Counters for checking whether terms are being added
            int  lineCount          = 0;
            int  termCount          = 0;
            bool addedNormalRegexes = false;
            bool addedFuzzyRegexes  = false;

            //The reader is only needed while reading lines, so the file is released
            //before determinisation and before any message box is shown
            using (StreamReader sourceFile = File.OpenText(fileName))
            {
                while (!sourceFile.EndOfStream)
                {
                    //Check if memory usage is within bounds
                    if (stringMemoryUsage > maxStringMemoryUsage)
                    {
                        MessageBox.Show("Regular expression rule file loading stopped due to excessive size: Only part of the regular expression rule file has been loaded.", "TermInjector");
                        break;
                    }

                    //Split the line before Unicode conversion (so as not to accidentally add separators),
                    //then convert the unicode escape sequences in each field
                    List <string> newTerm = new List <string>();
                    foreach (string field in sourceFile.ReadLine().Split(delimiter))
                    {
                        newTerm.Add(rx.Replace(field, unescape));
                    }

                    //Check whether the line was valid (at least two fields)
                    if (newTerm.Count < 2)
                    {
                        lineCount++;
                        continue;
                    }

                    //If both the first and the second field are empty, skip to next iteration
                    //NOTE(review): an earlier comment here spoke of the first and THIRD fields;
                    //the check itself has always tested fields 0 and 1 - confirm which is intended
                    if (newTerm[0].Length == 0 && newTerm[1].Length == 0)
                    {
                        lineCount++;
                        continue;
                    }

                    //If length of list is two, add empty field
                    if (newTerm.Count == 2)
                    {
                        newTerm.Add("");
                    }

                    //Add the rule exactly once and remember the outcome. Calling the method again
                    //to test for the "false" outcome would insert the same rule a second time.
                    var pathUsed = this.addFieldsToRegexTrie(newTerm, regexTrieSource, regexTrieReplaces);

                    //Tally the proxy for memory usage, depending on whether source or replaces
                    //field was used as path. Any other outcome leaves all counters untouched.
                    if (pathUsed == true)
                    {
                        stringMemoryUsage += newTerm[0].Length;
                        addedNormalRegexes = true;
                        termCount++;
                        lineCount++;
                    }
                    else if (pathUsed == false)
                    {
                        stringMemoryUsage += newTerm[2].Length;
                        addedFuzzyRegexes  = true;
                        termCount++;
                        lineCount++;
                    }
                }
            }

            //Determinise the regex tries. The determiniser returns a new trie,
            //which is why the tries are ref parameters
            if (addedNormalRegexes)
            {
                regexTrieSource = this.determiniser.determiniseNFA(regexTrieSource);
            }
            if (addedFuzzyRegexes)
            {
                regexTrieReplaces = this.determiniser.determiniseNFA(regexTrieReplaces);
            }

            //If the proportion of terms stored and lines read is skewed, the wrong delimiter may have been used.
            //NOTE(review): there is no minimum-size guard here, so even a very small file
            //with an empty line or two can trigger the message - confirm whether that is intended
            if (lineCount - termCount > (lineCount / 2))
            {
                string delimiterUsed = delimiter == '\t' ? "Tab" : delimiter.ToString();
                MessageBox.Show((string.Format("The amount of regular expression rules stored is small compared to the amount of lines read: {0} lines read, but only {1} regular expression rules found. Are you sure the delimiter character {2} is correct?"
                                               , lineCount.ToString(), termCount.ToString(), delimiterUsed)), "TermInjector");
            }

            if (this.validationErrors.Count > 0)
            {
                //NOTE(review): the form is only constructed here; presumably its constructor
                //displays it - verify against ValidationErrorForm
                ValidationErrorForm errorForm = new ValidationErrorForm(this.validationErrors);
            }

            this.validationErrors.Clear();
        }