コード例 #1
0
 /// <summary>
 /// Creates a path entry that pairs a trie node with the character position
 /// at which following of that path was started.
 /// </summary>
 /// <param name="newStartCharacterPosition">Value stored in StartCharacterPosition.</param>
 /// <param name="newPath">Trie node stored in Path.</param>
 public PathWithStartCharacterPosition(int newStartCharacterPosition, Trie newPath)
 {
     StartCharacterPosition = newStartCharacterPosition;
     Path = newPath;
 }
コード例 #2
0
        /// <summary>
        /// Scans <paramref name="inputString"/> for glossary terms stored in <paramref name="glossaryTrie"/>
        /// and returns their translations together with the character positions of the matched source terms.
        /// </summary>
        /// <param name="glossaryTrie">Root node of the glossary trie; it is cloned every time a new path is started.</param>
        /// <param name="inputString">Text to search. A lower-cased copy is searched when <paramref name="matchCase"/> is false.</param>
        /// <param name="tokenBoundaryCharacterString">Every character of this string is treated as a token boundary.</param>
        /// <param name="matchCase">When false the input is lower-cased before matching. The trie is expected to have been
        /// loaded in lower case in that case (see loadTrieFromFile).</param>
        /// <param name="useBoundaryCharacters">When false every character is treated as a boundary, so a term may
        /// start and end at any position.</param>
        /// <returns>One entry per match, in the order found: start position, end position (index of the last
        /// character of the term), translation and replaces value. The regex flag of each entry is false.</returns>
        public List <PositionAndTranslation> FindMatches
            (Trie glossaryTrie, string inputString, string tokenBoundaryCharacterString, Boolean matchCase, Boolean useBoundaryCharacters)
        {
            //If Match case option is false, search a lower-cased copy of the input
            string searchString = matchCase ? inputString : inputString.ToLower();

            //The set of characters used to tokenize the searchString (boundary characters)
            HashSet <char> setOfTokenBoundaryCharacters = new HashSet <char>(tokenBoundaryCharacterString);

            //Paths within the glossaryTrie that are currently being followed, in the order they were started.
            //Each entry also holds the character position where following of the path was started.
            //Only live paths are kept here, so the work per character is proportional to the number of
            //paths still alive rather than to the number of paths ever started.
            List <PathWithStartCharacterPosition> activePaths = new List <PathWithStartCharacterPosition>();
            activePaths.Add(new PathWithStartCharacterPosition(0, glossaryTrie.Clone()));

            //Translations of the terms that have been discovered, with the start and end positions
            //of the source term so that the translations can be inserted later
            List <PositionAndTranslation> positionAndTranslationOfTerms = new List <PositionAndTranslation>();

            for (int charIndex = 0; charIndex < searchString.Length; charIndex++)
            {
                char currentChar = searchString[charIndex];
                bool isTokenBoundaryCharacter = !useBoundaryCharacters || setOfTokenBoundaryCharacters.Contains(currentChar);

                //Surviving paths are compacted to the front of the list; keptPathCount is the next free slot
                int keptPathCount = 0;

                for (int i = 0; i < activePaths.Count; i++)
                {
                    PathWithStartCharacterPosition activePath = activePaths[i];

                    //If the char is a boundary character and there's a translation at the current node,
                    //record the term. It ends at charIndex - 1, as the current character is the boundary.
                    if (isTokenBoundaryCharacter)
                    {
                        string termTranslation = activePath.Path.GetTranslation();
                        if (termTranslation != "")
                        {
                            positionAndTranslationOfTerms.Add(new PositionAndTranslation(
                                                                  activePath.StartCharacterPosition, charIndex - 1, termTranslation, activePath.Path.GetReplaces(), false));
                        }
                    }

                    //Keep the path only if it continues with the current char; otherwise it is dropped
                    var childNode = activePath.Path.GetChildNode(currentChar);
                    if (childNode != null)
                    {
                        activePath.Path = childNode.Clone();
                        activePaths[keptPathCount] = activePath;
                        keptPathCount++;
                    }
                }

                //Discard the tail left over after compaction (the paths that did not continue)
                activePaths.RemoveRange(keptPathCount, activePaths.Count - keptPathCount);

                //After a boundary character a new term may begin at the next character
                if (isTokenBoundaryCharacter)
                {
                    activePaths.Add(new PathWithStartCharacterPosition(charIndex + 1, glossaryTrie.Clone()));
                }
            }

            //Go over the remaining paths one last time, to catch terms that end at the end of the string
            foreach (PathWithStartCharacterPosition remainingPath in activePaths)
            {
                string termTranslation = remainingPath.Path.GetTranslation();
                if (termTranslation != "")
                {
                    positionAndTranslationOfTerms.Add(new PositionAndTranslation(
                                                          remainingPath.StartCharacterPosition, searchString.Length - 1, termTranslation, remainingPath.Path.GetReplaces(), false));
                }
            }

            return positionAndTranslationOfTerms;
        }
コード例 #3
0
 /// <summary>
 /// Stores one rule, given as a list of three fields, in one of the two tries.
 /// </summary>
 /// <param name="fields">The rule fields; indexes 0, 1 and 2 are read.</param>
 /// <param name="exactMatchTrieSource">Receives the rule, keyed by field 0, when field 0 is not empty.</param>
 /// <param name="exactMatchTrieReplaces">Receives the rule, keyed by field 2, when field 0 is empty.</param>
 /// <returns>True if the rule went into exactMatchTrieSource, false if it went into exactMatchTrieReplaces.</returns>
 public Boolean addFieldsToTrie(List <string> fields, Trie exactMatchTrieSource, Trie exactMatchTrieReplaces)
 {
     //No content in the first field: key the rule by the third field in the replaces trie
     if (fields[0].Length == 0)
     {
         exactMatchTrieReplaces.AddToTrie(fields[2], fields[1], "");
         return false;
     }

     //First field has content: key the rule by it in the source trie
     exactMatchTrieSource.AddToTrie(fields[0], fields[1], fields[2]);
     return true;
 }
コード例 #4
0
        /// <summary>
        /// Builds the target segment for a fuzzy match: terms found in the current source segment that carry a
        /// replaces value are applied to the translation proposal, and (optionally) terms found only in the
        /// current source segment are inserted at the start of the result.
        /// </summary>
        /// <param name="segment">The current source segment.</param>
        /// <param name="result">The search result whose translation proposal is processed.</param>
        /// <returns>A duplicate of the segment produced by the fuzzy replace visitor.</returns>
        private Segment InjectTermsFuzzyMatch(Segment segment, SearchResult result)
        {
            //Start with a clean fuzzy current visitor and let it walk the current source segment,
            //collecting the terms found there
            _provider.FuzzyCurrentVisitor.Reset();
            foreach (var sourceElement in segment.Elements)
            {
                sourceElement.AcceptSegmentElementVisitor(_provider.FuzzyCurrentVisitor);
            }

            //Secondary tries for the replace visitor: a clone of the provider's replaces trie
            //and a fresh regex trie
            Trie replacementTrie = _provider.ExactMatchTrieReplaces.Clone();
            RegexTrie <TranslationAndReplacement> replacementRegexTrie = new RegexTrie <TranslationAndReplacement>();
            Boolean regexesAdded = false;

            //Terms that have no replaces value; they become the visitor's new term list
            List <PositionAndTranslation> termsWithoutReplacement = new List <PositionAndTranslation>();

            foreach (var term in _provider.FuzzyCurrentVisitor.TermList)
            {
                //No replaces value: leave the term for the comparison phase
                if (term.Replaces == "")
                {
                    termsWithoutReplacement.Add(term);
                    continue;
                }

                //Non-regex term: the replaces value becomes the path in the normal trie
                if (!term.Regex)
                {
                    replacementTrie.AddToTrie(term.Replaces, term.Translation, "");
                    continue;
                }

                //Regex term: the replaces value becomes the source in the regex trie
                _provider.RegexTrieFactory.AddToRegexTrie(
                    replacementRegexTrie,
                    term.Replaces,
                    new TranslationAndReplacement(term.Translation, ""));
                regexesAdded = true;
            }

            //The regex trie only needs determinising if something was added to it
            if (regexesAdded)
            {
                replacementRegexTrie = _provider.Determiniser.determiniseNFA(replacementRegexTrie);
            }

            //The fuzzy current visitor keeps only the terms without a replaces value
            _provider.FuzzyCurrentVisitor.TermList = termsWithoutReplacement;

            //Prepare the replace visitor with the tries built above
            _provider.FuzzyReplaceVisitor.Reset();
            _provider.FuzzyReplaceVisitor.SndTrie      = replacementTrie;
            _provider.FuzzyReplaceVisitor.SndRegexTrie = replacementRegexTrie;

            //Walk the target segment of the translation proposal; null elements are skipped
            foreach (var targetElement in result.TranslationProposal.TargetSegment.Elements)
            {
                if (targetElement == null)
                {
                    continue;
                }
                targetElement.AcceptSegmentElementVisitor(_provider.FuzzyReplaceVisitor);
            }

            Segment segmentWithTerms = _provider.FuzzyReplaceVisitor.Segment;

            //When the option is on, visit the source segment of the memory translation unit and put the
            //terms reported by TermDifference at the front of the result
            if (_options.InjectNewTermsIntoFuzzies == "true")
            {
                _provider.FuzzyMatchVisitor.Reset();
                foreach (var memoryElement in result.MemoryTranslationUnit.SourceSegment.Elements)
                {
                    memoryElement.AcceptSegmentElementVisitor(_provider.FuzzyMatchVisitor);
                }

                Text newTerms = _provider.FuzzyCurrentVisitor.TermDifference(_provider.FuzzyMatchVisitor);
                if (newTerms.Value.Length > 0)
                {
                    segmentWithTerms.Elements.Insert(0, newTerms);
                }
            }

            //A duplicate is returned on purpose: if the reference were returned, all results
            //would display the last segment constructed
            return segmentWithTerms.Duplicate();
        }
コード例 #5
0
        /// <summary>
        /// Loads exact match rules from a delimited text file into the two given tries.
        /// Each usable line is passed to addFieldsToTrie, which stores it in
        /// <paramref name="exactMatchTrieSource"/> (keyed by the first field) when the first field has
        /// content, and otherwise in <paramref name="exactMatchTrieReplaces"/> (keyed by the third field).
        /// This is an instance method, not a static one.
        /// </summary>
        /// <param name="fileName">Path of the rule file. If the file does not exist nothing is loaded; an alert
        /// is shown unless the name is the empty string.</param>
        /// <param name="matchCase">When false the first and third fields are lower-cased before being stored.</param>
        /// <param name="delimiter">Character that separates the fields on a line.</param>
        /// <param name="exactMatchTrieSource">Trie that receives rules whose first field has content.</param>
        /// <param name="exactMatchTrieReplaces">Trie that receives rules whose first field is empty.</param>
        public void loadTrieFromFile(string fileName, bool matchCase, char delimiter, Trie exactMatchTrieSource, Trie exactMatchTrieReplaces)
        {
            //Check if file exists, exit method and show a message if it doesn't
            if (!File.Exists(fileName))
            {
                //If the file name is not empty, display alert
                if (fileName != "")
                {
                    MessageBox.Show("Exact match rule file does not exist", "TermInjector");
                }
                return;
            }

            //Upper bound for the total length of the key strings stored (a proxy for memory usage)
            const int maxStringMemoryUsage = 2500000;

            //Counter for restricting glossary size
            int stringMemoryUsage = 0;

            //Counters for checking whether terms are being added
            int lineCount = 0;
            int termCount = 0;

            //The using block disposes the reader, so no explicit Close() is needed
            using (StreamReader sourceFile = File.OpenText(fileName))
            {
                while (!sourceFile.EndOfStream)
                {
                    //Check if memory usage is within bounds
                    if (stringMemoryUsage > maxStringMemoryUsage)
                    {
                        MessageBox.Show("Exact rule file loading stopped due to excessive size: Only part of the exact rule file has been loaded.", "TermInjector");
                        break;
                    }

                    List <string> newTerm = sourceFile.ReadLine().Split(delimiter).ToList();

                    //Every line read is counted, whether or not it yields a rule
                    lineCount++;

                    //A valid line has at least two fields
                    if (newTerm.Count < 2)
                    {
                        continue;
                    }

                    //If the line has only two fields, add an empty third (replaces) field.
                    //This is done before the check below so that the third field can be inspected.
                    if (newTerm.Count == 2)
                    {
                        newTerm.Add("");
                    }

                    //With an empty first field the rule is keyed by the third field, so a line whose
                    //first and third fields are both empty has no key and must be skipped. (The check
                    //used to look at the second field instead, which let such lines be stored under
                    //the empty key.) Lines with empty first and second fields are still skipped, as before.
                    if (newTerm[0].Length == 0 && (newTerm[1].Length == 0 || newTerm[2].Length == 0))
                    {
                        continue;
                    }

                    //If case is not matched, convert both source and replaces fields to lower case
                    if (!matchCase)
                    {
                        newTerm[0] = newTerm[0].ToLower();
                        newTerm[2] = newTerm[2].ToLower();
                    }

                    //Tally the proxy for memory usage, depending on whether source or replaces
                    //field was used as path
                    bool addedToSourceTrie = this.addFieldsToTrie(newTerm, exactMatchTrieSource, exactMatchTrieReplaces);
                    stringMemoryUsage += addedToSourceTrie ? newTerm[0].Length : newTerm[2].Length;
                    termCount++;
                }
            }

            //If more than half of the lines read did not yield a rule, the wrong delimiter may have been used.
            //NOTE: no minimum glossary size is applied, so a file consisting of a single unusable line
            //also triggers the message.
            if (lineCount - termCount > (lineCount / 2))
            {
                string delimiterUsed = (delimiter == '\t') ? "Tab" : delimiter.ToString();
                MessageBox.Show((string.Format("The amount of exact match rules stored is small compared to the amount of lines read: {0} lines read, but only {1} exact match rules found. Are you sure the delimiter character {2} is correct?"
                                               , lineCount, termCount, delimiterUsed)), "TermInjector");
            }
        }