//Creates a record that pairs a trie path with the character position
//at which following of that path was started
public PathWithStartCharacterPosition(int newStartCharacterPosition, Trie newPath)
{
    StartCharacterPosition = newStartCharacterPosition;
    Path = newPath;
}
//Returns a data structure which contains the translations of the glossary terms
//and their positions in the search string.
//  glossaryTrie: trie whose paths spell the terms to look for
//  inputString: string to search
//  tokenBoundaryCharacterString: characters treated as token boundaries
//  matchCase: if false, the search string is lower-cased before matching
//  useBoundaryCharacters: if false, every character is treated as a boundary,
//      so a term may start and end at any position
//Matches are returned in the order they are found; for matches found at the same
//position, the path that was started earliest comes first.
public List<PositionAndTranslation> FindMatches(
    Trie glossaryTrie,
    string inputString,
    string tokenBoundaryCharacterString,
    Boolean matchCase,
    Boolean useBoundaryCharacters)
{
    //If Match case option is false, convert the search string to lower case.
    //The Match case option is also checked when loading the trie, as the
    //trie needs to be loaded in lower case if Match case is false.
    string searchString = matchCase ? inputString : inputString.ToLower();

    //Holds the translations of the terms that have been discovered, together with the
    //start and end positions of the source term, so that the translations can be inserted later
    List<PositionAndTranslation> positionAndTranslationOfTerms = new List<PositionAndTranslation>();

    //Define the set of characters used to tokenize the searchString (boundary characters)
    HashSet<char> setOfTokenBoundaryCharacters = new HashSet<char>();
    foreach (char tokenBoundaryChar in tokenBoundaryCharacterString)
    {
        setOfTokenBoundaryCharacters.Add(tokenBoundaryChar);
    }

    //Paths within the glossaryTrie that are currently being followed, in order of creation.
    //Only live paths are kept, so each character costs work proportional to the number of
    //paths still alive rather than to the number of paths ever started.
    List<PathWithStartCharacterPosition> activePaths = new List<PathWithStartCharacterPosition>();
    activePaths.Add(new PathWithStartCharacterPosition(0, glossaryTrie.Clone()));

    for (int charIndex = 0; charIndex < searchString.Length; charIndex++)
    {
        char currentChar = searchString[charIndex];
        bool isTokenBoundaryCharacter =
            !useBoundaryCharacters || setOfTokenBoundaryCharacters.Contains(currentChar);

        List<PathWithStartCharacterPosition> survivingPaths =
            new List<PathWithStartCharacterPosition>(activePaths.Count + 1);

        foreach (PathWithStartCharacterPosition activePath in activePaths)
        {
            //A term ends just before a boundary character, hence charIndex - 1
            if (isTokenBoundaryCharacter)
            {
                AddTermIfPathHasTranslation(activePath, charIndex - 1, positionAndTranslationOfTerms);
            }

            //Keep the path only if it continues with the current char
            var childNode = activePath.Path.GetChildNode(currentChar);
            if (childNode != null)
            {
                activePath.Path = childNode.Clone();
                survivingPaths.Add(activePath);
            }
        }

        //A boundary character starts a new path; the term itself begins at the next character
        if (isTokenBoundaryCharacter)
        {
            survivingPaths.Add(new PathWithStartCharacterPosition(charIndex + 1, glossaryTrie.Clone()));
        }

        activePaths = survivingPaths;
    }

    //Go over the remaining paths one last time, to catch a term that ends at the end of the string
    foreach (PathWithStartCharacterPosition activePath in activePaths)
    {
        AddTermIfPathHasTranslation(activePath, searchString.Length - 1, positionAndTranslationOfTerms);
    }

    return positionAndTranslationOfTerms;
}

//Adds the term at the current node of the given path to foundTerms,
//if that node's translation is not the empty string
private static void AddTermIfPathHasTranslation(
    PathWithStartCharacterPosition path,
    int termEndCharacter,
    List<PositionAndTranslation> foundTerms)
{
    string termTranslation = path.Path.GetTranslation();
    if (termTranslation != "")
    {
        foundTerms.Add(new PositionAndTranslation(
            path.StartCharacterPosition,
            termEndCharacter,
            termTranslation,
            path.Path.GetReplaces(),
            false));
    }
}
//Stores a list of three fields in one of the two given tries.
//If the first field is non-empty, the fields are added to exactMatchTrieSource
//(first field, second field, third field) and true is returned.
//Otherwise the third and second fields are added to exactMatchTrieReplaces
//with an empty last argument, and false is returned.
public Boolean addFieldsToTrie(List<string> fields, Trie exactMatchTrieSource, Trie exactMatchTrieReplaces)
{
    bool firstFieldIsEmpty = fields[0].Length <= 0;

    //No content in the first field: the entry goes to the replaces trie
    if (firstFieldIsEmpty)
    {
        exactMatchTrieReplaces.AddToTrie(fields[2], fields[1], "");
        return false;
    }

    //First field has content: the entry goes to the source trie
    exactMatchTrieSource.AddToTrie(fields[0], fields[1], fields[2]);
    return true;
}
//Builds the target segment for a fuzzy match result:
// 1. the fuzzy current visitor collects the terms of the current source segment,
// 2. terms that have a replaces value are loaded into secondary tries (plain or regex),
// 3. the fuzzy replace visitor walks the translation proposal's target segment with those tries,
// 4. optionally, terms found only in the current source segment are inserted at the start.
//Returns a duplicate of the segment produced by the fuzzy replace visitor.
private Segment InjectTermsFuzzyMatch(Segment segment, SearchResult result)
{
    //Start from a clean fuzzy current visitor and let it walk the current source segment
    _provider.FuzzyCurrentVisitor.Reset();
    foreach (var sourceElement in segment.Elements)
    {
        sourceElement.AcceptSegmentElementVisitor(_provider.FuzzyCurrentVisitor);
    }

    //Secondary tries handed to the replace visitor: a copy of the replaces trie
    //and a fresh regex trie
    Trie secondaryTrie = _provider.ExactMatchTrieReplaces.Clone();
    RegexTrie<TranslationAndReplacement> secondaryRegexTrie = new RegexTrie<TranslationAndReplacement>();
    Boolean anyRegexAdded = false;

    //Terms without a replaces value; these become the visitor's new term list
    List<PositionAndTranslation> termsWithoutReplaces = new List<PositionAndTranslation>();

    foreach (var term in _provider.FuzzyCurrentVisitor.TermList)
    {
        //No replaces field: handled later in the comparison phase
        if (term.Replaces == "")
        {
            termsWithoutReplaces.Add(term);
            continue;
        }

        //Plain term: the replaces field is used as the source path
        if (!term.Regex)
        {
            secondaryTrie.AddToTrie(term.Replaces, term.Translation, "");
            continue;
        }

        //Term from a regex trie: the replaces field goes into the regex trie
        _provider.RegexTrieFactory.AddToRegexTrie(
            secondaryRegexTrie,
            term.Replaces,
            new TranslationAndReplacement(term.Translation, ""));
        anyRegexAdded = true;
    }

    //The regex trie only needs determinising when something was added to it
    if (anyRegexAdded)
    {
        secondaryRegexTrie = _provider.Determiniser.determiniseNFA(secondaryRegexTrie);
    }

    //The fuzzy term list no longer contains the replacement terms
    _provider.FuzzyCurrentVisitor.TermList = termsWithoutReplaces;

    //Prepare the fuzzy replace visitor with the new secondary tries
    _provider.FuzzyReplaceVisitor.Reset();
    _provider.FuzzyReplaceVisitor.SndTrie = secondaryTrie;
    _provider.FuzzyReplaceVisitor.SndRegexTrie = secondaryRegexTrie;

    //Walk the target segment of the translation proposal, skipping null elements
    foreach (var targetElement in result.TranslationProposal.TargetSegment.Elements)
    {
        if (element_is_null(targetElement))
        {
            continue;
        }
        targetElement.AcceptSegmentElementVisitor(_provider.FuzzyReplaceVisitor);
    }

    Segment resultSegment = _provider.FuzzyReplaceVisitor.Segment;

    //Optionally compare the terms of the translation unit's source segment with those of the
    //current source segment, and insert the terms found only in the current one at the start
    if (_options.InjectNewTermsIntoFuzzies == "true")
    {
        _provider.FuzzyMatchVisitor.Reset();
        foreach (var memoryElement in result.MemoryTranslationUnit.SourceSegment.Elements)
        {
            memoryElement.AcceptSegmentElementVisitor(_provider.FuzzyMatchVisitor);
        }

        Text termsOnlyInCurrent = _provider.FuzzyCurrentVisitor.TermDifference(_provider.FuzzyMatchVisitor);
        if (termsOnlyInCurrent.Value.Length > 0)
        {
            resultSegment.Elements.Insert(0, termsOnlyInCurrent);
        }
    }

    //Hand back a duplicate rather than the visitor's own segment (if the reference is used,
    //all results will display the last segment constructed)
    return resultSegment.Duplicate();
}

//Null test for a segment element, kept separate so the loop above reads as a guard
private static bool element_is_null(object element)
{
    return element == null;
}
//Loads exact match rules from a delimited text file into the two given tries.
//Each line is split on the delimiter into source, translation and (optionally) replaces fields.
//Lines with a non-empty source field are added to exactMatchTrieSource, using the source term as path;
//lines with an empty source field are added to exactMatchTrieReplaces, using the replaces field as path.
//  fileName: path of the rule file; if it does not exist nothing is loaded (an alert is
//      shown unless the name is the empty string)
//  matchCase: if false, the source and replaces fields are converted to lower case before being stored
//  delimiter: field separator character
//Shows a message box if loading is cut short because of size, or if more than half of the
//lines read did not yield a rule (which may mean that the wrong delimiter was used).
public void loadTrieFromFile(string fileName, bool matchCase, char delimiter, Trie exactMatchTrieSource, Trie exactMatchTrieReplaces)
{
    //Check if file exists, exit method and show a message if it doesn't
    if (!File.Exists(fileName))
    {
        //If the file name is not empty, display alert
        if (fileName != "")
        {
            MessageBox.Show("Exact match rule file does not exist", "TermInjector");
        }
        return;
    }

    //Upper bound for the total length of the strings used as trie paths
    //(a proxy for memory usage, used to restrict glossary size)
    const int maxStringMemoryUsage = 2500000;
    int stringMemoryUsage = 0;

    //Counters for checking whether terms are being added
    int lineCount = 0;
    int termCount = 0;

    //The using block disposes the reader, so no explicit Close is needed
    using (StreamReader sourceFile = File.OpenText(fileName))
    {
        while (!sourceFile.EndOfStream)
        {
            //Check if memory usage is within bounds
            if (stringMemoryUsage > maxStringMemoryUsage)
            {
                MessageBox.Show("Exact rule file loading stopped due to excessive size: Only part of the exact rule file has been loaded.", "TermInjector");
                break;
            }

            string[] splitTerm = sourceFile.ReadLine().Split(delimiter);

            //Every line read is counted, whether or not it yields a rule
            lineCount++;

            //Check whether the line was valid (at least two fields)
            if (splitTerm.Length < 2)
            {
                continue;
            }

            List<string> newTerm = new List<string>(splitTerm);

            //If length of list is two, add empty replaces field
            if (newTerm.Count == 2)
            {
                newTerm.Add("");
            }

            //A line with an empty source field is skipped if its translation field or its
            //replaces field is empty as well. With both source and replaces empty there is
            //no path to store the rule under: it would be added to the replaces trie
            //with the empty string as its path.
            if (newTerm[0].Length == 0 && (newTerm[1].Length == 0 || newTerm[2].Length == 0))
            {
                continue;
            }

            //If case is not matched, convert both source and replaces fields to lower case
            if (!matchCase)
            {
                newTerm[0] = newTerm[0].ToLower();
                newTerm[2] = newTerm[2].ToLower();
            }

            //Tally the proxy for memory usage, depending on whether source or replaces
            //field was used as path
            bool sourceUsedAsPath = this.addFieldsToTrie(newTerm, exactMatchTrieSource, exactMatchTrieReplaces);
            stringMemoryUsage += sourceUsedAsPath ? newTerm[0].Length : newTerm[2].Length;
            termCount++;
        }
    }

    //If the proportion of terms stored and lines read is skewed, the wrong delimiter may have been used.
    //NOTE(review): an earlier comment said very small glossaries are not checked, but no such
    //exemption is implemented; a single invalid line in a one-line file triggers the message.
    if (lineCount - termCount > (lineCount / 2))
    {
        string delimiterUsed = delimiter == '\t' ? "Tab" : delimiter.ToString();

        MessageBox.Show(
            string.Format(
                "The amount of exact match rules stored is small compared to the amount of lines read: {0} lines read, but only {1} exact match rules found. Are you sure the delimiter character {2} is correct?",
                lineCount.ToString(),
                termCount.ToString(),
                delimiterUsed),
            "TermInjector");
    }
}