/// <summary>
/// Substitutes the numbers in the translated message with their original format from the source message.
/// </summary>
/// <remarks>
/// Every source token that is equal to a number found in the source message is handed to
/// <c>PostProcessingUtilities.KeepSourceWordInTranslation</c>. Numbers that are not standalone
/// tokens (for example digits embedded in a longer token) are skipped.
/// </remarks>
/// <param name="translatedDocument">Translated document.</param>
private void SubstituteNumericPattern(TranslatedDocument translatedDocument)
{
    var numericMatches = Regex.Matches(translatedDocument.SourceMessage, @"\d+", RegexOptions.Singleline);
    if (numericMatches.Count == 0)
    {
        return;
    }

    // Collect the matched digit sequences once so that each token can be checked against them.
    var numericValues = new string[numericMatches.Count];
    for (var matchIndex = 0; matchIndex < numericMatches.Count; matchIndex++)
    {
        numericValues[matchIndex] = numericMatches[matchIndex].Value;
    }

    var sourceTokens = translatedDocument.SourceTokens;

    // Walk the tokens instead of looking up the first token per match: a number that occurs
    // several times in the source must be restored at every position, not only at the first one,
    // and a number without a matching token must not produce an index of -1.
    for (var srcIndex = 0; srcIndex < sourceTokens.Length; srcIndex++)
    {
        if (Array.IndexOf(numericValues, sourceTokens[srcIndex]) < 0)
        {
            continue;
        }

        translatedDocument.TranslatedTokens = PostProcessingUtilities.KeepSourceWordInTranslation(
            translatedDocument.IndexedAlignment,
            sourceTokens,
            translatedDocument.TranslatedTokens,
            srcIndex);
    }
}
/// <summary>
/// Substitutes the tokens matched by a no-translate pattern with the original source tokens.
/// </summary>
/// <remarks>
/// The first capture group of <paramref name="pattern"/> marks the text that must not be translated.
/// The method does nothing when the pattern does not match, when the first capture group did not
/// participate in the match, or when no source token starts at the beginning of the captured text.
/// </remarks>
/// <param name="translatedDocument">Translated document.</param>
/// <param name="pattern">The no translate pattern.</param>
private void SubstituteNoTranslatePattern(TranslatedDocument translatedDocument, string pattern)
{
    var sourceMessage = translatedDocument.SourceMessage;
    var sourceTokens = translatedDocument.SourceTokens;

    // Get the matched no-translate pattern.
    var matchNoTranslate = Regex.Match(sourceMessage, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);
    var noTranslateGroup = matchNoTranslate.Groups[1];

    // A failed match (or a missing first group) reports Index 0 and an empty Value, which would
    // otherwise be mistaken for a match on the very first token.
    if (!matchNoTranslate.Success || !noTranslateGroup.Success)
    {
        return;
    }

    // Boundaries of the captured text.
    // ex : "mon nom est l'etat" -> start index = 12, length = 6
    var noTranslateStartChrIndex = noTranslateGroup.Index;

    // Length of the captured text without spaces; used to decide how many consecutive tokens
    // belong to the captured text.
    var noTranslateMatchLength = noTranslateGroup.Value.Replace(" ", string.Empty).Length;

    var chrIndx = 0;                // character offset of the current token inside the source message
    var srcIndex = -1;              // index of the first token of the captured text, -1 while not found
    var matchedChrLength = 0;       // characters of the captured text covered by the tokens seen so far
    var noTranslateTokenCount = 1;  // number of consecutive tokens that make up the captured text

    for (var wrdIndx = 0; wrdIndx < sourceTokens.Length; wrdIndx++)
    {
        var wrd = sourceTokens[wrdIndx];

        // The token that starts exactly where the captured text starts is the first no-translate token.
        // Only the first such token is recorded so a zero-length token cannot shift the start afterwards.
        if (srcIndex == -1 && chrIndx == noTranslateStartChrIndex)
        {
            srcIndex = wrdIndx;
        }

        // Once the start is known, keep consuming tokens until they cover the whole captured text.
        // ex : "mon nom est l'etat", tokens = {"mon", "nom", "est", "l'", "etat"}
        // "l'" alone does not cover "l'etat", so "etat" is added as a second no-translate token.
        if (srcIndex != -1)
        {
            if (matchedChrLength + wrd.Length >= noTranslateMatchLength)
            {
                break;
            }

            noTranslateTokenCount++;
            matchedChrLength += wrd.Length;
        }

        // Advance to the start of the next token: skip one extra character when the current token
        // is followed by a space, none when the next token follows immediately.
        // Assumption: the source message contains no consecutive white spaces (the original comments
        // state that PreprocessMessage collapses them with Regex.Replace(text, @"\s+", " ")).
        var nextChrIndx = chrIndx + wrd.Length;
        if (nextChrIndx < sourceMessage.Length && sourceMessage[nextChrIndx] == ' ')
        {
            nextChrIndx++;
        }

        chrIndx = nextChrIndx;
    }

    // No token starts at the beginning of the captured text.
    if (srcIndex == -1)
    {
        return;
    }

    // Replace the translation of every no-translate token with its source token. The upper bound is
    // clamped to the token array because the counter can run past the last token when the tokens
    // end before the captured length has been covered.
    var endIndex = srcIndex + noTranslateTokenCount;
    for (var tokenIndex = srcIndex; tokenIndex < endIndex && tokenIndex < sourceTokens.Length; tokenIndex++)
    {
        translatedDocument.TranslatedTokens = PostProcessingUtilities.KeepSourceWordInTranslation(
            translatedDocument.IndexedAlignment,
            sourceTokens,
            translatedDocument.TranslatedTokens,
            tokenIndex);
    }
}