/// <summary>
/// Applies the user's custom dictionary to a translated document: every source token that has an entry in the
/// dictionary for <paramref name="languageId"/> gets its aligned translated token overwritten with the user provided translation.
/// </summary>
/// <param name="translatedDocument">Translated document. Its translated tokens are overwritten in place for every dictionary hit.</param>
/// <param name="languageId">Current source language id.</param>
/// <returns>
/// A <see cref="PostProcessedDocument"/> that stores the translated document and the post processed message
/// (the translated tokens joined with a single space). When the dictionary for <paramref name="languageId"/>
/// has no entries, the post processed message is <see cref="string.Empty"/>.
/// </returns>
public PostProcessedDocument Process(TranslatedDocument translatedDocument, string languageId)
{
    // Fetch the language dictionary once and reuse it for both the emptiness check and the token lookups.
    var languageDictionary = _userCustomDictionaries.GetLanguageDictionary(languageId);

    // Nothing configured for this language: report an empty post processed message.
    if (languageDictionary.Count == 0)
    {
        return new PostProcessedDocument(translatedDocument, string.Empty);
    }

    // Walk the original message tokens and forcibly overwrite the translation of every token
    // the user supplied a custom translation for.
    var sourceTokens = translatedDocument.SourceTokens;
    for (var i = 0; i < sourceTokens.Length; i++)
    {
        // TryGetValue performs a single lookup instead of ContainsKey followed by the indexer.
        if (languageDictionary.TryGetValue(sourceTokens[i], out var userTranslation))
        {
            // The translated token that corresponds to source token i is located through the alignment map.
            // NOTE(review): assumes IndexedAlignment has an entry for every source token index - confirm against the aligner.
            translatedDocument.TranslatedTokens[translatedDocument.IndexedAlignment[i]] = userTranslation;
        }
    }

    // Join all (possibly overwritten) translated tokens into the final message.
    var processedResult = PostProcessingUtilities.Join(" ", translatedDocument.TranslatedTokens);
    return new PostProcessedDocument(translatedDocument, processedResult);
}
/// <summary>
/// Substitutes the numbers in the translated message with their original form from the source message.
/// </summary>
/// <remarks>
/// Every run of digits found in the source message is compared against the source tokens, and each source token
/// that is exactly that run is handed to <c>PostProcessingUtilities.KeepSourceWordInTranslation</c>.
/// Digit runs that are not a token of their own (for example digits embedded inside a longer token) are skipped.
/// </remarks>
/// <param name="translatedDocument">Translated document whose translated tokens are updated.</param>
private void SubstituteNumericPattern(TranslatedDocument translatedDocument)
{
    MatchCollection numericMatches = Regex.Matches(translatedDocument.SourceMessage, @"\d+", RegexOptions.Singleline);
    var sourceTokens = translatedDocument.SourceTokens;

    foreach (Match numericMatch in numericMatches)
    {
        string number = numericMatch.Value;

        // Visit every token equal to the number, not just the first one, so that a number which is
        // repeated in the message is restored at each of its positions. Tokens that do not equal the
        // number are skipped, which also means no "not found" (-1) index is ever passed on.
        for (int tokenIndex = 0; tokenIndex < sourceTokens.Length; tokenIndex++)
        {
            if (sourceTokens[tokenIndex] == number)
            {
                translatedDocument.TranslatedTokens = PostProcessingUtilities.KeepSourceWordInTranslation(
                    translatedDocument.IndexedAlignment,
                    translatedDocument.SourceTokens,
                    translatedDocument.TranslatedTokens,
                    tokenIndex);
            }
        }
    }
}
/// <summary>
/// Runs the patterns post processor, which keeps numbers and no-translate phrases of the source message untranslated.
/// </summary>
/// <param name="translatedDocument">Translated document. Its translated tokens are updated by the substitutions.</param>
/// <param name="languageId">Current source language id, used to select the configured no-translate patterns.</param>
/// <returns>
/// A <see cref="PostProcessedDocument"/> that stores the translated document and the post processed message.
/// The post processed message is the unmodified target message when there is nothing to substitute
/// (no patterns and no digits in the source message) or when the document carries no alignment information;
/// otherwise it is the translated tokens joined with a single space after all substitutions.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="translatedDocument"/>, its source message or its target message is <c>null</c>.
/// </exception>
public PostProcessedDocument Process(TranslatedDocument translatedDocument, string languageId)
{
    // Validate function arguments for null values.
    ValidateParameters(translatedDocument);

    // Flag indicating whether the source message contains any digit, i.e. whether numeric substitution has work to do.
    var containsNum = Regex.IsMatch(translatedDocument.SourceMessage, @"\d");

    // The temporary pattern set combines two sources:
    //  - the post processed patterns configured by the user for this language (_processedPatterns), and
    //  - the literal no-translate phrases attached to this document (LiteranlNoTranslatePhrases),
    //    e.g. for "I like my friend <literal>happy</literal>" the word "happy" must not be translated.
    // It is built as a copy so that the phrases of one document are never added to the configured set
    // and therefore cannot leak into later calls.
    var temporaryPatterns = new System.Collections.Generic.HashSet<string>(_processedPatterns[languageId]);
    var literalPhrases = translatedDocument.LiteranlNoTranslatePhrases;
    if (literalPhrases != null && literalPhrases.Count > 0)
    {
        temporaryPatterns.UnionWith(literalPhrases);
    }

    // Nothing to substitute: hand back the translation untouched.
    if (temporaryPatterns.Count == 0 && !containsNum)
    {
        return new PostProcessedDocument(translatedDocument, translatedDocument.TargetMessage);
    }

    // The substitutions below locate translated tokens through the alignment; without alignment
    // information they cannot be applied, so hand back the translation untouched.
    if (string.IsNullOrWhiteSpace(translatedDocument.RawAlignment))
    {
        return new PostProcessedDocument(translatedDocument, translatedDocument.TargetMessage);
    }

    // Loop over all patterns and substitute each no-translate pattern match with the original source words.
    // Example: with the pattern "mon nom est (.+)" and the phrase "mon nom est l'etat",
    // the original translator output would be "My name is the state";
    // after applying the patterns post processor the output is "My name is l'etat".
    foreach (var pattern in temporaryPatterns)
    {
        if (Regex.IsMatch(translatedDocument.SourceMessage, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase))
        {
            SubstituteNoTranslatePattern(translatedDocument, pattern);
        }
    }

    SubstituteNumericPattern(translatedDocument);

    var processedResult = PostProcessingUtilities.Join(" ", translatedDocument.TranslatedTokens);
    return new PostProcessedDocument(translatedDocument, processedResult);
}
/// <summary>
/// Ensures the <see cref="TranslatedDocument"/> and the two messages it carries are not <c>null</c>.
/// </summary>
/// <param name="translatedDocument">Document to validate.</param>
/// <exception cref="ArgumentNullException">
/// The document itself, its source message or its target message is <c>null</c>; the exception's
/// parameter name identifies which one (checked in that order).
/// </exception>
private void ValidateParameters(TranslatedDocument translatedDocument)
{
    if (translatedDocument == null)
    {
        throw new ArgumentNullException(nameof(translatedDocument));
    }

    // Find the first missing message, source before target, and report it by member name.
    string missingMember = null;
    if (translatedDocument.SourceMessage == null)
    {
        missingMember = nameof(translatedDocument.SourceMessage);
    }
    else if (translatedDocument.TargetMessage == null)
    {
        missingMember = nameof(translatedDocument.TargetMessage);
    }

    if (missingMember != null)
    {
        throw new ArgumentNullException(missingMember);
    }
}
/// <summary>
/// Initializes a new instance of the <see cref="PostProcessedDocument"/> class from a translated document
/// and the message produced by post processing it.
/// </summary>
/// <param name="translatedDocument">The translated object that was post processed.</param>
/// <param name="postProcessedMessage">The resulting message/translation after post processing.</param>
public PostProcessedDocument(TranslatedDocument translatedDocument, string postProcessedMessage)
{
    // Both values are stored as given; no copying or validation happens here.
    TranslatedDocument = translatedDocument;
    PostProcessedMessage = postProcessedMessage;
}
/// <summary>
/// Substitutes the translation of the text captured by a no-translate pattern with the original source tokens.
/// </summary>
/// <remarks>
/// Only the first capture group of the first match of <paramref name="pattern"/> in the source message is used.
/// The source tokens covering that capture are located by walking the tokens while tracking the character
/// position of each token in the source message.
/// Assumption: the source message contains no consecutive white spaces, so two neighbouring tokens are separated
/// by either nothing or exactly one space.
/// </remarks>
/// <param name="translatedDocument">Translated document whose translated tokens are updated.</param>
/// <param name="pattern">The no translate pattern.</param>
private void SubstituteNoTranslatePattern(TranslatedDocument translatedDocument, string pattern)
{
    // Get the matched no-translate pattern.
    Match matchNoTranslate = Regex.Match(translatedDocument.SourceMessage, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);
    Group noTranslateGroup = matchNoTranslate.Groups[1];

    // Boundaries of the captured text, e.g. for "mon nom est l'etat": start index = 12, length = 6.
    int noTranslateStartChrIndex = noTranslateGroup.Index;

    // Length of the captured text without spaces; tokens carry no spaces, so this is the number of
    // token characters that have to be consumed to cover the whole capture.
    int noTranslateMatchLength = noTranslateGroup.Value.Replace(" ", "").Length;

    // An unsuccessful group reports index 0 and an empty value, which would otherwise be mistaken for a
    // capture at the very beginning of the message; an empty capture has nothing to keep either.
    if (!noTranslateGroup.Success || noTranslateMatchLength == 0)
    {
        return;
    }

    string sourceMessage = translatedDocument.SourceMessage;
    var sourceTokens = translatedDocument.SourceTokens;

    int chrIndx = 0;                    // character position in sourceMessage where the current token starts
    int newChrLengthFromMatch = 0;      // token characters of the capture consumed so far (excluding the current token)
    int srcIndex = -1;                  // index of the first token of the capture, -1 while not found
    int noTranslateTokenCount = 1;      // number of tokens the capture spans

    for (int wrdIndx = 0; wrdIndx < sourceTokens.Length; wrdIndx++)
    {
        string wrd = sourceTokens[wrdIndx];

        // The current token starts exactly where the capture starts: this is the first no-translate token.
        if (chrIndx == noTranslateStartChrIndex)
        {
            srcIndex = wrdIndx;
        }

        // Once the first token is known, keep adding tokens until the whole capture is covered.
        // Example: "mon nom est l'etat" with tokens {"mon", "nom", "est", "l'", "etat"}: the capture "l'etat"
        // starts at token "l'" (srcIndex = 3) but spans two tokens, so "etat" has to be included as well.
        if (srcIndex != -1)
        {
            // The current token completes the capture.
            if (newChrLengthFromMatch + wrd.Length >= noTranslateMatchLength)
            {
                break;
            }

            // Otherwise the next token belongs to the capture too.
            noTranslateTokenCount += 1;
            newChrLengthFromMatch += wrd.Length;
        }

        // Compute where the next token starts: skip one extra character when the current token is
        // followed by a space, otherwise the next token follows immediately.
        int afterToken = chrIndx + wrd.Length;
        if (afterToken < sourceMessage.Length && sourceMessage[afterToken] == ' ')
        {
            chrIndx = afterToken + 1;
        }
        else
        {
            chrIndx = afterToken;
        }
    }

    // No token starts at the capture's position: nothing to substitute.
    if (srcIndex == -1)
    {
        return;
    }

    // If the tokens ran out before the capture was fully covered, the count is one past the last token;
    // clamp it so only existing tokens are processed.
    int availableTokens = sourceTokens.Length - srcIndex;
    if (noTranslateTokenCount > availableTokens)
    {
        noTranslateTokenCount = availableTokens;
    }

    // Replace the translation of each no-translate token with its original source word.
    for (int offset = 0; offset < noTranslateTokenCount; offset++)
    {
        translatedDocument.TranslatedTokens = PostProcessingUtilities.KeepSourceWordInTranslation(
            translatedDocument.IndexedAlignment,
            translatedDocument.SourceTokens,
            translatedDocument.TranslatedTokens,
            srcIndex + offset);
    }
}