TranslatedDocument C# (CSharp) Code Examples

Example #1

0

Show file

File: CustomDictionaryPostProcessor.cs Project: thedeveus/botbuilder-dotnet

        /// <summary>
        /// Applies the user's custom dictionary for the given language to a translated document,
        /// overwriting the translation of every source token that has a user provided entry.
        /// </summary>
        /// <param name="translatedDocument">Translated document; its translated tokens are updated in place.</param>
        /// <param name="languageId">Current source language id, used to select the custom dictionary.</param>
        /// <returns>A <see cref="PostProcessedDocument"/> that stores the translated document and the newly post processed message
        /// (the translated tokens joined with single spaces). When the custom dictionary for
        /// <paramref name="languageId"/> has no entries, the post processed message is <see cref="string.Empty"/>.</returns>
        public PostProcessedDocument Process(TranslatedDocument translatedDocument, string languageId)
        {
            // Fetch the language dictionary once and reuse it for both the emptiness check and the lookups.
            var languageDictionary = _userCustomDictionaries.GetLanguageDictionary(languageId);

            // Nothing to override when the user supplied no custom vocabulary for this language.
            if (languageDictionary.Count == 0)
            {
                return new PostProcessedDocument(translatedDocument, string.Empty);
            }

            // Walk the original message tokens and, for each token present in the user custom dictionary,
            // forcibly overwrite the aligned translated token with the user provided translation.
            var sourceTokens = translatedDocument.SourceTokens;
            for (var i = 0; i < sourceTokens.Length; i++)
            {
                var sourceToken = sourceTokens[i];
                if (languageDictionary.ContainsKey(sourceToken))
                {
                    // The alignment map gives the index of the translated token that corresponds to source token i.
                    // NOTE(review): this assumes every source token has an alignment entry - confirm against the aligner output.
                    translatedDocument.TranslatedTokens[translatedDocument.IndexedAlignment[i]] = languageDictionary[sourceToken];
                }
            }

            // Join the (possibly overwritten) translated tokens back into a single message.
            var processedResult = PostProcessingUtilities.Join(" ", translatedDocument.TranslatedTokens);
            return new PostProcessedDocument(translatedDocument, processedResult);
        }

Example #2

0

Show file

File: PatternsPostProcessor.cs Project: vfsauder/botbuilder-dotnet

        /// <summary>
        /// Substitutes the numbers in the translated message with their original format in the source message.
        /// </summary>
        /// <remarks>
        /// Every run of digits found in the source message is looked up among the source tokens, and each
        /// source token that is exactly that number is kept as-is in the translation. Digit runs that are not
        /// a whole token on their own are skipped.
        /// </remarks>
        /// <param name="translatedDocument">Translated document; its translated tokens are updated in place.</param>
        private void SubstituteNumericPattern(TranslatedDocument translatedDocument)
        {
            MatchCollection numericMatches = Regex.Matches(translatedDocument.SourceMessage, @"\d+", RegexOptions.Singleline);

            foreach (Match numericMatch in numericMatches)
            {
                string number = numericMatch.Value;

                // Visit every token equal to this number, not only the first one, so that a number
                // repeated in the message is restored at each position. IndexOf returns -1 when the
                // number is not a standalone token, in which case the loop body never runs.
                int srcIndex = Array.IndexOf(translatedDocument.SourceTokens, number);
                while (srcIndex >= 0)
                {
                    translatedDocument.TranslatedTokens = PostProcessingUtilities.KeepSourceWordInTranslation(translatedDocument.IndexedAlignment, translatedDocument.SourceTokens, translatedDocument.TranslatedTokens, srcIndex);
                    srcIndex = Array.IndexOf(translatedDocument.SourceTokens, number, srcIndex + 1);
                }
            }
        }

Example #3

0

Show file

File: PatternsPostProcessor.cs Project: thedeveus/botbuilder-dotnet

        /// <summary>
        /// Process the logic for patterns post processor used to handle numbers and no translate list.
        /// </summary>
        /// <param name="translatedDocument">Translated document; its translated tokens are updated in place.</param>
        /// <param name="languageId">Current source language id, used to select the configured no translate patterns.</param>
        /// <returns>A <see cref="PostProcessedDocument"/> that stores the translated document and the newly post processed message.
        /// When there is nothing to substitute (no patterns and no digits in the source message) or the document
        /// has no raw alignment, the post processed message is the unmodified target message.</returns>
        /// <exception cref="ArgumentNullException">Thrown by the parameter validation when the document, its source message or its target message is null.</exception>
        public PostProcessedDocument Process(TranslatedDocument translatedDocument, string languageId)
        {
            // Validate function arguments for null values.
            ValidateParameters(translatedDocument);

            // Flag indicating whether the source message contains any digit; numbers are restored even when no pattern is configured.
            var containsNum = Regex.IsMatch(translatedDocument.SourceMessage, @"\d");

            // The temporary pattern set contains two kinds of patterns:
            //  - the post processed patterns configured by the user, i.e. _processedPatterns, and
            //  - the literal no translate phrases carried by the document, i.e. translatedDocument.LiteranlNoTranslatePhrases.
            //    ex : sourceMessage = I like my friend <literal>happy</literal> , the literal tag specifies that "happy" shouldn't be translated.
            // The configured set is copied so that per-document literal phrases are not added to the shared
            // configuration and therefore do not leak into later calls.
            var temporaryPatterns = new System.Collections.Generic.HashSet<string>(_processedPatterns[languageId]);

            if (translatedDocument.LiteranlNoTranslatePhrases != null && translatedDocument.LiteranlNoTranslatePhrases.Count > 0)
            {
                temporaryPatterns.UnionWith(translatedDocument.LiteranlNoTranslatePhrases);
            }

            // Return the translation untouched when there is nothing to substitute,
            // or when there is no alignment information to map source tokens onto translated tokens.
            var nothingToSubstitute = temporaryPatterns.Count == 0 && !containsNum;
            if (nothingToSubstitute || string.IsNullOrWhiteSpace(translatedDocument.RawAlignment))
            {
                return new PostProcessedDocument(translatedDocument, translatedDocument.TargetMessage);
            }

            // Loop over all the patterns and substitute each no translate pattern match with the original source words.
            // ex : assuming the pattern = "mon nom est (.+)"
            // and the phrase = "mon nom est l'etat"
            // the original translator output for this phrase would be "My name is the state",
            // after applying the patterns post processor, the output would be : "My name is l'etat"
            foreach (var pattern in temporaryPatterns)
            {
                if (Regex.IsMatch(translatedDocument.SourceMessage, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase))
                {
                    SubstituteNoTranslatePattern(translatedDocument, pattern);
                }
            }

            SubstituteNumericPattern(translatedDocument);

            var processedResult = PostProcessingUtilities.Join(" ", translatedDocument.TranslatedTokens);
            return new PostProcessedDocument(translatedDocument, processedResult);
        }

Example #4

0

Show file

File: PatternsPostProcessor.cs Project: vfsauder/botbuilder-dotnet

        /// <summary>
        /// Ensures that a <see cref="TranslatedDocument"/> and its source and target messages are all non-null.
        /// </summary>
        /// <param name="translatedDocument">The translated document to check.</param>
        /// <exception cref="ArgumentNullException">Thrown when the document, its source message or its target message is null.</exception>
        private void ValidateParameters(TranslatedDocument translatedDocument)
        {
            // The document itself has to exist before its members can be inspected.
            if (translatedDocument == null)
            {
                throw new ArgumentNullException(nameof(translatedDocument));
            }

            // Pick the first missing member, checking the source message before the target message.
            string missingMember = null;
            if (translatedDocument.SourceMessage == null)
            {
                missingMember = nameof(translatedDocument.SourceMessage);
            }
            else if (translatedDocument.TargetMessage == null)
            {
                missingMember = nameof(translatedDocument.TargetMessage);
            }

            if (missingMember != null)
            {
                throw new ArgumentNullException(missingMember);
            }
        }

Example #5

0

Show file

File: PostProcessedDocument.cs Project: GuocaiL/QnamakerPractice

 /// <summary>
 /// Initializes a new instance of the <see cref="PostProcessedDocument"/> class from a translated document
 /// and the message produced by post processing it.
 /// </summary>
 /// <param name="translatedDocument">Translated object that was post processed.</param>
 /// <param name="postProcessedMessage">The resulting message/translation after the post processing.</param>
 public PostProcessedDocument(TranslatedDocument translatedDocument, string postProcessedMessage)
 {
     // Store both states unchanged: the translated document first, then the post processed text.
     TranslatedDocument = translatedDocument;
     PostProcessedMessage = postProcessedMessage;
 }

Example #6

0

Show file

File: PatternsPostProcessor.cs Project: vfsauder/botbuilder-dotnet

        /// <summary>
        /// Substitutes the translation of the text captured by a no translate pattern with the original source tokens.
        /// </summary>
        /// <remarks>
        /// The first capture group of <paramref name="pattern"/> marks the text that must not be translated.
        /// The method locates the source tokens covering that capture and keeps each of them in the translation.
        /// Nothing is changed when the capture group is absent or did not participate in the match, or when
        /// no source token starts at the capture's start position.
        /// </remarks>
        /// <param name="translatedDocument">Translated document; its translated tokens are updated in place.</param>
        /// <param name="pattern">The no translate pattern.</param>
        private void SubstituteNoTranslatePattern(TranslatedDocument translatedDocument, string pattern)
        {
            // Get the matched no translate pattern.
            Match matchNoTranslate = Regex.Match(translatedDocument.SourceMessage, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);
            Group noTranslateGroup = matchNoTranslate.Groups[1];

            // Without a successful first capture group there is no "do not translate" text. An unsuccessful
            // group reports index 0 and an empty value, which would otherwise pin the first source token.
            if (!noTranslateGroup.Success)
            {
                return;
            }

            // Calculate the boundaries of the captured text.
            // ex : "mon nom est l'etat" with pattern "mon nom est (.+)"
            // start index = 12
            // length = 6
            int noTranslateStartChrIndex = noTranslateGroup.Index;

            // Length of the captured text without spaces, used to decide how many tokens the capture spans.
            int noTranslateMatchLength = noTranslateGroup.Value.Replace(" ", "").Length;

            string sourceMessage = translatedDocument.SourceMessage;
            var sourceTokens = translatedDocument.SourceTokens;

            int chrIndx = 0;                 // character index in the source message where the current token starts
            int newChrLengthFromMatch = 0;   // characters of the capture covered by the tokens collected so far
            int srcIndex = -1;               // index of the first token of the capture, -1 while not found
            int noTranslateTokenCount = 1;   // number of tokens that make up the capture

            for (int wrdIndx = 0; wrdIndx < sourceTokens.Length; wrdIndx++)
            {
                string wrd = sourceTokens[wrdIndx];

                // If the current token starts where the capture starts, it is the first no translate token.
                if (chrIndx == noTranslateStartChrIndex)
                {
                    srcIndex = wrdIndx;
                }

                // Once the start has been found, keep adding tokens until the whole capture is covered.
                // ex : "mon nom est l'etat", tokens = {"mon", "nom", "est", "l'", "etat"}
                // srcIndex becomes 3 at "l'", but the capture "l'etat" spans two tokens, so "etat" must be included too.
                if (srcIndex != -1)
                {
                    // The current token completes the capture: stop scanning.
                    if (newChrLengthFromMatch + wrd.Length >= noTranslateMatchLength)
                    {
                        break;
                    }

                    // Otherwise the next token also belongs to the capture.
                    noTranslateTokenCount++;
                    newChrLengthFromMatch += wrd.Length;
                }

                // Compute where the next token starts: right after this token, skipping one space when the
                // token is followed by a space.
                // Assumption: the source message contains no consecutive white spaces (the original comments
                // state the translator pre-processing collapses them).
                int nextChrIndx = chrIndx + wrd.Length;
                if (nextChrIndx < sourceMessage.Length && sourceMessage[nextChrIndx] == ' ')
                {
                    nextChrIndx++;
                }

                chrIndx = nextChrIndx;
            }

            // If the loop ended and srcIndex is still -1, no token starts at the capture: nothing to substitute.
            if (srcIndex == -1)
            {
                return;
            }

            // Clamp the range to the available tokens: when the tokens run out before the capture length is
            // reached, the counter is one past the last token.
            int endIndex = Math.Min(srcIndex + noTranslateTokenCount, sourceTokens.Length);

            // Replace the translation of each no translate token with its original source word.
            for (int tokenIndex = srcIndex; tokenIndex < endIndex; tokenIndex++)
            {
                translatedDocument.TranslatedTokens = PostProcessingUtilities.KeepSourceWordInTranslation(translatedDocument.IndexedAlignment, translatedDocument.SourceTokens, translatedDocument.TranslatedTokens, tokenIndex);
            }
        }

C# (CSharp) TranslatedDocument Examples