Ejemplo n.º 1
0
        // Translates / romanizes a string token by token.
        //
        // The input is normalized (form KC, which folds full-width characters),
        // split into sequential tokens by TextToken.GetTextTokens, and each token
        // is translated using the map and particle lists that match its type.
        // The translated tokens are joined in order and the result is normalized
        // (form KC) once more.
        //
        // inText       - text to translate; null or empty is returned unchanged
        //                without reading any map or particle file.
        // languagePair - not read by this method; kept so existing callers
        //                that pass it continue to compile.
        //
        // Returns the joined, normalized translation of all tokens.
        // Exceptions raised by File.ReadAllLines (e.g. a missing map file)
        // are not caught here.
        public static string Translate(string inText, string languagePair = LanguagePair)
        {
            // TODO check if already translated / romanized (japanese punctuation too)
            // if (IsTranslated(inText)) return inText;

            // Nothing to translate: skip normalization and the four file reads
            if (string.IsNullOrEmpty(inText))
            {
                return(inText);
            }

            // Normalize to convert full-width characters
            inText = inText.Normalize(NormalizationForm.FormKC);

            // Split the text into separate sequential tokens
            List <TextToken> textTokens = TextToken.GetTextTokens(inText);

            // Load maps and particles lists once per call, before the token loop
            string        hirakanjiMapPath = Path.Combine(Maps.DirectoryPath, Maps.HirakanjiLatn);
            List <string> hirakanjiMaps    = new List <string>(File.ReadAllLines(hirakanjiMapPath));

            string        hirakanjiParticlesPath = Path.Combine(Particles.DirectoryPath, Particles.HirakanjiLatn);
            List <string> hirakanjiParticles     = new List <string>(File.ReadAllLines(hirakanjiParticlesPath));

            string        kataMapPath = Path.Combine(Maps.DirectoryPath, Maps.KataEn);
            List <string> kataMaps    = new List <string>(File.ReadAllLines(kataMapPath));

            string        kataParticlesPath = Path.Combine(Particles.DirectoryPath, Particles.KataEn);
            List <string> kataParticles     = new List <string>(File.ReadAllLines(kataParticlesPath));

            // Translate each token and join them back together.
            // A StringBuilder avoids re-copying the whole output on every append.
            StringBuilder outText = new StringBuilder(inText.Length);

            foreach (TextToken textToken in textTokens)
            {
                switch (textToken.Type)
                {
                case TokenType.HiraganaKanji:
                    outText.Append(textToken.Translate(hirakanjiMaps, hirakanjiParticles));
                    break;

                case TokenType.Katakana:
                    outText.Append(textToken.Translate(kataMaps, kataParticles));
                    break;

                case TokenType.Latin:
                default:
                    // Latin and any other token type use the parameterless overload
                    outText.Append(textToken.Translate());
                    break;
                }
            }

            // Normalize the joined result
            return(outText.ToString().Normalize(NormalizationForm.FormKC));
        }
        // Scans a string one char at a time and groups consecutive chars of the
        // same token type into sequential tokens
        // eg. "Cake 01. ヴァンパイア雪降る夜"
        // => ["Cake 01. ", "ヴァンパイア", "雪降る夜"]
        //
        // Each char is classified as HiraganaKanji, Katakana or Latin (the
        // fallback for everything else). A prolonged sound char takes the type
        // of the run it appears in, so it never starts a new token by itself.
        // Tokens with no text are never added to the returned list.
        public static List <TextToken> GetTextTokens(string inText)
        {
            List <TextToken> tokens = new List <TextToken>();

            // The first run is arbitrarily treated as Latin; if the text starts
            // with another type, this empty run is discarded below
            TokenType runType = TokenType.Latin;
            TextToken run     = new TextToken(runType);

            foreach (char ch in inText)
            {
                string chText = ch.ToString();

                // Classify the current char; the order of the checks matters
                TokenType chType =
                    Unicode.IsProlongedChar(ch)                               ? runType
                    : (Unicode.IsHiragana(chText) || Unicode.IsKanji(chText)) ? TokenType.HiraganaKanji
                    : Unicode.IsKatakana(chText)                              ? TokenType.Katakana
                    : TokenType.Latin;

                if (chType == runType)
                {
                    // Still inside the current run
                    run.Text += chText;
                    continue;
                }

                // The type changed: close the current run and open a new one.
                // The prefix (eg. a space before the new token) is only computed
                // when the run being closed actually contains text.
                string prefix = "";

                if (!string.IsNullOrEmpty(run.Text))
                {
                    tokens.Add(run);

                    string closedText = run.Text;
                    char   closedLast = closedText[closedText.Length - 1];

                    prefix = GetTokenPrefix(runType, chType, closedLast, ch);
                }

                run     = new TextToken(chType, chText, prefix);
                runType = chType;
            }

            // Flush the trailing run, if it holds any text
            if (!string.IsNullOrEmpty(run.Text))
            {
                tokens.Add(run);
            }

            return(tokens);
        }