/// <summary>
/// Translates / romanizes <paramref name="inText"/> token by token.
/// The text is NFKC-normalized (folding full-width characters), split into
/// sequential tokens by TextToken.GetTextTokens, each token is translated with
/// the map and particle lists that match its type, and the joined result is
/// NFKC-normalized once more.
/// </summary>
/// <param name="inText">Text to translate. Must not be null.</param>
/// <param name="languagePair">Currently not read by this method.</param>
/// <returns>The translated text. An empty input is returned unchanged.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="inText"/> is null.</exception>
public static string Translate(string inText, string languagePair = LanguagePair)
{
    // Fail with a descriptive exception instead of a NullReferenceException
    // raised from inside Normalize.
    if (inText == null)
    {
        throw new System.ArgumentNullException(nameof(inText));
    }

    // Nothing to translate: skip the four file reads below.
    if (inText.Length == 0)
    {
        return inText;
    }

    // TODO: short-circuit when the text is already translated / romanized
    // (IsTranslated(inText)); Japanese punctuation must be checked as well.

    // Normalize to convert full-width characters
    inText = inText.Normalize(NormalizationForm.FormKC);

    // Split the text into separate sequential tokens
    List<TextToken> textTokens = TextToken.GetTextTokens(inText);

    // Load the map and particle lists a single time per call so that every
    // token of the same type reuses them.
    string hirakanjiMapPath = Path.Combine(Maps.DirectoryPath, Maps.HirakanjiLatn);
    List<string> hirakanjiMaps = new List<string>(File.ReadAllLines(hirakanjiMapPath));

    string hirakanjiParticlesPath = Path.Combine(Particles.DirectoryPath, Particles.HirakanjiLatn);
    List<string> hirakanjiParticles = new List<string>(File.ReadAllLines(hirakanjiParticlesPath));

    string kataMapPath = Path.Combine(Maps.DirectoryPath, Maps.KataEn);
    List<string> kataMaps = new List<string>(File.ReadAllLines(kataMapPath));

    string kataParticlesPath = Path.Combine(Particles.DirectoryPath, Particles.KataEn);
    List<string> kataParticles = new List<string>(File.ReadAllLines(kataParticlesPath));

    // Translate each token and join the pieces. A StringBuilder avoids
    // re-copying the whole accumulated result for every token.
    StringBuilder translated = new StringBuilder(inText.Length);
    foreach (TextToken textToken in textTokens)
    {
        switch (textToken.Type)
        {
            case TokenType.HiraganaKanji:
                translated.Append(textToken.Translate(hirakanjiMaps, hirakanjiParticles));
                break;

            case TokenType.Katakana:
                translated.Append(textToken.Translate(kataMaps, kataParticles));
                break;

            case TokenType.Latin:
            default:
                // Latin and any other token type need no maps or particles.
                translated.Append(textToken.Translate());
                break;
        }
    }

    // Normalize the joined output as well
    return translated.ToString().Normalize(NormalizationForm.FormKC);
}
/// <summary>
/// Walks <paramref name="inText"/> one character at a time and groups
/// consecutive characters of the same token type into sequential tokens.
/// For example "Cake 01. ヴァンパイア雪降る夜" is split into
/// ["Cake 01. ", "ヴァンパイア", "雪降る夜"].
/// </summary>
/// <param name="inText">Text to split into tokens.</param>
/// <returns>The non-empty tokens, in the order they appear in the text.</returns>
public static List<TextToken> GetTextTokens(string inText)
{
    List<TextToken> tokens = new List<TextToken>();

    // Seed the scan with an empty token of an arbitrary type (Latin); it is
    // only emitted if it actually collects text.
    TokenType activeType = TokenType.Latin;
    TextToken pending = new TextToken(activeType);

    foreach (char ch in inText)
    {
        string chText = ch.ToString();

        // Classify the current character.
        TokenType charType;
        if (Unicode.IsProlongedChar(ch))
        {
            // The prolonged sound character takes the type of the token it follows.
            charType = activeType;
        }
        else if (Unicode.IsHiragana(chText) || Unicode.IsKanji(chText))
        {
            charType = TokenType.HiraganaKanji;
        }
        else if (Unicode.IsKatakana(chText))
        {
            charType = TokenType.Katakana;
        }
        else
        {
            // Latin or anything else
            charType = TokenType.Latin;
        }

        // Same type as the running token: just extend it.
        if (charType == activeType)
        {
            pending.Text += chText;
            continue;
        }

        // The type changed, so a new token starts here. Its prefix (e.g. a
        // leading space) depends on the token types and on the characters on
        // both sides of the boundary; it stays empty when the previous token
        // collected no text.
        string prefix = "";
        if (!string.IsNullOrEmpty(pending.Text))
        {
            tokens.Add(pending);

            char previousEnd = pending.Text[pending.Text.Length - 1];
            prefix = GetTokenPrefix(activeType, charType, previousEnd, ch);
        }

        pending = new TextToken(charType, chText, prefix);
        activeType = charType;
    }

    // Emit the trailing token if it collected any text.
    if (!string.IsNullOrEmpty(pending.Text))
    {
        tokens.Add(pending);
    }

    return tokens;
}