/// <summary>
/// Solves entries whose kanji reading and kana reading contain the same number of characters,
/// by pairing the characters found at the same position in both strings.
/// </summary>
/// <param name="r">Resource set used to look up kanji.</param>
/// <param name="v">Vocab entry to solve.</param>
/// <returns>
/// A single solution when every position could be paired and at least one kanji was found;
/// nothing otherwise.
/// </returns>
protected override IEnumerable<FuriganaSolution> DoSolve(FuriganaResourceSet r, VocabEntry v)
{
    // This solver only applies when both readings have the same length.
    if (v.KanjiReading.Length != v.KanaReading.Length)
    {
        yield break;
    }

    List<FuriganaPart> furigana = new List<FuriganaPart>();
    for (int index = 0; index < v.KanjiReading.Length; index++)
    {
        char written = v.KanjiReading[index];
        if (r.GetKanji(written) != null)
        {
            // Known kanji: the kana at the same position becomes its reading.
            furigana.Add(new FuriganaPart(v.KanaReading[index].ToString(), index));
            continue;
        }

        // Not a kanji. Give up if the character is not kana at all (probably a trap),
        // or if it is kana but not equivalent to the kana found at the same position.
        string writtenText = written.ToString();
        if (!KanaHelper.IsAllKana(writtenText)
            || !KanaHelper.AreEquivalent(writtenText, v.KanaReading[index].ToString()))
        {
            yield break;
        }
    }

    // A solution without any furigana part is of no use: only return when a kanji was read.
    if (furigana.Count > 0)
    {
        yield return new FuriganaSolution(v, furigana);
    }
}
/// <summary>
/// Recursive method that walks the kanji reading string and looks for every way the kana
/// reading can be cut by matching it against the potential readings of each kanji.
/// </summary>
/// <param name="r">Resource set.</param>
/// <param name="v">Vocab to solve.</param>
/// <param name="currentIndexKanji">Current position in the kanji string. Used for recursion.</param>
/// <param name="currentIndexKana">Current position in the kana string. Used for recursion.</param>
/// <param name="currentCut">Current furigana parts. Used for recursion.</param>
/// <returns>The solutions that can be reached from the given position.</returns>
private IEnumerable<FuriganaSolution> TryReading(FuriganaResourceSet r, VocabEntry v,
    int currentIndexKanji, int currentIndexKana, List<FuriganaPart> currentCut)
{
    if (currentIndexKanji == v.KanjiReading.Length && currentIndexKana == v.KanaReading.Length)
    {
        // Both strings were consumed at the same time: the current cut is a valid solution.
        yield return new FuriganaSolution(v, currentCut);
        yield break;
    }

    if (currentIndexKanji >= v.KanjiReading.Length || currentIndexKana >= v.KanaReading.Length)
    {
        // Only one of the two strings is exhausted: dead end, nothing to return.
        yield break;
    }

    // Special expressions take precedence: when any is found, the general case is skipped.
    bool anySpecialExpression = false;
    foreach (FuriganaSolution special in
        FindSpecialExpressions(r, v, currentIndexKanji, currentIndexKana, currentCut))
    {
        anySpecialExpression = true;
        yield return special;
    }

    if (anySpecialExpression)
    {
        yield break;
    }

    // General case. The repeater kanji '々' is handled by using the character before it,
    // when there is one.
    char current = v.KanjiReading[currentIndexKanji];
    char lookup = (current == '々' && currentIndexKanji > 0)
        ? v.KanjiReading[currentIndexKanji - 1]
        : current;

    // Continue as a kanji subpart when the character is a known kanji, as kana otherwise.
    Kanji kanji = r.GetKanji(lookup);
    IEnumerable<FuriganaSolution> continuations;
    if (kanji != null)
    {
        continuations = ReadAsKanji(r, v, currentIndexKanji, currentIndexKana, currentCut, lookup, kanji);
    }
    else
    {
        continuations = ReadAsKana(r, v, currentIndexKanji, currentIndexKana, currentCut, lookup);
    }

    foreach (FuriganaSolution continuation in continuations)
    {
        yield return continuation;
    }
}
/// <summary>
/// Attempts to solve furigana in cases where there are no consecutive kanji in the kanji string,
/// using regular expressions.
/// </summary>
/// <param name="r">Resource set used to tell kanji apart from the other characters.</param>
/// <param name="v">Vocab entry to solve.</param>
/// <returns>
/// A single solution when the greedy and the lazy expressions both match the kana reading and
/// lead to equal solutions; nothing otherwise.
/// </returns>
protected override IEnumerable<FuriganaSolution> DoSolve(FuriganaResourceSet r, VocabEntry v)
{
    // We are using both a greedy expression and a lazy expression because we want to make sure
    // there is only one way to read them. If the result differs with a greedy or a lazy expression,
    // it means that we cannot tell how the word is read.
    string regGreedy = "^";
    string regLazy = "^";
    bool consecutiveMarker = false;
    List<int> kanjiIndexes = new List<int>(4);
    for (int i = 0; i < v.KanjiReading.Length; i++)
    {
        char c = v.KanjiReading[i];
        if (r.GetKanji(c) == null)
        {
            // Not a kanji: add the character as a literal, without a capture group.
            // It must be escaped, otherwise characters such as '.', '(', '?' or '+' would be
            // interpreted as regex syntax and either match the wrong text or make the pattern
            // invalid.
            string literal = Regex.Escape(c.ToString());
            regGreedy += literal;
            regLazy += literal;
            consecutiveMarker = false;
        }
        else if (consecutiveMarker)
        {
            // Consecutive kanji. The vocab entry is not eligible for this solution.
            yield break;
        }
        else
        {
            // Kanji: add a capture group that will receive its reading.
            regGreedy += "(.+)";
            regLazy += "(.+?)";
            consecutiveMarker = true;
            kanjiIndexes.Add(i);
        }
    }

    regGreedy += "$";
    regLazy += "$";

    // Example regex:
    // For 持ち運ぶ (もちはこぶ)
    // The regexes would be:
    // ^(.+)ち(.+)ぶ$
    // ^(.+?)ち(.+?)ぶ$
    Regex regexGreedy = new Regex(regGreedy);
    Regex regexLazy = new Regex(regLazy);
    Match matchGreedy = regexGreedy.Match(v.KanaReading);
    Match matchLazy = regexLazy.Match(v.KanaReading);

    if (matchGreedy.Success && matchLazy.Success)
    {
        // Obtain both solutions.
        FuriganaSolution greedySolution = MakeSolutionFromMatch(v, matchGreedy, kanjiIndexes);
        FuriganaSolution lazySolution = MakeSolutionFromMatch(v, matchLazy, kanjiIndexes);

        // Only an unambiguous reading is accepted: both solutions must exist and be equal.
        if (greedySolution != null && lazySolution != null && greedySolution.Equals(lazySolution))
        {
            // They are equal, so returning either one is enough.
            yield return greedySolution;
        }
    }
}
/// <summary>
/// Attempts to solve furigana by walking the kanji reading and giving each kanji the kana that
/// comes next in the kana reading, instead of relying on the known readings of the kanji.
/// </summary>
/// <param name="r">Resource set used to look up kanji and special expressions.</param>
/// <param name="v">Vocab entry to solve.</param>
/// <returns>
/// A single solution when the whole kana reading was consumed exactly; nothing otherwise.
/// </returns>
protected override IEnumerable<FuriganaSolution> DoSolve(FuriganaResourceSet r, VocabEntry v)
{
    // The kanji reading is browsed character by character while kana are eaten from the
    // front of the kana reading. Each kanji receives the next kana, plus any following kana
    // that cannot start a kanji reading (ん and the small characters), since those
    // necessarily belong to the kanji that was just read.
    // This only works for some vocab, but it does significantly improve the results,
    // especially for 2-character compounds that use unusual readings.
    //
    // Example: 阿呆陀羅 (あほんだら)
    //   Read the あ for 阿;
    //   Read the ほ for 呆;
    //   Read the ん: it's an impossible start character, so it goes with 呆 as well;
    //   Read the だ for 陀;
    //   Read the ら for 羅.
    string remaining = v.KanaReading;
    List<FuriganaPart> parts = new List<FuriganaPart>();
    int lastIndex = v.KanjiReading.Length - 1;

    int position = 0;
    while (position < v.KanjiReading.Length)
    {
        if (remaining.Length == 0)
        {
            // Characters are left in the kanji reading but all kana are consumed.
            // Cannot solve.
            yield break;
        }

        // Look for a special expression starting here, trying the longest substring first.
        bool expressionConsumed = false;
        for (int end = lastIndex; end >= position; end--)
        {
            string candidateText = v.KanjiReading.Substring(position, end - position + 1);
            SpecialExpression expression = r.GetExpression(candidateText);
            if (expression == null)
            {
                continue;
            }

            // The two flags tell whether the expression starts at the first character and
            // whether it ends at the last character of the kanji reading.
            foreach (SpecialReading candidate in ReadingExpander.GetPotentialSpecialReadings(
                expression, position == 0, end == lastIndex))
            {
                string candidateKana = candidate.KanaReading;
                if (remaining.Length < candidateKana.Length
                    || remaining.Substring(0, candidateKana.Length) != candidateKana)
                {
                    // The remaining kana do not start with this reading.
                    continue;
                }

                // The reading matches: shift its furigana to the current position, eat the
                // matching kana and jump past the expression.
                int offset = position;
                parts.AddRange(candidate.Furigana.Furigana.Select(
                    fp => new FuriganaPart(fp.Value, fp.StartIndex + offset, fp.EndIndex + offset)));
                remaining = remaining.Substring(candidateKana.Length);
                position = end + 1;
                expressionConsumed = true;
                break;
            }

            if (expressionConsumed)
            {
                break;
            }
        }

        if (expressionConsumed)
        {
            continue;
        }

        // Normal process: eat the first character of the remaining kana.
        char current = v.KanjiReading[position];
        string eaten = remaining[0].ToString();
        remaining = remaining.Substring(1);

        if (r.GetKanji(current) != null)
        {
            // On a kanji, also eat the consecutive "impossible start characters" that follow.
            while (remaining.Length > 0 && ImpossibleCutStart.Contains(remaining[0]))
            {
                eaten += remaining[0];
                remaining = remaining.Substring(1);
            }

            parts.Add(new FuriganaPart(eaten, position));
        }
        else if (!KanaHelper.IsAllKana(current.ToString()) || eaten != current.ToString())
        {
            // Either the character is neither a kanji nor a kana, or it is a kana that differs
            // from the one just eaten, meaning an earlier kanji was given the wrong reading.
            // Cannot solve.
            yield break;
        }

        position++;
    }

    if (remaining.Length == 0)
    {
        // The whole kana string was consumed: the case is solved.
        yield return new FuriganaSolution(v, parts);
    }
}
/// <summary>
/// Attempts to solve entries whose kanji reading contains exactly one kanji, by removing from
/// the kana reading the characters written before and after the kanji. Whatever remains is the
/// reading of the kanji.
/// </summary>
/// <param name="r">Resource set used to look up kanji.</param>
/// <param name="v">Vocab entry to solve.</param>
/// <returns>
/// A single solution when the surrounding characters match both ends of the kana reading and a
/// non-empty reading is left for the kanji; nothing otherwise.
/// </returns>
protected override IEnumerable<FuriganaSolution> DoSolve(FuriganaResourceSet r, VocabEntry v)
{
    int kanjiCount = v.KanjiReading.Count(c => r.GetKanji(c) != null);
    if (kanjiCount != 1)
    {
        yield break;
    }

    int kanjiIndex = 0;
    string kanaReading = v.KanaReading;

    // Browse the kanji reading forward and eat the matching characters at the start of the
    // kana reading, until the kanji is reached.
    for (int i = 0; i < v.KanjiReading.Length; i++)
    {
        char c = v.KanjiReading[i];
        if (r.GetKanji(c) != null)
        {
            // We are on the kanji. Remember where it is and stop.
            kanjiIndex = i;
            break;
        }

        // The length check prevents First()-style failures when the kana reading is shorter
        // than the characters written before the kanji.
        if (kanaReading.Length == 0 || kanaReading[0] != c)
        {
            // Readings don't add up. Can't solve.
            yield break;
        }

        kanaReading = kanaReading.Substring(1);
    }

    // Browse the kanji reading backward and eat the matching characters at the end of the
    // kana reading, until the kanji is reached. Exactly one kanji exists, so every character
    // after kanjiIndex is a non-kanji character.
    for (int i = v.KanjiReading.Length - 1; i > kanjiIndex; i--)
    {
        char c = v.KanjiReading[i];
        if (kanaReading.Length == 0 || kanaReading[kanaReading.Length - 1] != c)
        {
            // Readings don't add up. Can't solve.
            yield break;
        }

        kanaReading = kanaReading.Substring(0, kanaReading.Length - 1);
    }

    if (kanaReading.Length == 0)
    {
        // The surrounding characters consumed the whole kana reading, leaving nothing for the
        // kanji itself. An empty reading is not a solution.
        yield break;
    }

    // What is left of the kana reading is the reading of the kanji.
    yield return new FuriganaSolution(v, new FuriganaPart(kanaReading, kanjiIndex));
}