Пример #1
0
        /// <summary>
        /// Solves entries whose kanji reading and kana reading contain the same number of characters,
        /// by pairing every kanji with the kana character found at the same position.
        /// </summary>
        /// <param name="r">Resource set used to look up kanji.</param>
        /// <param name="v">Vocab entry to solve.</param>
        /// <returns>
        /// A single solution when every character could be paired and at least one kanji was found;
        /// nothing otherwise.
        /// </returns>
        protected override IEnumerable <FuriganaSolution> DoSolve(FuriganaResourceSet r, VocabEntry v)
        {
            // This solver only applies to a strict one-to-one character mapping.
            if (v.KanjiReading.Length != v.KanaReading.Length)
            {
                yield break;
            }

            List <FuriganaPart> furigana = new List <FuriganaPart>();

            for (int index = 0; index < v.KanjiReading.Length; index++)
            {
                char   kanjiChar = v.KanjiReading[index];
                string kanaText  = v.KanaReading[index].ToString();

                if (r.GetKanji(kanjiChar) != null)
                {
                    // Kanji: the kana at the same position is its reading.
                    furigana.Add(new FuriganaPart(kanaText, index));
                    continue;
                }

                // Not a kanji: it has to be a kana, and it has to be equivalent to the kana found
                // at the same position in the kana reading. Anything else means we cannot solve.
                string kanjiText = kanjiChar.ToString();
                if (!KanaHelper.IsAllKana(kanjiText) || !KanaHelper.AreEquivalent(kanjiText, kanaText))
                {
                    yield break;
                }
            }

            // A word without any kanji has no furigana to offer.
            if (furigana.Count > 0)
            {
                yield return new FuriganaSolution(v, furigana);
            }
        }
Пример #2
0
        /// <summary>
        /// Recursively walks the kanji reading and enumerates every way the kana reading can be cut
        /// so that each cut matches what the current kanji string position can be read as.
        /// </summary>
        /// <param name="r">Resource set.</param>
        /// <param name="v">Vocab to solve.</param>
        /// <param name="currentIndexKanji">Current position in the kanji string. Used for recursion.</param>
        /// <param name="currentIndexKana">Current position in the kana string. Used for recursion.</param>
        /// <param name="currentCut">Furigana parts accumulated so far. Used for recursion.</param>
        /// <returns>The solutions found from the current position; lazily evaluated.</returns>
        private IEnumerable <FuriganaSolution> TryReading(FuriganaResourceSet r, VocabEntry v,
                                                          int currentIndexKanji, int currentIndexKana, List <FuriganaPart> currentCut)
        {
            // Terminal cases: at least one of the two strings has been fully consumed.
            if (currentIndexKanji >= v.KanjiReading.Length || currentIndexKana >= v.KanaReading.Length)
            {
                bool bothFullyRead = currentIndexKanji == v.KanjiReading.Length &&
                                     currentIndexKana == v.KanaReading.Length;
                if (bothFullyRead)
                {
                    // Both strings ended at the same time: the current cut is a valid solution.
                    yield return new FuriganaSolution(v, currentCut);
                }

                // Otherwise one string ended before the other: broken case, nothing to return.
                yield break;
            }

            // Special expressions take precedence: when any is found, the general case is skipped.
            int specialSolutionCount = 0;
            foreach (FuriganaSolution special in FindSpecialExpressions(r, v, currentIndexKanji, currentIndexKana, currentCut))
            {
                specialSolutionCount++;
                yield return special;
            }

            if (specialSolutionCount > 0)
            {
                yield break;
            }

            // General case. The repeater character stands for the character that precedes it,
            // so look that one up instead (only possible when there is a preceding character).
            char current    = v.KanjiReading[currentIndexKanji];
            bool isRepeater = current == '々' && currentIndexKanji > 0;
            char lookup     = isRepeater ? v.KanjiReading[currentIndexKanji - 1] : current;

            Kanji kanji = r.GetKanji(lookup);
            if (kanji == null)
            {
                // Not a known kanji: read as kana subpart.
                foreach (FuriganaSolution kanaSolution in ReadAsKana(r, v, currentIndexKanji, currentIndexKana, currentCut, lookup))
                {
                    yield return kanaSolution;
                }

                yield break;
            }

            // Known kanji: read as kanji subpart.
            foreach (FuriganaSolution kanjiSolution in ReadAsKanji(r, v, currentIndexKanji, currentIndexKana, currentCut, lookup, kanji))
            {
                yield return kanjiSolution;
            }
        }
Пример #3
0
        /// <summary>
        /// Attempts to solve furigana in cases where there are no consecutive kanji in the kanji string,
        /// using regular expressions.
        /// </summary>
        /// <param name="r">Resource set used to tell kanji from other characters.</param>
        /// <param name="v">Vocab entry to solve.</param>
        /// <returns>
        /// A single solution when the greedy and the lazy expressions both match the kana reading and
        /// lead to equal solutions; nothing otherwise.
        /// </returns>
        protected override IEnumerable <FuriganaSolution> DoSolve(FuriganaResourceSet r, VocabEntry v)
        {
            // We are using both a greedy expression and a lazy expression because we want to make sure
            // there is only one way to read them. If the result differs with a greedy or a lazy expression,
            // it means that we cannot tell how the word is read.
            string     regGreedy         = "^";
            string     regLazy           = "^";
            bool       consecutiveMarker = false;
            List <int> kanjiIndexes      = new List <int>(4);

            for (int i = 0; i < v.KanjiReading.Length; i++)
            {
                char c = v.KanjiReading[i];
                if (r.GetKanji(c) == null)
                {
                    // Non-kanji characters are matched literally, without a capture group.
                    // They must be escaped: a raw '.', '(', '?', '+'... would be interpreted as regex
                    // syntax and either change what is matched or make the pattern invalid.
                    string literal = Regex.Escape(c.ToString());
                    regGreedy        += literal;
                    regLazy          += literal;
                    consecutiveMarker = false;
                }
                else if (consecutiveMarker)
                {
                    // Consecutive kanji. The vocab entry is not eligible for this solution.
                    yield break;
                }
                else
                {
                    // Kanji are matched by a capture group; remember where the kanji was so that
                    // the captured kana can be attached to it later.
                    regGreedy        += "(.+)";
                    regLazy          += "(.+?)";
                    consecutiveMarker = true;
                    kanjiIndexes.Add(i);
                }
            }
            regGreedy += "$";
            regLazy   += "$";

            // Example regex:
            // For 持ち運ぶ (もちはこぶ)
            // The regexes would be:
            // ^(.+)ち(.+)ぶ$
            // ^(.+?)ち(.+?)ぶ$

            Match matchGreedy = new Regex(regGreedy).Match(v.KanaReading);
            Match matchLazy   = new Regex(regLazy).Match(v.KanaReading);

            if (!matchGreedy.Success || !matchLazy.Success)
            {
                yield break;
            }

            // Obtain both solutions.
            FuriganaSolution greedySolution = MakeSolutionFromMatch(v, matchGreedy, kanjiIndexes);
            FuriganaSolution lazySolution   = MakeSolutionFromMatch(v, matchLazy, kanjiIndexes);

            // Only an unambiguous reading is accepted: both solutions must exist and be equal.
            // When they are, either one can be returned; the greedy one is used.
            if (greedySolution != null && lazySolution != null && greedySolution.Equals(lazySolution))
            {
                yield return greedySolution;
            }
        }
Пример #4
0
        /// <summary>
        /// Attempts to solve furigana by walking the kanji string while consuming the kana string,
        /// giving each kanji the kana that come up next rather than relying on the known readings
        /// of the kanji.
        /// </summary>
        /// <param name="r">Resource set used to look up kanji and special expressions.</param>
        /// <param name="v">Vocab entry to solve.</param>
        /// <returns>
        /// A single solution when the whole kanji string was browsed and the whole kana string was
        /// consumed; nothing otherwise.
        /// </returns>
        protected override IEnumerable <FuriganaSolution> DoSolve(FuriganaResourceSet r, VocabEntry v)
        {
            // The kanji reading is browsed character by character. Each character eats kana from the
            // front of the kana reading:
            // - a special expression eats the kana of its reading;
            // - a kanji eats one kana, plus every following kana that cannot start a reading
            //   (ん and the small characters), since those necessarily belong to the same kanji;
            // - a kana eats itself.
            // This only works for a number of vocab, but it does significantly improve the results.
            // It is especially good for 2-characters compounds that use unusual readings.
            //
            // Example: 阿呆陀羅 (あほんだら)
            // Read the あ for 阿;
            // Read the ほ for 呆;
            // Read the ん: it's an impossible start character, so it goes with 呆 as well;
            // Read the だ for 陀;
            // Read the ら for 羅.

            string remainingKana = v.KanaReading;
            List <FuriganaPart> parts = new List <FuriganaPart>();
            int lastIndex = v.KanjiReading.Length - 1;

            for (int i = 0; i <= lastIndex; i++)
            {
                if (remainingKana.Length == 0)
                {
                    // Characters remain in the kanji reading but there is no kana left to eat.
                    // Cannot solve.
                    yield break;
                }

                char current = v.KanjiReading[i];

                // Look for a special expression starting at the current position, longest first.
                // expressionEnd receives the index of the last character of the matched expression.
                int expressionEnd = -1;
                for (int j = lastIndex; j >= i && expressionEnd < 0; j--)
                {
                    SpecialExpression expression = r.GetExpression(v.KanjiReading.Substring(i, (j - i) + 1));
                    if (expression == null)
                    {
                        continue;
                    }

                    foreach (SpecialReading candidate in ReadingExpander.GetPotentialSpecialReadings(
                                 expression, i == 0, j == lastIndex))
                    {
                        string candidateKana = candidate.KanaReading;
                        bool   kanaMatches   = remainingKana.Length >= candidateKana.Length &&
                                               remainingKana.Substring(0, candidateKana.Length) == candidateKana;
                        if (!kanaMatches)
                        {
                            continue;
                        }

                        // The reading matches. Shift the furigana of the expression to the position
                        // of the expression in the word, then eat the kana of the reading.
                        int offset = i;
                        foreach (var fp in candidate.Furigana.Furigana)
                        {
                            parts.Add(new FuriganaPart(fp.Value, fp.StartIndex + offset, fp.EndIndex + offset));
                        }

                        remainingKana = remainingKana.Substring(candidateKana.Length);
                        expressionEnd = j;
                        break;
                    }
                }

                if (expressionEnd >= 0)
                {
                    // Resume right after the expression (the loop increment moves past its end).
                    i = expressionEnd;
                    continue;
                }

                // Normal process: eat the first character of the kana string.
                string eaten = remainingKana.Substring(0, 1);
                remainingKana = remainingKana.Substring(1);

                if (r.GetKanji(current) != null)
                {
                    // On a kanji, also eat the consecutive "impossible start characters"
                    // (ん, ょ, ゃ, ゅ, っ).
                    while (remainingKana.Length > 0 && ImpossibleCutStart.Contains(remainingKana[0]))
                    {
                        eaten        += remainingKana[0];
                        remainingKana = remainingKana.Substring(1);
                    }

                    parts.Add(new FuriganaPart(eaten, i));
                    continue;
                }

                // Not a kanji. Either the character is not a kana at all, or it is a kana that differs
                // from the one that was just eaten, which means that a previous kanji was given the
                // wrong kana. Either way: cannot solve.
                string currentText = current.ToString();
                if (!KanaHelper.IsAllKana(currentText) || eaten != currentText)
                {
                    yield break;
                }
            }

            // The case is solved only when the whole kana string was consumed.
            if (remainingKana.Length == 0)
            {
                yield return new FuriganaSolution(v, parts);
            }
        }
Пример #5
0
        /// <summary>
        /// Attempts to solve entries that contain exactly one kanji, by removing from the kana reading
        /// the characters that surround the kanji in the kanji reading. What remains is the reading
        /// of the kanji.
        /// </summary>
        /// <param name="r">Resource set used to tell kanji from other characters.</param>
        /// <param name="v">Vocab entry to solve.</param>
        /// <returns>
        /// A single solution when the entry contains exactly one kanji, the characters before and
        /// after it are found at the start and at the end of the kana reading, and at least one kana
        /// is left for the kanji; nothing otherwise.
        /// </returns>
        protected override IEnumerable <FuriganaSolution> DoSolve(FuriganaResourceSet r, VocabEntry v)
        {
            string kanjiReading = v.KanjiReading;
            string kanaReading  = v.KanaReading;

            // Locate the kanji, making sure that it is the only one.
            int kanjiIndex = -1;
            for (int i = 0; i < kanjiReading.Length; i++)
            {
                if (r.GetKanji(kanjiReading[i]) == null)
                {
                    continue;
                }

                if (kanjiIndex >= 0)
                {
                    // More than one kanji. The vocab entry is not eligible for this solution.
                    yield break;
                }

                kanjiIndex = i;
            }

            if (kanjiIndex < 0)
            {
                // No kanji at all. Nothing to solve.
                yield break;
            }

            // The characters around the kanji must be found as is around its reading.
            string before = kanjiReading.Substring(0, kanjiIndex);
            string after  = kanjiReading.Substring(kanjiIndex + 1);

            // The kanji needs at least one kana for itself. This also guarantees that the two
            // comparisons below stay inside the kana reading and do not overlap.
            if (kanaReading.Length <= before.Length + after.Length)
            {
                yield break;
            }

            bool startsWithBefore = string.CompareOrdinal(kanaReading, 0, before, 0, before.Length) == 0;
            bool endsWithAfter    = string.CompareOrdinal(
                kanaReading, kanaReading.Length - after.Length, after, 0, after.Length) == 0;
            if (!startsWithBefore || !endsWithAfter)
            {
                // There is something wrong. Readings don't add up. Can't solve.
                yield break;
            }

            // What is left between the surrounding characters is the reading of the kanji.
            string reading = kanaReading.Substring(before.Length, kanaReading.Length - before.Length - after.Length);
            yield return new FuriganaSolution(v, new FuriganaPart(reading, kanjiIndex));
        }